
first commit

master | lhenry15 | 4 years ago | commit 10a4f0dd78
100 changed files with 19209 additions and 0 deletions
1. +116 -0 .gitignore
2. +143 -0 README.md
3. +108 -0 axolotl/.gitignore
4. +33 -0 axolotl/.gitlab-ci.yml
5. +3 -0 axolotl/.gitmodules
6. +201 -0 axolotl/LICENSE
7. +41 -0 axolotl/README.md
8. +2 -0 axolotl/axolotl/__init__.py
9. +0 -0 axolotl/axolotl/algorithms/__init__.py
10. +82 -0 axolotl/axolotl/algorithms/autokeras_integration/__init__.py
11. +205 -0 axolotl/axolotl/algorithms/autokeras_integration/block.py
12. +23 -0 axolotl/axolotl/algorithms/autokeras_integration/constants.py
13. +122 -0 axolotl/axolotl/algorithms/autokeras_integration/mapping.py
14. +126 -0 axolotl/axolotl/algorithms/autokeras_integration/steps.py
15. +145 -0 axolotl/axolotl/algorithms/autokeras_search.py
16. +241 -0 axolotl/axolotl/algorithms/base.py
17. +27 -0 axolotl/axolotl/algorithms/bayesian_search.py
18. +1086 -0 axolotl/axolotl/algorithms/data_driven_search.py
19. +87 -0 axolotl/axolotl/algorithms/dummy.py
20. +27 -0 axolotl/axolotl/algorithms/random_search.py
21. +0 -0 axolotl/axolotl/algorithms/tuners/__init__.py
22. +198 -0 axolotl/axolotl/algorithms/tuners/bayesian_oracle.py
23. +535 -0 axolotl/axolotl/algorithms/tuners/custom_hps.py
24. +195 -0 axolotl/axolotl/algorithms/tuners/hyperparameters.py
25. +104 -0 axolotl/axolotl/algorithms/tuners/oracle.py
26. +66 -0 axolotl/axolotl/algorithms/tuners/random_search_oracle.py
27. +258 -0 axolotl/axolotl/algorithms/tuners/tunable_base.py
28. +0 -0 axolotl/axolotl/backend/__init__.py
29. +313 -0 axolotl/axolotl/backend/base.py
30. +269 -0 axolotl/axolotl/backend/ray.py
31. +178 -0 axolotl/axolotl/backend/simple.py
32. +0 -0 axolotl/axolotl/d3m_grpc/__init__.py
33. +127 -0 axolotl/axolotl/d3m_grpc/constants.py
34. +854 -0 axolotl/axolotl/d3m_grpc/server.py
35. +133 -0 axolotl/axolotl/predefined_pipelines/__init__.py
36. +278 -0 axolotl/axolotl/predefined_pipelines/base_preprocessor.py
37. +350 -0 axolotl/axolotl/predefined_pipelines/preprocessor.py
38. +0 -0 axolotl/axolotl/utils/__init__.py
39. +340 -0 axolotl/axolotl/utils/data_problem.py
40. +542 -0 axolotl/axolotl/utils/pipeline.py
41. +31 -0 axolotl/axolotl/utils/resources.py
42. +31 -0 axolotl/axolotl/utils/resources/blocklist.json
43. +64 -0 axolotl/axolotl/utils/resources/default_pipelines.json
44. +31 -0 axolotl/axolotl/utils/resources/scoring_pipeline.yml
45. +7 -0 axolotl/axolotl/utils/resources/splitting_pipelines.json
46. +472 -0 axolotl/axolotl/utils/schemas.py
47. +284 -0 axolotl/examples/build_search_algorithm.ipynb
48. +424 -0 axolotl/examples/load_csv.ipynb
49. +1 -0 axolotl/examples/random_search/oracle.json
50. +31 -0 axolotl/examples/run.py
51. +1 -0 axolotl/examples/synthetic_data_bayesian_hp_tunning.ipynb.REMOVED.git-id
52. +11 -0 axolotl/failed_installation_repos.txt
53. +39 -0 axolotl/images/Devd3mStart.sh
54. +13 -0 axolotl/images/axolotl.dockerfile
55. +3 -0 axolotl/images/base.dockerfile
56. +21 -0 axolotl/images/build-images.sh
57. +11 -0 axolotl/run_tests.py
58. +53 -0 axolotl/setup.py
59. +0 -0 axolotl/tests/__init__.py
60. +383 -0 axolotl/tests/_server_test.py
61. +10 -0 axolotl/tests/data/.gitignore
62. +42 -0 axolotl/tests/data/.gitlab-ci.yml
63. +10 -0 axolotl/tests/data/README.md
64. +20 -0 axolotl/tests/data/add.sh
65. +82 -0 axolotl/tests/data/datasets/audio_dataset_1/datasetDoc.json
66. BIN axolotl/tests/data/datasets/audio_dataset_1/media/test_audio.mp3
67. +2 -0 axolotl/tests/data/datasets/audio_dataset_1/tables/learningData.csv
68. +164 -0 axolotl/tests/data/datasets/boston_dataset_1/datasetDoc.json
69. +507 -0 axolotl/tests/data/datasets/boston_dataset_1/tables/learningData.csv
70. +200 -0 axolotl/tests/data/datasets/database_dataset_1/datasetDoc.json
71. +4 -0 axolotl/tests/data/datasets/database_dataset_1/tables/authors.csv
72. +4 -0 axolotl/tests/data/datasets/database_dataset_1/tables/codes.csv
73. +46 -0 axolotl/tests/data/datasets/database_dataset_1/tables/learningData.csv
74. +65 -0 axolotl/tests/data/datasets/database_dataset_1/tables/values.csv
75. +196 -0 axolotl/tests/data/datasets/database_dataset_2/datasetDoc.json
76. +1001 -0 axolotl/tests/data/datasets/database_dataset_2/tables/comments.csv
77. +101 -0 axolotl/tests/data/datasets/database_dataset_2/tables/learningData.csv
78. +1001 -0 axolotl/tests/data/datasets/database_dataset_2/tables/posts.csv
79. +101 -0 axolotl/tests/data/datasets/database_dataset_2/tables/users.csv
80. +188 -0 axolotl/tests/data/datasets/database_dataset_3/datasetDoc.json
81. +1001 -0 axolotl/tests/data/datasets/database_dataset_3/tables/comments.csv
82. +1001 -0 axolotl/tests/data/datasets/database_dataset_3/tables/learningData.csv
83. +1001 -0 axolotl/tests/data/datasets/database_dataset_3/tables/posts.csv
84. +101 -0 axolotl/tests/data/datasets/database_dataset_3/tables/users.csv
85. +202 -0 axolotl/tests/data/datasets/database_dataset_4/datasetDoc.json
86. +1001 -0 axolotl/tests/data/datasets/database_dataset_4/tables/comments.csv
87. +201 -0 axolotl/tests/data/datasets/database_dataset_4/tables/learningData.csv
88. +1001 -0 axolotl/tests/data/datasets/database_dataset_4/tables/posts.csv
89. +101 -0 axolotl/tests/data/datasets/database_dataset_4/tables/users.csv
90. +68 -0 axolotl/tests/data/datasets/graph_dataset_1/datasetDoc.json
91. +98 -0 axolotl/tests/data/datasets/graph_dataset_1/graphs/G1.gml
92. +12 -0 axolotl/tests/data/datasets/graph_dataset_1/tables/learningData.csv
93. +118 -0 axolotl/tests/data/datasets/graph_dataset_2/datasetDoc.json
94. +7 -0 axolotl/tests/data/datasets/graph_dataset_2/tables/edgeList.csv
95. +12 -0 axolotl/tests/data/datasets/graph_dataset_2/tables/learningData.csv
96. +71 -0 axolotl/tests/data/datasets/image_dataset_1/datasetDoc.json
97. BIN axolotl/tests/data/datasets/image_dataset_1/media/001_HandPhoto_left_01.jpg
98. BIN axolotl/tests/data/datasets/image_dataset_1/media/cifar10_bird_1.png
99. BIN axolotl/tests/data/datasets/image_dataset_1/media/cifar10_bird_2.png
100. BIN axolotl/tests/data/datasets/image_dataset_1/media/mnist_0_2.png

.gitignore (+116, -0)

@@ -0,0 +1,116 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
tests/.asv

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
docs/d3m.rst
docs/d3m.*.rst

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mypy
.mypy_cache/

# site
public/

.idea/
tmp/

*.swp
results.csv
pipeline.yml
pipeline_run.yml
example_pipeline.json
.DS_Store
tmp.txt

README.md (+143, -0)

@@ -0,0 +1,143 @@
# TODS
This is a time-series outlier detection system.

## Axolotl
Run the pre-defined pipeline examples:
```
python examples/build_AutoEncoder_pipeline.py
python examples/run_predefined_pipeline.py
```

## Installation

This package works with **Python 3.6** and pip 19+. You need to have the following packages installed on the system (for Debian/Ubuntu):
```
sudo apt-get install libssl-dev libcurl4-openssl-dev libyaml-dev build-essential libopenblas-dev libcap-dev ffmpeg
```

Then run the script `install.sh`. The script will install the d3m core package with:
```
cd d3m
pip3 install -e .
cd ..
```
Then it installs the common primitives (which are used in the running examples):
```
cd common-primitives
pip3 install -e .
cd ..
```
And it installs the sklearn wrapper with:
```
cd sklearn-wrap
pip3 install -r requirements.txt
pip3 install -e .
cd ..
```
Finally, it installs our anomaly primitives with:
```
cd anomaly-primitives
pip3 install -r requirements.txt
pip3 install -e .
cd ..
```

Some dependencies may be missing from the list above. Try to resolve them yourself if you encounter any.

# Dataset
Datasets are located in `datasets/anomaly`. `raw_data` contains the raw time series data. `transform.py` is a script that transforms the raw data into D3M format. `template` includes some templates for generating D3M data. If you run `transform.py`, the script will load the raw `kpi` data and create a folder named `kpi` in D3M format.

The generated csv file will have the following columns: `d3mIndex`, `timestamp`, `value`, `ground_truth`. In the example kpi dataset there is only one value column; other datasets may have multiple. The goal of the pipeline is to predict `ground_truth` based on `timestamp` and the value(s).
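
As a quick sanity check, you can inspect the generated table with pandas (a minimal sketch; the path assumes the `kpi` folder produced by `transform.py`):
```python
import pandas as pd

# Path is an assumption based on the folder layout described above.
df = pd.read_csv('datasets/anomaly/kpi/kpi_dataset/tables/learningData.csv')

# Expect d3mIndex, timestamp, one or more value columns, and ground_truth.
print(df.columns.tolist())
print(df.head())
```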

There is a nice script to check whether the dataset is in the right format. Run
```
python3 datasets/validate.py datasets/anomaly/kpi/
```
The expected output is as follows:
```
Validating problem '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/SCORE/problem_TEST/problemDoc.json'.
Validating dataset '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/SCORE/dataset_TEST/datasetDoc.json'.
Validating problem '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/kpi_problem/problemDoc.json'.
Validating problem '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/TEST/problem_TEST/problemDoc.json'.
Validating dataset '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/TEST/dataset_TEST/datasetDoc.json'.
Validating dataset '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/kpi_dataset/datasetDoc.json'.
Validating dataset '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/TRAIN/dataset_TRAIN/datasetDoc.json'.
Validating problem '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/TRAIN/problem_TRAIN/problemDoc.json'.
Validating all datasets and problems.
There are no errors.
```
Of course, you can also create other datasets with `transform.py`, but for now we focus on this example dataset since the other datasets follow the same format.

# Example
In D3M, our goal is to provide a **solution** to a **problem** on a **dataset**. Here, a solution is a pipeline that consists of data processing, classifiers, etc.

Run the example to build the first pipeline with
```
python3 examples/build_iforest_pipline.py
```
Note that we have not implemented iForest yet; this one is actually a Random Forest. The script generates a file `pipeline.yml`, which describes a pipeline. We can run the pipeline on the example data in this repo as follows:
```
python3 -m d3m runtime fit-produce -p pipeline.yml -r datasets/anomaly/kpi/TRAIN/problem_TRAIN/problemDoc.json -i datasets/anomaly/kpi/TRAIN/dataset_TRAIN/datasetDoc.json -t datasets/anomaly/kpi/TEST/dataset_TEST/datasetDoc.json -o results.csv -O pipeline_run.yml
```
Another example on a subset of the sequences of Yahoo dataset is as follows:
```
python3 -m d3m runtime fit-produce -p pipeline.yml -r datasets/anomaly/yahoo_sub_5/TRAIN/problem_TRAIN/problemDoc.json -i datasets/anomaly/yahoo_sub_5/TRAIN/dataset_TRAIN/datasetDoc.json -t datasets/anomaly/yahoo_sub_5/TEST/dataset_TEST/datasetDoc.json -o results.csv -O pipeline_run.yml
```
The above commands will generate two files, `results.csv` and `pipeline_run.yml`.

# How to add a new primitive

For new primitives, put them in `anomaly-primitives`. There is an example for isolation forest (despite the name IsolationForest, it is essentially a Random Forest; more work is needed to turn it into a real Isolation Forest).

In addition to adding the new file, you need to register the primitive in `anomaly-primitives/setup.py` and rerun `pip install`; a hedged sketch of such a registration is shown below.
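
A hedged sketch of the registration, assuming the primitive class `SKIsolationForest` lives in `anomaly_primitives/SKIsolationForest.py` (adjust the names and the d3m path to your primitive):
```python
# anomaly-primitives/setup.py (excerpt; names are illustrative)
from setuptools import setup

setup(
    name='anomaly-primitives',
    version='0.0.1',
    packages=['anomaly_primitives'],
    # d3m discovers primitives through the 'd3m.primitives' entry point group.
    entry_points={
        'd3m.primitives': [
            'anomaly_detection.isolation_forest.Algorithm = '
            'anomaly_primitives.SKIsolationForest:SKIsolationForest',
        ],
    },
)
```
After editing `setup.py`, rerun `pip3 install -e .` inside `anomaly-primitives`.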

Use the following command to check whether your new primitives are registered:
```
python3 -m d3m index search
```

Test the new primitives:
```
python3 examples/build_iforest_pipline.py
```

# Template for metadata in primitives

Each primitive needs the following metadata fields (a hedged example putting them together follows the list):

* `__author__`: `DATA Lab at Texas A&M University`
* `name`: Just a human-readable name; describe your primitive in a few words
* `python_path`: This path should have **5** segments. The first two segments should be `d3m.primitives`. The third segment should be `anomaly_detection`, `data_preprocessing` or `feature_construction` (it should match `primitive_family`). The fourth segment should be your algorithm name, e.g., `isolation_forest`. Note that this name should also be added to [this file](d3m/d3m/metadata/primitive_names.py). The last segment should be one of `Preprocessing`, `Feature`, `Algorithm` (for now).
* `source`: `name` should be `DATA Lab at Texas A&M University`, `contact` should be `mailto:khlai037@tamu.edu`, `uris` should have `https://gitlab.com/lhenry15/tods.git` and the path to your .py file.
* `algorithm_types`: Name the algorithm type yourself and add it to [this file](d3m/d3m/metadata/schemas/v0/definitions.json#L1957). **Then reinstall d3m.** Fill this field with `metadata_base.PrimitiveAlgorithmType.YOUR_NAME`
* `primitive_family`: For preprocessing primitives, use `metadata_base.PrimitiveFamily.DATA_PREPROCESSING`. For feature analysis primitives, use `metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION`. For anomaly detection primitives, use `metadata_base.PrimitiveFamily.ANOMALY_DETECTION`.
* `id`: Randomly generate one with `import uuid; uuid.uuid4()`
* `hyperparams_to_tune`: Specify which hyperparameters can be tuned in your primitive
* `version`: `0.0.1`
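
Putting the fields together, a hedged sketch of the metadata block (the `id`, `name`, `python_path`, algorithm type, and hyperparameter name are illustrative; `RANDOM_FOREST` is chosen only because the current example is effectively a random forest):
```python
import uuid

from d3m.metadata import base as metadata_base

metadata = metadata_base.PrimitiveMetadata({
    'id': str(uuid.uuid4()),  # generate once, then hard-code the resulting value
    'version': '0.0.1',
    'name': 'Isolation forest anomaly detector',
    'python_path': 'd3m.primitives.anomaly_detection.isolation_forest.Algorithm',
    'source': {
        'name': 'DATA Lab at Texas A&M University',
        'contact': 'mailto:khlai037@tamu.edu',
        'uris': ['https://gitlab.com/lhenry15/tods.git'],
    },
    'algorithm_types': [metadata_base.PrimitiveAlgorithmType.RANDOM_FOREST],
    'primitive_family': metadata_base.PrimitiveFamily.ANOMALY_DETECTION,
    'hyperparams_to_tune': ['n_estimators'],
})
```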

Notes:

1. `installation` is not required; we omit it.

2. Try to reinstall everything if it does not work.

3. An example of the fake Isolation Forest is [here](anomaly-primitives/anomaly_primitives/SKIsolationForest.py#L294).


## Resources of D3M

If you still have questions, you may refer to the following resources.

Dataset format [https://gitlab.com/datadrivendiscovery/data-supply](https://gitlab.com/datadrivendiscovery/data-supply)

Instructions for creating primitives [https://docs.datadrivendiscovery.org/v2020.1.9/interfaces.html](https://docs.datadrivendiscovery.org/v2020.1.9/interfaces.html)

We use a stable version of d3m core package at [https://gitlab.com/datadrivendiscovery/d3m/-/tree/v2020.1.9](https://gitlab.com/datadrivendiscovery/d3m/-/tree/v2020.1.9).

The documentation is at [https://docs.datadrivendiscovery.org/](https://docs.datadrivendiscovery.org/).

The core package documentation is at [https://docs.datadrivendiscovery.org/v2020.1.9/index.html](https://docs.datadrivendiscovery.org/v2020.1.9/index.html)

We use common-primitives v0.8.0: [https://gitlab.com/datadrivendiscovery/common-primitives/-/tree/v0.8.0/common_primitives](https://gitlab.com/datadrivendiscovery/common-primitives/-/tree/v0.8.0/common_primitives)

sklearn-wrap uses the dist branch: [https://gitlab.com/datadrivendiscovery/sklearn-wrap/-/tree/dist](https://gitlab.com/datadrivendiscovery/sklearn-wrap/-/tree/dist)

There are other primitives developed by many universities that are not used in this repo. See [https://gitlab.com/datadrivendiscovery/primitives](https://gitlab.com/datadrivendiscovery/primitives)

axolotl/.gitignore (+108, -0)

@@ -0,0 +1,108 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
tests/.asv

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
docs/d3m.rst
docs/d3m.*.rst

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mypy
.mypy_cache/

# site
public/

.idea/
tmp/

axolotl/.gitlab-ci.yml (+33, -0)

@@ -0,0 +1,33 @@
tests:
image: registry.gitlab.com/axolotl1/axolotl/base:latest
stage: test
tags:
- d3m_runner
services:
- docker:dind
variables:
DOCKER_HOST: tcp://docker:2375
DOCKER_TLS_CERTDIR: ""
GIT_SUBMODULE_STRATEGY: recursive
script:
- pip3 install -e .
- python3 ./run_tests.py


build_base_image:
stage: build
image: registry.gitlab.com/datadrivendiscovery/images/testing:ubuntu-bionic-python36
tags:
- d3m_runner
services:
- docker:dind
variables:
DOCKER_HOST: tcp://docker:2375
DOCKER_TLS_CERTDIR: ""
script:
- ./images/build-images.sh base
only:
- devel




axolotl/.gitmodules (+3, -0)

@@ -0,0 +1,3 @@
[submodule "tests/data"]
path = tests/data
url = https://gitlab.com/datadrivendiscovery/tests-data.git

axolotl/LICENSE (+201, -0)

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

axolotl/README.md (+41, -0)

@@ -0,0 +1,41 @@
# Axolotl

This package provides an easy, high-level abstraction of the
[D3M](https://gitlab.com/datadrivendiscovery/d3m) API for AutoML. It contains a suite of basic
requirements and building blocks,
[primitives](https://gitlab.com/datadrivendiscovery/primitives).

## Installation

The package provides two different sets of dependencies,
one with GPU support and one that uses the CPU. For installation
we strongly encourage the use of a Python 3.6 virtual environment.

* CPU version.
```bash
pip3 install -e git+https://gitlab.com/axolotl1/axolotl.git@devel#egg=axolotl[cpu]
```

* GPU version.
```bash
pip3 install -e git+https://gitlab.com/axolotl1/axolotl.git@devel#egg=axolotl[gpu]
```

Note:
On macOS, `pycurl` needs to be installed manually:
```bash
PYCURL_SSL_LIBRARY=openssl LDFLAGS="-L/usr/local/opt/openssl/lib" CPPFLAGS="-I/usr/local/opt/openssl/include" pip install --no-cache-dir pycurl==7.43.0.3
```

## Usage
For new users we recommend installing the package and then cloning it via
```bash
git clone --recursive https://gitlab.com/axolotl1/axolotl.git
```

Then start jupyter lab via
```bash
jupyter lab
```
Then open the [examples](https://gitlab.com/axolotl1/axolotl/-/tree/devel/examples)
directory and try running them.
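
For a rough idea of the programmatic API, here is a minimal sketch built on `PipelineSearchBase` (the class names `SimpleRunner` and `RandomSearch`, the constructor arguments, and the loading of `problem` and `dataset` are assumptions; the example notebooks are the authoritative reference):
```python
from axolotl.backend.simple import SimpleRunner
from axolotl.algorithms.random_search import RandomSearch

# problem and dataset are assumed to be loaded beforehand,
# e.g. with the helpers in axolotl.utils.
backend = SimpleRunner(random_seed=42)
search = RandomSearch(problem_description=problem, backend=backend)

# Search for up to 600 seconds, then fit the best pipeline found.
fitted_pipeline, pipeline_result = search.search_fit(
    input_data=[dataset], time_limit=600)
```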

axolotl/axolotl/__init__.py (+2, -0)

@@ -0,0 +1,2 @@
__version__ = 'devel'
__description__ = 'Automated Machine Learning Framework'

axolotl/axolotl/algorithms/__init__.py (+0, -0)


axolotl/axolotl/algorithms/autokeras_integration/__init__.py (+82, -0)

@@ -0,0 +1,82 @@
from d3m.metadata.pipeline import Pipeline

from axolotl.algorithms.autokeras_integration.constants import OMIT_LAYERS, step_function
from axolotl.algorithms.autokeras_integration.steps import set_learner, set_prediction, set_data, \
set_loss


def keras2pipeline(keras_model, batch_size=32):
# Creating pipeline
from tensorflow.python.keras.activations import softmax
pipeline_description = Pipeline()

pipeline_description.add_input(name='inputs')

set_data(pipeline_description)
set_loss(pipeline_description)

offset = len(pipeline_description.steps)

previous_layer_ids = get_previous_layer_ids(keras_model)

layers = keras_model.layers

step_id = 0
layer_to_step_id = {}

total_layer_num = len(layers)
for i, layer in enumerate(layers):
cls_name = get_layer_class_name(layer)
if cls_name in OMIT_LAYERS:
continue
layer_id = get_layer_id(layer)
if len(previous_layer_ids[layer_id]) > 0:
layer.previous_layer_ids = tuple(
layer_to_step_id[i] + offset for i in previous_layer_ids[layer_id]
)
else:
layer.previous_layer_ids = [None]
# The JPL wrapper does not provide a Softmax layer, so as a workaround
# we set softmax as the activation of the final Dense layer.
if i == total_layer_num - 2 and cls_name == 'Dense':
layer.activation = softmax
d3m_step = step_function[cls_name](step_id, layer)
pipeline_description.add_step(d3m_step)
layer_to_step_id[layer_id] = step_id
step_id += 1

set_learner(pipeline_description, batch_size)
set_prediction(pipeline_description)
pipeline_description.add_output(
name='output predictions', data_reference=f"steps.{len(pipeline_description.steps) - 1}.produce")

return pipeline_description


def get_previous_layer_ids(keras_model):
from tensorflow.python.util import nest
model = keras_model
layers = model.layers

previous_layer_ids = {}
for layer in layers:
layer_id = str(id(layer))
previous_layer_ids[layer_id] = set()
for i, node in enumerate(layer._inbound_nodes):
node_key = layer.name + '_ib-' + str(i)
if node_key in model._network_nodes:
for inbound_layer in nest.flatten(node.inbound_layers):
inbound_cls_name = get_layer_class_name(inbound_layer)
inbound_layer_id = get_layer_id(inbound_layer)
if inbound_cls_name in OMIT_LAYERS:
previous_layer_ids[layer_id].update(previous_layer_ids[inbound_layer_id])
else:
previous_layer_ids[layer_id].add(inbound_layer_id)
return previous_layer_ids


def get_layer_id(layer):
return str(id(layer))


def get_layer_class_name(layer):
return layer.__class__.__name__
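
A hedged usage sketch of `keras2pipeline` (the trained model is an assumption; `Pipeline.to_json` is standard d3m serialization):
```python
# Convert a trained Keras model into a d3m pipeline description (sketch).
keras_model = clf.export_model()  # e.g. a trained autokeras ImageClassifier (assumed)
pipeline_description = keras2pipeline(keras_model, batch_size=32)
print(pipeline_description.to_json())
```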

axolotl/axolotl/algorithms/autokeras_integration/block.py (+205, -0)

@@ -0,0 +1,205 @@
from d3m import index
from d3m.metadata.pipeline import PrimitiveStep
from d3m.metadata.base import ArgumentType


class Block:
def __init__(self, block_id, primitive, previous_layer_id):
self.block_id = block_id
self.primitive = primitive
self.previous_layer_id = previous_layer_id

def get_step(self):
step = PrimitiveStep(primitive=index.get_primitive(self.primitive))
if self.previous_layer_id is not None:
step.add_hyperparameter(name='previous_layer', argument_type=ArgumentType.PRIMITIVE,
data=self.previous_layer_id)
return step


class Conv(Block):
def __init__(self, filters, kernel_size, strides, padding, block_id, primitive, previous_layer_id):
super(Conv, self).__init__(block_id, primitive, previous_layer_id)
self.filters = filters
self.kernel_size = kernel_size[0]
self.strides = strides[0]
self.padding = 'same' if padding else 'valid'

def get_step(self):
step = super().get_step()
step.add_hyperparameter(name='filters', argument_type=ArgumentType.VALUE, data=self.filters)
step.add_hyperparameter(name='kernel_size', argument_type=ArgumentType.VALUE, data=self.kernel_size)
step.add_hyperparameter(name='strides', argument_type=ArgumentType.VALUE, data=self.strides)
step.add_hyperparameter(name='padding', argument_type=ArgumentType.VALUE, data=self.padding)
return step


class Conv1D(Conv):
def __init__(self, block_id, filters=10, kernel_size=2, strides=1, padding='valid', previous_layer_id=None):
super(Conv1D, self).__init__(filters, kernel_size, strides, padding, block_id,
"d3m.primitives.layer.convolution_1d.KerasWrap", previous_layer_id)


class Conv2D(Conv):
def __init__(self, block_id, filters=10, kernel_size=2, strides=1, padding='valid', previous_layer_id=None):
super(Conv2D, self).__init__(filters, kernel_size, strides, padding, block_id,
"d3m.primitives.layer.convolution_2d.KerasWrap", previous_layer_id)


class Conv3D(Conv):
def __init__(self, block_id, filters=10, kernel_size=2, strides=1, padding='valid', previous_layer_id=None):
super(Conv3D, self).__init__(filters, kernel_size, strides, padding, block_id,
"d3m.primitives.layer.convolution_3d.KerasWrap", previous_layer_id)


class Dense(Block):
def __init__(self, block_id, units=120, activation='linear', previous_layer_id=None):
super(Dense, self).__init__(block_id, "d3m.primitives.layer.dense.KerasWrap", previous_layer_id)
self.units = units
self.activation = activation.__name__.lower()

def get_step(self):
step = super().get_step()
step.add_hyperparameter(name='units', argument_type=ArgumentType.VALUE, data=self.units)
step.add_hyperparameter(name='activation', argument_type=ArgumentType.VALUE, data=self.activation)
return step


class BatchNorm2D(Block):
def __init__(self, block_id, previous_layer_id):
super(BatchNorm2D, self).__init__(block_id, "d3m.primitives.layer.batch_normalization.KerasWrap",
previous_layer_id)

def get_step(self):
step = super().get_step()
return step


class MaxPooling(Block):
def __init__(self, pool_size, strides, padding, block_id, primitive, previous_layer_id):
super(MaxPooling, self).__init__(block_id, primitive, previous_layer_id)
self.pool_size = pool_size
self.strides = strides[0]
self.padding = 'same' if padding else 'valid'

def get_step(self):
step = super().get_step()
step.add_hyperparameter(name='pool_size', argument_type=ArgumentType.VALUE, data=self.pool_size)
step.add_hyperparameter(name='strides', argument_type=ArgumentType.VALUE, data=self.strides)
step.add_hyperparameter(name='padding', argument_type=ArgumentType.VALUE, data=self.padding)
return step


class MaxPooling1D(MaxPooling):
def __init__(self, block_id, pool_size=(2, 2), strides=(1, 1), padding='valid', previous_layer_id=None):
super(MaxPooling1D, self).__init__(pool_size, strides, padding, block_id,
"d3m.primitives.layer.max_pooling_1d.KerasWrap", previous_layer_id)


class MaxPooling2D(MaxPooling):
def __init__(self, block_id, pool_size=(2, 2), strides=(1, 1), padding='valid', previous_layer_id=None):
super(MaxPooling2D, self).__init__(pool_size, strides, padding, block_id,
"d3m.primitives.layer.max_pooling_2d.KerasWrap", previous_layer_id)


class MaxPooling3D(MaxPooling):
def __init__(self, block_id, pool_size=(2, 2), strides=(1, 1), padding='valid', previous_layer_id=None):
super(MaxPooling3D, self).__init__(pool_size, strides, padding, block_id,
"d3m.primitives.layer.max_pooling_3d.KerasWrap", previous_layer_id)


class AvgPooling(Block):
def __init__(self, pool_size, strides, padding, block_id, primitive, previous_layer_id):
super(AvgPooling, self).__init__(block_id, primitive, previous_layer_id)
self.pool_size = pool_size[0]
self.strides = strides[0]
self.padding = 'same' if padding else 'valid'

def get_step(self):
step = super().get_step()
step.add_hyperparameter(name='pool_size', argument_type=ArgumentType.VALUE, data=self.pool_size)
step.add_hyperparameter(name='strides', argument_type=ArgumentType.VALUE, data=self.strides)
step.add_hyperparameter(name='padding', argument_type=ArgumentType.VALUE, data=self.padding)
return step


class AvgPooling1D(AvgPooling):
def __init__(self, block_id, pool_size=(2, 2), strides=(1, 1), padding='valid', previous_layer_id=None):
super(AvgPooling1D, self).__init__(pool_size, strides, padding, block_id,
"d3m.primitives.layer.average_pooling_1d.KerasWrap", previous_layer_id)


class AvgPooling2D(AvgPooling):
def __init__(self, block_id, pool_size=(2, 2), strides=(1, 1), padding='valid', previous_layer_id=None):
super(AvgPooling2D, self).__init__(pool_size, strides, padding, block_id,
"d3m.primitives.layer.average_pooling_2d.KerasWrap", previous_layer_id)


class AvgPooling3D(AvgPooling):
def __init__(self, block_id, pool_size=(2, 2), strides=(1, 1), padding='valid', previous_layer_id=None):
super(AvgPooling3D, self).__init__(pool_size, strides, padding, block_id,
"d3m.primitives.layer.average_pooling_3d.KerasWrap", previous_layer_id)


class GlobalAvgPooling2d(Block):
def __init__(self, block_id, data_format='channels_last', previous_layer_id=None):
super(GlobalAvgPooling2d, self).__init__(block_id, "d3m.primitives.layer.global_average_pooling_2d.KerasWrap",
previous_layer_id=previous_layer_id)
self.data_format = data_format

def get_step(self):
step = super().get_step()
step.add_hyperparameter(name='data_format', argument_type=ArgumentType.VALUE, data=self.data_format)
return step


# JPL does not have such primitives,
# class GlobalMaxPooling2d(MaxPooling2D):
# def __init__(self, block_id, input_shape, previous_layer_id):
# kernel_size = input_shape[0]
# super(GlobalMaxPooling2d, self).__init__(block_id, kernel_size, previous_layer_id=previous_layer_id)


class Dropout(Block):
def __init__(self, block_id, rate=0.2, previous_layer_id=None):
super(Dropout, self).__init__(block_id, "d3m.primitives.layer.dropout.KerasWrap", previous_layer_id)
self.rate = rate

def get_step(self):
step = super().get_step()
step.add_hyperparameter(name='rate', argument_type=ArgumentType.VALUE, data=self.rate)
return step


class Flatten(Block):
def __init__(self, block_id, previous_layer_id):
super(Flatten, self).__init__(block_id, "d3m.primitives.layer.flatten.KerasWrap", previous_layer_id)


class Add(Block):
def __init__(self, block_id, previous_layer_ids):
super(Add, self).__init__(block_id, "d3m.primitives.layer.add.KerasWrap", None)
self.previous_layer_ids = previous_layer_ids

def get_step(self):
step = PrimitiveStep(primitive=index.get_primitive(self.primitive))
step.add_hyperparameter(name='previous_layers', argument_type=ArgumentType.PRIMITIVE,
data=self.previous_layer_ids)
return step


class Concatenate(Block):
def __init__(self, block_id, previous_layer_ids):
super(Concatenate, self).__init__(block_id, "d3m.primitives.layer.concat.KerasWrap", None)
self.previous_layer_ids = previous_layer_ids

def get_step(self):
step = PrimitiveStep(primitive=index.get_primitive(self.primitive))
step.add_hyperparameter(name='previous_layers', argument_type=ArgumentType.PRIMITIVE,
data=self.previous_layer_ids)
return step


class Null(Block):
def __init__(self, block_id):
super(Null, self).__init__(block_id, "d3m.primitives.layer.null.KerasWrap", None)

axolotl/axolotl/algorithms/autokeras_integration/constants.py (+23, -0)

@@ -0,0 +1,23 @@
from .mapping import *

step_function = {
'Dense': fetch_dense_step,
'Conv1D': fetch_conv1D_step,
'Conv2D': fetch_conv2D_step,
'Conv3D': fetch_conv3D_step,
'BatchNormalization': fetch_batch_norm_step,
'MaxPooling2D': fetch_maxpool2d_step,
'Dropout': fetch_dropout_step,
'AvgPooling2D': fetch_avgpool2d_step,
# 'GlobalMaxPooling2d': JPL does not have such primitives,
'GlobalAveragePooling2D': fetch_global_avgpooling_step,
'Flatten': fetch_flatten_step,
'Add': fetch_add_step,
'Concatenate': fetch_concatenate_step,
'Null': fetch_null_step,
# 'Subtract': not implemented
}

ACTIVATIONS = {'ReLU'}
OMIT_LAYERS = {'InputLayer', 'Normalization', 'ReLU', 'ZeroPadding2D', 'Softmax', 'Activation'}
FORWARD_LAYERS = {'Dense', 'Conv1d', 'Conv2d', 'Conv3d'}

axolotl/axolotl/algorithms/autokeras_integration/mapping.py (+122, -0)

@@ -0,0 +1,122 @@
from .block import *


def fetch_conv1D_step(block_id, layer):
return Conv1D(
block_id,
layer.filters,
layer.kernel_size,
layer.strides,
layer.padding,
layer.previous_layer_ids[0]
).get_step()


def fetch_conv2D_step(block_id, layer):
return Conv2D(
block_id,
layer.filters,
layer.kernel_size,
layer.strides,
layer.padding,
layer.previous_layer_ids[0]
).get_step()


def fetch_conv3D_step(block_id, layer):
return Conv3D(
block_id,
layer.filters,
layer.kernel_size,
layer.strides,
layer.padding,
layer.previous_layer_ids[0]
).get_step()


def fetch_dense_step(block_id, layer):
return Dense(
block_id,
layer.units,
layer.activation,
layer.previous_layer_ids[0]
).get_step()


def fetch_batch_norm_step(block_id, layer):
return BatchNorm2D(
block_id,
layer.previous_layer_ids[0]
).get_step()


def fetch_maxpool2d_step(block_id, layer):
return MaxPooling2D(
block_id,
layer.pool_size,
layer.strides,
layer.padding,
layer.previous_layer_ids[0]
).get_step()


def fetch_avgpool2d_step(block_id, layer):
return AvgPooling2D(
block_id,
layer.pool_size,
layer.strides,
layer.padding,
layer.previous_layer_ids[0]
).get_step()


def fetch_dropout_step(block_id, layer):
return Dropout(
block_id,
layer.rate,
layer.previous_layer_ids[0]
).get_step()


# JPL does not have such primitives,
# def fetch_global_maxpooling_step(block_id, layer):
# return GlobalMaxPooling2d(
# block_id,
# layer.input.shape,
# layer.previous_layer_ids[0]
# ).get_step()


def fetch_global_avgpooling_step(block_id, layer):
return GlobalAvgPooling2d(
block_id,
layer.data_format,
layer.previous_layer_ids[0]
).get_step()


def fetch_flatten_step(block_id, layer):
return Flatten(
block_id,
layer.previous_layer_ids[0]
).get_step()


def fetch_add_step(block_id, layer):
return Add(
block_id,
layer.previous_layer_ids
).get_step()


def fetch_concatenate_step(block_id, layer):
return Concatenate(
block_id,
layer.previous_layer_ids
).get_step()


def fetch_null_step(block_id):
return Null(
block_id,
).get_step()

axolotl/axolotl/algorithms/autokeras_integration/steps.py (+126, -0)

@@ -0,0 +1,126 @@
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import PrimitiveStep

import d3m.primitives.data_preprocessing.image_reader
import d3m.primitives.data_transformation.denormalize
import d3m.primitives.data_transformation.dataset_to_dataframe
import d3m.primitives.data_transformation.construct_predictions
import d3m.primitives.data_transformation.extract_columns_by_semantic_types
import d3m.primitives.data_transformation.replace_semantic_types

import d3m.primitives.loss_function.categorical_crossentropy
import d3m.primitives.loss_function.categorical_accuracy

import d3m.primitives.learner.model
import d3m.primitives.data_wrangling.batching

LOSS_SETUP_IDX = IP_STEP = OP_STEP = READER_STEP = -1
BATCH_SIZE = 40


def set_data(pipeline_description):
global IP_STEP, OP_STEP, READER_STEP

# denormalize
denorm_step_idx = 0
step = PrimitiveStep(
primitive_description=d3m.primitives.data_transformation.denormalize.Common.metadata.query())
step.add_argument(
name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
step.add_output('produce')
pipeline_description.add_step(step)

# dataset_to_dataframe
dataset_to_dataframe_step_idx = len(pipeline_description.steps)
step = PrimitiveStep(
primitive_description=d3m.primitives.data_transformation.dataset_to_dataframe.Common.metadata.query())
step.add_argument(
name='inputs', argument_type=ArgumentType.CONTAINER,
data_reference='steps.{}.produce'.format(denorm_step_idx))
step.add_output('produce')
pipeline_description.add_step(step)

# extract targets
extract_step_idx = len(pipeline_description.steps)
extract_targets = PrimitiveStep(
d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common.metadata.query())
extract_targets.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
data_reference='steps.{}.produce'.format(dataset_to_dataframe_step_idx))
extract_targets.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
extract_targets.add_output('produce')
pipeline_description.add_step(extract_targets)

# replace semantic types
# Needed for CIFAR-10
replace_step_idx = len(pipeline_description.steps)
replace_semantic = PrimitiveStep(
d3m.primitives.data_transformation.replace_semantic_types.Common.metadata.query())
replace_semantic.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
data_reference=f'steps.{extract_step_idx}.produce')
replace_semantic.add_hyperparameter(name='to_semantic_types', argument_type=ArgumentType.VALUE,
data=['https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
'https://metadata.datadrivendiscovery.org/types/TrueTarget'])
replace_semantic.add_hyperparameter(name='from_semantic_types', argument_type=ArgumentType.VALUE,
data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
replace_semantic.add_output('produce')
pipeline_description.add_step(replace_semantic)

# image reader
reader_step_idx = len(pipeline_description.steps)
reader = PrimitiveStep(
primitive_description=d3m.primitives.data_preprocessing.image_reader.Common.metadata.query())
reader.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='new')
pipeline_description.add_step(reader)

IP_STEP, OP_STEP, READER_STEP = dataset_to_dataframe_step_idx, replace_step_idx, reader_step_idx


def set_loss(pipeline_description):
global LOSS_SETUP_IDX

LOSS_SETUP_IDX = len(pipeline_description.steps)
step = PrimitiveStep(
primitive_description=d3m.primitives.loss_function.categorical_crossentropy.KerasWrap.metadata.query())
pipeline_description.add_step(step)


def set_learner(pipeline_description, batch_size=BATCH_SIZE):
learner_idx = len(pipeline_description.steps)
step = PrimitiveStep(primitive_description=d3m.primitives.learner.model.KerasWrap.metadata.query())
step.add_hyperparameter(name='loss', argument_type=ArgumentType.PRIMITIVE, data=LOSS_SETUP_IDX)
step.add_hyperparameter(name='model_type', argument_type=ArgumentType.VALUE, data='classification')
step.add_hyperparameter(name='network_last_layer', argument_type=ArgumentType.PRIMITIVE,
data=learner_idx - 1)
step.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='replace')
lr = 0.0001
adam_hypers = d3m.primitives.learner.model.KerasWrap.metadata.get_hyperparams().defaults(path='optimizer.Adam')
adam_hypers = adam_hypers.replace({'lr': lr})
step.add_hyperparameter(name='optimizer', argument_type=ArgumentType.VALUE, data=adam_hypers)
pipeline_description.add_step(step)

bz_loader = PrimitiveStep(primitive_description=d3m.primitives.data_wrangling.batching.TAMU.metadata.query())
bz_loader.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
data_reference=f'steps.{IP_STEP}.produce')
bz_loader.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER,
data_reference='steps.{}.produce'.format(OP_STEP))
bz_loader.add_hyperparameter(name='primitive_reader', argument_type=ArgumentType.PRIMITIVE, data=READER_STEP)
bz_loader.add_hyperparameter(name='primitive_learner', argument_type=ArgumentType.PRIMITIVE, data=learner_idx)
bz_loader.add_hyperparameter(name='batch_size', argument_type=ArgumentType.VALUE, data=batch_size)
bz_loader.add_hyperparameter(name='sampling_method', argument_type=ArgumentType.VALUE, data='random')
bz_loader.add_output('produce')

pipeline_description.add_step(bz_loader)


def set_prediction(pipeline_description):
pred = PrimitiveStep(
primitive_description=d3m.primitives.data_transformation.construct_predictions.Common.metadata.query())
pred.add_argument(
name='inputs', argument_type=ArgumentType.CONTAINER,
data_reference=f"steps.{len(pipeline_description.steps) - 1}.produce"
)
pred.add_argument(name='reference', argument_type=ArgumentType.CONTAINER,
data_reference='steps.{}.produce'.format(IP_STEP))
pred.add_output('produce')
pipeline_description.add_step(pred)

axolotl/axolotl/algorithms/autokeras_search.py (+145, -0)

@@ -0,0 +1,145 @@
import logging
import numpy as np

import autokeras as ak
from d3m import exceptions, index, container
from d3m.metadata import base as metadata_base

from axolotl.algorithms.autokeras_integration import keras2pipeline
from axolotl.algorithms.base import PipelineSearchBase
from axolotl.utils.pipeline import PipelineResult

logger = logging.getLogger(__name__)


class AutoKerasSearch(PipelineSearchBase):

def __init__(self, problem_description, backend,
max_trials=10000, directory='.', epochs=1, batch_size=32, validation_split=0.2):
super(AutoKerasSearch, self).__init__(problem_description, backend, ranking_function=None)

self.clf = ak.ImageClassifier(max_trials=max_trials, seed=self.random_seed, directory=directory)
self.tuner = self.clf.tuner
self.epochs = epochs
self.batch_size = batch_size
self.validation_split = validation_split

def search_fit(self, input_data, time_limit=300, *, expose_values=False):
dataframe = self.get_dataframe(input_data)
y = self.get_y(dataframe)
x = self.get_x(dataframe)

self.clf.fit(x=x, y=y, epochs=self.epochs, batch_size=self.batch_size,
validation_split=self.validation_split)
keras_model = self.clf.export_model()
best_pipeline = keras2pipeline(keras_model, batch_size=self.batch_size)

fitted_pipeline_result = self.backend.fit_pipeline(
problem_description=self.problem_description, pipeline=best_pipeline,
input_data=input_data, expose_outputs=expose_values
)

if fitted_pipeline_result.error is not None:
logger.error('No solution found')
pipeline_result = PipelineResult(pipeline=best_pipeline)
pipeline_result.error = RuntimeError("No solution found")
return pipeline_result

self.best_fitted_pipeline_id = fitted_pipeline_result.fitted_pipeline_id
return fitted_pipeline_result

def mark_columns(self, dataset):
problem_inputs = self.problem_description['inputs']
for problem_input in problem_inputs:
for target in problem_input.get('targets', []):
if target['resource_id'] not in dataset:
raise exceptions.NotFoundError(
"Error marking target column: dataset does not contain resource with resource ID '{resource_id}'.".format(
resource_id=target['resource_id'],
),
)
if not isinstance(dataset[target['resource_id']], container.DataFrame):
raise TypeError(
"Error marking target column: resource '{resource_id}' is not a DataFrame.".format(
resource_id=target['resource_id'],
),
)
if not 0 <= target['column_index'] < dataset[target['resource_id']].shape[1]:
raise ValueError(
"Error marking target column: resource '{resource_id}' does not have a column with index '{column_index}'.".format(
resource_id=target['resource_id'],
column_index=target['column_index'],
),
)

dataset.metadata = dataset.metadata.add_semantic_type(
(target['resource_id'], metadata_base.ALL_ELEMENTS, target['column_index']),
'https://metadata.datadrivendiscovery.org/types/Target',
)
dataset.metadata = dataset.metadata.add_semantic_type(
(target['resource_id'], metadata_base.ALL_ELEMENTS, target['column_index']),
'https://metadata.datadrivendiscovery.org/types/TrueTarget',
)
# If column is marked as a target, it cannot be attribute as well.
# This allows one to define in problem description otherwise attribute columns as targets.
# See: https://gitlab.com/datadrivendiscovery/d3m/issues/265
dataset.metadata = dataset.metadata.remove_semantic_type(
(target['resource_id'], metadata_base.ALL_ELEMENTS, target['column_index']),
'https://metadata.datadrivendiscovery.org/types/Attribute',
)
return dataset

def get_dataframe(self, input_data):
# denormalize
denormalize = index.get_primitive('d3m.primitives.data_transformation.denormalize.Common')
hyperparams_class = denormalize.metadata.get_hyperparams()
primitive = denormalize(hyperparams=hyperparams_class.defaults())
dataset = primitive.produce(inputs=input_data[0]).value

# Add Target column into dataset
dataset = self.mark_columns(dataset)

# dataset to dataframe
dataset_dataframe = index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common')
hyperparams_class = dataset_dataframe.metadata.get_hyperparams()
primitive = dataset_dataframe(hyperparams=hyperparams_class.defaults())
dataframe = primitive.produce(inputs=dataset).value

return dataframe

def get_y(self, dataframe):
# extract targets
get_columns_semantic = index.get_primitive(
'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')
hyperparams_class = get_columns_semantic.metadata.get_hyperparams()
primitive = get_columns_semantic(
hyperparams=hyperparams_class.defaults().replace(
{
'semantic_types': (
'https://metadata.datadrivendiscovery.org/types/TrueTarget',
'https://metadata.datadrivendiscovery.org/types/Target',
'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
)
}
)
)
targets = primitive.produce(inputs=dataframe).value
y = np.array(targets, dtype=np.int64)
return y

def get_x(self, dataframe):
# reading images
image_reader = index.get_primitive('d3m.primitives.data_preprocessing.image_reader.Common')
hyperparams_class = image_reader.metadata.get_hyperparams()
primitive = image_reader(hyperparams=hyperparams_class.defaults().replace(
{'return_result': 'replace'})
)
columns_to_use = primitive._get_columns(dataframe.metadata)
column_index = columns_to_use[0]
temp = [
primitive._read_filename(column_index, dataframe.metadata.query((row_index, column_index)), value)
for row_index, value in enumerate(dataframe.iloc[:, column_index])
]
x = np.array(temp, dtype=np.float64)
return x

axolotl/axolotl/algorithms/base.py (+241, -0)

@@ -0,0 +1,241 @@
import abc
import uuid
import logging
import time
import typing

from d3m.metadata.problem import Problem
from d3m.metadata.pipeline import Pipeline
from d3m import runtime as runtime_module
from d3m import container
from d3m.metadata.base import Context
from d3m import utils as d3m_utils
from d3m.metadata import pipeline_run as pipeline_run_module

from axolotl.backend.base import RunnerBase
from axolotl.utils.pipeline import PipelineResult
from axolotl.utils.schemas import ContainerType
from axolotl.utils import resources as resources_module

logger = logging.getLogger(__name__)


class PipelineSearchBase:
"""
    Base class for pipeline searchers. It provides the common interface that pipeline
    searchers must implement to be integrated with the system.

    Nothing should be computed or initialized in the constructor beyond assigning variables;
    everything else should be computed when the search starts.

Parameters
----------
problem_description : Problem
A problem description.
backend : RunnerBase
An instance of a backend class.
    primitives_blocklist : typing.Sequence[str]
        A list of primitive names (strings) to avoid using in pipelines.
    ranking_function : typing.Callable
        A function that takes a dataframe of scores as input and generates a rank; smaller is better.


Attributes
----------
backend : RunnerBase
An instance of a backend class.
random_seed : int
Random seed passed to the constructor.
volumes_dir : str
Path to a directory with static files required by primitives.
scratch_dir : str
Path to a directory to store any temporary files needed during execution.
    ranking_function : typing.Callable
        A function that takes a dataframe of scores as input and generates a rank; smaller is better.
problem_description : Problem
A problem description.
    primitives_blocklist : typing.Sequence[str]
        A list of primitive names (strings) to avoid using in pipelines.

    history : typing.List[PipelineResult]
        A list of all the evaluated pipelines with their execution results and performance.
"""

def __init__(self,
problem_description: Problem, backend: RunnerBase, *,
primitives_blocklist: typing.Sequence[str] = None, ranking_function: typing.Callable = None
) -> None:
self.search_id = str(uuid.uuid4())
self.backend = backend
self.random_seed = backend.random_seed
self.volumes_dir = backend.volumes_dir
self.scratch_dir = backend.scratch_dir
self.ranking_function = ranking_function

self.problem_description: Problem = problem_description
self.primitives_blocklist: typing.Sequence[str] = primitives_blocklist

self.history: typing.List[PipelineResult] = []

# missing typing
self.best_fitted_pipeline_id: str = None
self.input_data: typing.Sequence[ContainerType] = None

with d3m_utils.silence():
self.runtime_environment = pipeline_run_module.RuntimeEnvironment()

def search(self, time_limit: float):
"""
This method executes the whole search, by calling the ``_search`` method multiple times
as long as there is time left and put the results on the history.

Parameters
----------
time_limit : float
            Time limit for the search, in seconds.
"""
time_start = time.time()
largest_iteration = 0

i = 0

while True:
i += 1
time_left = time_limit - (time.time() - time_start)

if time_left < 5:
logger.info('-- Time out --')
break

if time_left - largest_iteration < 5:
                logger.info('-- Time out -- time left {:.2f}s; next iteration could take over {:.2f}s'.format(
                    time_left, largest_iteration))
break

start_iteration_time = time.time()
results = self._search(time_left=time_left)
self.history += results
current_iteration_time = time.time() - start_iteration_time

if largest_iteration < current_iteration_time:
largest_iteration = current_iteration_time

def search_fit(self, input_data: typing.Sequence[ContainerType], time_limit: float = 300, *,
expose_values: bool = False) -> typing.Tuple[runtime_module.Runtime, PipelineResult]:
"""
        This method runs the search and then fits the best-ranked pipeline found in the history.

Parameters
----------
        input_data : typing.Sequence[ContainerType]
            A list of D3M containers to be used as the pipeline input.

        time_limit : float
            The time limit (in seconds) to be used for the search.

        expose_values : bool
            A flag that exposes all intermediate results of the pipeline during fitting.
"""
self.input_data = input_data
self.search(time_limit)

        best_pipeline = None
        for pipeline_result in self.history:
            if pipeline_result.error is not None:
                continue
            if best_pipeline is None or pipeline_result.rank < best_pipeline.rank:
                best_pipeline = pipeline_result

        if best_pipeline is None:
            logger.error('No solution found')
            pipeline_result = PipelineResult(fitted_pipeline_id='')
            pipeline_result.error = RuntimeError('No solution found')
            return None, pipeline_result

return self.fit(best_pipeline.pipeline, input_data, expose_values)

def fit(self, pipeline: Pipeline, input_data: typing.Sequence[container.Dataset],
expose_outputs: bool = False) -> typing.Tuple[runtime_module.Runtime, PipelineResult]:

pipeline_result = PipelineResult(pipeline=pipeline)

runtime, output, result = runtime_module.fit(
pipeline=pipeline, inputs=input_data, problem_description=self.problem_description, context=Context.TESTING,
hyperparams=None, random_seed=self.random_seed, volumes_dir=self.volumes_dir,
runtime_environment=self.runtime_environment, scratch_dir=self.scratch_dir, expose_produced_outputs=expose_outputs
)
if result.has_error():
pipeline_result.status = "ERRORED"
pipeline_result.error = result.error
else:
pipeline_result.status = "COMPLETED"

pipeline_result.exposed_outputs = result.values
pipeline_result.output = output

return runtime, pipeline_result

def produce(self, fitted_pipeline: runtime_module.Runtime, input_data: typing.Sequence[container.Dataset],
expose_outputs: bool = False) -> PipelineResult:
pipeline_result = PipelineResult(fitted_pipeline_id='')

with d3m_utils.silence():
output, result = runtime_module.produce(
fitted_pipeline=fitted_pipeline, test_inputs=input_data,
expose_produced_outputs=expose_outputs
)

if result.has_error():
pipeline_result.status = "ERRORED"
pipeline_result.error = result.error
else:
pipeline_result.status = "COMPLETED"

pipeline_result.exposed_outputs = result.values
pipeline_result.output = output
return pipeline_result

@abc.abstractmethod
def _search(self, time_left: float) -> typing.Sequence[PipelineResult]:
"""
        A method where the search is implemented. The search algorithm should be
        iteration-oriented; each call should finish by returning the results of the
        pipelines evaluated during that iteration.

Parameters
----------
time_left : float
            Time left for the iteration, in seconds.

Returns
-------
typing.Sequence[PipelineResult]
            A list of pipeline results with the information of the pipelines run during the iteration.

"""

def pretty_print(self, deep: bool = False):
"""
        Pretty-prints the searcher's state; with ``deep=True``, nested problems, pipelines and results are expanded.
"""
from pprint import pprint

def simplify_value(input_value):
if isinstance(input_value, Problem):
return input_value.to_simple_structure()
elif isinstance(input_value, Pipeline):
return input_value.to_json_structure()
elif isinstance(input_value, PipelineResult):
return vars(input_value)
            elif isinstance(input_value, dict):
                new_value = {}
                for nested_variable, nested_val in input_value.items():
                    new_value[nested_variable] = simplify_value(nested_val)
                return new_value
            # Plain values (strings, numbers, etc.) pass through unchanged.
            return input_value

class_instance = vars(self)
if deep:
class_instance = simplify_value(class_instance)

pprint(class_instance)
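
A minimal sketch of a concrete searcher built on top of this base class (hedged: `ListSearch` and its `pipelines` argument are illustrative assumptions, not part of the API above):

class ListSearch(PipelineSearchBase):
    """Fits a fixed list of candidate pipelines, one per iteration."""

    def __init__(self, problem_description, backend, *, pipelines, **kwargs):
        super().__init__(problem_description=problem_description, backend=backend, **kwargs)
        self._pending = list(pipelines)  # assumed: pre-built d3m Pipeline objects

    def _search(self, time_left):
        if not self._pending:
            return []  # nothing left to try; ``search`` keeps looping until the time limit
        pipeline = self._pending.pop(0)
        # ``fit`` returns (runtime, PipelineResult); only the result enters the history.
        _, pipeline_result = self.fit(pipeline, self.input_data)
        return [pipeline_result]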

+ 27
- 0
axolotl/axolotl/algorithms/bayesian_search.py

@@ -0,0 +1,27 @@
import enum

from axolotl.algorithms.tuners.bayesian_oracle import BayesianOptimizationOracle
from axolotl.algorithms.tuners.tunable_base import TunableBase


class BayesianSearch(TunableBase):
def __init__(self, problem_description, backend, primitives_blocklist=None,
max_trials=10000, directory='.', num_initial_points=None, num_eval_trials=None):
super(BayesianSearch, self).__init__(problem_description, backend,
primitives_blocklist=primitives_blocklist, num_eval_trials=num_eval_trials)
self.directory = directory
        self.project_name = 'bayesian_search'

self.objective = self.problem_description['problem']['performance_metrics'][0]['metric']
if isinstance(self.objective, enum.Enum):
self.objective = self.objective.name

self.oracle = BayesianOptimizationOracle(
objective=self.objective,
max_trials=max_trials, # pre-defined number,
seed=self.random_seed, # seed
hyperparameters=self.hyperparameters,
num_initial_points=num_initial_points,
)
self.oracle._set_project_dir(
self.directory, self.project_name, overwrite=True)
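
A hedged usage sketch (assumptions: `problem` and `dataset` were built with the axolotl data/problem utilities, and `SimpleRunner` is an assumed name for the backend class in axolotl/backend/simple.py):

from axolotl.backend.simple import SimpleRunner

backend = SimpleRunner(random_seed=42)
search = BayesianSearch(problem_description=problem, backend=backend, max_trials=100)
fitted_runtime, best_result = search.search_fit(input_data=[dataset], time_limit=600)
print(best_result.scores)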

+ 1086
- 0
axolotl/axolotl/algorithms/data_driven_search.py
File diff suppressed because it is too large


+ 87
- 0
axolotl/axolotl/algorithms/dummy.py

@@ -0,0 +1,87 @@
import json
import uuid

from d3m.metadata.pipeline import Pipeline

from axolotl.algorithms.base import PipelineSearchBase
from axolotl.utils import schemas as schemas_utils, pipeline as pipeline_utils


def dummy_ranking_function(pipeline_result):
if pipeline_result.status == 'COMPLETED':
summarize_performance = schemas_utils.summarize_performance_metrics(pipeline_result.scores)
rank = schemas_utils.compute_rank(summarize_performance)
pipeline_result.rank = rank
return pipeline_result


class DummySearch(PipelineSearchBase):
def __init__(self, problem_description, backend, *, primitives_blocklist=None, ranking_function=None):
super().__init__(problem_description=problem_description, backend=backend,
primitives_blocklist=primitives_blocklist, ranking_function=ranking_function)
if self.ranking_function is None:
self.ranking_function = dummy_ranking_function
self.task_description = schemas_utils.get_task_description(self.problem_description['problem']['task_keywords'])

self.available_pipelines = self._return_pipelines(
self.task_description['task_type'], self.task_description['task_subtype'], self.task_description['data_types'])

# TODO update this to be defined on problem/metrics terms
self.data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
self.metrics = self.problem_description['problem']['performance_metrics']

self.scoring_pipeline = schemas_utils.get_scoring_pipeline()
self.data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']

self.offset = 10
self.current_pipeline_index = 0

def _search(self, time_left):
pipelines_to_eval = self.available_pipelines[self.current_pipeline_index: self.current_pipeline_index+self.offset]
self.current_pipeline_index += self.offset
pipeline_results = self.backend.evaluate_pipelines(
problem_description=self.problem_description, pipelines=pipelines_to_eval, input_data=self.input_data,
metrics=self.metrics, data_preparation_pipeline=self.data_preparation_pipeline,
scoring_pipeline=self.scoring_pipeline, data_preparation_params=self.data_preparation_params)

return [self.ranking_function(pipeline_result) for pipeline_result in pipeline_results]

def _return_pipelines(self, task_type, task_subtype, data_type):
"""
        A function that returns predefined pipelines for a given task type.

        Returns
        -------
        A list of predefined pipelines for the task type, or an empty list if none are available.

"""
# TODO incorporate task_subtype and data_type for future problems
with open(schemas_utils.PIPELINES_DB_DIR) as file:
possible_pipelines_dict = json.load(file)

        if task_type not in possible_pipelines_dict:
            self.pipeline_left = False
            # Return an empty list so callers can slice the result safely.
            return []

possible_pipelines_dict = possible_pipelines_dict[task_type]

if not possible_pipelines_dict:
return []

possible_pipelines = []
for pipeline_dict in possible_pipelines_dict:
try:
pipeline = pipeline_utils.load_pipeline(pipeline_dict)

# update id
pipeline.id = str(uuid.uuid4())

# update time
pipeline.created = Pipeline().created

possible_pipelines.append(pipeline)
            except Exception:
                # Skip pipelines that fail to load.
                continue

return possible_pipelines
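
For reference, a sketch of the JSON shape that ``_return_pipelines`` expects from the pipelines DB (hedged: illustrative only; the real task-type keys and pipeline payloads live in the file at schemas_utils.PIPELINES_DB_DIR):

{
    "CLASSIFICATION": [
        {"id": "<pipeline uuid>", "steps": ["<pipeline steps json>"]}
    ],
    "REGRESSION": []
}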

+ 27
- 0
axolotl/axolotl/algorithms/random_search.py

@@ -0,0 +1,27 @@
import enum

from axolotl.algorithms.tuners.random_search_oracle import RandomSearchOracle
from axolotl.algorithms.tuners.tunable_base import TunableBase


class RandomSearch(TunableBase):
def __init__(self, problem_description, backend, primitives_blocklist=None,
max_trials=10000, directory='.', num_eval_trials=None):
super(RandomSearch, self).__init__(problem_description, backend,
primitives_blocklist=primitives_blocklist, num_eval_trials=num_eval_trials)
self.directory = directory
self.project_name = 'random_search'

self.objective = self.problem_description['problem']['performance_metrics'][0]['metric']
if isinstance(self.objective, enum.Enum):
self.objective = self.objective.name

self.oracle = RandomSearchOracle(
objective=self.objective,
max_trials=max_trials, # pre-defined number,
seed=self.random_seed, # seed
hyperparameters=self.hyperparameters,
)
self.oracle._set_project_dir(
self.directory, self.project_name, overwrite=True)


+ 0
- 0
axolotl/axolotl/algorithms/tuners/__init__.py


+ 198
- 0
axolotl/axolotl/algorithms/tuners/bayesian_oracle.py

@@ -0,0 +1,198 @@
import numpy as np

from scipy import optimize as scipy_optimize
from sklearn import exceptions

from d3m.metadata import hyperparams
from kerastuner import Objective
from kerastuner.tuners.bayesian import BayesianOptimizationOracle as KerasBayesian
from kerastuner.engine import trial as trial_lib

from axolotl.algorithms.tuners.hyperparameters import HyperParameters, \
value_to_cumulative_prob, cumulative_prob_to_value
from axolotl.algorithms.tuners.oracle import infer_metric_direction, random_values, patch_invalid_hyperparameters


class BayesianOptimizationOracle(KerasBayesian):
"""
Bayesian optimization oracle.
"""

def __init__(self,
objective,
max_trials,
num_initial_points=None,
alpha=1e-4,
beta=2.6,
seed=None,
hyperparameters=None,
allow_new_entries=True,
tune_new_entries=True):
direction = infer_metric_direction(objective)
objective = Objective(name=objective, direction=direction)
super(BayesianOptimizationOracle, self).__init__(
objective=objective,
max_trials=max_trials,
num_initial_points=num_initial_points,
alpha=alpha,
beta=beta,
seed=seed,
hyperparameters=hyperparameters,
allow_new_entries=allow_new_entries,
tune_new_entries=tune_new_entries,
)
self.num_complete_trials = 0
self.sorted_candidates = []

# TODO how to save a trial
def _save_trial(self, trial):
pass

def get_state(self):
# `self.trials` are saved in their own, Oracle-agnostic files.
# Just save the IDs for ongoing trials, since these are in `trials`.
state = {}
state['ongoing_trials'] = {
tuner_id: trial.trial_id
for tuner_id, trial in self.ongoing_trials.items()}
# Hyperparameters are part of the state because they can be added to
# during the course of the search.
state['hyperparameters'] = str(self.hyperparameters.get_config())

state.update({
'num_initial_points': self.num_initial_points,
'alpha': self.alpha,
'beta': self.beta,
})
return state

def _random_values(self):
"""Fills the hyperparameter space with random values.

Returns:
A dictionary mapping parameter names to suggested values.
"""

values, seed_state = random_values(hyperparameters=self.hyperparameters,
seed_state=self._seed_state,
tried_so_far=self._tried_so_far,
max_collisions=self._max_collisions,
)
self._seed_state = seed_state
return values

def _nonfixed_space(self):
return [hp for hp in self.hyperparameters.space
if not isinstance(hp, hyperparams.Constant)]

def _vector_to_values(self, vector):
hps = HyperParameters()
vector_index = 0
for hp in self.hyperparameters.space:
hps.merge([hp])
if isinstance(hp, hyperparams.Constant):
value = hp.get_default()
else:
prob = vector[vector_index]
vector_index += 1
value = cumulative_prob_to_value(prob, hp)

if hps.is_active(hp):
hps.values[hp.name] = value
        patch_invalid_hyperparameters(hps)
return hps.values

def _vectorize_trials(self):
x = []
y = []
        ongoing_trials = set(self.ongoing_trials.values())
for trial in self.trials.values():
# Create a vector representation of each Trial's hyperparameters.
trial_hps = trial.hyperparameters
vector = []
for hp in self._nonfixed_space():
# For hyperparameters not present in the trial (either added after
# the trial or inactive in the trial), set to default value.
if trial_hps.is_active(hp):
trial_value = trial_hps.values[hp.name]
else:
trial_value = hp.default

# Embed an HP value into the continuous space [0, 1].
prob = value_to_cumulative_prob(trial_value, hp)
vector.append(prob)

if trial in ongoing_trials:
# "Hallucinate" the results of ongoing trials. This ensures that
# repeat trials are not selected when running distributed.
x_h = np.array(vector).reshape((1, -1))
y_h_mean, y_h_std = self.gpr.predict(x_h, return_std=True)
# Give a pessimistic estimate of the ongoing trial.
score = y_h_mean[0] + y_h_std[0]
elif trial.status == 'COMPLETED':
score = trial.score
# Always frame the optimization as a minimization for scipy.minimize.
if self.objective.direction == 'max':
score = -1*score
else:
continue

x.append(vector)
y.append(score)

x = np.array(x)
y = np.array(y)
return x, y

def _populate_space(self, trial_id):
# Generate enough samples before training Gaussian process.
completed_trials = [t for t in self.trials.values()
if t.status == 'COMPLETED']

# Use 3 times the dimensionality of the space as the default number of
# random points.
dimensions = len(self.hyperparameters.space)
num_initial_points = self.num_initial_points or 3 * dimensions
if len(completed_trials) < num_initial_points:
return self._random_populate_space()

if self.num_complete_trials == len(completed_trials) and len(self.sorted_candidates) > 0:
optimal_x = self.sorted_candidates.pop().x
values = self._vector_to_values(optimal_x)
return {'status': trial_lib.TrialStatus.RUNNING,
'values': values}

# track the number of complete trials
self.num_complete_trials = len(completed_trials)

# Fit a GPR to the completed trials and return the predicted optimum values.
x, y = self._vectorize_trials()
try:
self.gpr.fit(x, y)
except exceptions.ConvergenceWarning:
# If convergence of the GPR fails, create a random trial.
return self._random_populate_space()

def _upper_confidence_bound(x):
x = x.reshape(1, -1)
mu, sigma = self.gpr.predict(x, return_std=True)
return mu - self.beta * sigma

num_restarts = 50
bounds = self._get_hp_bounds()
x_seeds = self._random_state.uniform(bounds[:, 0], bounds[:, 1],
size=(num_restarts, bounds.shape[0]))
candidates = [
scipy_optimize.minimize(_upper_confidence_bound,
x0=x_try,
bounds=bounds,
method='L-BFGS-B')
for x_try in x_seeds
]

        # Sorted in descending order of acquisition value so that pop() returns
        # the candidate with the smallest (best) upper confidence bound.
        self.sorted_candidates = sorted(candidates, key=lambda x: x.fun[0], reverse=True)
        optimal_x = self.sorted_candidates.pop().x

values = self._vector_to_values(optimal_x)
return {'status': trial_lib.TrialStatus.RUNNING,
'values': values}

+ 535
- 0
axolotl/axolotl/algorithms/tuners/custom_hps.py

@@ -0,0 +1,535 @@
import sys
from collections import OrderedDict

from d3m.metadata import hyperparams

epsilon = sys.float_info.epsilon

clf_xgboost_config = dict(
n_estimators=hyperparams.UniformInt(
lower=10,
upper=50,
default=20,
description='The number of trees in the forest.',
semantic_types=[
'https://metadata.datadrivendiscovery.org/types/TuningParameter',
'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter',
],
),
n_more_estimators=hyperparams.UniformInt(
lower=10,
upper=50,
default=20,
description='When continuing a fit, it controls how many more trees to add every time.',
semantic_types=[
'https://metadata.datadrivendiscovery.org/types/TuningParameter',
'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter',
],
),
max_depth=hyperparams.UniformInt(
lower=5,
upper=50,
default=30,
lower_inclusive=True,
upper_inclusive=True,
description='The maximum depth of the tree.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
learning_rate=hyperparams.LogUniform(
lower=1e-4,
upper=1e-1,
default=0.05,
lower_inclusive=True,
upper_inclusive=True,
        description='Boosting learning rate (XGBoost\'s "eta").',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
gamma=hyperparams.Constant[float](
default=0.0,
description='Minimum loss reduction required to make a further partition on a leaf node of the tree',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
    min_child_weight=hyperparams.Constant[int](
default=1,
description='Minimum sum of instance weight (hessian) needed in a child. If the tree partition step results '
'in a leaf node with the sum of instance weight less than min_child_weight, then the building '
'process will give up further partitioning ',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
# max_delta_step = hyperparams.Union[Union[int, None]](
# configuration=OrderedDict(
# limit=hyperparams.Bounded[int](
# lower=1,
# upper=None,
# default=1,
# description='Maximum delta step we allow each leaf output to be.'
# ),
# unlimited=hyperparams.Enumeration[int](
# values=[0],
# default=0,
# description='No constraint.',
# ),
# ),
# default='unlimited',
# description='Maximum delta step we allow.',
# semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
# ),
subsample=hyperparams.Constant[float](
default=1.0,
        description='Subsample ratio of the training instances; this will prevent overfitting. Subsampling will occur '
                    'once in every boosting iteration.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
colsample_bytree=hyperparams.Constant[float](
default=1.0,
description='Subsample ratio of columns when constructing each tree. Subsampling will occur once in every '
'boosting iteration',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
colsample_bylevel=hyperparams.Constant[float](
default=1.0,
description='Subsample ratio of columns for each split, in each level. Subsampling will occur each time a new '
'split is made',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
reg_alpha=hyperparams.Uniform(
lower=0.1,
upper=1.0,
default=0.5,
lower_inclusive=True,
upper_inclusive=True,
description='L1 regularization term on weights. Increasing this value will make model more conservative.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
reg_lambda=hyperparams.Uniform(
lower=0.1,
upper=1.0,
default=0.5,
lower_inclusive=True,
upper_inclusive=True,
description='L2 regularization term on weights. Increasing this value will make model more conservative.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
# scale_pos_weight = hyperparams.Bounded[float](
# lower=0,
# upper=None,
# default=1,
# description='Control the balance of positive and negative weights, useful for unbalanced classes',
# semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
# ),
base_score=hyperparams.Bounded[float](
lower=0,
upper=None,
default=0.5,
description='The initial prediction score of all instances, global bias.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
)

dfs_single_tab_config = dict(
max_percent_null=hyperparams.Uniform(
lower=0,
upper=1,
default=0.9,
lower_inclusive=True,
upper_inclusive=True,
        description='The maximum percentage of null values allowed in a returned feature; features with a higher share of nulls are dropped.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)

)

lgbm_clf_config = dict(
n_estimators=hyperparams.UniformInt(
lower=10,
upper=50,
default=20,
description='The number of trees in the forest.',
semantic_types=[
'https://metadata.datadrivendiscovery.org/types/TuningParameter',
'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter',
],
),
n_more_estimators=hyperparams.UniformInt(
lower=10,
upper=50,
default=20,
description='When continuing a fit, it controls how many more trees to add every time.',
semantic_types=[
'https://metadata.datadrivendiscovery.org/types/TuningParameter',
'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter',
],
),
max_depth=hyperparams.UniformInt(
lower=5,
upper=50,
default=30,
lower_inclusive=True,
upper_inclusive=True,
description='The maximum depth of the tree.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
# num_leaves_base=hyperparams.Bounded[float](
# lower=1,
# upper=2,
# default=2,
# description='Maximum tree leaves for base learners, this value is the base of the formula num_leaves_base^(max_depth)',
# semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
# ),
# subsample_for_bin=hyperparams.Bounded[int](
# lower=1,
# upper=None,
# default=200000,
# description='number of data that sampled to construct histogram bins',
# semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
# ),
learning_rate=hyperparams.LogUniform(
lower=1e-4,
upper=1e-1,
default=0.05,
lower_inclusive=True,
upper_inclusive=True,
        description='Boosting learning rate (XGBoost\'s "eta").',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
    min_child_weight=hyperparams.Constant[int](
default=1,
description='Minimum sum of instance weight (hessian) needed in a child. If the tree partition step results '
'in a leaf node with the sum of instance weight less than min_child_weight, then the building '
'process will give up further partitioning ',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
# min_child_samples=hyperparams.Bounded[int](
# lower=0,
# upper=None,
# default=20,
# description='minimal number of data in one leaf. Can be used to deal with over-fitting',
# semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
# ),
# max_delta_step = hyperparams.Union[Union[int, None]](
# configuration=OrderedDict(
# limit=hyperparams.Bounded[int](
# lower=1,
# upper=None,
# default=1,
# description='Maximum delta step we allow each leaf output to be.'
# ),
# unlimited=hyperparams.Enumeration[int](
# values=[0],
# default=0,
# description='No constraint.',
# ),
# ),
# default='unlimited',
# description='Maximum delta step we allow.',
# semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
# ),
subsample=hyperparams.Constant[float](
default=1.0,
        description='Subsample ratio of the training instances; this will prevent overfitting. Subsampling will occur '
                    'once in every boosting iteration.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
subsample_freq=hyperparams.Bounded[int](
lower=0,
upper=1,
default=0,
description='frequency for bagging',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
colsample_bytree=hyperparams.Constant[float](
default=1.0,
description='Subsample ratio of columns when constructing each tree. Subsampling will occur once in every '
'boosting iteration',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
min_split_gain=hyperparams.Bounded[float](
lower=0,
upper=None,
default=0,
        description='The minimal gain required to perform a split.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
reg_alpha=hyperparams.Uniform(
lower=0.1,
upper=1.0,
default=0.5,
lower_inclusive=True,
upper_inclusive=True,
description='L1 regularization term on weights. Increasing this value will make model more conservative.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
reg_lambda=hyperparams.Uniform(
lower=0.1,
upper=1.0,
default=0.5,
lower_inclusive=True,
upper_inclusive=True,
description='L2 regularization term on weights. Increasing this value will make model more conservative.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
)

sk_logistic_regression_config = dict(
dual=hyperparams.Constant[bool](
default=False,
description='Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
),
penalty=hyperparams.Choice(
choices={
'l1': hyperparams.Hyperparams.define(
configuration=OrderedDict({})
),
'l2': hyperparams.Hyperparams.define(
configuration=OrderedDict({})
),
'none': hyperparams.Hyperparams.define(
configuration=OrderedDict({})
),
'elasticnet': hyperparams.Hyperparams.define(
configuration=OrderedDict({
'l1_ratio': hyperparams.Union(
configuration=OrderedDict({
'float': hyperparams.Uniform(
lower=0,
upper=1,
default=0.001,
lower_inclusive=True,
upper_inclusive=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
# 'l1_ratio must be between 0 and 1; got (l1_ratio=None)'
# 'none': hyperparams.Constant(
# default=None,
# semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
# )
}),
default='float',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
})
)
},
default='l2',
description='Used to specify the norm used in the penalization. The \'newton-cg\', \'sag\' and \'lbfgs\' solvers support only l2 penalties.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
),
intercept_scaling=hyperparams.Constant[float](
default=1,
description='Useful only when the solver \'liblinear\' is used and self.fit_intercept is set to True. In this case, x becomes [x, self.intercept_scaling], i.e. a "synthetic" feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
),

)

sk_decision_tree_clf_config = dict(
min_samples_split=hyperparams.Union(
configuration=OrderedDict({
'absolute': hyperparams.Constant[int](
default=2,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Bounded[float](
default=0.25,
lower=0,
upper=1,
lower_inclusive=False,
# upper_inclusive=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='absolute',
description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
),
max_features=hyperparams.Union(
configuration=OrderedDict({
# max_features must be in (0, n_features]
# 'specified_int': hyperparams.Bounded[int](
# lower=0,
# upper=None,
# default=0,
# semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
# ),
'calculated': hyperparams.Enumeration[str](
values=['auto', 'sqrt', 'log2'],
default='auto',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Bounded[float](
default=0.25,
lower=0,
upper=1,
lower_inclusive=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
),
# 'max_leaf_nodes 0 must be either None or larger than 1'
max_leaf_nodes=hyperparams.Constant(
default=None,
description='Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
),
)

sk_sgd_clf_config = dict(
validation_fraction=hyperparams.Bounded[float](
default=0.1,
lower=0,
upper=0.99999999999,
lower_inclusive=False,
# upper_inclusive=False,
description='The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if early_stopping is True.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
),
# eta0 must be > 0
eta0=hyperparams.Bounded[float](
lower=0.0,
upper=1.0,
default=0.1,
lower_inclusive=False,
description='The initial learning rate for the \'constant\' or \'invscaling\' schedules. The default value is 0.0 as eta0 is not used by the default schedule \'optimal\'.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
),


)

sk_random_forest_clf_config = dict(
max_features=hyperparams.Union(
configuration=OrderedDict({
# max_features must be in (0, n_features]
# 'specified_int': hyperparams.Bounded[int](
# lower=0,
# upper=None,
# default=0,
# semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
# ),
'calculated': hyperparams.Enumeration[str](
values=['auto', 'sqrt', 'log2'],
default='auto',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Uniform(
default=0.25,
lower=0,
upper=1,
lower_inclusive=True,
upper_inclusive=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='calculated',
description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)` (same as "auto"). - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
),
max_samples=hyperparams.Union(
configuration=OrderedDict({
'absolute': hyperparams.Bounded[int](
lower=0,
upper=None,
lower_inclusive=False,
default=1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Bounded[float](
default=0.9,
lower=0 + epsilon,
upper=1,
upper_inclusive=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
),
)

sk_extra_tree_tree_clf_config = dict(
max_features=hyperparams.Union(
configuration=OrderedDict({
'calculated': hyperparams.Enumeration[str](
values=['auto', 'sqrt', 'log2'],
default='auto',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Bounded[float](
default=0.25,
lower=0,
upper=1,
lower_inclusive=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='calculated',
description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
),
max_samples=hyperparams.Union(
configuration=OrderedDict({
'absolute': hyperparams.Bounded[int](
lower=0,
upper=None,
lower_inclusive=False,
default=1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Bounded[float](
default=0.9,
lower=0 + epsilon,
upper=1,
upper_inclusive=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
)

# To avoid the issue, https://gitlab.com/TAMU_D3M/d3m_primitives/-/issues/1
tamu_feature_selection_config = dict(
percentage_selected_features=hyperparams.Uniform(
default=0.5,
upper=1,
lower=0.25,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
description="percentage of features to select, between 0 and 1")
)

config = {
'd3m.primitives.classification.xgboost_gbtree.DataFrameCommon': clf_xgboost_config,
'd3m.primitives.feature_construction.deep_feature_synthesis.SingleTableFeaturization': dfs_single_tab_config,
'd3m.primitives.classification.light_gbm.DataFrameCommon': lgbm_clf_config,
'd3m.primitives.classification.logistic_regression.SKlearn': sk_logistic_regression_config,
'd3m.primitives.classification.decision_tree.SKlearn': sk_decision_tree_clf_config,
'd3m.primitives.classification.sgd.SKlearn': sk_sgd_clf_config,
'd3m.primitives.classification.random_forest.SKlearn': sk_random_forest_clf_config,
'd3m.primitives.classification.extra_trees.SKlearn': sk_extra_tree_tree_clf_config,
'd3m.primitives.feature_selection.skfeature.TAMU': tamu_feature_selection_config,
}
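
A short sketch of how this ``config`` mapping is consumed (mirroring the lookup in tunable_base.py; `primitive` is assumed to be a loaded d3m primitive class):

from axolotl.algorithms.tuners import custom_hps

python_path = primitive.metadata.query()['python_path']
override = custom_hps.config.get(python_path)
if override is not None:
    # Each entry replaces the primitive's default tunable range for that hyperparameter.
    hp_config = primitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams'].configuration
    hp_config._dict.update(override)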

+ 195
- 0
axolotl/axolotl/algorithms/tuners/hyperparameters.py

@@ -0,0 +1,195 @@
import json
import math
import sys

from scipy.stats import norm

from d3m import utils as d3m_utils
from d3m.metadata import hyperparams
from d3m.metadata.hyperparams import HyperparameterMeta
from kerastuner.engine.hyperparameters import HyperParameters as KerasHyperparams

PIPELINE_CHOICE = 'pipeline_choice'


def hp_to_config(param_val):
    config = param_val.to_simple_structure()
    config['p'] = param_val
    if isinstance(param_val, (hyperparams.SortedList, hyperparams.SortedSet)):
        config['is_configuration'] = param_val.is_configuration
    return config


class HyperParameters(KerasHyperparams):
def get_config(self):
        return {
            'space': [{'class_name': p.__class__.__name__,
                       'config': hp_to_config(p)}
                      for p in self.space],
            'values': dict(self.values),
        }

def retrieve(self, name, val, parent_name=None, parent_values=None):
"""Gets or creates a `HyperParameter`."""
        config = hp_to_config(val)
hp = config['p']
hp.name = self._get_name(name)
hp.default = get_val(hp.get_default)()
hp.random_sample = get_val(hp.sample)
        hp.conditions = list(self._conditions)
with self._maybe_conditional_scope(parent_name, parent_values):
return self._retrieve(hp)

def _register(self, hp):
"""Registers a `HyperParameter` into this container."""
self._hps[hp.name].append(hp)
self._space.append(hp)
value = hp.default
if self._conditions_are_active(hp.conditions):
self.values[hp.name] = value
return value
return None

@classmethod
def from_config(cls, config):
hps = cls()
for p in config['space']:
p = p['config']['p']
hps._hps[p.name].append(p)
hps._space.append(p)
        hps.values = dict(config['values'])
return hps

def copy(self):
return HyperParameters.from_config(self.get_config())

def __repr__(self):
return self.to_json()

def to_json(self):
return json.dumps(self.__dict__, default=serialize)

def _get_name_parts(self, full_name):
"""Splits `full_name` into its scopes and leaf name."""
str_parts = full_name.split('/')
parts = []

for part in str_parts:
if '=' in part:
parent_name, parent_values = part.split('=')
parent_values = parent_values.split(',')
parts.append({'parent_name': parent_name,
'parent_values': parent_values})
else:
parts.append(part)

return parts

def get_pipeline_id(self):
pipeline_id = self.values[PIPELINE_CHOICE]
return pipeline_id

def get_name_parts(self, full_name):
step, primitive_name, hp_name = self._get_name_parts(full_name)
return step, primitive_name, hp_name


def get_val(func):
def wrapper(*args, **kwargs):
val = func(*args, **kwargs)
return val['choice'] if isinstance(val, dict) and 'choice' in val else val
return wrapper


def serialize(obj):
if isinstance(obj, HyperparameterMeta):
return obj.__dict__


def value_to_cumulative_prob(value, hp):
"""Convert a hyperparameter value to [0, 1]."""
if isinstance(hp, hyperparams.Constant):
return 0.5
if isinstance(hp, hyperparams.UniformBool):
# Center the value in its probability bucket.
if value:
return 0.75
return 0.25
elif isinstance(hp, (hyperparams.Choice, hyperparams.Enumeration, hyperparams.Union)):
if isinstance(hp, hyperparams.Choice):
choices = hp.choices
index = list(choices.keys()).index(value)
elif isinstance(hp, hyperparams.Union):
choices = hp.configuration.keys()
for index, val_type in enumerate(hp.configuration.values()):
if isinstance(value, val_type.structural_type):
break
else:
choices = hp.values
index = choices.index(value)
ele_prob = 1 / len(choices)
# Center the value in its probability bucket.
return (index + 0.5) * ele_prob
elif isinstance(hp, (hyperparams.UniformInt, hyperparams.Uniform, hyperparams.Bounded)):
lower, upper = hp.lower, hp.upper
if lower is None or upper is None:
return 0.5
return (value - lower) / (upper - lower)
elif isinstance(hp, hyperparams.LogUniform):
lower, upper = hp.lower, hp.upper
if lower is None or upper is None:
return 0.5
return (math.log(value / lower) /
math.log(upper / lower))
elif isinstance(hp, (hyperparams.Normal, hyperparams.LogNormal)):
return norm.cdf(value, hp.mu, hp.sigma)
else:
raise ValueError('Unrecognized HyperParameter type: {}'.format(hp))


def cumulative_prob_to_value(prob, hp):
"""Convert a value from [0, 1] to a hyperparameter value."""
if isinstance(hp, hyperparams.Constant):
return hp.get_default()
elif isinstance(hp, hyperparams.UniformBool):
return bool(prob >= 0.5)
elif isinstance(hp, (hyperparams.Choice, hyperparams.Enumeration, hyperparams.Union)):
if isinstance(hp, hyperparams.Choice):
choices = list(hp.choices.keys())
elif isinstance(hp, hyperparams.Union):
choices = list(hp.configuration.keys())
else:
choices = hp.values
ele_prob = 1 / len(choices)
index = int(math.floor(prob / ele_prob))
# Can happen when `prob` is very close to 1.
if index == len(choices):
index = index - 1
if isinstance(hp, hyperparams.Union):
key = choices[index]
with d3m_utils.silence():
val = hp.configuration[key].sample()
return val
return choices[index]
    elif isinstance(hp, (hyperparams.UniformInt, hyperparams.Uniform, hyperparams.Bounded)):
        epsilon = sys.float_info.epsilon
lower, upper = hp.lower, hp.upper
if lower is None or upper is None:
return hp.get_default()
value = prob * (upper - lower) + lower
if hp.structural_type == int:
return int(value)
if value == lower and not hp.lower_inclusive:
return value + epsilon
if value == upper and not hp.upper_inclusive:
return value - epsilon
return value
elif isinstance(hp, hyperparams.LogUniform):
lower, upper = hp.lower, hp.upper
if lower is None or upper is None:
return hp.get_default()
value = lower * math.pow(upper / lower, prob)
return value
elif isinstance(hp, (hyperparams.Normal, hyperparams.LogNormal)):
return norm.ppf(prob, loc=hp.mu, scale=hp.sigma)
else:
raise ValueError('Unrecognized HyperParameter type: {}'.format(hp))
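
A worked round trip through the two mappings above (hedged: a standalone sketch assuming only the d3m hyperparams module already imported in this file):

hp = hyperparams.Uniform(lower=0.1, upper=1.0, default=0.5)
prob = value_to_cumulative_prob(0.55, hp)   # (0.55 - 0.1) / (1.0 - 0.1) = 0.5
value = cumulative_prob_to_value(prob, hp)  # 0.5 * (1.0 - 0.1) + 0.1 = 0.55
assert abs(value - 0.55) < 1e-9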

+ 104
- 0
axolotl/axolotl/algorithms/tuners/oracle.py

@@ -0,0 +1,104 @@
import os

import hashlib
import random

from d3m import utils as d3m_utils
from d3m.metadata import problem as problem_module
from axolotl.algorithms.tuners.hyperparameters import HyperParameters, PIPELINE_CHOICE

_MAX_METRICS = {
problem_module.PerformanceMetric.ACCURACY,
problem_module.PerformanceMetric.PRECISION,
problem_module.PerformanceMetric.RECALL,
problem_module.PerformanceMetric.F1,
problem_module.PerformanceMetric.F1_MICRO,
problem_module.PerformanceMetric.F1_MACRO,
problem_module.PerformanceMetric.ROC_AUC,
problem_module.PerformanceMetric.JACCARD_SIMILARITY_SCORE,
problem_module.PerformanceMetric.NORMALIZED_MUTUAL_INFORMATION, # not sure
    problem_module.PerformanceMetric.OBJECT_DETECTION_AVERAGE_PRECISION,
    problem_module.PerformanceMetric.R_SQUARED,  # best value is 1.0, so larger is better
}
_MAX_METRICS_NAME = {s.name for s in _MAX_METRICS}


_MIN_METRICS = {
problem_module.PerformanceMetric.MEAN_ABSOLUTE_ERROR,
problem_module.PerformanceMetric.MEAN_SQUARED_ERROR,
problem_module.PerformanceMetric.ROOT_MEAN_SQUARED_ERROR,
}
_MIN_METRICS_NAME = {s.name for s in _MIN_METRICS}


def infer_metric_direction(metric):
    # Handle str input and get the canonical metric name.
    if isinstance(metric, str):
        metric_name = metric
    else:
        metric_name = metric.name
    if metric_name in _MIN_METRICS_NAME:
        return 'min'
    if metric_name in _MAX_METRICS_NAME:
        return 'max'
    raise ValueError('Cannot infer direction for metric: {}'.format(metric))


def random_values(hyperparameters, seed_state, tried_so_far, max_collisions):
collisions = 0
    while True:
# Generate a set of random values.
hps = HyperParameters()
with d3m_utils.silence():
for hp in hyperparameters.space:
hps.merge([hp])
if hps.is_active(hp): # Only active params in `values`.
hps.values[hp.name] = hp.random_sample(seed_state)
seed_state += 1
# Pick out the invalid hyper-parameters
            patch_invalid_hyperparameters(hps)

values = hps.values
# Keep trying until the set of values is unique,
# or until we exit due to too many collisions.
values_hash = compute_values_hash(values)
if values_hash in tried_so_far:
collisions += 1
if collisions > max_collisions:
return None
continue
tried_so_far.add(values_hash)
break
return values, seed_state


def compute_values_hash(values):
keys = sorted(values.keys())
s = ''.join(str(k) + '=' + str(values[k]) for k in keys)
return hashlib.sha256(s.encode('utf-8')).hexdigest()[:32]


def patch_invalid_hyperparameters(hps):
values = hps.values
for full_name in values:
if full_name == PIPELINE_CHOICE:
continue
hp_val = values[full_name]
step, primitive_name, hp_name = hps.get_name_parts(full_name)
if primitive_name == 'd3m.primitives.classification.svc.SKlearn' \
and hp_name == 'decision_function_shape' and hp_val == 'ovo':
            # break_ties must be False if decision_function_shape == 'ovo'.
break_ties = os.path.join(step, primitive_name, 'break_ties')
values[break_ties] = False
if primitive_name == 'd3m.primitives.classification.logistic_regression.SKlearn':
            # With the 'elasticnet' penalty, the solver must be 'saga'.
if hp_name == 'penalty' and hp_val == 'elasticnet':
solver = os.path.join(step, primitive_name, 'solver')
values[solver] = 'saga'
if hp_name == 'solver':
penalty = os.path.join(step, primitive_name, 'penalty')
# liblinear only supports 'ovr' multi_class and [l2, l1] penalty
if hp_val == 'liblinear':
multi_class = os.path.join(step, primitive_name, 'multi_class')
values[multi_class] = 'ovr'
values[penalty] = random.choice(['l2', 'l1'])
# ['lbfgs', 'newton-cg', 'sag'] only support [l2, none] penalty
elif hp_val in ['lbfgs', 'newton-cg', 'sag']:
values[penalty] = random.choice(['l2', 'none'])

+ 66
- 0
axolotl/axolotl/algorithms/tuners/random_search_oracle.py

@@ -0,0 +1,66 @@
from kerastuner import Objective
from kerastuner.engine import trial as trial_lib
from kerastuner.tuners.randomsearch import RandomSearchOracle as KerasRandomSearchOracle

from axolotl.algorithms.tuners.oracle import infer_metric_direction, random_values


class RandomSearchOracle(KerasRandomSearchOracle):
"""
Random search oracle.
"""

def __init__(self,
objective,
max_trials,
seed=None,
hyperparameters=None,
allow_new_entries=True,
tune_new_entries=True):
direction = infer_metric_direction(objective)
objective = Objective(name=objective, direction=direction)
super(RandomSearchOracle, self).__init__(
objective=objective,
max_trials=max_trials,
seed=seed,
hyperparameters=hyperparameters,
tune_new_entries=tune_new_entries,
allow_new_entries=allow_new_entries)

def _populate_space(self, _):
values = self._random_values()
if values is None:
return {'status': trial_lib.TrialStatus.STOPPED,
'values': None}
return {'status': trial_lib.TrialStatus.RUNNING,
'values': values}

def _random_values(self):
"""Fills the hyperparameter space with random values.

Returns:
A dictionary mapping parameter names to suggested values.
"""

values, seed_state = random_values(hyperparameters=self.hyperparameters,
seed_state=self._seed_state,
tried_so_far=self._tried_so_far,
max_collisions=self._max_collisions,
)
self._seed_state = seed_state
return values

def _save_trial(self, trial):
pass

def get_state(self):
# `self.trials` are saved in their own, Oracle-agnostic files.
# Just save the IDs for ongoing trials, since these are in `trials`.
state = {}
state['ongoing_trials'] = {
tuner_id: trial.trial_id
for tuner_id, trial in self.ongoing_trials.items()}
# Hyperparameters are part of the state because they can be added to
# during the course of the search.
state['hyperparameters'] = str(self.hyperparameters.get_config())
return state

+ 258
- 0
axolotl/axolotl/algorithms/tuners/tunable_base.py

@@ -0,0 +1,258 @@
import logging
import multiprocessing

import os
import uuid
import copy
from typing import Tuple
import re
import numpy as np

from d3m.metadata import hyperparams
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline

from kerastuner.engine import trial as trial_module

from axolotl import predefined_pipelines
from axolotl.algorithms.tuners import custom_hps
from axolotl.algorithms.base import PipelineSearchBase
from axolotl.algorithms.dummy import dummy_ranking_function
from axolotl.algorithms.tuners.hyperparameters import HyperParameters, PIPELINE_CHOICE
from axolotl.utils import schemas as schemas_utils

logger = logging.getLogger(__name__)


class TunableBase(PipelineSearchBase):

def __init__(self, problem_description, backend,
primitives_blocklist=None, ranking_function=None, num_eval_trials=None):
if ranking_function is None:
ranking_function = dummy_ranking_function
if num_eval_trials is None:
num_eval_trials = multiprocessing.cpu_count()
super(TunableBase, self).__init__(problem_description, backend,
primitives_blocklist=primitives_blocklist, ranking_function=ranking_function)
# TODO update this to be defined on problem/metrics terms
self.data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
self.data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']

self.scoring_pipeline = schemas_utils.get_scoring_pipeline()
self.scoring_params = None

self.metrics = problem_description['problem']['performance_metrics']

self.oracle = None
self.tuner_id = 'tuner'
self.hyperparameters = HyperParameters()
self.pipeline_candidates = {}
self.num_eval_trials = num_eval_trials

def set_pipeline_candidates(self, input_data, pipeline_candidates):
if pipeline_candidates is None:
problem = self.problem_description
# ToDo should use fetch(input_data, problem, schemas_utils.PIPELINES_DB_DIR)
for pipeline in predefined_pipelines.fetch_from_file(problem, schemas_utils.PIPELINES_DB_DIR):
self.pipeline_candidates[pipeline.id] = pipeline
elif isinstance(pipeline_candidates, list):
for pipeline in pipeline_candidates:
self.pipeline_candidates[pipeline.id] = pipeline
elif isinstance(pipeline_candidates, dict):
self.pipeline_candidates = pipeline_candidates
else:
            raise ValueError('pipeline_candidates must be None, a list, or a dict')

def init_search_space(self):
pipeline_id = hyperparams.Enumeration[str](
values=list(self.pipeline_candidates.keys()),
default=list(self.pipeline_candidates.keys())[0],
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
self.hyperparameters.retrieve(PIPELINE_CHOICE, pipeline_id)
for pipeline in self.pipeline_candidates.values():
self._get_pipeline_search_space(pipeline)

def _get_pipeline_search_space(self, pipeline):
PREFIX_STEP = 'step'
with self.hyperparameters.conditional_scope(PIPELINE_CHOICE, pipeline.id):
for i, step in enumerate(pipeline.steps):
with self.hyperparameters.name_scope('{}{}'.format(PREFIX_STEP, i)):
primitive = step.primitive
self._get_primitive_search_space(primitive)

def _get_primitive_search_space(self, primitive):
hyperparameters = primitive.metadata.query()['primitive_code']['hyperparams']
primitive_python_path = primitive.metadata.query()['python_path']
name = primitive_python_path
config = primitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams'].configuration
        custom_config = custom_hps.config.get(primitive_python_path, None)
        if custom_config is not None:
            # Override the primitive's default tunable ranges in place.
            config._dict.update(custom_config)
with self.hyperparameters.name_scope(name):
for param_name, param_info in hyperparameters.items():
if self.is_tunable(param_info['semantic_types']):
param_val = config[param_name]
                    # SortedSet.to_simple_structure() has a bug, so we skip List/Set hyperparameters.
if isinstance(param_val, (hyperparams.List, hyperparams.Set)):
continue
self.hyperparameters.retrieve(param_name, param_val)
if isinstance(param_val, hyperparams.Choice):
for choice_name, choice_val in param_val.choices.items():
with self.hyperparameters.conditional_scope(param_name, choice_name):
for sub_param_name, sub_param_val in choice_val.configuration.items():
if sub_param_name != 'choice':
self.hyperparameters.retrieve(sub_param_name, sub_param_val)

def is_tunable(self, semantic_types: Tuple[str, ...]) -> bool:
return any('tuning' in t.lower() for t in semantic_types)

def search_fit(self, input_data, time_limit=300, *, expose_values=False, pipeline_candidates=None):
self.set_pipeline_candidates(input_data, pipeline_candidates)
self.init_search_space()
return super(TunableBase, self).search_fit(input_data, time_limit, expose_values=expose_values)

def _search(self, time_left):
trials = self.create_trials(num_trials=self.num_eval_trials)
if len(trials) == 0:
logger.info('Oracle trigger exit')
return []
results = self.run_trials(trials, input_data=self.input_data)
self.end_trials(trials)
return results

def run_trials(self, trials, **fit_kwargs):
pipelines = []
id_2_trials = {}

for trial in trials:
hp = trial.hyperparameters
try:
pipeline = self.build_pipeline(hp)
id_2_trials[pipeline.id] = trial
pipelines.append(pipeline)
except Exception as e:
                logger.error('Current trial failed. Error: {}'.format(e))
trial.status = trial_module.TrialStatus.INVALID

input_data = fit_kwargs.pop('input_data')

pipeline_results = self.backend.evaluate_pipelines(
problem_description=self.problem_description,
pipelines=pipelines,
input_data=input_data,
metrics=self.metrics,
data_preparation_pipeline=self.data_preparation_pipeline,
scoring_pipeline=self.scoring_pipeline,
data_preparation_params=self.data_preparation_params,
)

results = []
for result in pipeline_results:
trial = id_2_trials[result.pipeline.id]
if result.status == 'ERRORED':
                logger.error('Current trial failed. Error: {}'.format(result.error))
trial.status = trial_module.TrialStatus.INVALID
else:
scores = result.scores
# scores = runtime_module.combine_folds(scores)
summarize_performance = schemas_utils.summarize_performance_metrics(scores)
metrics = self._get_pipeline_metrics(summarize_performance)
self.oracle.update_trial(
trial.trial_id, metrics=metrics
)
trial.status = trial_module.TrialStatus.COMPLETED
results.append(self.ranking_function(result))
return results

def build_pipeline(self, hyperparameters):
"""
        hyperparameters example:
        {
            'step5/d3m.primitives.feature_construction.deep_feature_synthesis.SingleTableFeaturization/max_percent_null': 0,
            'step7/d3m.primitives.data_preprocessing.robust_scaler.SKlearn/quantile_range': (2.798121390864261, 14.852664215409096),
        }
"""
values = hyperparameters.values
pipeline_id = hyperparameters.get_pipeline_id()
pipeline = copy.deepcopy(self.pipeline_candidates[pipeline_id])
pipeline.id = str(uuid.uuid4())
# update time
pipeline.created = Pipeline().created

skip_hps = set()
# for key in sorted(values.keys()):
for hp in hyperparameters.space:
if hyperparameters.is_active(hp) and hp.name not in skip_hps and hp.name != PIPELINE_CHOICE:
key = hp.name
step, primitive_name, hp_name = hyperparameters.get_name_parts(key)
value = values[key]
step_idx = self.__get_step_idx_by_name(step)
if step_idx is None:
raise KeyError('{} not in the pipeline'.format(primitive_name))
primitive_step = pipeline.steps[step_idx]
arg_type = ArgumentType.VALUE
# In order to avoid the following error
# Value '0' for hyper-parameter \
# 'STEP8/d3m.primitives.classification.xgboost_gbtree.DataFrameCommon/max_delta_step' \
# is not an instance of the structural type: typing.Union[int, NoneType]
# Here is workaround
if isinstance(value, np.int64):
value = int(value)
elif isinstance(value, np.str_):
value = str(value)
elif isinstance(value, np.bool_):
value = bool(value)
if hp_name in primitive_step.hyperparams:
del primitive_step.hyperparams[hp_name]
# Handle Choice
if isinstance(hp, hyperparams.Choice):
choice_cls = hp.choices[value]
_vals = {}
for name in choice_cls.configuration:
if name == 'choice':
_vals[name] = value
else:
_key = os.path.join(step, primitive_name, name)
_vals[name] = values[_key]
skip_hps.add(_key)
value = choice_cls(_vals)
primitive_step.add_hyperparameter(name=hp_name, argument_type=arg_type,
data=value)
return pipeline

def __get_step_idx_by_name(self, prefix_primitive_name):
regex = r"(?<=STEP)\d+"
match = re.search(regex, prefix_primitive_name, re.IGNORECASE)
if match:
return int(match.group(0))
return None
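
# Example: the lookbehind regex above extracts the numeric step index from a
# scoped name, so __get_step_idx_by_name('STEP8') returns 8, while a name
# without a 'STEP<n>' prefix returns None.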

def _get_pipeline_metrics(self, summarize_performance):
metrics = {}
for name, info in summarize_performance.items():
metrics[name] = info['mean']
return metrics

def end_trials(self, trials):
"""A hook called after each trial is run.

# Arguments:
trial: A `Trial` instance.
"""
for trial in trials:
self.oracle.end_trial(trial.trial_id, trial.status)
# self.oracle.update_space(trial.hyperparameters)

def create_trials(self, num_trials):
trials = []
for i in range(num_trials):
try:
trial = self.oracle.create_trial('{}_{}'.format(self.tuner_id, i))
except Exception:
break

if trial.status == trial_module.TrialStatus.STOPPED:
break
else:
trials.append(trial)
return trials
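
# A minimal sketch of the trial loop implemented above (illustration only;
# `tuner` stands for a concrete TunableBase subclass and `data` for the input
# data already set on it):
#
#   trials = tuner.create_trials(num_trials=tuner.num_eval_trials)  # ask the oracle
#   results = tuner.run_trials(trials, input_data=data)             # build + evaluate
#   tuner.end_trials(trials)                                        # report statuses back
#
# run_trials() turns each trial's hyperparameters into a concrete pipeline via
# build_pipeline(), evaluates the batch through the backend, and feeds the mean
# of each metric back to the oracle with oracle.update_trial().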

+ 0
- 0
axolotl/axolotl/backend/__init__.py View File


+ 313
- 0
axolotl/axolotl/backend/base.py View File

@@ -0,0 +1,313 @@
import abc
import typing

from d3m.metadata.problem import Problem, PerformanceMetric
from d3m.metadata.pipeline import Pipeline

from axolotl.utils.pipeline import PipelineResult
from axolotl.utils.schemas import ContainerType


class RunnerBase:
"""
A base class for the pipeline runner backend.
Subclasses of this class must implement ``get_request`` and the ``*_pipeline_request``
methods, which should keep track of all requests.

Parameters
----------
random_seed : int
Random seed passed to the constructor.
volumes_dir : str
Path to a directory with static files required by primitives.
In the standard directory structure (as obtained running ``python3 -m d3m index download``).
scratch_dir : str
Path to a directory to store any temporary files needed during execution.

Attributes
----------
random_seed : int
Random seed passed to the constructor.
volumes_dir : str
Path to a directory with static files required by primitives.
In the standard directory structure (as obtained running ``python3 -m d3m index download``).
scratch_dir : str
Path to a directory to store any temporary files needed during execution.
"""
def __init__(self, *, random_seed: int = 0, volumes_dir: str = None, scratch_dir: str = None) -> None:
self.random_seed = random_seed
self.volumes_dir = volumes_dir
self.scratch_dir = scratch_dir

def add_metric(self, name: str, *, best_value: float, worst_value: float, score_class: type,
requires_confidence: bool = False, requires_rank: bool = False):
"""
Method to register a new metric.

Parameters
----------
name : str
Metric name, e.g. ACCURACY.
best_value : float
Value that represents the best score, e.g. 1.0 for accuracy.
worst_value : float
Value that represents the worst score, e.g. 0.0 for accuracy.
score_class : type
A class that helps compute the score.
requires_confidence : bool
A flag that tells whether the scoring function requires a confidence value.
requires_rank : bool
A flag that tells whether the scoring function requires the rank of the predictions.
"""

PerformanceMetric.register_metric(name=name, best_value=best_value, worst_value=worst_value, score_class=score_class,
requires_confidence=requires_confidence, requires_rank=requires_rank)

@abc.abstractmethod
def get_request(self, request_id: str) -> PipelineResult:
"""
A method that returns the result of a request.

Parameters
----------
request_id : str
Request id of data to retrieve

Returns
-------
PipelineResult
A PipelineResult instance that contains the information.
"""

@abc.abstractmethod
def fit_pipeline_request(self, problem_description: Problem, pipeline: Pipeline,
input_data: typing.Sequence[ContainerType], *, timeout: float = None,
expose_outputs: bool = False) -> str:
"""
A method that submits a fit_pipeline job.

Parameters
----------
problem_description : Problem
A problem description.
pipeline : Pipeline
The pipeline that is going to be fitted.
input_data : typing.Sequence[ContainerType]
A list of D3M containers.
timeout : float
Maximum amount of time, in seconds, that the pipeline is allowed to run.
expose_outputs : bool
A flag that enables exposing every intermediate result computed from the input_data.

Returns
-------
str
A request id.
"""

def fit_pipeline(self, problem_description: Problem, pipeline: Pipeline, input_data: typing.Sequence[ContainerType],
*, timeout: float = None, expose_outputs: bool = False) -> PipelineResult:
"""
A method that fits a pipeline, saves the state, and returns a PipelineResult.

Parameters
----------
problem_description : Problem
A problem description.
pipeline : Pipeline
The pipeline that is going to be fitted.
input_data : typing.Sequence[ContainerType]
A list of D3M containers.
timeout : float
Maximum amount of time, in seconds, that the pipeline is allowed to run.
expose_outputs : bool
A flag that enables exposing every intermediate result computed from the input_data.

Returns
-------
PipelineResult
A pipeline result containing the result of fitting the pipeline.
"""
request_id = self.fit_pipeline_request(problem_description=problem_description, pipeline=pipeline,
input_data=input_data, timeout=timeout,
expose_outputs=expose_outputs)
return self.get_request(request_id)

@abc.abstractmethod
def produce_pipeline_request(self, fitted_pipeline_id: str, input_data: typing.Sequence[ContainerType], *,
timeout: float = None, expose_outputs: bool = False) -> str:
"""
A method that submits a produce pipeline request.

Parameters
----------
fitted_pipeline_id : str
The id of the fitted pipeline to use to produce results.
input_data : typing.Sequence[ContainerType]
A list of D3M containers.
timeout : float
Maximum amount of time, in seconds, that the pipeline is allowed to run.
expose_outputs : bool
A flag that enables exposing every intermediate result computed from the input_data.

Returns
-------
str
A request id.
"""

# @abc.abstractmethod
def produce_pipeline(self, fitted_pipeline_id: str, input_data: typing.Sequence[ContainerType], *,
timeout: float = None, expose_outputs: bool = False) -> PipelineResult:
"""
A method that runs a fitted pipeline on new inputs, saves the state, and returns a
PipelineResult that contains the information of the pipeline run.

Parameters
----------
fitted_pipeline_id : str
The id of the fitted pipeline to run with the input_data.
input_data : typing.Sequence[ContainerType]
A list of D3M containers.
timeout : float
Maximum amount of time, in seconds, that the pipeline is allowed to run.
expose_outputs : bool
A flag that enables exposing every intermediate result computed from the input_data.

Returns
-------
PipelineResult
A PipelineResult instance containing the information about the produced pipeline.
"""
request_id = self.produce_pipeline_request(fitted_pipeline_id, input_data, timeout=timeout,
expose_outputs=expose_outputs)
return self.get_request(request_id)

@abc.abstractmethod
def evaluate_pipeline_request(
self, problem_description: Problem, pipeline: Pipeline,
input_data: typing.Sequence[ContainerType], *, metrics: typing.Sequence[typing.Dict],
data_preparation_pipeline: Pipeline = None, scoring_pipeline: Pipeline = None,
data_preparation_params: typing.Dict[str, str] = None, scoring_params: typing.Dict[str, str] = None,
timeout: float = None
) -> str:
"""
A method that submits an evaluation job for a pipeline; the returned request id can be used to retrieve its scores and information.

Parameters
----------
problem_description : Problem
A problem description.
pipeline : Pipeline
The pipeline that is going to be evaluated.
input_data : typing.Sequence[ContainerType]
A list of D3M containers.
metrics : typing.Sequence[typing.Dict]
A list of dictionaries, each containing a metric and its arguments.
data_preparation_pipeline : Pipeline
A pipeline that prepares the data on which the pipeline is evaluated, e.g. cross-fold validation.
scoring_pipeline : Pipeline
A pipeline that is used to compute the scores of the pipelines.
data_preparation_params : typing.Dict[str, str]
Parameters for the data preparation pipeline
scoring_params: typing.Dict[str, str]
Parameters for the scoring pipeline
timeout : float
Maximum amount of time, in seconds, that the pipeline is allowed to run.

Returns
-------
str
A request id
"""

def evaluate_pipeline(
self, problem_description: Problem, pipeline: Pipeline,
input_data: typing.Sequence[ContainerType], *, metrics: typing.Sequence[typing.Dict],
data_preparation_pipeline: Pipeline = None, scoring_pipeline: Pipeline = None,
data_preparation_params: typing.Dict[str, str] = None, scoring_params: typing.Dict[str, str] = None,
timeout: float = None
) -> PipelineResult:
"""
A method that evaluates a pipeline and returns the scores and information of the pipeline.

Parameters
----------
problem_description : Problem
A problem description.
pipeline : Pipeline
A pipeline that is going to be evaluated.
input_data : typing.Sequence[ContainerType]
A list of D3M containers.
metrics : typing.Sequence[typing.Dict]
A list of dictionaries, each containing a metric and its arguments.
data_preparation_pipeline : Pipeline
A pipeline that prepares the data on which the pipeline is evaluated, e.g. cross-fold validation.
scoring_pipeline : Pipeline
A pipeline that is used to compute the scores of the pipelines.
data_preparation_params : typing.Dict[str, str]
Parameters for the data preparation pipeline
scoring_params: typing.Dict[str, str]
Parameters for the scoring pipeline
timeout : float
Maximum amount of time, in seconds, that the pipeline is allowed to run.

Returns
-------
PipelineResult
Result of the evaluation of the pipeline.
"""
request_id = self.evaluate_pipeline_request(
problem_description, pipeline, input_data, metrics=metrics,
data_preparation_pipeline=data_preparation_pipeline, scoring_pipeline=scoring_pipeline,
data_preparation_params=data_preparation_params, scoring_params=scoring_params, timeout=timeout
)
return self.get_request(request_id)

def evaluate_pipelines(
self, problem_description: Problem, pipelines: typing.Sequence[Pipeline],
input_data: typing.Sequence[ContainerType], *, metrics: typing.Sequence[typing.Dict],
data_preparation_pipeline: Pipeline = None, scoring_pipeline: Pipeline = None,
data_preparation_params: typing.Dict[str, str] = None, scoring_params: typing.Dict[str, str] = None,
timeout: float = None
) -> typing.Sequence[PipelineResult]:
"""
A method that evaluates multiple pipelines and returns the scores and information of the pipelines.

Parameters
----------
problem_description : Problem
A problem description.
pipelines : typing.Sequence[Pipeline]
A list of pipelines that are going to be run.
input_data : typing.Sequence[ContainerType]
A list of D3M containers.
metrics : typing.Sequence[typing.Dict]
A list of dictionaries, each containing a metric and its arguments.
data_preparation_pipeline : Pipeline
A pipeline that prepares the data on which the pipelines are evaluated, e.g. cross-fold validation.
scoring_pipeline : Pipeline
A pipeline that is used to compute the scores of the pipelines.
data_preparation_params : typing.Dict[str, str]
Parameters for the data preparation pipeline
scoring_params: typing.Dict[str, str]
Parameters for the scoring pipeline
timeout : float
Maximum amount of time, in seconds, that each pipeline is allowed to run.

Returns
-------
typing.Sequence[PipelineResult]
A sequence of PipelineResults.
"""
request_ids = []
for pipeline in pipelines:
request_ids.append(
self.evaluate_pipeline_request(
problem_description, pipeline, input_data, metrics=metrics,
data_preparation_pipeline=data_preparation_pipeline, scoring_pipeline=scoring_pipeline,
data_preparation_params=data_preparation_params, scoring_params=scoring_params, timeout=timeout
)
)

return [self.get_request(request_id) for request_id in request_ids]
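
# Usage sketch (not part of this module): the request-based API decouples job
# submission from result retrieval. Assuming a concrete RunnerBase subclass
# `runner` and a d3m `problem`, `pipeline` and `dataset` loaded elsewhere:
#
#   request_id = runner.fit_pipeline_request(
#       problem_description=problem, pipeline=pipeline, input_data=[dataset])
#   result = runner.get_request(request_id)  # retrieves (and may wait for) the result
#
# The synchronous helpers (fit_pipeline, produce_pipeline, evaluate_pipeline and
# evaluate_pipelines) are submit-then-get wrappers around these two calls.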

+ 269
- 0
axolotl/axolotl/backend/ray.py View File

@@ -0,0 +1,269 @@
import ray
import typing
import uuid
import binascii
import hashlib
import time
from ray.util import ActorPool

from d3m import index as d3m_index
from d3m import utils as d3m_utils
from d3m import runtime as runtime_module
from d3m.metadata.problem import Problem
from d3m.metadata.pipeline import Pipeline
from d3m.metadata.base import Context
from d3m.metadata import pipeline_run as pipeline_run_module
from d3m import container as container_module

from axolotl.backend.base import RunnerBase
from axolotl.utils.pipeline import PipelineResult, save_pipeline_run, save_exposed_values
from axolotl.utils.schemas import ContainerType
import multiprocessing


@ray.remote
class DataHandler:
def __init__(self):
self.data = {}

def add_data(self, input_data):
if isinstance(input_data, list):
values = []
for _data in input_data:
if isinstance(_data, container_module.Dataset):
values.append(_data.metadata.query(())['id'])

data_id = str(hashlib.sha256(str(values).encode('utf8')).hexdigest())
if data_id not in self.data:
self.data[data_id] = input_data
return data_id

def get_data(self, data_id):
if data_id in self.data:
return self.data[data_id]
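
# Note: add_data() content-addresses its input by hashing the contained dataset
# ids, so adding the same list of datasets twice yields the same data_id and the
# actor keeps a single cached copy that every worker can fetch via get_data().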


@ray.remote
class RayExecutor:
def __init__(self, *, random_seed: int = 0, volumes_dir: str = None, scratch_dir: str = None, store_results=False,
blocklist=()) -> None:
self.random_seed = random_seed
self.volumes_dir = volumes_dir
self.scratch_dir = scratch_dir
self.fitted_pipelines = {}
with d3m_utils.silence():
d3m_index.load_all(blocklist=blocklist)
self.runtime_environment = pipeline_run_module.RuntimeEnvironment()
self.store_results = store_results

def fit_pipeline(
self, data_handler, problem_description: Problem, pipeline: Pipeline,
input_data_id: str, *, timeout: float = None, expose_outputs: bool = False
) -> PipelineResult:
pipeline_result = PipelineResult(pipeline=pipeline)
pipeline_result.status = "RUNNING"
pipeline_result.method_called = "fit"

request_id = data_handler.get_data.remote(input_data_id)
input_data = ray.get(request_id)

is_standard_pipeline = False
if len(input_data) == 1 and len(pipeline.outputs) == 1:
is_standard_pipeline = True

with d3m_utils.silence():
runtime, output, result = runtime_module.fit(
pipeline=pipeline, inputs=input_data, problem_description=problem_description, context=Context.TESTING,
hyperparams=None, random_seed=self.random_seed, volumes_dir=self.volumes_dir,
scratch_dir=self.scratch_dir,
runtime_environment=self.runtime_environment, is_standard_pipeline=is_standard_pipeline,
expose_produced_outputs=expose_outputs
)

if result.has_error():
pipeline_result.status = "ERRORED"
pipeline_result.error = result.error
else:
pipeline_result.status = "COMPLETED"
fitted_pipeline_id = str(uuid.uuid4())

if self.store_results:
pipeline_result.exposed_outputs = save_exposed_values(result.values, pipeline.id, self.scratch_dir)
pipeline_result.output = save_exposed_values(output, pipeline.id, self.scratch_dir)
else:
pipeline_result.exposed_outputs = result.values
pipeline_result.output = output

pipeline_result.fitted_pipeline_id = fitted_pipeline_id
self.fitted_pipelines[fitted_pipeline_id] = runtime

if self.store_results:
pipeline_result.pipeline_run = save_pipeline_run(result.pipeline_run, self.scratch_dir)

return pipeline_result

def produce_pipeline(
self, data_handler, fitted_pipeline_id: str, input_data_id: str, *,
timeout: float = None, expose_outputs: bool = False
) -> PipelineResult:

pipeline_result = PipelineResult(fitted_pipeline_id=fitted_pipeline_id)
pipeline_result.status = "RUNNING"
pipeline_result.method_called = "produce"
pipeline_result.fitted_pipeline_id = fitted_pipeline_id

request_id = data_handler.get_data.remote(input_data_id)
input_data = ray.get(request_id)

with d3m_utils.silence():
output, result = runtime_module.produce(
fitted_pipeline=self.fitted_pipelines[fitted_pipeline_id], test_inputs=input_data,
expose_produced_outputs=expose_outputs
)

if result.has_error():
pipeline_result.status = "ERRORED"
pipeline_result.error = result.error
else:
pipeline_result.status = "COMPLETED"
if self.store_results:
pipeline_result.exposed_outputs = save_exposed_values(result.values, fitted_pipeline_id, self.scratch_dir)
pipeline_result.output = save_exposed_values(output, fitted_pipeline_id, self.scratch_dir)
else:
pipeline_result.exposed_outputs = result.values
pipeline_result.output = output

if self.store_results:
pipeline_result.pipeline_run = save_pipeline_run(result.pipeline_run, self.scratch_dir)

return pipeline_result

def evaluate_pipeline(
self, data_handler, problem_description: Problem, pipeline: Pipeline,
input_data_id: str, *, metrics: typing.Sequence[typing.Dict],
data_preparation_pipeline: Pipeline = None, scoring_pipeline: Pipeline = None,
data_preparation_params: typing.Dict[str, str] = None, scoring_params: typing.Dict[str, str] = None,
timeout: float = None
) -> PipelineResult:

with d3m_utils.silence():
pipeline_result = PipelineResult(pipeline=pipeline)
pipeline_result.status = "RUNNING"
pipeline_result.method_called = "evaluate"

request_id = data_handler.get_data.remote(input_data_id)
input_data = ray.get(request_id)

with d3m_utils.silence():
scores, results = runtime_module.evaluate(
pipeline=pipeline, inputs=input_data, data_pipeline=data_preparation_pipeline,
scoring_pipeline=scoring_pipeline, problem_description=problem_description,
data_params=data_preparation_params, metrics=metrics, context=Context.TESTING,
scoring_params=scoring_params, hyperparams=None, random_seed=self.random_seed,
data_random_seed=self.random_seed, scoring_random_seed=self.random_seed,
volumes_dir=self.volumes_dir, scratch_dir=self.scratch_dir, runtime_environment=self.runtime_environment
)

if results.has_error():
pipeline_result.status = "ERRORED"
pipeline_result.error = [result.error for result in results]
else:
pipeline_result.status = "COMPLETED"
pipeline_result.scores = runtime_module.combine_folds(scores)

if self.store_results:
pipeline_result.pipeline_run = save_pipeline_run(results.pipeline_runs, self.scratch_dir)
return pipeline_result

def fitted_pipeline_id_exists(self, fitted_pipeline_id):
return fitted_pipeline_id in self.fitted_pipelines


class RayRunner(RunnerBase):
def __init__(self, *, random_seed: int = 0, volumes_dir: str = None, scratch_dir: str = None,
store_results=False, n_workers=None, blocklist=()) -> None:
if not ray.is_initialized():
ray.init()

super().__init__(random_seed=random_seed, volumes_dir=volumes_dir, scratch_dir=scratch_dir)
self.data_handler = DataHandler.remote()
self.ray_executor = RayExecutor.remote(random_seed=random_seed,
volumes_dir=volumes_dir, scratch_dir=scratch_dir,
store_results=store_results, blocklist=blocklist)

if n_workers is None:
n_workers = multiprocessing.cpu_count()
self.actor_pool = ActorPool([
RayExecutor.remote(random_seed=random_seed, volumes_dir=volumes_dir,
scratch_dir=scratch_dir, store_results=store_results,
blocklist=blocklist) for _ in range(n_workers)]
)

# Wait for primitives to be loaded on the workers
# time.sleep(len(d3m_index.search()) * 0.15)

def stop_ray(self):
ray.shutdown()

def get_request(self, request_id: str):
return ray.get(ray.ObjectID(binascii.unhexlify(request_id)))

def fit_pipeline_request(self, problem_description: Problem, pipeline: Pipeline,
input_data: typing.Sequence[ContainerType], *, timeout: float = None,
expose_outputs: bool = False) -> str:

request_id = self.data_handler.add_data.remote(input_data)
input_data_id = ray.get(request_id)
request_id = self.ray_executor.fit_pipeline.remote(self.data_handler, problem_description, pipeline, input_data_id,
timeout=timeout, expose_outputs=expose_outputs)
return request_id.hex()

def produce_pipeline_request(self, fitted_pipeline_id: str, input_data: typing.Sequence[ContainerType], *,
timeout: float = None, expose_outputs: bool = False) -> str:
request_id = self.data_handler.add_data.remote(input_data)
input_data_id = ray.get(request_id)
request_id = self.ray_executor.produce_pipeline.remote(self.data_handler, fitted_pipeline_id, input_data_id, timeout=timeout,
expose_outputs=expose_outputs)
return request_id.hex()

def evaluate_pipeline_request(
self, problem_description: Problem, pipeline: Pipeline,
input_data: typing.Sequence[ContainerType], *, metrics: typing.Sequence[typing.Dict],
data_preparation_pipeline: Pipeline = None, scoring_pipeline: Pipeline = None,
data_preparation_params: typing.Dict[str, str] = None, scoring_params: typing.Dict[str, str] = None,
timeout: float = None
) -> str:
request_id = self.data_handler.add_data.remote(input_data)
input_data_id = ray.get(request_id)

request_id = self.ray_executor.evaluate_pipeline.remote(
self.data_handler, problem_description, pipeline, input_data_id, metrics=metrics,
data_preparation_pipeline=data_preparation_pipeline, scoring_pipeline=scoring_pipeline,
data_preparation_params=data_preparation_params, scoring_params=scoring_params, timeout=timeout
)
return request_id.hex()

def fitted_pipeline_id_exists(self, fitted_pipeline_id):
request_id = self.ray_executor.fitted_pipeline_id_exists.remote(fitted_pipeline_id)
return ray.get(request_id)

def evaluate_pipelines(
self, problem_description: Problem, pipelines: typing.Sequence[Pipeline],
input_data: typing.Sequence[ContainerType], *, metrics: typing.Sequence[typing.Dict],
data_preparation_pipeline: Pipeline = None, scoring_pipeline: Pipeline = None,
data_preparation_params: typing.Dict[str, str] = None, scoring_params: typing.Dict[str, str] = None,
timeout: float = None
) -> typing.Sequence[PipelineResult]:
request_id = self.data_handler.add_data.remote(input_data)
input_data_id = ray.get(request_id)

args = []
for pipeline in pipelines:
args.append({
'data_handler': self.data_handler, 'problem_description': problem_description, 'pipeline': pipeline,
'input_data_id': input_data_id, 'metrics': metrics, 'data_preparation_pipeline': data_preparation_pipeline,
'scoring_pipeline': scoring_pipeline, 'data_preparation_params': data_preparation_params,
'scoring_params': scoring_params, 'timeout': timeout
})

return self.actor_pool.map(lambda actor, arg: actor.evaluate_pipeline.remote(**arg), args)
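
# Usage sketch (illustration only; `problem`, `pipelines`, `dataset` and
# `metrics` -- a list of metric dicts, as in the docstrings above -- are assumed
# to be prepared elsewhere). evaluate_pipelines() fans the work out over the
# actor pool, one RayExecutor per worker, and yields results as they finish:
#
#   runner = RayRunner(random_seed=0, n_workers=4)
#   for result in runner.evaluate_pipelines(
#           problem_description=problem, pipelines=pipelines,
#           input_data=[dataset], metrics=metrics):
#       print(result.pipeline.id, result.status)
#   runner.stop_ray()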

+ 178
- 0
axolotl/axolotl/backend/simple.py View File

@@ -0,0 +1,178 @@
import typing
import uuid

from d3m import utils as d3m_utils
from d3m import runtime as runtime_module
from d3m.metadata.problem import Problem
from d3m.metadata.pipeline import Pipeline
from d3m.metadata.base import Context
from d3m.metadata import pipeline_run as pipeline_run_module

from axolotl.backend.base import RunnerBase
from axolotl.utils.pipeline import PipelineResult
from axolotl.utils.schemas import ContainerType


class SimpleRunner(RunnerBase):
def __init__(self, *, random_seed: int = 0, volumes_dir: str = None, scratch_dir: str = None) -> None:
super().__init__(random_seed=random_seed, volumes_dir=volumes_dir, scratch_dir=scratch_dir)
self.fitted_pipelines = {}
self.request_results = {}

with d3m_utils.silence():
self.runtime_environment = pipeline_run_module.RuntimeEnvironment()

def get_request(self, request_id: str) -> PipelineResult:
"""
A method that returns the result of a request.

Parameters
----------
request_id : str
Request id of data to retrieve

Returns
-------
PipelineResult
A PipelineResult instance that contains the information.
"""
if request_id in self.request_results:
return self.request_results[request_id]
else:
return PipelineResult(fitted_pipeline_id='')

def fit_pipeline_request(self, problem_description: Problem, pipeline: Pipeline,
input_data: typing.Sequence[ContainerType], *, timeout: float = None,
expose_outputs: bool = False) -> str:
"""
A method that submits a fit_pipeline job.

Parameters
----------
problem_description : Problem
A problem description.
pipeline : Pipeline
The pipeline that is going to be fitted.
input_data : typing.Sequence[ContainerType]
A list of D3M containers.
timeout : float
Maximum amount of time, in seconds, that the pipeline is allowed to run.
expose_outputs : bool
A flag that enables exposing every intermediate result computed from the input_data.

Returns
-------
str
A request id.
"""
request_id = str(uuid.uuid4())
pipeline_result = PipelineResult(pipeline=pipeline)
pipeline_result.status = "RUNNING"
pipeline_result.method_called = "fit"

is_standard_pipeline = False
if len(input_data) == 1 and len(pipeline.outputs) == 1:
is_standard_pipeline = True

runtime, output, result = runtime_module.fit(
pipeline=pipeline, inputs=input_data, problem_description=problem_description, context=Context.TESTING,
hyperparams=None, random_seed=self.random_seed, volumes_dir=self.volumes_dir,
scratch_dir=self.scratch_dir,
runtime_environment=self.runtime_environment, is_standard_pipeline=is_standard_pipeline,
expose_produced_outputs=expose_outputs
)

if result.has_error():
pipeline_result.status = "ERRORED"
pipeline_result.error = result.error
else:
pipeline_result.status = "COMPLETED"
pipeline_result.exposed_outputs = result.values
pipeline_result.output = output
fitted_pipeline_id = str(uuid.uuid4())
pipeline_result.fitted_pipeline_id = fitted_pipeline_id
self.fitted_pipelines[fitted_pipeline_id] = runtime

pipeline_result.pipeline_run = result.pipeline_run
self.request_results[request_id] = pipeline_result

return request_id

def produce_pipeline_request(self, fitted_pipeline_id: str, input_data: typing.Sequence[ContainerType], *,
timeout: float = None, expose_outputs: bool = False) -> str:
"""
A method that submits a produce pipeline request.

Parameters
----------
fitted_pipeline_id : str
The id of the fitted pipeline to use to produce results.
input_data : typing.Sequence[ContainerType]
A list of D3M containers.
timeout : float
Maximum amount of time, in seconds, that the pipeline is allowed to run.
expose_outputs : bool
A flag that enables exposing every intermediate result computed from the input_data.

Returns
-------
str
A request id.
"""
request_id = str(uuid.uuid4())

pipeline_result = PipelineResult(fitted_pipeline_id=fitted_pipeline_id)
pipeline_result.status = "RUNNING"
pipeline_result.method_called = "produce"
pipeline_result.fitted_pipeline_id = fitted_pipeline_id

output, result = runtime_module.produce(
fitted_pipeline=self.fitted_pipelines[fitted_pipeline_id], test_inputs=input_data,
expose_produced_outputs=expose_outputs
)

if result.has_error():
pipeline_result.status = "ERRORED"
pipeline_result.error = result.error
else:
pipeline_result.status = "COMPLETED"
pipeline_result.output = output
pipeline_result.exposed_outputs = result.values

pipeline_result.pipeline_run = result.pipeline_run
self.request_results[request_id] = pipeline_result

return request_id

def evaluate_pipeline_request(
self, problem_description: Problem, pipeline: Pipeline,
input_data: typing.Sequence[ContainerType], *, metrics: typing.Sequence[typing.Dict],
data_preparation_pipeline: Pipeline = None, scoring_pipeline: Pipeline = None,
data_preparation_params: typing.Dict[str, str] = None, scoring_params: typing.Dict[str, str] = None,
timeout: float = None
) -> str:
request_id = str(uuid.uuid4())

pipeline_result = PipelineResult(pipeline=pipeline)
pipeline_result.status = "RUNNING"
pipeline_result.method_called = "evaluate"

scores, results = runtime_module.evaluate(
pipeline=pipeline, inputs=input_data, data_pipeline=data_preparation_pipeline,
scoring_pipeline=scoring_pipeline, problem_description=problem_description,
data_params=data_preparation_params, metrics=metrics, context=Context.TESTING,
scoring_params=scoring_params, hyperparams=None, random_seed=self.random_seed,
data_random_seed=self.random_seed, scoring_random_seed=self.random_seed,
volumes_dir=self.volumes_dir, scratch_dir=self.scratch_dir, runtime_environment=self.runtime_environment
)

if results.has_error():
pipeline_result.status = "ERRORED"
pipeline_result.error = [result.error for result in results]
else:
pipeline_result.status = "COMPLETED"
pipeline_result.scores = runtime_module.combine_folds(scores)

self.request_results[request_id] = pipeline_result
return request_id
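
# In contrast to RayRunner, SimpleRunner executes every request synchronously in
# the calling process: each *_request method runs the job to completion, stores
# the PipelineResult in self.request_results, and get_request() is then a plain
# dictionary lookup. A minimal sketch (assuming `problem`, `pipeline` and
# `dataset` are loaded elsewhere):
#
#   runner = SimpleRunner(random_seed=0)
#   rid = runner.fit_pipeline_request(problem_description=problem,
#                                     pipeline=pipeline, input_data=[dataset])
#   fit_result = runner.get_request(rid)  # already computed at this point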


+ 0
- 0
axolotl/axolotl/d3m_grpc/__init__.py View File


+ 127
- 0
axolotl/axolotl/d3m_grpc/constants.py View File

@@ -0,0 +1,127 @@
import os
import json
import re

from axolotl.utils.resources import check_directory


# A class to wrap environment variables under the d3m scope.
class EnvVars:
# A label for the setting under which the pod is being run; possible
# values: ta2, ta2ta3. This variable is available only for informative
# purposes; it is no longer used to change the overall mode of operation
# of the TA2 system, because TA2 evaluation now happens through the
# TA2-TA3 API as well.
D3MRUN = 'run'
PROJECT_ROOT = os.path.join(os.path.dirname(__file__), '../..')
# A location of dataset(s), can contain multiple datasets in arbitrary
# directory structure, read-only
D3MINPUTDIR = '/input_dir'
# A location to problem description to use (should be under D3MINPUTDIR),
# datasets are linked from the problem description using IDs, those datasets
# should exist inside D3MINPUTDIR
D3MPROBLEMPATH = 'problem_path'
# A location of output files, shared by TA2 and TA3 pods (and probably data
# mart)
D3MOUTPUTDIR = os.path.join(PROJECT_ROOT, 'output_dir')
# A local-to-host directory provided; used by memory sharing mechanisms
D3MLOCALDIR = os.path.join(D3MOUTPUTDIR, 'temp', 'plasma')
# A path to the volume with primitives' static files
D3MSTATICDIR = None
# Available CPU units in Kubernetes specification
D3MCPU = 0
# Available CPU units in Kubernetes specification
D3MRAM = 0
# Time limit for the search phase (available to the pod), in seconds
D3MTIMEOUT = -1

# Plasma socket
PLASMA_SOCKET = '/tmp/plasma'

# datamart uri DATAMART_URL_NYU
DATAMART_URL_NYU = 'https://datamart.d3m.vida-nyu.org'

if 'D3MRUN' in os.environ:
D3MRUN = os.environ['D3MRUN']
if 'D3MINPUTDIR' in os.environ:
D3MINPUTDIR = os.environ['D3MINPUTDIR']
if 'D3MPROBLEMPATH' in os.environ:
D3MPROBLEMPATH = os.environ['D3MPROBLEMPATH']
if 'D3MOUTPUTDIR' in os.environ:
D3MOUTPUTDIR = os.environ['D3MOUTPUTDIR']
if 'D3MLOCALDIR' in os.environ:
D3MLOCALDIR = os.environ['D3MLOCALDIR']
if 'D3MSTATICDIR' in os.environ:
D3MSTATICDIR = os.environ['D3MSTATICDIR']
if 'D3MCPU' in os.environ:
D3MCPU = int(float(os.environ['D3MCPU']))
# if we don't set it or it's too low, set it to 4
# if D3MCPU < 4:
# D3MCPU = 4
if 'D3MRAM' in os.environ:
D3MRAM = int(re.search(r'\d+', os.environ['D3MRAM']).group())
if 'D3MTIMEOUT' in os.environ:
D3MTIMEOUT = os.environ['D3MTIMEOUT']
if 'PLASMA_SOCKET' in os.environ:
PLASMA_SOCKET = os.environ['PLASMA_SOCKET']
if 'DATAMART_URL_NYU' in os.environ:
DATAMART_URL_NYU = os.environ['DATAMART_URL_NYU']


# #
class Path:
# Temporary directories.
# A temporary directory for other things.
TEMP_STORAGE_ROOT = os.path.join(EnvVars.D3MOUTPUTDIR, 'temp/')
# A temporary directory to store other stuff between ta2-ta3
OTHER_OUTPUTS = os.path.join(TEMP_STORAGE_ROOT, 'other_outputs')
# To deprecate after figure out what to do with executables.
TEMP_PROBLEM_DESC = os.path.join(TEMP_STORAGE_ROOT, 'problem_description')

check_directory(TEMP_STORAGE_ROOT)
check_directory(OTHER_OUTPUTS)
check_directory(TEMP_PROBLEM_DESC)


class SearchPath:

def __init__(self, search_id):
self.base_path = os.path.join(EnvVars.D3MOUTPUTDIR, search_id)

# A directory with ranked pipelines to be evaluated, named
# <pipeline id>.json; these files should have additional field pipeline_rank
self.pipelines_ranked = os.path.join(self.base_path, 'pipelines_ranked')
check_directory(self.pipelines_ranked)

# A directory with successfully scored pipelines during the search,
# named <pipeline id>.json
self.pipelines_scored = os.path.join(self.base_path, 'pipelines_scored')
check_directory(self.pipelines_scored)
# A directory of full pipelines which have not been scored or ranked for any
# reason, named <pipeline id>.json
self.pipelines_searched = os.path.join(self.base_path, 'pipelines_searched')
check_directory(self.pipelines_searched)
# A directory with any subpipelines referenced from pipelines in
# pipelines_* directories, named <pipeline id>.json
self.subpipelines = os.path.join(self.base_path, 'subpipelines')
check_directory(self.subpipelines)
# A directory with pipeline run records in YAML format, multiple can be
# stored in the same file, named <pipeline run id>.yml
self.pipeline_runs = os.path.join(self.base_path, 'pipeline_runs')
check_directory(self.pipeline_runs)
# A directory where TA2 system can store any additional datasets to be
# provided during training and testing to their pipelines; each dataset
# should be provided in a sub-directory in a D3M dataset format; all
# datasets here should have an unique ID; in the case that additional
# datasets are provided, TA2 should output also pipeline run documents for
# their ranked pipelines because those pipeline run documents contain
# information how to map these additional inputs to pipeline inputs
self.additional_inputs = os.path.join(self.base_path, 'additional_inputs')
check_directory(self.additional_inputs)


# A class that wraps a block list of primitives.
# To generate this list, run modules.utils.primitive_selection.
class PrimitivesList:
with open(os.path.join(os.path.dirname(__file__), '..', 'utils', 'resources', 'blocklist.json'), 'r') as file:
BlockList = json.load(file)
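
# Example of the override mechanism above: every attribute of EnvVars has a
# default that is replaced by the corresponding environment variable when
# present, so running with `D3MCPU=8 D3MRAM=16Gi` yields EnvVars.D3MCPU == 8 and
# EnvVars.D3MRAM == 16 (the first integer found in the string).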

+ 854
- 0
axolotl/axolotl/d3m_grpc/server.py View File

@@ -0,0 +1,854 @@
import argparse
import json
import logging
import os
import pathlib
import time
import warnings
from concurrent import futures
import ray
import uuid

import google.protobuf.timestamp_pb2 as p_timestamp
import grpc
from d3m import utils as d3m_utils, index as d3m_index
from d3m.metadata import problem as problem_module
from d3m.metadata.pipeline import Resolver
from d3m import container
from d3m import runtime as runtime_module
from d3m.metadata.base import Context
from ta3ta2_api import core_pb2, core_pb2_grpc, primitive_pb2, value_pb2, utils

from axolotl.backend.ray import RayRunner
from axolotl.algorithms.dummy import DummySearch, dummy_ranking_function
from axolotl.algorithms.data_driven_search import DataDrivenSearch
from axolotl.utils.pipeline import load_pipeline, save_pipeline
from axolotl.d3m_grpc.constants import SearchPath, EnvVars, PrimitivesList, Path
from axolotl.utils import resources as resources_module, schemas as schemas_utils

from pprint import pprint


__version__ = '2020.4.4_pre'
_ONE_DAY_IN_SECONDS = 60 * 60 * 24

logger = logging.getLogger(__name__)
AGENT = 'TAMU.10.0_pre'
ALLOWED_VALUE_TYPES = ['RAW', 'DATASET_URI', 'CSV_URI']
SUPPORTED_EXTENSIONS = []


def available_primitives():
primitives_info = []

with d3m_utils.silence():
for primitive_path in d3m_index.search():
if primitive_path in PrimitivesList.BlockList:
continue

try:
primitive = d3m_index.get_primitive(primitive_path)
primitive_id = primitive.metadata.query()['id']
version = primitive.metadata.query()['version']
python_path = primitive.metadata.query()['python_path']
name = primitive.metadata.query()['name']
digest = primitive.metadata.query().get('digest', None)
primitive_info = {
'id': primitive_id,
'version': version,
'python_path': python_path,
'name': name,
'digest': digest
}
primitives_info.append(primitive_info)
except Exception:
continue
return primitives_info


PRIMITIVES_LIST = available_primitives()


@ray.remote
class SearchWrappers:
def __init__(self, search_class, problem_description, backend, primitives_blocklist=None, ranking_function=None, n_workers=2):
self.search_algorithm = search_class(problem_description=problem_description, backend=backend,
primitives_blocklist=primitives_blocklist, ranking_function=ranking_function,
n_workers=n_workers)
self._seen_index = 0
self.has_input_data = False
self.time_left = None
self.active_search = True
self.save_path = SearchPath(self.search_algorithm.search_id)

def search_request(self, time_left, input_data=None):
time_start = time.time()
if not self.has_input_data:
self.search_algorithm.input_data = input_data
self.time_left = time_left
self.has_input_data = True

results = self.search_algorithm._search(time_left)
self.search_algorithm.history += results
succeed_pipelines = []
for result in results:
print('pipeline', result.pipeline.id, result.status)
# save all results in pipelines searched
save_pipeline(result.pipeline, self.save_path.pipelines_searched)

# save all pipeline runs
resources_module.copy_file(result.pipeline_run, self.save_path.pipeline_runs)

# we filter the ones that were completed
if result.status == 'COMPLETED':
# since we were able to score it, we put a copy into the pipelines_scored directory
save_pipeline(result.pipeline, self.save_path.pipelines_scored)
succeed_pipelines.append(result)

self.time_left -= time.time() - time_start
return succeed_pipelines

def end_search(self):
self.active_search = False

def is_search_active(self):
return self.active_search

def get_search_id(self):
return self.search_algorithm.search_id

def get_time_left(self):
return self.time_left


class Core(core_pb2_grpc.CoreServicer):
"""
A class that works as a server that provides support for the pipeline searches, and provides the interfaces
defined on the TA3-2 API.

Attributes
----------
version: str
A str that represents the version of the TA3-TA2 API being supported.
user_agents: dict()
A simple dictionary that keeps the relation of the different users.
manager: ExecutionManager
Schedules the searches and all resources related to the search.
"""

def __init__(self):
logger.info('########## Initializing Service ##########')
self.version = core_pb2.DESCRIPTOR.GetOptions().Extensions[core_pb2.protocol_version]
self.n_workers = EnvVars.D3MCPU
if self.n_workers > 7:
self.n_workers = int(self.n_workers/2) + 1
print('Server n_workers', self.n_workers)
self.backend = RayRunner(random_seed=0, volumes_dir=EnvVars.D3MSTATICDIR, scratch_dir=Path.TEMP_STORAGE_ROOT,
blocklist=PrimitivesList.BlockList, store_results=True, n_workers=self.n_workers)
self.searches = {}
self.request_mapping = {}
self.solutions = {}
self.problem_descriptions = {}

# TODO add support for templates
def SearchSolutions(self, request, context):
user_agent = request.user_agent
logger.info('method=SearchSolution, agent=%s', user_agent)

# Checking version of protocol.
if request.version != self.version:
logger.info('method=SearchSolution, info=Different api version %s', self.version)

# Types allowed by client
allowed_value_types = list(request.allowed_value_types)

if not allowed_value_types:
allowed_value_types = ALLOWED_VALUE_TYPES

problem_description = utils.decode_problem_description(request.problem)

# Parsing and storing Pipeline Template (store this to a file instead of passing it)
with d3m_utils.silence():
template = utils.decode_pipeline_description(pipeline_description=request.template,
resolver=Resolver(primitives_blocklist=PrimitivesList.BlockList))

time_bound_search = request.time_bound_search
time_bound_search = time_bound_search * 60

input_data = [load_data(utils.decode_value(x)) for x in request.inputs]

search = SearchWrappers.remote(search_class=DataDrivenSearch, problem_description=problem_description,
backend=self.backend, primitives_blocklist=PrimitivesList.BlockList,
ranking_function=dummy_ranking_function, n_workers=self.n_workers)

request_id = search.get_search_id.remote()
search_id = ray.get(request_id)

# print('got search_id')
self.searches[search_id] = search
request_id = self.searches[search_id].search_request.remote(time_left=time_bound_search, input_data=input_data)

self.request_mapping[search_id] = request_id
self.solutions[search_id] = []
self.problem_descriptions[search_id] = problem_description
response = core_pb2.SearchSolutionsResponse(search_id=search_id)
return response

def GetSearchSolutionsResults(self, request, context):
search_id = request.search_id
logger.info('method=GetSearchSolutionsResults, search_id=%s', search_id)
request_id = self.request_mapping[search_id]

progress_start = p_timestamp.Timestamp()
progress_end = p_timestamp.Timestamp()

all_ticks = 0
done_ticks = 0

# Yield RUNNING so the client knows the search is running.
progress = core_pb2.Progress(state='RUNNING', status='Running Search', start=progress_start)
response = core_pb2.GetSearchSolutionsResultsResponse(progress=progress)
yield response

has_solution = False

succeed_pipelines = ray.get(request_id)
time_left_id = self.searches[search_id].get_time_left.remote()
time_left = ray.get(time_left_id)

while True:
start_time = time.time()

# if no time left we stop
if time_left < 5:
break

# in case a signal from StopSearchSolutions was sent to stop the search
is_active_id = self.searches[search_id].is_search_active.remote()
is_active = ray.get(is_active_id)

if not is_active:
logger.info('method=GetSearchSolutionsResults, search_id={} message=SearchStopped'.format(search_id))
break

for succeed_pipeline in succeed_pipelines:
has_solution = True
logger.info('method=GetSearchSolutionsResults, search_id={} solution_id={}'.format(
search_id, succeed_pipeline.pipeline.id))
response = core_pb2.GetSearchSolutionsResultsResponse(
progress=progress,
done_ticks=done_ticks,
all_ticks=all_ticks,
solution_id=succeed_pipeline.pipeline.id,
internal_score=1-succeed_pipeline.rank,
scores=[core_pb2.SolutionSearchScore(scores=encode_scores(succeed_pipeline))]
)
self.solutions[search_id].append(succeed_pipeline.pipeline.id)
yield response

finished, running = ray.wait([request_id], timeout=1)

if finished:
succeed_pipelines = ray.get(request_id)
request_id = self.searches[search_id].search_request.remote(time_left=time_left)
else:
succeed_pipelines = []

time.sleep(1)

time_left -= time.time() - start_time

if has_solution:
progress_state = 'COMPLETED'
progress_status = 'Search completed'
else:
progress_state = 'ERRORED'
progress_status = 'No solution found'

logger.info('method=GetSearchSolutionsResults, search_id={}, status={}, message={}'.format(
search_id, progress_state, progress_status)
)
progress_end.GetCurrentTime()
progress = core_pb2.Progress(state=progress_state, status=progress_status,
start=progress_start, end=progress_end)
response = core_pb2.GetSearchSolutionsResultsResponse(progress=progress, done_ticks=done_ticks,
all_ticks=all_ticks,)
yield response

def EndSearchSolutions(self, request, context):
search_id = request.search_id
logger.info('method=EndSearchSolutions search_id=%s', search_id)
ray.kill(self.searches[search_id])
del self.searches[search_id]
response = core_pb2.EndSearchSolutionsResponse()
return response

def StopSearchSolutions(self, request, context):
search_id = request.search_id
self.searches[search_id].end_search.remote()
logger.info('method=StopSearchSolutions search_id=%s', search_id)
response = core_pb2.StopSearchSolutionsResponse()
return response

def DescribeSolution(self, request, context):
solution_id = request.solution_id
logger.info('method=DescribeSolution, solution_id=%s', solution_id)

pipeline, _, _ = self.get_solution_problem(solution_id)
if pipeline is None:
logger.info('method=DescribeSolution, solution_id=%s, error=Solution_id not found', solution_id)
response = core_pb2.DescribeSolutionResponse()
return response

with d3m_utils.silence():
pipeline = utils.encode_pipeline_description(pipeline, ALLOWED_VALUE_TYPES, Path.TEMP_STORAGE_ROOT)

response = core_pb2.DescribeSolutionResponse(pipeline=pipeline)
return response

def ScoreSolution(self, request, context):
solution_id = request.solution_id
logger.info('method=ScoreSolution, solution_id=%s', solution_id)

pipeline, problem_description, _ = self.get_solution_problem(solution_id)
if pipeline is None:
logger.info('method=ScoreSolution, solution_id=%s, status=ERRORED, error=Solution_id not found', solution_id)
response = core_pb2.ScoreSolutionResponse()
return response

input_data = [load_data(utils.decode_value(x)) for x in request.inputs]
metrics = [utils.decode_performance_metric(metric) for metric in request.performance_metrics]
scoring_pipeline = schemas_utils.get_scoring_pipeline()
data_preparation_params = decode_scoring_configuration(request.configuration)
data_preparation_pipeline = schemas_utils.get_splitting_pipeline(data_preparation_params['method'])

request_id = self.backend.evaluate_pipeline_request(
problem_description=problem_description, pipeline=pipeline, input_data=input_data,
metrics=metrics, data_preparation_pipeline=data_preparation_pipeline,
scoring_pipeline=scoring_pipeline, data_preparation_params=data_preparation_params)

response = core_pb2.ScoreSolutionResponse(request_id=request_id)
return response

def GetScoreSolutionResults(self, request, context):
request_id = request.request_id
logger.info('method=GetScoreSolutionResults, request_id=%s', request_id)

progress_start = p_timestamp.Timestamp()
progress_end = p_timestamp.Timestamp()
progress_start.GetCurrentTime()

progress = core_pb2.Progress(state='RUNNING', status='Running score job', start=progress_start)
response = core_pb2.GetScoreSolutionResultsResponse(progress=progress)
yield response

pipeline_result = self.backend.get_request(request_id)
progress_end.GetCurrentTime()

if pipeline_result.error is None:
progress = core_pb2.Progress(
state='COMPLETED',
status='Score job COMPLETED',
start=progress_start,
end=progress_end
)

response = core_pb2.GetScoreSolutionResultsResponse(
progress=progress, scores=encode_scores(pipeline_result))
else:
progress = core_pb2.Progress(
state='ERRORED',
status=str(pipeline_result.error),
start=progress_start,
end=progress_end
)

response = core_pb2.GetScoreSolutionResultsResponse(progress=progress)
yield response
return

def FitSolution(self, request, context):
solution_id = request.solution_id
logger.info('method=FitSolution solution_id=%s', solution_id)

pipeline, problem_description, _ = self.get_solution_problem(solution_id)
if pipeline is None:
logger.info('method=FitSolution, solution_id=%s, status=ERRORED, error=Solution_id not found', solution_id)
response = core_pb2.FitSolutionResponse()
return response

input_data = [load_data(utils.decode_value(x)) for x in request.inputs]

expose_outputs = len(request.expose_outputs) > 0

request_id = self.backend.fit_pipeline_request(
problem_description=problem_description, pipeline=pipeline,
input_data=input_data, expose_outputs=expose_outputs
)

response = core_pb2.FitSolutionResponse(request_id=request_id)
return response

def GetFitSolutionResults(self, request, context):
request_id = request.request_id
logger.info('method=GetFitSolutionResults request_id=%s', request_id)

progress_start = p_timestamp.Timestamp()
progress_end = p_timestamp.Timestamp()
progress_start.GetCurrentTime()

progress = core_pb2.Progress(state='RUNNING', status='Running fit job', start=progress_start)
response = core_pb2.GetFitSolutionResultsResponse(progress=progress)
yield response

pipeline_result = self.backend.get_request(request_id)
progress_end.GetCurrentTime()

if pipeline_result.error is None:
progress = core_pb2.Progress(
state='COMPLETED',
status='Fit job COMPLETED',
start=progress_start,
end=progress_end
)
response = core_pb2.GetFitSolutionResultsResponse(
progress=progress, steps=[], exposed_outputs=encode_exposed_values(pipeline_result.exposed_outputs),
fitted_solution_id=pipeline_result.fitted_pipeline_id
)
else:
progress = core_pb2.Progress(
state='ERRORED',
status=str(pipeline_result.error),
start=progress_start,
end=progress_end
)

response = core_pb2.GetFitSolutionResultsResponse(progress=progress)
yield response
return

def ProduceSolution(self, request, context):
fitted_solution_id = request.fitted_solution_id
logger.info('method=ProduceSolution, fitted_solution_id=%s', fitted_solution_id)

if not self.backend.fitted_pipeline_id_exists(fitted_solution_id):
logger.info(
'method=ProduceSolution, fitted_solution_id=%s, status=ERRORED info=No fitted_solution_id found', fitted_solution_id)
response = core_pb2.ProduceSolutionResponse()
return response

input_data = [load_data(utils.decode_value(x)) for x in request.inputs]

expose_outputs = len(request.expose_outputs) > 0

request_id = self.backend.produce_pipeline_request(fitted_pipeline_id=fitted_solution_id,
input_data=input_data, expose_outputs=expose_outputs)
response = core_pb2.ProduceSolutionResponse(request_id=request_id)
return response

# TODO add expose_outputs to files
def GetProduceSolutionResults(self, request, context):
request_id = request.request_id
logger.info('method=GetProduceSolutionResults, request_id=%s', request_id)

progress_start = p_timestamp.Timestamp()
progress_end = p_timestamp.Timestamp()
progress_start.GetCurrentTime()

progress = core_pb2.Progress(state='RUNNING', status='Running produce job', start=progress_start)
response = core_pb2.GetProduceSolutionResultsResponse(progress=progress)
yield response

pipeline_result = self.backend.get_request(request_id)
progress_end.GetCurrentTime()

if pipeline_result.error is None:
progress = core_pb2.Progress(
state='COMPLETED',
status='Produce job COMPLETED',
start=progress_start,
end=progress_end
)
step_progress = []

response = core_pb2.GetProduceSolutionResultsResponse(
progress=progress, steps=step_progress, exposed_outputs=encode_exposed_values(pipeline_result.exposed_outputs))
else:
progress = core_pb2.Progress(
state='ERRORED',
status=str(pipeline_result.error),
start=progress_start,
end=progress_end
)

response = core_pb2.GetProduceSolutionResultsResponse(progress=progress)
yield response
return

def SolutionExport(self, request, context):
solution_id = request.solution_id
rank = request.rank

try:
pipeline, _, search_id = self.get_solution_problem(solution_id)
except Exception:
pipeline = None

if pipeline is None:
logger.info('method=SolutionExport, solution_id=%s, status=ERRORED, error=No solution_id found', solution_id)
else:
logger.info('method=SolutionExport solution_id=%s', solution_id)
save_pipeline(pipeline, SearchPath(search_id).pipelines_ranked, rank=rank)
response = core_pb2.SolutionExportResponse()
return response

# def SaveSolution(self, request, context):
# solution_id = request.solution_id
# logger.info('method=SaveSolution solution_id=%s', solution_id)
#
# if solution_id not in self.manager.solutions:
# logger.info('method=SaveSolution, solution_id=%s, error=Solution_id not found', solution_id)
# response = core_pb2.SaveSolutionResponse()
# else:
# solution_uri = self.manager.save_solution(solution_id)
# response = core_pb2.SaveSolutionResponse(solution_uri=solution_uri)
# return response

# def LoadSolution(self, request, context):
# solution_uri = request.solution_uri
# logger.info('method=LoadSolution solution_uri=%s', solution_uri)
#
# if not os.path.exists(solution_uri):
# logger.info('method=LoadSolution, solution_uri=%s, error=solution_uri not found', solution_uri)
# response = core_pb2.LoadSolutionResponse()
# else:
# solution_id = self.manager.load_solution(solution_uri)
# response = core_pb2.LoadSolutionResponse(solution_id=solution_id)
# return response

# def SaveFittedSolution(self, request, context):
# fitted_solution_id = request.fitted_solution_id
# logger.info('method=SaveFittedSolution, fitted_solution_id=%s', fitted_solution_id)
#
# if fitted_solution_id not in self.manager.fitted_solutions:
# logger.info('method=SaveFittedSolution, fitted_solution_id=%s, status=ERRORED, '
# 'info=No fitted_solution_id found', fitted_solution_id)
# response = core_pb2.SaveFittedSolutionResponse()
# else:
# fitted_solution_uri = self.manager.save_fitted_solution(fitted_solution_id)
# response = core_pb2.SaveFittedSolutionResponse(fitted_solution_uri=fitted_solution_uri)
# return response

# def LoadFittedSolution(self, request, context):
# fitted_solution_uri = request.fitted_solution_uri
# logger.info('method=LoadFittedSolution solution_uri=%s', fitted_solution_uri)
#
# if not os.path.exists(fitted_solution_uri):
# logger.info('method=LoadFittedSolution, solution_uri=%s, error=solution_uri not found', fitted_solution_uri)
# response = core_pb2.LoadFittedSolutionResponse()
# else:
# fitted_solution_id = self.manager.load_fitted_solution(fitted_solution_uri)
# response = core_pb2.LoadFittedSolutionResponse(fitted_solution_id=fitted_solution_id)
# return response

# def ScorePredictions(self, request, context):
# logger.info('method=ScorePredictions')
# predictions = utils.decode_value(request.predictions)
# score_input = utils.decode_value(request.score_input)
# problem = utils.decode_problem_description(request.problem)
# metrics = [utils.decode_performance_metric(_metric) for _metric in request.metric]
#
# scores, score_result = self.manager.score_predictions(predictions, score_input, problem, metrics)
# if score_result.has_error():
# logger.info('method=ScorePredictions, error={}', score_result.error)
# response = core_pb2.ScorePredictionsResponse()
# else:
# scores = self.encode_scores(scores)
# response = core_pb2.ScorePredictionsResponse(scores=scores)
# return response

def DataAvailable(self, request, context):
user_agent = request.user_agent
version = request.version
time_bound = request.time_bound

logger.info('method=DataAvailable, agent={}, version={}, time_bound={}'.format(
user_agent, version, time_bound))
response = core_pb2.DataAvailableResponse()
return response

def SplitData(self, request, context):
input_data = [load_data(utils.decode_value(x)) for x in request.inputs]
scoring_configuration = decode_scoring_configuration(request.scoring_configuration)
problem_description = utils.decode_problem_description(request.problem)
data_pipeline = schemas_utils.get_splitting_pipeline(scoring_configuration['method'])

data_random_seed = 0
outputs, data_result = runtime_module.prepare_data(
data_pipeline=data_pipeline, problem_description=problem_description,
inputs=input_data, data_params=scoring_configuration, context=Context.TESTING, random_seed=data_random_seed,
volumes_dir=EnvVars.D3MSTATICDIR, scratch_dir=Path.TEMP_STORAGE_ROOT, runtime_environment=None,
)

if data_result.has_error():
logger.info('method=SplitData, error=%s', data_result.error)
response = core_pb2.SplitDataResponse()
yield response
return
else:
for i, (train_output, test_output, score_output) in enumerate(zip(*outputs)):
uri_list = []
for output, tag in (
(train_output, 'train'),
(test_output, 'test'),
(score_output, 'score'),
):
path = os.path.join(
Path.TEMP_STORAGE_ROOT, '{}_output_{}'.format(tag, i), 'datasetDoc.json')
uri = get_uri(path)
output.save(uri)
uri_list.append(uri)
# response
response = core_pb2.SplitDataResponse(
train_output=value_pb2.Value(dataset_uri=uri_list[0]),
test_output=value_pb2.Value(dataset_uri=uri_list[1]),
score_output=value_pb2.Value(dataset_uri=uri_list[2]),
)
yield response

def ListPrimitives(self, request, context):
logger.info('method=ListPrimitives')
primitives_list = []
for primitive_info in PRIMITIVES_LIST:
primitives_list.append(primitive_pb2.Primitive(**primitive_info))
response = core_pb2.ListPrimitivesResponse(primitives=primitives_list)
return response

def Hello(self, request, context):
logger.info('method=Hello')
user_agent = AGENT
version = core_pb2.DESCRIPTOR.GetOptions().Extensions[core_pb2.protocol_version]
allowed_value_types = ALLOWED_VALUE_TYPES
supported_extensions = SUPPORTED_EXTENSIONS

response = core_pb2.HelloResponse(
user_agent=user_agent,
version=version,
allowed_value_types=allowed_value_types,
supported_extensions=supported_extensions
)
return response

def get_solution_problem(self, solution_id):
describe_search_id = None
for search_id, solution_ids in self.solutions.items():
if solution_id in solution_ids:
describe_search_id = search_id
break

if describe_search_id is None:
return None, None, None

solution_path = os.path.join(SearchPath(describe_search_id).pipelines_scored, '{}.json'.format(solution_id))

with d3m_utils.silence():
pipeline = load_pipeline(solution_path)

problem_description = self.problem_descriptions[describe_search_id]
return pipeline, problem_description, describe_search_id


def encode_exposed_values(exposed_values):
encoded_exposed_values = {}
for name, value in exposed_values.items():
if '.csv' in value:
encoded_exposed_values[name] = utils.encode_value(
{'type': 'csv_uri', 'value': get_uri(value)}, ALLOWED_VALUE_TYPES, Path.TEMP_STORAGE_ROOT)
elif '.json' in value:
encoded_exposed_values[name] = utils.encode_value(
{'type': 'dataset_uri', 'value': get_uri(value)}, ALLOWED_VALUE_TYPES, Path.TEMP_STORAGE_ROOT)
return encoded_exposed_values
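
# Illustrative example (hypothetical path, not part of the original file):
# values ending in '.csv' are exposed as csv_uri, while '.json' dataset
# documents are exposed as dataset_uri.
#
#   encode_exposed_values({'outputs.0': '/tmp/scores/data.csv'})
#   # -> {'outputs.0': <value_pb2.Value with csv_uri='file:///tmp/scores/data.csv'>}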


def decode_scoring_configuration(scoring_configuration):
"""
Decode a scoring configuration from grpc

Parameters
----------
scoring_configuration: core_pb2.ScoringConfiguration
A grpc ScoringConfiguration message.

Returns
-------
configuration: dict
A dictionary with the scoring configuration.
"""
method = scoring_configuration.method
configuration = {
'method': method,
'train_score_ratio': str(scoring_configuration.train_test_ratio),
'stratified': str(scoring_configuration.stratified).lower(),
'shuffle': str(scoring_configuration.shuffle).lower(),
'randomSeed': str(scoring_configuration.random_seed),
}
if method == 'K_FOLD':
configuration['number_of_folds'] = str(scoring_configuration.folds)
return configuration
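
# Illustrative example (assuming proto3 defaults for unset fields): a K_FOLD
# message decodes to the string-valued data_params consumed by the splitting
# pipeline in SplitData above.
#
#   msg = core_pb2.ScoringConfiguration(
#       method='K_FOLD', folds=10, shuffle=True, random_seed=42, stratified=True)
#   decode_scoring_configuration(msg)
#   # {'method': 'K_FOLD', 'train_score_ratio': '0.0', 'stratified': 'true',
#   #  'shuffle': 'true', 'randomSeed': '42', 'number_of_folds': '10'}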


def load_data(data):
if data['type'] == 'dataset_uri':
return container.dataset.get_dataset(data['value'])


def get_uri(path):
return pathlib.Path(os.path.abspath(path)).as_uri()


def encode_scores(pipeline_result):
"""
Encode a dict of scores to a GRPC message

Parameters
----------
pipeline_result
A pipeline_result instance that contains the scores and rank to be encoded.

Returns
-------
score_message: GRPC
A GRPC message
"""
ranking = {
'metric': 'RANK',
'value': pipeline_result.rank,
'randomSeed': 0,
'fold': 0,
}

all_scores = pipeline_result.scores.append(ranking, ignore_index=True)

scores = list()
for score in all_scores.to_dict('index').values():
score['random_seed'] = score['randomSeed']
        try:
            score['metric'] = {'metric': problem_module.PerformanceMetric[score['metric']]}
        except KeyError:
            # Not a standard PerformanceMetric (e.g. 'RANK'); keep the raw string.
            score['metric'] = {'metric': score['metric']}

scores.append(utils.encode_score(score, ALLOWED_VALUE_TYPES, Path.TEMP_STORAGE_ROOT))
return scores


def encode_scoring_configuration(scoring_configuration):
"""
    Encode a scoring configuration to a grpc message

Parameters
----------
scoring_configuration: dict
A dictionary with the scoring configuration.

Returns
-------
scoring_configuration: core_pb2.ScoringConfiguration
A grpc ScoringConfiguration message.
"""
if scoring_configuration is None:
return core_pb2.ScoringConfiguration()
else:
method = scoring_configuration['method']
folds = scoring_configuration.get('number_of_folds', None)
if folds is not None:
folds = int(folds)
train_test_ratio = scoring_configuration.get('train_score_ratio', None)
if train_test_ratio is not None:
train_test_ratio = float(train_test_ratio)
shuffle = scoring_configuration.get('shuffle', None)
if shuffle is not None:
shuffle = json.loads(shuffle.lower())
random_seed = scoring_configuration.get('randomSeed', None)
if random_seed is not None:
random_seed = int(random_seed)
stratified = scoring_configuration.get('stratified', None)
if stratified is not None:
stratified = json.loads(stratified.lower())
return core_pb2.ScoringConfiguration(method=method, folds=folds, train_test_ratio=train_test_ratio,
shuffle=shuffle, random_seed=random_seed, stratified=stratified)
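
# Illustrative round trip (a sketch, not part of the original file): encoding
# the dict produced by decode_scoring_configuration yields an equivalent
# message.
#
#   encode_scoring_configuration({
#       'method': 'HOLDOUT', 'train_score_ratio': '0.8', 'shuffle': 'true',
#       'randomSeed': '0', 'stratified': 'false'})
#   # -> ScoringConfiguration(method='HOLDOUT', train_test_ratio=0.8,
#   #                         shuffle=True, random_seed=0, stratified=False)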


class Server:
def __init__(self, arguments):
self.server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
self.core = Core()

core_pb2_grpc.add_CoreServicer_to_server(self.core, self.server)
self.server.add_insecure_port('[::]:45042')

def start(self):
self.server.start()

def stop(self):
self.server.stop(0)


def configure_parser(parser, *, skip_arguments=()):
parser.add_argument(
'-o', '--output-path', type=str, default=os.path.join(os.getcwd(), "output/"),
help="path where the outputs would be stored"
)
    parser.add_argument(
        '-v', '--verbose', type=lambda value: str(value).lower() in ('1', 'true', 'yes'),
        default=True, help="Display detailed log"
    )


def main():
ray.init(webui_host='127.0.0.1')
# Creating parser
parser = argparse.ArgumentParser(description="Starts server from command line")
configure_parser(parser)
arguments = parser.parse_args()

# Setup logger
verbose_format = '%(asctime)s %(levelname)-8s %(processName)-15s [%(filename)s:%(lineno)d] %(message)s'
concise_format = '%(asctime)s %(levelname)-8s %(message)s'
log_format = verbose_format if arguments.verbose else concise_format
logging.basicConfig(format=log_format,
handlers=[logging.StreamHandler(),
logging.FileHandler('{}/d3m.log'.format(Path.TEMP_STORAGE_ROOT), 'w', 'utf-8')],
datefmt='%m/%d %H:%M:%S')
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
warnings.filterwarnings('ignore')

server = Server(arguments)

try:
load_time = time.time()
server.start()
with d3m_utils.silence():
d3m_index.load_all(blocklist=PrimitivesList.BlockList)
        wait_seconds = len(d3m_index.search()) * 0.3
        print('Waiting {:.1f}s for workers to load primitives'.format(wait_seconds))
        time.sleep(wait_seconds)
logger.info('---------- Waiting for Requests ----------')
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
logger.info('############ STOPPING SERVICE ############')
server.stop()


if __name__ == '__main__':
main()

+ 133
- 0
axolotl/axolotl/predefined_pipelines/__init__.py View File

@@ -0,0 +1,133 @@
import json
import os
import uuid

import copy
from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import PrimitiveStep
from d3m.container import DataFrame
from d3m import utils as d3m_utils

from axolotl.predefined_pipelines import preprocessor
from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils

__all__ = ['fetch', 'fetch_from_file']


def fetch(input_data, problem_description, predefined_path=None):
if predefined_path is None:
root = os.path.join(os.path.dirname(__file__), '../..')
predefined_path = os.path.join(root, 'axolotl', 'utils', 'resources', 'default_pipelines.json')
# ToDo should use yield
pipelines = list()
pipelines_from_file = fetch_from_file(problem_description, path=predefined_path)
pipelines_from_preprocessors = _fetch_from_preprocessors(input_data, problem_description)
    for candidate in (
            pipelines_from_file,
            pipelines_from_preprocessors,
    ):
        pipelines.extend(candidate)
return pipelines


def fetch_from_file(problem_description, path):
# ToDo should use yield
task_type, task_subtype, data_types, semi = _get_task_description(problem_description)

pipelines = []
with open(path) as file:
possible_pipelines = json.load(file)
with d3m_utils.silence():
for task_type_in_file, pipeline_infos in possible_pipelines.items():
if task_type_in_file == task_type:
for pipeline_info in pipeline_infos:
pipeline = pipeline_utils.load_pipeline(pipeline_info)
pipelines.append(pipeline)
return pipelines


def _fetch_from_preprocessors(input_data, problem_description):
task_type, task_subtype, data_types, semi = _get_task_description(problem_description)
primitive_candidates = pipeline_utils.get_primitive_candidates(task_type, data_types, semi)

mapped_task_type = schemas_utils.get_task_mapping(task_type)
if mapped_task_type != task_type:
primitive_candidates += pipeline_utils.get_primitive_candidates(mapped_task_type, data_types, semi)

pipelines = []
for primitive_info in primitive_candidates:
if not check_primitive_dataframe_input(primitive_info):
continue
pps = preprocessor.get_preprocessor(
input_data=input_data, problem=problem_description, treatment=primitive_info[1]
)
for pp in pps:
pipeline_description = copy.deepcopy(pp.pipeline_description)
pipeline_description.id = str(uuid.uuid4())
pipeline = _complete_pipeline(
pipeline_description=pipeline_description,
dataframe_step=pp.dataset_to_dataframe_step,
primitive_info=primitive_info,
attributes=pp.attributes,
targets=pp.targets,
resolver=pp.resolver
)
pipelines.append(pipeline)
return pipelines


def check_primitive_dataframe_input(primitive_info):
primitive, _ = primitive_info
primitive_arguments = primitive.metadata.query()['primitive_code']['arguments']
if 'inputs' in primitive_arguments and primitive_arguments['inputs']['type'] == DataFrame:
return True
else:
return False


def get_primitive(name):
primitive = index.get_primitive(name)
return primitive


def _complete_pipeline(pipeline_description, dataframe_step, attributes, targets, resolver, primitive_info):
primitive, specific_primitive = primitive_info
construct_prediction = 'd3m.primitives.data_transformation.construct_predictions.Common'
construct_prediction_primitive = get_primitive(construct_prediction)

_add_primitive_to_pipeline(pipeline_description, primitive, resolver, attributes, targets)
_add_primitive_to_pipeline(pipeline_description, construct_prediction_primitive, resolver,
dataframe_step=dataframe_step)
# Get the last step for the output
last_step_idx = len(pipeline_description.steps) - 1
output = pipeline_utils.int_to_step(last_step_idx)

    # Adding output step to the pipeline
pipeline_description.add_output(name='Predictions from the input dataset', data_reference=output)
return pipeline_description


def _add_primitive_to_pipeline(pipeline_description, primitive, resolver, attributes=None, targets=None,
dataframe_step=None):
step_model = PrimitiveStep(primitive=primitive, resolver=resolver)

if dataframe_step is None:
step_model.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes)
step_model.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference=targets)
else:
last_step_idx = len(pipeline_description.steps) - 1
step_model.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER,
data_reference=pipeline_utils.int_to_step(last_step_idx))
step_model.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference=dataframe_step)
step_model.add_output('produce')
pipeline_description.add_step(step_model)


def _get_task_description(problem_description):
task_description = schemas_utils.get_task_description(problem_description['problem']['task_keywords'])
task_type = task_description['task_type']
task_subtype = task_description['task_subtype']
data_types = task_description['data_types']
semi = task_description['semi']
return task_type, task_subtype, data_types, semi
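
# Illustrative usage (a sketch; `dataset` and `problem` are assumed to be an
# already loaded D3M Dataset and Problem):
#
#   pipelines = fetch(input_data=dataset, problem_description=problem)
#   for pipeline in pipelines:
#       print(pipeline.id)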

+ 278
- 0
axolotl/axolotl/predefined_pipelines/base_preprocessor.py View File

@@ -0,0 +1,278 @@
import typing

import abc
from d3m import index
from d3m.metadata.base import Context, ArgumentType
from d3m.metadata.pipeline import Pipeline, Resolver, PrimitiveStep

from axolotl.utils import pipeline as pipeline_utils

DEFAULT_OUTPUT = '.'


class Preprocessor(abc.ABC):
task: str
treatment: str
expected_data_types: set
unsupported_data_types: set
semi: bool

def __init__(self, metadata, main_resource, data_types, loaded_primitives, problem=None, start_resource='inputs.0'):
self.metadata = metadata
self.main_resource = main_resource
self.data_types = data_types
self.loaded_primitives = loaded_primitives
self.start_resource = start_resource
self.problem = problem
# Creating pipeline
pipeline_description = Pipeline(context=Context.TESTING)
pipeline_description.add_input(name='inputs')
self.pipeline = pipeline_description
self.d2d_step = None
self.attr_step = None
self.targ_step = None
self._generate_pipeline()

    def __init_subclass__(cls, task: str, treatment: str, expected_data_types: set, **kwargs):
        super().__init_subclass__()
        cls.task = task
        cls.treatment = treatment
        cls.expected_data_types = expected_data_types
        cls.unsupported_data_types = kwargs.get('unsupported_data_types')
        cls.semi = kwargs.get('semi', False)

@classmethod
def check_task_treatment(cls, task, treatment):
if not cls.task:
return True
if not cls.treatment:
return cls.task == task
return cls.task == task and cls.treatment == treatment

@classmethod
def check_expected_data_types(cls, data_types):
if not cls.expected_data_types:
return True
return any(data_type in cls.expected_data_types for data_type in data_types)

@classmethod
def check_unsupported_data_types(cls, data_types):
if not cls.unsupported_data_types:
return True
return not any(data_type in cls.unsupported_data_types for data_type in data_types)

@property
def pipeline_description(self) -> Pipeline:
return self.pipeline

@property
def dataset_to_dataframe_step(self) -> typing.Optional[str]:
return self.get_output_str(self.d2d_step) if self.d2d_step else None

@property
def attributes(self) -> typing.Optional[str]:
return self.get_output_str(self.attr_step) if self.attr_step else None

@property
def targets(self) -> typing.Optional[str]:
return self.get_output_str(self.targ_step) if self.targ_step else None

@property
def resolver(self) -> Resolver:
return pipeline_utils.BlackListResolver()

@abc.abstractmethod
def _generate_pipeline(self):
raise NotImplementedError()

@property
def gpu_budget(self) -> float:
return 0

def get_primitive(self, name):
primitive = index.get_primitive(name)
self.download_static_files(primitive)
return primitive

def common_boilerplate(self):
"""
This boilerplate provides the basic init pipline that contains denormalize and dataset_to_dataframe.

Arguments
---------
include_dataset_to_dataframe: bool
Whether to include dataset_to_dataframe step.
include_simple_profiler: bool
whether or not to include simple profiler
"""
metadata = self.metadata
main_resource_id = self.main_resource
start_resource = self.start_resource

        # if there is more than one resource we denormalize
if len(metadata.get_elements(())) > 1:
start_resource = self.add_denormalize_step(start_resource, main_resource_id)

        # Finally we convert the dataset to a dataframe.
dtd_step = self.add_dataset_to_dataframe_step(start_resource)

simple_profiler_step = self.add_primitive_to_pipeline(
primitive=self.loaded_primitives['SimpleProfiler'],
attributes=dtd_step,
hyperparameters=[
('categorical_max_ratio_distinct_values', ArgumentType.VALUE, 1),
('categorical_max_absolute_distinct_values', ArgumentType.VALUE, None)
]
)
self.set_d2d_step(simple_profiler_step)

def tabular_common(self, target_at_column_parser=False):
self.common_boilerplate()

# Simple preprocessor
attributes, targets = self.base(target_at_column_parser=target_at_column_parser)

# Adding Imputer
imputer = self.add_imputer(attributes=attributes)

attributes = self.add_simple_text_handler(imputer, targets)
self.set_attribute_step(attributes)
self.set_target_step(targets)

def base(self, target_at_column_parser=False, exclude_attr_columns=None):
dataset_dataframe_step_pos = self.d2d_step

# Step 2: ColumnParser
column_parser_step = self.add_column_parser_step(data_reference=dataset_dataframe_step_pos)

# Step 3: ExtractAttributes
attributes_step = self.add_extract_col_by_semantic_types_step(
column_parser_step,
['https://metadata.datadrivendiscovery.org/types/Attribute'],
exclude_attr_columns
)
target_source = column_parser_step if target_at_column_parser else dataset_dataframe_step_pos

# Step 4: ExtractTargets
targets_step = self.add_extract_col_by_semantic_types_step(
target_source,
['https://metadata.datadrivendiscovery.org/types/TrueTarget']
)
return attributes_step, targets_step

def add_imputer(self, attributes):
# SklearnImputer
primitive = self.loaded_primitives['Imputer']
configuration = \
primitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams'].configuration
hyperparameters = []
if 'return_result' in configuration:
hyperparameters.append(
('return_result', ArgumentType.VALUE, 'replace')
)
if 'use_semantic_types' in configuration:
hyperparameters.append(
('use_semantic_types', ArgumentType.VALUE, True)
)
hyperparameters.append(
('error_on_no_input', ArgumentType.VALUE, False)
)
imputer = self.add_primitive_to_pipeline(
primitive=primitive,
attributes=attributes,
hyperparameters=hyperparameters
)
return imputer

def add_extract_col_by_semantic_types_step(self, data_reference, target_semantic_types, exclude_columns=None):
if exclude_columns:
hyperparameters = [
('exclude_columns', ArgumentType.VALUE, exclude_columns),
('semantic_types', ArgumentType.VALUE, target_semantic_types)
]
else:
hyperparameters = [
('semantic_types', ArgumentType.VALUE, target_semantic_types)
]
step = self.add_primitive_to_pipeline(
primitive=self.loaded_primitives['ExtractColumnsBySemanticTypes'],
attributes=data_reference,
hyperparameters=hyperparameters
)
return step

def add_denormalize_step(self, start_resource, data):
denormalize_step = self.add_primitive_to_pipeline(
primitive=self.loaded_primitives['Denormalize'],
attributes=start_resource,
hyperparameters=[
('starting_resource', ArgumentType.VALUE, data)
]
)
return denormalize_step

def add_dataset_to_dataframe_step(self, start_resource):
d2d_step = self.add_primitive_to_pipeline(
primitive=self.loaded_primitives['DatasetToDataFrame'],
attributes=start_resource
)
return d2d_step

def add_column_parser_step(self, data_reference, to_parse=None):
if to_parse:
hyperparameters = [
('parse_semantic_types', ArgumentType.VALUE, to_parse)
]
else:
hyperparameters = []
column_parser = self.add_primitive_to_pipeline(
primitive=self.loaded_primitives['ColumnParser'],
attributes=data_reference,
hyperparameters=hyperparameters
)
return column_parser

def add_simple_text_handler(self, attributes, targets):
text_encoder = self.add_primitive_to_pipeline(
primitive=self.loaded_primitives['TextEncoder'],
attributes=attributes,
hyperparameters=[
('encoder_type', ArgumentType.VALUE, 'tfidf')
],
targets=targets
)
return text_encoder

def download_static_files(self, primitive):
primitive_metadata = primitive.metadata.query()
output = DEFAULT_OUTPUT
redownload = False
index.download_files(primitive_metadata, output, redownload)

    def add_primitive_to_pipeline(self, primitive, attributes, hyperparameters=(), targets=None,
                                  produce_collection=False):
inputs_ref = attributes if isinstance(attributes, str) else self.get_output_str(attributes)
step = PrimitiveStep(primitive=primitive, resolver=self.resolver)
step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=inputs_ref)
for hyperparam in hyperparameters:
name, argument_type, data = hyperparam
step.add_hyperparameter(name=name, argument_type=argument_type, data=data)
if targets:
outputs_ref = targets if isinstance(targets, str) else self.get_output_str(targets)
step.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference=outputs_ref)
step.add_output('produce')
if produce_collection:
step.add_output('produce_collection')
self.pipeline.add_step(step)
return step

def get_output_str(self, step):
return pipeline_utils.int_to_step(step.index)

def set_attribute_step(self, attributes):
self.attr_step = attributes

def set_target_step(self, targets):
self.targ_step = targets

def set_d2d_step(self, dataset_2_dataframe):
self.d2d_step = dataset_2_dataframe
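
# Illustrative sketch (not part of the original file): a minimal subclass only
# supplies the class keywords consumed by __init_subclass__ and implements
# _generate_pipeline; the helper methods above wire the steps together.
#
#   class MyTabularPreprocessor(Preprocessor, task=None, treatment=None,
#                               expected_data_types=None):
#       def _generate_pipeline(self):
#           self.tabular_common()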

+ 350
- 0
axolotl/axolotl/predefined_pipelines/preprocessor.py View File

@@ -0,0 +1,350 @@
from d3m import index
from d3m.metadata import base as metadata_base
from d3m.metadata.base import ArgumentType
from d3m.metadata.problem import TaskKeyword

from axolotl.predefined_pipelines.base_preprocessor import Preprocessor
from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils


def get_preprocessor(input_data, problem, treatment):
metadata = input_data.metadata
task_description = schemas_utils.get_task_description(problem['problem']['task_keywords'])
task_type = task_description['task_type']
semi = task_description['semi']
data_types = task_description['data_types']
task = pipeline_utils.infer_primitive_family(task_type=task_type, data_types=data_types, is_semi=semi)
main_resource = pipeline_utils.get_tabular_resource_id(dataset=input_data)

# Loading primitives
primitives = {
'DatasetToDataFrame': 'd3m.primitives.data_transformation.dataset_to_dataframe.Common',
'ColumnParser': 'd3m.primitives.data_transformation.column_parser.Common',
'ExtractColumnsBySemanticTypes': 'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common',
'Denormalize': 'd3m.primitives.data_transformation.denormalize.Common',
'Imputer': 'd3m.primitives.data_cleaning.imputer.SKlearn',
'SimpleProfiler': 'd3m.primitives.schema_discovery.profiler.Common',
'TextEncoder': 'd3m.primitives.data_transformation.encoder.DistilTextEncoder',
}
loaded_primitives = dict()

    for primitive_name in primitives.keys():
        try:
            loaded_primitives[primitive_name] = index.get_primitive(primitives[primitive_name])
        except Exception as error:
            print("Cannot load primitive {}: {}".format(primitives[primitive_name], error))

candidates = []
for preprocessor in preprocessors:
if preprocessor.check_task_treatment(task, treatment) \
and preprocessor.check_expected_data_types(data_types) \
and preprocessor.check_unsupported_data_types(data_types):
candidates.append(preprocessor(metadata, main_resource, data_types, loaded_primitives, problem))
if not candidates:
candidates.append(TabularPreprocessor(metadata, main_resource, data_types, loaded_primitives))
return candidates


class TimeSeriesTabularPreprocessor(Preprocessor, task=metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION.name,
treatment=metadata_base.PrimitiveFamily.CLASSIFICATION.name,
expected_data_types=None,
                                    unsupported_data_types={TaskKeyword.TABULAR.name, TaskKeyword.RELATIONAL.name}):
def _generate_pipeline(self):
time_series_featurization_primitive = self.get_primitive(
'd3m.primitives.feature_extraction.random_projection_timeseries_featurization.DSBOX'
)
time_series_to_list_primitive = self.get_primitive(
'd3m.primitives.data_preprocessing.time_series_to_list.DSBOX'
)

# denormalize -> dataset_to_df
self.common_boilerplate()
dataset_to_dataframe_step = self.d2d_step

# timeseries_to_list
timeseries_tolist_step = self.add_primitive_to_pipeline(
primitive=time_series_to_list_primitive,
attributes=dataset_to_dataframe_step,
)
# timeseries_featurization
timeseries_featurization_step = self.add_primitive_to_pipeline(
primitive=time_series_featurization_primitive,
attributes=timeseries_tolist_step,
)
# extract_col_by_semantic
attr_step = self.add_extract_col_by_semantic_types_step(
timeseries_featurization_step,
['https://metadata.datadrivendiscovery.org/types/Attribute']
)
# extract_col_by_semantic
targ_step = self.add_extract_col_by_semantic_types_step(
dataset_to_dataframe_step,
['https://metadata.datadrivendiscovery.org/types/TrueTarget']
)
self.set_attribute_step(attr_step)
self.set_target_step(targ_step)


class TimeSeriesPreprocessor(Preprocessor, task=metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION.name,
treatment=metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION.name,
expected_data_types=None,
                             unsupported_data_types={TaskKeyword.TABULAR.name, TaskKeyword.RELATIONAL.name}):
def _generate_pipeline(self):
time_series_formatter_primitive = self.get_primitive(
'd3m.primitives.data_preprocessing.data_cleaning.DistilTimeSeriesFormatter'
)
ts_formatter = self.add_primitive_to_pipeline(
primitive=time_series_formatter_primitive,
attributes=self.start_resource
)

dtd_step = self.add_dataset_to_dataframe_step(ts_formatter)
dtd_without_ts_format = self.add_dataset_to_dataframe_step(self.start_resource)

extract_target_step = self.add_extract_col_by_semantic_types_step(
dtd_without_ts_format,
['https://metadata.datadrivendiscovery.org/types/TrueTarget']
)
target_column_parser_step = self.add_column_parser_step(
extract_target_step,
to_parse=[
"http://schema.org/Boolean",
"http://schema.org/Integer",
"http://schema.org/Float",
"https://metadata.datadrivendiscovery.org/types/FloatVector"
]
)
self.set_d2d_step(dtd_without_ts_format)
self.set_attribute_step(dtd_step)
self.set_target_step(target_column_parser_step)


class TimeSeriesForecastingTabularPreprocessor(Preprocessor,
task=metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING.name,
treatment=metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING.name,
expected_data_types={TaskKeyword.GROUPED.name}):
# TODO: Pipeline will fail for integer target because simple_profiler profiles it as Categorical data,
# not Float or Integer.
def _generate_pipeline(self):
grouping_compose_primitive = self.get_primitive(
'd3m.primitives.data_transformation.grouping_field_compose.Common'
)

self.common_boilerplate()

# Do not parse categorical data or GroupingCompose will fail.
column_parser = self.add_column_parser_step(
self.d2d_step, [
"http://schema.org/DateTime",
"http://schema.org/Boolean",
"http://schema.org/Integer",
"http://schema.org/Float",
"https://metadata.datadrivendiscovery.org/types/FloatVector"
]
)

attribute_step = self.add_extract_col_by_semantic_types_step(
column_parser, ['https://metadata.datadrivendiscovery.org/types/Attribute']
)

grouping = self.add_primitive_to_pipeline(
primitive=grouping_compose_primitive,
attributes=attribute_step
)

target_step = self.add_extract_col_by_semantic_types_step(column_parser, [
'https://metadata.datadrivendiscovery.org/types/TrueTarget'
])
self.set_attribute_step(grouping)
self.set_target_step(target_step)


class AudioPreprocessor(Preprocessor, task=metadata_base.PrimitiveFamily.DIGITAL_SIGNAL_PROCESSING.name,
treatment=None,
expected_data_types=None):

def _generate_pipeline(self):
audio_reader_primitive = self.get_primitive(
'd3m.primitives.data_preprocessing.audio_reader.DistilAudioDatasetLoader'
)
audio_feature_extraction_primitive = self.get_primitive(
'd3m.primitives.feature_extraction.audio_transfer.DistilAudioTransfer'
)
audio_reader = self.add_primitive_to_pipeline(
primitive=audio_reader_primitive,
attributes=self.start_resource,
produce_collection=True
)
column_parser = self.add_column_parser_step(
data_reference=audio_reader,
to_parse=[
'http://schema.org/Boolean',
'http://schema.org/Integer',
'http://schema.org/Float',
'https://metadata.datadrivendiscovery.org/types/FloatVector'
]
)
audio_feature = self.add_primitive_to_pipeline(
primitive=audio_feature_extraction_primitive,
attributes='steps.{}.produce_collection'.format(audio_reader.index),
)
target_step = self.add_extract_col_by_semantic_types_step(
column_parser,
[
'https://metadata.datadrivendiscovery.org/types/TrueTarget',
'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
]
)
self.set_d2d_step(audio_reader)
self.set_attribute_step(audio_feature)
self.set_target_step(target_step)


class ImageDataFramePreprocessor(Preprocessor, task=metadata_base.PrimitiveFamily.DIGITAL_IMAGE_PROCESSING.name,
treatment=None,
expected_data_types={TaskKeyword.IMAGE.name}):
def _generate_pipeline(self):
image_reader_primitive = self.get_primitive('d3m.primitives.data_preprocessing.image_reader.Common')
image_feature_extraction_primitive = self.get_primitive(
'd3m.primitives.feature_extraction.image_transfer.DistilImageTransfer')

self.common_boilerplate()
dataset_to_dataframe_step = self.d2d_step

image_reader = self.add_primitive_to_pipeline(
primitive=image_reader_primitive,
attributes=dataset_to_dataframe_step,
hyperparameters=[('return_result', ArgumentType.VALUE, 'replace')]
)
column_parser = self.add_column_parser_step(
data_reference=image_reader,
to_parse=[
'http://schema.org/Boolean',
'http://schema.org/Integer',
'http://schema.org/Float',
'https://metadata.datadrivendiscovery.org/types/FloatVector'
]
)
image_feature_extraction = self.add_primitive_to_pipeline(
primitive=image_feature_extraction_primitive,
attributes=column_parser
)
target_step = self.add_extract_col_by_semantic_types_step(
data_reference=dataset_to_dataframe_step,
target_semantic_types=['https://metadata.datadrivendiscovery.org/types/TrueTarget'],
)
self.set_attribute_step(image_feature_extraction)
self.set_target_step(target_step)


class ImageTensorPreprocessor(Preprocessor, task=metadata_base.PrimitiveFamily.DIGITAL_IMAGE_PROCESSING.name,
treatment=None,
expected_data_types={TaskKeyword.IMAGE.name}):
def _generate_pipeline(self):
dataframe_to_tensor_primitive = self.get_primitive(
'd3m.primitives.data_preprocessing.dataframe_to_tensor.DSBOX'
)
resnet50_featurizer_primitive = self.get_primitive(
'd3m.primitives.feature_extraction.resnet50_image_feature.DSBOX'
)

self.common_boilerplate()
dataset_to_dataframe_step = self.d2d_step

dataframe_to_tensor = self.add_primitive_to_pipeline(
primitive=dataframe_to_tensor_primitive,
attributes=dataset_to_dataframe_step,
hyperparameters=[('return_result', ArgumentType.VALUE, 'replace')]
)
resnet50_featurizer = self.add_primitive_to_pipeline(
primitive=resnet50_featurizer_primitive,
attributes=dataframe_to_tensor,
hyperparameters=[('return_result', ArgumentType.VALUE, 'replace')]
)
target_step = self.add_extract_col_by_semantic_types_step(
dataset_to_dataframe_step,
['https://metadata.datadrivendiscovery.org/types/TrueTarget']
)
self.set_attribute_step(resnet50_featurizer)
self.set_target_step(target_step)


class TabularPreprocessor(Preprocessor, task=None, treatment=None, expected_data_types={TaskKeyword.TABULAR.name}):
def _generate_pipeline(self):
return self.tabular_common()


class CollaborativeFilteringPreprocessor(Preprocessor, task=metadata_base.PrimitiveFamily.COLLABORATIVE_FILTERING.name,
treatment=None,
expected_data_types=None):
def _generate_pipeline(self):
return self.tabular_common(target_at_column_parser=True)


class TextPreprocessor(Preprocessor, task=None, treatment=None,
                       expected_data_types={TaskKeyword.TEXT.name}):
def _generate_pipeline(self):
text_reader_primitive = self.get_primitive('d3m.primitives.data_preprocessing.text_reader.Common')

self.common_boilerplate()

# Simple preprocessor
attributes, targets = self.base()

text_reader_step = self.add_primitive_to_pipeline(
primitive=text_reader_primitive,
attributes=attributes,
hyperparameters=[('return_result', ArgumentType.VALUE, 'replace')]
)
imputer = self.add_imputer(text_reader_step)
attributes = self.add_simple_text_handler(imputer, targets)
self.set_attribute_step(attributes)
self.set_target_step(targets)


class TextSent2VecPreprocessor(Preprocessor, task=None, treatment=None, expected_data_types={TaskKeyword.TEXT.name}):
def _generate_pipeline(self):
        sent2_vec_primitive = self.get_primitive('d3m.primitives.feature_extraction.nk_sent2vec.Sent2Vec')

self.common_boilerplate()

# Simple preprocessor
attributes, targets = self.base()

sent2vec = self.add_primitive_to_pipeline(
primitive=sent2_vec_primitive,
attributes=attributes,
)

imputer = self.add_imputer(sent2vec)
self.set_attribute_step(imputer)
self.set_target_step(targets)


class LupiPreprocessor(Preprocessor, task=None, treatment=None,
expected_data_types={TaskKeyword.LUPI.name}):
def _generate_pipeline(self):
self.common_boilerplate()

privileged_column_indices = [info['column_index'] for info in self.problem['inputs'][0]['privileged_data']]
attributes, targets = self.base(exclude_attr_columns=privileged_column_indices)

imputer = self.add_imputer(attributes)
self.set_attribute_step(imputer)
self.set_target_step(targets)


preprocessors = [
# TODO DSBOX installation has error
# TimeSeriesTabularPreprocessor,
TimeSeriesPreprocessor,
TimeSeriesForecastingTabularPreprocessor,
AudioPreprocessor,
ImageDataFramePreprocessor,
# TODO DSBOX installation has error
# ImageTensorPreprocessor,
CollaborativeFilteringPreprocessor,
TextSent2VecPreprocessor,
TextPreprocessor,
LupiPreprocessor
]

+ 0
- 0
axolotl/axolotl/utils/__init__.py View File


+ 340
- 0
axolotl/axolotl/utils/data_problem.py View File

@@ -0,0 +1,340 @@
import uuid
import numpy
import pandas as pd
from d3m.container import pandas as container_pandas
from d3m.container.dataset import Dataset
from d3m.metadata import base as metadata_base
from d3m.metadata.problem import Problem

from axolotl.utils.schemas import PROBLEM_DEFINITION


def make_unique_columns(data):
"""
Parameters
----------
data : pd.DataFrame
A dataframe to fix the column names.

Returns
-------
The original dataframe where the columns are strings and has a unique name/
"""
seen_columns_name = {}
column_names = []
for column in data.columns:
if column in seen_columns_name:
column_name = str(column) + '_' + str(seen_columns_name[column])
seen_columns_name[column] += 1
else:
seen_columns_name[column] = 0
column_name = str(column)
column_names.append(column_name)
data.columns = column_names
return data
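
# Illustrative example: duplicated column names get a numeric suffix.
#
#   df = pd.DataFrame([[1, 2, 3]], columns=['a', 'a', 'a'])
#   make_unique_columns(df).columns.tolist()
#   # ['a', 'a_0', 'a_1']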


def get_dataset(input_data, target_index=-2, index_column=-1, semantic_types=None, parse=False):
"""
A function that has as input a dataframe, and generates a D3M dataset.

Parameters
----------
input_data : pd.DataFrame
The dataframe to be converted to d3m Dataset.
    target_index : int
        The index of the target column; ignored if not present.
    index_column : int
        The index of the index column; if not provided, an existing d3mIndex column is
        used, otherwise one is generated.
    semantic_types : Sequence[Sequence[str]]
        A list of semantic types to be applied. The sequence must be of the same length as
        the dataframe columns.
    parse :
        A flag that determines whether the dataset will contain parsed columns. By default it is
        set to false to make it compatible with most of the current D3M infrastructure.

Returns
-------
A D3M dataset.
"""
data = make_unique_columns(input_data.copy(deep=True))
if semantic_types is None:
semantic_types = [[] for i in range(len(data.columns))]
for i, _type in enumerate(input_data.dtypes):
if _type == float:
semantic_types[i].append('http://schema.org/Float')
elif _type == int:
semantic_types[i].append('http://schema.org/Integer')

resources = {}

if 'd3mIndex' in data.columns:
index_column = list(data.columns).index("d3mIndex")
else:
if index_column == -1:
data.insert(0, 'd3mIndex', range(len(data)))
semantic_types.insert(0, [])
target_index += 1
index_column = 0

data = container_pandas.DataFrame(data)

    # keep all values as strings when parsing is disabled
if not parse:
data = data.astype(str)
metadata = metadata_base.DataMetadata()

resources['learningData'] = data

metadata = metadata.update(('learningData',), {
'structural_type': type(data),
'semantic_types': [
'https://metadata.datadrivendiscovery.org/types/Table',
'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint',
],
'dimension': {
'name': 'rows',
'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
'length': len(data),
},
})

metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS), {
'dimension': {
'name': 'columns',
'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
'length': len(data.columns),
},
})

for i, column_name in enumerate(data.columns):
if i == index_column:
metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, i), {
'name': column_name,
'structural_type': numpy.int64,
'semantic_types': [
'http://schema.org/Integer',
'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
],
})
else:
_structural_type = str
if semantic_types[i]:
_semantic_types = semantic_types[i]
if 'http://schema.org/Float' in _semantic_types:
_structural_type = numpy.float64
elif 'http://schema.org/Integer' in _semantic_types:
_structural_type = numpy.int64
else:
_semantic_types = ['https://metadata.datadrivendiscovery.org/types/UnknownType']

if not parse:
_structural_type = str
if i == target_index:
_semantic_types += ['https://metadata.datadrivendiscovery.org/types/SuggestedTarget']
else:
_semantic_types += ['https://metadata.datadrivendiscovery.org/types/Attribute']

metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, i), {
'name': column_name,
'structural_type': _structural_type,
'semantic_types': _semantic_types
})

dataset_id = str(uuid.uuid4())
dataset_metadata = {
'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
'structural_type': Dataset,
'id': dataset_id,
'name': dataset_id,
'digest': str(uuid.uuid4()),
'dimension': {
'name': 'resources',
'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'],
'length': len(resources),
},
}

metadata = metadata.update((), dataset_metadata)

dataset = Dataset(resources, metadata)
return dataset


def import_dataframe(data_frame, *, index_column=-1, semantic_types=None):
"""
Function that transforms a dataframe into a dataset.

data_frame : pd.DataFrame
The input dataframe to be converted to d3m Dataset.
index_column : int
The index of the index column.
semantic_types : Sequence[Sequence[str]]
A list of semantic types to be applied. The sequence must be of the same length of
the dataframe columns.

Returns
-------
A D3M dataset.
"""
data = get_dataset(input_data=data_frame, index_column=index_column, semantic_types=semantic_types)
return data


def import_input_data(x, y=None, *, target_index=None, index_column=-1, semantic_types=None, parse=False):
"""
Function that takes an np.array or a dataframe and convert them to a D3M dataset.

x : Union[pd.DataFrame, np.array]
Input features or the features with targets if target index is specified.
y : Union[pd.DataFrame, np.array]
input features or the features with targets if target index is specified.
target_index : int
The index of the target, if index is not present, it will be ignored.
index_column : int
The index of the index target, if not provided it will look for d3m index, if not generate one.
semantic_types : Sequence[Sequence[str]]
A list of semantic types to be applied. The sequence must be of the same length of
the dataframe columns.
parse :
A flag to determine if the dataset will contain parsed columns. By default is set to fault
to make it compatible with most of D3M current infrastructure.

Returns
-------
A D3M dataset.
"""

if y is not None and target_index is not None:
print('Ignoring target index, using y as target')

_target_index = -1
if y is not None:
_x = pd.DataFrame(x)
_y = pd.DataFrame(y)
input_data = pd.concat((_x, _y), axis=1)
_target_index = len(_x.columns)
elif target_index is not None:
input_data = x
else:
        raise ValueError('Targets (y) or target_index must be provided')

if _target_index != -1:
target_index = _target_index
data = get_dataset(input_data=input_data, target_index=target_index,
index_column=index_column, semantic_types=semantic_types, parse=parse)

return data


def generate_problem_description(dataset, task=None, *, task_keywords=None, performance_metrics=None):
"""
A function that simplifies the generation of a problem description.

Parameters
----------
dataset : Dataset
Dataset to be use for pipeline search.
task : str
        A string that represents the problem type, currently only supported: ``binary_classification`` and
``regression``.
task_keywords : List[TaskKeyword]
A list of TaskKeyword.
performance_metrics: List[PerformanceMetric]
A list of PerformanceMetric.

Returns
-------
A Problem
"""
dataset_id = dataset.metadata.query(())['id']
problem_id = dataset_id + '_problem'
schema = 'https://metadata.datadrivendiscovery.org/schemas/v0/problem.json'
version = '4.0.0'

target_column_index = None

for i in range(dataset.metadata.query(('learningData', metadata_base.ALL_ELEMENTS,))['dimension']['length']):
if 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' in \
dataset.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, i,))['semantic_types']:
target_column_index = i
break

if target_column_index is None:
        raise ValueError('Input dataframe does not contain targets')

inputs = {
'dataset_id': dataset_id,
'targets': [{
'column_index': target_column_index,
            'column_name': dataset.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, target_column_index,))['name'],
'resource_id': 'learningData',
'target_index': 0
}]
}

problem = None
if task is None:
if performance_metrics is not None and task_keywords is not None:
problem = {
'performance_metrics': performance_metrics,
'task_keywords': task_keywords
}
else:
if task in PROBLEM_DEFINITION:
problem = PROBLEM_DEFINITION[task]
else:
raise ValueError(task + """ task is not supported in default definitions.
You can define your own task by specifying the task_keywords and performance metrics.""")

problem_description = {
'id': problem_id,
'schema': schema,
'version': version,
'inputs': [inputs],
'problem': problem
}

return Problem(problem_description)


def generate_dataset_problem(x, y=None, task=None, *, target_index=None, index_column=-1,
semantic_types=None, parse=False, task_keywords=None, performance_metrics=None):
"""
    Function that takes an np.array or a dataframe and converts them to a D3M dataset and problem.

    Parameters
    ----------
    x : Union[pd.DataFrame, np.array]
        Input features, or features together with targets if target_index is specified.
    y : Union[pd.DataFrame, np.array]
        Input targets; optional if target_index is specified.
    task : str
        A string that represents the problem type, currently only supported: ``binary_classification`` and
        ``regression``.
    target_index : int
        The index of the target column; ignored if not present.
    index_column : int
        The index of the index column; if not provided, an existing d3mIndex column is
        used, otherwise one is generated.
    semantic_types : Sequence[Sequence[str]]
        A list of semantic types to be applied. The sequence must be of the same length as
        the dataframe columns.
    parse :
        A flag that determines whether the dataset will contain parsed columns. By default it is
        set to false to make it compatible with most of the current D3M infrastructure.
    task_keywords : List[TaskKeyword]
        A list of TaskKeyword.
    performance_metrics: List[PerformanceMetric]
        A list of PerformanceMetric.

Returns
-------
dataset : Dataset
A D3M dataset.
problem_description : Problem
A D3M problem.
"""
dataset = import_input_data(x, y=y, target_index=target_index, index_column=index_column,
semantic_types=semantic_types, parse=parse)
problem_description = generate_problem_description(dataset=dataset, task=task, task_keywords=task_keywords,
performance_metrics=performance_metrics)

return dataset, problem_description
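
# Illustrative usage (hypothetical data; the task name comes from
# axolotl.utils.schemas.PROBLEM_DEFINITION):
#
#   x = pd.DataFrame({'f0': numpy.random.rand(10), 'f1': numpy.random.rand(10)})
#   y = pd.DataFrame({'label': numpy.random.randint(0, 2, 10)})
#   dataset, problem = generate_dataset_problem(x, y=y, task='binary_classification')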

+ 542
- 0
axolotl/axolotl/utils/pipeline.py View File

@@ -0,0 +1,542 @@
import os
import pprint
import typing
import uuid
import json

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import networkx as nx
import pandas

import d3m
from d3m import container
from d3m import utils as d3m_utils
from d3m.container import utils as container_utils
from d3m.metadata import base as metadata_base
from d3m.metadata.pipeline import Pipeline, PlaceholderStep, PrimitiveStep, SubpipelineStep, get_pipeline, Resolver
from d3m.metadata.pipeline_run import PipelineRun
from d3m.metadata import problem as problem_module
from d3m.primitive_interfaces import base
from d3m.container.pandas import DataFrame


class PipelineResult:
"""
A class that captures the output of multiple operations around the system.

Parameters
----------
pipeline: Pipeline
The pipeline used for the run (fit/score)
fitted_pipeline_id: str
The id of the fitted pipeline used to produce the result.

Attributes
----------
pipeline: Pipeline
Pipeline used for the run (fit/score)
fitted_pipeline_id: str
The id of the fitted pipeline used to produce the result.
status: str
A string representing the status of the run (PENDING, RUNNING, COMPLETED, ERRORED)
    error: typing.Union[Exception, typing.List[Exception]]
        An error or list of errors that occurred during the execution of the pipeline or fitted pipeline.
    exposed_outputs: typing.Dict[str, typing.Any]
        A dictionary mapping the name of each exposed output to its value; this could be a string
        with the path of the stored output, or the object itself.
    output: container.DataFrame
        A dataframe of the pipeline output; this could be a string if the output is stored.
    pipeline_run
        A pipeline run, or the path where it is stored.
    method_called: str
        The method that was called to generate this result (fit, produce).
scores: pandas.DataFrame
A dataframe containing the scores of the evaluated pipeline.
rank: float
The rank of the pipeline from 0 to 1, where 0 is the best.
"""
def __init__(self, *, pipeline: Pipeline = None, fitted_pipeline_id: str = None):
self.pipeline = pipeline
self.fitted_pipeline_id: str = fitted_pipeline_id
self.status: str = None
self.error: typing.Union[Exception, typing.List[Exception]] = None
self.exposed_outputs: typing.Dict[str, typing.Any] = None
self.output: container.DataFrame = None
self.pipeline_run = None
self.method_called: str = None
self.scores: pandas.DataFrame = None
self.rank: float = None

def __str__(self):
string_representation = {}

for name, value in self.__dict__.items():
            if not name.startswith('__') and not callable(value):
if value is not None:
string_representation[name] = str(value)

return pprint.pformat(string_representation).replace("\\n", "")

def __repr__(self):
base_string = 'PipelineResult'
if self.pipeline is not None:
base_string += ' pipeline_id:{}'.format(self.pipeline.id)

if self.fitted_pipeline_id is not None:
base_string += ' fitted_pipeline_id:{}'.format(self.fitted_pipeline_id)

return base_string


class PrimitivesList:
# root = os.path.dirname(__file__)
# black_list = os.path.join(root, 'axolotl', 'utils', 'resources', 'blacklist.json')
with open(os.path.join(os.path.dirname(__file__), 'resources', 'blocklist.json'), 'r') as file:
BlockList = json.load(file)


class BlackListResolver(Resolver):
"""
A resolver to resolve primitives and pipelines.

It resolves primitives from available primitives on the system,
and resolves pipelines from files in pipeline search paths.

Attributes
----------
strict_resolving : bool
If resolved primitive does not fully match specified primitive reference, raise an exception?
pipeline_search_paths : Sequence[str]
A list of paths to directories with pipelines to resolve from.
Their files should be named ``<pipeline id>.json`` or ``<pipeline id>.yml``.

Parameters
----------
strict_resolving : bool
If resolved primitive does not fully match specified primitive reference, raise an exception?
pipeline_search_paths : Sequence[str]
A list of paths to directories with pipelines to resolve from.
Their files should be named ``<pipeline id>.json`` or ``<pipeline id>.yml``.
respect_environment_variable : bool
Use also (colon separated) pipeline search paths from ``PIPELINES_PATH`` environment variable?
"""

def __init__(self, black_list=PrimitivesList.BlockList, *, strict_resolving: bool = False, strict_digest: bool = False,
pipeline_search_paths: typing.Sequence[str] = None,
respect_environment_variable: bool = True, load_all_primitives: bool = True,
primitives_blocklist: typing.Collection[str] = None) -> None:
super().__init__(strict_resolving=strict_resolving, strict_digest=strict_digest,
pipeline_search_paths=pipeline_search_paths,
respect_environment_variable=respect_environment_variable,
load_all_primitives=load_all_primitives, primitives_blocklist=primitives_blocklist)
self.black_list = black_list
if len(black_list) == 0:
self.black_list = None

def _get_primitive(self, primitive_description: typing.Dict) -> typing.Optional[typing.Type[base.PrimitiveBase]]:
if not self._primitives_loaded:
self._primitives_loaded = True

            d3m.index.load_all(blocklist=self.black_list)

return d3m.index.get_primitive_by_id(primitive_description['id'])


def load_pipeline(pipeline_file: typing.Union[str, typing.Dict]):
"""
Load pipeline from a pipeline URI

Parameters
----------
pipeline_file: Union[str, dict]
        A URI pointing to a pipeline JSON file, or a dict that already contains the pipeline structure

Returns
-------
pipeline: Pipeline
An object of Pipeline

"""
if isinstance(pipeline_file, dict):
try:
with d3m_utils.silence():
pipeline = Pipeline.from_json_structure(pipeline_file)
        except Exception:
pipeline = None
else:
with d3m_utils.silence():
pipeline = get_pipeline(pipeline_path=pipeline_file, load_all_primitives=False)
return pipeline
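
# Illustrative usage (hypothetical path): load_pipeline accepts either a
# path/URI or an already parsed JSON structure, and returns None when a dict
# cannot be parsed into a Pipeline.
#
#   pipeline = load_pipeline('/tmp/pipelines/example.json')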


def save_pipeline(pipeline, path, *, rank=None):
"""
    A function that saves a pipeline, and optionally its rank, to the given path.

    Parameters
    ----------
    pipeline : Pipeline
        A pipeline to be saved into the path.
    path: str
        Path where the pipeline will be stored.
    rank : float
        A float that represents the rank of the pipeline.
"""

pipeline_path = os.path.join(path, '{}.json'.format(pipeline.id))

with open(pipeline_path, 'w') as file:
pipeline.to_json(file, indent=2, sort_keys=True, ensure_ascii=False)

if rank is not None:
rank_path = os.path.join(path, '{}.rank'.format(pipeline.id))
with open(rank_path, 'w') as file:
file.write('{rank}'.format(rank=rank))


def save_pipeline_run(pipeline_run, path):
"""
    A function that saves a pipeline_run, or a list of them, to the given path.

    Parameters
    ----------
    pipeline_run : PipelineRun
        A pipeline_run to be saved into the path.
    path: str
        Path where the pipeline_run will be stored.

Returns
-------
pipeline_run_path : str
Path where the pipeline_run is stored.
"""

if pipeline_run is None:
return

if isinstance(pipeline_run, list):
first = True
pipeline_run_path = os.path.join(path, '{}.yml'.format(pipeline_run[0].pipeline['id']))
with d3m_utils.silence():
with open(pipeline_run_path, 'w') as file:
for run in pipeline_run:
run.to_yaml(file, appending=not first)
first = False
else:
pipeline_run_path = os.path.join(path, '{}.yml'.format(pipeline_run.pipeline['id']))
with d3m_utils.silence():
with open(pipeline_run_path, 'w') as file:
pipeline_run.to_yaml(file)

return pipeline_run_path


def save_exposed_values(values, output_id, output_dir):
"""
A function to save the exposed values of a PipelineResult.

Parameters
----------
values : Union[dict[str, container], container]
A container to be stored into the path
output_id : str
An id that identify the values.
output_dir : str
        The path where the values are going to be stored.

Returns
-------
A dict of names and stored paths.

"""
output_paths = {}
output_path = os.path.join(output_dir, output_id)
unique_id = str(uuid.uuid4())

def get_file_path(path):
files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
file_path = ""
if 'data.csv' in files:
file_path = os.path.join(path, 'data.csv')
elif 'datasetDoc.json' in files:
file_path = os.path.join(path, 'datasetDoc.json')
return file_path

if isinstance(values, dict):
for name, value in values.items():
_output_path = os.path.join(output_path, output_id, unique_id, name)
container_utils.save_container(value, _output_path)
output_paths[name] = get_file_path(_output_path)
else:
_output_path = os.path.join(output_path, output_id, unique_id, 'output')
container_utils.save_container(values, _output_path)
output_paths['output'] = get_file_path(_output_path)

return output_paths


def plot_pipeline(pipeline):
figure(num=None, figsize=(10, 12), dpi=80, facecolor='w', edgecolor='k')
graph, nodes_info = get_pipeline_graph(pipeline)

the_table = plt.table(cellText=nodes_info, colWidths=[0.05, 0.5], colLabels=['Step', 'Primitive'], loc='right')
the_table.set_fontsize(25)
the_table.scale(2, 1)
pos = nx.kamada_kawai_layout(graph, scale=3)
grafo_labels = nx.get_edge_attributes(graph, 'label')
    nx.draw_networkx_edge_labels(graph, pos, edge_labels=grafo_labels, font_size=7)
    nx.draw(graph, pos=pos, node_size=900, alpha=0.5, font_size=16, with_labels=True)


def __get_header(index, step):
if isinstance(step, PrimitiveStep):
header = 'steps.' + str(index) + ' - ' + step.primitive.metadata.query()['python_path']
elif isinstance(step, PlaceholderStep):
header = 'steps.' + str(index) + ' - ' + 'PlaceHolderStep'
elif isinstance(step, SubpipelineStep):
header = 'steps.' + str(index) + ' - ' + 'SubPipeline'
return header


def get_pipeline_graph(pipeline):
graph = nx.DiGraph()
nodes_info = []

for i in range(0, len(pipeline.steps)):
nodes_info.append([str(i), pipeline.steps[i].primitive.metadata.query()['python_path']])

if isinstance(pipeline.steps[i], PrimitiveStep) or isinstance(pipeline.steps[i], PlaceholderStep):
target = i
graph.add_node(target)
for argument in pipeline.steps[i].arguments.keys():
data = pipeline.steps[i].arguments[argument]['data']
if 'input' in data:
source = 'inputs'
else:
index = int(data.split('.')[1])
source = index
label = argument + '-' + data
graph.add_edge(source, target, label=label)

for hp in pipeline.steps[i].hyperparams.keys():
if pipeline.steps[i].hyperparams[hp]['type'] == metadata_base.ArgumentType.PRIMITIVE:
index = pipeline.steps[i].hyperparams[hp]['data']
source = index
label = 'Step {} hyperparam - {}'.format(i, hp)
graph.add_edge(source, target, label=label)
else:
# TODO add support here for subpipelines
continue

for i in range(0, len(pipeline.outputs)):
index = int(pipeline.outputs[i]['data'].split('.')[1])
source = index
label = 'outputs.{}'.format(i)
graph.add_edge(source, 'output', label=label)

return graph, nodes_info


def infer_primitive_family(task_type: str, data_types: typing.Iterable, is_semi: bool = False) -> typing.Optional[str]:
"""
Infer target primitive family by task and data_types

Parameters
----------
task_type: str
The task type
data_types: typing.Iterable
The data types
    is_semi: bool
        Whether the problem is semi-supervised

Returns
-------
str
The primitive family
"""

    # TODO: temporary solution
    if problem_module.TaskKeyword.CLASSIFICATION == task_type and \
            problem_module.TaskKeyword.TIME_SERIES in data_types and \
            problem_module.TaskKeyword.GROUPED in data_types:
        return metadata_base.PrimitiveFamily.CLASSIFICATION.name
if problem_module.TaskKeyword.CLASSIFICATION == task_type and \
problem_module.TaskKeyword.TIME_SERIES in data_types:
return metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION.name
    if problem_module.TaskKeyword.FORECASTING == task_type and problem_module.TaskKeyword.TIME_SERIES in data_types:
return metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING.name
if problem_module.TaskKeyword.CLASSIFICATION == task_type and is_semi:
return metadata_base.PrimitiveFamily.SEMISUPERVISED_CLASSIFICATION.name
if problem_module.TaskKeyword.IMAGE in data_types:
return metadata_base.PrimitiveFamily.DIGITAL_IMAGE_PROCESSING.name
if problem_module.TaskKeyword.VIDEO in data_types:
return metadata_base.PrimitiveFamily.DIGITAL_SIGNAL_PROCESSING.name

return task_type


def check_black_list(primitive_name: str, extra_block: typing.Sequence = ()) -> bool:
    """
    Check if the primitive is in the block list, which comes from `PrimitivesList.BlockList`

    Parameters
    ----------
    primitive_name: str
        The name of the primitive
    extra_block: typing.Sequence
        Extra primitive names or name fragments to block.

    Returns
    -------
    bool

    """
    banned_terms = PrimitivesList.BlockList + list(extra_block)
for banned_element in banned_terms:
if banned_element in primitive_name:
return True
return False
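
# Illustrative example: matching is by substring, so any blocked term contained
# in the primitive path blocks it.
#
#   check_black_list('d3m.primitives.classification.xgboost_dart.Common')
#   # True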


def get_primitive_candidates(task_type: str, data_types: typing.Iterable, semi: bool,
                             extra_block: typing.Sequence = ()) -> typing.List:
    """
    Get a list of primitive candidates related to the task type, except those primitives in
    `PrimitivesList.BlockList`

    Parameters
    ----------
    task_type: str
        The task type
    data_types: typing.Iterable
        The data types
    semi: bool
        Whether the problem is semi-supervised
    extra_block: typing.Sequence
        Extra primitive names or name fragments to block.

Returns
-------
list
A list of primitives
"""
specific_task = infer_primitive_family(task_type, data_types, semi)
primitives_path = d3m.index.search()
primitives = list()
for primitive_path in primitives_path:
if check_black_list(primitive_path, extra_block):
continue
try:
with d3m_utils.silence():
primitive = d3m.index.get_primitive(primitive_path)
primitive_family = primitive.metadata.query()['primitive_family'].name
if primitive_family == task_type:
primitives.append((primitive, task_type))
elif primitive_family == specific_task:
primitives.append((primitive, specific_task))
        # TODO: narrow down which exceptions can occur here
        except Exception:
            continue
return primitives


def int_to_step(n_step: int) -> str:
"""
Convert the step number to standard str step format

Parameters
----------
n_step: int

Returns
-------
str
str format in "steps.<n_step>.produce"
"""
return 'steps.' + str(n_step) + '.produce'


def get_primitives(primitives_dict):
"""
A function that loads and returns a dictionary of primitives

Parameters
----------
primitives_dict: dict[str, str]
A dictionary that contains the alias and the primitives to load.

Returns
-------
    loaded_primitives_dict: dict
        A dictionary containing the aliases and the loaded primitive classes.
"""
loaded_primitives_dict = {}
for primitive_name in primitives_dict.keys():
loaded_primitives_dict[primitive_name] = d3m.index.get_primitive(primitives_dict[primitive_name])
return loaded_primitives_dict


def get_tabular_resource_id(dataset):
"""
A function that retrieves the main resource id

Parameters
----------
dataset: Dataset
A dataset.

Returns
-------
resource_id: str
An id of the main resource.
"""

resource_id = None
for dataset_resource_id in dataset.keys():
if dataset.metadata.has_semantic_type((dataset_resource_id,),
'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'):
resource_id = dataset_resource_id
break

if resource_id is None:
tabular_resource_ids = [dataset_resource_id for dataset_resource_id, dataset_resource in dataset.items() if
isinstance(dataset_resource, container.DataFrame)]
if len(tabular_resource_ids) == 1:
resource_id = tabular_resource_ids[0]

if resource_id is None:
resource_id = 'learningData'

return resource_id


def query_multiple_terms(metadata, list_queries):
data = metadata.query()
valid_queries = []
for query in list_queries:
if query in data:
valid_queries.append(query)
data = data[query]
else:
break
if len(valid_queries) == len(list_queries):
return data


def filter_primitives_by_dataframe_input(primitive_info):
    """
    Keep only the primitives whose container arguments (everything except
    Params and Hyperparams) are typed as DataFrame.
    """
    primitives_dataframe_input = []
    for info in primitive_info:
        primitive, task = info
        arguments = query_multiple_terms(
            primitive.metadata, ['primitive_code', 'class_type_arguments'])
        if arguments is None:
            continue

        has_dataframe_arguments = True
        for argument, value in arguments.items():
            if argument in ('Params', 'Hyperparams'):
                continue
            if value != DataFrame:
                has_dataframe_arguments = False
                break
        if has_dataframe_arguments:
            primitives_dataframe_input.append(info)

    return primitives_dataframe_input


+ 31
- 0
axolotl/axolotl/utils/resources.py View File

@@ -0,0 +1,31 @@
import os
import shutil
import signal
from contextlib import contextmanager


class TimeoutException(Exception):
pass


@contextmanager
def time_limit(seconds):
    """
    Raise TimeoutException if the body of the ``with`` block runs longer than
    ``seconds``. Relies on SIGALRM, so it only works on Unix, in the main
    thread, and with whole-second granularity.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")
    signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)
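

# Usage sketch (illustrative; run_search is a placeholder for any
# long-running call):
#
#     try:
#         with time_limit(60):
#             run_search()
#     except TimeoutException:
#         pass  # the call overran its budget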


def check_directory(dir_name):
    """Create ``dir_name`` (and any missing parents) if it does not exist yet."""
    dir_name = os.path.abspath(dir_name)
    os.makedirs(dir_name, exist_ok=True)


def copy_file(source_path, target_path):
    """Copy ``source_path`` into the ``target_path`` directory, keeping its basename."""
    path = os.path.join(target_path, os.path.basename(source_path))
    shutil.copyfile(source_path, path)
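
# Usage sketch (illustrative; the paths are placeholders):
#
#     check_directory('output/pipelines')             # created if missing
#     copy_file('pipeline.json', 'output/pipelines')  # -> output/pipelines/pipeline.json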

+ 31
- 0
axolotl/axolotl/utils/resources/blocklist.json View File

@@ -0,0 +1,31 @@
[
"d3m.primitives.classification.xgboost_dart.Common",
"d3m.primitives.classification.canonical_correlation_forests.UBC",
"d3m.primitives.classification.logistic_regression.UBC",
"d3m.primitives.classification.multilayer_perceptron.UBC",
"d3m.primitives.classification.simple_cnaps.UBC",
"d3m.primitives.clustering.kmeans_clustering.UBC",
"d3m.primitives.dimensionality_reduction.principal_component_analysis.UBC",
"d3m.primitives.feature_extraction.boc.UBC",
"d3m.primitives.feature_extraction.bow.UBC",
"3m.primitives.feature_extraction.googlenet_cnn.UBC",
"d3m.primitives.feature_extraction.convolutional_neural_network.UBC",
"d3m.primitives.schema_discovery.semantic_type.UBC",
"d3m.primitives.regression.linear_regression.UBC",
"d3m.primitives.operator.diagonal_mvn.UBC",
"d3m.primitives.feature_extraction.resnet_cnn.UBC",
"d3m.primitives.feature_extraction.mobilenet_cnn.UBC",
"d3m.primitives.feature_extraction.vggnet_cnn.UBC",
"d3m.primitives.regression.canonical_correlation_forests.UBC",
"d3m.primitives.regression.multilayer_perceptron.UBC",
"d3m.primitives.schema_discovery.semantic_type.UBC",
"d3m.primitives.data_transformation.missing_indicator.DistilMissingIndicator",
"d3m.primitives.data_transformation.graph_to_edge_list.DSBOX",
"d3m.primitives.feature_construction.graph_transformer.GCN",
"d3m.primitives.feature_extraction.huber_pca.Cornell",
"d3m.primitives.natural_language_processing.glda.Fastlv",
"d3m.primitives.feature_construction.corex_continuous.DSBOX",
"d3m.primitives.natural_language_processing.glda.Fastlvm",
"d3m.primitives.classification.xgboost_dart.Common",
"d3m.primitives.classification.global_causal_discovery.ClassifierRPI"
]

+ 64
- 0
axolotl/axolotl/utils/resources/default_pipelines.json View File

@@ -0,0 +1,64 @@
{
"CLASSIFICATION": [
{"id": "6a520746-108c-45bf-a6d8-c875b5a9d326","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json","created": "2020-01-16T20:40:25.541426Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.8.produce","name": "output predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","version": "0.3.0","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","name": "Extract a DataFrame from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"categorical_max_ratio_distinct_values": {"type": "VALUE","data": 1},"categorical_max_absolute_distinct_values": {"type": "VALUE","data": {"case": "unlimited","value": null}}},"outputs": [{"id": "produce"}],"primitive": {"digest": "8b12a9aececdc5b7a4d5ef47cd04cda75592fd24f49922776b614d4bbeeb97f1","id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7","name": "Determine missing semantic types for columns automatically","python_path": "d3m.primitives.schema_discovery.profiler.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"type": "PRIMITIVE","primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","version": "0.5.0","python_path": "d3m.primitives.data_transformation.column_parser.Common","name": "Parses strings into their types"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type": "PRIMITIVE","primitive": {"id": "d016df89-de62-3c53-87ed-c06bb6a23cde","version": "2019.6.7","python_path": "d3m.primitives.data_cleaning.imputer.SKlearn","name": "sklearn.impute.SimpleImputer"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.3.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "f32dcb25-4cd0-4bb9-9408-ade1edfa2b53","version": "0.1.0","python_path": "d3m.primitives.feature_selection.skfeature.TAMU","name": "Feature Selection"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.5.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "1dd82833-5692-39cb-84fb-2455683075f3","version": "2019.6.7","python_path": "d3m.primitives.classification.random_forest.SKlearn","name": "sklearn.ensemble.forest.RandomForestClassifier"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.6.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": 
{"id": "8d38b340-f83f-4877-baaa-162f8e551736","version": "0.3.0","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","name": "Construct pipeline predictions output"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.7.produce"},"reference": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]}]},
{"id": "a6b468a5-4d03-405e-a707-8e377f9ad1c3","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json","created": "2020-01-16T20:40:25.541426Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.8.produce","name": "output predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","version": "0.3.0","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","name": "Extract a DataFrame from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"categorical_max_ratio_distinct_values": {"type": "VALUE","data": 1},"categorical_max_absolute_distinct_values": {"type": "VALUE","data": {"case": "unlimited","value": null}}},"outputs": [{"id": "produce"}],"primitive": {"digest": "8b12a9aececdc5b7a4d5ef47cd04cda75592fd24f49922776b614d4bbeeb97f1","id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7","name": "Determine missing semantic types for columns automatically","python_path": "d3m.primitives.schema_discovery.profiler.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"type": "PRIMITIVE","primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","version": "0.5.0","python_path": "d3m.primitives.data_transformation.column_parser.Common","name": "Parses strings into their types"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type": "PRIMITIVE","primitive": {"id": "d016df89-de62-3c53-87ed-c06bb6a23cde","version": "2019.6.7","python_path": "d3m.primitives.data_cleaning.imputer.SKlearn","name": "sklearn.impute.SimpleImputer"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.3.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "f32dcb25-4cd0-4bb9-9408-ade1edfa2b53","version": "0.1.0","python_path": "d3m.primitives.feature_selection.skfeature.TAMU","name": "Feature Selection"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.5.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "01d2c086-91bf-3ca5-b023-5139cf239c77","version": "2019.6.7","python_path": "d3m.primitives.classification.gradient_boosting.SKlearn","name": "sklearn.ensemble.gradient_boosting.GradientBoostingClassifier"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.6.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": 
"PRIMITIVE","primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","version": "0.3.0","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","name": "Construct pipeline predictions output"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.7.produce"},"reference": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]}]},
{"id": "ef1c483a-34fc-4398-a6b3-063b33786972","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json","created": "2020-01-16T20:40:25.541426Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.8.produce","name": "output predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","version": "0.3.0","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","name": "Extract a DataFrame from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"categorical_max_ratio_distinct_values": {"type": "VALUE","data": 1},"categorical_max_absolute_distinct_values": {"type": "VALUE","data": {"case": "unlimited","value": null}}},"outputs": [{"id": "produce"}],"primitive": {"digest": "8b12a9aececdc5b7a4d5ef47cd04cda75592fd24f49922776b614d4bbeeb97f1","id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7","name": "Determine missing semantic types for columns automatically","python_path": "d3m.primitives.schema_discovery.profiler.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"type": "PRIMITIVE","primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","version": "0.5.0","python_path": "d3m.primitives.data_transformation.column_parser.Common","name": "Parses strings into their types"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type": "PRIMITIVE","primitive": {"id": "d016df89-de62-3c53-87ed-c06bb6a23cde","version": "2019.6.7","python_path": "d3m.primitives.data_cleaning.imputer.SKlearn","name": "sklearn.impute.SimpleImputer"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.3.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "f32dcb25-4cd0-4bb9-9408-ade1edfa2b53","version": "0.1.0","python_path": "d3m.primitives.feature_selection.skfeature.TAMU","name": "Feature Selection"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.5.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "c8a28f02-ef4a-35a8-87f1-cf79980f5c3e","version": "2019.6.7","python_path": "d3m.primitives.classification.extra_trees.SKlearn","name": "sklearn.ensemble.forest.ExtraTreesClassifier"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.6.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": 
"8d38b340-f83f-4877-baaa-162f8e551736","version": "0.3.0","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","name": "Construct pipeline predictions output"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.7.produce"},"reference": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]}]}
],
"REGRESSION": [
{"id": "efab70e7-461a-42de-a5d7-9bdd98cc05d8","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json","created": "2020-01-16T20:40:25.541426Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.8.produce","name": "output predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","version": "0.3.0","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","name": "Extract a DataFrame from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"categorical_max_ratio_distinct_values": {"type": "VALUE","data": 1},"categorical_max_absolute_distinct_values": {"type": "VALUE","data": {"case": "unlimited","value": null}}},"outputs": [{"id": "produce"}],"primitive": {"digest": "8b12a9aececdc5b7a4d5ef47cd04cda75592fd24f49922776b614d4bbeeb97f1","id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7","name": "Determine missing semantic types for columns automatically","python_path": "d3m.primitives.schema_discovery.profiler.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"type": "PRIMITIVE","primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","version": "0.5.0","python_path": "d3m.primitives.data_transformation.column_parser.Common","name": "Parses strings into their types"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type": "PRIMITIVE","primitive": {"id": "d016df89-de62-3c53-87ed-c06bb6a23cde","version": "2019.6.7","python_path": "d3m.primitives.data_cleaning.imputer.SKlearn","name": "sklearn.impute.SimpleImputer"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.3.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "f32dcb25-4cd0-4bb9-9408-ade1edfa2b53","version": "0.1.0","python_path": "d3m.primitives.feature_selection.skfeature.TAMU","name": "Feature Selection"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.5.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "f0fd7a62-09b5-3abc-93bb-f5f999f7cc80","version": "2019.6.7","python_path": "d3m.primitives.regression.random_forest.SKlearn","name": "sklearn.ensemble.forest.RandomForestRegressor"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.6.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": 
"8d38b340-f83f-4877-baaa-162f8e551736","version": "0.3.0","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","name": "Construct pipeline predictions output"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.7.produce"},"reference": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]}]},
{"id": "a6b468a5-4d03-405e-a707-8e377f9ad1c3","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json","created": "2020-01-16T20:40:25.541426Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.8.produce","name": "output predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","version": "0.3.0","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","name": "Extract a DataFrame from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"categorical_max_ratio_distinct_values": {"type": "VALUE","data": 1},"categorical_max_absolute_distinct_values": {"type": "VALUE","data": {"case": "unlimited","value": null}}},"outputs": [{"id": "produce"}],"primitive": {"digest": "8b12a9aececdc5b7a4d5ef47cd04cda75592fd24f49922776b614d4bbeeb97f1","id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7","name": "Determine missing semantic types for columns automatically","python_path": "d3m.primitives.schema_discovery.profiler.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"type": "PRIMITIVE","primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","version": "0.5.0","python_path": "d3m.primitives.data_transformation.column_parser.Common","name": "Parses strings into their types"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type": "PRIMITIVE","primitive": {"id": "d016df89-de62-3c53-87ed-c06bb6a23cde","version": "2019.6.7","python_path": "d3m.primitives.data_cleaning.imputer.SKlearn","name": "sklearn.impute.SimpleImputer"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.3.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "f32dcb25-4cd0-4bb9-9408-ade1edfa2b53","version": "0.1.0","python_path": "d3m.primitives.feature_selection.skfeature.TAMU","name": "Feature Selection"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.5.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "2a031907-6b2c-3390-b365-921f89c8816a","version": "2019.6.7","python_path": "d3m.primitives.regression.gradient_boosting.SKlearn","name": "sklearn.ensemble.gradient_boosting.GradientBoostingRegressor"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.6.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": 
"PRIMITIVE","primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","version": "0.3.0","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","name": "Construct pipeline predictions output"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.7.produce"},"reference": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]}]},
{"id": "a6b468a5-4d03-405e-a707-8e377f9ad1c3","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json","created": "2020-01-16T20:40:25.541426Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.8.produce","name": "output predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","version": "0.3.0","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","name": "Extract a DataFrame from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"categorical_max_ratio_distinct_values": {"type": "VALUE","data": 1},"categorical_max_absolute_distinct_values": {"type": "VALUE","data": {"case": "unlimited","value": null}}},"outputs": [{"id": "produce"}],"primitive": {"digest": "8b12a9aececdc5b7a4d5ef47cd04cda75592fd24f49922776b614d4bbeeb97f1","id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7","name": "Determine missing semantic types for columns automatically","python_path": "d3m.primitives.schema_discovery.profiler.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"type": "PRIMITIVE","primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","version": "0.5.0","python_path": "d3m.primitives.data_transformation.column_parser.Common","name": "Parses strings into their types"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type": "PRIMITIVE","primitive": {"id": "d016df89-de62-3c53-87ed-c06bb6a23cde","version": "2019.6.7","python_path": "d3m.primitives.data_cleaning.imputer.SKlearn","name": "sklearn.impute.SimpleImputer"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.3.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "f32dcb25-4cd0-4bb9-9408-ade1edfa2b53","version": "0.1.0","python_path": "d3m.primitives.feature_selection.skfeature.TAMU","name": "Feature Selection"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.5.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "35321059-2a1a-31fd-9509-5494efc751c7","version": "2019.6.7","python_path": "d3m.primitives.regression.extra_trees.SKlearn","name": "sklearn.ensemble.forest.ExtraTreesRegressor"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.6.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": 
"8d38b340-f83f-4877-baaa-162f8e551736","version": "0.3.0","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","name": "Construct pipeline predictions output"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.7.produce"},"reference": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]}]}
],
"CLUSTERING": [],
"LINK_PREDICTION": [
{"id": "ddc6c7e9-64b4-4f9c-af07-5f27461cb940","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.3.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "cb192a83-63e2-4075-bab9-e6ba1a8365b6","version": "0.1.0","python_path": "d3m.primitives.data_transformation.load_graphs.JHU","name": "Extract a list of Graphs from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "32fec24f-6861-4a4c-88f3-d4ec2bc1b486","version": "0.1.0","python_path": "d3m.primitives.data_preprocessing.largest_connected_component.JHU","name": "jhu.lcc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "b940ccbd-9e9b-3166-af50-210bfd79251b","version": "0.1.0","python_path": "d3m.primitives.data_transformation.adjacency_spectral_embedding.JHU","name": "jhu.ase"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_dimension": {"type": "VALUE","data": 5},"use_attributes": {"type": "VALUE","data": true}}},{"type": "PRIMITIVE","primitive": {"id": "c9d5da5d-0520-468e-92df-bd3a85bb4fac","version": "0.1.0","python_path": "d3m.primitives.classification.gaussian_classification.JHU","name": "jhu.gclass"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}]}]},
{"id": "12a4b6a8-b2e4-4604-afe5-8196bf55a925","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.3.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "cb192a83-63e2-4075-bab9-e6ba1a8365b6","version": "0.1.0","python_path": "d3m.primitives.data_transformation.load_graphs.JHU","name": "Extract a list of Graphs from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "32fec24f-6861-4a4c-88f3-d4ec2bc1b486","version": "0.1.0","python_path": "d3m.primitives.data_preprocessing.largest_connected_component.JHU","name": "jhu.lcc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "b940ccbd-9e9b-3166-af50-210bfd79251b","version": "0.1.0","python_path": "d3m.primitives.data_transformation.adjacency_spectral_embedding.JHU","name": "jhu.ase"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_dimension": {"type": "VALUE","data": 5},"use_attributes": {"type": "VALUE","data": true}}},{"type": "PRIMITIVE","primitive": {"id": "5194ef94-3683-319a-9d8d-5c3fdd09de24","version": "0.1.0","python_path": "d3m.primitives.graph_clustering.gaussian_clustering.JHU","name": "jhu.gclust"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_clusters": {"type": "VALUE","data": 10}}}]},
{"id": "6216f2bd-2f23-4dbf-92d0-f3b40aeac150","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.2.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "09f2eea8-667c-44b8-a955-6a153ba9ccc3","version": "0.1.0","python_path": "d3m.primitives.link_prediction.data_conversion.JHU","name": "jhu.link_pred_graph_reader"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "b940ccbd-9e9b-3166-af50-210bfd79251b","version": "0.1.0","python_path": "d3m.primitives.data_transformation.adjacency_spectral_embedding.JHU","name": "jhu.ase"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"which_elbow": {"type": "VALUE","data": 1},"max_dimension": {"type": "VALUE","data": 2},"use_attributes": {"type": "VALUE","data": false}}},{"type": "PRIMITIVE","primitive": {"id": "25e97696-b96f-4f5c-8620-b340fe83414d","version": "0.1.0","python_path": "d3m.primitives.link_prediction.rank_classification.JHU","name": "jhu.link_pred_rc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]}]},
{"id": "0f5d0c4a-2c7f-4a9b-9441-80449c460993","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.1.produce","name": "output"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "79012210-2463-4f94-9da6-11bdc5a7e6c4","version": "0.1.2","python_path": "d3m.primitives.data_transformation.load_single_graph.DistilSingleGraphLoader","name": "Load single graph and dataframe into a parseable object"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"},{"id": "produce_target"}]},{"type": "PRIMITIVE","primitive": {"id": "fc138210-c317-4528-81ae-5eed3a1a0267","version": "0.1.1","python_path": "d3m.primitives.link_prediction.link_prediction.DistilLinkPrediction","name": "LinkPrediction"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"},"outputs": {"type": "CONTAINER","data": "steps.0.produce_target"}},"outputs": [{"id": "produce"}],"hyperparams": {"metric": {"type": "VALUE","data": "accuracy"}}}]}

],
"VERTEX_NOMINATION": [],
"COMMUNITY_DETECTION": [
{"id": "bfe17a08-bc94-4f6d-8be1-4758e899a6c6","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.1.produce","name": "output"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "79012210-2463-4f94-9da6-11bdc5a7e6c4","version": "0.1.2","python_path": "d3m.primitives.data_transformation.load_single_graph.DistilSingleGraphLoader","name": "Load single graph and dataframe into a parseable object"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"},{"id": "produce_target"}]},{"type": "PRIMITIVE","primitive": {"id": "064cec55-39dd-45b7-a663-50d3e17e0c42","version": "0.1.1","python_path": "d3m.primitives.community_detection.community_detection.DistilCommunityDetection","name": "CommunityDetection"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"},"outputs": {"type": "CONTAINER","data": "steps.0.produce_target"}},"outputs": [{"id": "produce"}],"hyperparams": {"metric": {"type": "VALUE","data": "normalizedMutualInformation"}}}]},
{"id": "0f6cafc4-5628-47bc-bbf5-8cab3a7c0e95","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.3.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "cb192a83-63e2-4075-bab9-e6ba1a8365b6","version": "0.1.0","python_path": "d3m.primitives.data_transformation.load_graphs.JHU","name": "Extract a list of Graphs from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "32fec24f-6861-4a4c-88f3-d4ec2bc1b486","version": "0.1.0","python_path": "d3m.primitives.data_preprocessing.largest_connected_component.JHU","name": "jhu.lcc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "8fa6178b-84f7-37d8-87e8-4d3a44c86569","version": "0.1.0","python_path": "d3m.primitives.data_transformation.laplacian_spectral_embedding.JHU","name": "jhu.lse"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_dimension": {"type": "VALUE","data": 5},"use_attributes": {"type": "VALUE","data": true}}},{"type": "PRIMITIVE","primitive": {"id": "5194ef94-3683-319a-9d8d-5c3fdd09de24","version": "0.1.0","python_path": "d3m.primitives.graph_clustering.gaussian_clustering.JHU","name": "jhu.gclust"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_clusters": {"type": "VALUE","data": 10}}}]},
{"id": "ffc49730-eb73-423c-ab6c-acb47300fcfc","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.3.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "cb192a83-63e2-4075-bab9-e6ba1a8365b6","version": "0.1.0","python_path": "d3m.primitives.data_transformation.load_graphs.JHU","name": "Extract a list of Graphs from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "32fec24f-6861-4a4c-88f3-d4ec2bc1b486","version": "0.1.0","python_path": "d3m.primitives.data_preprocessing.largest_connected_component.JHU","name": "jhu.lcc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "8fa6178b-84f7-37d8-87e8-4d3a44c86569","version": "0.1.0","python_path": "d3m.primitives.data_transformation.laplacian_spectral_embedding.JHU","name": "jhu.lse"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_dimension": {"type": "VALUE","data": 5},"use_attributes": {"type": "VALUE","data": true}}},{"type": "PRIMITIVE","primitive": {"id": "c9d5da5d-0520-468e-92df-bd3a85bb4fac","version": "0.1.0","python_path": "d3m.primitives.classification.gaussian_classification.JHU","name": "jhu.gclass"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}]}]}
],
"GRAPH_MATCHING": [
{"id": "b5dd2766-da63-4526-a29b-e6322c1f9cc8","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.0.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "ff22e721-e4f5-32c9-ab51-b90f32603a56","version": "0.1.0","python_path": "d3m.primitives.graph_matching.seeded_graph_matching.JHU","name": "jhu.sgm"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]}]},
{"id": "2bf14cda-1edd-4abd-a499-422913c075e6","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.1.produce","name": "output"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "ae0797506-ea7b-4a7f-a7e4-2f91e2082f05","version": "0.1.2","python_path": "d3m.primitives.data_transformation.load_graphs.DistilGraphLoader","name": "Load graphs into a parseable object"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"},{"id": "produce_target"}]},{"type": "PRIMITIVE","primitive": {"id": "8baea8e6-9d3a-46d7-acf1-04fd593dcd37","version": "0.2.0","python_path": "d3m.primitives.graph_matching.seeded_graph_matching.DistilSeededGraphMatcher","name": "SeededGraphMatcher"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"},"outputs": {"type": "CONTAINER","data": "steps.0.produce_target"}},"outputs": [{"id": "produce"}],"hyperparams": {"metric": {"type": "VALUE","data": "accuracy"}}}]}
],
"COLLABORATIVE_FILTERING": [
{"id": "8c3a2db6-4449-4a7a-9830-1b9cf2b993d6","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.6.produce","name": "output"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","version": "0.3.0","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","name": "Extract a DataFrame from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7","version": "0.2.0","python_path": "d3m.primitives.schema_discovery.profiler.Common","name": "Determine missing semantic types for columns automatically"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","version": "0.6.0","python_path": "d3m.primitives.data_transformation.column_parser.Common","name": "Parses strings into their types"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"parse_semantic_types": {"type": "VALUE","data": ["http://schema.org/Boolean","http://schema.org/Integer","http://schema.org/Float","https://metadata.datadrivendiscovery.org/types/FloatVector"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/Target","https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type": "PRIMITIVE","primitive": {"id": "a242314d-7955-483f-aed6-c74cd2b880df","version": "0.1.4","python_path": "d3m.primitives.collaborative_filtering.collaborative_filtering_link_prediction.DistilCollaborativeFiltering","name": "Collaborative filtering"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.3.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","version": "0.3.0","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","name": "Construct pipeline predictions output"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.5.produce"},"reference": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}]}]},
{"id": "15cea2f3-9eef-4a37-8f04-eea2e30f8d68","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"},"outputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.01,"type": "VALUE"},"beta": {"data": 0.01,"type": "VALUE"},"d": {"data": 50,"type": "VALUE"},"maxiter": {"data": 500,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.6.produce","type": "CONTAINER"},"outputs": {"data": "steps.5.produce","type": "CONTAINER"}},"hyperparams": {"n_estimators": {"data": 50,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "1b2a32a6-0ec5-3ca0-9386-b8b1f1b831d1","name": "sklearn.ensemble.bagging.BaggingClassifier","python_path": "d3m.primitives.classification.bagging.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": 
"steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 5,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]},
{"id": "164f4dfe-fcca-4769-aa10-d0d9f2a72cb3","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"},"outputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.1,"type": "VALUE"},"beta": {"data": 0.01,"type": "VALUE"},"d": {"data": 20,"type": "VALUE"},"maxiter": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.5.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"},"outputs": {"data": "steps.6.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 100,"type": "VALUE"},"kernel": {"data": {"choice": "rbf","gamma": {"case": "float","value": 0.02}},"type": 
"VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": "d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 6,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]},
{"id": "9ea39abe-b164-4eff-918e-c364ce87d167","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.6.produce","name": "output predictions"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [1,2],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"hyperparams": {"convert": {"data": true,"type": "VALUE"},"to_type": {"data": {"encoding": "pickle","value": "gANjYnVpbHRpbnMKaW50CnEALg=="},"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [3],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"convert": {"data": false,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"},"outputs": {"data": "steps.4.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 1,"type": "VALUE"},"beta": {"data": 1,"type": "VALUE"},"d": {"data": 100,"type": "VALUE"},"maxiter": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.5.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]},
{"id": "c4019fda-d205-4f89-9acf-5741e45e601a","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"},"outputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.01,"type": "VALUE"},"beta": {"data": 0.001,"type": "VALUE"},"d": {"data": 90,"type": "VALUE"},"maxiter": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.6.produce","type": "CONTAINER"},"outputs": {"data": "steps.5.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 100,"type": "VALUE"},"kernel": {"data": {"choice": "rbf","gamma": {"case": "float","value": 0.01}},"type": "VALUE"}},"outputs": [{"id": 
"produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": "d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 5,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]},
{"id": "e1a156e9-0e34-4def-b960-5ad5f3a910a1","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","name": "Parses strings into their types","python_path": "d3m.primitives.data_transformation.column_parser.Common","version": "0.5.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"},"outputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.01,"type": "VALUE"},"beta": {"data": 0.001,"type": "VALUE"},"d": {"data": 100,"type": "VALUE"},"maxiter": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.6.produce","type": "CONTAINER"},"outputs": {"data": "steps.5.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 100,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": 
"d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 5,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]}
],
"OBJECT_DETECTION": [
{"id": "f0aeacc2-3147-4a35-ac75-449e3f92f286", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.2.produce", "name": "output_predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e", "version": "0.2.0", "python_path": "d3m.primitives.data_transformation.denormalize.Common", "name": "Denormalize datasets"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"dataframe_resource": {"type": "VALUE", "data": "learningData"}}}, {"type": "PRIMITIVE", "primitive": {"id": "d921be1e-b158-4ab7-abb3-cb1b17f42639", "version": "0.1.0", "python_path": "d3m.primitives.object_detection.retinanet", "name": "retina_net"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}, "outputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}]}]},
{"id": "dd2d98ed-5d94-4245-a0c9-0861ed7bc177","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs": [{"name": "input dataset"}],"outputs": [{"data": "steps.4.produce","name": "predictions of input dataset"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e","version": "0.2.0","python_path": "d3m.primitives.data_transformation.denormalize.Common","name": "Denormalize datasets"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","version": "0.3.0","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","name": "Extract a DataFrame from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey","https://metadata.datadrivendiscovery.org/types/FileName"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type": "PRIMITIVE","primitive": {"id": "dsbox-featurizer-object-detection-yolo","version": "1.5.3","python_path": "d3m.primitives.feature_extraction.yolo.DSBOX","name": "DSBox Object Detection YOLO"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"},"outputs": {"type": "CONTAINER","data": "steps.3.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"epochs": {"type": "VALUE","data": 200},"use_fitted_weight": {"type": "VALUE","data": false}}}]},
{"id":"acdb068f-be85-48b1-81cc-e65d7b148d74","schema":"https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs":[{"name":"input dataset"}],"outputs":[{"data":"steps.4.produce","name":"predictions of input dataset"}],"steps":[{"type":"PRIMITIVE","primitive":{"id":"f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e","version":"0.2.0","python_path":"d3m.primitives.data_transformation.denormalize.Common","name":"Denormalize datasets"},"arguments":{"inputs":{"type":"CONTAINER","data":"inputs.0"}},"outputs":[{"id":"produce"}]},{"type":"PRIMITIVE","primitive":{"id":"4b42ce1e-9b98-4a25-b68e-fad13311eb65","version":"0.3.0","python_path":"d3m.primitives.data_transformation.dataset_to_dataframe.Common","name":"Extract a DataFrame from a Dataset"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.0.produce"}},"outputs":[{"id":"produce"}]},{"type":"PRIMITIVE","primitive":{"id":"4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version":"0.4.0","python_path":"d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name":"Extracts columns by semantic type"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.1.produce"}},"outputs":[{"id":"produce"}],"hyperparams":{"semantic_types":{"type":"VALUE","data":["https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey","https://metadata.datadrivendiscovery.org/types/FileName"]}}},{"type":"PRIMITIVE","primitive":{"id":"4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version":"0.4.0","python_path":"d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name":"Extracts columns by semantic type"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.1.produce"}},"outputs":[{"id":"produce"}],"hyperparams":{"semantic_types":{"type":"VALUE","data":["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type":"PRIMITIVE","primitive":{"id":"dsbox-featurizer-object-detection-yolo","version":"1.5.3","python_path":"d3m.primitives.feature_extraction.yolo.DSBOX","name":"DSBox Object Detection YOLO"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.2.produce"},"outputs":{"type":"CONTAINER","data":"steps.3.produce"}},"outputs":[{"id":"produce"}]}]}
],
"VERTEX_CLASSIFICATION": [
{"id": "704163cb-eb0d-4771-8258-5e057503a437","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.1.produce","name": "output"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "79012210-2463-4f94-9da6-11bdc5a7e6c4","version": "0.1.2","python_path": "d3m.primitives.data_transformation.load_single_graph.DistilSingleGraphLoader","name": "Load single graph and dataframe into a parseable object"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"},{"id": "produce_target"}]},{"type": "PRIMITIVE","primitive": {"id": "0130828c-1ac0-47a9-a167-f05bae5a3146","version": "0.1.1","python_path": "d3m.primitives.vertex_nomination.vertex_nomination.DistilVertexNomination","name": "VertexNomination"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"},"outputs": {"type": "CONTAINER","data": "steps.0.produce_target"}},"outputs": [{"id": "produce"}],"hyperparams": {"metric": {"type": "VALUE","data": "accuracy"}}}]},
{"id": "15cea2f3-9eef-4a37-8f04-eea2e30f8d68","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"},"outputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.01,"type": "VALUE"},"beta": {"data": 0.01,"type": "VALUE"},"d": {"data": 50,"type": "VALUE"},"maxiter": {"data": 500,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.6.produce","type": "CONTAINER"},"outputs": {"data": "steps.5.produce","type": "CONTAINER"}},"hyperparams": {"n_estimators": {"data": 50,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "1b2a32a6-0ec5-3ca0-9386-b8b1f1b831d1","name": "sklearn.ensemble.bagging.BaggingClassifier","python_path": "d3m.primitives.classification.bagging.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": 
"steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 5,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]},
{"id": "164f4dfe-fcca-4769-aa10-d0d9f2a72cb3","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"},"outputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.1,"type": "VALUE"},"beta": {"data": 0.01,"type": "VALUE"},"d": {"data": 20,"type": "VALUE"},"maxiter": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.5.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"},"outputs": {"data": "steps.6.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 100,"type": "VALUE"},"kernel": {"data": {"choice": "rbf","gamma": {"case": "float","value": 0.02}},"type": 
"VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": "d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 6,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]},
{"id": "9ea39abe-b164-4eff-918e-c364ce87d167","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.6.produce","name": "output predictions"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [1,2],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"hyperparams": {"convert": {"data": true,"type": "VALUE"},"to_type": {"data": {"encoding": "pickle","value": "gANjYnVpbHRpbnMKaW50CnEALg=="},"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [3],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"convert": {"data": false,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"},"outputs": {"data": "steps.4.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 1,"type": "VALUE"},"beta": {"data": 1,"type": "VALUE"},"d": {"data": 100,"type": "VALUE"},"maxiter": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.5.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]},
{"id": "c4019fda-d205-4f89-9acf-5741e45e601a","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"},"outputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.01,"type": "VALUE"},"beta": {"data": 0.001,"type": "VALUE"},"d": {"data": 90,"type": "VALUE"},"maxiter": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.6.produce","type": "CONTAINER"},"outputs": {"data": "steps.5.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 100,"type": "VALUE"},"kernel": {"data": {"choice": "rbf","gamma": {"case": "float","value": 0.01}},"type": "VALUE"}},"outputs": [{"id": 
"produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": "d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 5,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]},
{"id": "e1a156e9-0e34-4def-b960-5ad5f3a910a1","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","name": "Parses strings into their types","python_path": "d3m.primitives.data_transformation.column_parser.Common","version": "0.5.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"},"outputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.01,"type": "VALUE"},"beta": {"data": 0.001,"type": "VALUE"},"d": {"data": 100,"type": "VALUE"},"maxiter": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.6.produce","type": "CONTAINER"},"outputs": {"data": "steps.5.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 100,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": 
"d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 5,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]},
{"id": "0f6cafc4-5628-47bc-bbf5-8cab3a7c0e95","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.3.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "cb192a83-63e2-4075-bab9-e6ba1a8365b6","version": "0.1.0","python_path": "d3m.primitives.data_transformation.load_graphs.JHU","name": "Extract a list of Graphs from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "32fec24f-6861-4a4c-88f3-d4ec2bc1b486","version": "0.1.0","python_path": "d3m.primitives.data_preprocessing.largest_connected_component.JHU","name": "jhu.lcc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "8fa6178b-84f7-37d8-87e8-4d3a44c86569","version": "0.1.0","python_path": "d3m.primitives.data_transformation.laplacian_spectral_embedding.JHU","name": "jhu.lse"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_dimension": {"type": "VALUE","data": 5},"use_attributes": {"type": "VALUE","data": true}}},{"type": "PRIMITIVE","primitive": {"id": "5194ef94-3683-319a-9d8d-5c3fdd09de24","version": "0.1.0","python_path": "d3m.primitives.graph_clustering.gaussian_clustering.JHU","name": "jhu.gclust"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_clusters": {"type": "VALUE","data": 10}}}]},
{"id": "ffc49730-eb73-423c-ab6c-acb47300fcfc","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.3.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "cb192a83-63e2-4075-bab9-e6ba1a8365b6","version": "0.1.0","python_path": "d3m.primitives.data_transformation.load_graphs.JHU","name": "Extract a list of Graphs from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "32fec24f-6861-4a4c-88f3-d4ec2bc1b486","version": "0.1.0","python_path": "d3m.primitives.data_preprocessing.largest_connected_component.JHU","name": "jhu.lcc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "8fa6178b-84f7-37d8-87e8-4d3a44c86569","version": "0.1.0","python_path": "d3m.primitives.data_transformation.laplacian_spectral_embedding.JHU","name": "jhu.lse"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_dimension": {"type": "VALUE","data": 5},"use_attributes": {"type": "VALUE","data": true}}},{"type": "PRIMITIVE","primitive": {"id": "c9d5da5d-0520-468e-92df-bd3a85bb4fac","version": "0.1.0","python_path": "d3m.primitives.classification.gaussian_classification.JHU","name": "jhu.gclass"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}]}]},
{"id": "4a2fb696-bf29-410d-934d-c4b17b273938","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.1.produce","name": "Results"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "a22f9bd3-818e-44e9-84a3-9592c5a85408","version": "1.7.8","python_path": "d3m.primitives.data_transformation.vertex_classification_parser.VertexClassificationParser","name": "Vertex Classification Parser"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "dca25a46-7a5f-48d9-ac9b-d14d4d671b0b","version": "1.7.8","python_path": "d3m.primitives.classification.vertex_nomination.VertexClassification","name": "Vertex Classification"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]}]},
{"id": "2e216966-bd3b-4b53-9933-7ce9a88de6d1","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.1,"type": "VALUE"},"d": {"data": 15,"type": "VALUE"},"epsilon": {"data": 0.1,"type": "VALUE"},"maxiter": {"data": 5000,"type": "VALUE"},"t": {"data": 0.001,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "7c357e6e-7124-4f2a-8371-8021c8c95cc9","name": "Huber PCA","python_path": "d3m.primitives.feature_extraction.huber_pca.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.5.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"},"outputs": {"data": "steps.6.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 1000,"type": "VALUE"},"kernel": {"data": {"choice": "rbf","gamma": {"case": "float","value": 0.01}},"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": 
"0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": "d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 6,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]},
{"id": "4f678918-1de5-4db4-8c1c-d7dd0e3b2bec","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.11.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","name": "Parses strings into their types","python_path": "d3m.primitives.data_transformation.column_parser.Common","version": "0.5.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d016df89-de62-3c53-87ed-c06bb6a23cde","name": "sklearn.impute.SimpleImputer","python_path": "d3m.primitives.data_cleaning.imputer.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.5.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.1,"type": "VALUE"},"d": {"data": 20,"type": "VALUE"},"epsilon": {"data": 1,"type": "VALUE"},"t": {"data": 0.001,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "7c357e6e-7124-4f2a-8371-8021c8c95cc9","name": "Huber PCA","python_path": "d3m.primitives.feature_extraction.huber_pca.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": 
"26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.6.produce","type": "CONTAINER"},"outputs": {"data": "steps.8.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": "d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.9.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 8,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.10.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]},
{"id": "94db5247-7827-468a-81b6-6b709af86d5c","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.1,"type": "VALUE"},"d": {"data": 50,"type": "VALUE"},"epsilon": {"data": 0.1,"type": "VALUE"},"maxiter": {"data": 2000,"type": "VALUE"},"t": {"data": 0.001,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "7c357e6e-7124-4f2a-8371-8021c8c95cc9","name": "Huber PCA","python_path": "d3m.primitives.feature_extraction.huber_pca.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.5.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"},"outputs": {"data": "steps.6.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 1000,"type": "VALUE"},"kernel": {"data": {"choice": "rbf","gamma": {"case": "float","value": 0.01}},"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": 
"0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": "d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 6,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]},
{"id": "7cb3e0eb-2f3e-4756-9c4e-1cc2852c84b9","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.1,"type": "VALUE"},"d": {"data": 25,"type": "VALUE"},"epsilon": {"data": 0.01,"type": "VALUE"},"maxiter": {"data": 5000,"type": "VALUE"},"t": {"data": 0.0005,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "7c357e6e-7124-4f2a-8371-8021c8c95cc9","name": "Huber PCA","python_path": "d3m.primitives.feature_extraction.huber_pca.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.5.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"},"outputs": {"data": "steps.6.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 5000,"type": "VALUE"},"kernel": {"data": {"choice": "rbf","gamma": {"case": "float","value": 0.1}},"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": 
"d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 6,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]},
{"id":"c50643d6-9f82-44fb-ae6e-e40ee96b6899","schema":"https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs":[{"name":"input dataset"}],"outputs":[{"data":"steps.5.produce","name":"predictions of input dataset"}],"steps":[{"type":"PRIMITIVE","primitive":{"id":"f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e","version":"0.2.0","python_path":"d3m.primitives.data_transformation.denormalize.Common","name":"Denormalize datasets"},"arguments":{"inputs":{"type":"CONTAINER","data":"inputs.0"}},"outputs":[{"id":"produce"}],"hyperparams":{"starting_resource":{"type":"VALUE","data":null},"recursive":{"type":"VALUE","data":true},"many_to_many":{"type":"VALUE","data":false},"discard_not_joined_tabular_resources":{"type":"VALUE","data":false}}},{"type":"PRIMITIVE","primitive":{"id":"4b42ce1e-9b98-4a25-b68e-fad13311eb65","version":"0.3.0","python_path":"d3m.primitives.data_transformation.dataset_to_dataframe.Common","name":"Extract a DataFrame from a Dataset"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.0.produce"}},"outputs":[{"id":"produce"}]},{"type":"PRIMITIVE","primitive":{"id":"4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version":"0.3.0","python_path":"d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name":"Extracts columns by semantic type"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.1.produce"}},"outputs":[{"id":"produce"}],"hyperparams":{"semantic_types":{"type":"VALUE","data":["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type":"PRIMITIVE","primitive":{"id":"7d61e488-b5bb-4c79-bad6-f1dc07292bf4","version":"1.0.0","python_path":"d3m.primitives.feature_construction.sdne.DSBOX","name":"SDNE"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.0.produce"}},"outputs":[{"id":"produce"}],"hyperparams":{"beta":{"type":"VALUE","data":4},"alpha":{"type":"VALUE","data":0.00001},"dimension":{"type":"VALUE","data":128},"epochs":{"type":"VALUE","data":200},"lr":{"type":"VALUE","data":0.0005}}},{"type": "PRIMITIVE","primitive": {"id":"7ddf2fd8-2f7f-4e53-96a7-0d9f5aeecf93","version":"1.5.3","python_path":"d3m.primitives.data_transformation.to_numeric.DSBOX","name":"ISI DSBox To Numeric DataFrame"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.3.produce"}},"outputs":[{"id":"produce"}]},{"type":"PRIMITIVE","primitive":{"id":"1dd82833-5692-39cb-84fb-2455683075f3","version":"2019.6.7","python_path":"d3m.primitives.classification.random_forest.SKlearn","name":"sklearn.ensemble.forest.RandomForestClassifier"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.4.produce"},"outputs":{"type":"CONTAINER","data":"steps.2.produce"}},"outputs":[{"id":"produce"}],"hyperparams":{"max_depth":{"type":"VALUE","data":{"case":"int","value":30}},"min_samples_leaf":{"type":"VALUE","data":{"case":"absolute","value":2}},"min_samples_split":{"type":"VALUE","data":{"case":"absolute","value":2}},"max_features":{"type":"VALUE","data":{"case":"calculated","value":"sqrt"}},"n_estimators":{"type":"VALUE","data":100},"add_index_columns":{"type":"VALUE","data":true},"use_semantic_types":{"type":"VALUE","data":false},"error_on_no_input":{"type":"VALUE","data":true}}}]},
{"id":"fc1eee7f-6435-4001-9cf6-6d24330d9b1c","schema":"https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs":[{"name":"input dataset"}],"outputs":[{"data":"steps.4.produce","name":"predictions of input dataset"}],"steps":[{"type":"PRIMITIVE","primitive":{"id":"f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e","version":"0.2.0","python_path":"d3m.primitives.data_transformation.denormalize.Common","name":"Denormalize datasets"},"arguments":{"inputs":{"type":"CONTAINER","data":"inputs.0"}},"outputs":[{"id":"produce"}],"hyperparams":{"starting_resource":{"type":"VALUE","data":null},"recursive":{"type":"VALUE","data":true},"many_to_many":{"type":"VALUE","data":false},"discard_not_joined_tabular_resources":{"type":"VALUE","data":false}}},{"type":"PRIMITIVE","primitive":{"id":"4b42ce1e-9b98-4a25-b68e-fad13311eb65","version":"0.3.0","python_path":"d3m.primitives.data_transformation.dataset_to_dataframe.Common","name":"Extract a DataFrame from a Dataset"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.0.produce"}},"outputs":[{"id":"produce"}]},{"type":"PRIMITIVE","primitive":{"id":"4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version":"0.3.0","python_path":"d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name":"Extracts columns by semantic type"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.1.produce"}},"outputs":[{"id":"produce"}],"hyperparams":{"semantic_types":{"type":"VALUE","data":["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type":"PRIMITIVE","primitive":{"id":"48572851-b86b-4fda-961d-f3f466adb58e","version":"1.0.0","python_path":"d3m.primitives.feature_construction.gcn_mixhop.DSBOX","name":"GCN"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.0.produce"},"outputs":{"type":"CONTAINER","data":"steps.2.produce"}},"outputs":[{"id":"produce"}],"hyperparams":{"epochs":{"type":"VALUE","data":200},"adjacency_order":{"type":"VALUE","data":3}}},{"type":"PRIMITIVE","primitive":{"id":"1dd82833-5692-39cb-84fb-2455683075f3","version":"2019.6.7","python_path":"d3m.primitives.classification.random_forest.SKlearn","name":"sklearn.ensemble.forest.RandomForestClassifier"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.3.produce"},"outputs":{"type":"CONTAINER","data":"steps.2.produce"}},"outputs":[{"id":"produce"}],"hyperparams":{"max_depth":{"type":"VALUE","data":{"case":"int","value":30}},"min_samples_leaf":{"type":"VALUE","data":{"case":"absolute","value":2}},"min_samples_split":{"type":"VALUE","data":{"case":"absolute","value":2}},"max_features":{"type":"VALUE","data":{"case":"calculated","value":"sqrt"}},"n_estimators":{"type":"VALUE","data":100},"add_index_columns":{"type":"VALUE","data":true},"use_semantic_types":{"type":"VALUE","data":false},"error_on_no_input":{"type":"VALUE","data":true}}}]},
{"id": "ddc6c7e9-64b4-4f9c-af07-5f27461cb940","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.3.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "cb192a83-63e2-4075-bab9-e6ba1a8365b6","version": "0.1.0","python_path": "d3m.primitives.data_transformation.load_graphs.JHU","name": "Extract a list of Graphs from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "32fec24f-6861-4a4c-88f3-d4ec2bc1b486","version": "0.1.0","python_path": "d3m.primitives.data_preprocessing.largest_connected_component.JHU","name": "jhu.lcc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "b940ccbd-9e9b-3166-af50-210bfd79251b","version": "0.1.0","python_path": "d3m.primitives.data_transformation.adjacency_spectral_embedding.JHU","name": "jhu.ase"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_dimension": {"type": "VALUE","data": 5},"use_attributes": {"type": "VALUE","data": true}}},{"type": "PRIMITIVE","primitive": {"id": "c9d5da5d-0520-468e-92df-bd3a85bb4fac","version": "0.1.0","python_path": "d3m.primitives.classification.gaussian_classification.JHU","name": "jhu.gclass"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}]}]},
{"id": "12a4b6a8-b2e4-4604-afe5-8196bf55a925","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.3.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "cb192a83-63e2-4075-bab9-e6ba1a8365b6","version": "0.1.0","python_path": "d3m.primitives.data_transformation.load_graphs.JHU","name": "Extract a list of Graphs from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "32fec24f-6861-4a4c-88f3-d4ec2bc1b486","version": "0.1.0","python_path": "d3m.primitives.data_preprocessing.largest_connected_component.JHU","name": "jhu.lcc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "b940ccbd-9e9b-3166-af50-210bfd79251b","version": "0.1.0","python_path": "d3m.primitives.data_transformation.adjacency_spectral_embedding.JHU","name": "jhu.ase"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_dimension": {"type": "VALUE","data": 5},"use_attributes": {"type": "VALUE","data": true}}},{"type": "PRIMITIVE","primitive": {"id": "5194ef94-3683-319a-9d8d-5c3fdd09de24","version": "0.1.0","python_path": "d3m.primitives.graph_clustering.gaussian_clustering.JHU","name": "jhu.gclust"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_clusters": {"type": "VALUE","data": 10}}}]},
{"id": "6216f2bd-2f23-4dbf-92d0-f3b40aeac150","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.2.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "09f2eea8-667c-44b8-a955-6a153ba9ccc3","version": "0.1.0","python_path": "d3m.primitives.link_prediction.data_conversion.JHU","name": "jhu.link_pred_graph_reader"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "b940ccbd-9e9b-3166-af50-210bfd79251b","version": "0.1.0","python_path": "d3m.primitives.data_transformation.adjacency_spectral_embedding.JHU","name": "jhu.ase"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"which_elbow": {"type": "VALUE","data": 1},"max_dimension": {"type": "VALUE","data": 2},"use_attributes": {"type": "VALUE","data": false}}},{"type": "PRIMITIVE","primitive": {"id": "25e97696-b96f-4f5c-8620-b340fe83414d","version": "0.1.0","python_path": "d3m.primitives.link_prediction.rank_classification.JHU","name": "jhu.link_pred_rc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]}]}
],
"FORECASTING": []
}

+ 31
- 0
axolotl/axolotl/utils/resources/scoring_pipeline.yml View File

@@ -0,0 +1,31 @@
id: f596cd77-25f8-4d4c-a350-bb30ab1e58f6
schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json
source:
name: Mitar
created: "2020-04-18T11:42:44.138742Z"
name: Scoring pipeline
description: |-
A general scoring pipeline.
inputs:
- name: predictions
- name: score dataset
outputs:
- name: scores
data: steps.0.produce
steps:
# Step 0.
- type: PRIMITIVE
primitive:
id: 799802fb-2e11-4ab7-9c5e-dda09eb52a70
version: 0.5.0
python_path: d3m.primitives.evaluation.compute_scores.Core
name: Compute scores given the metrics to use
arguments:
inputs:
type: CONTAINER
data: inputs.0
score_dataset:
type: CONTAINER
data: inputs.1
outputs:
- id: produce
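
For reference, a pipeline description in this form can be loaded with d3m's standard loader. A minimal sketch (illustrative only; it assumes the d3m package is installed and the YAML above is saved locally as scoring_pipeline.yml, and it mirrors the get_scoring_pipeline() helper in axolotl/axolotl/utils/schemas.py later in this commit):

from d3m.metadata.pipeline import Pipeline

# Parse the YAML pipeline description into a d3m Pipeline object.
with open('scoring_pipeline.yml') as pipeline_file:
    scoring_pipeline = Pipeline.from_yaml(pipeline_file)

print(scoring_pipeline.id)  # f596cd77-25f8-4d4c-a350-bb30ab1e58f6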

+ 7
- 0
axolotl/axolotl/utils/resources/splitting_pipelines.json View File

@@ -0,0 +1,7 @@
{
"HOLDOUT_FIXED": {"id": "9c18472e-fff7-4129-93f6-1ab996e82adb", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2018-10-27T01:30:10.245934Z", "inputs": [{"name": "folds"}, {"name": "full dataset"}], "outputs": [{"data": "steps.0.produce", "name": "train datasets"}, {"data": "steps.2.produce", "name": "test datasets"}, {"data": "steps.1.produce", "name": "score datasets"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "1654f000-2178-4520-be4c-a95bc26b8d3a", "version": "0.1.0", "python_path": "d3m.primitives.evaluation.fixed_split_dataset_split.Commmon", "name": "Fixed split tabular dataset splits", "digest": "4ebb8d32da071e84370aa978f0b455a592fb2cc88181d669bcf8081ecd98fa00"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}, "dataset": {"type": "CONTAINER", "data": "inputs.1"}}, "outputs": [{"id": "produce"}, {"id": "produce_score_data"}]}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce_score_data"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/PrivilegedData"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedTarget", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}], "source": {"name": "Mitar"}, "name": "Fixed split of tabular datasets", "description": "A pipeline which splits a tabular dataset in a way that uses for the test\n(score) split a fixed list of primary index values or row indices of the main\nresource to be used.\n", "digest": "28193e7483794e5bd164c352e02e90090d9cda17abfe542b2393a4ecb58c0bb8"},
"K_FOLD": {"id": "c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2018-07-27T19:39:00.676949Z", "inputs": [{"name": "folds"}, {"name": "full dataset"}], "outputs": [{"data": "steps.0.produce", "name": "train datasets"}, {"data": "steps.2.produce", "name": "test datasets"}, {"data": "steps.1.produce", "name": "score datasets"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "bfedaf3a-6dd0-4a83-ad83-3a50fe882bf8", "version": "0.1.0", "python_path": "d3m.primitives.evaluation.kfold_dataset_split.Common", "name": "K-fold cross-validation tabular dataset splits", "digest": "8fc8fd388ed30e8e13c0c04880b0dd81051cd15ae7416a962d79b8187be65fbc"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}, "dataset": {"type": "CONTAINER", "data": "inputs.1"}}, "outputs": [{"id": "produce"}, {"id": "produce_score_data"}]}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce_score_data"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/PrivilegedData"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedTarget", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}], "source": {"name": "Mitar"}, "name": "K-fold split of tabular datasets", "description": "K-fold split of tabular datasets for cross-validation.\n", "digest": "c1546da06d12b4f435973bc335a54ca7486ba51a7067c65e58e397236cecad73"},
"k-fold-timeseries-split": {"id": "5bed1f23-ac17-4b52-9d06-a5b77a6aea51", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2019-04-08T16:18:27.250294Z", "inputs": [{"name": "folds"}, {"name": "full dataset"}], "outputs": [{"data": "steps.0.produce", "name": "train datasets"}, {"data": "steps.2.produce", "name": "test datasets"}, {"data": "steps.1.produce", "name": "score datasets"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "002f9ad1-46e3-40f4-89ed-eeffbb3a102b", "version": "0.3.0", "python_path": "d3m.primitives.evaluation.kfold_time_series_split.Common", "name": "K-fold cross-validation timeseries dataset splits", "digest": "e06a27b03f9cea879c21e012b031f84c2a7b37193987134481db1117f05e9657"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}, "dataset": {"type": "CONTAINER", "data": "inputs.1"}}, "outputs": [{"id": "produce"}, {"id": "produce_score_data"}]}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce_score_data"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/PrivilegedData"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedTarget", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}], "source": {"name": "Jeffrey Gleason"}, "name": "K-fold split of timeseries datasets", "description": "K-fold split of timeseries datasets for cross-validation.\n", "digest": "33aea0b6bd864a383020eb9d1f64fda193e20bb8690ee516809004d805f9614a"},
"TRAINING_DATA": {"id": "79ce71bd-db96-494b-a455-14f2e2ac5040", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2018-10-26T00:48:08.341897Z", "inputs": [{"name": "folds"}, {"name": "full dataset"}], "outputs": [{"data": "steps.0.produce", "name": "train datasets"}, {"data": "steps.2.produce", "name": "test datasets"}, {"data": "steps.1.produce", "name": "score datasets"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "48c683ad-da9e-48cf-b3a0-7394dba5e5d2", "version": "0.1.0", "python_path": "d3m.primitives.evaluation.no_split_dataset_split.Common", "name": "No-split tabular dataset splits", "digest": "869d62e577148338d1c732347d6d0bf2119ae9af6b90037fda5044ab0eef01dc"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}, "dataset": {"type": "CONTAINER", "data": "inputs.1"}}, "outputs": [{"id": "produce"}, {"id": "produce_score_data"}]}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce_score_data"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/PrivilegedData"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedTarget", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}], "source": {"name": "Mitar"}, "name": "No split of tabular datasets", "description": "A pipeline which splits a tabular dataset in a way that for all splits it\nproduces the same (full) dataset. It still redacts the test split.\nUseful for unsupervised learning tasks.\n", "digest": "690373622142f12dc078657246b8f2f6c070ebd32720321d786a3f0c653d55cc"},
"HOLDOUT": {"id": "3c11d171-e2ad-4d26-a034-04f3b062306c", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2018-07-28T01:24:39.642266Z", "inputs": [{"name": "folds"}, {"name": "full dataset"}], "outputs": [{"data": "steps.0.produce", "name": "train datasets"}, {"data": "steps.2.produce", "name": "test datasets"}, {"data": "steps.1.produce", "name": "score datasets"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "3fcc6dc4-6681-4c86-948e-066d14e7d803", "version": "0.1.0", "python_path": "d3m.primitives.evaluation.train_score_dataset_split.Common", "name": "Train-score tabular dataset splits", "digest": "f65655f435f9e703e00f174dae743f93fee5c10aa2016d2398f4d53bee8d5bae"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}, "dataset": {"type": "CONTAINER", "data": "inputs.1"}}, "outputs": [{"id": "produce"}, {"id": "produce_score_data"}]}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce_score_data"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/PrivilegedData"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedTarget", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}], "source": {"name": "Mitar"}, "name": "Train-test split of tabular datasets", "description": "Train-test split of tabular datasets.\n", "digest": "675ee3e96e9b1bfba41694b6289a889ef6fc96e5477b89c8267871b941e4d78e"}
}

+ 472
- 0
axolotl/axolotl/utils/schemas.py View File

@@ -0,0 +1,472 @@
import os
import copy
import json
import typing
import logging
import math
import random
import binascii

from d3m import container
from d3m.metadata.problem import TaskKeyword, PerformanceMetric
from d3m.metadata.pipeline import Pipeline
from d3m import utils as d3m_utils

from axolotl.utils import pipeline as pipeline_utils

logger = logging.getLogger(__name__)


# ContainerType = typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.List]
ContainerType = container.Dataset

resource_dir = os.path.dirname(__file__)
SPLITTING_PIPELINES_DIR = os.path.join(resource_dir, 'resources', 'splitting_pipelines.json')
SCORING_PIPELINES_DIR = os.path.join(resource_dir, 'resources', 'scoring_pipeline.yml')
PIPELINES_DB_DIR = os.path.join(resource_dir, 'resources', 'default_pipelines.json')

TASK_TYPE = {
TaskKeyword.CLASSIFICATION, TaskKeyword.REGRESSION,
TaskKeyword.CLUSTERING, TaskKeyword.LINK_PREDICTION,
TaskKeyword.VERTEX_NOMINATION, TaskKeyword.COMMUNITY_DETECTION,
TaskKeyword.GRAPH_MATCHING, TaskKeyword.COLLABORATIVE_FILTERING,
TaskKeyword.OBJECT_DETECTION, TaskKeyword.VERTEX_CLASSIFICATION,
TaskKeyword.FORECASTING
}

TASK_SUBTYPES = {
TaskKeyword.MULTIVARIATE,
TaskKeyword.BINARY,
TaskKeyword.NONOVERLAPPING,
TaskKeyword.OVERLAPPING,
TaskKeyword.UNIVARIATE,
TaskKeyword.MULTICLASS,
TaskKeyword.MULTILABEL,
}

DATA_TYPES = {
TaskKeyword.TIME_SERIES,
TaskKeyword.AUDIO,
TaskKeyword.TABULAR,
TaskKeyword.TEXT,
TaskKeyword.VIDEO,
TaskKeyword.GRAPH,
TaskKeyword.IMAGE,
TaskKeyword.GEOSPATIAL,
TaskKeyword.RELATIONAL,
TaskKeyword.GROUPED,
TaskKeyword.LUPI
}

CLASSIFICATION_METRICS = [
{'metric': PerformanceMetric.ACCURACY, 'params': {}},
{'metric': PerformanceMetric.PRECISION, 'params': {}},
{'metric': PerformanceMetric.RECALL, 'params': {}},
{'metric': PerformanceMetric.F1, 'params': {}},
{'metric': PerformanceMetric.F1_MICRO, 'params': {}},
{'metric': PerformanceMetric.F1_MACRO, 'params': {}},
{'metric': PerformanceMetric.ROC_AUC, 'params': {}},
]

BINARY_CLASSIFICATION_METRICS = [
{'metric': PerformanceMetric.ACCURACY, 'params': {}},
]

MULTICLASS_CLASSIFICATION_METRICS = [
{'metric': PerformanceMetric.ACCURACY, 'params': {}},
{'metric': PerformanceMetric.F1_MICRO, 'params': {}},
{'metric': PerformanceMetric.F1_MACRO, 'params': {}},
]

MULTILABEL_CLASSIFICATION_METRICS = [
{'metric': PerformanceMetric.ACCURACY, 'params': {}},
]

REGRESSION_METRICS = [
{'metric': PerformanceMetric.MEAN_ABSOLUTE_ERROR, 'params': {}},
{'metric': PerformanceMetric.MEAN_SQUARED_ERROR, 'params': {}},
{'metric': PerformanceMetric.ROOT_MEAN_SQUARED_ERROR, 'params': {}},
{'metric': PerformanceMetric.R_SQUARED, 'params': {}},
]

CLUSTERING_METRICS = [
{'metric': PerformanceMetric.NORMALIZED_MUTUAL_INFORMATION, 'params': {}},
]

LINK_PREDICTION_METRICS = [
{'metric': PerformanceMetric.ACCURACY, 'params': {}},
]

VERTEX_NOMINATION_METRICS = [
{'metric': PerformanceMetric.ACCURACY, 'params': {}},
]

COMMUNITY_DETECTION_METRICS = [
{'metric': PerformanceMetric.NORMALIZED_MUTUAL_INFORMATION, 'params': {}},
]

GRAPH_CLUSTERING_METRICS = []

GRAPH_MATCHING_METRICS = [
{'metric': PerformanceMetric.ACCURACY, 'params': {}}
]

TIME_SERIES_FORECASTING_METRICS = REGRESSION_METRICS

COLLABORATIVE_FILTERING_METRICS = REGRESSION_METRICS

OBJECT_DETECTION_METRICS = [
{'metric': PerformanceMetric.OBJECT_DETECTION_AVERAGE_PRECISION, 'params': {}},
]

MULTICLASS_VERTEX_METRICS = MULTICLASS_CLASSIFICATION_METRICS

SEMI_SUPERVISED_MULTICLASS_CLASSIFICATION_METRICS = MULTICLASS_CLASSIFICATION_METRICS

SEMI_SUPERVISED_REGRESSION_METRICS = REGRESSION_METRICS

DATA_PREPARATION_PARAMS = {
'k_fold_tabular': {
'method': 'K_FOLD',
'number_of_folds': '3',
'stratified': 'false',
'shuffle': 'true',
'randomSeed': '42',
},

'holdout': {
'method': 'HOLDOUT',
'train_score_ratio': '0.2',
'shuffle': 'true',
'stratified': 'true',
'randomSeed': '42',
},

'no_stratified_holdout': {
'method': 'HOLDOUT',
'train_score_ratio': '0.2',
'shuffle': 'true',
'stratified': 'false',
'randomSeed': '42',
},

'no_split': {
'method': 'TRAINING_DATA',
'number_of_folds': '1',
'stratified': 'true',
'shuffle': 'true',
'randomSeed': '42',
},
}
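
# Note (illustrative): the 'method' values above match keys in
# resources/splitting_pipelines.json (e.g. K_FOLD, HOLDOUT, TRAINING_DATA),
# so an evaluation configuration can be resolved to its splitting pipeline:
#
#   >>> params = DATA_PREPARATION_PARAMS['k_fold_tabular']
#   >>> splitting_pipeline = get_splitting_pipeline(params['method'])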

PROBLEM_DEFINITION = {
'binary_classification': {
'performance_metrics': BINARY_CLASSIFICATION_METRICS,
'task_keywords': [TaskKeyword.CLASSIFICATION, TaskKeyword.BINARY]
},
'regression': {
'performance_metrics': REGRESSION_METRICS,
'task_keywords': [TaskKeyword.UNIVARIATE, TaskKeyword.REGRESSION]
}

}


def get_task_description(keywords) -> dict:
"""
A function that parses the keywords from the problem and maps them to
a task type, task subtype and data types, e.g. tabular, images, audio, etc.

Parameters
----------
keywords: List[d3m.problem.TaskKeyword]
List of keywords that come from the d3m problem description

Returns
-------
dict
{
task_type: str
task_subtype: str
data_types: list
semi: bool
}
"""

task_type = None
task_subtype = None
data_types = []
semi = False
for keyword in keywords:
if keyword in TASK_TYPE:
task_type = keyword.name
elif keyword in TASK_SUBTYPES:
task_subtype = keyword.name
elif keyword in DATA_TYPES:
data_types.append(keyword.name)
elif keyword.name == TaskKeyword.SEMISUPERVISED:
semi = True

# if data_types is empty we assume the data is tabular:
if not data_types:
data_types.append(TaskKeyword.TABULAR)

return {'task_type': task_type, 'task_subtype': task_subtype, 'data_types': data_types, 'semi': semi}
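
# Example (illustrative): keywords are split into a task type, a subtype and
# data types, defaulting to tabular data:
#
#   >>> get_task_description([TaskKeyword.CLASSIFICATION, TaskKeyword.BINARY])
#   {'task_type': 'CLASSIFICATION', 'task_subtype': 'BINARY',
#    'data_types': [TaskKeyword.TABULAR], 'semi': False}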


def get_metrics_from_task(task_des, perf_metrics=None):
"""
Provides a list of metrics ready to use as performance_metrics.

Parameters
----------
task_des: dict
A dictionary describing the task
perf_metrics: list
A list of dicts specifying the needed performance metric parameters

Returns
-------
performance_metrics: list
A list of dicts containing performance metrics.
"""
# For the case that the user only wants to run a full pipeline,
# there may be no task description at all.
if not task_des:
return None
task_type = task_des['task_type']
task_subtype = task_des['task_subtype']
data_types = task_des['data_types']
if TaskKeyword.CLASSIFICATION == task_type or \
TaskKeyword.VERTEX_CLASSIFICATION == task_type:
if task_des['semi']:
# TODO: Temporary solution to binary semi supervised classification
metrics = SEMI_SUPERVISED_MULTICLASS_CLASSIFICATION_METRICS
elif TaskKeyword.BINARY == task_subtype:
metrics = BINARY_CLASSIFICATION_METRICS
elif TaskKeyword.MULTICLASS == task_subtype:
metrics = MULTICLASS_CLASSIFICATION_METRICS
elif TaskKeyword.MULTILABEL == task_subtype:
metrics = MULTILABEL_CLASSIFICATION_METRICS
else:
metrics = CLASSIFICATION_METRICS
elif TaskKeyword.REGRESSION == task_type:
metrics = REGRESSION_METRICS
elif TaskKeyword.CLUSTERING == task_type:
metrics = CLUSTERING_METRICS
elif TaskKeyword.LINK_PREDICTION == task_type:
metrics = LINK_PREDICTION_METRICS
elif TaskKeyword.VERTEX_NOMINATION == task_type:
metrics = VERTEX_NOMINATION_METRICS
elif TaskKeyword.COMMUNITY_DETECTION == task_type:
metrics = COMMUNITY_DETECTION_METRICS
elif TaskKeyword.GRAPH_MATCHING == task_type:
metrics = GRAPH_MATCHING_METRICS
elif TaskKeyword.TIME_SERIES in data_types and TaskKeyword.FORECASTING == task_type:
metrics = TIME_SERIES_FORECASTING_METRICS
elif TaskKeyword.COLLABORATIVE_FILTERING == task_type:
metrics = COLLABORATIVE_FILTERING_METRICS
elif TaskKeyword.OBJECT_DETECTION == task_type:
metrics = OBJECT_DETECTION_METRICS
else:
raise ValueError('Task keywords not supported, keywords: {}'.format(task_des))

for i, metric in enumerate(metrics):
for perf_metric in perf_metrics or []:
if perf_metric['metric'] == metric['metric'] and perf_metric.get('params', {}).get('pos_label') is not None:
copy_metric = copy.deepcopy(metric)
copy_metric['params']['pos_label'] = perf_metric['params']['pos_label']
metrics[i] = copy_metric
logger.info('get_metrics_from_task:metrics: {}'.format(metrics))
return metrics
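
# Example (illustrative): a binary classification task description selects
# BINARY_CLASSIFICATION_METRICS when no extra metric parameters are supplied:
#
#   >>> task_des = get_task_description([TaskKeyword.CLASSIFICATION, TaskKeyword.BINARY])
#   >>> get_metrics_from_task(task_des, perf_metrics=[])
#   [{'metric': PerformanceMetric.ACCURACY, 'params': {}}]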


def get_eval_configuration(task_type: str, data_types: typing.Sequence, semi: bool) -> typing.Dict:
"""
Determines which method of evaluation to use, cross_fold, holdout, etc.

Parameters
----------
task_type: str
task type
data_types: list
data types
semi: bool
is it semi-supervised problem

Returns
-------
eval_configuration: dict
A dict that contains the evaluation method to use.
"""

# for the case of no problem description, return an empty configuration.
if not task_type:
return {}

if semi:
# Splitting semi-supervised data may produce empty ground truth, which can cause errors in sklearn metrics.
return DATA_PREPARATION_PARAMS['no_split']

if TaskKeyword.CLASSIFICATION == task_type:
# These data types tend to take up a lot of time to run, so no k_fold.
if TaskKeyword.AUDIO in data_types or TaskKeyword.VIDEO in data_types \
or TaskKeyword.IMAGE in data_types:
return DATA_PREPARATION_PARAMS['holdout']
else:
return DATA_PREPARATION_PARAMS['k_fold_tabular']
elif TaskKeyword.REGRESSION == task_type:
return DATA_PREPARATION_PARAMS['no_stratified_holdout']
else:
return DATA_PREPARATION_PARAMS['no_split']
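
# Example (illustrative): plain tabular classification defaults to 3-fold
# cross-validation, while image/audio/video classification falls back to a
# holdout split to keep evaluation time manageable:
#
#   >>> get_eval_configuration('CLASSIFICATION', ['TABULAR'], semi=False)
#   {'method': 'K_FOLD', 'number_of_folds': '3', 'stratified': 'false', ...}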


def get_splitting_pipeline(splitting_name: str) -> Pipeline:
with open(SPLITTING_PIPELINES_DIR) as file:
splitting_pipelines = json.load(file)

if splitting_name in splitting_pipelines:
return pipeline_utils.load_pipeline(splitting_pipelines[splitting_name])
else:
raise ValueError("{} not supported".format(splitting_name))


def get_scoring_pipeline() -> Pipeline:
with open(SCORING_PIPELINES_DIR, 'r') as pipeline_file:
with d3m_utils.silence():
pipeline = Pipeline.from_yaml(pipeline_file)
return pipeline


def get_pipelines_db():
with open(PIPELINES_DB_DIR) as file:
pipelines_dict = json.load(file)
return pipelines_dict


def get_task_mapping(task: str) -> str:
"""
Map the task in problem_doc to the task types that are currently supported

Parameters
----------
task: str
The task type in problem_doc

Returns
-------
str
One of task types that are supported

"""
# Keys are plain strings to match the documented `task: str` argument.
mapping = {
'LINK_PREDICTION': 'CLASSIFICATION',
'VERTEX_CLASSIFICATION': 'CLASSIFICATION',
'COMMUNITY_DETECTION': 'CLASSIFICATION',
'GRAPH_MATCHING': 'CLASSIFICATION',
'FORECASTING': 'REGRESSION',
'OBJECT_DETECTION': 'CLASSIFICATION',
}
if task in mapping:
return mapping[task]
else:
return task



def hex_to_binary(hex_identifier):
return binascii.unhexlify(hex_identifier)


def binary_to_hex(identifier):
hex_identifier = binascii.hexlify(identifier)
return hex_identifier.decode()


def summarize_performance_metrics(performance_metrics):
"""
A function that averages all the folds if they exist.

Parameters
----------
performance_metrics: pandas.DataFrame
A DataFrame containing the fold, metric, targets and values from evaluation.
"""
sumarized_performance_metrics = {}

for metric in performance_metrics.metric.unique():
mean = performance_metrics[performance_metrics.metric == metric]['value'].mean()
std = performance_metrics[performance_metrics.metric == metric]['value'].std()
if math.isnan(std):
std = 0
sumarized_performance_metrics[metric] = {
'mean': mean,
'std': std,
}
return sumarized_performance_metrics
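
# Example (illustrative): given an evaluation DataFrame with one row per fold,
# folds are averaged per metric:
#
#   >>> performance_metrics[['metric', 'value', 'fold']]
#        metric  value  fold
#   0  ACCURACY   0.90     0
#   1  ACCURACY   0.80     1
#   >>> summarize_performance_metrics(performance_metrics)
#   {'ACCURACY': {'mean': 0.850..., 'std': 0.0707...}}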


def compute_score(sumarized_performance_metrics):
"""
A function that computes the internal score based on the average normalized metrics.

Parameters
----------
sumarized_performance_metrics: dict
A dictionary mapping each metric to its summarized mean and std.
"""
score = 0

for metric, info in sumarized_performance_metrics.items():
score += PerformanceMetric[metric].normalize(info['mean'])

score = score / float(len(sumarized_performance_metrics))
return score
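
# Example (illustrative): with a single summarized metric such as
# {'ACCURACY': {'mean': 0.85, 'std': 0.07}}, compute_score returns 0.85,
# because ACCURACY already normalizes to the [0, 1] range.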


def compute_rank(sumarized_performance_metrics):
"""
A function that computes the rank based on the average normalized metrics.

Parameters
----------
sumarized_performance_metrics: dict
A dictionary mapping each metric to its summarized mean and std.
"""
ranks = {}
mean = 0
for metric, info in sumarized_performance_metrics.items():
try:
ranks[metric] = PerformanceMetric[metric].normalize(abs(info['mean'] - info['std']))
except Exception:
# normalization can fail (e.g. for unbounded metrics); fall back to the worst rank
ranks[metric] = 0
mean += ranks[metric]

mean = mean / len(sumarized_performance_metrics)
# rank = 1 - ranks[min(ranks.keys(), key=(lambda k: ranks[k]))] + random.randint(10, 30)**-6
rank = 1 - mean

# We add some randomness to the rank to avoid duplicate values
noise = 0
sign = -1 if random.randint(0, 1) == 0 else 1
range_0 = -9
range_1 = -5
if rank < 1e-5:
range_0 = -12
range_1 = -9

for i in range(range_0, range_1):
noise += random.randint(0, 9) * 10 ** i
rank = rank + noise * sign
if rank < 0:
rank *= -1
return rank


def random_rank():
ranks = 0
average_number = 5
for i in range(average_number):
ranks += random.uniform(0, 1)
ranks = ranks/average_number
return ranks

+ 284
- 0
axolotl/examples/build_search_algorithm.ipynb
File diff suppressed because it is too large
View File


+ 424
- 0
axolotl/examples/load_csv.ipynb View File

@@ -0,0 +1,424 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Axolotl CSV manipulation [Binary Classification]."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this example, we are showcasing different components of the system.\n",
"- Loading syntethic data for a univariate regression task.\n",
"- Easy use of the backend.\n",
"- Use of simple interface for search predefined method.\n",
"- Exploring searched pipelines."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import multiple utils we will be using"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2020-07-12 15:23:25,435\tINFO resource_spec.py:212 -- Starting Ray with 4.39 GiB memory available for workers and up to 2.2 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).\n",
"2020-07-12 15:23:25,965\tINFO services.py:1170 -- View the Ray dashboard at localhost:8265\n"
]
}
],
"source": [
"import os\n",
"from pprint import pprint\n",
"import pandas as pd\n",
"from sklearn.datasets import make_regression\n",
"\n",
"from d3m import container\n",
"from d3m.metadata.pipeline import Pipeline\n",
"\n",
"from axolotl.utils import data_problem, pipeline as pipeline_utils\n",
"from axolotl.backend.ray import RayRunner\n",
"from axolotl.algorithms.random_search import RandomSearch\n",
"\n",
"# init runner\n",
"backend = RayRunner(random_seed=42, volumes_dir=None, n_workers=3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load csv file and transform it as dataset"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"table_path = os.path.join('..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'tables', 'learningData.csv')\n",
"df = pd.read_csv(table_path)\n",
"dataset, problem_description = data_problem.generate_dataset_problem(df, task='binary_classification', target_index=5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an instance of the search and fit with the input_data."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# The method fit search for the best pipeline based on the time butget and fit the best pipeline based on the rank with the input_data.\n",
"search = RandomSearch(problem_description=problem_description, backend=backend)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 47ec5c86-46b8-4dee-9562-1e5ebc3d0824 failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 64da5190-c2ee-4b8e-abef-697b54cfa32b failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 9e03188f-2120-49ac-a087-1e4fb1b29754 failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline af32bc20-64fa-44a5-ab34-bbe810b671b1 failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 5dbc9e87-19be-4cda-ac51-c1d7ea9328c1 failed.',)]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(pid=85426) class_weight presets \"balanced\" or \"balanced_subsample\" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use \"balanced\" weights, use compute_class_weight (\"balanced\", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 918c088e-58dd-4991-8336-deb0b41cb5eb failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 41dfec8f-0b07-4f8e-8ff3-cdbb1dab11c7 failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline d465a878-1ea5-4b72-b8a7-3a4122d1a482 failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 8c39e981-f446-4fde-8744-5606c35a7fdf failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline df127bce-11af-4fae-b8bb-722cb0666484 failed.',)]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(pid=85426) class_weight presets \"balanced\" or \"balanced_subsample\" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use \"balanced\" weights, use compute_class_weight (\"balanced\", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.\n",
"(pid=85426) The parameter 'presort' is deprecated and has no effect. It will be removed in v0.24. You can suppress this warning by not passing any value to the 'presort' parameter. We also recommend using HistGradientBoosting models instead.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 0985e11e-8db0-4c1c-9f34-3ce8fbc626c1 failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 8977a9c0-dd79-4771-9dc1-455586b80947 failed.',)]\n",
"Current trial is failed. Error: [StepFailedError('Step 7 for pipeline c0238551-5fbb-41cd-8187-d3d23bc5571d failed.',)]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(pid=85426) class_weight presets \"balanced\" or \"balanced_subsample\" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use \"balanced\" weights, use compute_class_weight (\"balanced\", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.\n"
]
}
],
"source": [
"fitted_pipeline, fitted_pipelineine_result = search.search_fit(input_data=[dataset], time_limit=30)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"produce_results = search.produce(fitted_pipeline, [dataset])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>d3mIndex</th>\n",
" <th>species</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Iris-setosa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>Iris-setosa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>Iris-setosa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>Iris-setosa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>Iris-setosa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>145</th>\n",
" <td>145</td>\n",
" <td>Iris-virginica</td>\n",
" </tr>\n",
" <tr>\n",
" <th>146</th>\n",
" <td>146</td>\n",
" <td>Iris-virginica</td>\n",
" </tr>\n",
" <tr>\n",
" <th>147</th>\n",
" <td>147</td>\n",
" <td>Iris-virginica</td>\n",
" </tr>\n",
" <tr>\n",
" <th>148</th>\n",
" <td>148</td>\n",
" <td>Iris-virginica</td>\n",
" </tr>\n",
" <tr>\n",
" <th>149</th>\n",
" <td>149</td>\n",
" <td>Iris-virginica</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>150 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" d3mIndex species\n",
"0 0 Iris-setosa\n",
"1 1 Iris-setosa\n",
"2 2 Iris-setosa\n",
"3 3 Iris-setosa\n",
"4 4 Iris-setosa\n",
".. ... ...\n",
"145 145 Iris-virginica\n",
"146 146 Iris-virginica\n",
"147 147 Iris-virginica\n",
"148 148 Iris-virginica\n",
"149 149 Iris-virginica\n",
"\n",
"[150 rows x 2 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"produce_results.output"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Print information about scores of the succeded pipelines."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"----------------------------------------------------\n",
"Pipeline id: 676360d8-71ac-401c-b44a-31a810c4e8d3\n",
"Rank: 0.22667216466666668\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.773333 0.773333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 85d44359-0dac-4260-aea8-c78950025c3f\n",
"Rank: 0.33333446433333336\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.666667 0.666667 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 3efb07be-28ff-45d8-b1fb-1c49f96b3381\n",
"Rank: 0.6666653826666668\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.333333 0.333333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: abd9eb99-a4ba-4210-bb34-c2dec7c3ccfa\n",
"Rank: 0.6666606186666667\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.333333 0.333333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 8948a194-0dfe-4d07-a7c8-d1f5136f68c6\n",
"Rank: 0.21333939733333337\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.786667 0.786667 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 22866f54-ba68-49e5-8f84-a2a6aba98253\n",
"Rank: 0.16000235200000004\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.84 0.84 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 37a1c72a-9efd-4b0a-9d3d-811d47571b45\n",
"Rank: 0.6666753326666668\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.333333 0.333333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 2d3cae0f-66f6-46e0-9fa5-128bf02b4d7e\n",
"Rank: 0.6666655736666668\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.333333 0.333333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: d1e5a59d-be50-42f3-a71b-cf8ba59b3c47\n",
"Rank: 0.08666869166666667\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.913333 0.913333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 35d47611-bded-4669-9803-9d259f686ec1\n",
"Rank: 0.35999672099999996\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.64 0.64 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 7398d17f-e91f-4c75-9a95-c9f85763c858\n",
"Rank: 0.6666598006666667\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.333333 0.333333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 5293503b-4cb6-4b8b-bf8e-8b9d981c3b03\n",
"Rank: 0.04666429966666663\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.953333 0.953333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 756e2a15-3315-4aa1-8620-f73ffc69f8a4\n",
"Rank: 0.6666748276666667\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.333333 0.333333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 46633510-6f46-479e-982e-263aaa2e187a\n",
"Rank: 0.17999182400000005\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.82 0.82 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 49a750b0-5c86-4ff3-9b2d-c58c6390dd0d\n",
"Rank: 0.6666588986666667\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.333333 0.333333 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 84c24452-b2cf-41a2-813c-a135eaeef480\n",
"Rank: 0.36000324699999997\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.64 0.64 42 0\n",
"----------------------------------------------------\n",
"Pipeline id: 82117b6b-6960-48bb-b1f4-91355acf51d6\n",
"Rank: 0.026667331666666617\n",
" metric value normalized randomSeed fold\n",
"0 ACCURACY 0.973333 0.973333 42 0\n"
]
}
],
"source": [
"for pipeline_result in search.history:\n",
" print('-' * 52)\n",
" print('Pipeline id:', pipeline_result.pipeline.id)\n",
" print('Rank:', pipeline_result.rank)\n",
" print(pipeline_result.scores)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

+ 1
- 0
axolotl/examples/random_search/oracle.json
File diff suppressed because it is too large
View File


+ 31
- 0
axolotl/examples/run.py View File

@@ -0,0 +1,31 @@
import os
import time
from pprint import pprint
import pandas as pd
from sklearn.datasets import make_regression

from d3m import container
from d3m.metadata.pipeline import Pipeline

from axolotl.utils import data_problem, pipeline as pipeline_utils
from axolotl.backend.simple import SimpleRunner
from axolotl.backend.ray import RayRunner
from axolotl.algorithms.random_search import RandomSearch

# init runner
#backend = RayRunner(random_seed=42, volumes_dir=None, n_workers=3)
backend = SimpleRunner(random_seed=42, volumes_dir=None)
#time.sleep(30)

table_path = os.path.join('..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'tables', 'learningData.csv')
df = pd.read_csv(table_path)
dataset, problem_description = data_problem.generate_dataset_problem(df, task='binary_classification', target_index=5)

# search_fit searches for the best pipeline within the time budget and then fits the best-ranked pipeline on the input_data.
search = RandomSearch(problem_description=problem_description, backend=backend)

fitted_pipeline, fitted_pipeline_result = search.search_fit(input_data=[dataset], time_limit=30)

produce_results = search.produce(fitted_pipeline, [dataset])

print(produce_results.output)

+ 1
- 0
axolotl/examples/synthetic_data_bayesian_hp_tunning.ipynb.REMOVED.git-id View File

@@ -0,0 +1 @@
0b793ea6bbd8536751fb6941cb70e3ff2ed5739b

+ 11
- 0
axolotl/failed_installation_repos.txt View File

@@ -0,0 +1,11 @@
Repository Name: dsbox-primitives
Package URI: git+https://github.com/usc-isi-i2/dsbox-primitives@390595a708a8702cd6b7b388661127fcf63e4605#egg=dsbox-primitives
Error: "AttributeError: module 'tensorflow' has no attribute 'get_default_graph'"

Repository Name: distil-primitives
Package URI: git+https://github.com/uncharted-distil/distil-primitives.git@08065c3e867401e444d8e25177c779fcc3ad5af7#egg=distil-primitives
Error: "Cannnot be install due to hard dependency on tensorflow-gpu"

Repository Name: kf-d3m-primitives
Package URI: git+https://github.com/kungfuai/d3m-primitives.git@17ca6cd4e9ca00e09e2cf91e1cb9f18562645821#egg=kf-d3m-primitives
Error: "Cannnot be install due to hard dependency on tensorflow-gpu"

+ 39
- 0
axolotl/images/Devd3mStart.sh View File

@@ -0,0 +1,39 @@
#!/bin/bash

alias python="python3"

# check if we are on a deployment container or not.
if [ -d "/user_dev" ]; then
cd /user_dev
echo "Running on deployment"
else
echo "Running on testing"
fi


# check output_dir
if [[ -z "$D3MOUTPUTDIR" ]]; then
D3MOUTPUTDIR="$(pwd)/output_dir"
mkdir -p "$D3MOUTPUTDIR"
else
D3MOUTPUTDIR="$D3MOUTPUTDIR"
fi

# check if time is set, otherwise we use 1 min
if [[ -z "$D3MTIMEOUT" ]]; then
D3MTIMEOUT="60" # 10 gb
else
D3MTIMEOUT="$D3MTIMEOUT"
fi

# execute d3m server.
case $D3MRUN in
"standalone")
echo "Executing TAMU TA2 Standalone"
echo "No standalone supported yet"
;;
*)
echo "Executing TAMU TA2"
python3 -m axolotl.d3m_grpc.server
;;
esac

+ 13
- 0
axolotl/images/axolotl.dockerfile View File

@@ -0,0 +1,13 @@
FROM registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.5.18-20200630-050709

RUN pip3 install -e git+https://gitlab.com/axolotl1/axolotl.git@9619a077e1d06a152fa01f0fca7fa0321dcd3d2c#egg=axolotl
COPY images/Devd3mStart.sh /user_dev/Devd3mStart.sh

RUN chmod a+x /user_dev/Devd3mStart.sh

ENV D3MRUN ta2ta3
ENV TOKENIZERS_PARALLELISM false

EXPOSE 45042

ENTRYPOINT ["/user_dev/Devd3mStart.sh"]

+ 3
- 0
axolotl/images/base.dockerfile View File

@@ -0,0 +1,3 @@
FROM registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.5.18-20200630-050709

RUN apt update

+ 21
- 0
axolotl/images/build-images.sh View File

@@ -0,0 +1,21 @@
#!/bin/sh -e

docker login -u gitlab-ci-token -p "$CI_JOB_TOKEN" "$CI_REGISTRY"

for IMAGE_NAME in "$@"; do
if [ "$IMAGE_NAME" = "base" ]; then
echo "Bulding "$CI_REGISTRY_IMAGE/$IMAGE_NAME":latest"
docker build -t "$CI_REGISTRY_IMAGE/$IMAGE_NAME:latest" -f images/base.dockerfile .
echo "Pushing "$CI_REGISTRY_IMAGE/$IMAGE_NAME":latest"
docker push "$CI_REGISTRY_IMAGE/$IMAGE_NAME:latest"
echo "Done"
fi

if [ "$IMAGE_NAME" = "axolotl" ]; then
echo "Bulding "$CI_REGISTRY_IMAGE/$IMAGE_NAME":latest"
docker build -t "$CI_REGISTRY_IMAGE/$IMAGE_NAME:latest" -f images/axolotl.dockerfile .
echo "Pushing "$CI_REGISTRY_IMAGE/$IMAGE_NAME":latest"
docker push "$CI_REGISTRY_IMAGE/$IMAGE_NAME:latest"
echo "Done"
fi
done

+ 11
- 0
axolotl/run_tests.py View File

@@ -0,0 +1,11 @@
#!/usr/bin/env python3

import sys
import unittest

runner = unittest.TextTestRunner(verbosity=1)

tests = unittest.TestLoader().discover('tests')

if not runner.run(tests).wasSuccessful():
sys.exit(1)

+ 53
- 0
axolotl/setup.py View File

@@ -0,0 +1,53 @@
import os
import os.path
import sys
from setuptools import setup, find_packages
import subprocess

PACKAGE_NAME = 'axolotl'
MINIMUM_PYTHON_VERSION = 3, 6


def check_python_version():
"""Exit when the Python version is too low."""
if sys.version_info < MINIMUM_PYTHON_VERSION:
sys.exit("Python {}.{}+ is required.".format(*MINIMUM_PYTHON_VERSION))


def read_package_variable(key):
"""Read the value of a variable from the package without importing."""
module_path = os.path.join(PACKAGE_NAME, '__init__.py')
with open(module_path) as module:
for line in module:
parts = line.strip().split(' ')
if parts and parts[0] == key:
return parts[-1].strip("'")
raise KeyError("'{0}' not found in '{1}'".format(key, module_path))
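
# Example (illustrative): this expects simple single-line assignments in
# axolotl/__init__.py, e.g.
#
#   __version__ = '1.0.0'
#   __description__ = 'Axolotl AutoML system'
#
# after which read_package_variable('__version__') returns '1.0.0'.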


check_python_version()
version = read_package_variable('__version__')
description = read_package_variable('__description__')
setup(
name=PACKAGE_NAME,
version=version,
description=description,

packages=find_packages(exclude=['tests*']),
license='Apache-2.0',
classifiers=[
'License :: OSI Approved :: Apache Software License',
],
install_requires=[
'd3m',
'grpcio',
'grpcio-tools',
'grpcio-testing',
'ray',
'networkx',
],
extras_require={
'cpu': ['tensorflow==2.2.0'],
'gpu': ['tensorflow-gpu==2.2.0']
}
)

+ 0
- 0
axolotl/tests/__init__.py View File


+ 383
- 0
axolotl/tests/_server_test.py View File

@@ -0,0 +1,383 @@
# from __future__ import print_function

import argparse
import os
import pathlib
from pprint import pprint

import grpc
from d3m import utils as d3m_utils, runtime as runtime_module
from d3m.metadata import problem as problem_module
from ta3ta2_api import core_pb2, core_pb2_grpc, value_pb2, utils

from axolotl.utils import pipeline as pipeline_utils
from axolotl.d3m_grpc import constants

# with d3m_utils.silence():
# d3m_index.load_all(blocklist=constants.PrimitivesList.BLACK_LIST)


# primitives = [
# 'd3m.primitives.datasets.DatasetToDataFrame',
# 'd3m.primitives.data_transformation.denormalize.Common'
# ]
#
# with d3m_utils.silence():
# for primitive in primitives:
# d3m_index.get_primitive(primitive)


LENGTH = 60
ALLOWED_VALUE_TYPES = ['DATASET_URI', 'CSV_URI', 'RAW']
FULL_SPECIFIED_PIPELINE_PATH = 'modules/server/test_full_pipeline.json'
PRE_SPECIFIED_PIPELINE_PATH = 'modules/server/test_placeholder.json'


# PRE_SPECIFIED_PIPELINE_PATH = 'modules/server/test_placeholder_pipeline.json'


def hello_request():
request = core_pb2.HelloRequest()
return request


def list_primitives_request():
request = core_pb2.ListPrimitivesRequest()
return request


def search_solutions_request(test_paths, specified_template=None):
user_agent = "test_agent"
version = core_pb2.DESCRIPTOR.GetOptions().Extensions[core_pb2.protocol_version]

time_bound = 0.5
priority = 10
# allowed_value_types = [value_pb2.ValueType.Value(value) for value in ALLOWED_VALUE_TYPES]

problem_description = utils.encode_problem_description(
problem_module.Problem.load(test_paths['TRAIN']['problem'])
)

template = None
if specified_template == 'FULL':
with d3m_utils.silence():
pipeline = pipeline_utils.load_pipeline(FULL_SPECIFIED_PIPELINE_PATH)
template = utils.encode_pipeline_description(pipeline, ALLOWED_VALUE_TYPES, constants.Path.TEMP_STORAGE_ROOT)
elif specified_template == 'PRE': # PRE for PREPROCESSING
pipeline = runtime_module.get_pipeline(PRE_SPECIFIED_PIPELINE_PATH, load_all_primitives=False)
template = utils.encode_pipeline_description(pipeline, ALLOWED_VALUE_TYPES, constants.Path.TEMP_STORAGE_ROOT)

inputs = [
value_pb2.Value(
dataset_uri=test_paths['TRAIN']['dataset']
)
]

request = core_pb2.SearchSolutionsRequest(
user_agent=user_agent,
version=version,
time_bound_search=time_bound,
priority=priority,
allowed_value_types=ALLOWED_VALUE_TYPES,
problem=problem_description,
template=template,
inputs=inputs
)
return request


def get_search_solution_results_request(search_id):
request = core_pb2.GetSearchSolutionsResultsRequest(search_id=search_id)
return request


def fit_solution_request(solution_id, test_paths):
inputs = [
value_pb2.Value(
dataset_uri=test_paths['TRAIN']['dataset']
)
]
expose_outputs = ['outputs.0']
expose_value_types = ['CSV_URI']
users = [
core_pb2.SolutionRunUser(
id='test_user',
chosen=True,
reason='just because'
)
]
request = core_pb2.FitSolutionRequest(
solution_id=solution_id,
inputs=inputs,
expose_outputs=expose_outputs,
expose_value_types=expose_value_types,
users=users
)
return request


def get_fit_solution_results_request(request_id):
request = core_pb2.GetFitSolutionResultsRequest(
request_id=request_id
)
return request


def produce_solution_request(fitted_solution_id, test_paths):
inputs = [
value_pb2.Value(
dataset_uri=test_paths['TEST']['dataset']
)
]
expose_outputs = ['outputs.0']
expose_value_types = ['CSV_URI']

users = [
core_pb2.SolutionRunUser(
id='test_user',
chosen=True,
reason='just because'
)
]

request = core_pb2.ProduceSolutionRequest(
fitted_solution_id=fitted_solution_id,
inputs=inputs,
expose_outputs=expose_outputs,
expose_value_types=expose_value_types,
users=users
)
return request


def get_produce_solution_results_request(request_id):
request = core_pb2.GetProduceSolutionResultsRequest(
request_id=request_id
)
return request


def describe_solution_request(solution_id):
request = core_pb2.DescribeSolutionRequest(
solution_id=solution_id
)
return request


def score_solution_request(solution_id, test_paths):
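# Scores a solution on the SCORE dataset, using the performance metrics declared
# in the problem description and a 2-fold, shuffled, stratified scoring configuration.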
inputs = [
value_pb2.Value(
dataset_uri=test_paths['SCORE']['dataset']
)
]

problem = problem_module.Problem.load(test_paths['SCORE']['problem'])
performance_metrics = []
for performance_metric in problem['problem'].get('performance_metrics', []):
performance_metrics.append(utils.encode_performance_metric(performance_metric))

# TODO add support for more evaluation methods
users = []
evaluation_method = 'K_FOLD'
configuration = core_pb2.ScoringConfiguration(
method=evaluation_method,
folds=2,
# train_test_ratio
shuffle=True,
random_seed=42,
stratified=True,
)
request = core_pb2.ScoreSolutionRequest(
solution_id=solution_id,
inputs=inputs,
performance_metrics=performance_metrics,
users=users,
configuration=configuration
)
return request


def get_score_solution_results_request(request_id):
request = core_pb2.GetScoreSolutionResultsRequest(
request_id=request_id
)
return request


def solution_export_request(solution_id):
rank = 0.1
request = core_pb2.SolutionExportRequest(
solution_id=solution_id,
rank=rank
)
return request


def end_search_solutions_request(search_id):
request = core_pb2.EndSearchSolutionsRequest(search_id=search_id)
return request


def stop_search_solution_request(search_id):
request = core_pb2.StopSearchSolutionsRequest(search_id=search_id)
return request


def run(test_paths, specified_template=None):
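# Exercises each gRPC call in order; assumes the server is already listening on localhost:45042.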
channel = grpc.insecure_channel('localhost:45042')
stub = core_pb2_grpc.CoreStub(channel)

print_name('Hello')
hello_r = stub.Hello(hello_request())
pprint(hello_r)

print_name('ListPrimitive')
list_primitives_r = stub.ListPrimitives(list_primitives_request())
for _primitive in list_primitives_r.primitives:
print_space()
pprint(_primitive)

print_name('SearchSolution')
search_solutions_r = stub.SearchSolutions(search_solutions_request(test_paths, specified_template))
search_id = search_solutions_r.search_id
pprint(search_solutions_r)

print_name('GetSearchSolutionsResults')
solution_id = None
for get_search_solution_r in stub.GetSearchSolutionsResults(get_search_solution_results_request(search_id)):
print_space()
pprint(get_search_solution_r)
if get_search_solution_r.solution_id:
solution_id = get_search_solution_r.solution_id

print_name('DescribeSolution')
describe_solution_r = stub.DescribeSolution(describe_solution_request(solution_id))
pprint(describe_solution_r)

print_name('FitSolution')
fit_solution_r = stub.FitSolution(fit_solution_request(solution_id, test_paths))
fit_request_id = fit_solution_r.request_id
pprint(fit_solution_r)

print_name('GetFitSolutionResultsRequest')
fitted_solution_id = None
for get_fit_solution_results_r in stub.GetFitSolutionResults(get_fit_solution_results_request(fit_request_id)):
print_space()
pprint(get_fit_solution_results_r)
fitted_solution_id = get_fit_solution_results_r.fitted_solution_id

print_name('ProduceSolutionRequest')
produce_solution_r = stub.ProduceSolution(produce_solution_request(fitted_solution_id, test_paths))
produce_request_id = produce_solution_r.request_id
pprint(produce_solution_r)

print_name('GetProduceSolutionResultsRequest')
for get_produce_solution_results_r in stub.GetProduceSolutionResults(
get_produce_solution_results_request(produce_request_id)):
print_space()
pprint(get_produce_solution_results_r)

print_name('ScoreSolution')
score_solution_r = stub.ScoreSolution(score_solution_request(solution_id, test_paths))
score_request_id = score_solution_r.request_id

pprint(score_solution_r)

print_name('GetScoreSolutionResults')
for get_score_solution_results_r in stub.GetScoreSolutionResults(get_score_solution_results_request(score_request_id)):
print_space()
pprint(get_score_solution_results_r)

print_name('SolutionExport')
solution_export_r = stub.SolutionExport(solution_export_request(solution_id))
pprint(solution_export_r)

print_name('StopSearchSolutions')
stop_search_solution_r = stub.StopSearchSolutions(stop_search_solution_request(search_id))
pprint(stop_search_solution_r)

print_name('EndSearchSolutions')
end_search_solutions_r = stub.EndSearchSolutions(end_search_solutions_request(search_id))
pprint(end_search_solutions_r)


def print_name(name):
length = LENGTH
free_space = length - len(name) - 2
space = free_space // 2
name = '#' + ' ' * space + name + ' ' * space
if free_space % 2 == 0:
name = name + '#'
else:
name = name + ' #'

print("#" * length)
print(name)
print("#" * length)


def print_space():
print('-' * LENGTH)


def configure_parser(parser, *, skip_arguments=()):
parser.add_argument(
'-t', '--test-path', type=str, default="/D3M/internal_d3m/Winter_2018_tamuta2/datasets/26/",
help="path of d3m dataset to test."
)


def get_problem_id(test_path):
problem_description = problem_module.Problem.load(test_path)
print(problem_description)
problem_id = problem_description.get('id', None)
return problem_id


def get_paths(test_path):
# For classification problems the SCORE data usually lives under (problem_SCORE, dataset_SCORE),
# while regression and other problem types typically ship it under (problem_TEST, dataset_TEST),
# so fall back to the TEST-named files when the SCORE-named ones are missing.
score_problem_relative_path = os.path.join(test_path, 'SCORE/problem_SCORE/problemDoc.json')
score_dataset_relative_path = os.path.join(test_path, 'SCORE/dataset_SCORE/datasetDoc.json')

if not os.path.exists(score_problem_relative_path) or not os.path.exists(score_dataset_relative_path):
score_problem_relative_path = os.path.join(test_path, 'SCORE/problem_TEST/problemDoc.json')
score_dataset_relative_path = os.path.join(test_path, 'SCORE/dataset_TEST/datasetDoc.json')

test_paths = {
'TRAIN': {
'dataset': os.path.join(test_path, 'TRAIN/dataset_TRAIN/datasetDoc.json'),
'problem': pathlib.Path(
os.path.abspath(os.path.join(test_path, 'TRAIN/problem_TRAIN/problemDoc.json'))).as_uri()
},
'TEST': {
'dataset': os.path.join(test_path, 'TEST/dataset_TEST/datasetDoc.json'),
'problem': pathlib.Path(
os.path.abspath(os.path.join(test_path, 'TEST/problem_TEST/problemDoc.json'))).as_uri()
},
'SCORE': {
'dataset': score_dataset_relative_path,
'problem': pathlib.Path(os.path.abspath(score_problem_relative_path)).as_uri()
},
}
return test_paths


if __name__ == '__main__':
# Creating parser
parser = argparse.ArgumentParser(description="Test from command line")
configure_parser(parser)
arguments = parser.parse_args()

# Getting test root path
test_path = arguments.test_path

# Getting test paths train/test/score
test_paths = get_paths(test_path)

# Getting problem id
test_id = get_problem_id(test_paths['TEST']['problem'])

print_name('Starting Test: ' + test_id)
run(test_paths, None)
print_name('Finishing Test: ' + test_id)

+ 10
- 0
axolotl/tests/data/.gitignore View File

@@ -0,0 +1,10 @@
*.pyc
__pycache__
.DS_Store
.ipynb_checkpoints
.cache
.idea
*.egg-info
.mypy_cache
dist
build

+ 42
- 0
axolotl/tests/data/.gitlab-ci.yml View File

@@ -0,0 +1,42 @@
build_summing_image:
stage: build

image: docker:stable

services:
- docker:dind

before_script:
- docker info

script:
- docker login -u gitlab-ci-token -p "$CI_JOB_TOKEN" "$CI_REGISTRY"
- docker build --cache-from="$CI_REGISTRY_IMAGE/summing:latest" -t "$CI_REGISTRY_IMAGE/summing:latest" docker/summing
- docker push "$CI_REGISTRY_IMAGE/summing:latest"

only:
- master

style_check:
stage: build

image: registry.gitlab.com/datadrivendiscovery/images/testing:ubuntu-bionic-python36

script:
- pycodestyle primitives/test_primitives

type_check:
stage: build

image: registry.gitlab.com/datadrivendiscovery/images/testing:ubuntu-bionic-python36

variables:
DEPENDENCY_REF: devel

script:
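# Clone the d3m core package at DEPENDENCY_REF so mypy can resolve its types through MYPYPATH.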
- cd primitives
- git clone https://gitlab.com/datadrivendiscovery/d3m.git
- cd d3m
- git checkout ${DEPENDENCY_REF}
- cd ..
- MYPYPATH=d3m mypy test_primitives

+ 10
- 0
axolotl/tests/data/README.md View File

@@ -0,0 +1,10 @@
# Data used for tests

This repository contains data used for tests across multiple other repositories.

## About Data Driven Discovery Program

The DARPA Data Driven Discovery (D3M) program is researching ways to get machines to build
machine learning pipelines automatically. It is split into three layers:
TA1 (primitives), TA2 (systems which automatically combine primitives into pipelines
and execute them), and TA3 (end-user interfaces).

+ 20
- 0
axolotl/tests/data/add.sh View File

@@ -0,0 +1,20 @@
#!/bin/bash -e

# This script assumes this repository is cloned into a "d3m-tests-data" directory
# which is a sibling of the "d3m-primitives" directory.
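#
# For each primitive, it regenerates the primitive description with
# "python -m d3m primitive describe" and registers it via add.py in the
# d3m-primitives repository.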

for PRIMITIVE in d3m.primitives.regression.monomial.Test \
d3m.primitives.operator.increment.Test \
d3m.primitives.operator.sum.Test \
d3m.primitives.data_generation.random.Test \
d3m.primitives.operator.primitive_sum.Test \
d3m.primitives.operator.null.TransformerTest \
d3m.primitives.operator.null.UnsupervisedLearnerTest \
d3m.primitives.classification.random_classifier.Test \
d3m.primitives.evaluation.compute_scores.Test ; do
echo $PRIMITIVE
python -m d3m primitive describe -i 4 $PRIMITIVE > primitive.json
pushd ../d3m-primitives
./add.py ../d3m-tests-data/primitive.json
popd
done

+ 82
- 0
axolotl/tests/data/datasets/audio_dataset_1/datasetDoc.json View File

@@ -0,0 +1,82 @@
{
"about": {
"datasetID": "audio_dataset_1",
"datasetName": "Audio dataset to be used for tests",
"license": "CC0",
"datasetSchemaVersion": "4.0.0",
"redacted": false,
"datasetVersion": "4.0.0",
"digest": "4eaa4ee8ce18dc066d400d756105aab1ce92895593d09c8be23e08fdd89640e1"
},
"dataResources": [
{
"resID": "0",
"resPath": "media/",
"resType": "audio",
"resFormat": {
"audio/mpeg": [
"mp3"
]
},
"isCollection": true
},
{
"resID": "learningData",
"resPath": "tables/learningData.csv",
"resType": "table",
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columnsCount": 5,
"columns": [
{
"colIndex": 0,
"colName": "d3mIndex",
"colType": "integer",
"role": [
"index"
]
},
{
"colIndex": 1,
"colName": "audio_file",
"colType": "string",
"role": [
"attribute"
],
"refersTo": {
"resID": "0",
"resObject": "item"
}
},
{
"colIndex": 2,
"colName": "start",
"colType": "real",
"role": [
"boundaryIndicator"
]
},
{
"colIndex": 3,
"colName": "end",
"colType": "real",
"role": [
"boundaryIndicator"
]
},
{
"colIndex": 4,
"colName": "class",
"colType": "categorical",
"role": [
"suggestedTarget"
]
}
]
}
]
}

BIN
axolotl/tests/data/datasets/audio_dataset_1/media/test_audio.mp3 View File


+ 2
- 0
axolotl/tests/data/datasets/audio_dataset_1/tables/learningData.csv View File

@@ -0,0 +1,2 @@
d3mIndex,audio_file,start,end,class
0,test_audio.mp3,0.007,0.008,test

+ 164
- 0
axolotl/tests/data/datasets/boston_dataset_1/datasetDoc.json View File

@@ -0,0 +1,164 @@
{
"about": {
"datasetID": "boston_dataset_1",
"datasetName": "Boston Dataset",
"description": "The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics ...', Wiley, 1980. N.B. Various transformations are used in the table on pages 244-261 of the latter.",
"datasetVersion": "4.0.0",
"datasetSchemaVersion": "4.0.0",
"sourceURI": "http://lib.stat.cmu.edu/datasets/boston",
"license": "unknown",
"digest": "7797ade70da006a47c32db5dd24be51a6956dbf8d600c4720e53576d32e451e6"
},
"dataResources": [
{
"resID": "learningData",
"resPath": "tables/learningData.csv",
"resType": "table",
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columnsCount": 15,
"columns": [
{
"colIndex": 0,
"colName": "d3mIndex",
"colType": "integer",
"role": [
"index"
]
},
{
"colIndex": 1,
"colName": "CRIM",
"colDescription": "per capita crime rate by town",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 2,
"colName": "ZN",
"colDescription": "proportion of residential land zoned for lots over 25,000 sq.ft.",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 3,
"colName": "INDUS",
"colDescription": "proportion of non-retail business acres per town",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 4,
"colName": "CHAS",
"colDescription": "Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)",
"colType": "boolean",
"role": [
"attribute"
]
},
{
"colIndex": 5,
"colName": "NOX",
"colDescription": "nitric oxides concentration (parts per 10 million)",
"colType": "real",
"role": [
"attribute",
"suggestedTarget"
]
},
{
"colIndex": 6,
"colName": "RM",
"colDescription": "average number of rooms per dwelling",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 7,
"colName": "AGE",
"colDescription": "proportion of owner-occupied units built prior to 1940",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 8,
"colName": "DIS",
"colDescription": "weighted distances to five Boston employment centres",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 9,
"colName": "RAD",
"colDescription": "index of accessibility to radial highways",
"colType": "categorical",
"role": [
"attribute"
]
},
{
"colIndex": 10,
"colName": "TAX",
"colDescription": "full-value property-tax rate per $10,000",
"colType": "integer",
"role": [
"attribute"
]
},
{
"colIndex": 11,
"colName": "PTRATIO",
"colDescription": "pupil-teacher ratio by town",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 12,
"colName": "B",
"colDescription": "1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 13,
"colName": "LSTAT",
"colDescription": "% lower status of the population",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 14,
"colName": "MEDV",
"colDescription": "Median value of owner-occupied homes in $1000's\n",
"colType": "real",
"role": [
"attribute",
"suggestedTarget"
]
}
]
}
]
}

+ 507
- 0
axolotl/tests/data/datasets/boston_dataset_1/tables/learningData.csv View File

@@ -0,0 +1,507 @@
d3mIndex,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24
1,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
5,0.02985,0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100,6.0821,5,311,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9
10,0.22489,12.5,7.87,0,0.524,6.377,94.3,6.3467,5,311,15.2,392.52,20.45,15
11,0.11747,12.5,7.87,0,0.524,6.009,82.9,6.2267,5,311,15.2,396.9,13.27,18.9
12,0.09378,12.5,7.87,0,0.524,5.889,39,5.4509,5,311,15.2,390.5,15.71,21.7
13,0.62976,0,8.14,0,0.538,5.949,61.8,4.7075,4,307,21,396.9,8.26,20.4
14,0.63796,0,8.14,0,0.538,6.096,84.5,4.4619,4,307,21,380.02,10.26,18.2
15,0.62739,0,8.14,0,0.538,5.834,56.5,4.4986,4,307,21,395.62,8.47,19.9
16,1.05393,0,8.14,0,0.538,5.935,29.3,4.4986,4,307,21,386.85,6.58,23.1
17,0.7842,0,8.14,0,0.538,5.99,81.7,4.2579,4,307,21,386.75,14.67,17.5
18,0.80271,0,8.14,0,0.538,5.456,36.6,3.7965,4,307,21,288.99,11.69,20.2
19,0.7258,0,8.14,0,0.538,5.727,69.5,3.7965,4,307,21,390.95,11.28,18.2
20,1.25179,0,8.14,0,0.538,5.57,98.1,3.7979,4,307,21,376.57,21.02,13.6
21,0.85204,0,8.14,0,0.538,5.965,89.2,4.0123,4,307,21,392.53,13.83,19.6
22,1.23247,0,8.14,0,0.538,6.142,91.7,3.9769,4,307,21,396.9,18.72,15.2
23,0.98843,0,8.14,0,0.538,5.813,100,4.0952,4,307,21,394.54,19.88,14.5
24,0.75026,0,8.14,0,0.538,5.924,94.1,4.3996,4,307,21,394.33,16.3,15.6
25,0.84054,0,8.14,0,0.538,5.599,85.7,4.4546,4,307,21,303.42,16.51,13.9
26,0.67191,0,8.14,0,0.538,5.813,90.3,4.682,4,307,21,376.88,14.81,16.6
27,0.95577,0,8.14,0,0.538,6.047,88.8,4.4534,4,307,21,306.38,17.28,14.8
28,0.77299,0,8.14,0,0.538,6.495,94.4,4.4547,4,307,21,387.94,12.8,18.4
29,1.00245,0,8.14,0,0.538,6.674,87.3,4.239,4,307,21,380.23,11.98,21
30,1.13081,0,8.14,0,0.538,5.713,94.1,4.233,4,307,21,360.17,22.6,12.7
31,1.35472,0,8.14,0,0.538,6.072,100,4.175,4,307,21,376.73,13.04,14.5
32,1.38799,0,8.14,0,0.538,5.95,82,3.99,4,307,21,232.6,27.71,13.2
33,1.15172,0,8.14,0,0.538,5.701,95,3.7872,4,307,21,358.77,18.35,13.1
34,1.61282,0,8.14,0,0.538,6.096,96.9,3.7598,4,307,21,248.31,20.34,13.5
35,0.06417,0,5.96,0,0.499,5.933,68.2,3.3603,5,279,19.2,396.9,9.68,18.9
36,0.09744,0,5.96,0,0.499,5.841,61.4,3.3779,5,279,19.2,377.56,11.41,20
37,0.08014,0,5.96,0,0.499,5.85,41.5,3.9342,5,279,19.2,396.9,8.77,21
38,0.17505,0,5.96,0,0.499,5.966,30.2,3.8473,5,279,19.2,393.43,10.13,24.7
39,0.02763,75,2.95,0,0.428,6.595,21.8,5.4011,3,252,18.3,395.63,4.32,30.8
40,0.03359,75,2.95,0,0.428,7.024,15.8,5.4011,3,252,18.3,395.62,1.98,34.9
41,0.12744,0,6.91,0,0.448,6.77,2.9,5.7209,3,233,17.9,385.41,4.84,26.6
42,0.1415,0,6.91,0,0.448,6.169,6.6,5.7209,3,233,17.9,383.37,5.81,25.3
43,0.15936,0,6.91,0,0.448,6.211,6.5,5.7209,3,233,17.9,394.46,7.44,24.7
44,0.12269,0,6.91,0,0.448,6.069,40,5.7209,3,233,17.9,389.39,9.55,21.2
45,0.17142,0,6.91,0,0.448,5.682,33.8,5.1004,3,233,17.9,396.9,10.21,19.3
46,0.18836,0,6.91,0,0.448,5.786,33.3,5.1004,3,233,17.9,396.9,14.15,20
47,0.22927,0,6.91,0,0.448,6.03,85.5,5.6894,3,233,17.9,392.74,18.8,16.6
48,0.25387,0,6.91,0,0.448,5.399,95.3,5.87,3,233,17.9,396.9,30.81,14.4
49,0.21977,0,6.91,0,0.448,5.602,62,6.0877,3,233,17.9,396.9,16.2,19.4
50,0.08873,21,5.64,0,0.439,5.963,45.7,6.8147,4,243,16.8,395.56,13.45,19.7
51,0.04337,21,5.64,0,0.439,6.115,63,6.8147,4,243,16.8,393.97,9.43,20.5
52,0.0536,21,5.64,0,0.439,6.511,21.1,6.8147,4,243,16.8,396.9,5.28,25
53,0.04981,21,5.64,0,0.439,5.998,21.4,6.8147,4,243,16.8,396.9,8.43,23.4
54,0.0136,75,4,0,0.41,5.888,47.6,7.3197,3,469,21.1,396.9,14.8,18.9
55,0.01311,90,1.22,0,0.403,7.249,21.9,8.6966,5,226,17.9,395.93,4.81,35.4
56,0.02055,85,0.74,0,0.41,6.383,35.7,9.1876,2,313,17.3,396.9,5.77,24.7
57,0.01432,100,1.32,0,0.411,6.816,40.5,8.3248,5,256,15.1,392.9,3.95,31.6
58,0.15445,25,5.13,0,0.453,6.145,29.2,7.8148,8,284,19.7,390.68,6.86,23.3
59,0.10328,25,5.13,0,0.453,5.927,47.2,6.932,8,284,19.7,396.9,9.22,19.6
60,0.14932,25,5.13,0,0.453,5.741,66.2,7.2254,8,284,19.7,395.11,13.15,18.7
61,0.17171,25,5.13,0,0.453,5.966,93.4,6.8185,8,284,19.7,378.08,14.44,16
62,0.11027,25,5.13,0,0.453,6.456,67.8,7.2255,8,284,19.7,396.9,6.73,22.2
63,0.1265,25,5.13,0,0.453,6.762,43.4,7.9809,8,284,19.7,395.58,9.5,25
64,0.01951,17.5,1.38,0,0.4161,7.104,59.5,9.2229,3,216,18.6,393.24,8.05,33
65,0.03584,80,3.37,0,0.398,6.29,17.8,6.6115,4,337,16.1,396.9,4.67,23.5
66,0.04379,80,3.37,0,0.398,5.787,31.1,6.6115,4,337,16.1,396.9,10.24,19.4
67,0.05789,12.5,6.07,0,0.409,5.878,21.4,6.498,4,345,18.9,396.21,8.1,22
68,0.13554,12.5,6.07,0,0.409,5.594,36.8,6.498,4,345,18.9,396.9,13.09,17.4
69,0.12816,12.5,6.07,0,0.409,5.885,33,6.498,4,345,18.9,396.9,8.79,20.9
70,0.08826,0,10.81,0,0.413,6.417,6.6,5.2873,4,305,19.2,383.73,6.72,24.2
71,0.15876,0,10.81,0,0.413,5.961,17.5,5.2873,4,305,19.2,376.94,9.88,21.7
72,0.09164,0,10.81,0,0.413,6.065,7.8,5.2873,4,305,19.2,390.91,5.52,22.8
73,0.19539,0,10.81,0,0.413,6.245,6.2,5.2873,4,305,19.2,377.17,7.54,23.4
74,0.07896,0,12.83,0,0.437,6.273,6,4.2515,5,398,18.7,394.92,6.78,24.1
75,0.09512,0,12.83,0,0.437,6.286,45,4.5026,5,398,18.7,383.23,8.94,21.4
76,0.10153,0,12.83,0,0.437,6.279,74.5,4.0522,5,398,18.7,373.66,11.97,20
77,0.08707,0,12.83,0,0.437,6.14,45.8,4.0905,5,398,18.7,386.96,10.27,20.8
78,0.05646,0,12.83,0,0.437,6.232,53.7,5.0141,5,398,18.7,386.4,12.34,21.2
79,0.08387,0,12.83,0,0.437,5.874,36.6,4.5026,5,398,18.7,396.06,9.1,20.3
80,0.04113,25,4.86,0,0.426,6.727,33.5,5.4007,4,281,19,396.9,5.29,28
81,0.04462,25,4.86,0,0.426,6.619,70.4,5.4007,4,281,19,395.63,7.22,23.9
82,0.03659,25,4.86,0,0.426,6.302,32.2,5.4007,4,281,19,396.9,6.72,24.8
83,0.03551,25,4.86,0,0.426,6.167,46.7,5.4007,4,281,19,390.64,7.51,22.9
84,0.05059,0,4.49,0,0.449,6.389,48,4.7794,3,247,18.5,396.9,9.62,23.9
85,0.05735,0,4.49,0,0.449,6.63,56.1,4.4377,3,247,18.5,392.3,6.53,26.6
86,0.05188,0,4.49,0,0.449,6.015,45.1,4.4272,3,247,18.5,395.99,12.86,22.5
87,0.07151,0,4.49,0,0.449,6.121,56.8,3.7476,3,247,18.5,395.15,8.44,22.2
88,0.0566,0,3.41,0,0.489,7.007,86.3,3.4217,2,270,17.8,396.9,5.5,23.6
89,0.05302,0,3.41,0,0.489,7.079,63.1,3.4145,2,270,17.8,396.06,5.7,28.7
90,0.04684,0,3.41,0,0.489,6.417,66.1,3.0923,2,270,17.8,392.18,8.81,22.6
91,0.03932,0,3.41,0,0.489,6.405,73.9,3.0921,2,270,17.8,393.55,8.2,22
92,0.04203,28,15.04,0,0.464,6.442,53.6,3.6659,4,270,18.2,395.01,8.16,22.9
93,0.02875,28,15.04,0,0.464,6.211,28.9,3.6659,4,270,18.2,396.33,6.21,25
94,0.04294,28,15.04,0,0.464,6.249,77.3,3.615,4,270,18.2,396.9,10.59,20.6
95,0.12204,0,2.89,0,0.445,6.625,57.8,3.4952,2,276,18,357.98,6.65,28.4
96,0.11504,0,2.89,0,0.445,6.163,69.6,3.4952,2,276,18,391.83,11.34,21.4
97,0.12083,0,2.89,0,0.445,8.069,76,3.4952,2,276,18,396.9,4.21,38.7
98,0.08187,0,2.89,0,0.445,7.82,36.9,3.4952,2,276,18,393.53,3.57,43.8
99,0.0686,0,2.89,0,0.445,7.416,62.5,3.4952,2,276,18,396.9,6.19,33.2
100,0.14866,0,8.56,0,0.52,6.727,79.9,2.7778,5,384,20.9,394.76,9.42,27.5
101,0.11432,0,8.56,0,0.52,6.781,71.3,2.8561,5,384,20.9,395.58,7.67,26.5
102,0.22876,0,8.56,0,0.52,6.405,85.4,2.7147,5,384,20.9,70.8,10.63,18.6
103,0.21161,0,8.56,0,0.52,6.137,87.4,2.7147,5,384,20.9,394.47,13.44,19.3
104,0.1396,0,8.56,0,0.52,6.167,90,2.421,5,384,20.9,392.69,12.33,20.1
105,0.13262,0,8.56,0,0.52,5.851,96.7,2.1069,5,384,20.9,394.05,16.47,19.5
106,0.1712,0,8.56,0,0.52,5.836,91.9,2.211,5,384,20.9,395.67,18.66,19.5
107,0.13117,0,8.56,0,0.52,6.127,85.2,2.1224,5,384,20.9,387.69,14.09,20.4
108,0.12802,0,8.56,0,0.52,6.474,97.1,2.4329,5,384,20.9,395.24,12.27,19.8
109,0.26363,0,8.56,0,0.52,6.229,91.2,2.5451,5,384,20.9,391.23,15.55,19.4
110,0.10793,0,8.56,0,0.52,6.195,54.4,2.7778,5,384,20.9,393.49,13,21.7
111,0.10084,0,10.01,0,0.547,6.715,81.6,2.6775,6,432,17.8,395.59,10.16,22.8
112,0.12329,0,10.01,0,0.547,5.913,92.9,2.3534,6,432,17.8,394.95,16.21,18.8
113,0.22212,0,10.01,0,0.547,6.092,95.4,2.548,6,432,17.8,396.9,17.09,18.7
114,0.14231,0,10.01,0,0.547,6.254,84.2,2.2565,6,432,17.8,388.74,10.45,18.5
115,0.17134,0,10.01,0,0.547,5.928,88.2,2.4631,6,432,17.8,344.91,15.76,18.3
116,0.13158,0,10.01,0,0.547,6.176,72.5,2.7301,6,432,17.8,393.3,12.04,21.2
117,0.15098,0,10.01,0,0.547,6.021,82.6,2.7474,6,432,17.8,394.51,10.3,19.2
118,0.13058,0,10.01,0,0.547,5.872,73.1,2.4775,6,432,17.8,338.63,15.37,20.4
119,0.14476,0,10.01,0,0.547,5.731,65.2,2.7592,6,432,17.8,391.5,13.61,19.3
120,0.06899,0,25.65,0,0.581,5.87,69.7,2.2577,2,188,19.1,389.15,14.37,22
121,0.07165,0,25.65,0,0.581,6.004,84.1,2.1974,2,188,19.1,377.67,14.27,20.3
122,0.09299,0,25.65,0,0.581,5.961,92.9,2.0869,2,188,19.1,378.09,17.93,20.5
123,0.15038,0,25.65,0,0.581,5.856,97,1.9444,2,188,19.1,370.31,25.41,17.3
124,0.09849,0,25.65,0,0.581,5.879,95.8,2.0063,2,188,19.1,379.38,17.58,18.8
125,0.16902,0,25.65,0,0.581,5.986,88.4,1.9929,2,188,19.1,385.02,14.81,21.4
126,0.38735,0,25.65,0,0.581,5.613,95.6,1.7572,2,188,19.1,359.29,27.26,15.7
127,0.25915,0,21.89,0,0.624,5.693,96,1.7883,4,437,21.2,392.11,17.19,16.2
128,0.32543,0,21.89,0,0.624,6.431,98.8,1.8125,4,437,21.2,396.9,15.39,18
129,0.88125,0,21.89,0,0.624,5.637,94.7,1.9799,4,437,21.2,396.9,18.34,14.3
130,0.34006,0,21.89,0,0.624,6.458,98.9,2.1185,4,437,21.2,395.04,12.6,19.2
131,1.19294,0,21.89,0,0.624,6.326,97.7,2.271,4,437,21.2,396.9,12.26,19.6
132,0.59005,0,21.89,0,0.624,6.372,97.9,2.3274,4,437,21.2,385.76,11.12,23
133,0.32982,0,21.89,0,0.624,5.822,95.4,2.4699,4,437,21.2,388.69,15.03,18.4
134,0.97617,0,21.89,0,0.624,5.757,98.4,2.346,4,437,21.2,262.76,17.31,15.6
135,0.55778,0,21.89,0,0.624,6.335,98.2,2.1107,4,437,21.2,394.67,16.96,18.1
136,0.32264,0,21.89,0,0.624,5.942,93.5,1.9669,4,437,21.2,378.25,16.9,17.4
137,0.35233,0,21.89,0,0.624,6.454,98.4,1.8498,4,437,21.2,394.08,14.59,17.1
138,0.2498,0,21.89,0,0.624,5.857,98.2,1.6686,4,437,21.2,392.04,21.32,13.3
139,0.54452,0,21.89,0,0.624,6.151,97.9,1.6687,4,437,21.2,396.9,18.46,17.8
140,0.2909,0,21.89,0,0.624,6.174,93.6,1.6119,4,437,21.2,388.08,24.16,14
141,1.62864,0,21.89,0,0.624,5.019,100,1.4394,4,437,21.2,396.9,34.41,14.4
142,3.32105,0,19.58,1,0.871,5.403,100,1.3216,5,403,14.7,396.9,26.82,13.4
143,4.0974,0,19.58,0,0.871,5.468,100,1.4118,5,403,14.7,396.9,26.42,15.6
144,2.77974,0,19.58,0,0.871,4.903,97.8,1.3459,5,403,14.7,396.9,29.29,11.8
145,2.37934,0,19.58,0,0.871,6.13,100,1.4191,5,403,14.7,172.91,27.8,13.8
146,2.15505,0,19.58,0,0.871,5.628,100,1.5166,5,403,14.7,169.27,16.65,15.6
147,2.36862,0,19.58,0,0.871,4.926,95.7,1.4608,5,403,14.7,391.71,29.53,14.6
148,2.33099,0,19.58,0,0.871,5.186,93.8,1.5296,5,403,14.7,356.99,28.32,17.8
149,2.73397,0,19.58,0,0.871,5.597,94.9,1.5257,5,403,14.7,351.85,21.45,15.4
150,1.6566,0,19.58,0,0.871,6.122,97.3,1.618,5,403,14.7,372.8,14.1,21.5
151,1.49632,0,19.58,0,0.871,5.404,100,1.5916,5,403,14.7,341.6,13.28,19.6
152,1.12658,0,19.58,1,0.871,5.012,88,1.6102,5,403,14.7,343.28,12.12,15.3
153,2.14918,0,19.58,0,0.871,5.709,98.5,1.6232,5,403,14.7,261.95,15.79,19.4
154,1.41385,0,19.58,1,0.871,6.129,96,1.7494,5,403,14.7,321.02,15.12,17
155,3.53501,0,19.58,1,0.871,6.152,82.6,1.7455,5,403,14.7,88.01,15.02,15.6
156,2.44668,0,19.58,0,0.871,5.272,94,1.7364,5,403,14.7,88.63,16.14,13.1
157,1.22358,0,19.58,0,0.605,6.943,97.4,1.8773,5,403,14.7,363.43,4.59,41.3
158,1.34284,0,19.58,0,0.605,6.066,100,1.7573,5,403,14.7,353.89,6.43,24.3
159,1.42502,0,19.58,0,0.871,6.51,100,1.7659,5,403,14.7,364.31,7.39,23.3
160,1.27346,0,19.58,1,0.605,6.25,92.6,1.7984,5,403,14.7,338.92,5.5,27
161,1.46336,0,19.58,0,0.605,7.489,90.8,1.9709,5,403,14.7,374.43,1.73,50
162,1.83377,0,19.58,1,0.605,7.802,98.2,2.0407,5,403,14.7,389.61,1.92,50
163,1.51902,0,19.58,1,0.605,8.375,93.9,2.162,5,403,14.7,388.45,3.32,50
164,2.24236,0,19.58,0,0.605,5.854,91.8,2.422,5,403,14.7,395.11,11.64,22.7
165,2.924,0,19.58,0,0.605,6.101,93,2.2834,5,403,14.7,240.16,9.81,25
166,2.01019,0,19.58,0,0.605,7.929,96.2,2.0459,5,403,14.7,369.3,3.7,50
167,1.80028,0,19.58,0,0.605,5.877,79.2,2.4259,5,403,14.7,227.61,12.14,23.8
168,2.3004,0,19.58,0,0.605,6.319,96.1,2.1,5,403,14.7,297.09,11.1,23.8
169,2.44953,0,19.58,0,0.605,6.402,95.2,2.2625,5,403,14.7,330.04,11.32,22.3
170,1.20742,0,19.58,0,0.605,5.875,94.6,2.4259,5,403,14.7,292.29,14.43,17.4
171,2.3139,0,19.58,0,0.605,5.88,97.3,2.3887,5,403,14.7,348.13,12.03,19.1
172,0.13914,0,4.05,0,0.51,5.572,88.5,2.5961,5,296,16.6,396.9,14.69,23.1
173,0.09178,0,4.05,0,0.51,6.416,84.1,2.6463,5,296,16.6,395.5,9.04,23.6
174,0.08447,0,4.05,0,0.51,5.859,68.7,2.7019,5,296,16.6,393.23,9.64,22.6
175,0.06664,0,4.05,0,0.51,6.546,33.1,3.1323,5,296,16.6,390.96,5.33,29.4
176,0.07022,0,4.05,0,0.51,6.02,47.2,3.5549,5,296,16.6,393.23,10.11,23.2
177,0.05425,0,4.05,0,0.51,6.315,73.4,3.3175,5,296,16.6,395.6,6.29,24.6
178,0.06642,0,4.05,0,0.51,6.86,74.4,2.9153,5,296,16.6,391.27,6.92,29.9
179,0.0578,0,2.46,0,0.488,6.98,58.4,2.829,3,193,17.8,396.9,5.04,37.2
180,0.06588,0,2.46,0,0.488,7.765,83.3,2.741,3,193,17.8,395.56,7.56,39.8
181,0.06888,0,2.46,0,0.488,6.144,62.2,2.5979,3,193,17.8,396.9,9.45,36.2
182,0.09103,0,2.46,0,0.488,7.155,92.2,2.7006,3,193,17.8,394.12,4.82,37.9
183,0.10008,0,2.46,0,0.488,6.563,95.6,2.847,3,193,17.8,396.9,5.68,32.5
184,0.08308,0,2.46,0,0.488,5.604,89.8,2.9879,3,193,17.8,391,13.98,26.4
185,0.06047,0,2.46,0,0.488,6.153,68.8,3.2797,3,193,17.8,387.11,13.15,29.6
186,0.05602,0,2.46,0,0.488,7.831,53.6,3.1992,3,193,17.8,392.63,4.45,50
187,0.07875,45,3.44,0,0.437,6.782,41.1,3.7886,5,398,15.2,393.87,6.68,32
188,0.12579,45,3.44,0,0.437,6.556,29.1,4.5667,5,398,15.2,382.84,4.56,29.8
189,0.0837,45,3.44,0,0.437,7.185,38.9,4.5667,5,398,15.2,396.9,5.39,34.9
190,0.09068,45,3.44,0,0.437,6.951,21.5,6.4798,5,398,15.2,377.68,5.1,37
191,0.06911,45,3.44,0,0.437,6.739,30.8,6.4798,5,398,15.2,389.71,4.69,30.5
192,0.08664,45,3.44,0,0.437,7.178,26.3,6.4798,5,398,15.2,390.49,2.87,36.4
193,0.02187,60,2.93,0,0.401,6.8,9.9,6.2196,1,265,15.6,393.37,5.03,31.1
194,0.01439,60,2.93,0,0.401,6.604,18.8,6.2196,1,265,15.6,376.7,4.38,29.1
195,0.01381,80,0.46,0,0.422,7.875,32,5.6484,4,255,14.4,394.23,2.97,50
196,0.04011,80,1.52,0,0.404,7.287,34.1,7.309,2,329,12.6,396.9,4.08,33.3
197,0.04666,80,1.52,0,0.404,7.107,36.6,7.309,2,329,12.6,354.31,8.61,30.3
198,0.03768,80,1.52,0,0.404,7.274,38.3,7.309,2,329,12.6,392.2,6.62,34.6
199,0.0315,95,1.47,0,0.403,6.975,15.3,7.6534,3,402,17,396.9,4.56,34.9
200,0.01778,95,1.47,0,0.403,7.135,13.9,7.6534,3,402,17,384.3,4.45,32.9
201,0.03445,82.5,2.03,0,0.415,6.162,38.4,6.27,2,348,14.7,393.77,7.43,24.1
202,0.02177,82.5,2.03,0,0.415,7.61,15.7,6.27,2,348,14.7,395.38,3.11,42.3
203,0.0351,95,2.68,0,0.4161,7.853,33.2,5.118,4,224,14.7,392.78,3.81,48.5
204,0.02009,95,2.68,0,0.4161,8.034,31.9,5.118,4,224,14.7,390.55,2.88,50
205,0.13642,0,10.59,0,0.489,5.891,22.3,3.9454,4,277,18.6,396.9,10.87,22.6
206,0.22969,0,10.59,0,0.489,6.326,52.5,4.3549,4,277,18.6,394.87,10.97,24.4
207,0.25199,0,10.59,0,0.489,5.783,72.7,4.3549,4,277,18.6,389.43,18.06,22.5
208,0.13587,0,10.59,1,0.489,6.064,59.1,4.2392,4,277,18.6,381.32,14.66,24.4
209,0.43571,0,10.59,1,0.489,5.344,100,3.875,4,277,18.6,396.9,23.09,20
210,0.17446,0,10.59,1,0.489,5.96,92.1,3.8771,4,277,18.6,393.25,17.27,21.7
211,0.37578,0,10.59,1,0.489,5.404,88.6,3.665,4,277,18.6,395.24,23.98,19.3
212,0.21719,0,10.59,1,0.489,5.807,53.8,3.6526,4,277,18.6,390.94,16.03,22.4
213,0.14052,0,10.59,0,0.489,6.375,32.3,3.9454,4,277,18.6,385.81,9.38,28.1
214,0.28955,0,10.59,0,0.489,5.412,9.8,3.5875,4,277,18.6,348.93,29.55,23.7
215,0.19802,0,10.59,0,0.489,6.182,42.4,3.9454,4,277,18.6,393.63,9.47,25
216,0.0456,0,13.89,1,0.55,5.888,56,3.1121,5,276,16.4,392.8,13.51,23.3
217,0.07013,0,13.89,0,0.55,6.642,85.1,3.4211,5,276,16.4,392.78,9.69,28.7
218,0.11069,0,13.89,1,0.55,5.951,93.8,2.8893,5,276,16.4,396.9,17.92,21.5
219,0.11425,0,13.89,1,0.55,6.373,92.4,3.3633,5,276,16.4,393.74,10.5,23
220,0.35809,0,6.2,1,0.507,6.951,88.5,2.8617,8,307,17.4,391.7,9.71,26.7
221,0.40771,0,6.2,1,0.507,6.164,91.3,3.048,8,307,17.4,395.24,21.46,21.7
222,0.62356,0,6.2,1,0.507,6.879,77.7,3.2721,8,307,17.4,390.39,9.93,27.5
223,0.6147,0,6.2,0,0.507,6.618,80.8,3.2721,8,307,17.4,396.9,7.6,30.1
224,0.31533,0,6.2,0,0.504,8.266,78.3,2.8944,8,307,17.4,385.05,4.14,44.8
225,0.52693,0,6.2,0,0.504,8.725,83,2.8944,8,307,17.4,382,4.63,50
226,0.38214,0,6.2,0,0.504,8.04,86.5,3.2157,8,307,17.4,387.38,3.13,37.6
227,0.41238,0,6.2,0,0.504,7.163,79.9,3.2157,8,307,17.4,372.08,6.36,31.6
228,0.29819,0,6.2,0,0.504,7.686,17,3.3751,8,307,17.4,377.51,3.92,46.7
229,0.44178,0,6.2,0,0.504,6.552,21.4,3.3751,8,307,17.4,380.34,3.76,31.5
230,0.537,0,6.2,0,0.504,5.981,68.1,3.6715,8,307,17.4,378.35,11.65,24.3
231,0.46296,0,6.2,0,0.504,7.412,76.9,3.6715,8,307,17.4,376.14,5.25,31.7
232,0.57529,0,6.2,0,0.507,8.337,73.3,3.8384,8,307,17.4,385.91,2.47,41.7
233,0.33147,0,6.2,0,0.507,8.247,70.4,3.6519,8,307,17.4,378.95,3.95,48.3
234,0.44791,0,6.2,1,0.507,6.726,66.5,3.6519,8,307,17.4,360.2,8.05,29
235,0.33045,0,6.2,0,0.507,6.086,61.5,3.6519,8,307,17.4,376.75,10.88,24
236,0.52058,0,6.2,1,0.507,6.631,76.5,4.148,8,307,17.4,388.45,9.54,25.1
237,0.51183,0,6.2,0,0.507,7.358,71.6,4.148,8,307,17.4,390.07,4.73,31.5
238,0.08244,30,4.93,0,0.428,6.481,18.5,6.1899,6,300,16.6,379.41,6.36,23.7
239,0.09252,30,4.93,0,0.428,6.606,42.2,6.1899,6,300,16.6,383.78,7.37,23.3
240,0.11329,30,4.93,0,0.428,6.897,54.3,6.3361,6,300,16.6,391.25,11.38,22
241,0.10612,30,4.93,0,0.428,6.095,65.1,6.3361,6,300,16.6,394.62,12.4,20.1
242,0.1029,30,4.93,0,0.428,6.358,52.9,7.0355,6,300,16.6,372.75,11.22,22.2
243,0.12757,30,4.93,0,0.428,6.393,7.8,7.0355,6,300,16.6,374.71,5.19,23.7
244,0.20608,22,5.86,0,0.431,5.593,76.5,7.9549,7,330,19.1,372.49,12.5,17.6
245,0.19133,22,5.86,0,0.431,5.605,70.2,7.9549,7,330,19.1,389.13,18.46,18.5
246,0.33983,22,5.86,0,0.431,6.108,34.9,8.0555,7,330,19.1,390.18,9.16,24.3
247,0.19657,22,5.86,0,0.431,6.226,79.2,8.0555,7,330,19.1,376.14,10.15,20.5
248,0.16439,22,5.86,0,0.431,6.433,49.1,7.8265,7,330,19.1,374.71,9.52,24.5
249,0.19073,22,5.86,0,0.431,6.718,17.5,7.8265,7,330,19.1,393.74,6.56,26.2
250,0.1403,22,5.86,0,0.431,6.487,13,7.3967,7,330,19.1,396.28,5.9,24.4
251,0.21409,22,5.86,0,0.431,6.438,8.9,7.3967,7,330,19.1,377.07,3.59,24.8
252,0.08221,22,5.86,0,0.431,6.957,6.8,8.9067,7,330,19.1,386.09,3.53,29.6
253,0.36894,22,5.86,0,0.431,8.259,8.4,8.9067,7,330,19.1,396.9,3.54,42.8
254,0.04819,80,3.64,0,0.392,6.108,32,9.2203,1,315,16.4,392.89,6.57,21.9
255,0.03548,80,3.64,0,0.392,5.876,19.1,9.2203,1,315,16.4,395.18,9.25,20.9
256,0.01538,90,3.75,0,0.394,7.454,34.2,6.3361,3,244,15.9,386.34,3.11,44
257,0.61154,20,3.97,0,0.647,8.704,86.9,1.801,5,264,13,389.7,5.12,50
258,0.66351,20,3.97,0,0.647,7.333,100,1.8946,5,264,13,383.29,7.79,36
259,0.65665,20,3.97,0,0.647,6.842,100,2.0107,5,264,13,391.93,6.9,30.1
260,0.54011,20,3.97,0,0.647,7.203,81.8,2.1121,5,264,13,392.8,9.59,33.8
261,0.53412,20,3.97,0,0.647,7.52,89.4,2.1398,5,264,13,388.37,7.26,43.1
262,0.52014,20,3.97,0,0.647,8.398,91.5,2.2885,5,264,13,386.86,5.91,48.8
263,0.82526,20,3.97,0,0.647,7.327,94.5,2.0788,5,264,13,393.42,11.25,31
264,0.55007,20,3.97,0,0.647,7.206,91.6,1.9301,5,264,13,387.89,8.1,36.5
265,0.76162,20,3.97,0,0.647,5.56,62.8,1.9865,5,264,13,392.4,10.45,22.8
266,0.7857,20,3.97,0,0.647,7.014,84.6,2.1329,5,264,13,384.07,14.79,30.7
267,0.57834,20,3.97,0,0.575,8.297,67,2.4216,5,264,13,384.54,7.44,50
268,0.5405,20,3.97,0,0.575,7.47,52.6,2.872,5,264,13,390.3,3.16,43.5
269,0.09065,20,6.96,1,0.464,5.92,61.5,3.9175,3,223,18.6,391.34,13.65,20.7
270,0.29916,20,6.96,0,0.464,5.856,42.1,4.429,3,223,18.6,388.65,13,21.1
271,0.16211,20,6.96,0,0.464,6.24,16.3,4.429,3,223,18.6,396.9,6.59,25.2
272,0.1146,20,6.96,0,0.464,6.538,58.7,3.9175,3,223,18.6,394.96,7.73,24.4
273,0.22188,20,6.96,1,0.464,7.691,51.8,4.3665,3,223,18.6,390.77,6.58,35.2
274,0.05644,40,6.41,1,0.447,6.758,32.9,4.0776,4,254,17.6,396.9,3.53,32.4
275,0.09604,40,6.41,0,0.447,6.854,42.8,4.2673,4,254,17.6,396.9,2.98,32
276,0.10469,40,6.41,1,0.447,7.267,49,4.7872,4,254,17.6,389.25,6.05,33.2
277,0.06127,40,6.41,1,0.447,6.826,27.6,4.8628,4,254,17.6,393.45,4.16,33.1
278,0.07978,40,6.41,0,0.447,6.482,32.1,4.1403,4,254,17.6,396.9,7.19,29.1
279,0.21038,20,3.33,0,0.4429,6.812,32.2,4.1007,5,216,14.9,396.9,4.85,35.1
280,0.03578,20,3.33,0,0.4429,7.82,64.5,4.6947,5,216,14.9,387.31,3.76,45.4
281,0.03705,20,3.33,0,0.4429,6.968,37.2,5.2447,5,216,14.9,392.23,4.59,35.4
282,0.06129,20,3.33,1,0.4429,7.645,49.7,5.2119,5,216,14.9,377.07,3.01,46
283,0.01501,90,1.21,1,0.401,7.923,24.8,5.885,1,198,13.6,395.52,3.16,50
284,0.00906,90,2.97,0,0.4,7.088,20.8,7.3073,1,285,15.3,394.72,7.85,32.2
285,0.01096,55,2.25,0,0.389,6.453,31.9,7.3073,1,300,15.3,394.72,8.23,22
286,0.01965,80,1.76,0,0.385,6.23,31.5,9.0892,1,241,18.2,341.6,12.93,20.1
287,0.03871,52.5,5.32,0,0.405,6.209,31.3,7.3172,6,293,16.6,396.9,7.14,23.2
288,0.0459,52.5,5.32,0,0.405,6.315,45.6,7.3172,6,293,16.6,396.9,7.6,22.3
289,0.04297,52.5,5.32,0,0.405,6.565,22.9,7.3172,6,293,16.6,371.72,9.51,24.8
290,0.03502,80,4.95,0,0.411,6.861,27.9,5.1167,4,245,19.2,396.9,3.33,28.5
291,0.07886,80,4.95,0,0.411,7.148,27.7,5.1167,4,245,19.2,396.9,3.56,37.3
292,0.03615,80,4.95,0,0.411,6.63,23.4,5.1167,4,245,19.2,396.9,4.7,27.9
293,0.08265,0,13.92,0,0.437,6.127,18.4,5.5027,4,289,16,396.9,8.58,23.9
294,0.08199,0,13.92,0,0.437,6.009,42.3,5.5027,4,289,16,396.9,10.4,21.7
295,0.12932,0,13.92,0,0.437,6.678,31.1,5.9604,4,289,16,396.9,6.27,28.6
296,0.05372,0,13.92,0,0.437,6.549,51,5.9604,4,289,16,392.85,7.39,27.1
297,0.14103,0,13.92,0,0.437,5.79,58,6.32,4,289,16,396.9,15.84,20.3
298,0.06466,70,2.24,0,0.4,6.345,20.1,7.8278,5,358,14.8,368.24,4.97,22.5
299,0.05561,70,2.24,0,0.4,7.041,10,7.8278,5,358,14.8,371.58,4.74,29
300,0.04417,70,2.24,0,0.4,6.871,47.4,7.8278,5,358,14.8,390.86,6.07,24.8
301,0.03537,34,6.09,0,0.433,6.59,40.4,5.4917,7,329,16.1,395.75,9.5,22
302,0.09266,34,6.09,0,0.433,6.495,18.4,5.4917,7,329,16.1,383.61,8.67,26.4
303,0.1,34,6.09,0,0.433,6.982,17.7,5.4917,7,329,16.1,390.43,4.86,33.1
304,0.05515,33,2.18,0,0.472,7.236,41.1,4.022,7,222,18.4,393.68,6.93,36.1
305,0.05479,33,2.18,0,0.472,6.616,58.1,3.37,7,222,18.4,393.36,8.93,28.4
306,0.07503,33,2.18,0,0.472,7.42,71.9,3.0992,7,222,18.4,396.9,6.47,33.4
307,0.04932,33,2.18,0,0.472,6.849,70.3,3.1827,7,222,18.4,396.9,7.53,28.2
308,0.49298,0,9.9,0,0.544,6.635,82.5,3.3175,4,304,18.4,396.9,4.54,22.8
309,0.3494,0,9.9,0,0.544,5.972,76.7,3.1025,4,304,18.4,396.24,9.97,20.3
310,2.63548,0,9.9,0,0.544,4.973,37.8,2.5194,4,304,18.4,350.45,12.64,16.1
311,0.79041,0,9.9,0,0.544,6.122,52.8,2.6403,4,304,18.4,396.9,5.98,22.1
312,0.26169,0,9.9,0,0.544,6.023,90.4,2.834,4,304,18.4,396.3,11.72,19.4
313,0.26938,0,9.9,0,0.544,6.266,82.8,3.2628,4,304,18.4,393.39,7.9,21.6
314,0.3692,0,9.9,0,0.544,6.567,87.3,3.6023,4,304,18.4,395.69,9.28,23.8
315,0.25356,0,9.9,0,0.544,5.705,77.7,3.945,4,304,18.4,396.42,11.5,16.2
316,0.31827,0,9.9,0,0.544,5.914,83.2,3.9986,4,304,18.4,390.7,18.33,17.8
317,0.24522,0,9.9,0,0.544,5.782,71.7,4.0317,4,304,18.4,396.9,15.94,19.8
318,0.40202,0,9.9,0,0.544,6.382,67.2,3.5325,4,304,18.4,395.21,10.36,23.1
319,0.47547,0,9.9,0,0.544,6.113,58.8,4.0019,4,304,18.4,396.23,12.73,21
320,0.1676,0,7.38,0,0.493,6.426,52.3,4.5404,5,287,19.6,396.9,7.2,23.8
321,0.18159,0,7.38,0,0.493,6.376,54.3,4.5404,5,287,19.6,396.9,6.87,23.1
322,0.35114,0,7.38,0,0.493,6.041,49.9,4.7211,5,287,19.6,396.9,7.7,20.4
323,0.28392,0,7.38,0,0.493,5.708,74.3,4.7211,5,287,19.6,391.13,11.74,18.5
324,0.34109,0,7.38,0,0.493,6.415,40.1,4.7211,5,287,19.6,396.9,6.12,25
325,0.19186,0,7.38,0,0.493,6.431,14.7,5.4159,5,287,19.6,393.68,5.08,24.6
326,0.30347,0,7.38,0,0.493,6.312,28.9,5.4159,5,287,19.6,396.9,6.15,23
327,0.24103,0,7.38,0,0.493,6.083,43.7,5.4159,5,287,19.6,396.9,12.79,22.2
328,0.06617,0,3.24,0,0.46,5.868,25.8,5.2146,4,430,16.9,382.44,9.97,19.3
329,0.06724,0,3.24,0,0.46,6.333,17.2,5.2146,4,430,16.9,375.21,7.34,22.6
330,0.04544,0,3.24,0,0.46,6.144,32.2,5.8736,4,430,16.9,368.57,9.09,19.8
331,0.05023,35,6.06,0,0.4379,5.706,28.4,6.6407,1,304,16.9,394.02,12.43,17.1
332,0.03466,35,6.06,0,0.4379,6.031,23.3,6.6407,1,304,16.9,362.25,7.83,19.4
333,0.05083,0,5.19,0,0.515,6.316,38.1,6.4584,5,224,20.2,389.71,5.68,22.2
334,0.03738,0,5.19,0,0.515,6.31,38.5,6.4584,5,224,20.2,389.4,6.75,20.7
335,0.03961,0,5.19,0,0.515,6.037,34.5,5.9853,5,224,20.2,396.9,8.01,21.1
336,0.03427,0,5.19,0,0.515,5.869,46.3,5.2311,5,224,20.2,396.9,9.8,19.5
337,0.03041,0,5.19,0,0.515,5.895,59.6,5.615,5,224,20.2,394.81,10.56,18.5
338,0.03306,0,5.19,0,0.515,6.059,37.3,4.8122,5,224,20.2,396.14,8.51,20.6
339,0.05497,0,5.19,0,0.515,5.985,45.4,4.8122,5,224,20.2,396.9,9.74,19
340,0.06151,0,5.19,0,0.515,5.968,58.5,4.8122,5,224,20.2,396.9,9.29,18.7
341,0.01301,35,1.52,0,0.442,7.241,49.3,7.0379,1,284,15.5,394.74,5.49,32.7
342,0.02498,0,1.89,0,0.518,6.54,59.7,6.2669,1,422,15.9,389.96,8.65,16.5
343,0.02543,55,3.78,0,0.484,6.696,56.4,5.7321,5,370,17.6,396.9,7.18,23.9
344,0.03049,55,3.78,0,0.484,6.874,28.1,6.4654,5,370,17.6,387.97,4.61,31.2
345,0.03113,0,4.39,0,0.442,6.014,48.5,8.0136,3,352,18.8,385.64,10.53,17.5
346,0.06162,0,4.39,0,0.442,5.898,52.3,8.0136,3,352,18.8,364.61,12.67,17.2
347,0.0187,85,4.15,0,0.429,6.516,27.7,8.5353,4,351,17.9,392.43,6.36,23.1
348,0.01501,80,2.01,0,0.435,6.635,29.7,8.344,4,280,17,390.94,5.99,24.5
349,0.02899,40,1.25,0,0.429,6.939,34.5,8.7921,1,335,19.7,389.85,5.89,26.6
350,0.06211,40,1.25,0,0.429,6.49,44.4,8.7921,1,335,19.7,396.9,5.98,22.9
351,0.0795,60,1.69,0,0.411,6.579,35.9,10.7103,4,411,18.3,370.78,5.49,24.1
352,0.07244,60,1.69,0,0.411,5.884,18.5,10.7103,4,411,18.3,392.33,7.79,18.6
353,0.01709,90,2.02,0,0.41,6.728,36.1,12.1265,5,187,17,384.46,4.5,30.1
354,0.04301,80,1.91,0,0.413,5.663,21.9,10.5857,4,334,22,382.8,8.05,18.2
355,0.10659,80,1.91,0,0.413,5.936,19.5,10.5857,4,334,22,376.04,5.57,20.6
356,8.98296,0,18.1,1,0.77,6.212,97.4,2.1222,24,666,20.2,377.73,17.6,17.8
357,3.8497,0,18.1,1,0.77,6.395,91,2.5052,24,666,20.2,391.34,13.27,21.7
358,5.20177,0,18.1,1,0.77,6.127,83.4,2.7227,24,666,20.2,395.43,11.48,22.7
359,4.26131,0,18.1,0,0.77,6.112,81.3,2.5091,24,666,20.2,390.74,12.67,22.6
360,4.54192,0,18.1,0,0.77,6.398,88,2.5182,24,666,20.2,374.56,7.79,25
361,3.83684,0,18.1,0,0.77,6.251,91.1,2.2955,24,666,20.2,350.65,14.19,19.9
362,3.67822,0,18.1,0,0.77,5.362,96.2,2.1036,24,666,20.2,380.79,10.19,20.8
363,4.22239,0,18.1,1,0.77,5.803,89,1.9047,24,666,20.2,353.04,14.64,16.8
364,3.47428,0,18.1,1,0.718,8.78,82.9,1.9047,24,666,20.2,354.55,5.29,21.9
365,4.55587,0,18.1,0,0.718,3.561,87.9,1.6132,24,666,20.2,354.7,7.12,27.5
366,3.69695,0,18.1,0,0.718,4.963,91.4,1.7523,24,666,20.2,316.03,14,21.9
367,13.5222,0,18.1,0,0.631,3.863,100,1.5106,24,666,20.2,131.42,13.33,23.1
368,4.89822,0,18.1,0,0.631,4.97,100,1.3325,24,666,20.2,375.52,3.26,50
369,5.66998,0,18.1,1,0.631,6.683,96.8,1.3567,24,666,20.2,375.33,3.73,50
370,6.53876,0,18.1,1,0.631,7.016,97.5,1.2024,24,666,20.2,392.05,2.96,50
371,9.2323,0,18.1,0,0.631,6.216,100,1.1691,24,666,20.2,366.15,9.53,50
372,8.26725,0,18.1,1,0.668,5.875,89.6,1.1296,24,666,20.2,347.88,8.88,50
373,11.1081,0,18.1,0,0.668,4.906,100,1.1742,24,666,20.2,396.9,34.77,13.8
374,18.4982,0,18.1,0,0.668,4.138,100,1.137,24,666,20.2,396.9,37.97,13.8
375,19.6091,0,18.1,0,0.671,7.313,97.9,1.3163,24,666,20.2,396.9,13.44,15
376,15.288,0,18.1,0,0.671,6.649,93.3,1.3449,24,666,20.2,363.02,23.24,13.9
377,9.82349,0,18.1,0,0.671,6.794,98.8,1.358,24,666,20.2,396.9,21.24,13.3
378,23.6482,0,18.1,0,0.671,6.38,96.2,1.3861,24,666,20.2,396.9,23.69,13.1
379,17.8667,0,18.1,0,0.671,6.223,100,1.3861,24,666,20.2,393.74,21.78,10.2
380,88.9762,0,18.1,0,0.671,6.968,91.9,1.4165,24,666,20.2,396.9,17.21,10.4
381,15.8744,0,18.1,0,0.671,6.545,99.1,1.5192,24,666,20.2,396.9,21.08,10.9
382,9.18702,0,18.1,0,0.7,5.536,100,1.5804,24,666,20.2,396.9,23.6,11.3
383,7.99248,0,18.1,0,0.7,5.52,100,1.5331,24,666,20.2,396.9,24.56,12.3
384,20.0849,0,18.1,0,0.7,4.368,91.2,1.4395,24,666,20.2,285.83,30.63,8.8
385,16.8118,0,18.1,0,0.7,5.277,98.1,1.4261,24,666,20.2,396.9,30.81,7.2
386,24.3938,0,18.1,0,0.7,4.652,100,1.4672,24,666,20.2,396.9,28.28,10.5
387,22.5971,0,18.1,0,0.7,5,89.5,1.5184,24,666,20.2,396.9,31.99,7.4
388,14.3337,0,18.1,0,0.7,4.88,100,1.5895,24,666,20.2,372.92,30.62,10.2
389,8.15174,0,18.1,0,0.7,5.39,98.9,1.7281,24,666,20.2,396.9,20.85,11.5
390,6.96215,0,18.1,0,0.7,5.713,97,1.9265,24,666,20.2,394.43,17.11,15.1
391,5.29305,0,18.1,0,0.7,6.051,82.5,2.1678,24,666,20.2,378.38,18.76,23.2
392,11.5779,0,18.1,0,0.7,5.036,97,1.77,24,666,20.2,396.9,25.68,9.7
393,8.64476,0,18.1,0,0.693,6.193,92.6,1.7912,24,666,20.2,396.9,15.17,13.8
394,13.3598,0,18.1,0,0.693,5.887,94.7,1.7821,24,666,20.2,396.9,16.35,12.7
395,8.71675,0,18.1,0,0.693,6.471,98.8,1.7257,24,666,20.2,391.98,17.12,13.1
396,5.87205,0,18.1,0,0.693,6.405,96,1.6768,24,666,20.2,396.9,19.37,12.5
397,7.67202,0,18.1,0,0.693,5.747,98.9,1.6334,24,666,20.2,393.1,19.92,8.5
398,38.3518,0,18.1,0,0.693,5.453,100,1.4896,24,666,20.2,396.9,30.59,5
399,9.91655,0,18.1,0,0.693,5.852,77.8,1.5004,24,666,20.2,338.16,29.97,6.3
400,25.0461,0,18.1,0,0.693,5.987,100,1.5888,24,666,20.2,396.9,26.77,5.6
401,14.2362,0,18.1,0,0.693,6.343,100,1.5741,24,666,20.2,396.9,20.32,7.2
402,9.59571,0,18.1,0,0.693,6.404,100,1.639,24,666,20.2,376.11,20.31,12.1
403,24.8017,0,18.1,0,0.693,5.349,96,1.7028,24,666,20.2,396.9,19.77,8.3
404,41.5292,0,18.1,0,0.693,5.531,85.4,1.6074,24,666,20.2,329.46,27.38,8.5
405,67.9208,0,18.1,0,0.693,5.683,100,1.4254,24,666,20.2,384.97,22.98,5
406,20.7162,0,18.1,0,0.659,4.138,100,1.1781,24,666,20.2,370.22,23.34,11.9
407,11.9511,0,18.1,0,0.659,5.608,100,1.2852,24,666,20.2,332.09,12.13,27.9
408,7.40389,0,18.1,0,0.597,5.617,97.9,1.4547,24,666,20.2,314.64,26.4,17.2
409,14.4383,0,18.1,0,0.597,6.852,100,1.4655,24,666,20.2,179.36,19.78,27.5
410,51.1358,0,18.1,0,0.597,5.757,100,1.413,24,666,20.2,2.6,10.11,15
411,14.0507,0,18.1,0,0.597,6.657,100,1.5275,24,666,20.2,35.05,21.22,17.2
412,18.811,0,18.1,0,0.597,4.628,100,1.5539,24,666,20.2,28.79,34.37,17.9
413,28.6558,0,18.1,0,0.597,5.155,100,1.5894,24,666,20.2,210.97,20.08,16.3
414,45.7461,0,18.1,0,0.693,4.519,100,1.6582,24,666,20.2,88.27,36.98,7
415,18.0846,0,18.1,0,0.679,6.434,100,1.8347,24,666,20.2,27.25,29.05,7.2
416,10.8342,0,18.1,0,0.679,6.782,90.8,1.8195,24,666,20.2,21.57,25.79,7.5
417,25.9406,0,18.1,0,0.679,5.304,89.1,1.6475,24,666,20.2,127.36,26.64,10.4
418,73.5341,0,18.1,0,0.679,5.957,100,1.8026,24,666,20.2,16.45,20.62,8.8
419,11.8123,0,18.1,0,0.718,6.824,76.5,1.794,24,666,20.2,48.45,22.74,8.4
420,11.0874,0,18.1,0,0.718,6.411,100,1.8589,24,666,20.2,318.75,15.02,16.7
421,7.02259,0,18.1,0,0.718,6.006,95.3,1.8746,24,666,20.2,319.98,15.7,14.2
422,12.0482,0,18.1,0,0.614,5.648,87.6,1.9512,24,666,20.2,291.55,14.1,20.8
423,7.05042,0,18.1,0,0.614,6.103,85.1,2.0218,24,666,20.2,2.52,23.29,13.4
424,8.79212,0,18.1,0,0.584,5.565,70.6,2.0635,24,666,20.2,3.65,17.16,11.7
425,15.8603,0,18.1,0,0.679,5.896,95.4,1.9096,24,666,20.2,7.68,24.39,8.3
426,12.2472,0,18.1,0,0.584,5.837,59.7,1.9976,24,666,20.2,24.65,15.69,10.2
427,37.6619,0,18.1,0,0.679,6.202,78.7,1.8629,24,666,20.2,18.82,14.52,10.9
428,7.36711,0,18.1,0,0.679,6.193,78.1,1.9356,24,666,20.2,96.73,21.52,11
429,9.33889,0,18.1,0,0.679,6.38,95.6,1.9682,24,666,20.2,60.72,24.08,9.5
430,8.49213,0,18.1,0,0.584,6.348,86.1,2.0527,24,666,20.2,83.45,17.64,14.5
431,10.0623,0,18.1,0,0.584,6.833,94.3,2.0882,24,666,20.2,81.33,19.69,14.1
432,6.44405,0,18.1,0,0.584,6.425,74.8,2.2004,24,666,20.2,97.95,12.03,16.1
433,5.58107,0,18.1,0,0.713,6.436,87.9,2.3158,24,666,20.2,100.19,16.22,14.3
434,13.9134,0,18.1,0,0.713,6.208,95,2.2222,24,666,20.2,100.63,15.17,11.7
435,11.1604,0,18.1,0,0.74,6.629,94.6,2.1247,24,666,20.2,109.85,23.27,13.4
436,14.4208,0,18.1,0,0.74,6.461,93.3,2.0026,24,666,20.2,27.49,18.05,9.6
437,15.1772,0,18.1,0,0.74,6.152,100,1.9142,24,666,20.2,9.32,26.45,8.7
438,13.6781,0,18.1,0,0.74,5.935,87.9,1.8206,24,666,20.2,68.95,34.02,8.4
439,9.39063,0,18.1,0,0.74,5.627,93.9,1.8172,24,666,20.2,396.9,22.88,12.8
440,22.0511,0,18.1,0,0.74,5.818,92.4,1.8662,24,666,20.2,391.45,22.11,10.5
441,9.72418,0,18.1,0,0.74,6.406,97.2,2.0651,24,666,20.2,385.96,19.52,17.1
442,5.66637,0,18.1,0,0.74,6.219,100,2.0048,24,666,20.2,395.69,16.59,18.4
443,9.96654,0,18.1,0,0.74,6.485,100,1.9784,24,666,20.2,386.73,18.85,15.4
444,12.8023,0,18.1,0,0.74,5.854,96.6,1.8956,24,666,20.2,240.52,23.79,10.8
445,10.6718,0,18.1,0,0.74,6.459,94.8,1.9879,24,666,20.2,43.06,23.98,11.8
446,6.28807,0,18.1,0,0.74,6.341,96.4,2.072,24,666,20.2,318.01,17.79,14.9
447,9.92485,0,18.1,0,0.74,6.251,96.6,2.198,24,666,20.2,388.52,16.44,12.6
448,9.32909,0,18.1,0,0.713,6.185,98.7,2.2616,24,666,20.2,396.9,18.13,14.1
449,7.52601,0,18.1,0,0.713,6.417,98.3,2.185,24,666,20.2,304.21,19.31,13
450,6.71772,0,18.1,0,0.713,6.749,92.6,2.3236,24,666,20.2,0.32,17.44,13.4
451,5.44114,0,18.1,0,0.713,6.655,98.2,2.3552,24,666,20.2,355.29,17.73,15.2
452,5.09017,0,18.1,0,0.713,6.297,91.8,2.3682,24,666,20.2,385.09,17.27,16.1
453,8.24809,0,18.1,0,0.713,7.393,99.3,2.4527,24,666,20.2,375.87,16.74,17.8
454,9.51363,0,18.1,0,0.713,6.728,94.1,2.4961,24,666,20.2,6.68,18.71,14.9
455,4.75237,0,18.1,0,0.713,6.525,86.5,2.4358,24,666,20.2,50.92,18.13,14.1
456,4.66883,0,18.1,0,0.713,5.976,87.9,2.5806,24,666,20.2,10.48,19.01,12.7
457,8.20058,0,18.1,0,0.713,5.936,80.3,2.7792,24,666,20.2,3.5,16.94,13.5
458,7.75223,0,18.1,0,0.713,6.301,83.7,2.7831,24,666,20.2,272.21,16.23,14.9
459,6.80117,0,18.1,0,0.713,6.081,84.4,2.7175,24,666,20.2,396.9,14.7,20
460,4.81213,0,18.1,0,0.713,6.701,90,2.5975,24,666,20.2,255.23,16.42,16.4
461,3.69311,0,18.1,0,0.713,6.376,88.4,2.5671,24,666,20.2,391.43,14.65,17.7
462,6.65492,0,18.1,0,0.713,6.317,83,2.7344,24,666,20.2,396.9,13.99,19.5
463,5.82115,0,18.1,0,0.713,6.513,89.9,2.8016,24,666,20.2,393.82,10.29,20.2
464,7.83932,0,18.1,0,0.655,6.209,65.4,2.9634,24,666,20.2,396.9,13.22,21.4
465,3.1636,0,18.1,0,0.655,5.759,48.2,3.0665,24,666,20.2,334.4,14.13,19.9
466,3.77498,0,18.1,0,0.655,5.952,84.7,2.8715,24,666,20.2,22.01,17.15,19
467,4.42228,0,18.1,0,0.584,6.003,94.5,2.5403,24,666,20.2,331.29,21.32,19.1
468,15.5757,0,18.1,0,0.58,5.926,71,2.9084,24,666,20.2,368.74,18.13,19.1
469,13.0751,0,18.1,0,0.58,5.713,56.7,2.8237,24,666,20.2,396.9,14.76,20.1
470,4.34879,0,18.1,0,0.58,6.167,84,3.0334,24,666,20.2,396.9,16.29,19.9
471,4.03841,0,18.1,0,0.532,6.229,90.7,3.0993,24,666,20.2,395.33,12.87,19.6
472,3.56868,0,18.1,0,0.58,6.437,75,2.8965,24,666,20.2,393.37,14.36,23.2
473,4.64689,0,18.1,0,0.614,6.98,67.6,2.5329,24,666,20.2,374.68,11.66,29.8
474,8.05579,0,18.1,0,0.584,5.427,95.4,2.4298,24,666,20.2,352.58,18.14,13.8
475,6.39312,0,18.1,0,0.584,6.162,97.4,2.206,24,666,20.2,302.76,24.1,13.3
476,4.87141,0,18.1,0,0.614,6.484,93.6,2.3053,24,666,20.2,396.21,18.68,16.7
477,15.0234,0,18.1,0,0.614,5.304,97.3,2.1007,24,666,20.2,349.48,24.91,12
478,10.233,0,18.1,0,0.614,6.185,96.7,2.1705,24,666,20.2,379.7,18.03,14.6
479,14.3337,0,18.1,0,0.614,6.229,88,1.9512,24,666,20.2,383.32,13.11,21.4
480,5.82401,0,18.1,0,0.532,6.242,64.7,3.4242,24,666,20.2,396.9,10.74,23
481,5.70818,0,18.1,0,0.532,6.75,74.9,3.3317,24,666,20.2,393.07,7.74,23.7
482,5.73116,0,18.1,0,0.532,7.061,77,3.4106,24,666,20.2,395.28,7.01,25
483,2.81838,0,18.1,0,0.532,5.762,40.3,4.0983,24,666,20.2,392.92,10.42,21.8
484,2.37857,0,18.1,0,0.583,5.871,41.9,3.724,24,666,20.2,370.73,13.34,20.6
485,3.67367,0,18.1,0,0.583,6.312,51.9,3.9917,24,666,20.2,388.62,10.58,21.2
486,5.69175,0,18.1,0,0.583,6.114,79.8,3.5459,24,666,20.2,392.68,14.98,19.1
487,4.83567,0,18.1,0,0.583,5.905,53.2,3.1523,24,666,20.2,388.22,11.45,20.6
488,0.15086,0,27.74,0,0.609,5.454,92.7,1.8209,4,711,20.1,395.09,18.06,15.2
489,0.18337,0,27.74,0,0.609,5.414,98.3,1.7554,4,711,20.1,344.05,23.97,7
490,0.20746,0,27.74,0,0.609,5.093,98,1.8226,4,711,20.1,318.43,29.68,8.1
491,0.10574,0,27.74,0,0.609,5.983,98.8,1.8681,4,711,20.1,390.11,18.07,13.6
492,0.11132,0,27.74,0,0.609,5.983,83.5,2.1099,4,711,20.1,396.9,13.35,20.1
493,0.17331,0,9.69,0,0.585,5.707,54,2.3817,6,391,19.2,396.9,12.01,21.8
494,0.27957,0,9.69,0,0.585,5.926,42.6,2.3817,6,391,19.2,396.9,13.59,24.5
495,0.17899,0,9.69,0,0.585,5.67,28.8,2.7986,6,391,19.2,393.29,17.6,23.1
496,0.2896,0,9.69,0,0.585,5.39,72.9,2.7986,6,391,19.2,396.9,21.14,19.7
497,0.26838,0,9.69,0,0.585,5.794,70.6,2.8927,6,391,19.2,396.9,14.1,18.3
498,0.23912,0,9.69,0,0.585,6.019,65.3,2.4091,6,391,19.2,396.9,12.92,21.2
499,0.17783,0,9.69,0,0.585,5.569,73.5,2.3999,6,391,19.2,395.77,15.1,17.5
500,0.22438,0,9.69,0,0.585,6.027,79.7,2.4982,6,391,19.2,396.9,14.33,16.8
501,0.06263,0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21,391.99,9.67,22.4
502,0.04527,0,11.93,0,0.573,6.12,76.7,2.2875,1,273,21,396.9,9.08,20.6
503,0.06076,0,11.93,0,0.573,6.976,91,2.1675,1,273,21,396.9,5.64,23.9
504,0.10959,0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21,393.45,6.48,22
505,0.04741,0,11.93,0,0.573,6.03,80.8,2.505,1,273,21,396.9,7.88,11.9

+ 200
- 0
axolotl/tests/data/datasets/database_dataset_1/datasetDoc.json View File

@@ -0,0 +1,200 @@
{
"about": {
"datasetID": "database_dataset_1",
"datasetName": "A dataset simulating a database dump",
"description": "A synthetic dataset trying to be similar to a database dump, with tables with different relations between them.",
"license": "CC",
"datasetSchemaVersion": "4.0.0",
"redacted": false,
"datasetVersion": "4.0.0",
"digest": "044b0c8724c80f672fb5a6e233b154d3342e4528de9a0245df77066f770c3d8f"
},
"dataResources": [
{
"resID": "codes",
"resPath": "tables/codes.csv",
"resType": "table",
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columnsCount": 3,
"columns": [
{
"colIndex": 0,
"colName": "code",
"colType": "categorical",
"role": [
"index"
]
},
{
"colIndex": 1,
"colName": "name",
"colType": "string",
"role": [
"attribute"
]
},
{
"colIndex": 2,
"colName": "author",
"colType": "integer",
"role": [
"attribute"
],
"refersTo": {
"resID": "authors",
"resObject": {
"columnIndex": 0
}
}
}
]
},
{
"resID": "authors",
"resPath": "tables/authors.csv",
"resType": "table",
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columnsCount": 2,
"columns": [
{
"colIndex": 0,
"colName": "id",
"colType": "integer",
"role": [
"index"
]
},
{
"colIndex": 1,
"colName": "name",
"colType": "string",
"role": [
"attribute"
]
}
]
},
{
"resID": "values",
"resPath": "tables/values.csv",
"resType": "table",
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columnsCount": 4,
"columns": [
{
"colIndex": 0,
"colName": "code",
"colType": "categorical",
"role": [
"attribute"
],
"refersTo": {
"resID": "codes",
"resObject": {
"columnName": "code"
}
}
},
{
"colIndex": 1,
"colName": "key",
"colType": "categorical",
"role": [
"attribute"
]
},
{
"colIndex": 2,
"colName": "year",
"colType": "dateTime",
"role": [
"attribute"
]
},
{
"colIndex": 3,
"colName": "value",
"colType": "real",
"role": [
"attribute"
]
}
]
},
{
"resID": "learningData",
"resPath": "tables/learningData.csv",
"resType": "table",
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columnsCount": 5,
"columns": [
{
"colIndex": 0,
"colName": "d3mIndex",
"colType": "integer",
"role": [
"index"
]
},
{
"colIndex": 1,
"colName": "code",
"colType": "categorical",
"role": [
"attribute"
],
"refersTo": {
"resID": "codes",
"resObject": {
"columnName": "code"
}
}
},
{
"colIndex": 2,
"colName": "key",
"colType": "categorical",
"role": [
"attribute"
]
},
{
"colIndex": 3,
"colName": "year",
"colType": "dateTime",
"role": [
"attribute"
]
},
{
"colIndex": 4,
"colName": "value",
"colType": "real",
"role": [
"suggestedTarget"
]
}
]
}
]
}

+ 4
- 0
axolotl/tests/data/datasets/database_dataset_1/tables/authors.csv View File

@@ -0,0 +1,4 @@
id,name
1,1 name
2,2 name
3,3 name

+ 4
- 0
axolotl/tests/data/datasets/database_dataset_1/tables/codes.csv View File

@@ -0,0 +1,4 @@
code,name,author
AAA,AAA name,1
BBB,BBB name,2
CCC,CCC name,

+ 46
- 0
axolotl/tests/data/datasets/database_dataset_1/tables/learningData.csv View File

@@ -0,0 +1,46 @@
d3mIndex,code,key,year,value
1,AAA,aaa,1990,46.8470585590128
2,BBB,aaa,1990,62.2717948419606
3,CCC,aaa,1990,67.8237620898007
4,AAA,aaa,2000,48.2983166483242
5,BBB,aaa,2000,45.549658223693
6,CCC,aaa,2000,69.2737110112599
7,AAA,aaa,2010,67.0300272167932
8,BBB,aaa,2010,61.2105314015372
9,CCC,aaa,2010,69.2795439452043
10,AAA,bbb,1990,41.4118186427085
11,BBB,bbb,1990,39.5640865244041
12,CCC,bbb,1990,67.9914975626095
13,AAA,bbb,2000,65.1345599343405
14,BBB,bbb,2000,68.7315746449879
15,CCC,bbb,2000,56.0433735350634
16,AAA,bbb,2010,54.3830733627441
17,BBB,bbb,2010,40.2346487255306
18,CCC,bbb,2010,45.8856701879045
19,AAA,ccc,1990,39.1645043487628
20,BBB,ccc,1990,45.648691933945
21,CCC,ccc,1990,46.7133286046023
22,AAA,ccc,2000,46.6525032867438
23,BBB,ccc,2000,46.4240035362404
24,CCC,ccc,2000,69.89360375709
25,AAA,ccc,2010,67.3439641718359
26,BBB,ccc,2010,45.0358709995812
27,CCC,ccc,2010,53.6699876392329
28,AAA,ddd,1990,51.7980168708906
29,BBB,ddd,1990,41.9604584970723
30,CCC,ddd,1990,40.1696772984881
31,AAA,ddd,2000,40.1734778285386
32,BBB,ddd,2000,47.1396106716371
33,CCC,ddd,2000,52.4998419337555
34,AAA,ddd,2010,32.328512195122
35,BBB,ddd,2010,62.2543658536585
36,CCC,ddd,2010,46.1351219512195
37,AAA,eee,1990,32.9848292682927
38,BBB,eee,1990,61.7827317073171
39,CCC,eee,1990,65.2155365853659
40,AAA,eee,2000,65.8634634146342
41,BBB,eee,2000,65.5693658536586
42,CCC,eee,2000,70.8170731707317
43,AAA,eee,2010,68.5856097560976
44,BBB,eee,2010,60.836243902439
45,CCC,eee,2010,62.7290487804878

+ 65
- 0
axolotl/tests/data/datasets/database_dataset_1/tables/values.csv View File

@@ -0,0 +1,65 @@
code,key,year,value
AAA,fff,1980,47.6978880950964
AAA,ggg,1980,8.5102073882048
AAA,hhh,1980,28937699
AAA,iii,1980,31.2853842116054
BBB,fff,1980,40.017000543436
BBB,ggg,1980,0.088668203582195
BBB,hhh,1980,1324191
BBB,iii,1980,31.5974898513652
CCC,fff,1980,19.1174351291049
CCC,ggg,1980,3.93265041799129
CCC,hhh,1980,40680946
CCC,iii,1980,44.5079211390026
DDD,fff,1980,26.3623503812716
DDD,ggg,1980,1.92536841530696
DDD,hhh,1980,231871389
DDD,iii,1980,22.4711322042954
AAA,fff,1990,47.4553181402273
AAA,ggg,1990,8.70342011230219
AAA,hhh,1990,30440944
AAA,iii,1990,32.0290757076957
BBB,fff,1990,39.7447929589982
BBB,ggg,1990,0.208550573514077
BBB,hhh,1990,1358967
BBB,iii,1990,31.8190266061615
CCC,fff,1990,18.1079412466031
CCC,ggg,1990,4.17195041631032
CCC,hhh,1990,41697325
CCC,iii,1990,45.206665319194
DDD,fff,1990,24.5407077023555
DDD,ggg,1990,2.16383747049959
DDD,hhh,1990,239260861
DDD,iii,1990,23.1221808000131
AAA,fff,2000,47.1985790784782
AAA,ggg,2000,7.57819143910044
AAA,hhh,2000,32023589
AAA,iii,2000,32.780322721748
BBB,fff,2000,39.3127951747193
BBB,ggg,2000,0.017809439002671
BBB,hhh,2000,1395610
BBB,iii,2000,32.0578605237552
CCC,fff,2000,17.2041249395661
CCC,ggg,2000,4.85743169997929
CCC,hhh,2000,42662735
CCC,iii,2000,45.866564696018
DDD,fff,2000,36.4059763551675
DDD,ggg,2000,1.78711972147311
DDD,hhh,2000,247991158
DDD,iii,2000,23.651145327486
AAA,fff,2010,46.9193698023565
AAA,ggg,2010,8.06572068707991
AAA,hhh,2010,33702951
AAA,iii,2010,33.5558808208595
BBB,fff,2010,38.7256964353179
BBB,ggg,2010,0
BBB,hhh,2010,1433393
BBB,iii,2010,32.3139419105331
CCC,fff,2010,16.9941914183028
CCC,ggg,2010,3.60319993005617
CCC,hhh,2010,43670268
CCC,iii,2010,46.5340927663649
DDD,fff,2010,40.2972491778884
DDD,ggg,2010,1.70584120202376
DDD,hhh,2010,258930584
DDD,iii,2010,24.1179584421312

+ 196
- 0
axolotl/tests/data/datasets/database_dataset_2/datasetDoc.json

@@ -0,0 +1,196 @@
{
"about": {
"datasetSchemaVersion": "4.0.0",
"datasetID": "database_dataset_2",
"datasetName": "Database dataset of type COUNTS_PER_USER",
"description": "Database dataset of type COUNTS_PER_USER, size 100, random seed 0",
"digest": "0eafe8b08646e4c684bb1776fee8af92cc232ba1bd2840ca0c70e7ae5a59d976",
"datasetVersion": "4.0.0"
},
"dataResources": [
{
"resID": "users",
"isCollection": false,
"columnsCount": 2,
"resFormat": {
"text/csv": [
"csv"
]
},
"resType": "table",
"resPath": "tables/users.csv",
"columns": [
{
"colIndex": 0,
"colName": "id",
"role": [
"index"
],
"colType": "integer"
},
{
"colIndex": 1,
"colName": "name",
"role": [
"attribute"
],
"colType": "string"
}
]
},
{
"resID": "posts",
"isCollection": false,
"columnsCount": 3,
"resFormat": {
"text/csv": [
"csv"
]
},
"resType": "table",
"resPath": "tables/posts.csv",
"columns": [
{
"colIndex": 0,
"colName": "id",
"role": [
"index"
],
"colType": "integer"
},
{
"colIndex": 1,
"colName": "author_id",
"role": [
"attribute"
],
"colType": "integer",
"refersTo": {
"resID": "users",
"resObject": {
"columnIndex": 0
}
}
},
{
"colIndex": 2,
"colName": "post",
"role": [
"attribute"
],
"colType": "string"
}
]
},
{
"resID": "comments",
"isCollection": false,
"columnsCount": 4,
"resFormat": {
"text/csv": [
"csv"
]
},
"resType": "table",
"resPath": "tables/comments.csv",
"columns": [
{
"colIndex": 0,
"colName": "id",
"role": [
"index"
],
"colType": "integer"
},
{
"colIndex": 1,
"colName": "post_id",
"role": [
"attribute"
],
"colType": "integer",
"refersTo": {
"resID": "posts",
"resObject": {
"columnIndex": 0
}
}
},
{
"colIndex": 2,
"colName": "author_id",
"role": [
"attribute"
],
"colType": "integer",
"refersTo": {
"resID": "users",
"resObject": {
"columnIndex": 0
}
}
},
{
"colIndex": 3,
"colName": "comment",
"role": [
"attribute"
],
"colType": "string"
}
]
},
{
"resID": "learningData",
"isCollection": false,
"columnsCount": 4,
"resFormat": {
"text/csv": [
"csv"
]
},
"resType": "table",
"resPath": "tables/learningData.csv",
"columns": [
{
"colIndex": 0,
"colName": "d3mIndex",
"role": [
"index"
],
"colType": "integer"
},
{
"colIndex": 1,
"colName": "user_id",
"role": [
"attribute"
],
"colType": "integer",
"refersTo": {
"resID": "users",
"resObject": {
"columnIndex": 0
}
}
},
{
"colIndex": 2,
"colName": "posts_count",
"role": [
"suggestedTarget"
],
"colType": "integer"
},
{
"colIndex": 3,
"colName": "comments_count",
"role": [
"suggestedTarget"
],
"colType": "integer"
}
]
}
]
}
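database_dataset_2 encodes a three-table relational schema: posts and comments both point back to users through refersTo, and learningData carries two suggestedTarget columns, posts_count and comments_count. One natural sanity check is to recompute those counts from the child tables. The pandas sketch below assumes the file layout above and that learningData was generated directly from these tables; database_dataset_3 can be checked the same way by grouping comments on post_id.

```python
import pandas as pd

base = "axolotl/tests/data/datasets/database_dataset_2/tables"  # assumed root
learning = pd.read_csv(f"{base}/learningData.csv")
posts = pd.read_csv(f"{base}/posts.csv")
comments = pd.read_csv(f"{base}/comments.csv")

# Resolve the refersTo links by counting child rows per author_id,
# then compare against the suggestedTarget columns in learningData.
post_counts = posts.groupby("author_id").size()
comment_counts = comments.groupby("author_id").size()

recount = learning.assign(
    posts_recount=learning["user_id"].map(post_counts).fillna(0).astype(int),
    comments_recount=learning["user_id"].map(comment_counts).fillna(0).astype(int),
)
assert (recount["posts_count"] == recount["posts_recount"]).all()
assert (recount["comments_count"] == recount["comments_recount"]).all()
```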

+ 1001
- 0
axolotl/tests/data/datasets/database_dataset_2/tables/comments.csv
File diff suppressed because it is too large


+ 101
- 0
axolotl/tests/data/datasets/database_dataset_2/tables/learningData.csv

@@ -0,0 +1,101 @@
d3mIndex,user_id,posts_count,comments_count
0,0,7,4
1,1,8,9
2,2,9,2
3,3,5,425
4,4,2,5
5,5,8,6
6,6,4,6
7,7,16,5
8,8,48,40
9,9,6,0
10,10,15,2
11,11,2,7
12,12,3,1
13,13,26,8
14,14,2,5
15,15,6,5
16,16,4,1
17,17,10,8
18,18,13,9
19,19,6,1
20,20,83,14
21,21,11,4
22,22,9,5
23,23,10,7
24,24,4,2
25,25,6,0
26,26,1,3
27,27,34,3
28,28,6,2
29,29,6,4
30,30,3,4
31,31,9,8
32,32,8,5
33,33,8,7
34,34,2,1
35,35,1,3
36,36,5,4
37,37,12,13
38,38,38,42
39,39,4,2
40,40,1,3
41,41,6,5
42,42,13,2
43,43,5,2
44,44,1,11
45,45,4,20
46,46,4,1
47,47,3,13
48,48,5,10
49,49,4,5
50,50,7,5
51,51,4,4
52,52,118,3
53,53,2,2
54,54,3,3
55,55,1,1
56,56,10,2
57,57,1,3
58,58,2,4
59,59,2,37
60,60,2,3
61,61,2,3
62,62,5,0
63,63,1,1
64,64,2,4
65,65,4,4
66,66,11,8
67,67,3,15
68,68,17,4
69,69,5,3
70,70,67,8
71,71,5,3
72,72,49,6
73,73,2,3
74,74,14,5
75,75,3,0
76,76,5,3
77,77,2,2
78,78,6,2
79,79,5,3
80,80,7,0
81,81,3,3
82,82,3,2
83,83,8,4
84,84,5,5
85,85,4,5
86,86,3,2
87,87,3,4
88,88,4,1
89,89,31,0
90,90,4,3
91,91,8,1
92,92,6,4
93,93,16,19
94,94,3,5
95,95,6,9
96,96,6,23
97,97,1,3
98,98,9,6
99,99,4,3

+ 1001
- 0
axolotl/tests/data/datasets/database_dataset_2/tables/posts.csv
File diff suppressed because it is too large


+ 101
- 0
axolotl/tests/data/datasets/database_dataset_2/tables/users.csv

@@ -0,0 +1,101 @@
id,name
0,User 0
1,User 1
2,User 2
3,User 3
4,User 4
5,User 5
6,User 6
7,User 7
8,User 8
9,User 9
10,User 10
11,User 11
12,User 12
13,User 13
14,User 14
15,User 15
16,User 16
17,User 17
18,User 18
19,User 19
20,User 20
21,User 21
22,User 22
23,User 23
24,User 24
25,User 25
26,User 26
27,User 27
28,User 28
29,User 29
30,User 30
31,User 31
32,User 32
33,User 33
34,User 34
35,User 35
36,User 36
37,User 37
38,User 38
39,User 39
40,User 40
41,User 41
42,User 42
43,User 43
44,User 44
45,User 45
46,User 46
47,User 47
48,User 48
49,User 49
50,User 50
51,User 51
52,User 52
53,User 53
54,User 54
55,User 55
56,User 56
57,User 57
58,User 58
59,User 59
60,User 60
61,User 61
62,User 62
63,User 63
64,User 64
65,User 65
66,User 66
67,User 67
68,User 68
69,User 69
70,User 70
71,User 71
72,User 72
73,User 73
74,User 74
75,User 75
76,User 76
77,User 77
78,User 78
79,User 79
80,User 80
81,User 81
82,User 82
83,User 83
84,User 84
85,User 85
86,User 86
87,User 87
88,User 88
89,User 89
90,User 90
91,User 91
92,User 92
93,User 93
94,User 94
95,User 95
96,User 96
97,User 97
98,User 98
99,User 99

+ 188
- 0
axolotl/tests/data/datasets/database_dataset_3/datasetDoc.json

@@ -0,0 +1,188 @@
{
"about": {
"datasetSchemaVersion": "4.0.0",
"datasetID": "database_dataset_3",
"datasetName": "Database dataset of type COMMENTS_PER_POST",
"description": "Database dataset of type COMMENTS_PER_POST, size 100, random seed 0",
"digest": "7dc0973f7fcb22fe487fe37fdaa8c269589074504c53b9728b5a5ec85e2ebb9b",
"datasetVersion": "4.0.0"
},
"dataResources": [
{
"resID": "users",
"isCollection": false,
"columnsCount": 2,
"resFormat": {
"text/csv": [
"csv"
]
},
"resType": "table",
"resPath": "tables/users.csv",
"columns": [
{
"colIndex": 0,
"colName": "id",
"role": [
"index"
],
"colType": "integer"
},
{
"colIndex": 1,
"colName": "name",
"role": [
"attribute"
],
"colType": "string"
}
]
},
{
"resID": "posts",
"isCollection": false,
"columnsCount": 3,
"resFormat": {
"text/csv": [
"csv"
]
},
"resType": "table",
"resPath": "tables/posts.csv",
"columns": [
{
"colIndex": 0,
"colName": "id",
"role": [
"index"
],
"colType": "integer"
},
{
"colIndex": 1,
"colName": "author_id",
"role": [
"attribute"
],
"colType": "integer",
"refersTo": {
"resID": "users",
"resObject": {
"columnIndex": 0
}
}
},
{
"colIndex": 2,
"colName": "post",
"role": [
"attribute"
],
"colType": "string"
}
]
},
{
"resID": "comments",
"isCollection": false,
"columnsCount": 4,
"resFormat": {
"text/csv": [
"csv"
]
},
"resType": "table",
"resPath": "tables/comments.csv",
"columns": [
{
"colIndex": 0,
"colName": "id",
"role": [
"index"
],
"colType": "integer"
},
{
"colIndex": 1,
"colName": "post_id",
"role": [
"attribute"
],
"colType": "integer",
"refersTo": {
"resID": "posts",
"resObject": {
"columnIndex": 0
}
}
},
{
"colIndex": 2,
"colName": "author_id",
"role": [
"attribute"
],
"colType": "integer",
"refersTo": {
"resID": "users",
"resObject": {
"columnIndex": 0
}
}
},
{
"colIndex": 3,
"colName": "comment",
"role": [
"attribute"
],
"colType": "string"
}
]
},
{
"resID": "learningData",
"isCollection": false,
"columnsCount": 3,
"resFormat": {
"text/csv": [
"csv"
]
},
"resType": "table",
"resPath": "tables/learningData.csv",
"columns": [
{
"colIndex": 0,
"colName": "d3mIndex",
"role": [
"index"
],
"colType": "integer"
},
{
"colIndex": 1,
"colName": "post_id",
"role": [
"attribute"
],
"colType": "integer",
"refersTo": {
"resID": "posts",
"resObject": {
"columnIndex": 0
}
}
},
{
"colIndex": 2,
"colName": "comments_count",
"role": [
"suggestedTarget"
],
"colType": "integer"
}
]
}
]
}

+ 1001
- 0
axolotl/tests/data/datasets/database_dataset_3/tables/comments.csv
File diff suppressed because it is too large


+ 1001
- 0
axolotl/tests/data/datasets/database_dataset_3/tables/learningData.csv
File diff suppressed because it is too large


+ 1001
- 0
axolotl/tests/data/datasets/database_dataset_3/tables/posts.csv
File diff suppressed because it is too large


+ 101
- 0
axolotl/tests/data/datasets/database_dataset_3/tables/users.csv

@@ -0,0 +1,101 @@
id,name
0,User 0
1,User 1
2,User 2
3,User 3
4,User 4
5,User 5
6,User 6
7,User 7
8,User 8
9,User 9
10,User 10
11,User 11
12,User 12
13,User 13
14,User 14
15,User 15
16,User 16
17,User 17
18,User 18
19,User 19
20,User 20
21,User 21
22,User 22
23,User 23
24,User 24
25,User 25
26,User 26
27,User 27
28,User 28
29,User 29
30,User 30
31,User 31
32,User 32
33,User 33
34,User 34
35,User 35
36,User 36
37,User 37
38,User 38
39,User 39
40,User 40
41,User 41
42,User 42
43,User 43
44,User 44
45,User 45
46,User 46
47,User 47
48,User 48
49,User 49
50,User 50
51,User 51
52,User 52
53,User 53
54,User 54
55,User 55
56,User 56
57,User 57
58,User 58
59,User 59
60,User 60
61,User 61
62,User 62
63,User 63
64,User 64
65,User 65
66,User 66
67,User 67
68,User 68
69,User 69
70,User 70
71,User 71
72,User 72
73,User 73
74,User 74
75,User 75
76,User 76
77,User 77
78,User 78
79,User 79
80,User 80
81,User 81
82,User 82
83,User 83
84,User 84
85,User 85
86,User 86
87,User 87
88,User 88
89,User 89
90,User 90
91,User 91
92,User 92
93,User 93
94,User 94
95,User 95
96,User 96
97,User 97
98,User 98
99,User 99

+ 202
- 0
axolotl/tests/data/datasets/database_dataset_4/datasetDoc.json

@@ -0,0 +1,202 @@
{
"about": {
"datasetSchemaVersion": "4.0.0",
"datasetID": "database_dataset_4",
"datasetName": "Database dataset of type HAS_USER_MADE_COMMENT_ON_POST",
"description": "Database dataset of type HAS_USER_MADE_COMMENT_ON_POST, size 100, random seed 0",
"digest": "61fe05fb19ff803c67eedf2fdbb131e3124fadf77abdd650d34ab6068b85c35f",
"datasetVersion": "4.0.0"
},
"dataResources": [
{
"resID": "users",
"isCollection": false,
"columnsCount": 2,
"resFormat": {
"text/csv": [
"csv"
]
},
"resType": "table",
"resPath": "tables/users.csv",
"columns": [
{
"colIndex": 0,
"colName": "id",
"role": [
"index"
],
"colType": "integer"
},
{
"colIndex": 1,
"colName": "name",
"role": [
"attribute"
],
"colType": "string"
}
]
},
{
"resID": "posts",
"isCollection": false,
"columnsCount": 3,
"resFormat": {
"text/csv": [
"csv"
]
},
"resType": "table",
"resPath": "tables/posts.csv",
"columns": [
{
"colIndex": 0,
"colName": "id",
"role": [
"index"
],
"colType": "integer"
},
{
"colIndex": 1,
"colName": "author_id",
"role": [
"attribute"
],
"colType": "integer",
"refersTo": {
"resID": "users",
"resObject": {
"columnIndex": 0
}
}
},
{
"colIndex": 2,
"colName": "post",
"role": [
"attribute"
],
"colType": "string"
}
]
},
{
"resID": "comments",
"isCollection": false,
"columnsCount": 4,
"resFormat": {
"text/csv": [
"csv"
]
},
"resType": "table",
"resPath": "tables/comments.csv",
"columns": [
{
"colIndex": 0,
"colName": "id",
"role": [
"index"
],
"colType": "integer"
},
{
"colIndex": 1,
"colName": "post_id",
"role": [
"attribute"
],
"colType": "integer",
"refersTo": {
"resID": "posts",
"resObject": {
"columnIndex": 0
}
}
},
{
"colIndex": 2,
"colName": "author_id",
"role": [
"attribute"
],
"colType": "integer",
"refersTo": {
"resID": "users",
"resObject": {
"columnIndex": 0
}
}
},
{
"colIndex": 3,
"colName": "comment",
"role": [
"attribute"
],
"colType": "string"
}
]
},
{
"resID": "learningData",
"isCollection": false,
"columnsCount": 4,
"resFormat": {
"text/csv": [
"csv"
]
},
"resType": "table",
"resPath": "tables/learningData.csv",
"columns": [
{
"colIndex": 0,
"colName": "d3mIndex",
"role": [
"index"
],
"colType": "integer"
},
{
"colIndex": 1,
"colName": "user_id",
"role": [
"attribute"
],
"colType": "integer",
"refersTo": {
"resID": "users",
"resObject": {
"columnIndex": 0
}
}
},
{
"colIndex": 2,
"colName": "post_id",
"role": [
"attribute"
],
"colType": "integer",
"refersTo": {
"resID": "posts",
"resObject": {
"columnIndex": 0
}
}
},
{
"colIndex": 3,
"colName": "made_comment",
"role": [
"suggestedTarget"
],
"colType": "boolean"
}
]
}
]
}

+ 1001
- 0
axolotl/tests/data/datasets/database_dataset_4/tables/comments.csv
File diff suppressed because it is too large


+ 201
- 0
axolotl/tests/data/datasets/database_dataset_4/tables/learningData.csv

@@ -0,0 +1,201 @@
d3mIndex,user_id,post_id,made_comment
0,3,490,yes
1,64,897,no
2,64,219,no
3,64,620,no
4,47,546,yes
5,16,562,yes
6,64,551,no
7,29,327,yes
8,3,407,yes
9,64,653,no
10,1,224,yes
11,64,435,no
12,27,271,yes
13,45,168,yes
14,28,857,yes
15,84,526,yes
16,64,817,no
17,64,712,no
18,70,27,yes
19,97,385,yes
20,3,137,yes
21,91,519,yes
22,37,857,yes
23,64,25,no
24,3,719,yes
25,64,928,no
26,38,451,yes
27,64,22,no
28,64,839,no
29,64,872,no
30,3,198,yes
31,64,887,no
32,64,847,no
33,64,909,no
34,33,857,yes
35,64,947,no
36,64,848,no
37,64,723,no
38,64,715,no
39,3,127,yes
40,64,442,no
41,64,250,no
42,56,637,yes
43,59,358,yes
44,11,924,yes
45,64,751,no
46,64,671,no
47,3,327,yes
48,21,488,yes
49,64,196,no
50,64,394,no
51,3,258,yes
52,45,931,yes
53,64,761,no
54,64,999,no
55,3,941,yes
56,45,856,yes
57,65,919,yes
58,64,942,no
59,64,826,no
60,64,576,no
61,64,34,no
62,20,556,yes
63,59,358,yes
64,64,500,no
65,95,938,yes
66,64,673,no
67,64,184,no
68,47,329,yes
69,64,764,no
70,64,899,no
71,66,51,yes
72,64,11,no
73,64,132,no
74,64,179,no
75,64,198,no
76,84,568,yes
77,3,828,yes
78,93,895,yes
79,3,763,yes
80,30,638,yes
81,64,963,no
82,64,791,no
83,15,584,yes
84,64,800,no
85,64,359,no
86,98,25,yes
87,3,945,yes
88,64,892,no
89,64,936,no
90,64,319,no
91,47,122,yes
92,8,451,yes
93,64,191,no
94,64,968,no
95,64,822,no
96,64,121,no
97,3,488,yes
98,3,290,yes
99,3,488,yes
100,3,682,yes
101,64,996,no
102,3,305,yes
103,64,949,no
104,64,194,no
105,3,931,yes
106,59,784,yes
107,8,587,yes
108,64,429,no
109,64,940,no
110,64,786,no
111,3,425,yes
112,78,806,yes
113,3,473,yes
114,64,10,no
115,3,833,yes
116,64,120,no
117,97,198,yes
118,44,277,yes
119,64,430,no
120,3,458,yes
121,64,168,no
122,3,447,yes
123,3,775,yes
124,64,127,no
125,3,36,yes
126,96,96,yes
127,72,504,yes
128,64,601,no
129,3,294,yes
130,64,808,no
131,48,744,yes
132,64,343,no
133,64,896,no
134,64,318,no
135,96,758,yes
136,85,185,yes
137,64,532,no
138,3,964,yes
139,64,571,no
140,93,423,yes
141,87,451,yes
142,17,308,yes
143,64,134,no
144,64,28,no
145,3,330,yes
146,3,286,yes
147,64,245,no
148,64,252,no
149,18,792,yes
150,3,452,yes
151,64,506,no
152,64,518,no
153,64,918,no
154,3,735,yes
155,3,600,yes
156,64,293,no
157,30,393,yes
158,3,428,yes
159,64,413,no
160,57,258,yes
161,3,945,yes
162,64,49,no
163,64,813,no
164,64,422,no
165,49,526,yes
166,3,451,yes
167,64,585,no
168,64,411,no
169,45,68,yes
170,59,857,yes
171,38,121,yes
172,64,583,no
173,64,512,no
174,3,713,yes
175,45,304,yes
176,11,488,yes
177,64,490,no
178,3,327,yes
179,64,160,no
180,3,25,yes
181,64,679,no
182,64,39,no
183,64,484,no
184,64,363,no
185,47,857,yes
186,67,857,yes
187,64,926,no
188,64,924,no
189,64,480,no
190,66,198,yes
191,64,755,no
192,3,952,yes
193,13,492,yes
194,64,592,no
195,35,719,yes
196,64,591,no
197,64,664,no
198,64,885,no
199,3,451,yes
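One detail worth noting in database_dataset_4: the schema declares made_comment with colType boolean, but the CSV stores the strings yes and no. A short normalization step, sketched below with an assumed path, maps the target onto actual booleans before modeling.

```python
import pandas as pd

path = "axolotl/tests/data/datasets/database_dataset_4/tables/learningData.csv"
df = pd.read_csv(path)

# The schema types made_comment as boolean; the raw CSV uses yes/no strings.
df["made_comment"] = df["made_comment"].map({"yes": True, "no": False})
assert df["made_comment"].isin([True, False]).all()
```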

+ 1001
- 0
axolotl/tests/data/datasets/database_dataset_4/tables/posts.csv
File diff suppressed because it is too large


+ 101
- 0
axolotl/tests/data/datasets/database_dataset_4/tables/users.csv

@@ -0,0 +1,101 @@
id,name
0,User 0
1,User 1
2,User 2
3,User 3
4,User 4
5,User 5
6,User 6
7,User 7
8,User 8
9,User 9
10,User 10
11,User 11
12,User 12
13,User 13
14,User 14
15,User 15
16,User 16
17,User 17
18,User 18
19,User 19
20,User 20
21,User 21
22,User 22
23,User 23
24,User 24
25,User 25
26,User 26
27,User 27
28,User 28
29,User 29
30,User 30
31,User 31
32,User 32
33,User 33
34,User 34
35,User 35
36,User 36
37,User 37
38,User 38
39,User 39
40,User 40
41,User 41
42,User 42
43,User 43
44,User 44
45,User 45
46,User 46
47,User 47
48,User 48
49,User 49
50,User 50
51,User 51
52,User 52
53,User 53
54,User 54
55,User 55
56,User 56
57,User 57
58,User 58
59,User 59
60,User 60
61,User 61
62,User 62
63,User 63
64,User 64
65,User 65
66,User 66
67,User 67
68,User 68
69,User 69
70,User 70
71,User 71
72,User 72
73,User 73
74,User 74
75,User 75
76,User 76
77,User 77
78,User 78
79,User 79
80,User 80
81,User 81
82,User 82
83,User 83
84,User 84
85,User 85
86,User 86
87,User 87
88,User 88
89,User 89
90,User 90
91,User 91
92,User 92
93,User 93
94,User 94
95,User 95
96,User 96
97,User 97
98,User 98
99,User 99

+ 68
- 0
axolotl/tests/data/datasets/graph_dataset_1/datasetDoc.json

@@ -0,0 +1,68 @@
{
"about": {
"datasetID": "graph_dataset_1",
"datasetName": "Test graph dataset in GML format",
"description": "Based on LL1_net_nomination_seed_dataset",
"datasetSchemaVersion": "4.0.0",
"redacted": false,
"datasetVersion": "4.0.0",
"digest": "fcca6f975627035daa978a24e93aa74861aa93bb885eb8d782d24e5f46376f14"
},
"dataResources": [
{
"resID": "G1",
"resPath": "graphs/G1.gml",
"resType": "graph",
"resFormat": {
"text/vnd.gml": [
"gml"
]
},
"isCollection": false
},
{
"resID": "learningData",
"resPath": "tables/learningData.csv",
"resType": "table",
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columnsCount": 3,
"columns": [
{
"colIndex": 0,
"colName": "d3mIndex",
"colType": "integer",
"role": [
"index"
]
},
{
"colIndex": 1,
"colName": "G1.nodeID",
"colType": "integer",
"role": [
"attribute"
],
"refersTo": {
"resID": "G1",
"resObject": {
"nodeAttribute": "nodeID"
}
}
},
{
"colIndex": 2,
"colName": "classLabel",
"colType": "categorical",
"role": [
"suggestedTarget"
]
}
]
}
]
}

+ 98
- 0
axolotl/tests/data/datasets/graph_dataset_1/graphs/G1.gml

@@ -0,0 +1,98 @@
graph [
node [
id 0
nodeID 0
attr1 10.931067281322685
attr2 -0.6081826070068602
]
node [
id 1
nodeID 1
attr1 -7.6385951806129615
attr2 -0.6245401364066232
]
node [
id 2
nodeID 2
attr1 -7.827982471789538
attr2 -1.6583217791337177
]
node [
id 3
nodeID 3
attr1 -13.175150644300572
attr2 0.059255494425681
]
node [
id 4
nodeID 4
attr1 -2.858879300645913
attr2 0.2877095029910792
]
node [
id 5
nodeID 5
attr1 3.1166256979193085
attr2 -0.9558118873968128
]
node [
id 6
nodeID 6
attr1 -2.3460257528493025
attr2 -0.9912505454192136
]
node [
id 7
nodeID 7
attr1 4.279456640630548
attr2 0.9571850297129592
]
node [
id 8
nodeID 8
attr1 -1.3274504027623684
attr2 0.008863588431931045
]
node [
id 9
nodeID 9
attr1 2.9854996729947567
attr2 -0.6257664276530307
]
node [
id 10
nodeID 10
attr1 -8.126081560478179
attr2 2.830732320647184
]
edge [
source 0
target 4
edge_weight 1.0
]
edge [
source 0
target 5
edge_weight 1.0
]
edge [
source 1
target 2
edge_weight 0.9
]
edge [
source 1
target 3
edge_weight 0.6
]
edge [
source 1
target 10
edge_weight 1.0
]
edge [
source 5
target 7
edge_weight 1.0
]
]
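G1.gml describes eleven nodes, each with two real-valued attributes, and six weighted edges. One plausible way to consume it is networkx's GML reader; the sketch below is untested and passes label="id" because the nodes in this file carry id and nodeID but no label attribute.

```python
import networkx as nx

# Hypothetical path to the graph resource declared in datasetDoc.json.
g = nx.read_gml(
    "axolotl/tests/data/datasets/graph_dataset_1/graphs/G1.gml",
    label="id",  # nodes here have no "label" attribute, so key on "id"
)

print(g.number_of_nodes(), g.number_of_edges())  # expected: 11 6
print(g.nodes[0]["attr1"], g.edges[0, 4]["edge_weight"])
```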

+ 12
- 0
axolotl/tests/data/datasets/graph_dataset_1/tables/learningData.csv

@@ -0,0 +1,12 @@
d3mIndex,G1.nodeID,classLabel
0,0,2
1,1,0
2,2,0
3,3,0
4,4,1
5,5,2
6,6,1
7,7,2
8,8,1
9,9,2
10,10,0

+ 118
- 0
axolotl/tests/data/datasets/graph_dataset_2/datasetDoc.json

@@ -0,0 +1,118 @@
{
"about": {
"datasetID": "graph_dataset_2",
"datasetName": "Test graph dataset in edgelist format",
"description": "Based on LL1_EDGELIST_net_nomination_seed_dataset",
"datasetSchemaVersion": "4.0.0",
"redacted": false,
"datasetVersion": "4.0.0",
"digest": "ef138e993861a11f1d09a8b3662179eb0661c85a4e38d572be8555e32da712f1"
},
"dataResources": [
{
"resID": "learningData",
"resPath": "tables/learningData.csv",
"resType": "table",
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columnsCount": 5,
"columns": [
{
"colIndex": 0,
"colName": "d3mIndex",
"colType": "integer",
"role": [
"index"
]
},
{
"colIndex": 1,
"colName": "nodeID",
"colType": "integer",
"role": [
"attribute",
"key"
]
},
{
"colIndex": 2,
"colName": "attr1",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 3,
"colName": "attr2",
"colType": "real",
"role": [
"attribute"
]
},
{
"colIndex": 4,
"colName": "classLabel",
"colType": "categorical",
"role": [
"suggestedTarget"
]
}
]
},
{
"resID": "edgeList",
"resPath": "tables/edgeList.csv",
"resType": "edgeList",
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columnsCount": 3,
"columns": [
{
"colIndex": 0,
"colName": "source",
"colType": "integer",
"role": [
"attribute"
],
"refersTo": {
"resID": "learningData",
"resObject": {
"columnName": "nodeID"
}
}
},
{
"colIndex": 1,
"colName": "target",
"colType": "integer",
"role": [
"attribute"
],
"refersTo": {
"resID": "learningData",
"resObject": {
"columnName": "nodeID"
}
}
},
{
"colIndex": 2,
"colName": "edge_weight",
"colType": "real",
"role": [
"attribute"
]
}
]
}
]
}

+ 7
- 0
axolotl/tests/data/datasets/graph_dataset_2/tables/edgeList.csv

@@ -0,0 +1,7 @@
source,target,edge_weight
0,4,1.0
0,5,1.0
1,2,0.9
1,3,0.6
1,10,1.0
5,7,1.0

+ 12
- 0
axolotl/tests/data/datasets/graph_dataset_2/tables/learningData.csv

@@ -0,0 +1,12 @@
d3mIndex,nodeID,attr1,attr2,classLabel
0,0,10.931067281322685,-0.6081826070068602,2
1,1,-7.6385951806129615,-0.6245401364066232,0
2,2,-7.827982471789538,-1.6583217791337177,0
3,3,-13.175150644300572,0.059255494425681,0
4,4,-2.858879300645913,0.2877095029910792,1
5,5,3.1166256979193085,-0.9558118873968128,2
6,6,-2.3460257528493025,-0.9912505454192136,1
7,7,4.279456640630548,0.9571850297129592,2
8,8,-1.3274504027623684,0.008863588431931045,1
9,9,2.9854996729947567,-0.6257664276530307,2
10,10,-8.126081560478179,2.830732320647184,0
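graph_dataset_2 expresses the same graph as two flat tables: node attributes and class labels in learningData.csv, and weighted edges in edgeList.csv whose source and target columns refersTo nodeID. A minimal pandas/networkx sketch, with the paths assumed from the listing above, reassembles it:

```python
import pandas as pd
import networkx as nx

base = "axolotl/tests/data/datasets/graph_dataset_2/tables"  # assumed root
nodes = pd.read_csv(f"{base}/learningData.csv")
edges = pd.read_csv(f"{base}/edgeList.csv")

# Build the weighted graph from the edge list, then add the isolated
# nodes (e.g. 6, 8, 9) and attach per-node attributes keyed on nodeID.
g = nx.from_pandas_edgelist(edges, source="source", target="target",
                            edge_attr="edge_weight")
g.add_nodes_from(nodes["nodeID"])
attrs = nodes.set_index("nodeID")[["attr1", "attr2", "classLabel"]]
nx.set_node_attributes(g, attrs.to_dict(orient="index"))

print(g.edges[1, 3]["edge_weight"])  # expected: 0.6
```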

+ 71
- 0
axolotl/tests/data/datasets/image_dataset_1/datasetDoc.json

@@ -0,0 +1,71 @@
{
"about": {
"datasetID": "image_dataset_1",
"datasetName": "Image dataset to be used for tests",
"description": "There are a total of 5 image files, one is a left hand from the handgeometry dataset, two birds from cifar10 dataset and 2 figures from mnist dataset.",
"license": "Creative Commons Attribution-NonCommercial 4.0",
"approximateSize": "24 KB",
"datasetSchemaVersion": "4.0.0",
"redacted": false,
"datasetVersion": "4.0.0",
"digest": "7d8dc3f1f7bd1edce4caf2848f05114cc5c7c8bb0221310bde87965e02a2a927"
},
"dataResources": [
{
"resID": "0",
"resPath": "media/",
"resType": "image",
"resFormat": {
"image/jpeg": [
"jpg"
],
"image/png": [
"png"
]
},
"isCollection": true
},
{
"resID": "learningData",
"resPath": "tables/learningData.csv",
"resType": "table",
"resFormat": {
"text/csv": [
"csv"
]
},
"isCollection": false,
"columnsCount": 3,
"columns": [
{
"colIndex": 0,
"colName": "d3mIndex",
"colType": "integer",
"role": [
"index"
]
},
{
"colIndex": 1,
"colName": "image_file",
"colType": "string",
"role": [
"attribute"
],
"refersTo": {
"resID": "0",
"resObject": "item"
}
},
{
"colIndex": 2,
"colName": "class",
"colType": "categorical",
"role": [
"suggestedTarget"
]
}
]
}
]
}
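image_dataset_1 uses a collection resource: media/ holds the raw jpg/png files, and learningData.csv points into it through the image_file column (refersTo with resObject "item"). The CSV itself is among the files suppressed from this diff, so the sketch below assumes only the column names given in the schema above.

```python
import pandas as pd
from PIL import Image

base = "axolotl/tests/data/datasets/image_dataset_1"  # assumed root
table = pd.read_csv(f"{base}/tables/learningData.csv")

# Each image_file entry names one file inside the media/ collection.
for _, row in table.iterrows():
    img = Image.open(f"{base}/media/{row['image_file']}")
    print(row["image_file"], img.size, row["class"])
```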

BIN
axolotl/tests/data/datasets/image_dataset_1/media/001_HandPhoto_left_01.jpg

Width: 150  |  Height: 225  |  Size: 6.7 kB

BIN
axolotl/tests/data/datasets/image_dataset_1/media/cifar10_bird_1.png

Width: 32  |  Height: 32  |  Size: 2.3 kB

BIN
axolotl/tests/data/datasets/image_dataset_1/media/cifar10_bird_2.png

Width: 32  |  Height: 32  |  Size: 2.0 kB

BIN
axolotl/tests/data/datasets/image_dataset_1/media/mnist_0_2.png

Width: 28  |  Height: 28  |  Size: 289 B

Some files were not shown because too many files changed in this diff
