
validate.py 62 kB

first commit · 4 years ago
  1. #!/usr/bin/env python3
  2. #
  3. # This script validates that problem and dataset descriptions match
  4. # standards and conventions (schemas, naming and directory structure, etc.).
  5. #
  6. # This script expects that there is a clone of the "data-supply"
  7. # repository in the same directory as this script.
  8. #
  9. # Checks done by this script:
  10. # - Dataset description validates according to its schema.
  11. # - Problem description validates according to its schema.
  12. # - Dataset description filename should be "datasetDoc.json".
  13. # - Problem description filename should be "problemDoc.json".
  14. # - There should be no duplicate dataset IDs or problem IDs.
  15. # - Dataset directory names should match the dataset IDs, and be under
  16. # a matching parent directory based on that ID (where ID should
  17. # have an expected suffix).
  18. # - All problem descriptions for dataset views/splits should be the same.
  19. # - Dataset splits should match in ID the original dataset based on the directory
  20. # structure they are in, but have a "TEST", "TRAIN", or "SCORE" suffix (see the example layout sketched below).
  21. # - Problem descriptions should reference existing datasets and columns.
  22. # - Dataset and problem descriptions should be (almost) equal between splits.
  23. # - Clustering problems require numClusters in target specifications.
  24. # - Clustering problems should not have data splitting configuration.
  25. # - Test and train split of datasets used in clustering problems should be the same.
  26. # - Require dataset digest.
  27. # - Dataset entry points should have "learningData" as resource ID.
  28. # - Problem descriptions using "f1", "precision", "recall", and "jaccardSimilarityScore"
  29. # metrics should have only two distinct values in target columns, have "posLabel" provided,
  30. # and that "posLabel" value should be among target values.
  31. # - No other metric should have "posLabel" set.
  32. # - "hammingLoss" metric can be used only with multi-label problems.
  33. # - "precisionAtTopK" should be used only with forecasting.
  34. # - Problem descriptions should have only one target, except for multi-variate
  35. # problems, which should have more than one target, and object detection problems, which can have one or two.
  36. # - Dataset entry point cannot be a collection.
  37. # - Dataset entry point has to have columns metadata.
  38. # - There is at most one "index" or "multiIndex" column per resource.
  39. # - "index" and "multiIndex" cannot be set at the same time.
  40. # - Dataset entry point is required to have an "index" or "multiIndex" column.
  41. # - Columns cannot be both "index" and "key" at the same time.
  42. # - Columns cannot be both "multiIndex" and "key" at the same time.
  43. # - "index" columns have to have unique values and no missing values.
  44. # - "multiIndex" columns have to have no missing values.
  45. # - "key" columns have to have unique values.
  46. # - Every metric should be listed only once in a problem description.
  47. # - Some task keywords can be used only together with corresponding other task keywords.
  48. # - All resource formats used by a resource should be from the standard list.
  49. # - All files used in a collection resource should have a file extension belonging to a
  50. # resource format from the standard list.
  51. # - Collection resource should contain at least one file.
  52. # - Resource path of a collection resource should end with "/".
  53. # - Any file referenced in a collection resource must exist.
  54. # - On edgelist resources, both an "edgeSource" and an "edgeTarget" column should exist in the
  55. # same resource, only one of each. Each should have two additional column roles for direction
  56. # and simple/multi, and those should match between the source and target columns (both directed
  57. # or both undirected, and both simple or both multi, but not mixed).
  58. # - When there is a "multiIndex" column, all rows for the same index value should have the same
  59. # values in all columns except "suggestedTarget" columns.
  60. # - Makes sure that "columnsCount" matches the number of columns, when it exists.
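#
# As a rough illustration of the strict naming convention checked below (path
# components like <base_id> are placeholders), split dataset descriptions are
# expected at locations such as:
#
#   <base_id>/TRAIN/dataset_TRAIN/datasetDoc.json   (dataset ID "<base_id>_dataset_TRAIN")
#   <base_id>/TEST/dataset_TEST/datasetDoc.json     (dataset ID "<base_id>_dataset_TEST")
#   <base_id>/SCORE/dataset_SCORE/datasetDoc.json   (dataset ID "<base_id>_dataset_SCORE";
#                                                    a SCORE split may also sit in a "dataset_TEST" directory)
#
# and an unsplit dataset at:
#
#   <base_id>/<base_id>_dataset/datasetDoc.json     (dataset ID "<base_id>_dataset")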
  61. import argparse
  62. import collections
  63. import copy
  64. import functools
  65. import json
  66. import traceback
  67. import os
  68. import os.path
  69. import sys
  70. import cerberus
  71. import deep_dircmp
  72. import pandas
  73. LIMIT_OUTPUT = 10
  74. EDGELIST_COLUMN_ROLES = [
  75. 'edgeSource',
  76. 'directedEdgeSource',
  77. 'undirectedEdgeSource',
  78. 'multiEdgeSource',
  79. 'simpleEdgeSource',
  80. 'edgeTarget',
  81. 'directedEdgeTarget',
  82. 'undirectedEdgeTarget',
  83. 'multiEdgeTarget',
  84. 'simpleEdgeTarget',
  85. ]
  86. if not os.path.exists(os.path.join(os.path.dirname(__file__), 'data-supply')):
  87. raise Exception("\"data-supply\" directory is missing. You should clone the repository to be in the same directory as this script.")
  88. with open(os.path.join(os.path.dirname(__file__), 'data-supply', 'schemas', 'datasetSchema.json')) as dataset_description_schema_file:
  89. dataset_description_validator = cerberus.Validator(json.load(dataset_description_schema_file))
  90. with open(os.path.join(os.path.dirname(__file__), 'data-supply', 'schemas', 'problemSchema.json')) as problem_description_schema_file:
  91. problem_description_validator = cerberus.Validator(json.load(problem_description_schema_file))
  92. with open(os.path.join(os.path.dirname(__file__), 'data-supply', 'documentation', 'supportedResourceTypesFormats.json')) as supported_resource_types_formats_file:
  93. supported_resource_types_formats = json.load(supported_resource_types_formats_file)
  94. res_format_to_extensions = {}
  95. for supported_resource in supported_resource_types_formats['supported_resource_types_and_formats']:
  96. for res_format, extensions in supported_resource['resFormat'].items():
  97. if res_format not in res_format_to_extensions:
  98. res_format_to_extensions[res_format] = sorted(set(extensions))
  99. else:
  100. res_format_to_extensions[res_format] = sorted(set(extensions) | set(res_format_to_extensions[res_format]))
  101. @functools.lru_cache(maxsize=10)
  102. def read_csv(data_path):
  103. return pandas.read_csv(
  104. data_path,
  105. # We do not want to do any conversion of values.
  106. dtype=str,
  107. # We always expect one row header.
  108. header=0,
  109. # We want empty strings and not NaNs.
  110. na_filter=False,
  111. encoding='utf8',
  112. )
  113. def validate_dataset_path(description_id, description_path, *, strict_naming=True):
  114. if os.path.basename(description_path) != 'datasetDoc.json':
  115. print("ERROR: Dataset description filename is not 'datasetDoc.json'.")
  116. return True
  117. if strict_naming:
  118. split_path = os.path.dirname(description_path).split(os.sep)
  119. for suffix in ['_dataset_TEST', '_dataset_TRAIN', '_dataset_SCORE']:
  120. if description_id.endswith(suffix):
  121. expected_paths = [[description_id[:-len(suffix)], suffix[len('_dataset_'):], suffix[1:]]]
  122. # A special case, SCORE dataset/problem can be in TEST directory.
  123. if suffix == '_dataset_SCORE':
  124. expected_paths.append([description_id[:-len(suffix)], suffix[len('_dataset_'):], 'dataset_TEST'])
  125. if split_path[-3:] not in expected_paths:
  126. print("ERROR: Dataset directory path {directory_path} does not match any of expected paths: {expected_paths}".format(
  127. directory_path=split_path[-3:],
  128. expected_paths=', '.join(str(expected_path) for expected_path in expected_paths),
  129. ))
  130. return True
  131. break
  132. else:
  133. if not description_id.endswith('_dataset'):
  134. print("ERROR: Dataset ID does not end with allowed suffix: {description_id}".format(
  135. description_id=description_id,
  136. ))
  137. return True
  138. expected_path = [description_id[:-len('_dataset')], description_id]
  139. if split_path[-2:] != expected_path:
  140. print("ERROR: Dataset directory path {directory_path} does not match expected path: {expected_path}".format(
  141. directory_path=split_path[-2:],
  142. expected_path=expected_path,
  143. ))
  144. return True
  145. return False
  146. def validate_metrics(problem_description):
  147. error = False
  148. existing_metrics = set()
  149. for metric in problem_description.get('inputs', {}).get('performanceMetrics', []):
  150. if metric['metric'] in ['f1', 'precision', 'recall', 'jaccardSimilarityScore']:
  151. if 'posLabel' not in metric:
  152. print("ERROR: Problem uses '{metric}' metric, but 'posLabel' is not provided.".format(
  153. metric=metric['metric'],
  154. ))
  155. error = True
  156. if set(problem_description['about']['taskKeywords']) & {'multiClass', 'multiLabel'}:
  157. print("ERROR: Problem uses '{metric}' metric, but it is a multi-class or a multi-label problem.".format(
  158. metric=metric['metric'],
  159. ))
  160. error = True
  161. elif 'posLabel' in metric:
  162. print("ERROR: Problem does not use 'f1', 'precision', 'recall', or 'jaccardSimilarityScore' metric, but 'posLabel' is provided.".format(
  163. metric=metric['metric'],
  164. ))
  165. error = True
  166. if metric['metric'] == 'hammingLoss' and 'multiLabel' not in set(problem_description['about']['taskKeywords']):
  167. print("ERROR: Problem uses 'hammingLoss' metric, but it is not a multi-label problem.")
  168. error = True
  169. if metric['metric'] == 'precisionAtTopK' and 'forecasting' not in set(problem_description['about']['taskKeywords']):
  170. print("ERROR: Problem uses 'precisionAtTopK' metric, but it is not forecasting problem.")
  171. error = True
  172. if metric['metric'] in existing_metrics:
  173. print("ERROR: Problem uses same metric '{metric}' multiple times.".format(metric=metric['metric']))
  174. error = True
  175. existing_metrics.add(metric['metric'])
  176. return error
  177. def validate_keywords(problem_description):
  178. task_keywords = set(problem_description['about']['taskKeywords'])
  179. targets_number = 0
  180. for data in problem_description.get('inputs', {}).get('data', []):
  181. targets_number += len(data.get('targets', []))
  182. if 'regression' in task_keywords and 'multivariate' in task_keywords:
  183. if targets_number < 2:
  184. print("ERROR: Problem is a multi-variate problem, but it does not have more than 1 target.")
  185. return True
  186. elif 'objectDetection' in task_keywords:
  187. if targets_number != 1 and targets_number != 2:
  188. print("ERROR: Problem is an object detection problem, but it does not have 1 or 2 targets.")
  189. return True
  190. elif targets_number != 1:
  191. print("ERROR: Problem has more than 1 target.")
  192. return True
  193. if task_keywords & {'binary', 'multiClass', 'multiLabel'} and not task_keywords & {'classification', 'vertexClassification'}:
  194. print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format(
  195. task_keywords=task_keywords,
  196. ))
  197. return True
  198. if task_keywords & {'classification', 'vertexClassification'} and not task_keywords & {'binary', 'multiClass', 'multiLabel'}:
  199. print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format(
  200. task_keywords=task_keywords,
  201. ))
  202. return True
  203. if task_keywords & {'univariate', 'multivariate'} and 'regression' not in task_keywords:
  204. print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format(
  205. task_keywords=task_keywords,
  206. ))
  207. return True
  208. if 'regression' in task_keywords and not task_keywords & {'univariate', 'multivariate'}:
  209. print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format(
  210. task_keywords=task_keywords,
  211. ))
  212. return True
  213. if task_keywords & {'overlapping', 'nonOverlapping'} and not task_keywords & {'clustering', 'communityDetection'}:
  214. print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format(
  215. task_keywords=task_keywords,
  216. ))
  217. return True
  218. if task_keywords & {'clustering', 'communityDetection'} and not task_keywords & {'overlapping', 'nonOverlapping'}:
  219. print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format(
  220. task_keywords=task_keywords,
  221. ))
  222. return True
  223. return False
  224. def validate_files(dataset_description_path, data_resource, dataset_description, column_index, collection_resource_id):
  225. for collection_data_resource in dataset_description['dataResources']:
  226. if collection_data_resource['resID'] == collection_resource_id:
  227. break
  228. else:
  229. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' referencing with column {column_index} a collection resource '{collection_resource_id}', but the resource does not exixt.".format(
  230. dataset_path=dataset_description_path,
  231. resource_id=data_resource['resID'],
  232. column_index=column_index,
  233. collection_resource_id=collection_resource_id,
  234. ))
  235. # We cannot do much more here.
  236. return True
  237. if not collection_data_resource.get('isCollection', False):
  238. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' referencing with column {column_index} a collection resource '{collection_resource_id}', but the resource is not a collection.".format(
  239. dataset_path=dataset_description_path,
  240. resource_id=data_resource['resID'],
  241. column_index=column_index,
  242. collection_resource_id=collection_resource_id,
  243. ))
  244. # We cannot do much more here.
  245. return True
  246. error = False
  247. data_path = os.path.join(os.path.dirname(dataset_description_path), data_resource['resPath'])
  248. data = read_csv(data_path)
  249. collection_dir = os.path.join(os.path.dirname(dataset_description_path), collection_data_resource['resPath'])
  250. count = 0
  251. for filename in data.iloc[:, column_index]:
  252. filepath = os.path.join(collection_dir, filename)
  253. if not os.path.isfile(filepath):
  254. count += 1
  255. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' referencing with column {column_index} a file in a collection resource '{collection_resource_id}', but the file does not exist: {filename}".format(
  256. dataset_path=dataset_description_path,
  257. resource_id=data_resource['resID'],
  258. column_index=column_index,
  259. collection_resource_id=collection_resource_id,
  260. filename=filename,
  261. ))
  262. error = True
  263. if LIMIT_OUTPUT is not None and count > LIMIT_OUTPUT:
  264. break
  265. return error
  266. def validate_collection(dataset_description_path, data_resource):
  267. error = False
  268. if not data_resource['resPath'].endswith('/'):
  269. print("ERROR: Dataset '{dataset_path}' has a collection resource '{resource_id}' where resource path is not ending with '/': {res_path}".format(
  270. dataset_path=dataset_description_path,
  271. resource_id=data_resource['resID'],
  272. res_path=data_resource['resPath'],
  273. ))
  274. error = True
  275. allowed_file_extensions = set()
  276. for res_format, extensions in data_resource['resFormat'].items():
  277. unsupported_extensions = set(extensions) - set(res_format_to_extensions[res_format])
  278. if unsupported_extensions:
  279. print("ERROR: Dataset '{dataset_path}' has a collection resource '{resource_id}' and resource format '{res_format}' with unsupported extensions: {unsupported_extensions}".format(
  280. dataset_path=dataset_description_path,
  281. resource_id=data_resource['resID'],
  282. res_format=res_format,
  283. unsupported_extensions=sorted(unsupported_extensions),
  284. ))
  285. error = True
  286. allowed_file_extensions.update(extensions)
  287. collection_dir = os.path.join(os.path.dirname(dataset_description_path), data_resource['resPath'])
  288. is_empty = True
  289. count = 0
  290. for dirpath, dirnames, filenames in os.walk(collection_dir):
  291. for filename in filenames:
  292. is_empty = False
  293. filepath = os.path.join(dirpath, filename)
  294. file_extension = get_file_extension(filepath)
  295. if file_extension not in allowed_file_extensions:
  296. count += 1
  297. print("ERROR: Dataset '{dataset_path}' has a collection resource '{resource_id}' with a file with unsupported file extension: {filepath}".format(
  298. dataset_path=dataset_description_path,
  299. resource_id=data_resource['resID'],
  300. filepath=filepath,
  301. ))
  302. error = True
  303. if LIMIT_OUTPUT is not None and count > LIMIT_OUTPUT:
  304. break
  305. if LIMIT_OUTPUT is not None and count > LIMIT_OUTPUT:
  306. break
  307. if is_empty:
  308. print("ERROR: Dataset '{dataset_path}' has a collection resource '{resource_id}' without any files.".format(
  309. dataset_path=dataset_description_path,
  310. resource_id=data_resource['resID'],
  311. ))
  312. error = True
  313. return error
  314. def validate_multi_index(dataset_description_path, data_resource, multi_index_column):
  315. error = False
  316. suggested_target_columns = []
  317. for column_description in data_resource['columns']:
  318. if 'suggestedTarget' in column_description['role']:
  319. suggested_target_columns.append(column_description['colIndex'])
  320. data_path = os.path.join(os.path.dirname(dataset_description_path), data_resource['resPath'])
  321. data = read_csv(data_path)
  322. attribute_columns = [column_index for column_index in range(len(data.columns)) if column_index != multi_index_column and column_index not in suggested_target_columns]
  323. attributes = data.iloc[:, attribute_columns].set_index(data.iloc[:, multi_index_column])
  324. count = 0
  325. for group_name, group in attributes.groupby(level=0):
  326. # The first row in a group is not marked as a duplicate, so we add 1 to the number of duplicated rows.
  327. if group.duplicated(keep='first').sum() + 1 != len(group):
  328. count += 1
  329. print("ERROR: Dataset '{dataset_path}' has a multi-index resource '{resource_id}' with all attributes in rows not equal for index value '{value}'.".format(
  330. dataset_path=dataset_description_path,
  331. resource_id=data_resource['resID'],
  332. value=group_name,
  333. ))
  334. error = True
  335. if LIMIT_OUTPUT is not None and count > LIMIT_OUTPUT:
  336. break
  337. return error
  338. def validate_edgelist(dataset_description_path, data_resource):
  339. error = False
  340. found_source = False
  341. is_directed_source = None
  342. is_multi_source = None
  343. found_target = False
  344. is_directed_target = None
  345. is_multi_target = None
  346. for column_description in data_resource['columns']:
  347. if 'edgeSource' in column_description['role']:
  348. # We have to check this only once, either here or in the "edgeTarget" case; we check it here.
  349. if 'edgeTarget' in column_description['role']:
  350. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting source vs. target column roles.".format(
  351. dataset_path=dataset_description_path,
  352. resource_id=data_resource['resID'],
  353. ))
  354. error = True
  355. if found_source:
  356. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with multiple edge source columns.".format(
  357. dataset_path=dataset_description_path,
  358. resource_id=data_resource['resID'],
  359. ))
  360. error = True
  361. continue
  362. found_source = True
  363. if 'multiEdgeSource' in column_description['role']:
  364. if is_multi_source is None:
  365. is_multi_source = True
  366. elif is_multi_source != True:
  367. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting multi vs. simple column roles.".format(
  368. dataset_path=dataset_description_path,
  369. resource_id=data_resource['resID'],
  370. ))
  371. error = True
  372. if 'simpleEdgeSource' in column_description['role']:
  373. if is_multi_source is None:
  374. is_multi_source = False
  375. elif is_multi_source != False:
  376. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting multi vs. simple column roles.".format(
  377. dataset_path=dataset_description_path,
  378. resource_id=data_resource['resID'],
  379. ))
  380. error = True
  381. if is_multi_source is None:
  382. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing multi vs. simple column role.".format(
  383. dataset_path=dataset_description_path,
  384. resource_id=data_resource['resID'],
  385. ))
  386. error = True
  387. if 'directedEdgeSource' in column_description['role']:
  388. if is_directed_source is None:
  389. is_directed_source = True
  390. elif is_directed_source != True:
  391. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting directed vs. undirected column roles.".format(
  392. dataset_path=dataset_description_path,
  393. resource_id=data_resource['resID'],
  394. ))
  395. error = True
  396. if 'undirectedEdgeSource' in column_description['role']:
  397. if is_directed_source is None:
  398. is_directed_source = False
  399. elif is_directed_source != False:
  400. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting directed vs. undirected column roles.".format(
  401. dataset_path=dataset_description_path,
  402. resource_id=data_resource['resID'],
  403. ))
  404. error = True
  405. if is_directed_source is None:
  406. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing directed vs. undirected column role.".format(
  407. dataset_path=dataset_description_path,
  408. resource_id=data_resource['resID'],
  409. ))
  410. error = True
  411. if 'edgeTarget' in column_description['role']:
  412. if found_target:
  413. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with multiple edge target columns.".format(
  414. dataset_path=dataset_description_path,
  415. resource_id=data_resource['resID'],
  416. ))
  417. error = True
  418. continue
  419. found_target = True
  420. if 'multiEdgeTarget' in column_description['role']:
  421. if is_multi_target is None:
  422. is_multi_target = True
  423. elif is_multi_target != True:
  424. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting multi vs. simple column roles.".format(
  425. dataset_path=dataset_description_path,
  426. resource_id=data_resource['resID'],
  427. ))
  428. error = True
  429. if 'simpleEdgeTarget' in column_description['role']:
  430. if is_multi_target is None:
  431. is_multi_target = False
  432. elif is_multi_target != False:
  433. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting multi vs. simple column roles.".format(
  434. dataset_path=dataset_description_path,
  435. resource_id=data_resource['resID'],
  436. ))
  437. error = True
  438. if is_multi_target is None:
  439. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing multi vs. simple column role.".format(
  440. dataset_path=dataset_description_path,
  441. resource_id=data_resource['resID'],
  442. ))
  443. error = True
  444. if 'directedEdgeTarget' in column_description['role']:
  445. if is_directed_target is None:
  446. is_directed_target = True
  447. elif is_directed_target != True:
  448. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting directed vs. undirected column roles.".format(
  449. dataset_path=dataset_description_path,
  450. resource_id=data_resource['resID'],
  451. ))
  452. error = True
  453. if 'undirectedEdgeTarget' in column_description['role']:
  454. if is_directed_target is None:
  455. is_directed_target = False
  456. elif is_directed_target != False:
  457. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting directed vs. undirected column roles.".format(
  458. dataset_path=dataset_description_path,
  459. resource_id=data_resource['resID'],
  460. ))
  461. error = True
  462. if is_directed_target is None:
  463. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing directed vs. undirected column role.".format(
  464. dataset_path=dataset_description_path,
  465. resource_id=data_resource['resID'],
  466. ))
  467. error = True
  468. if not found_source:
  469. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing edge source column role.".format(
  470. dataset_path=dataset_description_path,
  471. resource_id=data_resource['resID'],
  472. ))
  473. error = True
  474. if not found_target:
  475. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing edge target column role.".format(
  476. dataset_path=dataset_description_path,
  477. resource_id=data_resource['resID'],
  478. ))
  479. error = True
  480. if found_source and found_target:
  481. if is_directed_source != is_directed_target:
  482. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting directed vs. undirected column roles.".format(
  483. dataset_path=dataset_description_path,
  484. resource_id=data_resource['resID'],
  485. ))
  486. error = True
  487. if is_multi_source != is_multi_target:
  488. print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting multi vs. simple column roles.".format(
  489. dataset_path=dataset_description_path,
  490. resource_id=data_resource['resID'],
  491. ))
  492. error = True
  493. return error
  494. def get_file_extension(path):
  495. extension = os.path.splitext(path)[1]
  496. if extension:
  497. # We remove leading dot as returned from "splitext".
  498. return extension[1:]
  499. else:
  500. raise ValueError(f"Cannot get file extension of '{path}'.")
  501. def validate_dataset(dataset_description_path, dataset_description):
  502. error = False
  503. for data_resource in dataset_description['dataResources']:
  504. if os.path.splitext(os.path.basename(data_resource['resPath']))[0] == 'learningData' and data_resource['resID'] != 'learningData':
  505. print("ERROR: Dataset '{dataset_path}' has a dataset entry point without 'learningData' as resource's ID, but '{resource_id}'.".format(
  506. dataset_path=dataset_description_path,
  507. resource_id=data_resource['resID'],
  508. ))
  509. error = True
  510. if data_resource['resID'] == 'learningData':
  511. if data_resource.get('isCollection', False):
  512. print("ERROR: Dataset '{dataset_path}' has a dataset entry point which is a collection.".format(
  513. dataset_path=dataset_description_path,
  514. ))
  515. error = True
  516. if 'columns' not in data_resource:
  517. print("ERROR: Dataset '{dataset_path}' has a dataset entry point without columns metadata.".format(
  518. dataset_path=dataset_description_path,
  519. ))
  520. error = True
  521. if 'columns' in data_resource:
  522. index_columns = []
  523. multi_index_columns = []
  524. key_columns = []
  525. edgelist_columns = []
  526. for column_description in data_resource['columns']:
  527. if 'index' in column_description['role']:
  528. index_columns.append(column_description['colIndex'])
  529. if 'multiIndex' in column_description['role']:
  530. multi_index_columns.append(column_description['colIndex'])
  531. if 'key' in column_description['role']:
  532. key_columns.append(column_description['colIndex'])
  533. if any(edgelist_column_role in column_description['role'] for edgelist_column_role in EDGELIST_COLUMN_ROLES):
  534. edgelist_columns.append(column_description['colIndex'])
  535. index_columns_set = set(index_columns)
  536. multi_index_columns_set = set(multi_index_columns)
  537. key_columns_set = set(key_columns)
  538. if index_columns_set & multi_index_columns_set:
  539. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with columns being both index and multi-index at the same time: {index_columns}".format(
  540. dataset_path=dataset_description_path,
  541. resource_id=data_resource['resID'],
  542. index_columns=sorted(index_columns_set & multi_index_columns_set),
  543. ))
  544. error = True
  545. elif data_resource['resID'] == 'learningData' and len(index_columns) + len(multi_index_columns) == 0:
  546. print("ERROR: Dataset '{dataset_path}' has a dataset entry point with no index columns.".format(
  547. dataset_path=dataset_description_path,
  548. ))
  549. error = True
  550. elif len(index_columns) + len(multi_index_columns) > 1:
  551. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with multiple index columns: {index_columns}".format(
  552. dataset_path=dataset_description_path,
  553. resource_id=data_resource['resID'],
  554. index_columns=index_columns + multi_index_columns,
  555. ))
  556. error = True
  557. if index_columns_set & key_columns_set:
  558. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with columns being both index and key at the same time: {index_columns}".format(
  559. dataset_path=dataset_description_path,
  560. resource_id=data_resource['resID'],
  561. index_columns=sorted(index_columns_set & key_columns_set),
  562. ))
  563. error = True
  564. if multi_index_columns_set & key_columns_set:
  565. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with columns being both multi-index and key at the same time: {index_columns}".format(
  566. dataset_path=dataset_description_path,
  567. resource_id=data_resource['resID'],
  568. index_columns=sorted(multi_index_columns_set & key_columns_set),
  569. ))
  570. error = True
  571. if data_resource.get('isCollection', False):
  572. continue
  573. for column_index in index_columns:
  574. error = validate_column_values(dataset_description_path, data_resource, column_index, unique=True, no_missing=True) or error
  575. for column_index in multi_index_columns:
  576. error = validate_column_values(dataset_description_path, data_resource, column_index, unique=False, no_missing=True) or error
  577. for column_index in key_columns:
  578. error = validate_column_values(dataset_description_path, data_resource, column_index, unique=True, no_missing=False) or error
  579. for column_description in data_resource['columns']:
  580. if 'refersTo' in column_description and column_description['refersTo']['resObject'] == 'item':
  581. error = validate_files(dataset_description_path, data_resource, dataset_description, column_description['colIndex'], column_description['refersTo']['resID']) or error
  582. if edgelist_columns:
  583. error = validate_edgelist(dataset_description_path, data_resource) or error
  584. if len(multi_index_columns) == 1:
  585. error = validate_multi_index(dataset_description_path, data_resource, multi_index_columns[0]) or error
  586. for res_format in data_resource['resFormat'].keys():
  587. if res_format not in res_format_to_extensions:
  588. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with unsupported format: {res_format}".format(
  589. dataset_path=dataset_description_path,
  590. resource_id=data_resource['resID'],
  591. res_format=res_format,
  592. ))
  593. error = True
  594. if data_resource.get('isCollection', False):
  595. error = validate_collection(dataset_description_path, data_resource) or error
  596. else:
  597. if len(data_resource['resFormat']) == 1:
  598. file_extension = get_file_extension(data_resource['resPath'])
  599. # There should be only one resource format listed for non-collection resources.
  600. if file_extension not in list(data_resource['resFormat'].values())[0]:
  601. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with invalid resource path file extension: {file_extension}".format(
  602. dataset_path=dataset_description_path,
  603. resource_id=data_resource['resID'],
  604. file_extension=file_extension,
  605. ))
  606. error = True
  607. else:
  608. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with invalid number of listed formats: {count}".format(
  609. dataset_path=dataset_description_path,
  610. resource_id=data_resource['resID'],
  611. count=len(data_resource['resFormat']),
  612. ))
  613. error = True
  614. return error
  615. def validate_dataset_description(dataset_description_path, known_dataset_descriptions, *, strict_naming=True):
  616. print("Validating dataset '{dataset_description_path}'.".format(dataset_description_path=dataset_description_path))
  617. try:
  618. with open(dataset_description_path) as dataset_description_file:
  619. dataset_description = json.load(dataset_description_file)
  620. if not dataset_description_validator.validate(dataset_description):
  621. print("ERROR: Schema validation: {errors}".format(errors=dataset_description_validator.errors))
  622. return True
  623. dataset_id = dataset_description['about']['datasetID']
  624. # Handle a special case for SCORE dataset splits (those which have "targets.csv" file).
  625. # They are the same as TEST dataset splits, but we present them differently, so that
  626. # SCORE dataset splits have targets as part of data. Because of this we also update
  627. # corresponding dataset ID.
  628. # See: https://gitlab.com/datadrivendiscovery/d3m/issues/176
  629. if os.path.exists(os.path.join(os.path.dirname(dataset_description_path), '..', 'targets.csv')) and dataset_id.endswith('_TEST'):
  630. dataset_id = dataset_id[:-5] + '_SCORE'
  631. if dataset_id in known_dataset_descriptions:
  632. print("ERROR: Duplicate dataset ID '{dataset_id}': '{first_path}' and '{second_path}'".format(
  633. dataset_id=dataset_id,
  634. first_path=known_dataset_descriptions[dataset_id]['path'],
  635. second_path=dataset_description_path,
  636. ))
  637. return True
  638. known_dataset_descriptions[dataset_id] = {
  639. 'path': dataset_description_path,
  640. 'description': dataset_description,
  641. }
  642. if validate_dataset_path(dataset_id, dataset_description_path, strict_naming=strict_naming):
  643. return True
  644. #if 'digest' not in dataset_description['about']:
  645. # print("ERROR: Dataset '{dataset_path}' missing digest.".format(dataset_path=dataset_description_path))
  646. # return True
  647. if validate_dataset(dataset_description_path, dataset_description):
  648. return True
  649. except Exception:
  650. print("ERROR: Unexpected exception:")
  651. traceback.print_exc()
  652. return True
  653. return False
  654. def validate_problem_description(problem_description_path, known_problem_descriptions):
  655. print("Validating problem '{problem_description_path}'.".format(problem_description_path=problem_description_path))
  656. try:
  657. with open(problem_description_path) as problem_description_file:
  658. problem_description = json.load(problem_description_file)
  659. if not problem_description_validator.validate(problem_description):
  660. print("ERROR: Schema validation: {errors}".format(errors=problem_description_validator.errors))
  661. return True
  662. problem_id = problem_description['about']['problemID']
  663. # Handle a special case for SCORE dataset splits (those which have "targets.csv" file).
  664. # They are the same as TEST dataset splits, but we present them differently, so that
  665. # SCORE dataset splits have targets as part of data. Because of this we also update
  666. # corresponding problem ID.
  667. # See: https://gitlab.com/datadrivendiscovery/d3m/issues/176
  668. if os.path.exists(os.path.join(os.path.dirname(problem_description_path), '..', 'targets.csv')) and problem_id.endswith('_TEST'):
  669. problem_id = problem_id[:-5] + '_SCORE'
  670. # Also update dataset references.
  671. for data in problem_description.get('inputs', {}).get('data', []):
  672. if data['datasetID'].endswith('_TEST'):
  673. data['datasetID'] = data['datasetID'][:-5] + '_SCORE'
  674. # All problem descriptions for the same problem should be the same.
  675. if problem_id.endswith('_TRAIN') or problem_id.endswith('_TEST') or problem_id.endswith('_SCORE'):
  676. print("ERROR: Invalid problem ID '{problem_id}' in '{problem_description_path}'.".format(
  677. problem_id=problem_id,
  678. problem_description_path=problem_description_path,
  679. ))
  680. return True
  681. if problem_id in known_problem_descriptions:
  682. # Problem descriptions with same ID should have the same content.
  683. if problem_description == known_problem_descriptions[problem_id]['description']:
  684. known_problem_descriptions[problem_id]['paths'].append(problem_description_path)
  685. else:
  686. print("ERROR: Duplicate problem ID '{problem_id}', but different problem description: {first_paths} and '{second_path}'".format(
  687. problem_id=problem_id,
  688. first_paths=known_problem_descriptions[problem_id]['paths'],
  689. second_path=problem_description_path,
  690. ))
  691. return True
  692. else:
  693. known_problem_descriptions[problem_id] = {
  694. 'paths': [problem_description_path],
  695. 'description': problem_description,
  696. }
  697. if os.path.basename(problem_description_path) != 'problemDoc.json':
  698. print("ERROR: Problem description filename '{problem_description_path}' is not 'problemDoc.json'.".format(
  699. problem_description_path=problem_description_path,
  700. ))
  701. return True
  702. if validate_metrics(problem_description):
  703. return True
  704. if validate_keywords(problem_description):
  705. return True
  706. split_path = os.path.dirname(problem_description_path).split(os.sep)
  707. for split_directory in ['problem_TRAIN', 'problem_TEST', 'problem_SCORE']:
  708. if split_directory in split_path and 'datasetViewMaps' not in problem_description.get('inputs', {}).get('dataSplits', {}):
  709. print("ERROR: Problem '{problem_description_path}' is missing dataset view maps.".format(
  710. problem_description_path=problem_description_path,
  711. ))
  712. return True
  713. except Exception:
  714. print("ERROR: Unexpected exception:")
  715. traceback.print_exc()
  716. return True
  717. return False
  718. def validate_column_values(dataset_description_path, data_resource, column_index, *, unique, no_missing):
  719. error = False
  720. data_path = os.path.join(os.path.dirname(dataset_description_path), data_resource['resPath'])
  721. data = read_csv(data_path)
  722. column_values = data.iloc[:, column_index]
  723. # We assume missing values are represented as empty strings.
  724. column_values_without_missing = column_values[column_values != '']
  725. # There should not be any NA values left at this point anyway.
  726. value_counts = column_values_without_missing.value_counts(dropna=True)
  727. if unique and (value_counts > 1).sum():
  728. duplicate = list(value_counts[value_counts > 1].keys())
  729. if LIMIT_OUTPUT is not None:
  730. duplicate = duplicate[:LIMIT_OUTPUT]
  731. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with column {column_index} which should have unique values but it does not. Example duplicate values: {duplicate}".format(
  732. dataset_path=dataset_description_path,
  733. resource_id=data_resource['resID'],
  734. column_index=column_index,
  735. duplicate=duplicate,
  736. ))
  737. error = True
  738. if no_missing and len(column_values) != len(column_values_without_missing):
  739. print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with column {column_index} which should have no missing values but it does have them.".format(
  740. dataset_path=dataset_description_path,
  741. resource_id=data_resource['resID'],
  742. column_index=column_index,
  743. ))
  744. error = True
  745. return error
  746. def validate_target_values(problem_paths, dataset_path, problem_description, data_resource, target):
  747. error = False
  748. data_path = os.path.join(os.path.dirname(dataset_path), data_resource['resPath'])
  749. data = read_csv(data_path)
  750. target_values = data.iloc[:, target['colIndex']]
  751. distinct_values = list(target_values.value_counts(dropna=False).keys())
  752. number_distinct_values = len(distinct_values)
  753. # We assume missing values are represented as empty strings.
  754. has_missing_values = '' in distinct_values
  755. if has_missing_values:
  756. # We do not count missing values as distinct values.
  757. number_distinct_values -= 1
  758. task_keywords = set(problem_description['about']['taskKeywords'])
  759. if 'binary' in task_keywords:
  760. if number_distinct_values != 2:
  761. print("ERROR: Problem {problem_paths} has 'binary' keyword, but target column does not have 2 distinct values, but {number_distinct_values}.".format(
  762. problem_paths=problem_paths,
  763. number_distinct_values=number_distinct_values,
  764. ))
  765. error = True
  766. elif 'multiClass' in task_keywords:
  767. if number_distinct_values < 3:
  768. print("ERROR: Problem {problem_paths} has 'multiClass' keyword, but target column does not have more than 2 distinct values, but {number_distinct_values}.".format(
  769. problem_paths=problem_paths,
  770. number_distinct_values=number_distinct_values,
  771. ))
  772. error = True
  773. for metric in problem_description.get('inputs', {}).get('performanceMetrics', []):
  774. if metric['metric'] in ['f1', 'precision', 'recall', 'jaccardSimilarityScore']:
  775. if number_distinct_values != 2:
  776. print("ERROR: Problem {problem_paths} uses '{metric}' metric, but target column does not have 2 distinct values, but {number_distinct_values}.".format(
  777. problem_paths=problem_paths,
  778. metric=metric['metric'],
  779. number_distinct_values=number_distinct_values,
  780. ))
  781. error = True
  782. if 'posLabel' in metric and metric['posLabel'] not in distinct_values:
  783. print("ERROR: Problem {problem_paths} provides 'posLabel' for metric '{metric}' with value '{value}', but possible values are: {distinct_values}".format(
  784. problem_paths=problem_paths,
  785. metric=metric['metric'],
  786. value=metric['posLabel'],
  787. distinct_values=sorted(distinct_values),
  788. ))
  789. error = True
  790. if has_missing_values and not task_keywords & {'semiSupervised', 'clustering'}:
  791. print("ERROR: Problem {problem_paths} has target column with missing values, but it not a semi-supervised or clustering task.".format(
  792. problem_paths=problem_paths,
  793. ))
  794. error = True
  795. if 'semiSupervised' in task_keywords and not has_missing_values:
  796. print("ERROR: Problem {problem_paths} is a semi-supervised task, but does not have a target column with missing values.".format(
  797. problem_paths=problem_paths,
  798. ))
  799. error = True
  800. return error
  801. def get_all_columns(dataset_path, resource_id, data_resource):
  802. data_path = os.path.join(os.path.dirname(dataset_path), data_resource['resPath'])
  803. data = read_csv(data_path)
  804. data_columns = [{
  805. 'colIndex': column_index,
  806. 'colName': column_name,
  807. 'colType': 'unknown',
  808. 'role': []
  809. } for column_index, column_name in enumerate(data.columns)]
  810. columns = data_resource.get('columns', None)
  811. if columns is None:
  812. return data_columns
  813. if 'columnsCount' in data_resource and data_resource['columnsCount'] != len(data_columns):
  814. raise ValueError("Dataset '{dataset_path}' has resource '{resource_id}' with incorrect columns count {columns_count} (correct {correct_count}).".format(
  815. dataset_path=dataset_path,
  816. resource_id=resource_id,
  817. columns_count=data_resource['columnsCount'],
  818. correct_count=len(data_columns),
  819. ))
  820. if len(columns) >= len(data_columns):
  821. columns_names = [{'colIndex': c['colIndex'], 'colName': c['colName']} for c in columns]
  822. data_columns_names = [{'colIndex': c['colIndex'], 'colName': c['colName']} for c in data_columns]
  823. if columns_names != data_columns_names:
  824. raise ValueError("Dataset '{dataset_path}' has resource '{resource_id}' where metadata columns do not match data columns.".format(
  825. dataset_path=dataset_path,
  826. resource_id=resource_id,
  827. ))
  828. return columns
  829. else:
  830. for column in columns:
  831. if column['colName'] != data_columns[column['colIndex']]['colName']:
  832. raise ValueError("Dataset '{dataset_path}' has resource '{resource_id}' where column name '{metadata_name}' in metadata does not match column name '{data_name}' in data.".format(
  833. dataset_path=dataset_path,
  834. resource_id=resource_id,
  835. metadata_name=column['colName'],
  836. data_name=data_columns[column['colIndex']]['colName'],
  837. ))
  838. data_columns[column['colIndex']] = column
  839. return data_columns
  840. def validate_target(problem_paths, dataset_path, problem_description, dataset_description, target, check_target_values):
  841. error = False
  842. try:
  843. for data_resource in dataset_description['dataResources']:
  844. if data_resource['resID'] == target['resID']:
  845. columns = get_all_columns(dataset_path, data_resource['resID'], data_resource)
  846. for column in columns:
  847. if target['colName'] == column['colName'] or target['colIndex'] == column['colIndex']:
  848. if not (target['colName'] == column['colName'] and target['colIndex'] == column['colIndex']):
  849. print("ERROR: Problem {problem_paths} has a target '{target_index}' which does not match a column '{column_index}' in dataset '{dataset_path}' fully.".format(
  850. problem_paths=problem_paths,
  851. target_index=target['targetIndex'],
  852. column_index=column['colIndex'],
  853. dataset_path=dataset_path,
  854. ))
  855. error = True
  856. if check_target_values:
  857. error = validate_target_values(problem_paths, dataset_path, problem_description, data_resource, target) or error
  858. break
  859. else:
  860. raise KeyError("Cannot find column with column name '{column_name}' or column index '{column_index}'.".format(
  861. column_name=target['colName'],
  862. column_index=target['colIndex'],
  863. ))
  864. break
  865. else:
  866. raise KeyError("Cannot find data resource with resource ID '{resource_id}'.".format(
  867. resource_id=target['resID'],
  868. ))
  869. except (IndexError, KeyError):
  870. print("ERROR: Problem {problem_paths} has target with index '{target_index}' which does not resolve.".format(
  871. problem_paths=problem_paths,
  872. target_index=target['targetIndex'],
  873. ))
  874. return True
  875. except ValueError as error:
  876. print("ERROR: {error}".format(
  877. error=error,
  878. ))
  879. return True
  880. return error
  881. def canonical_dataset_description(dataset_description):
  882. dataset_description = copy.deepcopy(dataset_description)
  883. del dataset_description['about']['datasetID']
  884. if 'digest' in dataset_description['about']:
  885. del dataset_description['about']['digest']
  886. return dataset_description
  887. def datasets_equal(first_dataset_path, second_dataset_path):
  888. if first_dataset_path == second_dataset_path:
  889. return True
  890. first_dataset_base_path = os.path.dirname(first_dataset_path)
  891. second_dataset_base_path = os.path.dirname(second_dataset_path)
  892. dir_comparison = deep_dircmp.DeepDirCmp(first_dataset_base_path, second_dataset_base_path, hide=[], ignore=[])
  893. different_files = dir_comparison.get_left_only_recursive() + dir_comparison.get_right_only_recursive() + dir_comparison.get_common_funny_recursive() + dir_comparison.get_diff_files_recursive()
  894. # This one can be different. And if it is different, we compare it elsewhere for allowed differences.
  895. if 'datasetDoc.json' in different_files:
  896. different_files.remove('datasetDoc.json')
  897. if different_files:
  898. print("ERROR: Dataset '{first_dataset_path}' and dataset '{second_dataset_path}' are not the same: {differences}".format(
  899. first_dataset_path=first_dataset_path,
  900. second_dataset_path=second_dataset_path,
  901. differences=different_files,
  902. ))
  903. return False
  904. return True
  905. def validate_dataset_reference(dataset_id, dataset_descriptions, targets, problem_description_value, check_target_values):
  906. error = False
  907. if dataset_id not in dataset_descriptions:
  908. print("ERROR: Problem {problem_paths} is referencing unknown dataset '{dataset_id}'.".format(
  909. problem_paths=problem_description_value['paths'],
  910. dataset_id=dataset_id,
  911. ))
  912. error = True
  913. else:
  914. dataset_description_value = dataset_descriptions[dataset_id]
  915. dataset_description = dataset_description_value['description']
  916. for i, target in enumerate(targets):
  917. if target['targetIndex'] != i:
  918. print("ERROR: Problem {problem_paths} has target with invalid target index '{target_index}'.".format(
  919. problem_paths=problem_description_value['paths'],
  920. target_index=target['targetIndex'],
  921. ))
  922. error = True
  923. error = validate_target(problem_description_value['paths'], dataset_description_value['path'], problem_description_value['description'], dataset_description, target, check_target_values) or error
  924. return error
  925. def map_dataset_id(dataset_id, dataset_view_map):
  926. for view_map in dataset_view_map:
  927. if view_map['from'] == dataset_id:
  928. return view_map['to']
  929. else:
  930. raise KeyError("Could not map '{dataset_id}' in dataset view map.".format(dataset_id=dataset_id))
  931. def validate(dataset_descriptions, problem_descriptions):
  932. print("Validating all datasets and problems.")
  933. error = False
  934. dataset_description_groups = collections.defaultdict(list)
  935. for problem_description_value in problem_descriptions.values():
  936. problem_description = problem_description_value['description']
  937. for data in problem_description.get('inputs', {}).get('data', []):
  938. error = validate_dataset_reference(data['datasetID'], dataset_descriptions, data.get('targets', []), problem_description_value, True) or error
  939. if 'datasetViewMaps' in problem_description.get('inputs', {}).get('dataSplits', {}):
  940. if {'train', 'test', 'score'} != set(problem_description['inputs']['dataSplits']['datasetViewMaps'].keys()):
  941. print("ERROR: Problem {problem_paths} has dataset view maps with invalid keys.".format(
  942. problem_paths=problem_description_value['paths'],
  943. ))
  944. error = True
  945. else:
  946. error = validate_dataset_reference(map_dataset_id(data['datasetID'], problem_description['inputs']['dataSplits']['datasetViewMaps']['train']), dataset_descriptions, data.get('targets', []), problem_description_value, True) or error
  947. # Test and score splits do not have all values, so we do not validate target values there.
  948. error = validate_dataset_reference(map_dataset_id(data['datasetID'], problem_description['inputs']['dataSplits']['datasetViewMaps']['test']), dataset_descriptions, data.get('targets', []), problem_description_value, False) or error
  949. error = validate_dataset_reference(map_dataset_id(data['datasetID'], problem_description['inputs']['dataSplits']['datasetViewMaps']['score']), dataset_descriptions, data.get('targets', []), problem_description_value, False) or error
  950. if 'clustering' in problem_description['about']['taskKeywords']:
  951. for data in problem_description.get('inputs', {}).get('data', []):
  952. for target in data.get('targets', []):
  953. if 'numClusters' not in target:
  954. print("ERROR: Problem {problem_paths} is a clustering problem but is missing 'numClusters' in target '{target_index}'.".format(
  955. problem_paths=problem_description_value['paths'],
  956. target_index=target['targetIndex'],
  957. ))
  958. error = True
  959. if 'dataSplits' in problem_description['inputs'] and set(problem_description['inputs']['dataSplits'].keys()) - {'datasetViewMaps'}:
  960. print("ERROR: Problem {problem_paths} is a clustering problem with data splitting configuration, but it should not have one.".format(
  961. problem_paths=problem_description_value['paths'],
  962. ))
  963. error = True
  964. for dataset_description_value in dataset_descriptions.values():
  965. dataset_description = dataset_description_value['description']
  966. dataset_id = dataset_description['about']['datasetID']
  967. for suffix in ['_TEST', '_TRAIN', '_SCORE']:
  968. if dataset_id.endswith(suffix):
  969. dataset_description_groups[dataset_id[:-len(suffix)]].append(dataset_description_value)
  970. break
  971. for problem_description_value in problem_descriptions.values():
  972. problem_description = problem_description_value['description']
  973. # If any clustering problem is using dataset splits, we validate those splits.
  974. if 'clustering' in problem_description['about']['taskKeywords']:
  975. for data in problem_description.get('inputs', {}).get('data', []):
  976. # We check this elsewhere.
  977. if data['datasetID'] not in dataset_descriptions:
  978. continue
  979. dataset_id = data['datasetID']
  980. for suffix in ['_TEST', '_TRAIN', '_SCORE']:
  981. if dataset_id.endswith(suffix):
  982. base_dataset_id = dataset_id[:-len(suffix)]
  983. break
  984. else:
  985. base_dataset_id = dataset_id
  986. # There should always be at least one dataset.
  987. datasets = dataset_description_groups[base_dataset_id]
  988. if len(datasets) > 1:
  989. first_dataset_path = datasets[0]['path']
  990. for second_dataset_value in datasets[1:]:
  991. second_dataset_path = second_dataset_value['path']
  992. if not datasets_equal(first_dataset_path, second_dataset_path):
  993. print("ERROR: Problem {problem_paths} is a clustering problem, but its data splits are not all the same, for example, {first_dataset_path} and {second_dataset_path}.".format(
  994. problem_paths=problem_description_value['paths'],
  995. first_dataset_path=first_dataset_path,
  996. second_dataset_path=second_dataset_path,
  997. ))
  998. error = True
  999. break
  1000. for dataset_description_group in dataset_description_groups.values():
  1001. first_dataset_description_value = dataset_description_group[0]
  1002. first_dataset_description = canonical_dataset_description(first_dataset_description_value['description'])
  1003. for dataset_description_value in dataset_description_group[1:]:
  1004. dataset_description = canonical_dataset_description(dataset_description_value['description'])
  1005. if first_dataset_description != dataset_description:
  1006. print("ERROR: Dataset '{first_dataset_path}' and dataset '{dataset_path}' are not the same.".format(
  1007. first_dataset_path=first_dataset_description_value['path'],
  1008. dataset_path=dataset_description_value['path'],
  1009. ))
  1010. error = True
  1011. return error
  1012. def search_directory(datasets_directory, known_dataset_descriptions, known_problem_descriptions, *, strict_naming=True):
  1013. error = False
  1014. datasets_directory = os.path.abspath(datasets_directory)
  1015. for dirpath, dirnames, filenames in os.walk(datasets_directory, followlinks=True):
  1016. if 'datasetDoc.json' in filenames:
  1017. # Do not traverse further (so that we do not parse any "datasetDoc.json" files
  1018. # which might exist among the raw data files).
  1019. dirnames[:] = []
  1020. dataset_description_path = os.path.join(dirpath, 'datasetDoc.json')
  1021. error = validate_dataset_description(dataset_description_path, known_dataset_descriptions, strict_naming=strict_naming) or error
  1022. if 'problemDoc.json' in filenames:
  1023. # We continue traversing further in this case.
  1024. problem_description_path = os.path.join(dirpath, 'problemDoc.json')
  1025. error = validate_problem_description(problem_description_path, known_problem_descriptions) or error
  1026. return error
  1027. def configure_parser(parser: argparse.ArgumentParser, *, skip_arguments=()):
  1028. if 'no_strict_naming' not in skip_arguments:
  1029. parser.add_argument(
  1030. '-n', '--no-strict-naming', default=True, action='store_false', dest='strict_naming',
  1031. help="do not require strict naming convention",
  1032. )
  1033. if 'directories' not in skip_arguments:
  1034. parser.add_argument(
  1035. 'directories', metavar='DIR', nargs='*', default=['.'],
  1036. help="path to a directory with datasets, default is current directory",
  1037. )
  1038. def handler(arguments):
  1039. error = False
  1040. known_dataset_descriptions = {}
  1041. known_problem_descriptions = {}
  1042. for datasets_directory in arguments.directories:
  1043. error = search_directory(datasets_directory, known_dataset_descriptions, known_problem_descriptions, strict_naming=arguments.strict_naming) or error
  1044. error = validate(known_dataset_descriptions, known_problem_descriptions) or error
  1045. if error:
  1046. print("There are ERRORS.")
  1047. sys.exit(1)
  1048. else:
  1049. print("There are no errors.")
  1050. def main(argv):
  1051. parser = argparse.ArgumentParser(description="Validate datasets.")
  1052. configure_parser(parser)
  1053. arguments = parser.parse_args(argv[1:])
  1054. handler(arguments)
  1055. if __name__ == '__main__':
  1056. main(sys.argv)

A full-stack automated machine learning system, mainly targeting anomaly detection on multivariate time-series data. TODS provides comprehensive modules for building machine-learning-based anomaly detection systems, including: data processing, time series processing, feature analysis, detection algorithms, and a reinforcement module. The functionality provided by these modules includes common data preprocessing, smoothing and transformation of time series data, feature extraction from the time or frequency domain, and a wide variety of detection algorithms.
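For reference, a minimal sketch of how this validator might be invoked programmatically, assuming validate.py is importable and the "data-supply" repository has been cloned next to it; the "datasets/" path is a placeholder:

import validate

# Equivalent to running "python3 validate.py --no-strict-naming datasets/" on the
# command line; main() parses the arguments, validates all datasets and problems
# found under the given directories, and exits with status 1 if any check fails.
validate.main(["validate.py", "--no-strict-naming", "datasets/"])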