# TODO: Wrap this script in a class and connect it to the GUI.
# A script that transforms raw anomaly detection data into the D3M dataset format.
import pandas as pd
import os
import json

##############################
# Configuration for the dataset to be transformed.
# Designed for time series data.
name = 'kpi'
src_path = './raw_data/kpi.csv'
label_name = 'label'
timestamp_name = 'timestamp'
value_names = ['value']
ratio = 0.8  # Fraction of rows used for training; the rest is used for testing.
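
# A few sanity checks on the configuration (a minimal sketch; the column
# names above are assumptions that must match the headers in src_path):
assert 0.0 < ratio < 1.0, 'ratio must be a fraction strictly between 0 and 1'
assert os.path.isfile(src_path), 'source CSV not found: ' + src_path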
##############################

dst_root = './' + name
# Map each output subdirectory to its split suffix (None = the full dataset).
dirs = ['./', 'SCORE', 'TEST', 'TRAIN']
maps = {'./': None, 'SCORE': 'TEST', 'TEST': 'TEST', 'TRAIN': 'TRAIN'}

# Create the corresponding directories.
for d in dirs:
    if maps[d] is not None:
        dataset_name = 'dataset_' + maps[d]
        problem_name = 'problem_' + maps[d]
    else:
        dataset_name = name + '_dataset'
        problem_name = name + '_problem'
    os.makedirs(os.path.join(dst_root, d, dataset_name, 'tables'), exist_ok=True)
    os.makedirs(os.path.join(dst_root, d, problem_name), exist_ok=True)
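
# For the default name = 'kpi', the loop above yields this layout:
#   kpi/kpi_dataset/tables/            kpi/kpi_problem/
#   kpi/TRAIN/dataset_TRAIN/tables/    kpi/TRAIN/problem_TRAIN/
#   kpi/TEST/dataset_TEST/tables/      kpi/TEST/problem_TEST/
#   kpi/SCORE/dataset_TEST/tables/     kpi/SCORE/problem_TEST/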

# Rebuild the table in D3M order: d3mIndex first, values in between, label last.
_df = pd.DataFrame()
df = pd.read_csv(src_path)
_df['d3mIndex'] = df.index
_df['timestamp'] = df[timestamp_name]
for value_name in value_names:
    _df[value_name] = df[value_name]
_df['ground_truth'] = df[label_name]
df = _df
cols = df.columns.tolist()
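
# The chronological split below assumes the rows are already in time order.
# If the source CSV may be unsorted, uncommenting this keeps the split valid
# (an optional safeguard, not part of the original pipeline):
# df = df.sort_values('timestamp').reset_index(drop=True)
# df['d3mIndex'] = df.index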

# Save the complete table.
df.to_csv(os.path.join(dst_root, name + '_dataset', 'tables', 'learningData.csv'), index=False)

# Chronological train/test split (no shuffling, since the data is a time series).
train_df, test_df = df[:int(df.shape[0] * ratio)], df[int(df.shape[0] * ratio):]

train_df.to_csv(os.path.join(dst_root, 'TRAIN', 'dataset_TRAIN', 'tables', 'learningData.csv'), index=False)
test_df.to_csv(os.path.join(dst_root, 'TEST', 'dataset_TEST', 'tables', 'learningData.csv'), index=False)
# SCORE reuses the TEST table: scoring runs against the same held-out rows.
test_df.to_csv(os.path.join(dst_root, 'SCORE', 'dataset_TEST', 'tables', 'learningData.csv'), index=False)

# Build the dataSplits table: one row per record, typed TRAIN or TEST.
row_0 = train_df.shape[0]
row_1 = test_df.shape[0]
row = row_0 + row_1
df = pd.DataFrame({
    'd3mIndex': range(row),
    'type': ['TRAIN'] * row_0 + ['TEST'] * row_1,
    'repeat': [0] * row,
    'fold': [0] * row,
})
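
# dataSplits.csv then looks like this (illustrative first rows):
#   d3mIndex,type,repeat,fold
#   0,TRAIN,0,0
#   1,TRAIN,0,0
#   ...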

# Save the data splits covering the full dataset.
df.to_csv(os.path.join(dst_root, name + '_problem', 'dataSplits.csv'), index=False)

# Save the training and testing splits.
train_df, test_df = df[:row_0], df[row_0:]
train_df.to_csv(os.path.join(dst_root, 'TRAIN', 'problem_TRAIN', 'dataSplits.csv'), index=False)
test_df.to_csv(os.path.join(dst_root, 'TEST', 'problem_TEST', 'dataSplits.csv'), index=False)
test_df.to_csv(os.path.join(dst_root, 'SCORE', 'problem_TEST', 'dataSplits.csv'), index=False)

# Dataset JSON files
# Load the template and fill in the per-column metadata.
with open('template/datasetDoc.json') as json_file:
    data = json.load(json_file)
columns = []
for i, col_name in enumerate(cols):
    c = {'colIndex': i, 'colName': col_name}
    if i == 0:  # d3mIndex
        c['colType'] = 'integer'
        c['role'] = ['index']
    elif i == 1:  # timestamp
        c['colType'] = 'integer'
        c['role'] = ['attribute']
    elif i == len(cols) - 1:  # ground-truth label
        c['colType'] = 'integer'
        c['role'] = ['suggestedTarget']
    else:  # value columns
        c['colType'] = 'real'
        c['role'] = ['attribute']
    columns.append(c)
data['dataResources'][0]['columns'] = columns
data['dataResources'][0]['columnsCount'] = len(cols)
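
# Each entry in `columns` is a dict like (illustrative, for the default config):
#   {'colIndex': 0, 'colName': 'd3mIndex', 'colType': 'integer', 'role': ['index']}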

data['about']['datasetID'] = name + '_dataset'
data['about']['datasetName'] = name
with open(os.path.join(dst_root, name + '_dataset', 'datasetDoc.json'), 'w') as outfile:
    json.dump(data, outfile, indent=4)

data['about']['datasetID'] = name + '_dataset_TRAIN'
data['about']['datasetName'] = 'NULL'
with open(os.path.join(dst_root, 'TRAIN', 'dataset_TRAIN', 'datasetDoc.json'), 'w') as outfile:
    json.dump(data, outfile, indent=4)

data['about']['datasetID'] = name + '_dataset_TEST'
data['about']['datasetName'] = 'NULL'
with open(os.path.join(dst_root, 'TEST', 'dataset_TEST', 'datasetDoc.json'), 'w') as outfile:
    json.dump(data, outfile, indent=4)

data['about']['datasetID'] = name + '_dataset_TEST'
data['about']['datasetName'] = 'NULL'
with open(os.path.join(dst_root, 'SCORE', 'dataset_TEST', 'datasetDoc.json'), 'w') as outfile:
    json.dump(data, outfile, indent=4)

# Problem JSON files
# Load the template and fill in the problem metadata.
with open('template/problemDoc.json') as json_file:
    data = json.load(json_file)

data['about']['problemID'] = name + '_problem'
data['about']['problemName'] = name + '_problem'
data['about']['problemDescription'] = 'Anomaly detection'
data['about']['taskKeywords'] = ['classification', 'binary', 'tabular']
data['inputs']['data'][0]['datasetID'] = name + '_dataset'
data['inputs']['data'][0]['targets'][0]['colIndex'] = len(cols) - 1
data['inputs']['data'][0]['targets'][0]['colName'] = cols[-1]
data['inputs']['dataSplits']['datasetViewMaps']['train'][0]['from'] = name + '_dataset'
data['inputs']['dataSplits']['datasetViewMaps']['test'][0]['from'] = name + '_dataset'
data['inputs']['dataSplits']['datasetViewMaps']['score'][0]['from'] = name + '_dataset'
data['inputs']['dataSplits']['datasetViewMaps']['train'][0]['to'] = name + '_dataset_TRAIN'
data['inputs']['dataSplits']['datasetViewMaps']['test'][0]['to'] = name + '_dataset_TEST'
# The SCORE split reuses the TEST dataset ID written above, so map it there too.
data['inputs']['dataSplits']['datasetViewMaps']['score'][0]['to'] = name + '_dataset_TEST'

with open(os.path.join(dst_root, name + '_problem', 'problemDoc.json'), 'w') as outfile:
    json.dump(data, outfile, indent=4)

with open(os.path.join(dst_root, 'TRAIN', 'problem_TRAIN', 'problemDoc.json'), 'w') as outfile:
    json.dump(data, outfile, indent=4)

with open(os.path.join(dst_root, 'TEST', 'problem_TEST', 'problemDoc.json'), 'w') as outfile:
    json.dump(data, outfile, indent=4)

with open(os.path.join(dst_root, 'SCORE', 'problem_TEST', 'problemDoc.json'), 'w') as outfile:
    json.dump(data, outfile, indent=4)

# Make an empty targets.csv placeholder for the scoring harness.
with open(os.path.join(dst_root, 'SCORE', 'targets.csv'), 'w') as outfile:
    outfile.write('')
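
# If the scoring harness expects ground truth in targets.csv, it could be
# populated from the held-out table instead (a sketch, not the original behavior):
# score_table = pd.read_csv(os.path.join(dst_root, 'SCORE', 'dataset_TEST', 'tables', 'learningData.csv'))
# score_table[['d3mIndex', 'ground_truth']].to_csv(os.path.join(dst_root, 'SCORE', 'targets.csv'), index=False)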