"""Download the GECCO 2018 water quality archive and export a labelled CSV."""

import glob
import os
import zipfile

import pandas as pd
import requests

# Location of the downloaded archive and its extracted contents.
RAW_DATA_DIR = "./raw_data"
ARCHIVE_URL = "https://ndownloader.figshare.com/articles/12451142/versions/1"
ARCHIVE_PATH = os.path.join(RAW_DATA_DIR, "gecco.zip")
# Nested archive shipped inside the download; it is not needed afterwards.
NESTED_ARCHIVE_PATH = os.path.join(
    RAW_DATA_DIR, "4_ResourcePackage_GECCO_Industrial_Challenge_2018.zip"
)
RAW_CSV_PATH = os.path.join(RAW_DATA_DIR, "1_gecco2018_water_quality.csv")
OUTPUT_CSV_PATH = "../water_quality.csv"

# Seconds to wait for the server before giving up instead of hanging forever.
DOWNLOAD_TIMEOUT = 60


def _download_raw_data():
    """Fetch the archive, unpack it into ``RAW_DATA_DIR`` and drop extras.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        zipfile.BadZipFile: if the downloaded payload is not a valid archive.
    """
    # The target directory may not exist on a fresh checkout.
    os.makedirs(RAW_DATA_DIR, exist_ok=True)

    response = requests.get(ARCHIVE_URL, timeout=DOWNLOAD_TIMEOUT)
    # Fail loudly rather than writing an HTML error page to disk as a "zip".
    response.raise_for_status()
    with open(ARCHIVE_PATH, "wb") as archive_file:
        archive_file.write(response.content)

    # Pure-Python extraction: portable, non-interactive, and it raises on a
    # corrupt archive instead of silently returning a shell exit code.
    with zipfile.ZipFile(ARCHIVE_PATH) as archive:
        archive.extractall(RAW_DATA_DIR)

    # Best-effort cleanup of files that are not needed for preprocessing.
    unwanted = glob.glob(os.path.join(RAW_DATA_DIR, "*.pdf"))
    unwanted.append(NESTED_ARCHIVE_PATH)
    for path in unwanted:
        try:
            os.remove(path)
        except FileNotFoundError:
            # Already absent; nothing to clean up.
            pass


def _build_labeled_frame(df):
    """Return the cleaned frame with a leading ``label`` column.

    Rows containing NaN are removed, the ``Time`` column and the first
    column of the file are dropped, the boolean ``EVENT`` column is mapped
    to the strings ``"0"``/``"1"``, renamed to ``label`` and moved to the
    front.

    Args:
        df: Frame as read from the raw CSV; must contain ``Time`` and a
            trailing ``EVENT`` column.

    Returns:
        A new ``pandas.DataFrame``; the input is not modified.
    """
    cleaned = df.dropna()
    # Drop the timestamp and the first column of the file.
    cleaned = cleaned.drop(columns=["Time", cleaned.columns[0]])

    # Rotate the last column to the front.
    columns = cleaned.columns.tolist()
    columns = columns[-1:] + columns[:-1]

    # ``assign`` builds a new frame, avoiding assignment on a selection.
    cleaned = cleaned.assign(
        EVENT=cleaned["EVENT"].map({False: "0", True: "1"})
    )
    return cleaned[columns].rename(columns={"EVENT": "label"})


def preprocess_gecco():
    """Download the raw data and write the preprocessed CSV.

    The result is written to ``../water_quality.csv`` without the index,
    encoded as UTF-8.
    """
    _download_raw_data()
    raw = pd.read_csv(RAW_CSV_PATH)
    labeled = _build_labeled_frame(raw)
    labeled.to_csv(OUTPUT_CSV_PATH, index=False, encoding="utf-8")


if __name__ == "__main__":
    preprocess_gecco()