You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

cicids.py 1.7 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243
import os
import shutil
import zipfile

import numpy as np
import pandas as pd
import requests
  5. def preprocess_web_attack():
  6. def get_data():
  7. link="http://205.174.165.80/CICDataset/CIC-IDS-2017/Dataset/MachineLearningCSV.zip"
  8. r = requests.get(link)
  9. with open('./raw_data/cicids.zip', 'wb') as f:
  10. f.write(r.content)
  11. os.system("unzip ./raw_data/cicids.zip -d ./raw_data")
  12. os.system("mv ./raw_data/MachineLearningCSV/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv ./raw_data/cicids.csv")
  13. os.system("rm -r MachineLearningCSV")
  14. os.system("rm cicids.zip")
  15. if get_data == True:
  16. get_data()
  17. #df = pd.read_csv("./raw_data/MachineLearningCSV/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv")
  18. df = pd.read_csv("./raw_data/cicids.csv")
  19. df.replace([float('inf'), 'Infinity',''], np.nan, inplace=True)
  20. df = df.dropna()
  21. #df = df.sample(frac=0.05, replace=False, random_state=1)
  22. df[' Timestamp'] = pd.to_datetime(df[' Timestamp'], infer_datetime_format=True)
  23. df = df.sort_values(by=[' Timestamp'])
  24. # drop nan and str columns
  25. drop_cols = list(df.columns)[0:5]
  26. drop_cols = list(df.columns)[0:5]
  27. drop_cols.append(list(df.columns)[6])
  28. df = df.drop(columns=drop_cols)
  29. # relabeing and put label in the first column
  30. df[' Label'] = df[' Label'].map({'BENIGN':"0", "Web Attack Brute Force": "1","Web Attack Sql Injection": "1", "Web Attack XSS": "1"})
  31. cols = df.columns.tolist()
  32. cols = cols[-1:] + cols[:-1]
  33. df = df[cols]
  34. df.to_csv("../web_attack.csv", index=False, encoding='utf-8')
  35. if __name__ == "__main__":
  36. preprocess_web_attack()

全栈的自动化机器学习系统,主要针对多变量时间序列数据的异常检测。TODS提供了详尽的用于构建基于机器学习的异常检测系统的模块,它们包括:数据处理(data processing),时间序列处理(time series processing),特征分析(feature analysis),检测算法(detection algorithms),和强化模块(reinforcement module)。这些模块所提供的功能包括常见的数据预处理、时间序列数据的平滑或变换、从时域或频域中抽取特征、多种多样的检测算