|
import os
import shutil
import zipfile

import numpy as np
import pandas as pd
import requests
-
-
def preprocess_web_attack(*, download=False):
    """Turn the raw CIC-IDS-2017 web-attack capture into a binary-labelled CSV.

    The function reads ``./raw_data/cicids.csv``, removes rows that contain
    missing or infinite values, orders the remaining rows by the
    `` Timestamp`` column, drops the columns at positions 0-4 and 6, recodes
    `` Label`` to ``"0"`` (``BENIGN``) or ``"1"`` (any of the three web-attack
    labels), moves the label to the first column and writes the result to
    ``../web_attack.csv``. All paths are relative to the current working
    directory.

    Args:
        download: When true, fetch and unpack the raw archive into
            ``./raw_data`` before processing. The default (``False``) keeps
            the previous behaviour: the download used to be guarded by
            ``get_data == True``, which compared the helper function itself
            with ``True`` and therefore never ran.

    Raises:
        FileNotFoundError: If ``./raw_data/cicids.csv`` does not exist.
        KeyError: If the `` Timestamp`` or `` Label`` column is missing.
        IndexError: If the input has fewer than seven columns.
    """

    def get_data():
        """Download the dataset archive and keep only the web-attack CSV."""
        link = "http://205.174.165.80/CICDataset/CIC-IDS-2017/Dataset/MachineLearningCSV.zip"
        raw_dir = "./raw_data"
        archive = os.path.join(raw_dir, "cicids.zip")
        extracted = os.path.join(raw_dir, "MachineLearningCSV")

        os.makedirs(raw_dir, exist_ok=True)

        r = requests.get(link)
        # Fail loudly instead of saving an HTTP error page as the archive.
        r.raise_for_status()
        with open(archive, "wb") as f:
            f.write(r.content)

        with zipfile.ZipFile(archive) as zf:
            zf.extractall(raw_dir)

        # os.replace overwrites an existing target, like the `mv` it replaces.
        os.replace(
            os.path.join(
                extracted,
                "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
            ),
            os.path.join(raw_dir, "cicids.csv"),
        )

        # Clean up inside ./raw_data; the old `rm` calls pointed at the
        # current directory and left the archive and extracted folder behind.
        shutil.rmtree(extracted)
        os.remove(archive)

    if download:
        get_data()

    df = pd.read_csv("./raw_data/cicids.csv")

    # Treat infinities and empty strings as missing, then drop incomplete rows.
    df = df.replace([float("inf"), "Infinity", ""], np.nan).dropna()

    # `infer_datetime_format` is deprecated in pandas 2.x, where format
    # inference is already the default, so it is no longer passed.
    df[" Timestamp"] = pd.to_datetime(df[" Timestamp"])
    df = df.sort_values(by=[" Timestamp"])

    # Drop the first five columns and the seventh one (the original comment
    # describes these as the NaN/string columns).
    columns = list(df.columns)
    df = df.drop(columns=columns[0:5] + [columns[6]])

    # Relabel to a binary target. Labels that are not listed here map to NaN.
    df[" Label"] = df[" Label"].map(
        {
            "BENIGN": "0",
            "Web Attack Brute Force": "1",
            "Web Attack Sql Injection": "1",
            "Web Attack XSS": "1",
        }
    )

    # Put the label in the first column, addressing it by name rather than
    # assuming it is the last column.
    ordered = [" Label"] + [c for c in df.columns if c != " Label"]
    df = df[ordered]

    df.to_csv("../web_attack.csv", index=False, encoding="utf-8")
-
if __name__ == "__main__":
    # Executed as a script: run the preprocessing once with default settings.
    preprocess_web_attack()
|