### Load and Split Dataset

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Loads a .npy dataset, cleans it by removing rows that contain NaN/Inf values, optionally drops the destination port feature, and performs a stratified train-test split.

```python
import numpy as np
from sklearn.model_selection import train_test_split

DATA_VERSION = "no_artefacts_with_payload_filter"
DITCH_DEST_PORT = True
DATA_SPLIT_RANDOM_STATE = 44

def load_data():
    full_dataset = np.load("NumpyFriendlyData/full_dataset_" + DATA_VERSION + ".npy")
    full_dataset = full_dataset[~np.isnan(full_dataset).any(axis=1)]
    full_dataset = full_dataset[~np.isinf(full_dataset).any(axis=1)]
    data_x, data_y = full_dataset[:, :-1], full_dataset[:, -1]
    if DITCH_DEST_PORT:
        data_x = data_x[:, 1:]  # drop Dst Port (index 0)
    return train_test_split(data_x, data_y, test_size=0.25, stratify=data_y, random_state=DATA_SPLIT_RANDOM_STATE)

X_train, X_test, Y_train, Y_test = load_data()
print(X_train.shape, X_test.shape)  # e.g. (2123057, 76) (707686, 76)
```

--------------------------------

### Dataset Statistics Function

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Calculates and prints the distribution of attack labels for a given day and version tag from a labeled CSV. Useful for verifying label consistency.

```python
# Print label distribution for Friday's corrected CSV
dataset_stat_attack(5, 'REVI')
# Output:
# REVI Stat Friday:
# {'BENIGN': 189067, 'Bot': 1956, 'PortScan': 158930, 'DDoS': 41835,
# 'Bot - Attempted': 288}
# Total: 392076
```

--------------------------------

### Full Pipeline for NumPy Array Creation

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Concatenates processed NumPy arrays from all five days into a single dataset and saves it to disk. This is the final step for preparing data for machine learning models.
```python
import numpy as np

# Each day is imported, relabelled, and converted individually
monday_arr = listOfDictToNumpyArray(monday_dict)
tuesday_arr = listOfDictToNumpyArray(tuesday_dict)
wednesday_arr = listOfDictToNumpyArray(wednesday_dict)
thursday_arr = listOfDictToNumpyArray(thursday_dict)
friday_arr = listOfDictToNumpyArray(friday_dict)

full_dataset = np.concatenate(
    (monday_arr, tuesday_arr, wednesday_arr, thursday_arr, friday_arr),
    axis=0
)
saved_numpy_name = 'full_dataset_no_artefacts_with_payload_filter.npy'
np.save('NumpyFriendlyData/' + saved_numpy_name, full_dataset)
print(full_dataset.shape)  # e.g. (2830743, 78)
```

--------------------------------

### Convert String Labels to Numerical

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Maps string-based attack labels to integer indices (stored as numeric strings, e.g. '4') in-place within a list of dictionaries. 'Attempted' labels are mapped to '0' (BENIGN) by default, but this mapping is configurable.

```python
label_dictionary = {
    'BENIGN': '0', 'FTP-Patator': '1', 'SSH-Patator': '2',
    'DoS GoldenEye': '3', 'DoS Hulk': '4', 'DoS Slowhttptest': '5',
    'DoS slowloris': '6', 'Heartbleed': '7',
    'Web Attack - Brute Force': '8', 'Web Attack - XSS': '9',
    'Web Attack - Sql Injection': '10', 'Infiltration': '11',
    'Bot': '12', 'PortScan': '13', 'DDoS': '14',
    # All "X - Attempted" → '0' (treated as BENIGN in paper experiments)
    'FTP-Patator - Attempted': '0', 'DDoS - Attempted': '0',
    # ...
}

rows = [{'Label': 'DoS Hulk'}, {'Label': 'BENIGN'}, {'Label': 'DoS Hulk - Attempted'}]
convertToNumericalLabels(rows)
print([r['Label'] for r in rows])
# Output: ['4', '0', '0']
```

--------------------------------

### Train and Evaluate Random Forest Classifier

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Trains a RandomForestClassifier and evaluates its performance using precision, recall, F-score, and a classification report. Metrics are saved to the Scores/ directory.
```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, classification_report

RF_RANDOM_STATE = 44

rf_classifier = RandomForestClassifier(
    n_estimators=50, max_depth=20, random_state=RF_RANDOM_STATE
)
rf_classifier.fit(X_train, Y_train)
Y_pred = rf_classifier.predict(X_test)

prfs = precision_recall_fscore_support(Y_test, Y_pred, average='weighted')
print("Precision, Recall, F-Score, Support:", prfs)
# e.g. (0.9991, 0.9991, 0.9991, None)

classes = ["Benign", "FTP-Patator", "SSH-Patator", "DoS GoldenEye", "DoS Hulk",
           "DoS Slowhttptest", "DoS slowloris", "Heartbleed",
           "Web Attack - Brute Force", "Web Attack - XSS",
           "Web Attack - Sql Injection", "Infiltration", "Bot", "PortScan", "DDoS"]
report = classification_report(Y_test, Y_pred, target_names=classes,
                               zero_division="warn", digits=4)
print(report)
# Writes to: Scores/RF_no_artefacts_with_payload_filter_metrics_class_based_.txt
```

--------------------------------

### Label FTP-Patator Attacks on Tuesdays in Python

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

This function identifies FTP-Patator attacks by matching the source (attacker) and destination (victim) IPs and checking that the time-zone-corrected flow timestamp falls within a specific window. It delegates to payload_filter for final label assignment. Flows that do not match these conditions return None.
```python
from datetime import datetime, timedelta

DATE_FORMAT_DATASET = '%d/%m/%Y %I:%M:%S %p'
TIME_DIFFERENCE = timedelta(hours=5)  # CICFlowMeter host TZ − New Brunswick TZ

def tuesday_ftp_patator(row):
    t_start = datetime.strptime('04/07/2017 09:17:00 AM', DATE_FORMAT_DATASET)
    t_end = datetime.strptime('04/07/2017 10:30:00 AM', DATE_FORMAT_DATASET)
    attacker, victim = '172.16.0.1', '192.168.10.50'
    t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
    if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
        return payload_filter(row, "FTP-Patator")
    return None

# Simulated flow row: [uid, src_ip, src_port, dst_ip, dst_port, proto,
# timestamp, ..., fwd_payload_len, ...]
flow = ['uid1', '172.16.0.1', '12345', '192.168.10.50', '21', '6',
        '04/07/2017 02:20:00 PM', '', '', '', '512.0'] + [''] * 68 + ['']
print(tuesday_ftp_patator(flow))
# Output: "FTP-Patator" (timestamp is within window after TZ correction)
```

--------------------------------

### Label Dataset Function

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Defines a function to label network traffic data based on the day of the week. It maps numerical days to string names and selects appropriate attack filters for labeling.

```python
def dataset_labeling(day):
    day_str = [None, "Monday", "Tuesday", "Wednesday", "Thursday", "Friday"][day]
    day_filters = [
        None,
        [monday_benign],
        [tuesday_ftp_patator, tuesday_ssh_patator],
        [wednesday_dos_slowloris, wednesday_dos_slowhttptest, wednesday_dos_hulk,
         wednesday_dos_goldeneye, wednesday_heartbleed],
        [thursday_web_attack_brute_force, thursday_web_attack_xss,
         thursday_web_attack_sql_injection, thursday_web_attack_infiltration],
        [friday_botnet, friday_portscan, friday_ddos]
    ][day]
    # ... reads INPUT_DIR CSV, rewrites last column with correct label,
    # writes to OUTPUT_DIR CSV

# Label all five days and print per-day statistics
label_all_datasets()
# Console output example for Tuesday:
# REVI Stat Tuesday:
# {'BENIGN': 432631, 'FTP-Patator': 7935, 'SSH-Patator': 5897,
# 'FTP-Patator - Attempted': 12, 'SSH-Patator - Attempted': 34}
# Total: 446509
```

--------------------------------

### Plot Confusion Matrix

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Renders and optionally saves a normalized confusion matrix. Cell colors represent row-normalized rates, while displayed numbers are raw counts.

```python
import matplotlib.pyplot as plt

plot_confusion_matrix(
    Y_test, Y_pred,
    classes=classes,
    normalize=True,
    save=True,
    name="RF_no_artefacts_with_payload_filter_04-07_120000"
)
# Saves: Figures/RF_no_artefacts_with_payload_filter_04-07_120000.pdf
# Displays an interactive 9×9 inch heatmap
```

--------------------------------

### Import CSV as Dictionary

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Reads a CSV file into a list of dictionaries, using the header row as keys. This function is essential for data ingestion before further processing.

```python
import csv

def importCsvAsDict(path):
    csvfile = csv.DictReader(open(path), delimiter=',')
    return [x for x in csvfile]

rows = importCsvAsDict('LabelledDataset/Monday-WorkingHours.pcap_REVI.csv')
print(rows[0].keys())
# dict_keys(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port',
# 'Protocol', 'Timestamp', ..., 'Label'])
print(rows[0]['Label'])
# Output: 'BENIGN'
```

--------------------------------

### Process Daily Unlabelled CSVs in Python

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

This function processes a single day's unlabelled CSV file from the UnlabelledDataset directory. It applies registered attack filters, writes a corrected CSV to LabelledDataset, and outputs a label distribution summary.
```python
# Configuration
INPUT_DIR = 'UnlabelledDataset/'
OUTPUT_DIR = 'LabelledDataset/'
```

--------------------------------

### Calculate Class Feature Importance

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Computes per-class feature importance scores by combining scaled mean feature values with global RF importances. Results are saved as JSON.

```python
import json
from sklearn.preprocessing import scale

def class_feature_importance(X, Y, feature_importances):
    import numpy as np
    X = scale(X)
    out = {}
    for c in set(Y):
        out[c] = dict(
            zip(range(X.shape[1]), np.mean(X[Y == c, :], axis=0) * feature_importances)
        )
    return out

feature_importances = rf_classifier.feature_importances_
result = class_feature_importance(X_test, Y_pred, feature_importances)

# Inspect top feature for class 13 (PortScan)
top_feat = max(result[13.0], key=result[13.0].get)
print(f"Feature index {top_feat}: importance {result[13.0][top_feat]:.4f}")
# e.g. Feature index 7: importance 0.0412

with open("FeatureImportance/feature_importance_full_dataset_" + DATA_VERSION + "_" + time_id + ".json", 'w') as f:
    json.dump(result, f)
```

--------------------------------

### Filter TCP Flows for Payload in Python

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

This function checks if a TCP flow has a forward payload. If the global PAYLOAD_FILTER_ACTIVE flag is true, it downgrades flows without payload to an 'Attempted' sub-label. Non-TCP flows are unaffected.

```python
# row[5] = Protocol (6 = TCP)
# row[10] = Total Length of Fwd Payload bytes
PAYLOAD_FILTER_ACTIVE = True

def payload_filter(row, attack_class):
    if PAYLOAD_FILTER_ACTIVE and int(row[5]) == 6:
        if float(row[10]) > 0.0:
            return attack_class  # e.g. "DoS Hulk"
        else:
            return attack_class + " - Attempted"  # e.g. "DoS Hulk - Attempted"
    else:
        return attack_class

# Example: a TCP flow with no forward payload bytes
sample_row = ['...', '172.16.0.1', '80', '192.168.10.50', '443', '6',
              '04/07/2017 10:45:00 AM', '...', '...', '...', '0.0', ...]
print(payload_filter(sample_row, "DoS Hulk"))
# Output: "DoS Hulk - Attempted"

# Example: a UDP flow — payload filter never applies
sample_row_udp = sample_row.copy()
sample_row_udp[5] = '17'
print(payload_filter(sample_row_udp, "DDoS"))
# Output: "DDoS"
```

--------------------------------

### Convert List of Dictionaries to NumPy Array

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Transforms a list of dictionaries (representing CSV rows) into a NumPy array of floats. It excludes identifying columns and converts all remaining values, including the already-numeric label strings, to floats.

```python
import pandas as pd, numpy as np

def listOfDictToNumpyArray(list_of_dict):
    dataframe = pd.DataFrame(list_of_dict)
    numpy_string_array = dataframe.values
    # Keep cols 4-5 (Dst Port, Protocol) and cols 7-end (all features + Label)
    trimmed = np.concatenate(
        (numpy_string_array[:, 4:6], numpy_string_array[:, 7:]), axis=1
    )
    # Builtin float: the np.float alias was removed in NumPy 1.24
    return trimmed.astype(float)

monday_dict = importCsvAsDict('LabelledDataset/Monday-WorkingHours.pcap_REVI.csv')
convertToNumericalLabels(monday_dict)
arr = listOfDictToNumpyArray(monday_dict)
print(arr.shape)  # e.g. (529918, 78) — 77 features + 1 label column
print(arr[0, -1])  # 0.0 (BENIGN)
```

=== COMPLETE CONTENT ===

This response contains all available snippets from this library. No additional content exists. Do not make further requests.