### Load and Split Dataset

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Loads a .npy dataset, cleans it by removing NaN/Inf values, optionally drops the destination port feature, and performs a stratified train-test split.

```python
import numpy as np
from sklearn.model_selection import train_test_split

DATA_VERSION           = "no_artefacts_with_payload_filter"
DITCH_DEST_PORT        = True
DATA_SPLIT_RANDOM_STATE = 44

def load_data():
    """Load the saved dataset, clean it, and return a stratified split.

    Rows containing NaN or +/-Inf are discarded. The last column is used as
    the label vector; when ``DITCH_DEST_PORT`` is set, the first feature
    column (Dst Port) is removed as well.

    Returns:
        The ``X_train, X_test, Y_train, Y_test`` sequence produced by
        ``train_test_split`` with a 25% test share, stratified on the labels
        and seeded with ``DATA_SPLIT_RANDOM_STATE``.
    """
    dataset_path = "NumpyFriendlyData/full_dataset_" + DATA_VERSION + ".npy"
    samples = np.load(dataset_path)

    # Keep a row only when every entry is finite (neither NaN nor +/-Inf).
    samples = samples[np.isfinite(samples).all(axis=1)]

    features = samples[:, :-1]
    labels = samples[:, -1]
    if DITCH_DEST_PORT:
        features = features[:, 1:]  # column 0 holds Dst Port

    return train_test_split(
        features,
        labels,
        test_size=0.25,
        stratify=labels,
        random_state=DATA_SPLIT_RANDOM_STATE,
    )

X_train, X_test, Y_train, Y_test = load_data()
print(X_train.shape, X_test.shape)
# e.g. (2123057, 76) (707686, 76)
```

--------------------------------

### Dataset Statistics Function

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Calculates and prints the distribution of attack labels for a given day and version tag from a labeled CSV. Useful for verifying label consistency.

```python
# Print label distribution for Friday's corrected CSV
dataset_stat_attack(5, 'REVI')
# Output:
# REVI Stat Friday:
# {'BENIGN': 189067, 'Bot': 1956, 'PortScan': 158930, 'DDoS': 41835,
#  'Bot - Attempted': 288}
# Total: 392076
```

--------------------------------

### Full Pipeline for NumPy Array Creation

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Concatenates processed NumPy arrays from all five days into a single dataset and saves it to disk. This is the final step for preparing data for machine learning models.

```python
import numpy as np

# Each day is imported, relabelled, and converted individually
monday_arr    = listOfDictToNumpyArray(monday_dict)
tuesday_arr   = listOfDictToNumpyArray(tuesday_dict)
wednesday_arr = listOfDictToNumpyArray(wednesday_dict)
thursday_arr  = listOfDictToNumpyArray(thursday_dict)
# Must be named friday_arr: that is the name the concatenation below reads.
friday_arr    = listOfDictToNumpyArray(friday_dict)

# Stack the five per-day arrays row-wise into one dataset.
full_dataset = np.concatenate(
    (monday_arr, tuesday_arr, wednesday_arr, thursday_arr, friday_arr), axis=0
)

saved_numpy_name = 'full_dataset_no_artefacts_with_payload_filter.npy'
np.save('NumpyFriendlyData/' + saved_numpy_name, full_dataset)
print(full_dataset.shape)  # e.g. (2830743, 78)
```

--------------------------------

### Convert String Labels to Numerical

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Maps string-based attack labels to numeric index strings (e.g. 'DoS Hulk' → '4') in-place within a list of dictionaries. 'Attempted' labels are mapped to '0' (BENIGN) by default, but this mapping is configurable.

```python
label_dictionary = {
    'BENIGN': '0', 'FTP-Patator': '1', 'SSH-Patator': '2',
    'DoS GoldenEye': '3', 'DoS Hulk': '4', 'DoS Slowhttptest': '5',
    'DoS slowloris': '6', 'Heartbleed': '7',
    'Web Attack - Brute Force': '8', 'Web Attack - XSS': '9',
    'Web Attack - Sql Injection': '10', 'Infiltration': '11',
    'Bot': '12', 'PortScan': '13', 'DDoS': '14',
    # All "X - Attempted" → '0' (treated as BENIGN in paper experiments)
    'FTP-Patator - Attempted': '0', 'DDoS - Attempted': '0',  # ...
}

rows = [{'Label': 'DoS Hulk'}, {'Label': 'BENIGN'}, {'Label': 'DoS Hulk - Attempted'}]
convertToNumericalLabels(rows)
print([r['Label'] for r in rows])
# Output: ['4', '0', '0']
```

--------------------------------

### Train and Evaluate Random Forest Classifier

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Trains a RandomForestClassifier and evaluates its performance using precision, recall, F-score, and a classification report. Metrics are saved to the Scores/ directory.

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, classification_report

RF_RANDOM_STATE = 44

rf_classifier = RandomForestClassifier(
    n_estimators=50, max_depth=20, random_state=RF_RANDOM_STATE
)
rf_classifier.fit(X_train, Y_train)
Y_pred = rf_classifier.predict(X_test)

prfs = precision_recall_fscore_support(Y_test, Y_pred, average='weighted')
print("Precision, Recall, F-Score, Support:", prfs)
# e.g. (0.9991, 0.9991, 0.9991, None)

classes = ["Benign", "FTP-Patator", "SSH-Patator", "DoS GoldenEye",
           "DoS Hulk", "DoS Slowhttptest", "DoS slowloris", "Heartbleed",
           "Web Attack - Brute Force", "Web Attack - XSS",
           "Web Attack - Sql Injection", "Infiltration",
           "Bot", "PortScan", "DDoS"]

report = classification_report(Y_test, Y_pred, target_names=classes,
                                zero_division="warn", digits=4)
print(report)
# Writes to: Scores/RF_no_artefacts_with_payload_filter_metrics_class_based_<id>.txt
```

--------------------------------

### Label FTP-Patator Attacks on Tuesdays in Python

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

This function identifies FTP-Patator attacks by matching source/destination IPs, ports, and timestamps within a specific window. It delegates to payload_filter for final label assignment. Flows not matching conditions default to None.

```python
from datetime import datetime, timedelta

DATE_FORMAT_DATASET = '%d/%m/%Y %I:%M:%S %p'
TIME_DIFFERENCE = timedelta(hours=5)   # CICFlowMeter host TZ − New Brunswick TZ


def tuesday_ftp_patator(row):
    """Return the FTP-Patator label for a flow row, or None if it does not match.

    A row matches when ``row[1]`` is the attacker address, ``row[3]`` is the
    victim address, and the timestamp in ``row[6]`` (after subtracting
    ``TIME_DIFFERENCE``) lies inside the attack window, bounds included.
    Matching rows are handed to ``payload_filter`` for the final label.
    """
    window_open = datetime.strptime('04/07/2017 09:17:00 AM', DATE_FORMAT_DATASET)
    window_close = datetime.strptime('04/07/2017 10:30:00 AM', DATE_FORMAT_DATASET)
    attacker_ip = '172.16.0.1'
    victim_ip = '192.168.10.50'

    # The timestamp is parsed up front, so a malformed value raises even for
    # rows that would fail the address checks.
    flow_time = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE

    if row[1] != attacker_ip:
        return None
    if row[3] != victim_ip:
        return None
    if not (window_open <= flow_time <= window_close):
        return None
    return payload_filter(row, "FTP-Patator")

# Simulated flow row: [uid, src_ip, src_port, dst_ip, dst_port, proto,
#                      timestamp, ..., fwd_payload_len, ...]
flow = ['uid1', '172.16.0.1', '12345', '192.168.10.50', '21', '6',
        '04/07/2017 02:20:00 PM', '', '', '', '512.0'] + [''] * 68 + ['']
print(tuesday_ftp_patator(flow))
# Output: "FTP-Patator"  (timestamp is within window after TZ correction)
```

--------------------------------

### Label Dataset Function

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Defines a function to label network traffic data based on the day of the week. It maps numerical days to string names and selects appropriate attack filters for labeling.

```python
def dataset_labeling(day):
    """Select the weekday name and the attack filters used to label one day.

    Args:
        day: Integer index into the lookup lists below; 1 through 5 select
            Monday through Friday. Index 0 holds a ``None`` placeholder in
            both lists.

    NOTE(review): the remainder of the body is elided in this excerpt; per
    the trailing comment it reads the INPUT_DIR CSV, rewrites the label
    column, and writes the OUTPUT_DIR CSV.
    """
    # Position 0 is a placeholder so that day numbers start at 1.
    day_str = [None, "Monday", "Tuesday", "Wednesday",
               "Thursday", "Friday"][day]
    # One list of filter functions per weekday, indexed the same way.
    day_filters = [None,
        [monday_benign],
        [tuesday_ftp_patator, tuesday_ssh_patator],
        [wednesday_dos_slowloris, wednesday_dos_slowhttptest,
         wednesday_dos_hulk, wednesday_dos_goldeneye, wednesday_heartbleed],
        [thursday_web_attack_brute_force, thursday_web_attack_xss,
         thursday_web_attack_sql_injection, thursday_web_attack_infiltration],
        [friday_botnet, friday_portscan, friday_ddos]
    ][day]
    # ... reads INPUT_DIR CSV, rewrites last column with correct label,
    #     writes to OUTPUT_DIR CSV

# Label all five days and print per-day statistics
label_all_datasets()
# Console output example for Tuesday:
# REVI Stat Tuesday:
# {'BENIGN': 432631, 'FTP-Patator': 7935, 'SSH-Patator': 5897,
#  'FTP-Patator - Attempted': 12, 'SSH-Patator - Attempted': 34}
# Total: 446509
```

--------------------------------

### Plot Confusion Matrix

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Renders and optionally saves a normalized confusion matrix. Cell colors represent row-normalized rates, while displayed numbers are raw counts.

```python
import matplotlib.pyplot as plt

plot_confusion_matrix(
    Y_test, Y_pred,
    classes=classes,
    normalize=True,
    save=True,
    name="RF_no_artefacts_with_payload_filter_04-07_120000"
)
# Saves: Figures/RF_no_artefacts_with_payload_filter_04-07_120000.pdf
# Displays an interactive 9×9 inch heatmap
```

--------------------------------

### Import CSV as Dictionary

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Reads a CSV file into a list of dictionaries, using the header row as keys. This function is essential for data ingestion before further processing.

```python
import csv

def importCsvAsDict(path):
    """Read a comma-delimited CSV file into a list of row dictionaries.

    Args:
        path: Location of the CSV file; its first row supplies the keys.

    Returns:
        A list with one dictionary per data row (empty when the file holds
        only a header).
    """
    # The ``with`` block closes the file handle once all rows are read;
    # ``newline=''`` lets the csv module handle line endings itself.
    with open(path, newline='') as handle:
        return list(csv.DictReader(handle, delimiter=','))

rows = importCsvAsDict('LabelledDataset/Monday-WorkingHours.pcap_REVI.csv')
print(rows[0].keys())
# dict_keys(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port',
#            'Protocol', 'Timestamp', ..., 'Label'])
print(rows[0]['Label'])
# Output: 'BENIGN'
```

--------------------------------

### Process Daily Unlabelled CSVs in Python

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

This function processes a single day's unlabelled CSV file from the UnlabelledDataset directory. It applies registered attack filters, writes a corrected CSV to LabelledDataset, and outputs a label distribution summary.

```python
# Configuration
INPUT_DIR  = 'UnlabelledDataset/'
OUTPUT_DIR = 'LabelledDataset/'


```

--------------------------------

### Calculate Class Feature Importance

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

Computes per-class feature importance scores by combining scaled mean feature values with global RF importances. Results are saved as JSON.

```python
import json
from sklearn.preprocessing import scale

def class_feature_importance(X, Y, feature_importances):
    """Weight each class's mean scaled feature values by the given importances.

    Args:
        X: Feature matrix; it is passed through ``scale`` before use.
        Y: Per-row class labels, used to select the rows of each class.
        feature_importances: Weights multiplied element-wise with each
            class's mean feature vector.

    Returns:
        A dict keyed by class label; each value maps a feature column index
        to that class's weighted mean for the column.
    """
    import numpy as np

    standardized = scale(X)
    n_features = standardized.shape[1]

    per_class = {}
    for label in set(Y):
        members = standardized[Y == label, :]
        weighted = np.mean(members, axis=0) * feature_importances
        per_class[label] = dict(zip(range(n_features), weighted))
    return per_class

feature_importances = rf_classifier.feature_importances_
result = class_feature_importance(X_test, Y_pred, feature_importances)

# Inspect top feature for class 13 (PortScan)
top_feat = max(result[13.0], key=result[13.0].get)
print(f"Feature index {top_feat}: importance {result[13.0][top_feat]:.4f}")
# e.g. Feature index 7: importance 0.0412

with open("FeatureImportance/feature_importance_full_dataset_"
          + DATA_VERSION + "_" + time_id + ".json", 'w') as f:
    json.dump(result, f)
```

--------------------------------

### Filter TCP Flows for Payload in Python

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

This function checks if a TCP flow has a forward payload. If the global PAYLOAD_FILTER_ACTIVE flag is true, it downgrades flows without payload to an 'Attempted' sub-label. Non-TCP flows are unaffected.

```python
# row[5]  = Protocol (6 = TCP)
# row[10] = Total Length of Fwd Payload bytes

PAYLOAD_FILTER_ACTIVE = True


def payload_filter(row, attack_class):
    """Return the label for a flow, marking payload-less TCP flows as attempted.

    Args:
        row: Flow record; ``row[5]`` is the protocol number and ``row[10]``
            the total forward payload length.
        attack_class: Label to assign when the flow counts as a real attack.

    Returns:
        ``attack_class`` unchanged when the filter is off, the flow is not
        TCP, or the forward payload length is positive; otherwise
        ``attack_class`` with " - Attempted" appended.
    """
    filter_applies = PAYLOAD_FILTER_ACTIVE and int(row[5]) == 6
    if not filter_applies:
        return attack_class

    carries_payload = float(row[10]) > 0.0
    if carries_payload:
        return attack_class              # e.g. "DoS Hulk"
    return attack_class + " - Attempted"  # e.g. "DoS Hulk - Attempted"

# Example: a TCP flow with no forward payload bytes
sample_row = ['...', '172.16.0.1', '80', '192.168.10.50', '443', '6',
              '04/07/2017 10:45:00 AM', '...', '...', '...', '0.0', ...]
print(payload_filter(sample_row, "DoS Hulk"))
# Output: "DoS Hulk - Attempted"

# Example: a UDP flow — payload filter never applies
sample_row_udp = sample_row.copy()
sample_row_udp[5] = '17'
print(payload_filter(sample_row_udp, "DDoS"))
# Output: "DDoS"
```

--------------------------------

### Convert List of Dictionaries to NumPy Array

Source: https://context7.com/gintsengelen/wtmc2021-code/llms.txt

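Transforms a list of dictionaries (representing CSV rows) into a NumPy array of floats. It drops the identifying columns (Flow ID, Src IP, Src Port, Dst IP, Timestamp) and casts the rest to float, so labels must already have been converted to numeric strings with `convertToNumericalLabels`.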

```python
import pandas as pd, numpy as np

def listOfDictToNumpyArray(list_of_dict):
    """Convert CSV rows (a list of dicts) into a 2-D float NumPy array.

    Args:
        list_of_dict: Rows whose values are all parseable as floats in the
            retained columns; labels must already be numeric strings.

    Returns:
        A float array holding columns 4-5 followed by columns 7 to the end
        of each row; columns 0-3 and 6 are dropped.
    """
    string_table = pd.DataFrame(list_of_dict).values
    # Keep cols 4-5 (Dst Port, Protocol) and cols 7-end (all features + Label)
    trimmed = np.concatenate(
        (string_table[:, 4:6], string_table[:, 7:]), axis=1
    )
    # The ``np.float`` alias was removed in NumPy 1.24; the builtin ``float``
    # selects the same float64 dtype on every NumPy version.
    return trimmed.astype(float)

monday_dict = importCsvAsDict('LabelledDataset/Monday-WorkingHours.pcap_REVI.csv')
convertToNumericalLabels(monday_dict)
arr = listOfDictToNumpyArray(monday_dict)
print(arr.shape)   # e.g. (529918, 78)  — 77 features + 1 label column
print(arr[0, -1])  # 0.0  (BENIGN)
```

=== COMPLETE CONTENT === This response contains all available snippets from this library. No additional content exists. Do not make further requests.