Project author: kahramankostas

Project description:
NSL KDD binary classification with Transformer
Primary language: Jupyter Notebook
Project URL: git://github.com/kahramankostas/NSL-KDD-binary-classification-with-Transformer.git


NSL KDD binary classification with Transformer

I classified the NSL-KDD dataset by making a slight change to the example code from the Keras documentation page.

Importing the required libraries

```python
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
```

Implement multi-head self-attention as a Keras layer

```python
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # inputs.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output
```
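
A quick way to check that the layer behaves as expected is to run it on a random batch and confirm that the output keeps the `(batch_size, seq_len, embed_dim)` shape. This is only an illustrative sketch (the dimensions are not taken from the notebook), assuming the imports and the class defined above:

```python
# Minimal shape check for MultiHeadSelfAttention (illustrative dimensions).
sample = tf.random.uniform((4, 10, 32))            # (batch_size=4, seq_len=10, embed_dim=32)
attn = MultiHeadSelfAttention(embed_dim=32, num_heads=4)
print(attn(sample).shape)                          # expected: (4, 10, 32)
```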

Implement a Transformer block as a layer

```python
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
```
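
The block is shape-preserving: its output has the same `(batch_size, seq_len, embed_dim)` shape as its input, which is what allows several blocks to be stacked. A minimal check with illustrative dimensions, assuming the classes defined above:

```python
# The Transformer block keeps the input shape, so blocks can be stacked.
sample = tf.random.uniform((4, 10, 32))                  # (batch, seq_len, embed_dim)
block = TransformerBlock(embed_dim=32, num_heads=4, ff_dim=64)
print(block(sample, training=False).shape)               # expected: (4, 10, 32)
```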

Implement embedding layer

Two separate embedding layers: one for the tokens and one for the token positions (indices).

```python
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
```
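
The layer maps a batch of integer sequences of shape `(batch_size, maxlen)` to embeddings of shape `(batch_size, maxlen, embed_dim)`, adding a learned positional embedding to each token embedding. A minimal check with illustrative values, assuming the class above:

```python
# Integer inputs of shape (batch, maxlen) become (batch, maxlen, embed_dim).
ids = tf.random.uniform((4, 10), minval=0, maxval=50, dtype=tf.int32)
emb = TokenAndPositionEmbedding(maxlen=10, vocab_size=50, embed_dim=32)
print(emb(ids).shape)   # expected: (4, 10, 32)
```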

Prepare the NSL-KDD dataset

Reading the CSV files

```python
# c_names ---> column names
c_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","labels","difficulty_degree"]

train = pd.read_csv("data/KDDTrain+.csv", names=c_names)  # train file
test = pd.read_csv("data/KDDTest+.csv", names=c_names)    # test file
```

Deleting the unnecessary feature (difficulty_degree)

```python
del train["difficulty_degree"]
del test["difficulty_degree"]
```

Converting object features to categories first and then to dummy (one-hot) columns, except for “labels”; a toy illustration of this step follows the dtype output below.

```python
for i in c_names:
    print(train[i].dtypes)
    if train[i].dtypes == object:
        train[i] = train[i].astype('category')
        test[i] = test[i].astype('category')
        if i == "labels":
            break
        train = pd.get_dummies(train, columns=[i])
        test = pd.get_dummies(test, columns=[i])
```

```
int64
object
object
object
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
float64
float64
float64
float64
float64
float64
float64
int64
int64
float64
float64
float64
float64
float64
float64
float64
float64
object
```
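
To make the transformation concrete, here is a toy example (not from the notebook) of what `pd.get_dummies` does to a single object column such as `protocol_type`:

```python
# Hypothetical mini-frame illustrating one-hot encoding of an object column.
demo = pd.DataFrame({"protocol_type": ["tcp", "udp", "icmp", "tcp"]})
print(pd.get_dummies(demo, columns=["protocol_type"]))
# Produces the columns protocol_type_icmp, protocol_type_tcp, protocol_type_udp,
# with a 1 (or True, depending on the pandas version) marking each row's category.
```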

Converting the labels feature to binary

```python
# TRAIN
# Change the "normal" label to 1 and any attack label to 0 for use in the learning algorithm.
attack_or_not = []
for i in train["labels"]:
    if i == "normal":
        attack_or_not.append(1)
    else:
        attack_or_not.append(0)
train["labels"] = attack_or_not
```

```python
# TEST
# Change the "normal" label to 1 and any attack label to 0 for use in the learning algorithm.
attack_or_not = []
for i in test["labels"]:
    if i == "normal":
        attack_or_not.append(1)
    else:
        attack_or_not.append(0)
test["labels"] = attack_or_not
```

Synchronizing the train and test datasets.

Add an all-zero column for any feature that exists in one of the two datasets but not in the other.

```python
f = list(train.columns)
e = list(test.columns)
for i in f:
    if i not in e:
        zero_data = pd.array(np.zeros(len(test["labels"])))
        print(len(zero_data))
        test[i] = zero_data
        print(i)
for i in e:
    if i not in f:
        zero_data = np.zeros(len(train["labels"]))
        train[i] = zero_data
        print(i)
```

```
22543
service_aol
22543
service_harvest
22543
service_http_2784
22543
service_http_8001
22543
service_red_i
22543
service_tftp_u
22543
service_urh_i
```

Separating the features (data) and the label (target)

```python
# This section separates the label and the data into two pieces: label = y, data = X.
y = train["labels"]
del train["labels"]
X = train
```

```python
y_test = test["labels"]
del test["labels"]
x_test = test
```

Normalization and Standardization

```python
X = preprocessing.scale(X)          # standardize: zero mean, unit variance per column
X = preprocessing.normalize(X)      # normalize: unit L2 norm per row
```

```python
x_test = preprocessing.scale(x_test)
x_test = preprocessing.normalize(x_test)
```
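
As a reminder of what the two calls do, a toy check (not from the notebook): `scale` standardizes each column to zero mean and unit variance, while `normalize` rescales each row to unit L2 norm.

```python
# Toy demonstration of the two preprocessing steps on a tiny array.
demo = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
scaled = preprocessing.scale(demo)              # each column now has mean 0, std 1
normalized = preprocessing.normalize(scaled)    # each row now has unit L2 norm
print(scaled.mean(axis=0), scaled.std(axis=0))  # approx. [0, 0] and [1, 1]
print(np.linalg.norm(normalized, axis=1))       # [1, 1, 1]
```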

Splitting the training data into two parts: train and validation

```python
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
print(len(x_train), "Training sequences", x_train.shape)
print(len(x_val), "Validation sequences", x_val.shape)
print(len(x_test), "Test sequences", x_test.shape)
```

```
100778 Training sequences (100778, 122)
25195 Validation sequences (25195, 122)
22543 Test sequences (22543, 122)
```

Create a classifier model using the Transformer layer

The Transformer layer outputs one vector for each time step of the input sequence.
Here, we take the mean across all time steps and
use a feed-forward network on top of it to classify each record.

```python
maxlen = 122         # number of features per record
vocab_size = 100000  # vocabulary size for the token embedding
embed_dim = 32       # embedding size for each token
num_heads = 2        # number of attention heads
ff_dim = 32          # hidden layer size in the feed-forward network inside the Transformer

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(2, activation="softmax")(x)
model = keras.Model(inputs=inputs, outputs=outputs)
```

```python
# Note: pad_sequences defaults to dtype="int32", so the scaled/normalized
# float features are cast to integers at this step.
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_val = keras.preprocessing.sequence.pad_sequences(x_val, maxlen=maxlen)
x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)
```
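
Before training, it can be useful to confirm that the padded inputs have the expected `(n_samples, maxlen)` shape and to inspect the architecture and parameter counts. A quick check, assuming the arrays and model built above:

```python
# Sanity check of input shapes and the model architecture.
print(x_train.shape, x_val.shape, x_test.shape)   # each should be (n_samples, 122)
model.summary()                                   # layer-by-layer output shapes and parameter counts
```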

Train

```python
model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
    x_train, y_train, batch_size=32, epochs=2, validation_data=(x_val, y_val)
)
```

```
Train on 100778 samples, validate on 25195 samples
Epoch 1/2
100778/100778 [==============================] - 240s 2ms/sample - loss: 0.6915 - accuracy: 0.5331 - val_loss: 0.6908 - val_accuracy: 0.5346
Epoch 2/2
100778/100778 [==============================] - 221s 2ms/sample - loss: 0.6908 - accuracy: 0.5345 - val_loss: 0.6910 - val_accuracy: 0.5346
```

Evaluate

```python
score = model.evaluate(x_test, y_test, verbose=0)
print("Test loss:", score[0])
print("Test accuracy:", score[1])
```

```
Test loss: 0.7010403732089466
Test accuracy: 0.43073237
```

```python
# Evaluate on the validation set (the print labels below still say "Test").
score = model.evaluate(x_val, y_val, verbose=0)
print("Test loss:", score[0])
print("Test accuracy:", score[1])
```

```
Test loss: 0.690967743196618
Test accuracy: 0.5345902
```
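
Since the final Dense layer is a two-way softmax, class predictions for individual records can be obtained by taking the argmax over the two output probabilities. A minimal sketch, assuming the trained model and the padded x_test from above:

```python
# Get per-record class predictions (0 = attack, 1 = normal, as encoded earlier).
probs = model.predict(x_test)        # shape (n_samples, 2), softmax probabilities
preds = np.argmax(probs, axis=1)     # predicted class per record
print(preds[:10])
```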