Detecting Malicious HTTP Requests with an LSTM
[Data Science for Cyber Security]
In [1]:
import sys
import os
import json
import pandas
import numpy
import optparse
from collections import OrderedDict

from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
Using TensorFlow backend.
In [2]:
dataframe = pandas.read_csv('./data/dev-access.csv', engine='python', quotechar='|', header=None)
In [3]:
dataframe.head(10)
Out[3]:
 | 0 | 1 |
---|---|---|
0 | {"timestamp":1502738402847,"method":"post","qu... | 0 |
1 | {"timestamp":1502738402849,"method":"post","qu... | 0 |
2 | {"timestamp":1502738402852,"method":"post","qu... | 0 |
3 | {"timestamp":1502738402852,"method":"post","qu... | 0 |
4 | {"timestamp":1502738402853,"method":"post","qu... | 0 |
5 | {"timestamp":1502738402853,"method":"post","qu... | 0 |
6 | {"timestamp":1502738402854,"method":"post","qu... | 0 |
7 | {"timestamp":1502738402855,"method":"post","qu... | 0 |
8 | {"timestamp":1502738402856,"method":"post","qu... | 0 |
9 | {"timestamp":1502738402856,"method":"post","qu... | 0 |
In [4]:
dataset = dataframe.sample(frac=1).values
In [5]:
# Preprocess dataset: column 0 is the raw request JSON, column 1 is the label
X = dataset[:,0]
Y = dataset[:,1]
In [6]:
X
Out[6]:
array([ '{"timestamp":1502738602036,"method":"get","query":{"query":"Tops&_method=PUT"},"path":"/search","statusCode":404,"source":{"remoteAddress":"22.148.143.9","referer":"http://localhost:8002/enter"},"route":"/search","headers":{"host":"localhost:8002","connection":"keep-alive","accept":"*/*","cache-control":"no-cache","x-requested-with":"XMLHttpRequest","referer":"http://localhost:8002/enter","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6"},"requestPayload":null,"responsePayload":{"statusCode":404,"error":"Not Found","message":"Not Found"}}', '{"timestamp":1502738461497,"method":"get","query":{"query":"etudzum"},"path":"/search","statusCode":404,"source":{"remoteAddress":"81.27.152.121"},"route":"/search","headers":{"host":"localhost:8002","connection":"keep-alive","cache-control":"no-cache","accept":"*/*","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6"},"requestPayload":null,"responsePayload":{"statusCode":404,"error":"Not Found","message":"Not Found"}}', '{"timestamp":1502738585782,"method":"get","query":{"query":"Area & Accent Rugs/*"},"path"quot;:"/search","statusCode":404,"source":{"remoteAddress":"251.8.39.54"},"route":"/search","headers":{"host":"localhost:8002","connection":"keep-alive","cache-control":"no-cache","accept":"*/*","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6"},"requestPayload":null,"responsePayload":{"statusCode":404,"error":"Not Found","message":"Not Found"}}', ..., '{"timestamp":1502738590561,"method":"get","query":{"query":"Watering Equipment&_method=POST&isAdmin=true"},"path":"/search","statusCode":404,"source":{"remoteAddress":"99.98.90.102","referer":"http://localhost:8002/enter"},"route":"/search","headers":{"host":"localhost:8002","connection":"keep-alive","accept":"*/*","cache-control":"no-cache","x-requested-with":"XMLHttpRequest","referer":"http://localhost:8002/enter","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6"},"requestPayload":null,"responsePayload":{"statusCode":404,"error":"Not Found","message":"Not Found"}}', '{"timestamp":1502738403024,"method":"get","query":{},"path":"/PMA2013","statusCode":404,"source":{"remoteAddress":"243.15.81.191","referer":"http://localhost:8002/enter"},"route":"/{p*}","headers":{"host":"localhost:8002","connection":"keep-alive","accept":"*/*","cache-control":"no-cache","x-requested-with":"XMLHttpRequest","referer":"http://localhost:8002/enter","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6"},"requestPayload":null,"responsePayload":{"statusCode":404,"error":"Not Found","message":"Not Found"}}', '{"timestamp":1502738643645,"method":"post","query":{},"path":"/login","statusCode":200,"source":{"remoteAddress":"77.11.74.111"},"route":"/login","headers":{"host":"localhost:8002","connection":"keep-alive","cache-control":"no-cache","accept":"*/*","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6","content-type":"application/json","content-length":"52"},"requestPayload":{"username":"Herb2","password":"pizzaloaflasdf0v32"},"responsePayload":"LOGIN"}'], dtype=object)
In [7]:
Y
Out[7]:
array([1, 0, 1, ..., 1, 1, 0], dtype=object)
In [8]:
for index, item in enumerate(X):
    # Quick hack to space out json elements
    reqJson = json.loads(item, object_pairs_hook=OrderedDict)
    del reqJson['timestamp']
    del reqJson['headers']
    del reqJson['source']
    del reqJson['route']
    del reqJson['responsePayload']
    X[index] = json.dumps(reqJson, separators=(',', ':'))
In [9]:
X
Out[9]:
array([ '{"method":"get","query":{"query":"Tops&_method=PUT"},"path":"/search","statusCode":404,"requestPayload":null}', '{"method":"get","query":{"query":"etudzum"},"path":"/search","statusCode":404,"requestPayload":null}', '{"method":"get","query":{"query":"Area & Accent Rugs/*"},"path":"/search","statusCode":404,"requestPayload":null}', ..., '{"method":"get","query":{"query":"Watering Equipment&_method=POST&isAdmin=true"},"path":"/search","statusCode":404,"requestPayload":null}', '{"method":"get","query":{},"path":"/PMA2013","statusCode":404,"requestPayload":null}', '{"method":"post","query":{},"path":"/login","statusCode":200,"requestPayload":{"username":"Herb2","password":"pizzaloaflasdf0v32"}}'], dtype=object)
Tokenize
In [10]:
tokenizer = Tokenizer(filters='\t\n', char_level=True)
tokenizer.fit_on_texts(X)
In [11]:
# Extract and save word dictionary
word_dict_file = 'build/word-dictionary.json'

if not os.path.exists(os.path.dirname(word_dict_file)):
    os.makedirs(os.path.dirname(word_dict_file))

with open(word_dict_file, 'w') as outfile:
    json.dump(tokenizer.word_index, outfile, ensure_ascii=False)
In [12]:
num_words = len(tokenizer.word_index) + 1
X = tokenizer.texts_to_sequences(X)
X[0]
Out[12]:
[16, 1, 19, 2, 3, 13, 7, 11, 1, 4, 1, 28, 2, 3, 1, 10, 1, 15, 8, 2, 9, 14, 1, 4, 16, 1, 15, 8, 2, 9, 14, 1, 4, 1, 36, 7, 18, 6, 32, 62, 19, 2, 3, 13, 7, 11, 37, 26, 72, 36, 1, 17, 10, 1, 18, 5, 3, 13, 1, 4, 1, 24, 6, 2, 5, 9, 23, 13, 1, 10, 1, 6, 3, 5, 3, 8, 6, 25, 7, 11, 2, 1, 4, 22, 20, 22, 10, 1, 9, 2, 15, 8, 2, 6, 3, 26, 5, 14, 12, 7, 5, 11, 1, 4, 21, 8, 12, 12, 17]
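Each request is now a sequence of character indices. To sanity-check the char-level tokenization, the mapping can be inverted and the first request reassembled (a minimal sketch; `index_to_char` is just an illustrative name, and the output is lower-cased because Tokenizer lower-cases by default):

# Invert the char -> id mapping learned by the tokenizer
index_to_char = {idx: ch for ch, idx in tokenizer.word_index.items()}

# Rebuild the first tokenized request; it should match the cleaned JSON string (lower-cased)
print(''.join(index_to_char[i] for i in X[0]))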
In [13]:
max_log_length = 1024
train_size = int(len(dataset) * .75)
In [14]:
# Pad/truncate every sequence to max_log_length
X_processed = sequence.pad_sequences(X, maxlen=max_log_length)

# Split into train and test sets
X_train, X_test = X_processed[0:train_size], X_processed[train_size:len(X_processed)]
Y_train, Y_test = Y[0:train_size], Y[train_size:len(Y)]
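pad_sequences pre-pads with zeros (and pre-truncates) by default, so shorter requests end up right-aligned in the 1024-wide matrix. A quick shape check, under those default settings:

print(X_processed.shape)      # (number of requests, 1024)
print(X_processed[0][:8])     # leading zeros from the padding
print(X_processed[0][-8:])    # tail of the actual token sequence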
In [15]:
tb_callback = TensorBoard(log_dir='./logs', embeddings_freq=1)
Model
In [16]:
model = Sequential()
model.add(Embedding(num_words, 32, input_length=max_log_length))
model.add(Dropout(0.5))
model.add(LSTM(64, recurrent_dropout=0.5))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
In [17]:
model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
embedding_1 (Embedding)      (None, 1024, 32)          2816
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024, 32)          0
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65
=================================================================
Total params: 27,713
Trainable params: 27,713
Non-trainable params: 0
_________________________________________________________________
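The parameter counts follow directly from the layer shapes; with the character vocabulary of 88 entries implied by the embedding layer, they can be reproduced by hand:

embedding_params = 88 * 32                   # vocab size x embedding dim = 2816
lstm_params = 4 * (64 * (32 + 64) + 64)      # 4 gates x (units x (input + units) + bias) = 24832
dense_params = 64 * 1 + 1                    # weights + bias = 65
print(embedding_params + lstm_params + dense_params)  # 27713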
In [18]:
model.fit(X_train, Y_train, validation_split=0.25, epochs=3, batch_size=128, callbacks=[tb_callback])
Train on 15059 samples, validate on 5020 samples
Epoch 1/3
15059/15059 [==============================] - 452s - loss: 0.5981 - acc: 0.6662 - val_loss: 0.3296 - val_acc: 0.8990
Epoch 2/3
15059/15059 [==============================] - 454s - loss: 0.2738 - acc: 0.9179 - val_loss: 0.1216 - val_acc: 0.9783
Epoch 3/3
15059/15059 [==============================] - 621s - loss: 0.1647 - acc: 0.9618 - val_loss: 0.0631 - val_acc: 0.9914
Out[18]:
<keras.callbacks.History at 0x12772c890>
In [19]:
# Evaluate model on the held-out test set
score, acc = model.evaluate(X_test, Y_test, verbose=1, batch_size=128)
print("Model Accuracy: {:0.2f}%".format(acc * 100))
6694/6694 [==============================] - 64s
Model Accuracy: 99.12%
In [20]:
# Save model
model.save_weights('securitai-lstm-weights.h5')
model.save('securitai-lstm-model.h5')

with open('securitai-lstm-model.json', 'w') as outfile:
    outfile.write(model.to_json())
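For inference later, the saved model can be reloaded and applied to a single raw log line after the same preprocessing (a minimal sketch, assuming the fitted tokenizer and max_log_length from above are still in scope; the sample string is the first cleaned request shown earlier):

from keras.models import load_model

model = load_model('securitai-lstm-model.h5')

# A single preprocessed request (method/query/path/statusCode/requestPayload only)
log_entry = '{"method":"get","query":{"query":"Tops&_method=PUT"},"path":"/search","statusCode":404,"requestPayload":null}'
seq = tokenizer.texts_to_sequences([log_entry])
seq = sequence.pad_sequences(seq, maxlen=max_log_length)

# Sigmoid output close to 1 means the request looks malicious
print(model.predict(seq)[0][0])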
Evaluate 2
Evaluate against a second WAF dataset: https://github.com/faizann24/Fwaf-Machine-Learning-driven-Web-Application-Firewall
In [21]:
df_black = pandas.read_csv('/Users/xy/workspace/Fwaf-Machine-Learning-driven-Web-Application-Firewall/badqueries.txt',
                           engine='python', sep='!@#$%^&*', header=0)
df_white = pandas.read_csv('/Users/xy/workspace/Fwaf-Machine-Learning-driven-Web-Application-Firewall/goodqueries.txt',
                           engine='python', sep='!@#$%^&*', header=0).sample(n=50000)
In [22]:
df_black['label'] = 1
df_white['label'] = 0
new_dataset = df_black.append(df_white)
In [23]:
new_dataset = new_dataset.sample(n=10000)
In [24]:
X_waf = new_dataset['uri'].values.astype('str')
Y_waf = new_dataset['label'].values.astype('str')
In [25]:
X_sequences = tokenizer.texts_to_sequences(X_waf)
X_processed = sequence.pad_sequences(X_sequences, maxlen=max_log_length)
In [26]:
# Evaluate model on the external WAF dataset
score, acc = model.evaluate(X_processed, Y_waf, verbose=1, batch_size=128)
print("Model Accuracy: {:0.2f}%".format(acc * 100))
10000/10000 [==============================] - 97s
Model Accuracy: 73.52%
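Accuracy alone hides how the 73.52% splits between missed attacks and false alarms on this out-of-domain data; a confusion matrix and precision/recall breakdown are more informative (a sketch, assuming scikit-learn is available):

from sklearn.metrics import classification_report, confusion_matrix

# Threshold the sigmoid outputs at 0.5 to get hard labels
y_pred = (model.predict(X_processed, batch_size=128) > 0.5).astype('int').ravel()
y_true = Y_waf.astype('int')

print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred, target_names=['benign', 'malicious']))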