1
+ import sklearn
2
+ import numpy as np
3
+ from utils .LogHelper import Logs
4
+ from utils .DateUtil import get_microseconds
5
+ from utils .Anomaly import Anomaly
6
+ from sklearn import preprocessing
7
+
8
+
9
class Dataset:
    """Turn raw log rows into a numeric dataset.

    Rows come from ``Logs().read()`` and are accessed positionally:
    index 0 is the timestamp, 1 the client IP, 2 the request method,
    3 the request status, 4 the request size, 5 the time taken to serve,
    6 the user agent and 7 the request header.

    The timestamp is turned into an anomaly score; every other column is
    label-encoded with its own ``LabelEncoder`` so that encoded values can
    later be mapped back (see ``detransform_client_ip``).
    """

    def __init__(self):
        """Load the logs and create one label encoder per encoded column."""
        self.logs = Logs().read()

        self.client_ip_label_encoder = preprocessing.LabelEncoder()
        self.request_method_label_encoder = preprocessing.LabelEncoder()
        self.request_status_label_encoder = preprocessing.LabelEncoder()
        self.request_size_label_encoder = preprocessing.LabelEncoder()
        self.time_taken_to_serve_label_encoder = preprocessing.LabelEncoder()
        self.user_agent_label_encoder = preprocessing.LabelEncoder()
        self.request_header_label_encoder = preprocessing.LabelEncoder()

        # Per-column results, filled in by the preprocess_* methods.
        self.scores = []
        self.client_ips = []
        self.request_methods = []
        self.request_status = []
        self.request_size = []
        self.times_taken_to_serve = []
        self.user_agents = []
        self.request_headers = []

        # Final row-wise dataset, filled in by preprocess().
        self.dataset = []

    def _encode_column(self, encoder, column_index):
        """Fit ``encoder`` on one log column and return the encoded values."""
        column = [row[column_index] for row in self.logs]
        return encoder.fit_transform(column)

    def preprocess_time(self):
        """Compute one anomaly score per log row from timestamp frequencies.

        Counts how many rows share each ``get_microseconds`` value, hands
        the counts to ``Anomaly().detect`` and stores the score of every
        row in ``self.scores``.
        """
        timestamp_clusters = {}
        for row in self.logs:
            timestamp = get_microseconds(row[0])
            timestamp_clusters[timestamp] = timestamp_clusters.get(timestamp, 0) + 1

        anomaly_scores = Anomaly().detect(timestamp_clusters)

        # NOTE(review): the counts are keyed by get_microseconds(row[0]) but
        # the scores are looked up by the raw row[0]; confirm that
        # Anomaly.detect returns a mapping keyed by the raw timestamp.
        # Rebuilt (not appended to) so repeated calls do not accumulate.
        self.scores = [anomaly_scores[row[0]] for row in self.logs]

    def preprocess_client_ip(self):
        """Label-encode column 1 (client IP) into ``self.client_ips``."""
        self.client_ips = self._encode_column(self.client_ip_label_encoder, 1)

    def preprocess_request_method(self):
        """Label-encode column 2 (request method) into ``self.request_methods``."""
        self.request_methods = self._encode_column(
            self.request_method_label_encoder, 2)

    def preprocess_request_status(self):
        """Label-encode column 3 (request status) into ``self.request_status``."""
        self.request_status = self._encode_column(
            self.request_status_label_encoder, 3)

    def preprocess_request_size(self):
        """Label-encode column 4 (request size) into ``self.request_size``."""
        self.request_size = self._encode_column(
            self.request_size_label_encoder, 4)

    def preprocess_time_taken_to_serve(self):
        """Label-encode column 5 into ``self.times_taken_to_serve``."""
        self.times_taken_to_serve = self._encode_column(
            self.time_taken_to_serve_label_encoder, 5)

    def preprocess_user_agent(self):
        """Label-encode column 6 (user agent) into ``self.user_agents``."""
        self.user_agents = self._encode_column(self.user_agent_label_encoder, 6)

    # Backward-compatible alias for the original, misspelled method name.
    proprocess_user_agent = preprocess_user_agent

    def preprocess_request_header(self):
        """Label-encode column 7 (request header) into ``self.request_headers``."""
        self.request_headers = self._encode_column(
            self.request_header_label_encoder, 7)

    def detransform_client_ip(self, client_ip_list):
        """Map encoded client IP labels back to the original values."""
        return self.client_ip_label_encoder.inverse_transform(client_ip_list)

    def preprocess(self):
        """Run every column step and assemble ``self.dataset``.

        Each dataset row is ``[raw timestamp, anomaly score, client IP,
        request method, request status, request size, time taken to serve,
        user agent, request header]``, with every field after the score
        label-encoded.

        The dataset is rebuilt from scratch on each call, so calling this
        method more than once (for example directly and then again through
        ``read_dataset``) does not duplicate rows.
        """
        self.preprocess_time()
        self.preprocess_client_ip()
        self.preprocess_request_method()
        self.preprocess_request_status()
        self.preprocess_request_size()
        self.preprocess_time_taken_to_serve()
        self.preprocess_user_agent()
        self.preprocess_request_header()

        columns = zip(
            self.logs,
            self.scores,
            self.client_ips,
            self.request_methods,
            self.request_status,
            self.request_size,
            self.times_taken_to_serve,
            self.user_agents,
            self.request_headers,
        )
        self.dataset = [
            [row[0], score, client_ip, method, status, size, served, agent, header]
            for (row, score, client_ip, method, status, size, served, agent,
                 header) in columns
        ]

    def read_dataset(self):
        """Preprocess the logs and return the assembled dataset rows."""
        self.preprocess()
        return self.dataset
111
+
112
+
113
if __name__ == '__main__':
    # Run as a script: load the logs and execute the full preprocessing
    # pipeline once. The result is neither printed nor saved here.
    Dataset().preprocess()
0 commit comments