Machine Learning – Clustering

# coding: utf-8

import base64
import psycopg2
import re
from urllib.parse import unquote
import hashlib
from psycopg2 import IntegrityError
from os import walk

regex = r'(?P<hostname>[\d\.]+)\s(?P<logname>.*?)\s(?P<remote_user>.*?)\s\[(?P<date>\d{2}\/[A-Za-z]{3}\/\d{4}):(?P<time>\d{2}:\d{2}:\d{2})\s(?P<timezone>[-+]\d{3,4})]\s"(?P<method>[A-Z]*)(?P<request>.*?)(?P<http_version>[A-Z]*\/1\.\d)"\s(?P<response_code>\d+)\s(?P<response_size>.*?)\s"(?P<referer>.*?)"\s"(?P<user_agent>.*?)"'  # Regular expression for parsing log lines (Combined Log Format)
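# Illustrative example of an Apache/nginx Combined Log Format line this pattern is expected to match
# (hypothetical sample, not taken from the dataset):
# 127.0.0.1 - - [10/Oct/2020:13:55:36 -0300] "GET /index.php?id=1 HTTP/1.1" 200 2326 "http://example.com/" "Mozilla/5.0"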
con = psycopg2.connect(host='192.168.1.1', database='mllab', user='postgres', password='postgres')
cur = con.cursor()

data_source = "/home/ricardomaia/data_source/"

logfiles = []

for (dirpath, dirnames, filenames) in walk(data_source):
    logfiles.extend(filenames)
    break
   
for filename in logfiles:

    file_extension = filename.split('.')
    category = file_extension[-1]  # Log category, taken from the file extension
    print(filename, category)

    filepath = data_source + filename
    print("Processing file " + str(filepath))

    file = open(filepath, "r")

    for line in file:

        matches = re.finditer(regex, line)

        for matchNum, match in enumerate(matches):

            try:

                hostname = match.group('hostname')  # Not relevant; tends to vary by client
                logname = match.group('logname')  # Not relevant; usually not available
                remote_user = match.group('remote_user')  # Not relevant; usually not available
                date = match.group('date')  # Not relevant
                time = match.group('time')  # Not relevant
                timezone = match.group('timezone')  # Not relevant
                method = match.group('method')  # Relevant, but high frequency across all documents
                request = match.group('request')  # MOST IMPORTANT FEATURE
                http_version = match.group('http_version')  # Not relevant
                response_code = match.group('response_code')  # Not relevant; mostly 200 (OK) or 304 (Not Modified)
                response_size = match.group('response_size')  # Not relevant; varies mainly on dynamic sites
                referer = match.group('referer')  # Relevant; may reveal anomalous application usage flows
                user_agent = match.group('user_agent')  # Not relevant; tends to vary by client

                request_decoded = unquote(request)  # URL-decode the request
                m = hashlib.sha256()
                m.update(request_decoded.encode('utf-8'))
                fingerprint = m.hexdigest()  # Unique SHA-256 ID for the request

                request = base64.b64encode(request_decoded.encode('utf-8')).decode('ascii')  # Store as base64
                raw = base64.b64encode(line.encode('utf-8')).decode('ascii')  # Store as base64

                sql = ("INSERT INTO logs (category, raw, hostname, logname, remote_user, date, time, timezone, "
                       "method, request, http_version, response_code, response_size, referer, user_agent, fingerprint) "
                       "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);")

                cur.execute(sql, (category, raw, hostname, logname, remote_user, date, time, timezone, method,
                                  request, http_version, response_code, response_size, referer, user_agent,
                                  fingerprint))
                                   


            except IntegrityError as e:  # The document is already stored
                con.rollback()
                print("Document already stored.")
                print(e)
            else:
                con.commit()

    file.close()

con.close()
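
The INSERT above assumes that a `logs` table already exists in the `mllab` database. The original table definition is not included here, so the sketch below is only an assumption about a schema that would accept these columns; the UNIQUE constraint on `fingerprint` is what makes the IntegrityError handling above meaningful, and the `id` and `cluster` columns are read and updated by the clustering script that follows.

# Hypothetical schema sketch -- not part of the original scripts.
import psycopg2

con = psycopg2.connect(host='192.168.1.1', database='mllab', user='postgres', password='postgres')
cur = con.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS logs (
        id            SERIAL PRIMARY KEY,
        category      TEXT,
        raw           TEXT,
        hostname      TEXT,
        logname       TEXT,
        remote_user   TEXT,
        date          TEXT,
        time          TEXT,
        timezone      TEXT,
        method        TEXT,
        request       TEXT,
        http_version  TEXT,
        response_code INTEGER,
        response_size TEXT,        -- may be '-' in the Combined Log Format, hence TEXT
        referer       TEXT,
        user_agent    TEXT,
        fingerprint   TEXT UNIQUE, -- duplicate requests raise IntegrityError on insert
        cluster       INTEGER
    );
""")
con.commit()
con.close()
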
# coding: utf-8

from sklearn.feature_extraction.text import CountVectorizer # Feature extraction
from sklearn.feature_extraction.text import TfidfTransformer # Feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer # Feature extraction
from sklearn.cluster import KMeans # Clustering algorithm
from sklearn.decomposition import PCA # Principal Component Analysis
from psycopg2 import IntegrityError
from scipy.spatial.distance import cdist
import psycopg2
import matplotlib.pyplot as plt # Plotting
import re # Regular expressions
import base64
import pandas as pd # Data transformation and presentation
import numpy as np


'''Parameter definitions'''
dbhost = '192.168.1.1'
dbuser = 'postgres'
dbpass = 'postgres'
dbname = 'mllab'
n_clusters = 6 # Number of clusters
n_components = 2 # Number of principal components
max_features = 1000 # Maximum number of features extracted from the corpus to build the TF-IDF frequency matrix
token_pattern = r"(\w[\w']*\w|../)" # Token definition: words of two or more characters, or any two characters followed by '/' (e.g. '../')
stop_words = ['.', '&', ':'] # Terms ignored when building tokens

'''Variable definitions'''
corpus = [] # Document collection (corpus)
regex = r'(?P<hostname>[\d\.]+)\s(?P<logname>.*?)\s(?P<remote_user>.*?)\s\[(?P<date>\d{2}\/[A-Za-z]{3}\/\d{4}):(?P<time>\d{2}:\d{2}:\d{2})\s(?P<timezone>[-+]\d{3,4})]\s"(?P<method>[A-Z]*)(?P<request>.*?)(?P<http_version>[A-Z]*\/1\.\d)"\s(?P<response_code>\d+)\s(?P<response_size>.*?)\s"(?P<referer>.*?)"\s"(?P<user_agent>.*?)"' # Regular expression for parsing log lines (same as in the ingestion script; not used below)
ids = [] # Document indices (hash)
category = [] # Document categories
colors = ['red', 'green', 'blue', 'orange', 'violet', 'slateblue', 'springgreen', 'olive'] # Colors for the plots
markers = ['1', '2', '3', '4', '8', 's', 'p'] # Markers for the plots

def mask(df, key, value):
    '''Helper for filtering a DataFrame by an exact column value'''
    return df[df[key] == value]

# Note: this assignment shadows pandas' built-in DataFrame.mask method
pd.DataFrame.mask = mask
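# Hypothetical usage of the helper above, e.g. to inspect one attack category once df_rows exists:
# df_rows.mask('category', 'sqli')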

con = psycopg2.connect(host=dbhost, database=dbname, user=dbuser, password=dbpass)
cur = con.cursor()


sql = "SELECT id, request, category, cluster FROM logs"
cur.execute(sql)
rows = cur.fetchall()

df_rows = pd.DataFrame(columns=['id','request', 'category', 'cluster'])

row_pos = 0
for row in rows:
    request = row[1]
    request_decoded = base64.b64decode(request).decode('ascii')  # Decode the request (stored as base64)
    corpus.append(request_decoded)  # Add the request (document) to the corpus
    df_rows.loc[row_pos] = [row[0], request_decoded, row[2], row[3]]
    row_pos += 1


'''Frequency counting (TF-IDF)'''
transformer = TfidfVectorizer(use_idf=True, max_features=max_features, token_pattern=token_pattern,
                              stop_words=stop_words, analyzer='word')
tf_idf = transformer.fit_transform(corpus)
columns = transformer.get_feature_names()  # Feature (token) names; on newer scikit-learn use get_feature_names_out()

'''Run the clustering algorithm'''
kmeans = KMeans(n_clusters=n_clusters).fit(tf_idf)
clusters = kmeans.labels_
clusters_centers = kmeans.cluster_centers_

'''Dimensionality reduction'''
pca = PCA(n_components=n_components).fit(tf_idf.toarray())  # PCA on the features extracted from the corpus
centers2D = pca.transform(clusters_centers)  # Project the cluster centers into the 2-D PCA space
data2D = pca.transform(tf_idf.toarray())  # Project the TF-IDF frequency matrix into the 2-D PCA space

df_centers2D = pd.DataFrame(centers2D)
df_data2D = pd.DataFrame(data2D)

# Centroids
cluster_graph, partial_plot = plt.subplots()
partial_plot.scatter(centers2D[:,0], centers2D[:,1], marker='o', s=4000, c='k', alpha=0.02)

for p in range(0, centers2D.shape[0]):
    t = plt.text(centers2D[p, 0], centers2D[p, 1],"Cluster " + str(p))
    t.set_bbox(dict(facecolor='white', alpha=0.5, edgecolor='gray'))

# Data points
for i in range(0, data2D.shape[0]):
    c = colors[clusters[i]]
    marker = markers[clusters[i]]
    partial_plot.scatter(data2D[i, 0], data2D[i, 1], c=c, marker=marker, s=180, linewidths=0, alpha=0.4)


df_rows['cluster'] = clusters

# Notebook-style inspection: documents from the 'default' category
df_rows[(df_rows['category'] == 'default')]

# Notebook-style inspection: documents assigned to cluster 2 that were not labeled as SQL injection ("sqli")
df_rows[(df_rows['cluster'] == 2) & (df_rows['category'] != "sqli")]

for index, row in df_rows.iterrows():
    sql = "UPDATE logs SET cluster = %s WHERE id = %s"
    cur.execute(sql, (int(row['cluster']), int(row['id'])))
    con.commit()

con.close()

# Elbow method to help choose the number of clusters
num_clusters = range(1, 20)
kmeans = [KMeans(n_clusters=i) for i in num_clusters]

score = [kmeans[i].fit(tf_idf).score(tf_idf) for i in range(len(kmeans))]
print(score)

plt.plot(num_clusters, score)
plt.xlabel('Number of clusters')
plt.ylabel('Score')
plt.title('Elbow curve')
plt.show()
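
The elbow curve is one heuristic for choosing n_clusters. As a complementary check, not part of the original notebook, the silhouette score can be computed over the same TF-IDF matrix; it is higher when documents lie well inside their assigned cluster. A minimal sketch, assuming the tf_idf matrix built above is still in scope:

# Hypothetical add-on: silhouette analysis over the TF-IDF matrix (not in the original notebook)
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

for k in range(2, 10):  # the silhouette score is undefined for a single cluster
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(tf_idf)
    print(k, silhouette_score(tf_idf, labels))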