
# coding: utf-8
import base64
import psycopg2
import re
from urllib.parse import unquote
import hashlib
from psycopg2 import IntegrityError
from os import walk

# Regular expression for parsing each access-log line (including the request URL)
regex = (r'(?P<hostname>[\d\.]+)\s(?P<logname>.*?)\s(?P<remote_user>.*?)\s'
         r'\[(?P<date>\d{2}\/[A-z]{3}\/\d{4}):(?P<time>\d{2}:\d{2}:\d{2})\s(?P<timezone>[-|+]\d{3,4})]\s'
         r'"(?P<method>[A-Z]*)(?P<request>.*?)(?P<http_version>[A-Z]*\/1.\d)"\s'
         r'(?P<response_code>\d+)\s(?P<response_size>.*?)\s"(?P<referer>.*?)"\s"(?P<user_agent>.*?)"')

con = psycopg2.connect(host='192.168.1.1', database='mllab', user='postgres', password='postgres')
cur = con.cursor()

data_source = "/home/ricardomaia/data_source/"

logfiles = []
for (dirpath, dirnames, filenames) in walk(data_source):
    logfiles.extend(filenames)
    break

for filename in logfiles:
    file_extension = filename.split('.')
    category = file_extension[-1]  # Log category (taken from the file extension)
    print(filename, category)
    filepath = data_source + filename
    print("Processing file " + str(filepath))
    file = open(filepath, "r")
    for line in file:
        matches = re.finditer(regex, line)
        for matchNum, match in enumerate(matches):
            try:
                hostname = match.group('hostname')            # Not relevant. Tends to vary by client.
                logname = match.group('logname')              # Not relevant. Usually not available.
                remote_user = match.group('remote_user')      # Not relevant. Usually not available.
                date = match.group('date')                    # Not relevant.
                time = match.group('time')                    # Not relevant.
                timezone = match.group('timezone')            # Not relevant.
                method = match.group('method')                # Relevant, but high frequency in every document.
                request = match.group('request')              # MOST IMPORTANT CRITERION.
                http_version = match.group('http_version')    # Not relevant.
                response_code = match.group('response_code')  # Not relevant. Mostly 200 (OK) or 304 (Not Modified).
                response_size = match.group('response_size')  # Not relevant. Varies mainly on dynamic sites.
                referer = match.group('referer')              # Relevant. May reveal an anomalous application flow.
                user_agent = match.group('user_agent')        # Not relevant. Tends to vary by client.

                request_decoded = unquote(request)  # URL-decode the request
                m = hashlib.sha256()
                m.update(request_decoded.encode('utf-8'))
                fingerprint = m.hexdigest()  # Unique ID for the request

                request = base64.b64encode(request_decoded.encode('utf-8')).decode('ascii')  # Store as Base64
                raw = base64.b64encode(line.encode('utf-8')).decode('ascii')                 # Store as Base64

                # Parameterized INSERT (placeholders instead of string formatting)
                sql = ("INSERT INTO LOGS (category, raw, hostname, logname, remote_user, date, time, timezone, "
                       "method, request, http_version, response_code, response_size, referer, user_agent, "
                       "fingerprint) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);")
                cur.execute(sql, (category, raw, hostname, logname, remote_user, date, time, timezone,
                                  method, request, http_version, response_code, response_size, referer,
                                  user_agent, fingerprint))
            except IntegrityError as e:
                # The document has already been stored
                con.rollback()
                print("Document already stored.")
                print(e)
            else:
                con.commit()

con.close()
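# --- Hedged sketch (not part of the original scripts) ---------------------------------
# A minimal guess at the LOGS table that the INSERT above and the SELECT/UPDATE in the
# clustering script assume. The column names come from those queries; the data types and
# the UNIQUE constraint on fingerprint are assumptions (a unique fingerprint is what
# makes the IntegrityError handling for duplicate documents meaningful).
import psycopg2

ddl = """
CREATE TABLE IF NOT EXISTS logs (
    id            SERIAL PRIMARY KEY,
    category      TEXT,
    raw           TEXT,
    hostname      TEXT,
    logname       TEXT,
    remote_user   TEXT,
    date          TEXT,
    time          TEXT,
    timezone      TEXT,
    method        TEXT,
    request       TEXT,
    http_version  TEXT,
    response_code INTEGER,
    response_size TEXT,
    referer       TEXT,
    user_agent    TEXT,
    fingerprint   TEXT UNIQUE,  -- assumed: duplicates raise IntegrityError on insert
    cluster       INTEGER       -- filled in later by the clustering script
);
"""

setup_con = psycopg2.connect(host='192.168.1.1', database='mllab', user='postgres', password='postgres')
setup_cur = setup_con.cursor()
setup_cur.execute(ddl)
setup_con.commit()
setup_con.close()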
# coding: utf-8
from sklearn.feature_extraction.text import CountVectorizer   # Feature extraction
from sklearn.feature_extraction.text import TfidfTransformer  # Feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer   # Feature extraction
from sklearn.cluster import KMeans                             # Clustering algorithm
from sklearn.decomposition import PCA                          # Principal Component Analysis
from psycopg2 import IntegrityError
from scipy.spatial.distance import cdist
import psycopg2
import matplotlib.pyplot as plt  # Plotting
import re                        # Regular expressions
import base64
import pandas as pd              # Data transformation and presentation
import numpy as np

'''Parameter definitions'''
dbhost = '192.168.1.1'
dbuser = 'postgres'
dbpass = 'postgres'
dbname = 'mllab'
n_clusters = 6       # Number of clusters
n_components = 2     # Number of principal components
max_features = 1000  # Maximum number of features extracted from the corpus for the TF-IDF frequency matrix
token_pattern = r"(\w[\w']*\w|../)"  # How tokens are located
stop_words = ['.', '&', ':']         # Terms ignored when building tokens

'''Variable definitions'''
corpus = []    # Document collection
# Regular expression for parsing each log line (kept from the ingestion script)
regex = (r'(?P<hostname>[\d\.]+)\s(?P<logname>.*?)\s(?P<remote_user>.*?)\s'
         r'\[(?P<date>\d{2}\/[A-z]{3}\/\d{4}):(?P<time>\d{2}:\d{2}:\d{2})\s(?P<timezone>[-|+]\d{3,4})]\s'
         r'"(?P<method>[A-Z]*)(?P<request>.*?)(?P<http_version>[A-Z]*\/1.\d)"\s'
         r'(?P<response_code>\d+)\s(?P<response_size>.*?)\s"(?P<referer>.*?)"\s"(?P<user_agent>.*?)"')
ids = []       # Document indices (hash)
category = []  # Document categories
colors = ['red', 'green', 'blue', 'orange', 'violet', 'slateblue', 'springgreen', 'olive']  # Plot colors
markers = ['1', '2', '3', '4', '8', 's', 'p']  # Plot markers


def mask(df, key, value):
    '''Data filtering helper'''
    return df[df[key] == value]


pd.DataFrame.mask = mask

con = psycopg2.connect(host=dbhost, database=dbname, user=dbuser, password=dbpass)
cur = con.cursor()

# In[334]:

sql = "select id, request, category, cluster from logs"
cur.execute(sql)
rows = cur.fetchall()

df_rows = pd.DataFrame(columns=['id', 'request', 'category', 'cluster'])
row_pos = 0
for row in rows:
    request = row[1]  # base64.b64encode(request_decoded.encode('utf-8')).decode('ascii')
    request_decoded = base64.b64decode(request).decode('ascii')  # Decode the request
    corpus.append(request_decoded)  # Add the request (document) to the corpus
    df_rows.loc[row_pos] = [row[0], request_decoded, row[2], row[3]]
    row_pos += 1

'''Frequency counting'''
transformer = TfidfVectorizer(use_idf=True, max_features=max_features, token_pattern=token_pattern,
                              stop_words=stop_words, analyzer='word')
tf_idf = transformer.fit_transform(corpus)
columns = transformer.get_feature_names()

'''Clustering'''
kmeans = KMeans(n_clusters=n_clusters).fit(tf_idf)
clusters = kmeans.labels_
clusters_centers = kmeans.cluster_centers_

'''Dimensionality reduction'''
pca = PCA(n_components=n_components).fit(tf_idf.todense())  # PCA on the features extracted from the corpus
centers2D = pca.transform(clusters_centers)  # Project the cluster centers into 2D
data2D = pca.transform(tf_idf.todense())     # Project the TF-IDF frequency matrix into 2D
df_centers2D = pd.DataFrame(centers2D)
df_data2D = pd.DataFrame(data2D)

# Centroids
cluster_graph, partial_plot = plt.subplots()
partial_plot.scatter(centers2D[:, 0], centers2D[:, 1], marker='o', s=4000, c='k', alpha=0.02)
for p in range(0, centers2D.shape[0]):
    t = plt.text(centers2D[p, 0], centers2D[p, 1], "Cluster " + str(p))
    t.set_bbox(dict(facecolor='white', alpha=0.5, edgecolor='gray'))

# Data points
for i in range(0, data2D.shape[0]):
    c = colors[clusters[i]]
    marker = markers[clusters[i]]
    partial_plot.scatter(data2D[i, 0], data2D[i, 1], c=c, marker=marker, s=180, linewidths=0, alpha=0.4)

df_rows['cluster'] = clusters
df_rows[(df_rows['category'] == 'default')]
df_rows[(df_rows['cluster'] == 2) & (df_rows['category'] != "sqli")]

# Write the cluster assignment of each document back to the database
for index, row in df_rows.iterrows():
    sql = "UPDATE logs SET cluster = %s WHERE id = %s"
    cur.execute(sql, (int(row['cluster']), row['id']))

con.commit()
con.close()

# Elbow curve
num_clusters = range(1, 20)
kmeans = [KMeans(n_clusters=i) for i in num_clusters]
score = [kmeans[i].fit(tf_idf).score(tf_idf) for i in range(len(kmeans))]
score

plt.plot(num_clusters, score)
plt.xlabel('Number of clusters')
plt.ylabel('Score')
plt.title('Elbow curve')
plt.show()
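# --- Hedged sketch (not part of the original script) ----------------------------------
# A quick way to read the clustering result above: print the highest-weighted TF-IDF
# terms of each cluster centroid. It reuses clusters_centers, columns and n_clusters
# from the script above; the choice of 10 terms per cluster is arbitrary.
import numpy as np

order = np.argsort(clusters_centers, axis=1)[:, ::-1]  # term indices, strongest weight first
for c in range(n_clusters):
    top_terms = [columns[i] for i in order[c, :10]]
    print("Cluster %d: %s" % (c, ", ".join(top_terms)))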