祝小珞珞六一儿童节快乐,爸爸妈妈爱你!
最近真的太忙了,天天打仗一样,感谢大家的支持和关注,继续加油!该系列文章将系统整理和深入学习系统安全、逆向分析和恶意代码检测,文章会更加聚焦,更加系统,更加深入,也是作者的慢慢成长史。漫漫长征路,偏向虎山行。享受过程,一起奋斗~
-
一.恶意软件分析
-
1.静态特征
-
2.动态特征
-
二.基于逻辑回归的恶意家族检测
-
1.数据集
-
2.模型构建
-
三.基于SVM的恶意家族检测
-
1.SVM模型
-
2.代码实现
-
四.基于随机森林的恶意家族检测
-
五.总结
作者的github资源:
-
逆向分析:
-
https://github.com/eastmountyxz/SystemSecurity-ReverseAnalysis
-
网络安全:
-
https://github.com/eastmountyxz/NetworkSecuritySelf-study
一.恶意软件分析
1.静态特征
2.动态特征
二.基于逻辑回归的恶意家族检测
1.数据集
恶意家族 | 类别 | 数量 | 训练集 | 测试集 |
---|---|---|---|---|
AAAA | class1 | 352 | 242 | 110 |
BBBB | class2 | 335 | 235 | 100 |
CCCC | class3 | 363 | 243 | 120 |
DDDD | class4 | 293 | 163 | 130 |
EEEE | class5 | 548 | 358 | 190 |
#coding:utf-8
#By:Eastmount CSDN 2023-05-31
# Clean a raw API-call CSV: drop rows with an empty api field (or a
# duplicated header row) and renumber the remaining rows sequentially.
import csv
import re
import os

# API call sequences can be extremely long; raise the per-field limit.
csv.field_size_limit(500 * 1024 * 1024)

filename = "AAAA_result.csv"
writename = "AAAA_result_final.csv"

# Both files are context-managed so they are flushed and closed even on
# error (the original never closed the output handle, risking lost data,
# and called fr.close() redundantly inside the `with` block).
with open(writename, mode="w", newline="", encoding="utf-8") as fw, \
     open(filename, encoding="utf-8") as fr:
    writer = csv.writer(fw)
    writer.writerow(['no', 'type', 'md5', 'api'])
    reader = csv.reader(fr)
    no = 1
    for row in reader:  # ['no','type','md5','api']
        tt = row[1]
        md5 = row[2]
        api = row[3]
        # Skip rows whose api field is empty or is a repeated header cell.
        if api == "" or api == "api":
            continue
        writer.writerow([str(no), tt, md5, api])
        no += 1
2.模型构建
# -*- coding: utf-8 -*-
# By:Eastmount CSDN 2023-06-01
# Malware-family classification: TF-IDF features + Logistic Regression.
import os
import csv
import time
import numpy as np
import seaborn as sns
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# time.clock() was removed in Python 3.8; perf_counter() is the
# documented replacement for elapsed-time measurement.
start = time.perf_counter()
csv.field_size_limit(500 * 1024 * 1024)

def load_dataset(path):
    """Read (labels, api-sequences) from a CSV with columns no,type,md5,api.

    Returns two parallel lists: family labels (column 1) and API-call
    strings (column 3).
    """
    labels, contents = [], []
    with open(path, "r", encoding="utf-8") as csv_file:
        csv_reader = csv.reader(csv_file)
        next(csv_reader)  # skip header row
        for row in csv_reader:
            labels.append(row[1])
            contents.append(str(row[3]))
    return labels, contents

#--------------------------- Step 1: load datasets ------------------------
label_train, content_train = load_dataset("train_dataset.csv")
print(label_train[:2])
print(content_train[:2])
label_test, content_test = load_dataset("test_dataset.csv")
print(len(label_train), len(label_test))
print(len(content_train), len(content_test))  # 1241 650

#--------------------------- Step 2: vectorization ------------------------
contents = content_train + content_test
labels = label_train + label_test
# Word-count features; min_df/max_df could prune rare/common tokens.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(contents)
# get_feature_names() was removed in scikit-learn 1.2.
words = vectorizer.get_feature_names_out()
print(words[:10])
print("特征词数量:", len(words))
# TF-IDF weighting of the raw counts.
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
weights = tfidf.toarray()

#--------------------------- Step 3: label encoding -----------------------
le = LabelEncoder()
y = le.fit_transform(labels)
# Split at the actual train/test boundary instead of a hard-coded 1241,
# so the script keeps working when the dataset size changes.
n_train = len(content_train)
X_train, X_test = weights[:n_train], weights[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

#--------------------------- Step 4: classification -----------------------
clf = LogisticRegression(solver='liblinear')
clf.fit(X_train, y_train)
pre = clf.predict(X_test)
print(clf)
print(classification_report(y_test, pre, digits=4))
print("accuracy:")
print(metrics.accuracy_score(y_test, pre))

# Elapsed wall-clock time.
elapsed = time.perf_counter() - start
print("Time used:", elapsed)
1241 650
1241 650
['__anomaly__', 'accept', 'bind', 'changewindowmessagefilter', 'closesocket', 'clsidfromprogid', 'cocreateinstance', 'cocreateinstanceex', 'cogetclassobject', 'colescript_parsescripttext']
特征词数量: 269
LogisticRegression(solver='liblinear')
precision recall f1-score support
0 0.5398 0.5545 0.5471 110
1 0.6526 0.6200 0.6359 100
2 0.6596 0.5167 0.5794 120
3 0.8235 0.5385 0.6512 130
4 0.5665 0.7842 0.6578 190
accuracy 0.6215 650
macro avg 0.6484 0.6028 0.6143 650
weighted avg 0.6438 0.6215 0.6199 650
accuracy:
0.6215384615384615
Time used: 2.2597622
三.基于SVM的恶意家族检测
1.SVM模型
SVC(C=1.0,
cache_size=200,
class_weight=None,
coef0=0.0,
decision_function_shape=None,
degree=3,
gamma='auto',
kernel='rbf',
max_iter=-1,
probability=False,
random_state=None,
shrinking=True,
tol=0.001,
verbose=False)
2.代码实现
# -*- coding: utf-8 -*-
# By:Eastmount CSDN 2023-06-01
# Malware-family classification: TF-IDF features + linear SVM.
import os
import csv
import time
import numpy as np
import seaborn as sns
from sklearn import svm
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# time.clock() was removed in Python 3.8; perf_counter() is the
# documented replacement for elapsed-time measurement.
start = time.perf_counter()
csv.field_size_limit(500 * 1024 * 1024)

def load_dataset(path):
    """Read (labels, api-sequences) from a CSV with columns no,type,md5,api."""
    labels, contents = [], []
    with open(path, "r", encoding="utf-8") as csv_file:
        csv_reader = csv.reader(csv_file)
        next(csv_reader)  # skip header row
        for row in csv_reader:
            labels.append(row[1])
            contents.append(str(row[3]))
    return labels, contents

#--------------------------- Step 1: load datasets ------------------------
label_train, content_train = load_dataset("train_dataset.csv")
print(label_train[:2])
print(content_train[:2])
label_test, content_test = load_dataset("test_dataset.csv")
print(len(label_train), len(label_test))
print(len(content_train), len(content_test))  # 1241 650

#--------------------------- Step 2: vectorization ------------------------
contents = content_train + content_test
labels = label_train + label_test
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(contents)
# get_feature_names() was removed in scikit-learn 1.2.
words = vectorizer.get_feature_names_out()
print(words[:10])
print("特征词数量:", len(words))
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
weights = tfidf.toarray()

#--------------------------- Step 3: label encoding -----------------------
le = LabelEncoder()
y = le.fit_transform(labels)
# Split at the actual train/test boundary instead of a hard-coded 1241.
n_train = len(content_train)
X_train, X_test = weights[:n_train], weights[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

#--------------------------- Step 4: classification -----------------------
clf = svm.LinearSVC()
clf.fit(X_train, y_train)
pre = clf.predict(X_test)
print(clf)
print(classification_report(y_test, pre, digits=4))
print("accuracy:")
print(metrics.accuracy_score(y_test, pre))

# Persist predictions and ground truth, one label per line.
# (The original wrote a literal "n" instead of "\n", producing one long
# line, and never context-managed the file handles.)
with open("svm_test_pre.txt", "w") as f1:
    for label in pre:
        f1.write(str(label) + "\n")
with open("svm_test_y.txt", "w") as f2:
    for label in y_test:
        f2.write(str(label) + "\n")

# Elapsed wall-clock time.
elapsed = time.perf_counter() - start
print("Time used:", elapsed)
1241 650
1241 650
['__anomaly__', 'accept', 'bind', 'changewindowmessagefilter', 'closesocket', 'clsidfromprogid', 'cocreateinstance', 'cocreateinstanceex', 'cogetclassobject', 'colescript_parsescripttext']
特征词数量: 269
LinearSVC()
precision recall f1-score support
0 0.6439 0.7727 0.7025 110
1 0.8780 0.7200 0.7912 100
2 0.7315 0.6583 0.6930 120
3 0.9091 0.6154 0.7339 130
4 0.6583 0.8316 0.7349 190
accuracy 0.7292 650
macro avg 0.7642 0.7196 0.7311 650
weighted avg 0.7534 0.7292 0.7301 650
accuracy:
0.7292307692307692
Time used: 2.2672032
四.基于随机森林的恶意家族检测
# -*- coding: utf-8 -*-
# By:Eastmount CSDN 2023-06-01
# Malware-family classification: TF-IDF features + Random Forest,
# with a 2-D PCA scatter plot of the test set at the end.
import os
import csv
import time
import numpy as np
import seaborn as sns
from sklearn import svm
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# time.clock() was removed in Python 3.8; perf_counter() is the
# documented replacement for elapsed-time measurement.
start = time.perf_counter()
csv.field_size_limit(500 * 1024 * 1024)

def load_dataset(path):
    """Read (labels, api-sequences) from a CSV with columns no,type,md5,api."""
    labels, contents = [], []
    with open(path, "r", encoding="utf-8") as csv_file:
        csv_reader = csv.reader(csv_file)
        next(csv_reader)  # skip header row
        for row in csv_reader:
            labels.append(row[1])
            contents.append(str(row[3]))
    return labels, contents

#--------------------------- Step 1: load datasets ------------------------
label_train, content_train = load_dataset("train_dataset.csv")
print(label_train[:2])
print(content_train[:2])
label_test, content_test = load_dataset("test_dataset.csv")
print(len(label_train), len(label_test))
print(len(content_train), len(content_test))  # 1241 650

#--------------------------- Step 2: vectorization ------------------------
contents = content_train + content_test
labels = label_train + label_test
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(contents)
# get_feature_names() was removed in scikit-learn 1.2.
words = vectorizer.get_feature_names_out()
print(words[:10])
print("特征词数量:", len(words))
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
weights = tfidf.toarray()

#--------------------------- Step 3: label encoding -----------------------
le = LabelEncoder()
y = le.fit_transform(labels)
# Split at the actual train/test boundary instead of a hard-coded 1241.
n_train = len(content_train)
X_train, X_test = weights[:n_train], weights[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

#--------------------------- Step 4: classification -----------------------
clf = RandomForestClassifier(n_estimators=5)
clf.fit(X_train, y_train)
pre = clf.predict(X_test)
print(clf)
print(classification_report(y_test, pre, digits=4))
print("accuracy:")
print(metrics.accuracy_score(y_test, pre))

# Persist predictions and ground truth, one label per line.
# (The original wrote a literal "n" instead of "\n", producing one long
# line, and never context-managed the file handles.)
with open("rf_test_pre.txt", "w") as f1:
    for label in pre:
        f1.write(str(label) + "\n")
with open("rf_test_y.txt", "w") as f2:
    for label in y_test:
        f2.write(str(label) + "\n")

# Elapsed wall-clock time.
elapsed = time.perf_counter() - start
print("Time used:", elapsed)

#--------------------------- Step 5: visualization ------------------------
# Project the TF-IDF test vectors to 2-D for a class-colored scatter plot.
pca = PCA(n_components=2)
pca = pca.fit(X_test)
xx = pca.transform(X_test)
plt.figure()
plt.scatter(xx[:, 0], xx[:, 1], c=y_test, s=50)
plt.title("Malware Family Detection")
plt.show()
1241 650
1241 650
['__anomaly__', 'accept', 'bind', 'changewindowmessagefilter', 'closesocket', 'clsidfromprogid', 'cocreateinstance', 'cocreateinstanceex', 'cogetclassobject', 'colescript_parsescripttext']
特征词数量: 269
RandomForestClassifier(n_estimators=5)
precision recall f1-score support
0 0.7185 0.8818 0.7918 110
1 0.9000 0.8100 0.8526 100
2 0.7963 0.7167 0.7544 120
3 0.9444 0.7846 0.8571 130
4 0.7656 0.8421 0.8020 190
accuracy 0.8092 650
macro avg 0.8250 0.8070 0.8116 650
weighted avg 0.8197 0.8092 0.8103 650
accuracy:
0.8092307692307692
Time used: 2.1914324
五.总结
前文回顾(下面的超链接可以点击喔):
-
[系统安全] 四十九.恶意家族分类 (1)基于API序列和机器学习的恶意家族分类实例详解
原文始发于微信公众号(娜璋AI安全之家):[系统安全] 四十九.恶意家族分类 (1)基于API序列和机器学习的恶意家族分类实例详解