别催~ 在加载了 . . .

Anomaly Detection


Anomaly Detection

Question

In this exercise, you will implement an anomaly detection algorithm to detect anomalous behavior in server computers. The features measure the throughput (mb/s) and latency (ms) of response of each server. While your servers were operating, you collected m = 307 examples of how they were behaving,

and thus have an unlabeled dataset $\{x^{(1)}, \ldots, x^{(m)}\}$. You suspect that the vast majority of these examples are “normal” (non-anomalous) examples of the servers operating normally, but there might also be some examples of servers acting anomalously within this dataset.

理论基础

在实际生产中,我们借助计算机通常会不断得到大量数据反馈(比如飞机运行时的发动机温度、转速、风扇转速、颠簸度等等)。正确判断反馈的数据是否正常是整个系统安全平稳运行的基础,因此数据的异常检测极为重要。常见的异常检测其实跟“机器学习”的关联性并不是很大,体现机器学习的地方可能就是通过计算机尝试很多个epsilon,从中找到最契合模型的epsilon。普通的异常检测的本质是对高斯分布的运用:我们需要对数据进行处理,使其近似服从高斯分布,用该分布估计每个点的概率密度p(x),然后选取最佳的epsilon,把p(x)小于epsilon的点记为异常点。具体epsilon的选取方式为:在带标签的验证集上计算F1得分=2×精确率×召回率/(精确率+召回率)。得分越大,epsilon选取得越好。

数据读取处理

1
2
3
4
5
6
7
8
9
10
11
12
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt

# Load the first exercise dataset and list the variables stored in the .mat file.
mat = sio.loadmat('Anomaly Detection.mat')
mat.keys()
dict_keys(['__header__', '__version__', '__globals__', 'X', 'Xval', 'yval'])

# X is the set the Gaussian is fitted to; Xval/yval are the examples and labels
# later handed to selectThreshold.
X = mat['X']
Xval = mat['Xval']
yval = mat['yval']
X.shape, Xval.shape, yval.shape
((307, 2), (307, 2), (307, 1))
1
2
# Scatter the two feature columns against each other ('bx' = blue x markers).
plt.plot(X[:, 0], X[:, 1], 'bx')
plt.show()


png

1
2
3
4
5
6
7
def estimateGaussian(X, isCovariance=False):
    """Estimate the parameters of a Gaussian fitted to the rows of ``X``.

    Args:
        X: Array-like of shape (m, n) with one example per row.
        isCovariance: When true, return the full (n, n) covariance matrix;
            otherwise return the length-n vector of per-feature variances.
            Defaults to False.

    Returns:
        Tuple ``(means, sigma2)``. ``means`` has shape (n,). Both variance
        forms are normalised by m (the number of rows), not m - 1.
    """
    # Coerce once so plain nested lists are accepted as well as ndarrays.
    X = np.asarray(X, dtype=float)
    means = X.mean(axis=0)
    if isCovariance:
        centered = X - means
        sigma2 = centered.T @ centered / len(X)
    else:
        # ndarray.var uses ddof=0 by default, matching the 1/m factor above.
        sigma2 = X.var(axis=0)
    return means, sigma2
1
2
3
4
5
6
7
8
# Fit with isCovariance=True: sigma2 comes back as a square covariance matrix.
means, sigma2 = estimateGaussian(X, isCovariance=True)
sigma2
array([[ 1.83263141, -0.22712233],
[-0.22712233, 1.70974533]])

# Fit with isCovariance=False: sigma2 comes back as one variance per feature.
means, sigma2 = estimateGaussian(X, isCovariance=False)
sigma2
array([1.83263141, 1.70974533])

高斯函数的构建

1
2
3
4
5
6
7
8
9
10
11
12
def gaussian(X, means, sigma2):
    """Evaluate a multivariate Gaussian density at every row of ``X``.

    Args:
        X: Array-like of shape (m, n) with one point per row.
        means: Length-n mean vector.
        sigma2: Either a length-n vector of per-feature variances (treated as
            the diagonal of the covariance matrix) or a full (n, n)
            covariance matrix.

    Returns:
        Column vector of shape (m, 1) holding the density of each row.
    """
    sigma2 = np.asarray(sigma2, dtype=float)
    if sigma2.ndim == 1:
        sigma2 = np.diag(sigma2)
    centered = np.asarray(X, dtype=float) - means
    n = centered.shape[1]

    norm_const = np.power(2 * np.pi, -n / 2) * (np.linalg.det(sigma2) ** (-0.5))
    # Only the per-row quadratic form x^T S^-1 x is needed. Summing the
    # element-wise product row by row avoids materialising the m x m matrix
    # that centered @ inv @ centered.T would create.
    quad_form = np.sum((centered @ np.linalg.inv(sigma2)) * centered, axis=1)
    p = norm_const * np.exp(-0.5 * quad_form)

    return p.reshape(-1, 1)
1
2
3
4
5
6
7
8
9
def plotGaussian(X, means, sigma2):
    """Plot the data as blue crosses over contour lines of the fitted density.

    The density is evaluated with ``gaussian`` on a fixed grid covering
    [0, 30) in both directions at a spacing of 0.5.
    """
    axis_ticks = np.arange(0, 30, 0.5)
    grid_x, grid_y = np.meshgrid(axis_ticks, axis_ticks)
    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
    density = gaussian(grid_points, means, sigma2).reshape(grid_x.shape)
    plt.plot(X[:, 0], X[:, 1], 'bx')
    # Contour levels are powers of ten: 1e-20, 1e-17, ..., 1e-2.
    levels = [10 ** exponent for exponent in range(-20, 0, 3)]
    plt.contour(grid_x, grid_y, density, levels)
1
2
# Contours of the model fitted with per-feature variances only.
means, sigma2 = estimateGaussian(X, isCovariance=False)
plotGaussian(X, means, sigma2)


png

1
2
# Contours of the model fitted with the full covariance matrix.
means, sigma2 = estimateGaussian(X, isCovariance=True)
plotGaussian(X, means, sigma2)


png

选取最佳的Epsilon

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
def selectThreshold(yval, p):
    """Pick the density threshold that maximises F1 on labelled data.

    A point is flagged as anomalous when its density is strictly below the
    threshold. 1000 evenly spaced candidates between ``min(p)`` and
    ``max(p)`` are tried, and the first one reaching the highest F1 wins.

    Args:
        yval: Array-like of labels, 1 for anomalous and 0 for normal. Any
            shape with m elements, e.g. (m,) or (m, 1).
        p: Array-like of densities for the same m examples, any shape.

    Returns:
        Tuple ``(bestEpsilon, bestF1)`` of plain floats. Both are 0.0 when no
        candidate achieves an F1 above zero.
    """
    # Flatten both inputs so a (m, 1) / (m,) mix cannot broadcast into an
    # m x m comparison and silently corrupt the counts.
    labels = np.asarray(yval).ravel()
    p = np.asarray(p, dtype=float).ravel()
    positives = labels == 1
    negatives = labels == 0

    bestEpsilon = 0.0
    bestF1 = 0.0
    # p.min()/p.max() are scalars, so every candidate is a scalar too; the
    # builtin min/max on a column vector would yield 1-element arrays.
    for e in np.linspace(p.min(), p.max(), 1000):
        flagged = p < e
        tp = np.sum(positives & flagged)
        fp = np.sum(negatives & flagged)
        fn = np.sum(positives & ~flagged)
        prec = tp / (tp + fp) if (tp + fp) else 0
        rec = tp / (tp + fn) if (tp + fn) else 0
        F1_e = 2 * prec * rec / (prec + rec) if (prec + rec) else 0
        if F1_e > bestF1:
            bestF1 = F1_e
            bestEpsilon = e
    return float(bestEpsilon), float(bestF1)
1
2
3
4
5
6
# Fit on X with the full covariance matrix, score the validation examples, and
# let selectThreshold pick the epsilon with the best F1 against yval.
means, sigma2 = estimateGaussian(X, isCovariance=True)
pval = gaussian(Xval, means, sigma2)
bestEpsilon, bestF1 = selectThreshold(yval, pval)

bestEpsilon, bestF1
(array([9.07484457e-05]), 0.8750000000000001)
1
2
3
4
# Score the fitted data and flag every point whose density is below the threshold.
p = gaussian(X, means, sigma2)
# Boolean-mask indexing keeps anoms two-dimensional even when nothing is
# flagged, so the column slices passed to scatter below cannot fail.
anoms = X[p.ravel() < bestEpsilon]
plotGaussian(X, means, sigma2)
plt.scatter(anoms[:, 0], anoms[:, 1], c='r', marker='o')


png

1
2
3
4
5
6
# Same threshold search, this time with per-feature variances only.
means, sigma2 = estimateGaussian(X, isCovariance=False)
pval = gaussian(Xval, means, sigma2)
bestEpsilon, bestF1 = selectThreshold(yval, pval)

bestEpsilon, bestF1
(array([8.99985263e-05]), 0.8750000000000001)
1
2
3
4
# Score the fitted data and flag every point whose density is below the threshold.
p = gaussian(X, means, sigma2)
# Boolean-mask indexing keeps anoms two-dimensional even when nothing is
# flagged, so the column slices passed to scatter below cannot fail.
anoms = X[p.ravel() < bestEpsilon]
plotGaussian(X, means, sigma2)
plt.scatter(anoms[:, 0], anoms[:, 1], c='r', marker='o')


png

高维情况

可以看出,是否使用完整的协方差矩阵对异常点的估计结果有影响。

1
2
3
4
5
6
# Load the second dataset and unpack it the same way as the first.
mat = sio.loadmat('Anomaly Detection_2.mat')
X2 = mat['X']
Xval2 = mat['Xval']
yval2 = mat['yval']
X2.shape

(1000, 11)
1
2
3
4
5
6
7
# Per-feature variance model on the second dataset: fit, choose epsilon on the
# validation split, then count the rows of X2 whose density is below it.
means, sigma2 = estimateGaussian(X2, isCovariance=False)
pval = gaussian(Xval2, means, sigma2)
bestEpsilon, bestF1 = selectThreshold(yval2, pval)
p = gaussian(X2, means, sigma2)
# A boolean mask selects the flagged rows directly instead of an index loop.
anoms = X2[p.ravel() < bestEpsilon]
len(anoms)
117
1
2
3
4
5
6
7
# Full-covariance model on the second dataset: fit, choose epsilon on the
# validation split, then count the rows of X2 whose density is below it.
means, sigma2 = estimateGaussian(X2, isCovariance=True)
pval = gaussian(Xval2, means, sigma2)
bestEpsilon, bestF1 = selectThreshold(yval2, pval)
p = gaussian(X2, means, sigma2)
# A boolean mask selects the flagged rows directly instead of an index loop.
anoms = X2[p.ravel() < bestEpsilon]
len(anoms)
122

Site

代码(Jupyter)和所用数据:https://github.com/codeYu233/Study/tree/main/Anomaly%20Detection

Note

该题与数据集均来源于Coursera上斯坦福大学的吴恩达老师机器学习的习题作业,学习交流用,如有不妥,立马删除


文章作者: codeYu233
版权声明: 本博客所有文章除特別声明外,均采用 CC BY 4.0 许可协议。转载请注明来源 codeYu233 !
评论
  目录