AIOps系列(1):时间序列分析的方法
1. 数据获取和预处理
import json as json
import pandas as pd
import numpy as np

# 'xxxx' is a placeholder for the raw JSON payload; replace it with the real
# measurement export before running.
delay = json.loads('xxxx')
df = pd.DataFrame(delay['data'])

# Shift the millisecond timestamps by +8 hours before converting them, so the
# resulting datetimes stay timezone-naive.
# NOTE(review): presumably entry_time is epoch milliseconds in UTC and the
# target is UTC+8 wall-clock time -- confirm against the data source.
df['dateTime'] = pd.to_datetime(df['entry_time'] + 8 * 3600 * 1000, unit='ms')

# Take an explicit copy: later steps add columns to `data`, and assigning into
# a slice of `df` would raise pandas' SettingWithCopyWarning.
data = df[['dateTime', 'latency', 'jitter', 'loss_percentage']].copy()
2. 数据可视化观测
其实从图上就能看到很明显的周期性,以及随使用率变化的不同峰值结构;当然也有一些测量异常和系统故障点,总体来说还是很不错的数据集。
我们先用一个老牌的 Python 统计模型库 statsmodels,对数据做周期性分解,代码如下:
from statsmodels.tsa.seasonal import seasonal_decompose

# Split the latency series into seasonal, trend and residual components.
# `period` is the number of samples in one seasonal cycle; it replaces the
# `freq` keyword, which newer statsmodels releases no longer accept.
# NOTE(review): 48 presumably means one day of 30-minute samples -- confirm
# against the actual sampling interval of the data.
decomposition = seasonal_decompose(data['latency'], period=48)

# Store each component next to the raw measurements so it can be plotted
# against the same dateTime column.
data['seasonal'] = decomposition.seasonal
data['trend'] = decomposition.trend
data['residual'] = decomposition.resid
然后画个图:
# One (column, legend label, chart title) triple per decomposition component.
component_panels = [
    ('seasonal', '周期性', '周期'),
    ('trend', '延迟', '趋势'),
    ('residual', '残差', '残差'),
]

# Build one single-line chart per component, keeping the original chart names.
chart2, chart3, chart4 = [
    bokeh_multi_line_chart(data, [column], [legend], title=panel_title)
    for column, legend, panel_title in component_panels
]

# Stack the three charts vertically in the notebook output.
show_column([chart2, chart3, chart4])
可以看到趋势线平稳地维持在66~67ms之间。每天的周期性有点好玩:到了晚高峰有些抖动,主要是丢包引起的延迟测量问题;而且5月10日有明显的异常,这些异常可以通过捕获残差来发现并上报。
我们再来看丢包率的数据,周期性可以看到明显的早高峰,下午工作段和晚高峰,而趋势线可以明显的看出节假日丢包率更高
这是一套开箱即用的工具,Prophet通过将全自动预测与在线学习相结合从而保证了该工具能够解决大多数商业业务问题,Prophet工作流程如下图所示:
然后fit一下:
# The package was renamed from `fbprophet` to `prophet` in its 1.0 release;
# try the new name first and fall back so older installations keep working.
try:
    from prophet import Prophet
except ImportError:
    from fbprophet import Prophet

# Prophet expects the timestamp in a column named 'ds' and the value to
# forecast in a column named 'y'.
data['ds'] = data['dateTime']
data['y'] = data['latency']

# Fit the model on the latency history.
m = Prophet(changepoint_prior_scale=0.01).fit(data)

# Extend the timeline by 96 hourly steps beyond the history, then predict over
# the whole range (history plus future).
future = m.make_future_dataframe(periods=96, freq='H')
fcst = m.predict(future)
fig = m.plot(fcst)
同样预测丢包值:
学会了么?AIOps就这么简单这么容易,赶紧找点数据算算写点报告给老板们看吧,加薪了别忘了我就好~~嘿嘿,最后 Bokeh Wrapper代码:
import tabulate as tabulate
import pandas as pd
import numpy as np
import bokeh.plotting
import bokeh.models
import bokeh.layouts
import bokeh.palettes

# Send all chart output to the current Jupyter notebook.
bokeh.plotting.output_notebook()


def _pick_palette(count, small_palette, medium_palette, large_palette):
    """Return the palette matching the number of series to draw.

    Up to 3 series use ``small_palette``, up to 10 use ``medium_palette``,
    anything larger uses ``large_palette``.
    """
    if count <= 3:
        return small_palette
    if count <= 10:
        return medium_palette
    return large_palette


def _color_at(color_list, idx):
    """Return the colour for series ``idx``, cycling when the palette runs out."""
    return color_list[idx % len(color_list)]


def bokeh_multi_line_chart(df, item_list, legend_list, title, width=1900, height=600,
                           legend_location='bottom_left', x_axis='dateTime',
                           x_axis_type='datetime', y_axis_type='auto',
                           line_width=1.5, alpha=0.7):
    """Draw several columns of ``df`` as lines on one shared figure.

    Args:
        df: DataFrame holding the x-axis column and every column in ``item_list``.
        item_list: Names of the columns to plot, one line each.
        legend_list: Legend label for each entry of ``item_list`` (same order).
        title: Figure title.
        width, height: Figure size in pixels.
        legend_location: Bokeh legend location name.
        x_axis: Name of the column used for the x axis.
        x_axis_type, y_axis_type: Axis types passed through to the figure.
        line_width, alpha: Line styling applied to every series.

    Returns:
        The Bokeh figure (not yet shown).
    """
    fig = bokeh.plotting.figure(width=width, height=height,
                                x_axis_type=x_axis_type, y_axis_type=y_axis_type,
                                title=title)
    color_list = _pick_palette(
        len(item_list),
        ['#d25535', '#35b2d2', '#98d235'],
        bokeh.palettes.Category10[10],
        bokeh.palettes.Category20[20],
    )
    for idx, item in enumerate(item_list):
        label = legend_list[idx]
        # `legend_label` replaces the deprecated `legend` keyword, which
        # newer Bokeh releases reject.
        fig.line(df[x_axis], df[item], color=_color_at(color_list, idx),
                 legend_label=label, line_width=line_width, alpha=alpha)
    fig.legend.location = legend_location
    fig.legend.label_text_font_size = "0.8em"
    return fig


def bokeh_hbar_chart(df, categories_col, value_col, title, color='#B2D235',
                     width=400, height=300):
    """Draw a horizontal bar chart of ``value_col`` per ``categories_col`` value.

    Categories are listed on the y axis in reverse row order, so the first row
    of ``df`` ends up at the top of the chart.

    Returns:
        The Bokeh figure (not yet shown).
    """
    categories = list(df[categories_col])[::-1]
    source = bokeh.models.ColumnDataSource(df[[categories_col, value_col]])
    fig = bokeh.plotting.figure(title=title,
                                y_range=bokeh.models.FactorRange(factors=categories),
                                width=width, height=height)
    fig.hbar(left=0, y=categories_col, right=value_col, color=color,
             source=source, height=0.3)
    return fig


def bokeh_vbar_chart(df, categories_col, value_col, title, color='#4F4478',
                     width=600, height=380):
    """Draw a vertical bar chart of ``value_col`` per ``categories_col`` value.

    Returns:
        The Bokeh figure (not yet shown).
    """
    rdf = df[[categories_col, value_col]]
    factors = list(rdf[categories_col])
    fig = bokeh.plotting.figure(title=title, width=width, height=height,
                                x_range=bokeh.models.FactorRange(*factors))
    fig.vbar(bottom=0, top=rdf[value_col], x=factors, color=color,
             width=0.5, alpha=0.8)
    return fig


def bokeh_multi_hbar_chart(df, cat_col, value_list, width=400, height=300):
    """Build one horizontal bar chart per column named in ``value_list``.

    Each chart is titled with its column name and gets its own colour.

    Returns:
        A list of Bokeh figures, in the order of ``value_list``.
    """
    color_list = _pick_palette(
        len(value_list),
        ['#5154eb', '#b2d235', '#df9815'],
        bokeh.palettes.Category10[10],
        bokeh.palettes.Category20[20],
    )
    return [
        bokeh_hbar_chart(df, cat_col, value_name, value_name,
                         color=_color_at(color_list, idx),
                         width=width, height=height)
        for idx, value_name in enumerate(value_list)
    ]


def bokeh_hist_chart(item_list, title, bins=100, width=400, height=300,
                     legend_location='bottom_left'):
    """Overlay density histograms of several data series on one figure.

    Args:
        item_list: Sequence of array-like series, one histogram each.
        title: Figure title.
        bins: Number of histogram bins passed to ``numpy.histogram``.
        width, height: Figure size in pixels.
        legend_location: Accepted for interface compatibility; currently unused
            because the histograms are drawn without legend entries.

    Returns:
        The Bokeh figure (not yet shown).
    """
    fig = bokeh.plotting.figure(width=width, height=height, title=title)
    color_list = _pick_palette(
        len(item_list),
        ['#036564', 'red', 'navy'],
        # bokeh.palettes has no `Category10b`; take the 10-colour entry of
        # Category20b so the medium and large cases share one colour family.
        bokeh.palettes.Category20b[10],
        bokeh.palettes.Category20b[20],
    )
    for idx, series in enumerate(item_list):
        hist, edges = np.histogram(series, density=True, bins=bins)
        fig.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
                 fill_color=_color_at(color_list, idx), line_color="#033649",
                 alpha=0.5)
    return fig


def show_grid(chart_list, num_in_row=4):
    """Show the charts in a grid with at most ``num_in_row`` charts per row."""
    charts = list(chart_list)
    if num_in_row < 1:
        # A non-positive row width never closes a row, so everything lands in
        # a single row (and an empty input yields an empty grid).
        grid_render_matrix = [charts] if charts else []
    else:
        grid_render_matrix = [
            charts[start:start + num_in_row]
            for start in range(0, len(charts), num_in_row)
        ]
    bokeh.plotting.show(bokeh.layouts.gridplot(grid_render_matrix))


def show_column(chart_list):
    """Show the charts stacked vertically."""
    bokeh.plotting.show(bokeh.layouts.column(chart_list))


def show_row(chart_list):
    """Show the charts side by side in one row."""
    bokeh.plotting.show(bokeh.layouts.row(chart_list))
