機器學習算法 - 時間序列系2 - 時序模式實戰

算法實踐分析

直接貼一波代碼,詳細內容後面再分析

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# pylint: disable=E1101
"""
Created on Sat Nov  4 11:04:32 2017

@author: lu
"""

import pandas as pd
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller as ADF

"""
FutureWarning警告:原因未知,在spyder3上運行第二次就消失了,猜測是使用了緩存的原因
attr_trans-->屬性變換
programmer_1-->數據篩選
programmer_2-->平穩性檢測
programmer_3-->白噪聲檢測
programmer_4-->確定最佳p、d、q值,有問題!!!
programmer_5-->模型檢驗
programmer_6-->計算預測誤差
"""

# Base directory of the project; most functions below prepend it to
# "data/..." or "tmp/..." workbook names. Hard-coded absolute path.
PATH = "D:/SoftwareData/Dropbox/MachineLearning/10 kaggleSpareribs/LoadAnalysis/chapter11/"


# Attribute transformation
def attr_trans(x):
    """Collapse the rows of one group into a single wide record.

    :param x: DataFrame with the rows of one group. It must provide the
        columns "SYS_NAME", "COLLECTTIME" and "VALUE" and hold at least
        two rows.
    :return: object-dtype Series whose index is, in order, "SYS_NAME", the
        C: drive label, the D: drive label and "COLLECTTIME".
    :raises IndexError: if the group has fewer than two rows, because the
        second "VALUE" entry is read positionally.
    """
    values = x["VALUE"]
    # Build the record in one step with an explicit object dtype: the fields
    # mix text and numbers, and creating an empty Series without a dtype
    # relies on a pandas default that is deprecated and version-dependent.
    return pd.Series(
        {
            "SYS_NAME": x["SYS_NAME"].iloc[0],  # first row of the group
            "CWXT_DB:184:C:\\": values.iloc[0],  # first "VALUE" row
            "CWXT_DB:184:D:\\": values.iloc[1],  # second "VALUE" row
            "COLLECTTIME": x["COLLECTTIME"].iloc[0],  # first row of the group
        },
        dtype=object,
    )


def programmer_1():
    """Filter the raw disk data and reshape it to one record per timestamp.

    Reads "data/discdata.xls" under PATH, keeps the rows whose TARGET_ID
    equals 184, applies attr_trans to every COLLECTTIME group and writes the
    result, without the index, to "tmp/discdata_processed.xls" under PATH.

    :return: None
    """
    source_path = "{0}data/discdata.xls".format(PATH)
    target_path = "{0}tmp/discdata_processed.xls".format(PATH)

    raw = pd.read_excel(source_path)
    # Work on an independent copy of the TARGET_ID == 184 subset.
    selected = raw[raw["TARGET_ID"] == 184].copy()

    # One attr_trans record per distinct COLLECTTIME value.
    reshaped = selected.groupby("COLLECTTIME").apply(attr_trans)

    # Persist the reshaped table.
    reshaped.to_excel(target_path, index=False)


def programmer_2():
    """Find how many differencing passes make the D: drive series stationary.

    Reads "data/discdata_processed.xls" under PATH, leaves out the last five
    rows, and runs the ADF test on the D: drive column. While the p-value is
    above 0.05 the series is differenced once more and re-tested. The number
    of passes and the final p-value are printed.

    :return: None
    """
    discfile = "{0}data/discdata_processed.xls".format(PATH)
    data = pd.read_excel(discfile)
    # Hold out the last 5 observations, matching the other analysis steps
    # in this file (the value was 0 here, which removed nothing).
    predictnum = 5
    data = data.iloc[:len(data) - predictnum]

    # Stationarity check (the author noted the ADF computation itself was
    # not yet understood).
    series = data["CWXT_DB:184:D:\\"]
    diff = 0
    adf = ADF(series)
    # adf[1] is the value reported below as the p-value.
    while adf[1] > 0.05:
        diff = diff + 1
        # Difference the already-differenced series again so that `diff`
        # is the differencing order; Series.diff(n) would instead subtract
        # the value n rows back, which is a lag-n difference.
        series = series.diff().dropna()
        adf = ADF(series)

    print(u"原始序列經(jīng)過%s階差分后歸于平穩(wěn),p值為%s" % (diff, adf[1]))


def programmer_3():
    """Run the Ljung-Box test on the D: drive series and its first difference.

    Reads "data/discdata_processed.xls" under PATH, leaves out the last five
    rows, and for each of the two series prints whether the p-value at lag 1
    is below 0.05. The statistic of the second test is printed last.

    :return: None
    """
    discfile = "{0}data/discdata_processed.xls".format(PATH)

    frame = pd.read_excel(discfile)
    frame = frame.iloc[:len(frame) - 5]
    series = frame["CWXT_DB:184:D:\\"]

    # The author noted the acorr_ljungbox computation was not yet understood.
    # The nested unpacking pulls the single statistic and single p-value out
    # of the two length-1 results returned for lags=1.
    [[lb], [p]] = acorr_ljungbox(series, lags=1)
    verdict = (
        u"原始序列為非白噪聲序列,對應的p值為:%s"
        if p < 0.05
        else u"原始序列為白噪聲序列,對應的p值為:%s"
    )
    print(verdict % p)

    # Same test on the first-differenced series.
    differenced = series.diff().dropna()
    [[lb], [p]] = acorr_ljungbox(differenced, lags=1)
    verdict = (
        u"一階差分序列為非白噪聲序列,對應的p值為:%s"
        if p < 0.05
        else u"一階差分序列為白噪聲序列,對應的p值為:%s"
    )
    print(verdict % p)
    print(lb)


def programmer_4():
    """Grid-search ARIMA(p, 1, q) orders and print the pair with the lowest BIC.

    Reads "data/discdata_processed.xls" under PATH (indexed by COLLECTTIME),
    leaves out the last five rows, fits one model per (p, q) combination up
    to len(series) // 10 for each order, and prints the combination whose
    BIC is smallest.

    NOTE(review): the original author flagged this step as failing to run
    (question 3); the recorded note does not say why. Confirm on real data.

    :return: None
    """
    # Use the PATH-based location like every other step; the path used to be
    # relative to the current working directory.
    discfile = "{0}data/discdata_processed.xls".format(PATH)

    data = pd.read_excel(discfile, index_col="COLLECTTIME")
    # Leave out the last five observations.
    data = data.iloc[:len(data) - 5]
    xdata = data["CWXT_DB:184:D:\\"]

    # Upper bounds for the orders to try.
    pmax = len(xdata) // 10
    qmax = len(xdata) // 10

    # BIC table: row index is p, column index is q.
    missing = float("nan")
    bic_matrix = []
    for p in range(pmax + 1):
        row = []
        for q in range(qmax + 1):
            try:
                row.append(ARIMA(xdata, (p, 1, q)).fit().bic)
            except Exception:
                # Some orders cannot be fitted; mark them as missing rather
                # than aborting the search. Unlike a bare `except`, this does
                # not swallow KeyboardInterrupt/SystemExit.
                row.append(missing)
        bic_matrix.append(row)

    # A float table with NaN placeholders keeps every column numeric, so the
    # minimum search below never has to compare None with a number.
    bic_matrix = pd.DataFrame(bic_matrix, dtype=float)
    # Flatten to a (p, q)-indexed Series and locate the smallest BIC.
    p, q = bic_matrix.stack().idxmin()
    print(u"BIC最小的p值和q值為:%s、%s" % (p, q))


def programmer_5():
    """Check the residuals of an ARIMA(0, 1, 1) fit with the Ljung-Box test.

    Reads "data/discdata_processed.xls" under PATH (indexed by COLLECTTIME),
    leaves out the last five rows, fits ARIMA(0, 1, 1) to the D: drive
    column, and tests the prediction errors at 12 lags. The model is reported
    as failing the white-noise check if any p-value is below 0.05. The test
    statistics are printed last.

    :return: None
    """
    discfile = "{0}data/discdata_processed.xls".format(PATH)
    # Number of lags examined in the residual test.
    lagnum = 12

    frame = pd.read_excel(discfile, index_col="COLLECTTIME")
    frame = frame.iloc[:len(frame) - 5]
    observed = frame["CWXT_DB:184:D:\\"]

    # Fit the model, predict, and take the residuals (the author noted ARIMA
    # usage as an open question).
    fitted = ARIMA(observed, (0, 1, 1)).fit()
    predicted = fitted.predict(typ="levels")
    residuals = (predicted - observed).dropna()

    lb, p = acorr_ljungbox(residuals, lags=lagnum)
    # Count the lags whose p-value is below 0.05.
    h = (p < 0.05).sum()
    outcome = (
        u"模型ARIMA(0,1,1)不符合白噪聲檢驗"
        if h > 0
        else u"模型ARIMA(0,1,1)符合白噪聲檢驗"
    )
    print(outcome)
    print(lb)


def programmer_6():
    """Print MAE, RMSE and MAPE for the stored predictions.

    Reads "data/predictdata.xls" under PATH and compares its predicted-value
    column with its actual-value column (the two column names are the
    Chinese labels used below).

    :return: None
    """
    # The author noted ARIMA usage as an open question here as well.
    file = "{0}data/predictdata.xls".format(PATH)
    table = pd.read_excel(file)

    # Absolute error per row.
    abs_error = (table[u"預測值"] - table[u"實際值"]).abs()

    mae_ = abs_error.mean()
    mean_squared = (abs_error ** 2).mean()
    rmse_ = mean_squared ** 0.5
    # Error relative to the actual value, averaged over all rows.
    relative_error = abs_error / table[u"實際值"]
    mape_ = relative_error.mean()

    report = u"平均絕對誤差為:%0.4f, \n 均方根誤差為%0.4f, \n平均絕對百分誤差為:%0.6f。"
    print(report % (mae_, rmse_, mape_))


if __name__ == "__main__":
    # The steps are run one at a time: uncomment the one to execute.
    # programmer_1()
    # programmer_2()
    # programmer_3()
    # programmer_4()
    # programmer_5()
    programmer_6()

©著作權歸作者所有,轉載或內容合作請聯繫作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容