diff --git "a/PM2.5 \351\231\210\345\260\217\346\235\276 .md" "b/PM2.5 \351\231\210\345\260\217\346\235\276 .md" new file mode 100644 index 0000000..78a3163 --- /dev/null +++ "b/PM2.5 \351\231\210\345\260\217\346\235\276 .md" @@ -0,0 +1,8 @@ +项目流程: +1.设计模型: + 先整理集合,绘制散点图,因为建立的是Linear Regression模型,结合散点图选择features. +2.定义LOSS函数 + 设计Loss函数,梯度下降 +总结: + 在项目进行中,发现自己python掌握很差,应当多学习学习 + 对LOSS函数的建立,及梯度下降还是不太懂,应加强学习。 \ No newline at end of file diff --git "a/PM2.5 \351\231\210\345\260\217\346\235\276.py" "b/PM2.5 \351\231\210\345\260\217\346\235\276.py" new file mode 100644 index 0000000..b8f1f31 --- /dev/null +++ "b/PM2.5 \351\231\210\345\260\217\346\235\276.py" @@ -0,0 +1,49 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt + +data = pd.read_csv(r'C:\Users\18801\Documents\WeChat Files\wxid_8vs9i3nhli4u22\FileStorage\File\2019-05\2017fall-ml-hw1-master\train.csv') + +pm2_5 = data[data['class']=='PM2.5'].ix[:, 3:] + +tempxlist = [] +tempylist = [] +for i in range(15): + tempx = pm2_5.iloc[:, i:i+9] + tempx.columns = np.array(range(9)) + tempy = pm2_5.iloc[:, i+9] + tempy.columns = ['1'] + tempxlist.append(tempx) + tempylist.append(tempy) +xdata = pd.concat(tempxlist) +x = np.array(xdata, float) +ydata = pd.concat(tempylist) +y = np.array(ydata, float) + +x = np.concatenate((np.ones((x.shape[0], 1)), x), axis=1) + +w = np.zeros((len(x[0]))) +lr = 10 +iteration = 10000 +s_grad = np.zeros(len(x[0])) +for i in range(iteration): + tem = np.dot(x, w) + loss = y - tem + grad = np.dot(x.transpose(), loss)*(-2) + s_grad += grad**2 + ada = np.sqrt(s_grad) + w = w - lr*grad/ada + +testdata = pd.read_csv(r'C:\Users\18801\Documents\WeChat Files\wxid_8vs9i3nhli4u22\FileStorage\File\2019-05\2017fall-ml-hw1-master\test.csv') +pm2_5_test = testdata[testdata['class']=='PM2.5'].ix[:, 2:] +x_test = np.array(pm2_5_test, float) +x_test_b = np.concatenate((np.ones((x_test.shape[0], 1)), x_test), axis=1) +y_star = np.dot(x_test_b, w) +y_pre = pd.read_csv(r'C:\Users\18801\Documents\WeChat Files\wxid_8vs9i3nhli4u22\FileStorage\File\2019-05\2017fall-ml-hw1-master\sampleSubmission.csv', encoding='gbk') +y_pre.value = y_star + +real = pd.read_csv(r'C:\Users\18801\Documents\WeChat Files\wxid_8vs9i3nhli4u22\FileStorage\File\2019-05\2017fall-ml-hw1-master\ans.csv') +erro = abs(y_pre.value - real.value).sum()/len(real.value) +print(erro) + +输出:4.97442948413 \ No newline at end of file diff --git "a/PM2.5_ \345\256\213\346\226\207\345\256\207.py" "b/PM2.5_ \345\256\213\346\226\207\345\256\207.py" new file mode 100644 index 0000000..0ea88d2 --- /dev/null +++ "b/PM2.5_ \345\256\213\346\226\207\345\256\207.py" @@ -0,0 +1,179 @@ +import pandas as pd +import matplotlib.pyplot as plt +from random import randint +from numba import jit + +# @jit +def test(): + '''读取数据''' + data = pd.read_csv('train.csv') #DataFrame类型 + # print(data) + # # del data[:3] + del data['datetime'] + del data['stations'] + del data['observations'] + data.drop([0]) + # print(data) + data = data.replace('NR', 0) + # print(data) + # print(data) + + + '''整理训练集合''' + ItemNum=18 + X_Train=[] #训练样本features集合 + Y_Train=[] #训练样本目标PM2.5集合 + for i in range(int(len(data)/ItemNum)): + day = data[i*ItemNum:(i+1)*ItemNum] #一天的观测数据 + for j in range(15): + x = day.iloc[:, j:j + 9] + y = int(day.iloc[9,j+9]) + X_Train.append(x) + Y_Train.append(y) + + + '''绘制散点图''' + x_AMB_TEMP=[] + x_CH4=[] + x_CO=[] + x_NMHC=[] + y=[] + for i in range(len(Y_Train)): + y.append(Y_Train[i]) + x=X_Train[i] + #求各测项的平均值 + x_AMB_TEMP_sum=0 + x_CH4_sum=0 + x_CO_sum=0 + 
diff --git "a/PM2.5_ \345\256\213\346\226\207\345\256\207.py" "b/PM2.5_ \345\256\213\346\226\207\345\256\207.py"
new file mode 100644
index 0000000..0ea88d2
--- /dev/null
+++ "b/PM2.5_ \345\256\213\346\226\207\345\256\207.py"
@@ -0,0 +1,179 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+from random import randint
+from numba import jit
+
+# @jit
+def test():
+    '''Load the data'''
+    data = pd.read_csv('train.csv')  # DataFrame
+    del data['datetime']
+    del data['stations']
+    del data['observations']
+    data.drop([0])  # no-op: drop() is not in-place; left unassigned, since actually removing a row would misalign the 18-row day blocks below
+    data = data.replace('NR', 0)
+
+    '''Build the training set'''
+    ItemNum = 18                 # 18 measured items per day
+    X_Train = []                 # feature frames
+    Y_Train = []                 # PM2.5 targets
+    for i in range(int(len(data)/ItemNum)):
+        day = data[i*ItemNum:(i+1)*ItemNum]  # one day of observations
+        for j in range(15):
+            x = day.iloc[:, j:j + 9]         # 9-hour window of all items
+            y = int(day.iloc[9, j+9])        # row 9 is PM2.5; hour j+9 is the target
+            X_Train.append(x)
+            Y_Train.append(y)
+
+    '''Scatter plots of candidate features'''
+    x_AMB_TEMP = []
+    x_CH4 = []
+    x_CO = []
+    x_NMHC = []
+    y = []
+    for i in range(len(Y_Train)):
+        y.append(Y_Train[i])
+        x = X_Train[i]
+        # average each measured item over the 9-hour window
+        x_AMB_TEMP_sum = 0
+        x_CH4_sum = 0
+        x_CO_sum = 0
+        x_NMHC_sum = 0
+        for j in range(9):
+            x_AMB_TEMP_sum = x_AMB_TEMP_sum + float(x.iloc[0, j])
+            x_CH4_sum = x_CH4_sum + float(x.iloc[1, j])
+            x_CO_sum = x_CO_sum + float(x.iloc[2, j])
+            x_NMHC_sum = x_NMHC_sum + float(x.iloc[3, j])
+        x_AMB_TEMP.append(x_AMB_TEMP_sum / 9)
+        x_CH4.append(x_CH4_sum / 9)
+        x_CO.append(x_CO_sum / 9)
+        x_NMHC.append(x_NMHC_sum / 9)
+    plt.figure(figsize=(10, 6))
+    plt.subplot(2, 2, 1)
+    plt.title('AMB_TEMP')
+    plt.scatter(x_AMB_TEMP, y)
+    plt.subplot(2, 2, 2)
+    plt.title('CH4')
+    plt.scatter(x_CH4, y)
+    plt.subplot(2, 2, 3)
+    plt.title('CO')
+    plt.scatter(x_CO, y)
+    plt.subplot(2, 2, 4)
+    plt.title('NMHC')
+    plt.scatter(x_NMHC, y)
+    plt.show()
+
+    '''Mini-batch gradient descent'''
+    # Map each of the 27 weights to its feature row: 8 = PM10, 9 = PM2.5, 12 = SO2
+    feature_row = {k: 8 for k in range(9)}
+    feature_row.update({k: 9 for k in range(9, 18)})
+    feature_row.update({k: 12 for k in range(18, 27)})
+    iteration_count = 100        # number of iterations
+    learning_rate = 0.000001     # learning rate
+    b = 0.0001                   # initial bias
+    parameters = [0.001]*27      # initial 27 weights
+    loss_history = []
+    for i in range(iteration_count):
+        loss = 0
+        b_grad = 0
+        w_grad = [0]*27
+        examples = list(randint(0, len(X_Train)-1) for index in range(100))
+        print("pass 1, iteration", i, "\n")
+        for j in range(100):
+            index = examples.pop()
+            day = X_Train[index]
+            # residual: prediction minus target for this sample
+            partsum = b + sum(parameters[k] * float(day.iloc[feature_row[k], k % 9])
+                              for k in range(27)) - Y_Train[index]
+            loss = loss + partsum * partsum
+            b_grad = b_grad + partsum
+            for k in range(27):
+                w_grad[k] = w_grad[k] + partsum * float(day.iloc[feature_row[k], k % 9])
+        loss_history.append(loss/2)
+        # update the parameters
+        b = b - learning_rate * b_grad
+        for t in range(27):
+            parameters[t] = parameters[t] - learning_rate * w_grad[t]
+
+    '''Mini-batch gradient descent (second pass; restarts from the same initial values)'''
+    iteration_count = 100        # number of iterations
+    learning_rate = 0.000001     # learning rate
+    b = 0.0001                   # re-initialize the bias
+    parameters = [0.001]*27      # re-initialize the 27 weights
+    loss_history = []
+    for i in range(iteration_count):
+        print("pass 2, iteration", i, "\n")
+        loss = 0
+        b_grad = 0
+        w_grad = [0]*27
+        examples = list(randint(0, len(X_Train)-1) for index in range(100))
+        for j in range(100):
+            index = examples.pop()
+            day = X_Train[index]
+            partsum = b + sum(parameters[k] * float(day.iloc[feature_row[k], k % 9])
+                              for k in range(27)) - Y_Train[index]
+            loss = loss + partsum * partsum
+            b_grad = b_grad + partsum
+            for k in range(27):
+                w_grad[k] = w_grad[k] + partsum * float(day.iloc[feature_row[k], k % 9])
+        loss_history.append(loss/2)
+        # update the parameters
+        b = b - learning_rate * b_grad
+        for t in range(27):
+            parameters[t] = parameters[t] - learning_rate * w_grad[t]
+
+    '''Evaluate the model'''
+    data1 = pd.read_csv('test.csv')
+    del data1['id']
+    del data1['item']
+
+    X_Test = []
+    ItemNum = 18
+    for i in range(int(len(data1)/ItemNum)):
+        day = data1[i*ItemNum:(i+1)*ItemNum]  # one day of observations
+        X_Test.append(day)
+    Y_Test = []
+    data2 = pd.read_csv('answer.csv')
+    for i in range(len(data2)):
+        Y_Test.append(data2.iloc[i, 1])
+    # Best bias and weights reported by the referenced blog post
+    b = 0.00371301266193
+    parameters = [-0.0024696993501677625, 0.0042664323568029619, -0.0086174899917209787, -0.017547874680980298, -0.01836289806786489, -0.0046459546176775678, -0.031425910733080147, 0.018037490234208024, 0.17448898242705385, 0.037982590870111861, 0.025666115101346722, 0.02295437149703404, 0.014272058968395849, 0.011573452230087483, 0.010984971346586308, -0.0061003639742210781, 0.19310213021199321, 0.45973205224805752, -0.0034995637680653086, 0.00094072189075279807, 0.00069329550591916357, 0.002966257320079194, 0.0050690506276038138, 0.007559004246038563, 0.013296350700555241, 0.027251049329127801, 0.039423988570899793]
+    Y_predict = []
+    for i in range(len(X_Test)):
+        day = X_Test[i]
+        p = b + sum(parameters[k] * float(day.iloc[feature_row[k], k % 9]) for k in range(27))
+        Y_predict.append(p)
+    def dev_degree(y_true, y_predict):  # evaluation: mean squared error
+        total = 0
+        for i in range(len(y_predict)):
+            total = total + (y_true[i]-y_predict[i])*(y_true[i]-y_predict[i])
+        return total/len(y_predict)
+    print(dev_degree(Y_Test, Y_predict))
+
+def main():
+    test()
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
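The hand-written 27-term sum in `partsum` (rewritten above as a loop over `feature_row`) can also be collapsed into a single dot product. A minimal sketch, assuming the same three feature rows and that `day` is the per-day DataFrame used above; the helper names are illustrative, not from the original:

```python
import numpy as np

# Rows of the day frame the model reads: 8 = PM10, 9 = PM2.5, 12 = SO2
FEATURE_ROWS = [8, 9, 12]

def day_features(day):
    """Flatten the 3 selected rows x 9 hours into a length-27 vector,
    in the same order the 27 hand-written terms used."""
    return np.array([float(day.iloc[r, h]) for r in FEATURE_ROWS for h in range(9)])

def predict(day, b, parameters):
    """b + w . phi(day): equivalent to the explicit 27-term sum."""
    return b + np.dot(np.asarray(parameters, float), day_features(day))
```

Converting every training window to a vector once, before the descent loop, would also remove the repeated `iloc` calls that dominate the run time here.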
diff --git "a/PM2.5_\344\272\216\351\207\221\346\263\275.md" "b/PM2.5_\344\272\216\351\207\221\346\263\275.md"
new file mode 100644
index 0000000..d25479b
--- /dev/null
+++ "b/PM2.5_\344\272\216\351\207\221\346\263\275.md"
@@ -0,0 +1,15 @@
+# Approach
+- Read the file and extract the PM2.5 data
+- Group the data into 9-hour windows and store them in a list
+- Append a column of 1s to x to serve as the bias term
+- Update the weights w in a loop
+- Read the test file, compute the deviation between predicted and true values, and calculate the mean error
+
+# What I learned
+- Implemented the Simpsons character classifier, including video processing
+- Studied how LSTMs and RNNs work
+
+# Learning goals
+- Complete at least 5 Kaggle projects, implemented in PyTorch
+- Build a simple chatbot
+- Build a Scrapy project that crawls WeChat official accounts, with a UI if possible
diff --git "a/PM2.5_\344\272\216\351\207\221\346\263\275.py" "b/PM2.5_\344\272\216\351\207\221\346\263\275.py"
new file mode 100644
index 0000000..a17a186
--- /dev/null
+++ "b/PM2.5_\344\272\216\351\207\221\346\263\275.py"
@@ -0,0 +1,48 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+
+data = pd.read_csv('train.csv', encoding='gbk')
+
+# Keep only the PM2.5 rows (.iloc replaces the removed .ix)
+pm2_5 = data[data['class']=='PM2.5'].iloc[:, 3:]
+
+# 15 overlapping 9-hour windows per day; the 10th hour is the target
+tempxlist = []
+tempylist = []
+for i in range(15):
+    tempx = pm2_5.iloc[:, i:i+9]
+    tempx.columns = np.array(range(9))
+    tempy = pm2_5.iloc[:, i+9]
+    tempy.name = 'PM2.5'  # a Series has .name, not .columns
+    tempxlist.append(tempx)
+    tempylist.append(tempy)
+xdata = pd.concat(tempxlist)
+x = np.array(xdata, float)
+ydata = pd.concat(tempylist)
+y = np.array(ydata, float)
+
+# Column of ones acts as the bias feature
+x = np.concatenate((np.ones((x.shape[0], 1)), x), axis=1)
+
+# Linear regression with Adagrad
+w = np.zeros((len(x[0])))
+lr = 10
+iteration = 10000
+s_grad = np.zeros(len(x[0]))
+for i in range(iteration):
+    tem = np.dot(x, w)
+    loss = y - tem
+    grad = np.dot(x.transpose(), loss)*(-2)
+    s_grad += grad**2
+    ada = np.sqrt(s_grad) + 1e-8  # epsilon avoids division by zero
+    w = w - lr*grad/ada
+
+testdata = pd.read_csv('test.csv', encoding='gbk')
+pm2_5_test = testdata[testdata['class']=='PM2.5'].iloc[:, 2:]
+x_test = np.array(pm2_5_test, float)
+x_test_b = np.concatenate((np.ones((x_test.shape[0], 1)), x_test), axis=1)
+y_star = np.dot(x_test_b, w)
+y_pre = pd.read_csv('sampleSubmission.csv', encoding='gbk')
+y_pre['value'] = y_star
+
+real = pd.read_csv('ans.csv', encoding='gbk')
+error = abs(y_pre['value'] - real.value).sum()/len(real.value)  # mean absolute error
+print(error)
+
diff --git "a/PM2.5_\345\220\264\350\266\212.md" "b/PM2.5_\345\220\264\350\266\212.md"
new file mode 100644
index 0000000..761df1f
--- /dev/null
+++ "b/PM2.5_\345\220\264\350\266\212.md"
@@ -0,0 +1,41 @@
+# PM2.5 Data Analysis & Prediction, and Recent Study Report #
+
+## Completed ##
+
+- Gained a general understanding of the article's approach to PM2.5 analysis and prediction; after preprocessing the CSV files I got the code running, though questions remain about the core gradient-descent part of the code
+
+- From the 18 air-quality indicators, the article draws scatter plots of each indicator's previous 9 hours against the PM2.5 value to be predicted and uses them to screen out the key features. As for the screening criterion, I believe it is how strongly positive the correlation in the scatter plot is; the author ultimately chose SO2, PM10, and PM2.5 as the three key features for the subsequent computation, and all three plots do trend toward positive correlation
+
+- With three features, each contributing its previous 9 hours of data, 27 weights enter the gradient-descent computation
+![](https://i.imgur.com/XI2Iq7W.png)
+
+## Open questions (problems while running the code) ##
+
+- The blog author's final evaluation gives his best 27 weights, but the loss I computed with them is 73.65, which differs substantially from the 45.68 reported in the blog
+
+- I tried running the mini-batch gradient descent myself and got enormous weights; the core iteration code has serious problems that I am still working through (iterating with the original code, the weights grow so large that the final loss becomes Inf — honestly startling)
+
+## Study progress ##
+
+- Watched two episodes of Hung-yi Lee's course, sleepwalking through his Pokémon-flavored examples; with a teacher like this, students certainly won't doze off
+
+- Started learning the TensorFlow framework through a MOOC, currently at neural-network regularization; TensorFlow's functions come in endless varieties, and as a beginner I can't yet remember their exact usage and use cases
+
+- Finished the Simpsons character classification task; first time using the popular data platform Kaggle — clumsily, and I hope more practice will bring familiarity
+
+5/13/2019 12:22:49 AM
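One plausible cause of the exploding weights described above (loss going to Inf) is that the raw features span very different scales, so a single fixed learning rate overshoots on the large-valued ones. Below is a sketch of per-column standardization, a standard remedy that often stabilizes plain gradient descent; `X` stands for a stacked (samples x features) numpy matrix, which is an assumption about how the data would be arranged, not code from the original:

```python
import numpy as np

def standardize(X):
    """Scale each feature column to zero mean and unit variance; return the
    statistics so the identical transform can be applied to the test data."""
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    std[std == 0] = 1.0  # constant columns: avoid dividing by zero
    return (X - mean) / std, mean, std

# X_scaled, mu, sigma = standardize(X_train)   # fit on the training set only
# X_test_scaled = (X_test - mu) / sigma        # reuse the training statistics
```

Gradient clipping or simply lowering the learning rate further are alternative remedies; none of these is necessarily what the referenced blog did — they are just standard options to try.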
diff --git "a/PM2.5_\345\220\264\350\266\212.py" "b/PM2.5_\345\220\264\350\266\212.py"
new file mode 100644
index 0000000..2cc0560
--- /dev/null
+++ "b/PM2.5_\345\220\264\350\266\212.py"
@@ -0,0 +1,161 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from random import randint
+
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+'''Load the data'''
+data = pd.read_csv('train.csv')  # DataFrame
+# trim the table
+del data['datetime']
+del data['item']
+del data['location']
+data.drop([0])  # no-op: drop() is not in-place; left unassigned to keep the 18-row day blocks aligned
+data = data.replace("NR", 0)
+
+'''Build the training set'''
+ItemNum = 18
+X_Train = []  # feature frames
+Y_Train = []  # PM2.5 targets
+for i in range(int(len(data)/ItemNum)):
+    day = data[i*ItemNum:(i+1)*ItemNum]  # one day of observations
+    for j in range(15):
+        x = day.iloc[:, j:j + 9]
+        y = int(day.iloc[9, j+9])  # row 9 is PM2.5
+        X_Train.append(x)
+        Y_Train.append(y)
+
+'''Scatter plots'''
+x_AMB_TEMP = []
+x_CH4 = []
+x_CO = []
+x_NMHC = []
+x_SO2 = []
+x_NO2 = []
+x_NOX = []
+x_PM10 = []
+x_PM2 = []
+y = []
+for i in range(len(Y_Train)):
+    y.append(Y_Train[i])
+    x = X_Train[i]
+    # average each measured item over the 9-hour window
+    x_AMB_TEMP_sum = 0
+    x_CH4_sum = 0
+    x_CO_sum = 0
+    x_NMHC_sum = 0
+    x_SO2_sum = 0
+    x_NO2_sum = 0
+    x_NOX_sum = 0
+    x_PM10_sum = 0
+    x_PM2_sum = 0
+
+    for j in range(9):
+        x_AMB_TEMP_sum = x_AMB_TEMP_sum + float(x.iloc[0, j])
+        x_CH4_sum = x_CH4_sum + float(x.iloc[1, j])
+        x_CO_sum = x_CO_sum + float(x.iloc[2, j])
+        x_NMHC_sum = x_NMHC_sum + float(x.iloc[3, j])
+        x_SO2_sum = x_SO2_sum + float(x.iloc[12, j])  # SO2 is row 12
+        x_NO2_sum = x_NO2_sum + float(x.iloc[5, j])
+        x_NOX_sum = x_NOX_sum + float(x.iloc[6, j])
+        x_PM10_sum = x_PM10_sum + float(x.iloc[8, j])
+        x_PM2_sum = x_PM2_sum + float(x.iloc[9, j])
+    x_AMB_TEMP.append(x_AMB_TEMP_sum / 9)
+    x_CH4.append(x_CH4_sum / 9)
+    x_CO.append(x_CO_sum / 9)
+    x_NMHC.append(x_NMHC_sum / 9)
+    x_NO2.append(x_NO2_sum / 9)
+    x_NOX.append(x_NOX_sum / 9)
+    x_SO2.append(x_SO2_sum / 9)
+    x_PM10.append(x_PM10_sum / 9)
+    x_PM2.append(x_PM2_sum / 9)
+plt.figure(figsize=(10, 6))
+plt.subplot(2, 5, 1)
+plt.title('AMB_TEMP')
+plt.scatter(x_AMB_TEMP, y)
+plt.subplot(2, 5, 2)
+plt.title('CH4')
+plt.scatter(x_CH4, y)
+plt.subplot(2, 5, 3)
+plt.title('CO')
+plt.scatter(x_CO, y)
+plt.subplot(2, 5, 4)
+plt.title('NMHC')
+plt.scatter(x_NMHC, y)
+plt.subplot(2, 5, 5)
+plt.title('SO2')
+plt.scatter(x_SO2, y)
+plt.subplot(2, 5, 6)
+plt.title('NO2')
+plt.scatter(x_NO2, y)
+plt.subplot(2, 5, 7)
+plt.title('NOX')
+plt.scatter(x_NOX, y)
+plt.subplot(2, 5, 8)
+plt.title('PM 10')
+plt.scatter(x_PM10, y)
+plt.subplot(2, 5, 9)
+plt.title('PM 2.5')
+plt.scatter(x_PM2, y)
+plt.show()
+
+'''Mini-batch gradient descent'''
+# Map each of the 27 weights to its feature row: 8 = PM10, 9 = PM2.5, 12 = SO2
+feature_row = {k: 8 for k in range(9)}
+feature_row.update({k: 9 for k in range(9, 18)})
+feature_row.update({k: 12 for k in range(18, 27)})
+iteration_count = 200        # number of iterations
+learning_rate = 0.000001     # learning rate
+b = 0.0001                   # initial bias
+parameters = [0.001]*27      # initial 27 weights
+loss_history = []
+for i in range(iteration_count):
+    loss = 0
+    b_grad = 0
+    w_grad = [0]*27
+    examples = list(randint(0, len(X_Train)-1) for index in range(100))
+    for j in range(100):
+        index = examples.pop()
+        day = X_Train[index]
+        # residual: prediction minus target for this sample
+        partsum = b + sum(parameters[k] * float(day.iloc[feature_row[k], k % 9])
+                          for k in range(27)) - Y_Train[index]
+        loss = loss + partsum * partsum
+        b_grad = b_grad + partsum
+        for k in range(27):
+            w_grad[k] = w_grad[k] + partsum * float(day.iloc[feature_row[k], k % 9])
+    loss_history.append(loss/2)
+    # update the parameters
+    b = b - learning_rate * b_grad
+    for t in range(27):
+        parameters[t] = parameters[t] - learning_rate * w_grad[t]
+    if i % 100 == 0:
+        print(i)
+
+'''Evaluate the model'''
+data1 = pd.read_csv('test.csv')
+# trim the table
+del data1['id']
+del data1['item']
+data1.drop([0])  # no-op, as above
+data1 = data1.replace("NR", 0)  # the test frame also contains 'NR' strings
+X_Test = []
+ItemNum = 18
+for i in range(int(len(data1)/ItemNum)):
+    day = data1[i*ItemNum:(i+1)*ItemNum]  # one day of observations
+    X_Test.append(day)
+Y_Test = []
+data2 = pd.read_csv('ans.csv')
+for i in range(len(data2)):
+    Y_Test.append(data2.iloc[i, 1])
+# Best bias and weights reported by the referenced blog post
+b = 0.00371301266193
+parameters = [-0.0024696993501677625, 0.0042664323568029619, -0.0086174899917209787, -0.017547874680980298, -0.01836289806786489, -0.0046459546176775678, -0.031425910733080147, 0.018037490234208024, 0.17448898242705385, 0.037982590870111861, 0.025666115101346722, 0.02295437149703404, 0.014272058968395849, 0.011573452230087483, 0.010984971346586308, -0.0061003639742210781, 0.19310213021199321, 0.45973205224805752, -0.0034995637680653086, 0.00094072189075279807, 0.00069329550591916357, 0.002966257320079194, 0.0050690506276038138, 0.007559004246038563, 0.013296350700555241, 0.027251049329127801, 0.039423988570899793]
+Y_predict = []
+for i in range(len(X_Test)):
+    day = X_Test[i]
+    p = b + sum(parameters[k] * float(day.iloc[feature_row[k], k % 9]) for k in range(27))
+    Y_predict.append(p)
+def dev_degree(y_true, y_predict):  # evaluation: mean squared error
+    total = 0
+    for i in range(len(y_predict)):
+        total = total + (y_true[i]-y_predict[i])*(y_true[i]-y_predict[i])
+    return total/len(y_predict)
+print(dev_degree(Y_Test, Y_predict))
diff --git "a/PM2.5_\350\275\246\345\255\220\346\235\260.md" "b/PM2.5_\350\275\246\345\255\220\346\235\260.md"
new file mode 100644
index 0000000..ee1efdb
--- /dev/null
+++ "b/PM2.5_\350\275\246\345\255\220\346\235\260.md"
@@ -0,0 +1,24 @@
+
+## Approach
+> Import the pandas library and read test.csv and train.csv to load the data
+> Group every 9 hours of data into one sample and store it in data
+> From the loaded data, compute the predictions and their deviation from the true values
+> Compute the mean error
+
+## Results
+> Successfully obtained the mean error,
+but it is relatively large.
+Did not implement the scatter plotting given in the other CSDN post; still learning how to use plt (a minimal sketch follows this file)
+
+## What I learned
+> Watched Hung-yi Lee's videos up to the PM2.5 lecture
+Learned basic Python usage
+
+## Next steps
+> Keep studying Python in more depth and learn how other libraries are used
+Continue with Hung-yi Lee's videos
+Try the Simpsons classification problem
+
+
+
+
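Since the report above mentions that the scatter plotting was left unimplemented while `plt` is still being learned, here is a minimal matplotlib sketch of the feature-vs-target plot the other scripts in this diff produce; the argument names are placeholders, not identifiers from the original files:

```python
import matplotlib.pyplot as plt

def scatter_feature(x_feature, y, title):
    """x_feature: one 9-hour mean per sample; y: the matching PM2.5 targets."""
    plt.figure(figsize=(6, 4))
    plt.scatter(x_feature, y, s=8)
    plt.title(title)
    plt.xlabel('9-hour mean of ' + title)
    plt.ylabel('PM2.5 at the 10th hour')
    plt.show()

# e.g. scatter_feature(x_PM10, y, 'PM10')
```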
"a/PM2.5_\350\275\246\345\255\220\346\235\260.py" "b/PM2.5_\350\275\246\345\255\220\346\235\260.py" new file mode 100644 index 0000000..230975a --- /dev/null +++ "b/PM2.5_\350\275\246\345\255\220\346\235\260.py" @@ -0,0 +1,46 @@ +import pandas as pd +import numpy as np + +data = pd.read_csv('train.csv', encoding='gbk') +pm2_5 = data[data["class"]=="PM2.5"].ix[:, 3:] + +tempxlist = [] +tempylist = [] +for i in range(15): + tempx = pm2_5.iloc[:, i:i+9] + tempx.columns = np.array(range(9)) + tempy = pm2_5.iloc[:, i+9] + tempy.columns = ['1'] + tempxlist.append(tempx) + tempylist.append(tempy) +xdata = pd.concat(tempxlist) +x = np.array(xdata, float) +ydata = pd.concat(tempylist) +y = np.array(ydata, float) + +x = np.concatenate((np.ones((x.shape[0], 1)), x), axis=1) + +w = np.zeros((len(x[0]))) +lr = 10 +iteration = 10000 +s_grad = np.zeros(len(x[0])) +for i in range(iteration): + tem = np.dot(x, w) + loss = y - tem + grad = np.dot(x.transpose(), loss)*(-2) + s_grad += grad**2 + ada = np.sqrt(s_grad) + w = w - lr*grad/ada +'''''' + +testdata = pd.read_csv('test.csv', encoding='gbk') +pm2_5_test = testdata[testdata['class']=='PM2.5'].ix[:, 2:] +x_test = np.array(pm2_5_test, float) +x_test_b = np.concatenate((np.ones((x_test.shape[0], 1)), x_test), axis=1) +y_star = np.dot(x_test_b, w) +y_pre = pd.read_csv('sampleSubmission.csv', encoding='gbk') +y_pre.value = y_star + +real = pd.read_csv('ans.csv', encoding='gbk') +erro = abs(y_pre.value - real.value).sum()/len(real.value) +print(erro) diff --git a/PM2.5test.md b/PM2.5test.md new file mode 100644 index 0000000..56cb34f --- /dev/null +++ b/PM2.5test.md @@ -0,0 +1,13 @@ +###用训练集和测试集进行训练并测试,最终生成结果如图所示。 +一共分为四个步骤: +设计模型,定义Loss函数,评价模型。 +### 项目简介 +建立一个模型,根据前9个小时的观测数据预测此时间点的PM2.5值。训练数据集为train.csv,测试数据集为test.csv。 +本次采用的是Linear Regression模型,此模型的关键在于选择features,为此我绘制了训练数据集的各个测项和PM2.5的散点图,通过对散点图的分析,初步选择我认为必要的feature来建立模型。 +###定义LOSS函数 +由于本文采用小批量梯度下降算法,所以本文的LOSS函数定义如下 +###评价模型 + 评价模型必须有一个评价标准和测试样本。评价标准定义为若预测值与真实值平均偏差程度低,则模型可认为比较好。其中预测值与真实值平均偏差程度为预测值和真实值之差的平方和的平均值。 + + + diff --git "a/PM2.5\345\274\240\351\235\226.md" "b/PM2.5\345\274\240\351\235\226.md" new file mode 100644 index 0000000..5e56a14 --- /dev/null +++ "b/PM2.5\345\274\240\351\235\226.md" @@ -0,0 +1,11 @@ +## 李宏毅PM2.5问题 ## +建立一个模型,根据前9个小时的观测数据预测此时间点的PM2.5值 +##实现思路## +从训练数据中提取出连续十个小时的观测数据,最后一个小时的PM2.5作为该条数据的类标签,而前九个小时的PM2.5值作为特征。一天24个小时,一天内总共有24-10+1 =15条记录。 +将cha更新 +读取测试文件,计算预测值和真实值的偏差,计算平均误差 +## 学习内容 ## +部分python基础知识 +吴恩达第一、二周课程 +## 预期学习 ## +吴恩达第三四周课程,继续学习python \ No newline at end of file diff --git "a/PM2.5\345\274\240\351\235\226.py" "b/PM2.5\345\274\240\351\235\226.py" new file mode 100644 index 0000000..b012e9c --- /dev/null +++ "b/PM2.5\345\274\240\351\235\226.py" @@ -0,0 +1,48 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt + +data = pd.read_csv(r'C:\Users\zjnorton\Desktop\2017fall-ml-hw1-master\train.csv') + +pm2_5 = data[data['class']=='PM2.5'].ix[:, 3:] + +tempxlist = [] +tempylist = [] +for i in range(15): + tempx = pm2_5.iloc[:, i:i+9] + tempx.columns = np.array(range(9)) + tempy = pm2_5.iloc[:, i+9] + tempy.columns = ['1'] + tempxlist.append(tempx) + tempylist.append(tempy) +xdata = pd.concat(tempxlist) +x = np.array(xdata, float) +ydata = pd.concat(tempylist) +y = np.array(ydata, float) + +x = np.concatenate((np.ones((x.shape[0], 1)), x), axis=1) + +w = np.zeros((len(x[0]))) +lr = 10 +iteration = 10000 +s_grad = np.zeros(len(x[0])) +for i in range(iteration): + tem = np.dot(x, w) + loss = y - tem + grad = np.dot(x.transpose(), loss)*(-2) + s_grad += 
diff --git "a/PM2.5\345\274\240\351\235\226.md" "b/PM2.5\345\274\240\351\235\226.md"
new file mode 100644
index 0000000..5e56a14
--- /dev/null
+++ "b/PM2.5\345\274\240\351\235\226.md"
@@ -0,0 +1,11 @@
+## Hung-yi Lee's PM2.5 Problem ##
+Build a model that predicts the PM2.5 value at a given time point from the previous 9 hours of observations.
+## Approach ##
+Extract 10 consecutive hours of observations from the training data; the last hour's PM2.5 value is the label of the record, and the previous nine hours of PM2.5 values are the features. With 24 hours in a day, each day yields 24 - 10 + 1 = 15 records.
+Update the weights iteratively.
+Read the test file, compute the deviation between predicted and true values, and calculate the mean error.
+## What I learned ##
+Some Python basics
+Weeks 1 and 2 of Andrew Ng's course
+## Learning goals ##
+Weeks 3 and 4 of Andrew Ng's course; continue learning Python
\ No newline at end of file
diff --git "a/PM2.5\345\274\240\351\235\226.py" "b/PM2.5\345\274\240\351\235\226.py"
new file mode 100644
index 0000000..b012e9c
--- /dev/null
+++ "b/PM2.5\345\274\240\351\235\226.py"
@@ -0,0 +1,48 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+
+data = pd.read_csv(r'C:\Users\zjnorton\Desktop\2017fall-ml-hw1-master\train.csv')
+
+# Keep only the PM2.5 rows (.iloc replaces the removed .ix)
+pm2_5 = data[data['class']=='PM2.5'].iloc[:, 3:]
+
+# 15 overlapping 9-hour windows per day; the 10th hour is the target
+tempxlist = []
+tempylist = []
+for i in range(15):
+    tempx = pm2_5.iloc[:, i:i+9]
+    tempx.columns = np.array(range(9))
+    tempy = pm2_5.iloc[:, i+9]
+    tempy.name = 'PM2.5'  # a Series has .name, not .columns
+    tempxlist.append(tempx)
+    tempylist.append(tempy)
+xdata = pd.concat(tempxlist)
+x = np.array(xdata, float)
+ydata = pd.concat(tempylist)
+y = np.array(ydata, float)
+
+# Column of ones acts as the bias feature
+x = np.concatenate((np.ones((x.shape[0], 1)), x), axis=1)
+
+# Linear regression with Adagrad
+w = np.zeros((len(x[0])))
+lr = 10
+iteration = 10000
+s_grad = np.zeros(len(x[0]))
+for i in range(iteration):
+    tem = np.dot(x, w)
+    loss = y - tem
+    grad = np.dot(x.transpose(), loss)*(-2)
+    s_grad += grad**2
+    ada = np.sqrt(s_grad) + 1e-8  # epsilon avoids division by zero
+    w = w - lr*grad/ada
+
+testdata = pd.read_csv(r'C:\Users\zjnorton\Desktop\2017fall-ml-hw1-master\test.csv')
+pm2_5_test = testdata[testdata['class']=='PM2.5'].iloc[:, 2:]
+x_test = np.array(pm2_5_test, float)
+x_test_b = np.concatenate((np.ones((x_test.shape[0], 1)), x_test), axis=1)
+y_star = np.dot(x_test_b, w)
+y_pre = pd.read_csv(r'C:\Users\zjnorton\Desktop\2017fall-ml-hw1-master\sampleSubmission.csv', encoding='gbk')
+y_pre['value'] = y_star
+
+real = pd.read_csv(r'C:\Users\zjnorton\Desktop\2017fall-ml-hw1-master\ans.csv')
+error = abs(y_pre['value'] - real.value).sum()/len(real.value)  # mean absolute error
+print(error)
+
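All the Adagrad scripts in this diff build their training windows with the same pandas concat loop. An equivalent numpy sketch, assuming `pm2_5` has already been converted to a `(days, 24)` float array (that conversion step is assumed, not shown in the originals):

```python
import numpy as np

def make_windows(pm2_5):
    """pm2_5: (days, 24) array of hourly readings. Returns X with shape
    (days*15, 9) and y with shape (days*15,): 15 overlapping windows per
    day, hours i..i+8 as features and hour i+9 as the target."""
    xs, ys = [], []
    for i in range(15):
        xs.append(pm2_5[:, i:i + 9])
        ys.append(pm2_5[:, i + 9])
    return np.concatenate(xs), np.concatenate(ys)
```

The concatenation order (all days for window 0, then all days for window 1, and so on) matches what the pandas version produces.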
diff --git a/pm2.5test.py b/pm2.5test.py
new file mode 100644
index 0000000..73ce984
--- /dev/null
+++ b/pm2.5test.py
@@ -0,0 +1,179 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+from random import randint
+from numba import jit
+
+# @jit
+def test():
+    '''Load the data'''
+    data = pd.read_csv('train.csv')  # DataFrame
+    del data['datetime']
+    del data['stations']
+    del data['observations']
+    data.drop([0])  # no-op: drop() is not in-place; left unassigned to keep the 18-row day blocks aligned
+    data = data.replace('NR', 0)
+
+    '''Build the training set'''
+    ItemNum = 18                 # 18 measured items per day
+    X_Train = []                 # feature frames
+    Y_Train = []                 # PM2.5 targets
+    for i in range(int(len(data)/ItemNum)):
+        day = data[i*ItemNum:(i+1)*ItemNum]  # one day of observations
+        for j in range(15):
+            x = day.iloc[:, j:j + 9]         # 9-hour window of all items
+            y = int(day.iloc[9, j+9])        # row 9 is PM2.5; hour j+9 is the target
+            X_Train.append(x)
+            Y_Train.append(y)
+
+    '''Scatter plots of candidate features'''
+    x_AMB_TEMP = []
+    x_CH4 = []
+    x_CO = []
+    x_NMHC = []
+    y = []
+    for i in range(len(Y_Train)):
+        y.append(Y_Train[i])
+        x = X_Train[i]
+        # average each measured item over the 9-hour window
+        x_AMB_TEMP_sum = 0
+        x_CH4_sum = 0
+        x_CO_sum = 0
+        x_NMHC_sum = 0
+        for j in range(9):
+            x_AMB_TEMP_sum = x_AMB_TEMP_sum + float(x.iloc[0, j])
+            x_CH4_sum = x_CH4_sum + float(x.iloc[1, j])
+            x_CO_sum = x_CO_sum + float(x.iloc[2, j])
+            x_NMHC_sum = x_NMHC_sum + float(x.iloc[3, j])
+        x_AMB_TEMP.append(x_AMB_TEMP_sum / 9)
+        x_CH4.append(x_CH4_sum / 9)
+        x_CO.append(x_CO_sum / 9)
+        x_NMHC.append(x_NMHC_sum / 9)
+    plt.figure(figsize=(10, 6))
+    plt.subplot(2, 2, 1)
+    plt.title('AMB_TEMP')
+    plt.scatter(x_AMB_TEMP, y)
+    plt.subplot(2, 2, 2)
+    plt.title('CH4')
+    plt.scatter(x_CH4, y)
+    plt.subplot(2, 2, 3)
+    plt.title('CO')
+    plt.scatter(x_CO, y)
+    plt.subplot(2, 2, 4)
+    plt.title('NMHC')
+    plt.scatter(x_NMHC, y)
+    plt.show()
+
+    '''Mini-batch gradient descent'''
+    # Map each of the 27 weights to its feature row: 8 = PM10, 9 = PM2.5, 12 = SO2
+    feature_row = {k: 8 for k in range(9)}
+    feature_row.update({k: 9 for k in range(9, 18)})
+    feature_row.update({k: 12 for k in range(18, 27)})
+    iteration_count = 10         # number of iterations
+    learning_rate = 0.000001     # learning rate
+    b = 0.0001                   # initial bias
+    parameters = [0.001]*27      # initial 27 weights
+    loss_history = []
+    for i in range(iteration_count):
+        loss = 0
+        b_grad = 0
+        w_grad = [0]*27
+        examples = list(randint(0, len(X_Train)-1) for index in range(100))
+        print("pass 1, iteration", i, "\n")
+        for j in range(100):
+            index = examples.pop()
+            day = X_Train[index]
+            # residual: prediction minus target for this sample
+            partsum = b + sum(parameters[k] * float(day.iloc[feature_row[k], k % 9])
+                              for k in range(27)) - Y_Train[index]
+            loss = loss + partsum * partsum
+            b_grad = b_grad + partsum
+            for k in range(27):
+                w_grad[k] = w_grad[k] + partsum * float(day.iloc[feature_row[k], k % 9])
+        loss_history.append(loss/2)
+        # update the parameters
+        b = b - learning_rate * b_grad
+        for t in range(27):
+            parameters[t] = parameters[t] - learning_rate * w_grad[t]
+
+    '''Mini-batch gradient descent (second pass; restarts from the same initial values)'''
+    iteration_count = 10         # number of iterations
+    learning_rate = 0.000001     # learning rate
+    b = 0.0001                   # re-initialize the bias
+    parameters = [0.001]*27      # re-initialize the 27 weights
+    loss_history = []
+    for i in range(iteration_count):
+        print("pass 2, iteration", i, "\n")
+        loss = 0
+        b_grad = 0
+        w_grad = [0]*27
+        examples = list(randint(0, len(X_Train)-1) for index in range(100))
+        for j in range(100):
+            index = examples.pop()
+            day = X_Train[index]
+            partsum = b + sum(parameters[k] * float(day.iloc[feature_row[k], k % 9])
+                              for k in range(27)) - Y_Train[index]
+            loss = loss + partsum * partsum
+            b_grad = b_grad + partsum
+            for k in range(27):
+                w_grad[k] = w_grad[k] + partsum * float(day.iloc[feature_row[k], k % 9])
+        loss_history.append(loss/2)
+        # update the parameters
+        b = b - learning_rate * b_grad
+        for t in range(27):
+            parameters[t] = parameters[t] - learning_rate * w_grad[t]
+
+    '''Evaluate the model'''
+    data1 = pd.read_csv('test.csv')
+    del data1['id']
+    del data1['item']
+
+    X_Test = []
+    ItemNum = 18
+    for i in range(int(len(data1)/ItemNum)):
+        day = data1[i*ItemNum:(i+1)*ItemNum]  # one day of observations
+        X_Test.append(day)
+    Y_Test = []
+    data2 = pd.read_csv('answer.csv')
+    for i in range(len(data2)):
+        Y_Test.append(data2.iloc[i, 1])
+    # Best bias and weights reported by the referenced blog post
+    b = 0.00371301266193
+    parameters = [-0.0024696993501677625, 0.0042664323568029619, -0.0086174899917209787, -0.017547874680980298, -0.01836289806786489, -0.0046459546176775678, -0.031425910733080147, 0.018037490234208024, 0.17448898242705385, 0.037982590870111861, 0.025666115101346722, 0.02295437149703404, 0.014272058968395849, 0.011573452230087483, 0.010984971346586308, -0.0061003639742210781, 0.19310213021199321, 0.45973205224805752, -0.0034995637680653086, 0.00094072189075279807, 0.00069329550591916357, 0.002966257320079194, 0.0050690506276038138, 0.007559004246038563, 0.013296350700555241, 0.027251049329127801, 0.039423988570899793]
+    Y_predict = []
+    for i in range(len(X_Test)):
+        day = X_Test[i]
+        p = b + sum(parameters[k] * float(day.iloc[feature_row[k], k % 9]) for k in range(27))
+        Y_predict.append(p)
+    def dev_degree(y_true, y_predict):  # evaluation: mean squared error
+        total = 0
+        for i in range(len(y_predict)):
+            total = total + (y_true[i]-y_predict[i])*(y_true[i]-y_predict[i])
+        return total/len(y_predict)
+    print(dev_degree(Y_Test, Y_predict))
+
+def main():
+    test()
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
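A closing note on metrics: `dev_degree` above returns the mean *squared* error, while the Adagrad scripts earlier in this diff print the mean *absolute* error (≈4.97), so the two numbers are not directly comparable; the 45.68 quoted in one of the reports should only be compared against a loss computed the same way. A numpy equivalent of `dev_degree`, for reference:

```python
import numpy as np

def dev_degree(y_true, y_predict):
    """Mean squared error, matching the hand-written loop above."""
    y_true = np.asarray(y_true, dtype=float)
    y_predict = np.asarray(y_predict, dtype=float)
    return float(np.mean((y_true - y_predict) ** 2))
```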