更新入模数据

main
鸽子 11 months ago
parent 544ac6add4
commit cb599702a1

@ -1,120 +0,0 @@
import xgboost as xgb
import pandas as pd
import os
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['font.sans-serif']=['kaiti']
pd.set_option('display.width',None)
def hf_season(x, year=2021):
    """Return the months of ``year`` whose mean reaches the series' upper quartile.

    Args:
        x: Numeric pandas Series. It is sliced with partial date strings
            (``x.loc['<year>-<month>']``), so it is expected to carry a
            DatetimeIndex that contains every month of ``year``.
        year: Calendar year whose months are inspected. Defaults to 2021,
            the year that was previously hard-coded.

    Returns:
        list[int]: Month numbers (1-12), ascending, whose monthly mean is
        greater than or equal to the 75th percentile of the whole series.
    """
    # The threshold does not depend on the month, so compute it once rather
    # than calling describe() on every iteration.
    upper_quartile = x.describe()['75%']
    return [
        month_no
        for month_no in range(1, 13)
        if x.loc[f'{year}-{month_no}'].mean() >= upper_quartile
    ]
def season(x):
    """Return 1 when the date's month is June-August or December-February, else 0.

    The month is read from characters 5-6 of ``str(x)``, so ``x`` must render
    as a ``YYYY-MM...`` string (for example an ISO date string or a timestamp).
    """
    month_code = str(x)[5:7]
    flagged_months = {'01', '02', '06', '07', '08', '12'}
    return int(month_code in flagged_months)
def month(x):
    """Return 1 when the date's month is Aug-Oct, Dec, Jan or Feb, else 0.

    The month is read from characters 5-6 of ``str(x)``, so ``x`` must render
    as a ``YYYY-MM...`` string (for example an ISO date string or a timestamp).
    """
    month_code = str(x)[5:7]
    flagged_months = {'01', '02', '08', '09', '10', '12'}
    return int(month_code in flagged_months)
def normal(nd):
    """Drop outliers from a numeric Series using the 1.5 * IQR rule.

    Args:
        nd: Numeric pandas Series.

    Returns:
        The elements of ``nd`` (original index preserved) that lie strictly
        between ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``.
    """
    # describe() scans the whole series; call it once instead of four times.
    summary = nd.describe()
    q1 = summary['25%']
    q3 = summary['75%']
    iqr = q3 - q1
    high = q3 + 1.5 * iqr
    low = q1 - 1.5 * iqr
    return nd[(nd < high) & (nd > low)]
# Load the modelling data, indexed by the 'dtdate' column parsed as dates.
data = pd.read_excel(r'C:\python-project\pytorch3\入模数据\杭州数据.xlsx',index_col='dtdate')
data.index = pd.to_datetime(data.index,format='%Y-%m-%d')
# Keep only the rows whose sales volume survives the IQR filter in normal().
data = data.loc[normal(data['售电量']).index]

print(data.loc['2022-9', '售电量'])

# Plot July 2022 followed by July 2023 on one shared positional x-axis.
july_2022 = data.loc['2022-7', '售电量']
july_2023 = data.loc['2023-7', '售电量']
plt.plot(range(len(july_2022)), july_2022)
plt.plot(range(len(july_2022), len(july_2022) + len(july_2023)), july_2023)
plt.show()

# Calendar features.
# The month used to be taken from character 6 of the 'YYYY-MM-DD' string,
# i.e. only the LAST digit of the month, which mapped Oct/Nov/Dec to 0/1/2.
# Read the real month number from the DatetimeIndex instead.
data['month'] = data.index.month.astype('int')
data['season'] = data.index.map(season)
print(data.head(50))

# July 2023 is held out for evaluation; Jan 2021 - Jun 2023 is used for fitting.
df_eval = data.loc['2023-7']
df_train = data.loc['2021-1':'2023-6']
print(len(df_eval),len(df_train),len(data))
print(data.drop(columns='city_name').corr(method='pearson')['售电量'])
df_train = df_train[['tem_max','tem_min','24ST','rh','rh_max','prs','prs_max','prs_min','售电量','month','holiday','season']]

# The same feature columns are used for fitting and for evaluation.
feature_cols = ['tem_max','tem_min','24ST','holiday','season']
X = df_train[feature_cols]
X_eval = df_eval[feature_cols]
y = df_train['售电量']
print(y.describe())

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=42)
model = xgb.XGBRegressor(max_depth=6, learning_rate=0.05, n_estimators=150)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
result_test = pd.DataFrame({'test':y_test,'pred':y_pred},index=y_test.index)

# Metrics: mean absolute error on the test split relative to the mean actual value.
print(abs(y_test - y_pred).mean() / y_test.mean())

eval_pred = model.predict(X_eval)
result_eval = pd.DataFrame({'eval':df_eval['售电量'],'pred':eval_pred},index=df_eval['售电量'].index)
# Relative deviation of the predicted total from the actual total over the evaluation month.
print((result_eval['eval'].sum()-result_eval['pred'].sum())/result_eval['eval'].sum())
# Deviation accumulated over the last 3 rows, relative to the whole-month actual total.
goal = (result_eval['eval'][-3:].sum()-result_eval['pred'][-3:].sum())/result_eval['eval'].sum()
print('goal:',goal)
# Same measure over the last 23 rows.
goal2 = (result_eval['eval'][-23:].sum()-result_eval['pred'][-23:].sum())/result_eval['eval'].sum()
print('goal2:',goal2)
print(result_eval)
print('r2:',r2_score(y_test,y_pred))

# NOTE: a random-seed search loop, CSV/text export of the results and model
# persistence (save_model / load_model) existed here only as commented-out
# code and have been removed; recover them from version control if needed.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

@ -1,32 +0,0 @@
dtdate,eval,pred
2023-08-01,4781.72,4638.4565
2023-08-02,5264.18,4224.5635
2023-08-03,5308.39,5036.8037
2023-08-04,5531.27,5441.4995
2023-08-05,5989.36,6265.5225
2023-08-06,6373.8,5753.8174
2023-08-07,5688.18,4972.277
2023-08-08,5287.83,4424.5815
2023-08-09,5560.11,4310.7837
2023-08-10,5706.55,4657.085
2023-08-11,5923.97,5702.3916
2023-08-12,6238.88,5897.044
2023-08-13,5961.14,4939.6694
2023-08-14,5316.45,3566.0615
2023-08-15,4802.99,3005.286
2023-08-16,4908.05,3805.3303
2023-08-17,4792.48,3044.6094
2023-08-18,4380.25,3086.1318
2023-08-19,4490.53,4237.0283
2023-08-20,4577.54,3911.61
2023-08-21,4784.33,4044.5312
2023-08-22,4517.86,3943.1465
2023-08-23,4327.74,4588.3257
2023-08-24,4736.04,4383.0825
2023-08-25,4981.34,4765.4146
2023-08-26,4967.04,4744.9272
2023-08-27,5044.84,4771.1987
2023-08-28,4919.99,4644.142
2023-08-29,3611.24,3359.0356
2023-08-30,3184.04,3217.3503
2023-08-31,3026.0,3217.8718
1 dtdate eval pred
2 2023-08-01 4781.72 4638.4565
3 2023-08-02 5264.18 4224.5635
4 2023-08-03 5308.39 5036.8037
5 2023-08-04 5531.27 5441.4995
6 2023-08-05 5989.36 6265.5225
7 2023-08-06 6373.8 5753.8174
8 2023-08-07 5688.18 4972.277
9 2023-08-08 5287.83 4424.5815
10 2023-08-09 5560.11 4310.7837
11 2023-08-10 5706.55 4657.085
12 2023-08-11 5923.97 5702.3916
13 2023-08-12 6238.88 5897.044
14 2023-08-13 5961.14 4939.6694
15 2023-08-14 5316.45 3566.0615
16 2023-08-15 4802.99 3005.286
17 2023-08-16 4908.05 3805.3303
18 2023-08-17 4792.48 3044.6094
19 2023-08-18 4380.25 3086.1318
20 2023-08-19 4490.53 4237.0283
21 2023-08-20 4577.54 3911.61
22 2023-08-21 4784.33 4044.5312
23 2023-08-22 4517.86 3943.1465
24 2023-08-23 4327.74 4588.3257
25 2023-08-24 4736.04 4383.0825
26 2023-08-25 4981.34 4765.4146
27 2023-08-26 4967.04 4744.9272
28 2023-08-27 5044.84 4771.1987
29 2023-08-28 4919.99 4644.142
30 2023-08-29 3611.24 3359.0356
31 2023-08-30 3184.04 3217.3503
32 2023-08-31 3026.0 3217.8718

Binary file not shown.

Binary file not shown.

@ -12,10 +12,6 @@ mpl.rcParams['font.sans-serif']=['kaiti']
pd.set_option('display.width',None)
def season(x):
if str(x)[5:7] in ['07', '08']:
return 2

@ -17,8 +17,8 @@ class LSTM_Regression(nn.Module):
self.fc = nn.Linear(hidden_size, output_size)
def forward(self, _x):
x, _ = self.lstm(_x) # _x is input, size (seq_len, batch, input_size) 一批多少条样本 多少批样本 每一个样本的输入特征大小10
s, b, h = x.shape # x is output, size (seq_len, batch, hidden_size) 经过lstm计算后输出为隐藏层大小
x, _ = self.lstm(_x) # _x is input, size (seq_len, batch, input_size)
s, b, h = x.shape # x is output, size (seq_len, batch, hidden_size)
x = x.view(s * b, h)
x = self.fc(x)
x = x.view(s, b, -1) # 把形状改回来
@ -52,133 +52,110 @@ def data_preprocessing(data):
return data
if __name__ == '__main__':
# 拼接数据集
file_dir = r'C:\Users\user\Desktop\浙江各地市分电压日电量数据'
excel = os.listdir(file_dir)[0]
# 拼接数据集
file_dir = r'C:\Users\鸽子\Desktop\浙江各地市分电压日电量数据'
excel = os.listdir(file_dir)[0]
data = pd.read_excel(os.path.join(file_dir, excel), sheet_name=0, index_col=' stat_date ')
data = pd.read_excel(os.path.join(file_dir, excel), sheet_name=0, index_col=' stat_date ')
data = data_preprocessing(data)
df = data[data.columns[0]]
df.dropna(inplace = True)
dataset_x, dataset_y = create_dataset(df, DAYS_FOR_TRAIN)
for level in data.columns[1:]:
df = data[level]
df.dropna(inplace=True)
x, y = create_dataset(df, DAYS_FOR_TRAIN)
dataset_x = np.concatenate((dataset_x, x))
dataset_y = np.concatenate((dataset_y, y))
for excel in os.listdir(file_dir)[1:]:
data = pd.read_excel(os.path.join(file_dir,excel), sheet_name=0,index_col=' stat_date ')
data = data_preprocessing(data)
for level in data.columns:
df = data[level]
df.dropna(inplace=True)
x,y = create_dataset(df,DAYS_FOR_TRAIN)
dataset_x = np.concatenate((dataset_x,x))
dataset_y = np.concatenate((dataset_y,y))
print(dataset_x,dataset_y,dataset_x.shape,dataset_y.shape)
# 训练
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# 标准化到0~1
max_value = np.max(dataset_x)
min_value = np.min(dataset_x)
dataset_x = (dataset_x - min_value) / (max_value - min_value)
dataset_y = (dataset_y - min_value) / (max_value - min_value)
# 划分训练集和测试集
train_size = int(len(dataset_x)*0.7)
train_x = dataset_x[:train_size]
train_y = dataset_y[:train_size]
# 将数据改变形状RNN 读入的数据维度是 (seq_size, batch_size, feature_size)
train_x = train_x.reshape(-1, 1, DAYS_FOR_TRAIN)
train_y = train_y.reshape(-1, 1, 5)
# 转为pytorch的tensor对象
train_x = torch.from_numpy(train_x).to(device).type(torch.float32)
train_y = torch.from_numpy(train_y).to(device).type(torch.float32)
model = LSTM_Regression(DAYS_FOR_TRAIN, 32, output_size=5, num_layers=2).to(device) # 导入模型并设置模型的参数输入输出层、隐藏层等
train_loss = []
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
# for i in range(1500):
# out = model(train_x)
# loss = loss_function(out, train_y)
# loss.backward()
# optimizer.step()
# optimizer.zero_grad()
# train_loss.append(loss.item())
# # print(loss)
# # 保存模型
# torch.save(model.state_dict(),'dy5.pth')
model.load_state_dict(torch.load('dy5.pth'))
# for test
model = model.eval() # 转换成测试模式
# model.load_state_dict(torch.load(os.path.join(model_save_dir,model_file))) # 读取参数
dataset_x = dataset_x.reshape(-1, 1, DAYS_FOR_TRAIN) # (seq_size, batch_size, feature_size)
dataset_x = torch.from_numpy(dataset_x).to(device).type(torch.float32)
pred_test = model(dataset_x) # 全量训练集
# 模型输出 (seq_size, batch_size, output_size)
pred_test = pred_test.view(-1)
pred_test = np.concatenate((np.zeros(DAYS_FOR_TRAIN), pred_test.cpu().detach().numpy()))
# plt.plot(pred_test.reshape(-1), 'r', label='prediction')
# plt.plot(dataset_y.reshape(-1), 'b', label='real')
# plt.plot((train_size*5, train_size*5), (0, 1), 'g--') # 分割线 左边是训练数据 右边是测试数据的输出
# plt.legend(loc='best')
# plt.show()
data = data_preprocessing(data)
df = data[data.columns[0]]
df.dropna(inplace = True)
dataset_x, dataset_y = create_dataset(df, DAYS_FOR_TRAIN)
for level in data.columns[1:]:
df = data[level]
df.dropna(inplace=True)
x, y = create_dataset(df, DAYS_FOR_TRAIN)
dataset_x = np.concatenate((dataset_x, x))
dataset_y = np.concatenate((dataset_y, y))
for excel in os.listdir(file_dir)[1:]:
data = pd.read_excel(os.path.join(file_dir,excel), sheet_name=0,index_col=' stat_date ')
data = data_preprocessing(data)
for level in data.columns:
df = data[level]
df.dropna(inplace=True)
x,y = create_dataset(df,DAYS_FOR_TRAIN)
dataset_x = np.concatenate((dataset_x,x))
dataset_y = np.concatenate((dataset_y,y))
print(dataset_x,dataset_y,dataset_x.shape,dataset_y.shape)
# 训练
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# 标准化到0~1
max_value = np.max(dataset_x)
min_value = np.min(dataset_x)
dataset_x = (dataset_x - min_value) / (max_value - min_value)
dataset_y = (dataset_y - min_value) / (max_value - min_value)
# 划分训练集和测试集
train_size = len(dataset_x)*0.7
train_x = dataset_x[:train_size]
train_y = dataset_y[:train_size]
# 将数据改变形状RNN 读入的数据维度是 (seq_size, batch_size, feature_size)
train_x = train_x.reshape(-1, 1, DAYS_FOR_TRAIN)
train_y = train_y.reshape(-1, 1, 5)
# 转为pytorch的tensor对象
train_x = torch.from_numpy(train_x).to(device)
train_y = torch.from_numpy(train_y).to(device)
model = LSTM_Regression(DAYS_FOR_TRAIN, 32, output_size=3, num_layers=2).to(device) # 导入模型并设置模型的参数输入输出层、隐藏层等
train_loss = []
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
for i in range(1500):
out = model(train_x)
loss = loss_function(out, train_y)
loss.backward()
optimizer.step()
optimizer.zero_grad()
train_loss.append(loss.item())
# print(loss)
# 保存模型
torch.save(model.state_dict(),'dy5.pth')
# for test
model = model.eval() # 转换成测试模式
# model.load_state_dict(torch.load(os.path.join(model_save_dir,model_file))) # 读取参数
dataset_x = dataset_x.reshape(-1, 1, DAYS_FOR_TRAIN) # (seq_size, batch_size, feature_size)
dataset_x = torch.from_numpy(dataset_x).to(device)
pred_test = model(dataset_x) # 全量训练集
# 模型输出 (seq_size, batch_size, output_size)
pred_test = pred_test.view(-1)
pred_test = np.concatenate((np.zeros(DAYS_FOR_TRAIN), pred_test.cpu().detach().numpy()))
plt.plot(pred_test, 'r', label='prediction')
plt.plot(df, 'b', label='real')
plt.plot((train_size, train_size), (0, 1), 'g--') # 分割线 左边是训练数据 右边是测试数据的输出
plt.legend(loc='best')
plt.show()
# 创建测试集
# result_list = []
# 以x为基础实际数据滚动预测未来3天
df_eval = pd.read_excel(r'C:\Users\user\Desktop\浙江各地市分电压日电量数据\杭州.xlsx',index_col=' stat_date ')
df_eval.columns = df_eval.columns.map(lambda x:x.strip())
df_eval.index = pd.to_datetime(df_eval.index)
x,y = create_dataset(df_eval.loc['2023-7']['10kv以下'],10)
x = (x - min_value) / (max_value - min_value)
x = x.reshape(-1,1,10)
x = torch.from_numpy(x).type(torch.float32).to(device)
pred = model(x)
# x = torch.from_numpy(df[-14:-4]).to(device)
# pred = model(x.reshape(-1,1,DAYS_FOR_TRAIN)).view(-1).detach().numpy()
# 反归一化
pred = pred * (max_value - min_value) + min_value
# pred = pred * (max_value - min_value) + min_value
# df = df * (max_value - min_value) + min_value
print(pred,y)
df = pd.DataFrame({'real':y.reshape(-1),'pred':pred.view(-1).cpu().detach().numpy()})
df.to_csv('7月预测.csv',encoding='gbk')
# 打印指标
# print(pred)
# # 打印指标
# print(abs(pred - df[-3:]).mean() / df[-3:].mean())
# result_eight = pd.DataFrame({'pred': np.round(pred,1),'real': df[-3:]})
# target = (result_eight['pred'].sum() - result_eight['real'].sum()) / df[-31:].sum()

Loading…
Cancel
Save