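# NOTE: the lines below are repository web-UI residue captured together with the file;
# they are not part of the program.
# You cannot select more than 25 topics. Topics must start with a letter or number,
# can include dashes ('-') and can be up to 35 characters long.
#
# 165 lines
# 6.3 KiB
# Python
#
# 11 months ago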
import numpy as np
import pandas as pd
import torch
from torch import nn
from multiprocessing import Pool
import matplotlib.pyplot as plt
import os
# Sets KMP_DUPLICATE_LIB_OK for the process — presumably a workaround for the
# "duplicate OpenMP runtime" abort seen when several libraries bundle their own copy;
# TODO confirm it is still needed on the target machine.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
# Length (in days) of each input window; run() also passes it as the LSTM input_size.
DAYS_FOR_TRAIN = 10
# Seed torch's RNG once at import time.
torch.manual_seed(42)
class LSTM_Regression(nn.Module):
    """Stacked LSTM with a linear read-out applied to every time step.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the LSTM hidden state.
        output_size: number of values produced per time step.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size=1, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, _x):
        """Map ``_x`` of shape (seq_len, batch, input_size) to (seq_len, batch, output_size)."""
        hidden_seq, _state = self.lstm(_x)  # (seq_len, batch, hidden_size)
        seq_len, batch, hidden = hidden_seq.shape
        # Flatten time and batch so the linear layer sees one row per step.
        flat = hidden_seq.view(seq_len * batch, hidden)
        projected = self.fc(flat)
        # Restore the (seq_len, batch, output_size) layout.
        return projected.view(seq_len, batch, -1)
def create_dataset(data, days_for_train=5, days_for_predict=3) -> tuple:
    """Slice a 1-D series into sliding (input window, target window) pairs.

    Window ``i`` uses ``data[i:i + days_for_train]`` as input and the
    ``days_for_predict`` values that immediately follow it as target.

    Args:
        data: 1-D sequence (NumPy array or anything sliceable with ``len``).
        days_for_train: number of consecutive values in each input window.
        days_for_predict: number of consecutive values in each target window.

    Returns:
        ``(dataset_x, dataset_y)`` as NumPy arrays with one row per window.
        Both are empty when ``data`` is too short for a single window.
    """
    # The last valid start index is len(data) - days_for_train - days_for_predict,
    # hence the "+ 1": the final window's target is exactly data[-days_for_predict:].
    n_windows = len(data) - days_for_train - days_for_predict + 1
    dataset_x, dataset_y = [], []
    for start in range(max(n_windows, 0)):
        split = start + days_for_train
        dataset_x.append(data[start:split])
        dataset_y.append(data[split:split + days_for_predict])
    return (np.array(dataset_x), np.array(dataset_y))
def normal(nd):
    """Drop outliers from a numeric Series using the 1.5 * IQR rule.

    Values outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are removed. The
    bounds are inclusive, so a series whose IQR is zero (for example a
    constant series) keeps its values instead of being emptied.

    Args:
        nd: numeric pandas Series.

    Returns:
        The Series restricted to the values inside the fences (NaN is dropped
        because it compares False against both bounds).
    """
    q1 = nd.quantile(0.25)
    q3 = nd.quantile(0.75)
    iqr = q3 - q1
    low = q1 - 1.5 * iqr
    high = q3 + 1.5 * iqr
    return nd[(nd >= low) & (nd <= high)]
_FORECAST_DAYS = 3  # days predicted per window; matches the 3-value targets built by create_dataset


def _load_industry_data(file_dir, excel):
    """Read the workbook's first sheet and return the cleaned daily table (one column per industry)."""
    data = pd.read_excel(os.path.join(file_dir, excel), sheet_name=0, index_col=' stat_date ')
    # Column headers are stripped of surrounding whitespace (the index header itself is ' stat_date ').
    data.columns = data.columns.map(lambda x: x.strip())
    data.index = pd.to_datetime(data.index, format='%Y%m%d')
    data.sort_index(inplace=True)
    print(data.head())
    data = data.loc['2021-01':'2023-09']
    # Drop columns where at least half of the rows are zero. A non-inplace drop is used
    # because `data` is a slice of the original frame at this point.
    mostly_zero = [col for col in data.columns if (data[col] == 0).sum() / len(data) >= 0.5]
    return data.drop(columns=mostly_zero)


def _forecast_industry(series, industry, device):
    """Train an LSTM on one industry's series and score its forecast of the last 3 days.

    Returns the per-day result frame (pred, real, loss_rate, industry), or
    None when the series is too short or constant and cannot be modelled.
    """
    series = series[series.values != 0]  # drop zero-valued days
    series = normal(series)  # drop outliers
    values = series.astype('float32').values

    if len(values) < DAYS_FOR_TRAIN + _FORECAST_DAYS:
        print(f'skip {industry}: only {len(values)} usable points')
        return None

    # Min-max scale to 0~1.
    max_value = np.max(values)
    min_value = np.min(values)
    value_range = max_value - min_value
    if value_range == 0:
        print(f'skip {industry}: constant series cannot be normalised')
        return None
    scaled = (values - min_value) / value_range

    dataset_x, dataset_y = create_dataset(scaled, DAYS_FOR_TRAIN)
    print('len(dataset_x:)', len(dataset_x))

    # Hold out the last 3 windows so the evaluated days never appear as training targets.
    train_size = len(dataset_x) - 3
    if train_size < 1:
        print(f'skip {industry}: not enough windows to train on')
        return None

    # The LSTM expects (seq_size, batch_size, feature_size): every window becomes
    # one step of a single batch-1 sequence, with the window values as features.
    train_x = dataset_x[:train_size].reshape(-1, 1, DAYS_FOR_TRAIN)
    train_y = dataset_y[:train_size].reshape(-1, 1, _FORECAST_DAYS)
    train_x = torch.from_numpy(train_x).to(device)
    train_y = torch.from_numpy(train_y).to(device)

    model = LSTM_Regression(DAYS_FOR_TRAIN, 32, output_size=3, num_layers=2).to(device)
    loss_function = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

    for _ in range(2500):
        optimizer.zero_grad()
        out = model(train_x)
        loss = loss_function(out, train_y)
        loss.backward()
        optimizer.step()
    print(loss)  # final training loss

    # Forecast the last 3 days from the DAYS_FOR_TRAIN days that immediately precede them.
    # (The window must end exactly where the evaluated days start, otherwise the
    # prediction is scored against days shifted by one.)
    model = model.eval()
    with torch.no_grad():
        x = torch.from_numpy(scaled[-(DAYS_FOR_TRAIN + _FORECAST_DAYS):-_FORECAST_DAYS]).to(device)
        pred = model(x.reshape(-1, 1, DAYS_FOR_TRAIN)).view(-1).cpu().numpy()

    # Undo the min-max scaling; the real values are taken from the unscaled series.
    pred = pred * value_range + min_value
    real = values[-_FORECAST_DAYS:]
    print(pred)

    # Mean absolute error relative to the mean of the real values.
    print(abs(pred - real).mean() / real.mean())

    result_eight = pd.DataFrame({'pred': np.round(pred, 1), 'real': real})
    # Summed forecast error relative to the total of the last 31 observations.
    target = (result_eight['pred'].sum() - result_eight['real'].sum()) / values[-31:].sum()
    result_eight['loss_rate'] = round(target, 5)
    result_eight['industry'] = industry
    print(target)
    print(result_eight)
    return result_eight


def run(file_dir, excel):
    """Forecast the last 3 days of every industry column in one workbook.

    For each industry column an LSTM is trained on the scaled series with the
    last windows held out; the final 3 days are then predicted and compared
    with the real values. Per-industry results and the combined table are
    printed; nothing is written to disk.

    Args:
        file_dir: directory containing the workbook.
        excel: workbook file name inside ``file_dir``.

    Returns:
        None.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data = _load_industry_data(file_dir, excel)
    print('len(data):', len(data))

    list_app = []
    for industry in data.columns:
        result = _forecast_industry(data[industry], industry, device)
        if result is not None:
            list_app.append(result)

    # pd.concat raises on an empty list, so report and stop instead.
    if not list_app:
        print('no industry had enough data to forecast')
        return
    final_df = pd.concat(list_app, ignore_index=True)
    print(final_df)
if __name__ == '__main__':
    # Machine-specific data folder; presumably one workbook per city — verify before running elsewhere.
    file_dir = r'C:\Users\user\PycharmProjects\pytorch2\浙江行业电量\浙江所有地市133行业数据'
    run(file_dir, '丽水133行业数据.xlsx')
    # Disabled batch mode: fan every workbook in file_dir out over a process pool.
    # p = Pool(4)
    # for excel in os.listdir(file_dir):
    #     p.apply_async(func=run, args=(file_dir, excel))
    # p.close()
    # p.join()