You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
pytorch/浙江行业电量/行业电量_输出为3_步长为10.py

203 lines
7.1 KiB
Python

11 months ago
import numpy as np
import pandas as pd
import torch
from torch import nn
import os
11 months ago
import time
import matplotlib.pyplot as plt
11 months ago
# Record the start time; the elapsed run time is printed at the end of the script.
t1 = time.time()
11 months ago
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort seen when several libraries bundle their own copy — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
# Length of each input window; also used as the LSTM input size further down.
DAYS_FOR_TRAIN = 10
# Fix the PyTorch RNG seed.
torch.manual_seed(42)
class LSTM_Regression(nn.Module):
    """Stacked LSTM whose per-step hidden state is projected by a linear layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        output_size: Number of values produced per time step.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size=1, num_layers=2):
        super().__init__()
        # The attribute names end up as state_dict keys, so they must stay
        # stable for saved checkpoints to load.
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, _x):
        """Map (seq_len, batch, input_size) to (seq_len, batch, output_size)."""
        hidden_seq, _state = self.lstm(_x)
        seq_len, batch, hidden = hidden_seq.shape
        # Merge the two leading axes so the linear layer sees a 2-D matrix,
        # then restore them on the way out.
        flat = hidden_seq.view(seq_len * batch, hidden)
        projected = self.fc(flat)
        return projected.view(seq_len, batch, -1)
def create_dataset(data, days_for_train=5, days_for_predict=3) -> "tuple[np.ndarray, np.ndarray]":
    """Cut a 1-D series into sliding (input window, target window) pairs.

    Window ``i`` takes ``data[i:i + days_for_train]`` as input and the
    ``days_for_predict`` values immediately after it as target.

    Args:
        data: Sliceable 1-D sequence supporting ``len()`` (ndarray, Series, ...).
        days_for_train: Number of values in each input window.
        days_for_predict: Number of values in each target window. Defaults to
            3, the horizon that used to be hard-coded here.

    Returns:
        ``(dataset_x, dataset_y)`` as NumPy arrays with one row per window.
        Both are empty when ``data`` is too short to yield a window.
    """
    # NOTE(review): this count stops one window short of the last complete
    # (input, target) pair. It is kept as-is because existing callers index
    # the result positionally (e.g. "last window"); confirm before changing.
    n_windows = len(data) - days_for_train - days_for_predict
    starts = range(n_windows)
    dataset_x = [data[i:i + days_for_train] for i in starts]
    dataset_y = [
        data[i + days_for_train:i + days_for_train + days_for_predict]
        for i in starts
    ]
    return np.array(dataset_x), np.array(dataset_y)
def normal(nd):
    """Drop outliers from a numeric Series using 1.5 * IQR fences.

    Args:
        nd: Numeric pandas Series.

    Returns:
        The entries of ``nd`` (original index labels kept) that lie within
        ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``. NaN entries are dropped because
        they fail both comparisons.
    """
    # A single quantile call replaces four full ``describe()`` passes.
    q1, q3 = nd.quantile([0.25, 0.75])
    fence = 1.5 * (q3 - q1)
    # Bounds are inclusive: a value sitting on the fence is not an outlier,
    # and a series with IQR == 0 keeps its dominant value instead of being
    # emptied entirely.
    return nd[(nd >= q1 - fence) & (nd <= q3 + fence)]
def data_preprocessing(data, start='2021-01', end='2023-08'):
    """Clean a raw table and return a float frame with outliers blanked.

    Steps, in order: strip whitespace from column names; parse the first 10
    characters of each index value as a ``%Y-%m-%d`` date; sort by date; keep
    rows between ``start`` and ``end``; drop columns in which at least half of
    the remaining rows are 0; cast to float; pass every column through
    ``normal`` so removed values become NaN.

    Args:
        data: DataFrame with string column names and a date-like index.
        start: First period to keep (label for date-index slicing).
        end: Last period to keep, inclusive.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    # Work on a copy so the caller's columns/index/order are left untouched
    # and the later column assignments never target a slice of the original.
    data = data.copy()
    data.columns = data.columns.map(lambda name: name.strip())
    date_text = data.index.map(lambda stamp: str(stamp).strip()[:10])
    data.index = pd.to_datetime(date_text, format='%Y-%m-%d')
    data = data.sort_index().loc[start:end]

    # Drop columns that are zero in at least half of the rows.
    mostly_zero = [
        col for col in data.columns
        if (data[col] == 0).sum() / len(data) >= 0.5
    ]
    data = data.drop(columns=mostly_zero).astype(float)

    for col in data.columns:
        # Assignment aligns on the index, so rows removed by ``normal``
        # come back as NaN.
        data[col] = normal(data[col])
    return data
# Assemble the dataset (one-off step, kept commented out for reference).
# NOTE(review): the commented code below appears to read every workbook under
# file_dir, window each column with create_dataset, stack the windows and
# cache them as df_x_10.csv / df_y_10.csv — the two files loaded afterwards.
# file_dir = './浙江各地市行业电量数据'
# excel = os.listdir(file_dir)[0]
# data = pd.read_excel(os.path.join(file_dir, excel), sheet_name=0, index_col='stat_date')
# data.drop(columns='地市',inplace=True)
# data = data_preprocessing(data)
#
# df = data[data.columns[0]]
# df.dropna(inplace = True)
# dataset_x, dataset_y = create_dataset(df, DAYS_FOR_TRAIN)
#
# for level in data.columns[1:]:
# df = data[level]
# df.dropna(inplace=True)
# x, y = create_dataset(df, DAYS_FOR_TRAIN)
# dataset_x = np.concatenate((dataset_x, x))
# dataset_y = np.concatenate((dataset_y, y))
#
#
# for excel in os.listdir(file_dir)[1:]:
#
# data = pd.read_excel(os.path.join(file_dir,excel), sheet_name=0,index_col='stat_date')
# data.drop(columns='地市', inplace=True)
# data = data_preprocessing(data)
#
# for level in data.columns:
# df = data[level]
# df.dropna(inplace=True)
# x,y = create_dataset(df,DAYS_FOR_TRAIN)
# dataset_x = np.concatenate((dataset_x,x))
# dataset_y = np.concatenate((dataset_y,y))
#
#
# df_x_10 = pd.DataFrame(dataset_x)
# df_y_10 = pd.DataFrame(dataset_y)
# df_x_10.to_csv('df_x_10.csv',index=False)
# df_y_10.to_csv('df_y_10.csv',index=False)
# Load the cached input/target windows as plain NumPy arrays and report their shapes.
dataset_x = pd.read_csv('df_x_10.csv').values
dataset_y = pd.read_csv('df_y_10.csv').values
print(dataset_x.shape,dataset_y.shape)
11 months ago
# # 训练
11 months ago
# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Min-max scaling. The extremes come from the inputs only and the same two
# constants are reused for the targets, so both share one scale.
# NOTE(review): a target outside the input range maps outside [0, 1], and
# max_value == min_value would divide by zero — confirm neither can occur.
max_value = np.max(dataset_x)
min_value = np.min(dataset_x)
dataset_x = (dataset_x - min_value) / (max_value - min_value)
dataset_y = (dataset_y - min_value) / (max_value - min_value)
print('max_value:',max_value,'min_value:',min_value)
# Split into training and test sets: the first 70% of the samples are used for training.
train_size = int(len(dataset_x)*0.7)
train_x = dataset_x[:train_size]
train_y = dataset_y[:train_size]
# Reshape for the RNN, which reads (seq_size, batch_size, feature_size).
# NOTE(review): this puts the sample index on axis 0 and a size-1 axis in the
# batch position — confirm that treating samples as sequence steps is intended.
train_x = train_x.reshape(-1, 1, DAYS_FOR_TRAIN)
10 months ago
# Targets carry 3 values per sample, laid out like the inputs.
train_y = train_y.reshape(-1, 1, 3)
# Convert to float32 PyTorch tensors on the selected device.
train_x = torch.from_numpy(train_x).to(device).type(torch.float32)
train_y = torch.from_numpy(train_y).to(device).type(torch.float32)
10 months ago
# Model used by the (currently commented-out) training loop.
model = LSTM_Regression(DAYS_FOR_TRAIN, 32, output_size=3, num_layers=2).to(device) # build the model: input size, hidden size, output size, layer count
train_loss = []
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
# Best loss seen so far; only read by the commented-out loop below.
min_loss = 1
# Training loop, disabled: the script loads saved weights instead.
# NOTE(review): if re-enabled, `best_para = model.state_dict()` keeps references
# to the live parameters rather than a snapshot — copy it before saving.
# for i in range(500):
# train_x,train_y = train_x.to(device),train_y.to(device)
# out = model(train_x)
# loss = loss_function(out, train_y)
# loss.backward()
# optimizer.step()
# optimizer.zero_grad()
# train_loss.append(loss.item())
#
# if loss <= min_loss:
# min_loss = loss
# best_para = model.state_dict()
# if i % 100 == 0:
# print(f'epoch {i+1}: loss:{loss}')
# # Save / load the model
# torch.save(best_para,'hy3.pth')
# Fresh instance with the same architecture, replacing the one created above.
model = LSTM_Regression(DAYS_FOR_TRAIN, 32, output_size=3, num_layers=2).to(device)
10 months ago
# Load the saved weights from hy3.pth; map_location makes the stored tensors load onto the CPU.
model.load_state_dict(torch.load('hy3.pth',map_location=torch.device('cpu')))
# Testing: switch the model to evaluation mode.
model = model.eval()
10 months ago
# dataset_x = dataset_x.reshape(-1, 1, DAYS_FOR_TRAIN) # (seq_size, batch_size, feature_size)
# dataset_x = torch.from_numpy(dataset_x).to(device).type(torch.float32)
#
# pred_test = model(dataset_x) # 全量训练集
# pred_test = pred_test.view(-1)
# pred_test = np.concatenate((np.zeros(DAYS_FOR_TRAIN), pred_test.cpu().detach().numpy()))
#
# plt.plot(pred_test.reshape(-1), 'r', label='prediction')
# plt.plot(dataset_y.reshape(-1), 'b', label='real')
# plt.plot((train_size*3, train_size*3), (0, 1), 'g--')
# plt.legend(loc='best')
# plt.show()
# model.load_state_dict(torch.load('hy3.pth',map_location=torch.device('cpu')))
# max_value = 354024930.8
# min_value = 0.0
11 months ago
10 months ago
# 测试
# file_dir = './浙江各地市行业电量数据'
10 months ago
# Inference: for every city, forecast 3 values per industry column and write
# the history plus forecast to one sheet of the output workbook.
# NOTE(review): loop nesting was reconstructed from a whitespace-stripped
# copy of this script — confirm it matches the intended structure.
df = pd.read_excel(r'C:\Users\鸽子\Desktop\浙江电量20231129.xlsx', sheet_name=2)
for city in df['city_name'].drop_duplicates():
    df_city = df[df['city_name'] == city].sort_values(by='stat_date').set_index('stat_date')
    # df_city.index = df_city.index.map(lambda x:str(x).strip()[:10])
    # df_city.index = pd.to_datetime(df_city.index)
    # df_city = df_city.loc['2023-9'][:-3]
    result_dict = {}
    # The first column is skipped; the remaining ones are treated as industry series.
    for industry in df_city.columns[1:]:
        df_city[industry] = df_city[industry].astype('float')
        # Use the same window length the model was built with instead of a literal 10.
        x, _ = create_dataset(df_city[industry], DAYS_FOR_TRAIN)
        # Scale with the constants computed from the training inputs.
        x = (x - min_value) / (max_value - min_value)
        x = x.reshape(-1, 1, DAYS_FOR_TRAIN)
        x = torch.from_numpy(x).type(torch.float32).to(device)
        # No gradients are needed for inference, so skip building the autograd graph.
        with torch.no_grad():
            pred = model(x).view(-1)
        # Undo the min-max scaling.
        pred = pred * (max_value - min_value) + min_value
        # Keep the final 3 values, i.e. the output for the last input window.
        result = pred.cpu().detach().numpy()[-3:]
        result_dict[industry] = list(result)
    # NOTE(review): the forecast dates and the 27-row history cut-off are
    # hard-coded for this particular input file.
    df1 = pd.DataFrame(result_dict, index=['2023-11-28', '2023-11-29', '2023-11-30'])
    df1['city_name'] = city
    # Restore the original column order before stacking under the history.
    df1 = df1[df_city.columns]
    df1 = pd.concat((df_city.iloc[:27], df1))
    print(df_city)
    print(df1)
    # Append to the existing workbook, replacing this city's sheet if present.
    with pd.ExcelWriter(r'C:\Users\鸽子\Desktop\行业电量预测v1129.xlsx', mode='a', engine='openpyxl', if_sheet_exists='replace') as writer:
        df1.to_excel(writer, sheet_name=f'{city[4:6]}')
print(time.time()-t1)
# Only the dictionary of the last processed city is still bound here.
print(result_dict)
11 months ago
11 months ago