pytorch/浙江行业电量/行业电量_输出为3_27步长.py


import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch import nn
import copy
import os
from torch.utils.data import TensorDataset, DataLoader
import datetime
torch.manual_seed(42)
# Work around "OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized."
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
pd.set_option('display.width', None)
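# This script trains an LSTM on Zhejiang per-city, per-industry daily electricity data:
# each sample is a 27-value daily window of one month and the target is that month's last 3 days.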
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=3):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.fc1 = nn.Linear(hidden_size, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout()  # defined but not used in forward

    def forward(self, x):
        # x: (seq_len, batch_size, input_size), the default nn.LSTM layout (batch_first=False)
        output, _ = self.lstm(x)
        s, b, h = output.shape
        output = output.reshape(-1, h)  # flatten time and batch dims for the FC head
        output = self.relu(self.fc1(output))
        output = self.relu(self.fc2(output))
        output = self.fc3(output)
        return output.reshape(s, b, -1)  # restore (seq_len, batch_size, output_size) to match the targets
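# Shape sanity check (a sketch; the dummy tensor below is illustrative only):
# an input of shape (seq_len=5, batch=1, features=27) should come back as (5, 1, 3).
# _dummy = torch.zeros(5, 1, 27)
# assert LSTM(27, 16, output_size=3)(_dummy).shape == (5, 1, 3)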
def normal(df):
    # Drop columns where at least half of the values are zero.
    drop_col = [x for x in df.columns if len(df[df[x] == 0]) / len(df) >= 0.5]
    df.drop(columns=drop_col, inplace=True)
    # Replace outliers (1.5 * IQR rule) with NaN, then forward/backward fill.
    for col in df.columns:
        try:
            q75, q25 = df[col].describe()['75%'], df[col].describe()['25%']
            high = q75 + 1.5 * (q75 - q25)
            low = q25 - 1.5 * (q75 - q25)
            df[col] = df[col].map(lambda x: np.nan if (x >= high) | (x <= low) else x)
            df[col] = df[col].ffill()
            df[col] = df[col].bfill()
        except Exception:
            # Non-numeric columns (e.g. stat_date, 地市) are skipped.
            pass
    return df
def create_data(df_industry, industry):
    dataset_x = []
    dataset_y = []
    # Group by month.
    grouped = df_industry.groupby(df_industry['stat_date'].dt.to_period('M'))
    # Build one sample per month: 27 daily values as input, the last 3 days as the target.
    for name, group in grouped:
        if len(group) == 31:
            dataset_x.append(list(group[industry].values[1:28]))
            dataset_y.append(list(group[industry].values[-3:]))
        elif len(group) == 30:
            dataset_x.append(list(group[industry].values[:27]))
            dataset_y.append(list(group[industry].values[-3:]))
        elif len(group) == 28:
            # Pad short months by repeating the first day so the input length stays 27.
            fst = group[industry].values[0]
            dataset_x.append([fst, fst, fst] + list(group[industry].values[1:25]))
            dataset_y.append(list(group[industry].values[-3:]))
        else:
            # 29-day months: pad with two copies of the first day.
            fst = group[industry].values[0]
            if len([fst, fst] + list(group[industry].values[1:26])) != 27:
                break
            dataset_x.append([fst, fst] + list(group[industry].values[1:26]))
            dataset_y.append(list(group[industry].values[-3:]))
    return np.array(dataset_x), np.array(dataset_y)
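# Example of the windowing above: for a 30-day month the input is days 1-27 and the target
# is days 28-30; a 31-day month uses days 2-28 as input; 28/29-day months are front-padded
# with copies of day 1 so every input window has exactly 27 values.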
# Build the dataset.
file_dir = './浙江各地市行业电量数据'
city1 = os.listdir(file_dir)[0]
df_city = pd.read_excel(os.path.join(file_dir, city1))
df_city = normal(df_city)
df_city = df_city.drop(columns='地市')
df_city[df_city.columns[1:]] /= 10000
df_city['stat_date'] = df_city['stat_date'].map(lambda x: str(x).strip()[:10])
df_city.stat_date = pd.to_datetime(df_city.stat_date)
industry = '全社会用电总计'
df_industry = df_city[['stat_date', industry]]
dataset_x, dataset_y = create_data(df_industry, industry)
# Append the remaining industries of the first city.
for industry in df_city.columns[2:]:
    df_level = df_city[['stat_date', industry]]
    x, y = create_data(df_level, industry)
    dataset_x = np.concatenate([dataset_x, x])
    dataset_y = np.concatenate([dataset_y, y])
# Append every industry of the remaining cities.
for excel in os.listdir(file_dir)[1:]:
    df_city = pd.read_excel(os.path.join(file_dir, excel)).drop(columns='地市')
    df_city = normal(df_city)
    df_city[df_city.columns[1:]] /= 10000
    df_city['stat_date'] = df_city['stat_date'].map(lambda x: str(x).strip()[:10])
    df_city.stat_date = pd.to_datetime(df_city.stat_date)
    for industry in df_city.columns[1:]:
        df_level = df_city[['stat_date', industry]]
        x, y = create_data(df_level, industry)
        dataset_x = np.concatenate([dataset_x, x])
        dataset_y = np.concatenate([dataset_y, y])
print(dataset_x.shape, dataset_y.shape)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Min-max normalise to the 0~1 range.
max_value = np.max(dataset_x)
min_value = np.min(dataset_x)
dataset_x = (dataset_x - min_value) / (max_value - min_value)
dataset_y = (dataset_y - min_value) / (max_value - min_value)
print(max_value, min_value)
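# The scaling can be undone later with: original = scaled * (max_value - min_value) + min_value.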
# Split into training and test sets.
train_size = int(len(dataset_x) * 0.8)
train_x = dataset_x[:train_size]
train_y = dataset_y[:train_size]
eval_x = dataset_x[train_size:]
eval_y = dataset_y[train_size:]
# Reshape: the RNN reads data of shape (seq_len, batch_size, feature_size).
train_x = train_x.reshape(-1, 1, 27)
train_y = train_y.reshape(-1, 1, 3)
eval_x = eval_x.reshape(-1, 1, 27)
eval_y = eval_y.reshape(-1, 1, 3)
# Convert to PyTorch tensors.
train_x = torch.from_numpy(train_x).to(device).type(torch.float32)
train_y = torch.from_numpy(train_y).to(device).type(torch.float32)
eval_x = torch.from_numpy(eval_x).to(device).type(torch.float32)
eval_y = torch.from_numpy(eval_y).to(device).type(torch.float32)
train_ds = TensorDataset(train_x, train_y)
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, drop_last=True)
eval_ds = TensorDataset(eval_x, eval_y)
eval_dl = DataLoader(eval_ds, batch_size=64, drop_last=True)
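# Note: train_x is laid out as (num_windows, 1, 27), so a DataLoader batch of 32 windows
# arrives as (32, 1, 27); with the default batch_first=False the LSTM reads this as
# seq_len=32, batch=1, i.e. every monthly window is fed as a single 27-feature step.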
model = LSTM(27, 16, output_size=3, num_layers=3).to(device)  # build the model: 27 input features, hidden size 16, 3 outputs, 3 stacked LSTM layers
train_loss = []
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
min_loss = 1
best_model_weight = copy.deepcopy(model.state_dict())
epochs = 10
for i in range(epochs):
    model.train()
    for j, (x, y) in enumerate(train_dl):
        x, y = x.to(device), y.to(device)
        out = model(x)
        loss = loss_function(out, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_loss.append(loss.item())
        if (j + 1) % 10 == 0:
            print(f'epoch {i + 1}/{epochs} step {j + 1}/{len(train_dl)} loss:{loss.item()}')
    # Evaluate on the held-out set after every epoch and keep the best weights.
    test_running_loss = 0
    model.eval()
    with torch.no_grad():
        for x, y in eval_dl:
            pred = model(x)
            loss = loss_function(pred, y)
            test_running_loss += loss.item()
    test_loss = test_running_loss / len(eval_dl)
    if test_loss < min_loss:
        min_loss = test_loss
        # Snapshot the weights; without deepcopy the saved state keeps changing as training continues.
        best_model_weight = copy.deepcopy(model.state_dict())
    print(f'epoch {i + 1} test_loss:{test_loss}')
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters in the LSTM model: {total_params}")
# Save the best weights.
torch.save(best_model_weight, 'dy3.pth')
# Reload the model from the saved weights.
model = LSTM(27, 16, output_size=3, num_layers=3).to(device)
model.load_state_dict(torch.load('dy3.pth'))
# Predict on the full dataset for visualisation.
model.eval()
dataset_x = dataset_x.reshape(-1, 1, 27)  # (seq_len, batch_size, feature_size)
dataset_x = torch.from_numpy(dataset_x).to(device).type(torch.float32)
with torch.no_grad():
    pred_test = model(dataset_x)  # full dataset
# Model output has shape (seq_len, batch_size, output_size).
pred_test = pred_test.view(-1).cpu().detach().numpy()
plt.plot(pred_test.reshape(-1), 'r', label='prediction')
plt.plot(dataset_y.reshape(-1), 'b', label='real')
plt.plot((train_size * 3, train_size * 3), (0, 1), 'g--')  # divider: training data on the left, test data on the right
plt.legend(loc='best')
plt.show()
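# A small usage sketch: recover one prediction on the pre-normalisation scale.
# This is illustrative only; the names last_window / last_pred / last_pred_rescaled are not
# part of the original script, and the values remain divided by 10,000 as done above.
last_window = dataset_x[-1].reshape(1, 1, 27)  # last monthly window, already a tensor on `device`
with torch.no_grad():
    last_pred = model(last_window).view(-1).cpu().numpy()
# Invert the 0~1 min-max scaling applied earlier.
last_pred_rescaled = last_pred * (max_value - min_value) + min_value
print('predicted last 3 days of the final window:', last_pred_rescaled)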