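"""LSTM forecasting of per-industry electricity consumption for 11 cities:
outlier cleaning (1.5*IQR fences), min-max scaling, sliding-window dataset
construction, full-batch training with best-checkpoint selection, and a
prediction-vs-real plot."""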

import os
import copy

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"  # work around duplicate-OpenMP crashes (common MKL/conda issue)
torch.manual_seed(42)

train_step = 10  # length of the input window (time steps per sample)
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.fc1 = nn.Linear(hidden_size, output_size)
        # self.fc2 = nn.Linear(128, output_size)
        # self.ReLu = nn.ReLU()
        # self.dropout = nn.Dropout(0.8)

    def forward(self, x):
        # x: (seq_len, batch, input_size) -> (seq_len, batch, hidden_size)
        x, _ = self.lstm(x)
        s, b, h = x.shape
        # Flatten to (seq_len * batch, hidden) for the linear layer, then restore.
        x = x.reshape(-1, h)
        # output = self.ReLu(self.dropout(self.fc1(x)))
        output = self.fc1(x)
        output = output.view(s, b, -1)
        return output
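# Example: LSTM(input_size=10, hidden_size=32, output_size=3, num_layers=2),
# as instantiated below, maps (seq_len, batch, 10) -> (seq_len, batch, 3).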
def normal(data):
    # True for values inside the 1.5*IQR Tukey fences; False marks outliers.
    q1, q3 = data.quantile(0.25), data.quantile(0.75)
    high = q3 + 1.5 * (q3 - q1)
    low = q1 - 1.5 * (q3 - q1)
    return (data >= low) & (data <= high)
# One-off preprocessing: merge the 11 city workbooks, clean outliers,
# and persist the result to the CSV read back below.
# file_dir = './浙江各地市行业电量数据'
#
# # Merge the 11 cities
# df = pd.DataFrame({})
# for city in os.listdir(file_dir):
#     df_city = pd.read_excel(os.path.join(file_dir, city))
#
#     # For each industry in each city, blank out outliers and back-fill
#     for industry in df_city.columns[2:]:
#         df_city[industry] = df_city[industry].where(normal(df_city[industry]), other=np.nan).bfill()
#         df_city[industry] = df_city[industry].ffill()
#     df = pd.concat([df, df_city])
#     print(df.shape)
#
# df.to_csv('11市行业数据(已处理异常).csv', index=False, encoding='GBK')
df = pd.read_csv('11市行业数据(已处理异常).csv', encoding='gbk')

# Min-max normalise every industry column, keeping each column's min/max
# so predictions can later be mapped back to the original scale.
column_params = {}
for column in df.columns[2:]:
    scaler = MinMaxScaler()
    df[column] = scaler.fit_transform(df[[column]])
    column_params[column] = {'min': scaler.data_min_[0], 'max': scaler.data_max_[0]}
print(column_params)
print(df.head())
def create_dataset(data, train_step=train_step) -> tuple[np.ndarray, np.ndarray]:
    # Slide a window of `train_step` inputs over the series; the target is
    # the following 3 values (a 3-step-ahead forecast).
    dataset_x, dataset_y = [], []
    for i in range(len(data) - train_step - 3):
        dataset_x.append(data[i:(i + train_step)])
        dataset_y.append(data[i + train_step:i + train_step + 3])
    return np.array(dataset_x), np.array(dataset_y)
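# For a length-100 series this yields dataset_x.shape == (87, 10) and
# dataset_y.shape == (87, 3): range(100 - 10 - 3) produces 87 windows.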
# Build the x/y datasets with a window of 10; the smallest unit is a single
# industry within a single city.
# Start from the first industry, concatenating across all cities.
# industry = df.columns[2:][0]
# city = df['地市'].drop_duplicates()[0]
# df_city_industry = df[df['地市'] == city][industry]
# dataset_x, dataset_y = create_dataset(df_city_industry)
#
# for city in df['地市'].drop_duplicates()[1:]:
#     df_city_industry = df[df['地市'] == city][industry]
#     x, y = create_dataset(df_city_industry)
#     dataset_x, dataset_y = np.concatenate([dataset_x, x]), np.concatenate([dataset_y, y])
#
# # Then every remaining industry, again across all cities.
# for industry in df.columns[2:][1:]:
#     for city in df['地市'].drop_duplicates():
#         df_city_industry = df[df['地市'] == city][industry]
#         x, y = create_dataset(df_city_industry)
#         dataset_x, dataset_y = np.concatenate([dataset_x, x]), np.concatenate([dataset_y, y])
#
# print(dataset_x.shape, dataset_y.shape)
# df_x = pd.DataFrame(dataset_x)
# df_y = pd.DataFrame(dataset_y)
# df_x.to_csv('df_x_100.csv', index=False)
# df_y.to_csv('df_y_100.csv', index=False)
dataset_x = pd.read_csv('df_x.csv').values
dataset_y = pd.read_csv('df_y.csv').values
print(dataset_x.shape, dataset_y.shape)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 70/30 train/eval split by index (no shuffling)
train_size = int(len(dataset_x) * 0.7)
train_x = dataset_x[:train_size]
train_y = dataset_y[:train_size]
eval_x = dataset_x[train_size:]
eval_y = dataset_y[train_size:]

# Reshape: nn.LSTM (batch_first=False) reads (seq_len, batch_size, feature_size)
train_x = train_x.reshape(-1, 1, train_step)
train_y = train_y.reshape(-1, 1, 3)
eval_x = eval_x.reshape(-1, 1, train_step)
eval_y = eval_y.reshape(-1, 1, 3)

# Convert to PyTorch tensors
train_x = torch.from_numpy(train_x).to(device).type(torch.float32)
train_y = torch.from_numpy(train_y).to(device).type(torch.float32)
eval_x = torch.from_numpy(eval_x).to(device).type(torch.float32)
eval_y = torch.from_numpy(eval_y).to(device).type(torch.float32)
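# Resulting shapes: train_x is (train_size, 1, 10), so each 10-value window is
# one feature vector and the whole training set is fed to the LSTM as a single
# sequence of length train_size with batch size 1.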
# ds = TensorDataset(train_x, train_y)
# dl = DataLoader(ds, batch_size=32, drop_last=True)
# eval_ds = TensorDataset(eval_x, eval_y)
# eval_dl = DataLoader(eval_ds, batch_size=64, drop_last=True)
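# The mini-batch pipeline above (plus the epoch loop commented out below) is
# kept for reference; the script instead trains full-batch on train_x.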
model = LSTM(train_step, 32, 3, num_layers=2).to(device)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

min_loss = float('inf')
best_para = copy.deepcopy(model.state_dict())
for i in range(1500):
    model.train()
    out = model(train_x)
    loss = loss_fn(out, train_y)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # Track the checkpoint with the lowest evaluation loss.
    model.eval()
    with torch.no_grad():
        pred = model(eval_x)
        eval_loss = loss_fn(pred, eval_y)
    if eval_loss <= min_loss:
        min_loss = eval_loss
        # Deep-copy: state_dict() returns live references to the weights.
        best_para = copy.deepcopy(model.state_dict())
    if i % 100 == 0:
        print(f'epoch {i+1}: loss:{loss.item():.6f} eval_loss:{eval_loss.item():.6f}')
# for epoch in range(3):
#     model.train()
#     for step, (x, y) in enumerate(dl):
#         x, y = x.to(device), y.to(device)
#         pred = model(x)
#         loss = loss_fn(pred, y)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#
#         if step % 1000 == 0:
#             print(f'epoch{epoch+1}: train_step:{step}/{len(dl)} train_loss:{loss}\n')
#
#     model.eval()
#     batch_loss = 0
#     with torch.no_grad():
#         for x, y in eval_dl:
#             x, y = x.to(device), y.to(device)
#             pred = model(x)
#             loss = loss_fn(pred, y)
#             batch_loss += loss
#     print(f'epoch{epoch+1}: eval_loss:{batch_loss/len(eval_dl)}\n')
#
#     if batch_loss/len(eval_dl) < min_loss:
#         min_loss = batch_loss/len(eval_dl)
#         best_parameters = model.state_dict()
# Save the best checkpoint and reload it into a fresh model.
torch.save(best_para, 'best_3.pth')
model = LSTM(train_step, 32, 3, num_layers=2).to(device)
model.load_state_dict(torch.load('best_3.pth'))
model.eval()

params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total LSTM parameters:", params)
# Predict over the full dataset and plot predictions against the ground truth.
dataset_x = dataset_x.reshape(-1, 1, train_step)
dataset_x = torch.from_numpy(dataset_x).type(torch.float32).to(device)
with torch.no_grad():
    pred = model(dataset_x).reshape(-1)
pred = np.concatenate((np.zeros(train_step), pred.cpu().numpy()))
plt.plot(pred, 'r', label='prediction')
plt.plot(dataset_y.reshape(-1), 'b', label='real')
plt.plot((train_size * 3, train_size * 3), (0, 1), 'g--')  # divider: training data on the left, eval output on the right
plt.legend(loc='best')
plt.show()
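# A minimal sketch for mapping a scaled prediction back to original units,
# assuming you know which industry column a sample came from (`column` and
# `pred_scaled` here are illustrative names, not defined above):
#   p = column_params[column]
#   pred_original = pred_scaled * (p['max'] - p['min']) + p['min']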