import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import os
import copy

# Work around "duplicate OpenMP runtime" errors seen with some MKL builds
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
torch.manual_seed(42)

train_step = 10


class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        # nn.LSTM defaults to batch_first=False, so inputs are (seq_len, batch, input_size)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.fc1 = nn.Linear(hidden_size, output_size)
        # self.fc2 = nn.Linear(128, output_size)
        # self.relu = nn.ReLU()
        # self.dropout = nn.Dropout(0.8)

    def forward(self, x):
        x, _ = self.lstm(x)      # (seq_len, batch, hidden_size)
        s, b, h = x.shape
        x = x.reshape(-1, h)     # flatten so the linear layer sees (seq_len * batch, hidden_size)
        # output = self.relu(self.dropout(self.fc1(x)))
        output = self.fc1(x)
        output = output.view(s, b, -1)
        return output
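
# Illustrative shape check (kept commented out so it does not consume RNG state
# before the real model is built): with the (seq_len, batch, feature) layout
# used below, a (5, 1, 10) input maps to a (5, 1, 3) output.
# _demo = LSTM(10, 32, 3, num_layers=2)
# print(_demo(torch.zeros(5, 1, 10)).shape)  # torch.Size([5, 1, 3])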


def normal(data):
    # True for values inside the 1.5 * IQR fences (Tukey's rule)
    q1, q3 = data.quantile(0.25), data.quantile(0.75)
    high = q3 + 1.5 * (q3 - q1)
    low = q1 - 1.5 * (q3 - q1)
    return (data >= low) & (data <= high)
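
# Illustrative check on toy data: 100 falls outside the fences.
print(normal(pd.Series([1, 2, 3, 100])).tolist())  # [True, True, True, False]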

# One-time preprocessing (already run; its output CSV is loaded below):
# file_dir = './浙江各地市行业电量数据'
#
# # Merge the 11 cities into one DataFrame
# df = pd.DataFrame({})
# for city in os.listdir(file_dir):
#     df_city = pd.read_excel(os.path.join(file_dir, city))
#     # For each industry in each city, replace outliers with NaN, then backward-fill
#     # (and forward-fill any trailing gaps)
#     for industry in df_city.columns[2:]:
#         df_city[industry] = df_city[industry].where(normal(df_city[industry]), other=np.nan).bfill()
#         df_city[industry] = df_city[industry].ffill()
#     df = pd.concat([df, df_city])
# print(df.shape)
#
# df.to_csv('11市行业数据(已处理异常).csv', index=False, encoding='GBK')

df = pd.read_csv('11市行业数据(已处理异常).csv', encoding='gbk')

# Min-max normalize each industry column, keeping each column's original
# min/max so predictions can be mapped back to the raw scale later.
column_params = {}
for column in df.columns[2:]:
    scaler = MinMaxScaler()
    df[column] = scaler.fit_transform(df[[column]])
    column_params[column] = {'min': scaler.data_min_[0], 'max': scaler.data_max_[0]}

print(column_params)
print(df.head())
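
# The pipeline below never maps predictions back to the original scale; this is
# a minimal sketch of that inverse transform using column_params (the helper
# name denormalize is ours, not part of the original script):
def denormalize(values, column):
    # invert MinMaxScaler's default (0, 1) scaling for the given column
    p = column_params[column]
    return values * (p['max'] - p['min']) + p['min']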


def create_dataset(data, train_step=train_step) -> tuple[np.ndarray, np.ndarray]:
    # Slide a window of train_step points over the series; the next 3 points are the target.
    dataset_x, dataset_y = [], []
    for i in range(len(data) - train_step - 3):
        dataset_x.append(data[i:i + train_step])
        dataset_y.append(data[i + train_step:i + train_step + 3])
    return np.array(dataset_x), np.array(dataset_y)
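
# A quick sanity check of the window shapes on toy data:
_x, _y = create_dataset(np.arange(20))
print(_x.shape, _y.shape)  # (7, 10) (7, 3)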

# Split the x/y datasets with a window of 10; the smallest unit is a single
# industry within a single city. Start with the first industry, concatenating
# all cities, then loop over the remaining industries.

# industry = df.columns[2:][0]
# city = df['地市'].drop_duplicates()[0]
# df_city_industry = df[df['地市'] == city][industry]
# dataset_x, dataset_y = create_dataset(df_city_industry)
#
# for city in df['地市'].drop_duplicates()[1:]:
#     df_city_industry = df[df['地市'] == city][industry]
#     x, y = create_dataset(df_city_industry)
#     dataset_x, dataset_y = np.concatenate([dataset_x, x]), np.concatenate([dataset_y, y])
#
# for industry in df.columns[2:][1:]:
#     for city in df['地市'].drop_duplicates():
#         df_city_industry = df[df['地市'] == city][industry]
#         x, y = create_dataset(df_city_industry)
#         dataset_x, dataset_y = np.concatenate([dataset_x, x]), np.concatenate([dataset_y, y])
#
# print(dataset_x.shape, dataset_y.shape)
# df_x = pd.DataFrame(dataset_x)
# df_y = pd.DataFrame(dataset_y)
# df_x.to_csv('df_x_100.csv', index=False)
# df_y.to_csv('df_y_100.csv', index=False)
dataset_x = pd.read_csv('df_x.csv').values
dataset_y = pd.read_csv('df_y.csv').values
print(dataset_x.shape, dataset_y.shape)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 70/30 train/eval split
train_size = int(len(dataset_x) * 0.7)
train_x = dataset_x[:train_size]
train_y = dataset_y[:train_size]
eval_x = dataset_x[train_size:]
eval_y = dataset_y[train_size:]

# Reshape: the RNN reads data of shape (seq_size, batch_size, feature_size)
train_x = train_x.reshape(-1, 1, train_step)
train_y = train_y.reshape(-1, 1, 3)
eval_x = eval_x.reshape(-1, 1, train_step)
eval_y = eval_y.reshape(-1, 1, 3)

# Convert to PyTorch tensors
train_x = torch.from_numpy(train_x).to(device).type(torch.float32)
train_y = torch.from_numpy(train_y).to(device).type(torch.float32)
eval_x = torch.from_numpy(eval_x).to(device).type(torch.float32)
eval_y = torch.from_numpy(eval_y).to(device).type(torch.float32)

# ds = TensorDataset(train_x, train_y)
# dl = DataLoader(ds, batch_size=32, drop_last=True)
# eval_ds = TensorDataset(eval_x, eval_y)
# eval_dl = DataLoader(eval_ds, batch_size=64, drop_last=True)

model = LSTM(train_step, 32, 3, num_layers=2).to(device)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

min_loss = float('inf')  # best eval loss seen so far
best_para = copy.deepcopy(model.state_dict())
for i in range(1500):
    # full-batch training step
    model.train()
    out = model(train_x)
    loss = loss_fn(out, train_y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # evaluate on the hold-out split and keep the best parameters;
    # deepcopy is needed because state_dict() returns live tensor references
    model.eval()
    with torch.no_grad():
        pred = model(eval_x)
        eval_loss = loss_fn(pred, eval_y)
        if eval_loss <= min_loss:
            min_loss = eval_loss
            best_para = copy.deepcopy(model.state_dict())

    if i % 100 == 0:
        print(f'epoch {i + 1}: loss:{loss} eval_loss:{eval_loss}')

# Alternative: mini-batch training over the DataLoader defined above.
# for epoch in range(3):
#     model.train()
#     for step, (x, y) in enumerate(dl):
#         x, y = x.to(device), y.to(device)
#         pred = model(x)
#         loss = loss_fn(pred, y)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#
#         if step % 1000 == 0:
#             print(f'epoch{epoch + 1}: train_step:{step}/{len(dl)} train_loss:{loss}\n')
#
#     model.eval()
#     batch_loss = 0
#     with torch.no_grad():
#         for x, y in eval_dl:
#             x, y = x.to(device), y.to(device)
#             pred = model(x)
#             loss = loss_fn(pred, y)
#             batch_loss += loss
#     print(f'epoch{epoch + 1}: eval_loss:{batch_loss / len(eval_dl)}\n')
#
#     if batch_loss / len(eval_dl) < min_loss:
#         min_loss = batch_loss / len(eval_dl)
#         best_parameters = model.state_dict()

torch.save(best_para, 'best_3.pth')

# Reload the best checkpoint and report the model size
model = LSTM(train_step, 32, 3, num_layers=2).to(device)
model.load_state_dict(torch.load('best_3.pth'))
params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total LSTM parameters:", params)

# Run the model over the full dataset; zero-pad the first train_step points
# so the prediction series lines up for plotting.
dataset_x = dataset_x.reshape(-1, 1, train_step)
dataset_x = torch.from_numpy(dataset_x).type(torch.float32).to(device)
model.eval()
with torch.no_grad():
    pred = model(dataset_x).reshape(-1)
pred = np.concatenate((np.zeros(train_step), pred.cpu().numpy()))

plt.plot(pred, 'r', label='prediction')
plt.plot(dataset_y.reshape(-1), 'b', label='real')
# Divider: training data on the left, eval-set output on the right
# (each sample contributes 3 flattened points, hence train_size * 3)
plt.plot((train_size * 3, train_size * 3), (0, 1), 'g--')
plt.legend(loc='best')
plt.show()