"""Train an LSTM that forecasts the next 3 values of an industry's series.

Pipeline (see ``main``): load the merged per-city CSV, min-max scale every
industry column, cut each (city, industry) series into 10-step input windows
with 3-step targets, train with MSE loss, keep the checkpoint with the lowest
evaluation loss, then plot predictions against the targets.
"""
import os
from typing import Tuple

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

# File and column names used by the pipeline.
DATA_CSV = '11市行业数据(已处理异常).csv'
RAW_DATA_DIR = './浙江各地市行业电量数据'
CITY_COLUMN = '地市'
CHECKPOINT_PATH = 'best_3.pth'

WINDOW = 10   # number of past steps fed to the model
HORIZON = 3   # number of future steps predicted


class LSTM(nn.Module):
    """LSTM encoder followed by a two-layer fully connected head.

    Args:
        input_size: number of features per time step.
        hidden_size: LSTM hidden state width.
        output_size: number of values predicted per time step.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.fc1 = nn.Linear(hidden_size, 128)
        self.fc2 = nn.Linear(128, output_size)
        # The original stored this as ``ReLu`` but called ``ReLU`` in
        # forward(), which raised AttributeError on the first forward pass.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return predictions of shape ``(rows, output_size)``.

        ``x`` is either ``(seq, batch, input_size)`` or a 2-D
        ``(rows, input_size)`` tensor. The 2-D form is what the DataLoaders
        yield; it is expanded to ``(rows, 1, input_size)``, i.e. the same
        layout the plotting code builds with ``reshape(-1, 1, 10)``.
        """
        if x.dim() == 2:
            x = x.unsqueeze(1)
        x, _ = self.lstm(x)
        # Flatten (seq, batch, hidden) so the linear head sees one row per step.
        x = x.reshape(-1, x.shape[-1])
        output = self.relu(self.dropout(self.fc1(x)))
        return self.fc2(output)


def normal(data):
    """Return a boolean mask of values inside the 1.5 * IQR fences.

    Args:
        data: numeric pandas Series.

    Returns:
        Boolean Series, True where ``Q1 - 1.5*IQR <= value <= Q3 + 1.5*IQR``.
    """
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    return (data >= q1 - 1.5 * iqr) & (data <= q3 + 1.5 * iqr)


def merge_city_files(file_dir=RAW_DATA_DIR, out_csv=DATA_CSV):
    """Merge the per-city Excel files into one cleaned CSV.

    One-off preprocessing step that produced ``DATA_CSV``; ``main`` does not
    call it. For every file in ``file_dir``, each column from the third onward
    has its outliers (per ``normal``) replaced by the next valid value, with a
    forward fill for any trailing gap.

    Args:
        file_dir: directory containing one Excel file per city.
        out_csv: destination CSV path (written GBK-encoded, without index).

    Returns:
        The merged DataFrame.
    """
    frames = []
    for city in os.listdir(file_dir):
        df_city = pd.read_excel(os.path.join(file_dir, city))
        for industry in df_city.columns[2:]:
            column = df_city[industry]
            cleaned = column.where(normal(column), other=np.nan)
            df_city[industry] = cleaned.bfill().ffill()
        frames.append(df_city)
    df = pd.concat(frames) if frames else pd.DataFrame({})
    print(df.shape)
    df.to_csv(out_csv, index=False, encoding='GBK')
    return df


def create_dataset(data, days_for_train=10,
                   days_to_predict=3) -> Tuple[np.ndarray, np.ndarray]:
    """Cut a series into sliding (input window, target window) pairs.

    Args:
        data: 1-D sequence of values (list, ndarray or pandas Series).
        days_for_train: length of each input window.
        days_to_predict: length of each target window.

    Returns:
        ``(dataset_x, dataset_y)`` with shapes ``(n, days_for_train)`` and
        ``(n, days_to_predict)``. ``n`` is 0 (with the same trailing shapes,
        so results can still be concatenated) when the series is shorter than
        one full window.
    """
    values = np.asarray(data)
    span = days_for_train + days_to_predict
    # +1 so the final complete window is included; the original range
    # stopped one short and silently dropped it.
    n_windows = len(values) - span + 1
    if n_windows <= 0:
        tail = values.shape[1:]
        return (np.empty((0, days_for_train) + tail, dtype=values.dtype),
                np.empty((0, days_to_predict) + tail, dtype=values.dtype))
    dataset_x = np.stack(
        [values[i:i + days_for_train] for i in range(n_windows)])
    dataset_y = np.stack(
        [values[i + days_for_train:i + span] for i in range(n_windows)])
    return dataset_x, dataset_y


def normalize_columns(df, columns):
    """Min-max scale ``columns`` of ``df`` in place.

    Returns:
        Dict mapping each column to ``{'min': ..., 'max': ...}`` as seen
        before scaling.
    """
    # Imported here so the model and dataset helpers stay importable without
    # scikit-learn installed.
    from sklearn.preprocessing import MinMaxScaler

    column_params = {}
    for column in columns:
        scaler = MinMaxScaler()
        df[column] = scaler.fit_transform(df[[column]]).ravel()
        column_params[column] = {'min': scaler.data_min_[0],
                                 'max': scaler.data_max_[0]}
    return column_params


def build_windows(df, industries, city_column=CITY_COLUMN):
    """Build windows for every (industry, city) pair and stack them.

    The smallest unit is one industry of one city, so no window spans two
    series. Order is industry-major: all cities of the first industry, then
    all cities of the second, and so on.
    """
    cities = df[city_column].drop_duplicates()
    xs, ys = [], []
    for industry in industries:
        for city in cities:
            series = df.loc[df[city_column] == city, industry]
            x, y = create_dataset(series, WINDOW, HORIZON)
            xs.append(x)
            ys.append(y)
    # Concatenate once instead of re-copying the growing arrays every step.
    return np.concatenate(xs), np.concatenate(ys)


def evaluate(model, eval_dl, loss_fn, device):
    """Return the mean per-batch loss of ``model`` over ``eval_dl``.

    Leaves the model in eval mode.

    Raises:
        ValueError: if ``eval_dl`` yields no batches.
    """
    if len(eval_dl) == 0:
        raise ValueError('evaluation DataLoader yields no batches')
    model.eval()
    total = 0.0
    with torch.no_grad():
        for x, y in eval_dl:
            x, y = x.to(device), y.to(device)
            total += loss_fn(model(x), y).item()
    return total / len(eval_dl)


def train(model, dl, eval_dl, device, epochs=10, lr=0.0001,
          checkpoint=CHECKPOINT_PATH):
    """Train ``model`` and save the weights with the lowest evaluation loss.

    Returns:
        The best (lowest) evaluation loss observed.
    """
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Start at infinity so the first epoch always writes a checkpoint; a fixed
    # threshold could leave no file for the later load.
    min_loss = float('inf')
    for epoch in range(epochs):
        # evaluate() switches to eval mode; re-enable dropout every epoch.
        model.train()
        for step, (x, y) in enumerate(dl):
            x, y = x.to(device), y.to(device)
            pred = model(x)
            loss = loss_fn(pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (step + 1) % 1000 == 0:
                print(f'epoch{epoch+1}: train_step{step}/{len(dl)} '
                      f'train_loss:{loss.item():.6f}\n')
        eval_loss = evaluate(model, eval_dl, loss_fn, device)
        print(f'epoch{epoch+1}: eval_loss:{eval_loss}')
        if eval_loss < min_loss:
            min_loss = eval_loss
            torch.save(model.state_dict(), checkpoint)
    return min_loss


def plot_predictions(pred, real, split_index):
    """Plot predicted against real values with a train/test divider."""
    # Imported here so importing this module does not require matplotlib.
    import matplotlib.pyplot as plt

    plt.plot(pred, 'r', label='prediction')
    plt.plot(real, 'b', label='real')
    # Divider: training data on the left, test output on the right.
    plt.plot((split_index, split_index), (0, 1), 'g--')
    plt.legend(loc='best')
    plt.show()


def main():
    """Run the full load / scale / window / train / plot pipeline."""
    df = pd.read_csv(DATA_CSV, encoding='gbk')
    print(int(df.isnull().sum().sum()))
    print(df.describe())

    # Normalize every industry column (columns from the third onward).
    industries = df.columns[2:]
    column_params = normalize_columns(df, industries)
    print(column_params)
    print(df.head())

    dataset_x, dataset_y = build_windows(df, industries)
    print(dataset_x.shape, dataset_y.shape)

    train_size = int(0.7 * len(dataset_x))
    x_train = torch.as_tensor(dataset_x[:train_size], dtype=torch.float32)
    y_train = torch.as_tensor(dataset_y[:train_size], dtype=torch.float32)
    x_eval = torch.as_tensor(dataset_x[train_size:], dtype=torch.float32)
    y_eval = torch.as_tensor(dataset_y[train_size:], dtype=torch.float32)

    dl = DataLoader(TensorDataset(x_train, y_train), batch_size=128,
                    shuffle=True, drop_last=True)
    eval_dl = DataLoader(TensorDataset(x_eval, y_eval), batch_size=256,
                         drop_last=True)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = LSTM(WINDOW, 64, HORIZON, num_layers=2).to(device)
    train(model, dl, eval_dl, device)

    # Reload the best checkpoint into a fresh model for the final plot.
    model = LSTM(WINDOW, 64, HORIZON, num_layers=2).to(device)
    model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))
    model.eval()  # a fresh module starts in train mode; disable dropout
    all_x = torch.as_tensor(dataset_x.reshape(-1, 1, WINDOW),
                            dtype=torch.float32).to(device)
    with torch.no_grad():
        pred = model(all_x).reshape(-1).cpu().numpy()

    # pred and dataset_y flatten to the same length and ordering, so no
    # leading padding is added; padding would shift the red curve against
    # both the blue curve and the split marker.
    plot_predictions(pred, dataset_y.reshape(-1), train_size * HORIZON)


if __name__ == '__main__':
    main()