You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

162 lines
5.9 KiB
Python

11 months ago
import time
import numpy as np
import pandas as pd
import torch
from torch import nn
import os
11 months ago
from multiprocessing import Pool
11 months ago
DAYS_FOR_TRAIN = 9
class LSTM_Regression(nn.Module):
def __init__(self, input_size, hidden_size, output_size=1, num_layers=2):
super().__init__()
self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
self.fc = nn.Linear(hidden_size, output_size)
def forward(self, _x):
x, _ = self.lstm(_x) # _x is input, size (seq_len, batch, input_size)
s, b, h = x.shape # x is output, size (seq_len, batch, hidden_size)
x = x.view(s * b, h)
x = self.fc(x)
x = x.view(s, b, -1) # 把形状改回来
return x
def create_dataset(data, days_for_train=5) -> (np.array, np.array):
dataset_x, dataset_y = [], []
for i in range(len(data) - days_for_train):
_x = data[i:(i + days_for_train)]
dataset_x.append(_x)
dataset_y.append(data[i + days_for_train])
return (np.array(dataset_x), np.array(dataset_y))
def to_data(file_dir, excel):
data = pd.read_excel(os.path.join(file_dir, excel), sheet_name=0)
data.columns = data.columns.map(lambda x: x.strip())
data.sort_values(by='stat_date', ascending=True)
data.drop(columns=[i for i in data.columns if (data[i] == 0).sum() / len(data) >= 0.5], inplace=True) # 去除0值列
print('len(data):', len(data))
11 months ago
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("运算类型:", device)
11 months ago
for industry in data.columns[1:]:
11 months ago
c = time.time()
11 months ago
df = data[['stat_date', industry]]
df = df[df[industry] != 0] # 去除0值行
df['stat_date'] = pd.to_datetime(df['stat_date'], format='%Y%m%d', errors='coerce').astype('string')
df.set_index('stat_date', inplace=True)
df = df[(df.index.str[:6] != '202309') & (df.index.str[:6] != '202310')]
df = df.astype('float32').values # 转换数据类型
# 标准化到0~1
max_value = np.max(df)
min_value = np.min(df)
df = (df - min_value) / (max_value - min_value)
dataset_x, dataset_y = create_dataset(df, DAYS_FOR_TRAIN)
11 months ago
print("========")
11 months ago
print('len(dataset_x:)', len(dataset_x))
# 划分训练集和测试集
train_size = len(dataset_x) - 31
train_x = dataset_x[:train_size]
train_y = dataset_y[:train_size]
# 将数据改变形状RNN 读入的数据维度是 (seq_size, batch_size, feature_size)
train_x = train_x.reshape(-1, 1, DAYS_FOR_TRAIN)
train_y = train_y.reshape(-1, 1, 1)
# 转为pytorch的tensor对象
train_x = torch.from_numpy(train_x)
train_y = torch.from_numpy(train_y)
model = LSTM_Regression(DAYS_FOR_TRAIN, 32, output_size=1, num_layers=2) # 导入模型并设置模型的参数输入输出层、隐藏层等
11 months ago
# 训练使用GPU
if torch.cuda.is_available():
train_x = train_x.cuda()
train_y = train_y.cuda()
model.to(device)
11 months ago
train_loss = []
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
11 months ago
11 months ago
a = time.time()
11 months ago
print(industry, "行业加载时间", a - c)
11 months ago
for i in range(1200):
out = model(train_x)
loss = loss_function(out, train_y)
loss.backward()
optimizer.step()
optimizer.zero_grad()
train_loss.append(loss.item())
b = time.time()
11 months ago
print(excel, industry, '训练用时', b - a)
11 months ago
# 保存模型
# torch.save(model.state_dict(),save_filename)
# torch.save(model.state_dict(),os.path.join(model_save_dir,model_file))
# for test
model = model.eval() # 转换成测试模式
# model.load_state_dict(torch.load(os.path.join(model_save_dir,model_file))) # 读取参数
dataset_x = dataset_x.reshape(-1, 1, DAYS_FOR_TRAIN) # (seq_size, batch_size, feature_size)
dataset_x = torch.from_numpy(dataset_x)
11 months ago
# 测试使用GPU
if torch.cuda.is_available():
dataset_x = dataset_x.cuda()
11 months ago
pred_test = model(dataset_x) # 全量训练集
# 模型输出 (seq_size, batch_size, output_size)
11 months ago
# 测试使用GPU ######################################################### 注意调整这里,反复转换效率不高
if torch.cuda.is_available():
# 不支持将GPU tensor转换为numpy
pred_test = pred_test.cpu()
# 测试使用GPU ######################################################### 注意调整这里,反复转换效率不高
11 months ago
pred_test = pred_test.view(-1).data.numpy()
pred_test = np.concatenate((np.zeros(DAYS_FOR_TRAIN), pred_test))
assert len(pred_test) == len(df)
# 反归一化
pred_test = pred_test * (max_value - min_value) + min_value
df = df * (max_value - min_value) + min_value
pred_test = pred_test.reshape(-1)
df = df.reshape(-1)
# 打印指标
print(abs(pred_test[-31:] - df[-31:]).mean() / df[-31:].mean())
result_eight = pd.DataFrame({'pred_test': pred_test[-31:], 'real': df[-31:]})
target = (result_eight['pred_test'][-3:].sum() - result_eight['real'][-3:].sum()) / result_eight[
'real'].sum()
11 months ago
# print(target)
11 months ago
with open(fr'.\cws_to_data\{excel[:2]}.txt', 'a', encoding='utf-8') as f:
tmp_data = {'city': excel[:2], 'industry': industry, "month_deviation_rate": round(target, 5)}
f.write(str(tmp_data) + "\n")
11 months ago
print("========")
11 months ago
if __name__ == '__main__':
file_dir = r'./浙江所有地市133行业数据'
11 months ago
11 months ago
p = Pool(6)
11 months ago
for excel in os.listdir(file_dir):
11 months ago
p.apply_async(func=to_data, args=(file_dir, excel))
p.close()
p.join()
11 months ago
11 months ago
# for excel in os.listdir((file_dir)):
# to_data(file_dir, excel)