Remove duplicate dataset

main
get committed 10 months ago
parent 6308d6d555
commit 22f4141a79

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

Binary file not shown.

Binary file not shown.

@@ -5,24 +5,27 @@ import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
torch.manual_seed(42)
train_step = 10

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.fc1 = nn.Linear(hidden_size, 128)
        self.fc2 = nn.Linear(128, output_size)
        self.ReLu = nn.ReLU()
        self.dropout = nn.Dropout(0.8)
        self.fc1 = nn.Linear(hidden_size, output_size)
        # self.fc2 = nn.Linear(128, output_size)
        # self.ReLu = nn.ReLU()
        # self.dropout = nn.Dropout(0.8)

    def forward(self, x):
        x, _ = self.lstm(x)
        s, b, h = x.shape
        x = x.reshape(-1, h)
        output = self.ReLu(self.dropout(self.fc1(x)))
        output = self.fc2(output)
        # output = self.ReLu(self.dropout(self.fc1(x)))
        output = self.fc1(x)
        output = output.view(s, b, -1)
        return output
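
A minimal sketch (not part of the commit) of the reshape pattern used in forward(): the refactored class maps the LSTM hidden states straight to the three outputs through a single linear layer. It assumes the default batch_first=False, so nn.LSTM returns output of shape (seq_len, batch, hidden_size); the linear layer is applied to the flattened (seq_len * batch, hidden_size) view and the result is viewed back to (seq_len, batch, output_size). The sizes mirror train_step = 10 and the hidden size 32 used later in the script.

import torch
from torch import nn

seq_len, batch, input_size, hidden_size, output_size = 1, 32, 10, 32, 3
lstm = nn.LSTM(input_size, hidden_size, num_layers=2)
fc = nn.Linear(hidden_size, output_size)

x = torch.randn(seq_len, batch, input_size)   # (seq, batch, feature)
out, _ = lstm(x)                              # (seq_len, batch, hidden_size)
s, b, h = out.shape
pred = fc(out.reshape(-1, h))                 # (seq_len * batch, output_size)
pred = pred.view(s, b, -1)                    # back to (seq_len, batch, output_size)
print(pred.shape)                             # torch.Size([1, 32, 3])
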
@@ -100,36 +103,55 @@ dataset_x = pd.read_csv('df_x.csv').values
dataset_y = pd.read_csv('df_y.csv').values
print(dataset_x.shape, dataset_y.shape)

train_size = int(0.7 * len(dataset_x))
x_train, y_train = dataset_x[:train_size].reshape(-1,1,train_step), dataset_y[:train_size].reshape(-1, 1, 3)
x_eval, y_eval = dataset_x[train_size:].reshape(-1,1,train_step), dataset_y[train_size:].reshape(-1, 1, 3)
x_train, y_train = torch.from_numpy(x_train).type(torch.float32), torch.from_numpy(y_train).type(torch.float32)
x_eval, y_eval = torch.from_numpy(x_eval).type(torch.float32), torch.from_numpy(y_eval).type(torch.float32)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_size = int(len(dataset_x)*0.7)
train_x = dataset_x[:train_size]
train_y = dataset_y[:train_size]
eval_x = dataset_x[train_size:]
eval_y = dataset_y[train_size:]

# Reshape the data: the RNN expects inputs of shape (seq_size, batch_size, feature_size)
train_x = train_x.reshape(-1, 1, train_step)
train_y = train_y.reshape(-1, 1, 3)
eval_x = eval_x.reshape(-1, 1, train_step)
eval_y = eval_y.reshape(-1, 1, 3)

ds = TensorDataset(x_train, y_train)
dl = DataLoader(ds, batch_size=32, drop_last=True)
eval_ds = TensorDataset(x_eval, y_eval)
eval_dl = DataLoader(eval_ds, batch_size=64, drop_last=True)

# Convert to PyTorch tensors
train_x = torch.from_numpy(train_x).to(device).type(torch.float32)
train_y = torch.from_numpy(train_y).to(device).type(torch.float32)
eval_x = torch.from_numpy(eval_x).to(device).type(torch.float32)
eval_y = torch.from_numpy(eval_y).to(device).type(torch.float32)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTM(train_step,64, 3, num_layers=2).to(device)
# ds = TensorDataset(x_train, y_train)
# dl = DataLoader(ds, batch_size=32, drop_last=True)
# eval_ds = TensorDataset(x_eval, y_eval)
# eval_dl = DataLoader(eval_ds, batch_size=64, drop_last=True)
model = LSTM(train_step,32, 3, num_layers=2).to(device)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00005, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
min_loss = 1

for i in range(500):
train_x,train_y = train_x.to(device),train_y.to(device)
for i in range(1500):
    model.train()
    out = model(train_x)
    loss = loss_fn(out, train_y)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    if loss <= min_loss:
        min_loss = loss
        best_para = model.state_dict()
    model.eval()
    with torch.no_grad():
        pred = model(eval_x)
        eval_loss = loss_fn(pred,eval_y)
        if eval_loss <= min_loss:
            min_loss = eval_loss
            best_para = model.state_dict()
    if i % 100 == 0:
        print(f'epoch {i+1}: loss:{loss}')
        print(f'epoch {i+1}: loss:{loss} eval_loss:{eval_loss}')

# for epoch in range(3):
#     model.train()
@@ -158,9 +180,9 @@ for i in range(500):
# min_loss = batch_loss/len(eval_dl)
# best_parameters = model.state_dict()
torch.save(best_parameters,'best_3.pth')
torch.save(best_para,'best_3.pth')

model = LSTM(train_step,64, 3, num_layers=2).to(device)
model = LSTM(train_step,32, 3, num_layers=2).to(device)
model.load_state_dict(torch.load('best_3.pth'))
params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total number of LSTM parameters:", params)

File diff suppressed because it is too large

@@ -149,9 +149,6 @@ for industry in data.columns:
    print(df_result)
    # result_eight.to_csv(f'9月{excel[:2]}.txt', sep='\t', mode='a')
    # with open(fr'./偏差/9月底偏差率.txt', 'a', encoding='utf-8') as f:
    #     f.write(f'{excel[:2]}{industry}:{round(target, 5)}\n')

@@ -54,42 +54,42 @@ def data_preprocessing(data):
    return data

# Build the combined dataset
file_dir = './浙江各地市行业电量数据'
excel = os.listdir(file_dir)[0]
data = pd.read_excel(os.path.join(file_dir, excel), sheet_name=0, index_col='stat_date')
data.drop(columns='地市',inplace=True)
data = data_preprocessing(data)

df = data[data.columns[0]]
df.dropna(inplace = True)
dataset_x, dataset_y = create_dataset(df, DAYS_FOR_TRAIN)

for level in data.columns[1:]:
    df = data[level]
    df.dropna(inplace=True)
    x, y = create_dataset(df, DAYS_FOR_TRAIN)
    dataset_x = np.concatenate((dataset_x, x))
    dataset_y = np.concatenate((dataset_y, y))

for excel in os.listdir(file_dir)[1:]:
    data = pd.read_excel(os.path.join(file_dir,excel), sheet_name=0,index_col='stat_date')
    data.drop(columns='地市', inplace=True)
    data = data_preprocessing(data)
    for level in data.columns:
        df = data[level]
        df.dropna(inplace=True)
        x,y = create_dataset(df,DAYS_FOR_TRAIN)
        dataset_x = np.concatenate((dataset_x,x))
        dataset_y = np.concatenate((dataset_y,y))

df_x_10 = pd.DataFrame(dataset_x)
df_y_10 = pd.DataFrame(dataset_y)
df_x_10.to_csv('df_x_10.csv',index=False)
df_y_10.to_csv('df_y_10.csv',index=False)

# file_dir = './浙江各地市行业电量数据'
# excel = os.listdir(file_dir)[0]
# data = pd.read_excel(os.path.join(file_dir, excel), sheet_name=0, index_col='stat_date')
# data.drop(columns='地市',inplace=True)
# data = data_preprocessing(data)
#
# df = data[data.columns[0]]
# df.dropna(inplace = True)
# dataset_x, dataset_y = create_dataset(df, DAYS_FOR_TRAIN)
#
# for level in data.columns[1:]:
#     df = data[level]
#     df.dropna(inplace=True)
#     x, y = create_dataset(df, DAYS_FOR_TRAIN)
#     dataset_x = np.concatenate((dataset_x, x))
#     dataset_y = np.concatenate((dataset_y, y))
#
#
# for excel in os.listdir(file_dir)[1:]:
#
#     data = pd.read_excel(os.path.join(file_dir,excel), sheet_name=0,index_col='stat_date')
#     data.drop(columns='地市', inplace=True)
#     data = data_preprocessing(data)
#
#     for level in data.columns:
#         df = data[level]
#         df.dropna(inplace=True)
#         x,y = create_dataset(df,DAYS_FOR_TRAIN)
#         dataset_x = np.concatenate((dataset_x,x))
#         dataset_y = np.concatenate((dataset_y,y))
#
#
# df_x_10 = pd.DataFrame(dataset_x)
# df_y_10 = pd.DataFrame(dataset_y)
# df_x_10.to_csv('df_x_10.csv',index=False)
# df_y_10.to_csv('df_y_10.csv',index=False)

dataset_x = pd.read_csv('df_x_10.csv').values
dataset_y = pd.read_csv('df_y_10.csv').values
print(dataset_x.shape,dataset_y.shape)
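
create_dataset is defined elsewhere in this file, so the helper below is only a hypothetical sketch of the sliding-window logic its outputs imply: each sample is a window of DAYS_FOR_TRAIN consecutive values and the target is the next three values, matching the later reshape(-1, 1, 3) of dataset_y. The function name and the horizon parameter are illustrative, not from the repository.

import numpy as np

def create_dataset_sketch(series, days_for_train, horizon=3):
    # Hypothetical stand-in: windows of past values as inputs, the next `horizon` values as targets.
    values = np.asarray(series, dtype=float)
    xs, ys = [], []
    for i in range(len(values) - days_for_train - horizon + 1):
        xs.append(values[i:i + days_for_train])
        ys.append(values[i + days_for_train:i + days_for_train + horizon])
    return np.array(xs), np.array(ys)

x, y = create_dataset_sketch(np.arange(20.0), days_for_train=10)
print(x.shape, y.shape)   # (8, 10) (8, 3)
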
@@ -115,30 +115,31 @@ train_y = train_y.reshape(-1, 1, 3)
train_x = torch.from_numpy(train_x).to(device).type(torch.float32)
train_y = torch.from_numpy(train_y).to(device).type(torch.float32)

model = LSTM_Regression(DAYS_FOR_TRAIN, 32, output_size=3, num_layers=2).to(device)  # instantiate the model and set its input/output and hidden-layer sizes
train_loss = []
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
min_loss = 1

for i in range(500):
    train_x,train_y = train_x.to(device),train_y.to(device)
    out = model(train_x)
    loss = loss_function(out, train_y)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    train_loss.append(loss.item())
    if loss <= min_loss:
        min_loss = loss
        best_para = model.state_dict()
    if i % 100 == 0:
        print(f'epoch {i+1}: loss:{loss}')

# Save / load the model
torch.save(best_para,'hy3.pth')

# for i in range(500):
#     train_x,train_y = train_x.to(device),train_y.to(device)
#     out = model(train_x)
#     loss = loss_function(out, train_y)
#     loss.backward()
#     optimizer.step()
#     optimizer.zero_grad()
#     train_loss.append(loss.item())
#
#     if loss <= min_loss:
#         min_loss = loss
#         best_para = model.state_dict()
#     if i % 100 == 0:
#         print(f'epoch {i+1}: loss:{loss}')
# # Save / load the model
# torch.save(best_para,'hy3.pth')

model = LSTM_Regression(DAYS_FOR_TRAIN, 32, output_size=3, num_layers=2).to(device)
model.load_state_dict(torch.load('hy3.pth'))
# Test
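
A brief, hedged note on the reload above: torch.load restores tensors to the device they were saved from, so a checkpoint written on a GPU machine needs map_location to load on a CPU-only box, and switching to eval mode before the test section is the usual practice. A minimal variant:

model = LSTM_Regression(DAYS_FOR_TRAIN, 32, output_size=3, num_layers=2).to(device)
model.load_state_dict(torch.load('hy3.pth', map_location=device))   # safe on CPU-only machines
model.eval()                                                        # inference mode for the test below
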
@@ -171,7 +172,7 @@ for excel in os.listdir(file_dir):
    df_city = df_city.loc['2023-9'][:-3]
    city = df_city['地市'].iloc[0]
    result_dict = {}
    for industry in df_city.columns[2:]:
    for industry in df_city.columns[1:]:
        df_city[industry] = df_city[industry].astype('float')
        x, y = create_dataset(df_city[industry], 10)
        x = (x - min_value) / (max_value - min_value)
@@ -179,30 +180,14 @@ for excel in os.listdir(file_dir):
        x = torch.from_numpy(x).type(torch.float32).to(device)
        pred = model(x).view(-1)
        pred = pred * (max_value - min_value) + min_value
        result = pred.cpu().detach().numpy()[-5:-2]
        result = pred.cpu().detach().numpy()[-3:]
        result_dict[industry] = list(result)
    df = pd.DataFrame(result_dict,index=['2023-09-28','2023-09-29','2023-09-30'])
    df.to_excel(fr'C:\Users\user\Desktop\9月行业电量预测28-30\{city} .xlsx')
    print(time.time()-t1)
    print(result_dict)

        # Inverse normalization (back to the original scale)
        pred = pred * (max_value - min_value) + min_value
        df = df * (max_value - min_value) + min_value
        # Print the metric
        print(abs(pred - df[-3:]).mean() / df[-3:].mean())
        result_eight = pd.DataFrame({'pred': np.round(pred,1),'real': df[-3:]})
        target = (result_eight['pred'].sum() - result_eight['real'].sum()) / df[-31:].sum()
        result_eight['loss_rate'] = round(target, 5)
        result_eight['level'] = level
        list_app.append(result_eight)
        print(target)
        print(result_eight)

final_df = pd.concat(list_app,ignore_index=True)
final_df.to_csv('市行业电量.csv',encoding='gbk')
print(final_df)
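
The prediction block above scales the inputs with the training min/max, runs the model, and maps the outputs back with the inverse transform; the removed metric code then compares predicted and real 3-day totals against a roughly monthly total (df[-31:]). A self-contained sketch of that min-max round trip and a deviation rate, with made-up numbers and the simpler 3-day total in the denominator:

import numpy as np

min_value, max_value = 0.0, 200.0              # placeholder scaling bounds
real = np.array([110.0, 120.0, 130.0])         # made-up actual values for the last 3 days
pred_scaled = np.array([0.56, 0.61, 0.64])     # made-up model output on the [0, 1] scale

pred = pred_scaled * (max_value - min_value) + min_value   # inverse of (x - min) / (max - min)
deviation_rate = (pred.sum() - real.sum()) / real.sum()    # signed relative deviation of the totals
print(np.round(pred, 1), round(deviation_rate, 5))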
