删除重复数据集

main
get 1 year ago
parent dfbad8daa2
commit edc3951c25

@ -4,24 +4,166 @@ import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from torch import nn from torch import nn
import os import os
from torch.utils.data import TensorDataset,DataLoader
import datetime
# Seed torch's global RNG so weight initialisation is repeatable between runs.
torch.manual_seed(42)
# Works around "OMP: Error #15: Initializing libiomp5md.dll, but found
# libiomp5md.dll already initialized."
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# Let pandas pick the print width itself instead of using a fixed value.
pd.set_option('display.width', None)
class LSTM(nn.Module):
    """Stacked LSTM followed by a single linear projection.

    Args:
        input_size: Number of features in each input step.
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of values the linear head emits per row.
        num_layers: Number of stacked LSTM layers (default 2).
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super().__init__()
        # Forward num_layers to nn.LSTM so the constructor argument is
        # actually honoured rather than being fixed at 2.
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        # Single linear head; the attribute name `fc1` is kept so that saved
        # state_dict keys stay the same.
        self.fc1 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run the LSTM and project every output row.

        Args:
            x: Input tensor whose last dimension is ``input_size``. The
                callers in this file pass (seq_size, batch_size, 27).

        Returns:
            A 2-D tensor of shape (rows, output_size), where ``rows`` is the
            product of all leading dimensions of the LSTM output (the
            sequence and batch axes are flattened together).
        """
        output, _ = self.lstm(x)
        hidden = output.shape[-1]
        # Collapse the leading axes so the linear layer sees one row per
        # (step, batch) position.
        output = output.reshape(-1, hidden)
        return self.fc1(output)
def create_data(df_level, volt_level):
    """Build one (27-value input, 3-value target) sample per calendar month.

    Rows are grouped by the month of ``stat_date``. For each month holding
    28 to 31 rows, the target is the last 3 values and the input is the (up
    to) 27 values immediately before them. Months shorter than 30 rows are
    left-padded with copies of the month's first value so that every input
    has exactly 27 entries:

    * 31 rows: rows 2..28            -> input, rows 29..31 -> target
    * 30 rows: rows 1..27            -> input, rows 28..30 -> target
    * 29 rows: first value + rows 1..26
    * 28 rows: first value x2 + rows 1..25

    Months with any other row count are skipped, and processing continues
    with the next month.

    Args:
        df_level: DataFrame with a datetime ``stat_date`` column and the
            column named by ``volt_level``.
        volt_level: Name of the value column to sample from.

    Returns:
        Tuple ``(x, y)`` of numpy arrays with shapes (n_months, 27) and
        (n_months, 3). When no month qualifies the shapes are (0, 27) and
        (0, 3), so the result can still be concatenated with other results.
    """
    window = 27   # number of input values per sample
    horizon = 3   # number of target values per sample

    dataset_x = []
    dataset_y = []
    # Group the rows by calendar month.
    grouped = df_level.groupby(df_level['stat_date'].dt.to_period('M'))
    for _, group in grouped:
        values = group[volt_level].values
        # Each month must yield exactly one sample. Skip (rather than abort
        # on) months that do not have a full calendar month of rows.
        if not 28 <= len(values) <= 31:
            continue
        history = list(values[:-horizon][-window:])
        padding = [values[0]] * (window - len(history))
        dataset_x.append(padding + history)
        dataset_y.append(list(values[-horizon:]))
    # The reshape is a no-op for non-empty results and gives empty results a
    # well-formed 2-D shape.
    return (np.array(dataset_x).reshape(-1, window),
            np.array(dataset_y).reshape(-1, horizon))
# Build the dataset: one Excel file per city, three voltage-level columns per
# file, and one (input, target) sample per month per column.
file_dir = './浙江各地市分电压日电量数据'
volt_columns = ['1-10kv', '110kv(含66kv)', '35kv']
# os.listdir returns entries in arbitrary order. Sort once so the sample
# order, and therefore the positional train/eval split further down, is the
# same on every machine and every run.
city_files = sorted(os.listdir(file_dir))
print(city_files)

samples_x = []
samples_y = []
for excel in city_files:
    df_city = pd.read_excel(os.path.join(file_dir, excel)).drop(columns='地市')
    # .copy() makes the assignments below write to an independent frame.
    df_city = df_city[['stat_date'] + volt_columns].copy()
    # Scale the raw readings down by 1e4 (the unit is not stated in this file).
    df_city[volt_columns] = df_city[volt_columns] / 10000
    df_city['stat_date'] = pd.to_datetime(df_city['stat_date'])
    for volt_level in volt_columns:
        df_level = df_city[['stat_date', volt_level]]
        x, y = create_data(df_level, volt_level)
        # A column may yield no usable month; skip it so the final
        # concatenation only sees well-formed 2-D arrays.
        if len(x) == 0:
            continue
        samples_x.append(x)
        samples_y.append(y)

if not samples_x:
    raise RuntimeError(f'no samples could be built from {file_dir}')
# Concatenate once at the end instead of re-copying the arrays every iteration.
dataset_x = np.concatenate(samples_x)
dataset_y = np.concatenate(samples_y)
print(dataset_x.shape,dataset_y.shape)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Min-max scale to 0~1. The targets are scaled with the inputs' min and max
# so both share one scale.
max_value = np.max(dataset_x)
min_value = np.min(dataset_x)
dataset_x = (dataset_x - min_value) / (max_value - min_value)
dataset_y = (dataset_y - min_value) / (max_value - min_value)
print(max_value,min_value)
# Positional 80/20 split into training and evaluation sets.
train_size = int(len(dataset_x)*0.8)
train_x = dataset_x[:train_size]
train_y = dataset_y[:train_size]
eval_x = dataset_x[train_size:]
eval_y = dataset_y[train_size:]
# Reshape for the RNN, which reads (seq_size, batch_size, feature_size).
# Inputs carry 27 features and targets carry 3.
train_x = train_x.reshape(-1, 1, 27)
train_y = train_y.reshape(-1, 1, 3)
eval_x = eval_x.reshape(-1, 1, 27)
eval_y = eval_y.reshape(-1, 1, 3)  # targets have 3 values, not 27
# Convert to float32 torch tensors on the selected device.
train_x = torch.from_numpy(train_x).to(device).type(torch.float32)
train_y = torch.from_numpy(train_y).to(device).type(torch.float32)
eval_x = torch.from_numpy(eval_x).to(device).type(torch.float32)
# TensorDataset needs tensors, so the evaluation targets are converted too.
eval_y = torch.from_numpy(eval_y).to(device).type(torch.float32)
train_ds = TensorDataset(train_x,train_y)
train_dl = DataLoader(train_ds,batch_size=32,drop_last=True)
eval_ds = TensorDataset(eval_x,eval_y)
eval_dl = DataLoader(eval_ds,batch_size=64,drop_last=True)
# Build the model: 27 input features, 16 hidden units, 3 outputs, 2 LSTM layers.
model = LSTM(27, 16, output_size=3, num_layers=2).to(device)
train_loss = []  # per-step training loss
eval_loss = []   # per-epoch mean evaluation loss
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
min_loss = 1     # lowest mean evaluation loss seen so far
for i in range(200):
    # The evaluation pass below switches to eval mode, so switch back at the
    # start of every epoch.
    model.train()
    for j,(x,y) in enumerate(train_dl):
        x,y = x.to(device),y.to(device)
        out = model(x)
        # Align the target with the model output before computing the loss.
        # Otherwise mismatched shapes are broadcast against each other and
        # the loss is taken over the wrong element pairs.
        loss = loss_function(out, y.reshape(out.shape))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_loss.append(loss.item())
        if (j + 1) % 50 == 0:
            print(f'epoch {i+1}/200 step {j+1}/{len(train_dl)} loss:{loss}' )
    # Evaluation pass: no gradient tracking, and each batch is scored on its
    # own inputs and targets.
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for k,(x,y) in enumerate(eval_dl):
            x,y = x.to(device),y.to(device)
            pred = model(x)
            batch_losses.append(loss_function(pred, y.reshape(pred.shape)).item())
    # eval_dl may yield no batches (drop_last=True with a small set).
    if batch_losses:
        epoch_eval_loss = sum(batch_losses) / len(batch_losses)
        eval_loss.append(epoch_eval_loss)
        min_loss = min(min_loss, epoch_eval_loss)
# Save the final model weights.
torch.save(model.state_dict(),'dy3.pth')
# Final check: predict over the whole dataset and plot it against the targets.
model = model.eval()
dataset_x = dataset_x.reshape(-1, 1, 27) # (seq_size, batch_size, feature_size)
dataset_x = torch.from_numpy(dataset_x).to(device).type(torch.float32)
# This covers the full dataset (training part followed by evaluation part).
# Inference needs no gradients, so skip building the autograd graph.
with torch.no_grad():
    pred_test = model(dataset_x)
# Flatten to a 1-D numpy array for plotting.
pred_test = pred_test.reshape(-1).cpu().detach().numpy()
plt.plot(pred_test.reshape(-1), 'r', label='prediction')
plt.plot(dataset_y.reshape(-1), 'b', label='real')
# Divider: training data to the left, evaluation data to the right. Each
# sample contributes 3 points, hence train_size*3.
plt.plot((train_size*3, train_size*3), (0, 1), 'g--')
plt.legend(loc='best')
plt.show()

Loading…
Cancel
Save