Delete duplicate dataset

main
get committed 1 year ago
parent b782ac9193
commit 6308d6d555

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="C:\anaconda\envs\pytorch" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="pytorch_gpu" project-jdk-type="Python SDK" />
</project>

@@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="C:\anaconda\envs\pytorch" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="pytorch_gpu" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

@@ -1,4 +1,3 @@
- import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
@@ -7,6 +6,8 @@ from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
+ train_step = 10
class LSTM(nn.Module):
def __init__(self, input_size, hidden_size, output_size, num_layers):
super().__init__()
@@ -14,13 +15,13 @@ class LSTM(nn.Module):
self.fc1 = nn.Linear(hidden_size, 128)
self.fc2 = nn.Linear(128, output_size)
self.ReLu = nn.ReLU()
- self.dropout = nn.Dropout(0.5)
+ self.dropout = nn.Dropout(0.8)
def forward(self, x):
x, _ = self.lstm(x)
s, b, h = x.shape
x = x.reshape(-1, h)
- output = self.ReLU(self.dropout(self.fc1(x)))
+ output = self.ReLu(self.dropout(self.fc1(x)))
output = self.fc2(output)
return output
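For reference, a quick shape walk-through of the forward pass above — a minimal sketch, assuming the (seq_len, batch, input_size) layout and the hidden_size=64, num_layers=2 configuration the training code below uses:

# Minimal sketch: shape flow through the LSTM layer above.
import torch
from torch import nn

lstm = nn.LSTM(input_size=10, hidden_size=64, num_layers=2)
x = torch.randn(4, 1, 10)    # (seq_len=4, batch=1, input_size=10)
out, _ = lstm(x)             # (seq_len, batch, hidden_size) = (4, 1, 64)
s, b, h = out.shape
flat = out.reshape(-1, h)    # (4, 64), then fc1 -> dropout -> ReLU -> fc2
print(out.shape, flat.shape)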
@@ -49,8 +50,7 @@ def normal(data):
#
# df.to_csv('11市行业数据(已处理异常).csv',index=False,encoding='GBK')
df = pd.read_csv('11市行业数据(已处理异常).csv', encoding='gbk')
print(sum(df.isnull().sum()))
print(df.describe())
# Normalize each industry column of df
column_params = {}
for column in df.columns[2:]:
@@ -63,12 +63,11 @@
print(column_params)
print(df.head())
- def create_dataset(data, days_for_train=10) -> (np.array, np.array):
+ def create_dataset(data, train_step=train_step) -> (np.array, np.array):
dataset_x, dataset_y = [], []
- for i in range(len(data) - days_for_train - 3):
- dataset_x.append(data[i:(i + days_for_train)])
- dataset_y.append(data[i + days_for_train:i + days_for_train + 3])
+ for i in range(len(data) - train_step - 3):
+ dataset_x.append(data[i:(i + train_step)])
+ dataset_y.append(data[i + train_step:i + train_step + 3])
return (np.array(dataset_x), np.array(dataset_y))
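A quick sanity check of the windowing above — a toy sketch using the create_dataset defined in this diff; with train_step=10 and a 3-step target, a series of length n yields n-13 windows:

# Toy check: each x window holds train_step points, each y the next 3.
import numpy as np

data = np.arange(20)
xs, ys = create_dataset(data, train_step=10)
print(xs.shape, ys.shape)    # (7, 10) (7, 3)
print(xs[0], ys[0])          # [0 1 ... 9] [10 11 12]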
@@ -76,75 +75,100 @@ def create_dataset(data, days_for_train=10) -> (np.array, np.array):
# Split the x/y datasets with a window step of 10; the smallest unit is a single industry in a single city.
# Split the first industry first, then merge all cities.
- industry = df.columns[2:][0]
- city = df['地市'].drop_duplicates()[0]
- df_city_industry = df[df['地市'] == city][industry]
- dataset_x, dataset_y = create_dataset(df_city_industry)
- for city in df['地市'].drop_duplicates()[1:]:
- df_city_industry = df[df['地市'] == city][industry]
- x, y = create_dataset(df_city_industry)
- dataset_x, dataset_y = np.concatenate([dataset_x, x]), np.concatenate([dataset_y, y])
- for industry in df.columns[2:][1:]:
- for city in df['地市'].drop_duplicates():
- df_city_industry = df[df['地市'] == city][industry]
- x, y = create_dataset(df_city_industry)
- dataset_x, dataset_y = np.concatenate([dataset_x, x]), np.concatenate([dataset_y, y])
+ # industry = df.columns[2:][0]
+ # city = df['地市'].drop_duplicates()[0]
+ # df_city_industry = df[df['地市'] == city][industry]
+ # dataset_x, dataset_y = create_dataset(df_city_industry)
+ #
+ # for city in df['地市'].drop_duplicates()[1:]:
+ # df_city_industry = df[df['地市'] == city][industry]
+ # x, y = create_dataset(df_city_industry)
+ # dataset_x, dataset_y = np.concatenate([dataset_x, x]), np.concatenate([dataset_y, y])
+ #
+ # for industry in df.columns[2:][1:]:
+ # for city in df['地市'].drop_duplicates():
+ # df_city_industry = df[df['地市'] == city][industry]
+ # x, y = create_dataset(df_city_industry)
+ # dataset_x, dataset_y = np.concatenate([dataset_x, x]), np.concatenate([dataset_y, y])
+ #
+ # print(dataset_x.shape, dataset_y.shape)
+ # df_x = pd.DataFrame(dataset_x)
+ # df_y = pd.DataFrame(dataset_y)
+ # df_x.to_csv('df_x_100.csv',index=False)
+ # df_y.to_csv('df_y_100.csv',index=False)
+ dataset_x = pd.read_csv('df_x.csv').values
+ dataset_y = pd.read_csv('df_y.csv').values
print(dataset_x.shape, dataset_y.shape)
train_size = int(0.7 * len(dataset_x))
- x_train, y_train = dataset_x[:train_size].reshape(-1,1,10), dataset_y[:train_size].reshape(-1, 1, 3)
- x_eval, y_eval = dataset_x[train_size:].reshape(-1,1,10), dataset_y[train_size:].reshape(-1, 1, 3)
+ x_train, y_train = dataset_x[:train_size].reshape(-1,1,train_step), dataset_y[:train_size].reshape(-1, 1, 3)
+ x_eval, y_eval = dataset_x[train_size:].reshape(-1,1,train_step), dataset_y[train_size:].reshape(-1, 1, 3)
x_train, y_train = torch.from_numpy(x_train).type(torch.float32), torch.from_numpy(y_train).type(torch.float32)
x_eval, y_eval = torch.from_numpy(x_eval).type(torch.float32), torch.from_numpy(y_eval).type(torch.float32)
ds = TensorDataset(x_train, y_train)
- dl = DataLoader(ds, batch_size=128, shuffle=True, drop_last=True)
+ dl = DataLoader(ds, batch_size=32, drop_last=True)
eval_ds = TensorDataset(x_eval, y_eval)
- eval_dl = DataLoader(eval_ds, batch_size=256, drop_last=True)
+ eval_dl = DataLoader(eval_ds, batch_size=64, drop_last=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
- model = LSTM(10,64, 3, num_layers=2).to(device)
+ model = LSTM(train_step,64, 3, num_layers=2).to(device)
loss_fn = nn.MSELoss()
- optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
+ optimizer = torch.optim.Adam(model.parameters(), lr=0.00005, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
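The betas, eps, and weight_decay passed here are PyTorch's Adam defaults, so the effective change is only the learning rate; the shorter form below is equivalent:

# Equivalent: only lr differs from torch.optim.Adam's defaults.
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)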
min_loss = 1
- for epoch in range(10):
- for step, (x, y) in enumerate(dl):
- x, y = x.to(device), y.to(device)
- pred = model(x)
- loss = loss_fn(pred,y)
- optimizer.zero_grad()
+ for i in range(500):
+ x_train, y_train = x_train.to(device), y_train.to(device)
+ out = model(x_train)
+ loss = loss_fn(out, y_train)
loss.backward()
optimizer.step()
optimizer.zero_grad()
- if (step+1) % 1000 == 0:
- print(f'epoch{epoch+1}: train_step{step}/{len(dl)} train_loss:{round(loss)}\n')
- model.eval()
- batch_loss = 0
- with torch.no_grad():
- for x,y in eval_dl:
- x, y = x.to(device), y.to(device)
- pred = model(x)
- loss = loss_fn(pred, y)
- batch_loss += loss
- print(f'epoch{epoch+1}: eval_loss:{batch_loss/len(eval_dl)}')
- if batch_loss/len(eval_dl) < min_loss:
- min_loss = batch_loss/len(eval_dl)
- best_parameters = model.state_dict()
+ if loss <= min_loss:
+ min_loss = loss
+ best_para = model.state_dict()
+ if i % 100 == 0:
+ print(f'epoch {i+1}: loss:{loss}')
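One caveat with the best_para snapshot above: model.state_dict() returns references to the live parameter tensors, so later optimizer steps silently overwrite the saved "best" weights. A common fix is a deep copy — a sketch:

# Sketch: snapshot the best weights by value, not by reference.
import copy

if loss.item() <= min_loss:     # .item() also avoids holding the graph
    min_loss = loss.item()
    best_para = copy.deepcopy(model.state_dict())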
+ # for epoch in range(3):
+ # model.train()
+ # for step, (x, y) in enumerate(dl):
+ # x, y = x.to(device), y.to(device)
+ # pred = model(x)
+ # loss = loss_fn(pred,y)
+ # optimizer.zero_grad()
+ # loss.backward()
+ # optimizer.step()
+ #
+ # if step % 1000 == 0:
+ # print(f'epoch{epoch+1}: train_step:{step}/{len(dl)} train_loss:{loss}\n')
+ #
+ # model.eval()
+ # batch_loss = 0
+ # with torch.no_grad():
+ # for x,y in eval_dl:
+ # x, y = x.to(device), y.to(device)
+ # pred = model(x)
+ # loss = loss_fn(pred, y)
+ # batch_loss += loss
+ # print(f'epoch{epoch+1}: eval_loss:{batch_loss/len(eval_dl)}\n')
+ #
+ # if batch_loss/len(eval_dl) < min_loss:
+ # min_loss = batch_loss/len(eval_dl)
+ # best_parameters = model.state_dict()
torch.save(best_para,'best_3.pth')
- model = LSTM(10,64, 3, num_layers=2).to(device)
+ model = LSTM(train_step,64, 3, num_layers=2).to(device)
model.load_state_dict(torch.load('best_3.pth'))
- dataset_x = dataset_x.reshape(-1,1,10)
+ params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ print("Total LSTM parameters:", params)
+ dataset_x = dataset_x.reshape(-1,1,train_step)
dataset_x = torch.from_numpy(dataset_x).type(torch.float32).to(device)
pred = model(dataset_x).reshape(-1)
- pred = np.concatenate((np.zeros(10), pred.cpu().detach().numpy()))
+ pred = np.concatenate((np.zeros(train_step), pred.cpu().detach().numpy()))
plt.plot(pred, 'r', label='prediction')

@@ -0,0 +1,128 @@
import pandas as pd
import datetime
import os
import numpy as np
import torch
from torch import nn
def normalise(dataset_x,dataset_y):
max_value = np.max(dataset_x)
min_value = np.min(dataset_x)
dataset_x = (dataset_x - min_value) / (max_value - min_value)
dataset_y = (dataset_y - min_value) / (max_value - min_value)
return dataset_x,dataset_y,max_value,min_value
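normalise scales both arrays by dataset_x's extrema and returns max_value/min_value so predictions can later be mapped back; note the calls below keep only the first two return values. A round-trip sketch with toy numbers:

# Sketch: round trip of the min-max scaling in normalise above.
import numpy as np

raw = np.array([2.0, 5.0, 8.0])
x_n, y_n, mx, mn = normalise(raw, raw)
print(x_n)                                     # [0.  0.5 1. ]
print(np.allclose(x_n * (mx - mn) + mn, raw))  # True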
def create_data(df_industry, industry):
dataset_x = []
dataset_y = []
for i in df_industry['地市'].drop_duplicates():
data = df_industry[df_industry['地市'] == i]
grouped = data.groupby(data['stat_date'].dt.to_period('M'))
# Iterate over each month's data
for name, group in grouped:
if len(group) == 31:
dataset_x.append(list(group[industry].values[1:28]))
dataset_y.append(list(group[industry].values[-3:]))
elif len(group) == 30:
dataset_x.append(list(group[industry].values[:27]))
dataset_y.append(list(group[industry].values[-3:]))
elif len(group) == 28:
fst = group[industry].values[0]
dataset_x.append([fst, fst, fst] + list(group[industry].values[1:25]))
dataset_y.append(list(group[industry].values[-3:]))
else:
# 29-day months: pad the first value twice to reach 27 inputs
fst = group[industry].values[0]
if len([fst, fst] + list(group[industry].values[1:26])) != 27:
break
dataset_x.append([fst, fst] + list(group[industry].values[1:26]))
dataset_y.append(list(group[industry].values[-3:]))
return np.array(dataset_x), np.array(dataset_y)
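create_data reduces every month to 27 input points plus a 3-day target: 31-day months use days 2-28, 30-day months the first 27 days, and 28/29-day months pad the first value. A toy sketch of the monthly grouping (one city, one 31-day month; the 'demo' column is hypothetical):

# Toy sketch of the per-month windowing used by create_data above.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    '地市': ['A'] * 31,
    'stat_date': pd.date_range('2023-01-01', periods=31, freq='D'),
    'demo': np.arange(31.0),   # hypothetical load values
})
for name, group in toy.groupby(toy['stat_date'].dt.to_period('M')):
    x = list(group['demo'].values[1:28])   # 27 inputs (days 2-28)
    y = list(group['demo'].values[-3:])    # last 3 days as the target
    print(name, len(x), y)                 # 2023-01 27 [28.0, 29.0, 30.0]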
df = pd.read_csv('合并行业数据.csv')
df[df.columns[2:]] /= 10000
df['stat_date'] = df['stat_date'].map(lambda x:str(x).strip()[:10])
df['stat_date'] = pd.to_datetime(df['stat_date'],format='%Y-%m-%d')
list_1 = ['1.煤炭开采和洗选业']
list_2 = ['2.石油和天然气开采业', '3.黑色金属矿采选业']
list_3 = ['2.林业', '4.有色金属矿采选业', '4.烟草制品业', '31.金属制品、机械和设备修理业', '2.燃气生产和供应业', '3.建筑安装业', '4.航空运输业', '5.管道运输业', '6.多式联运和运输代理业', '8.邮政业']
list_4 = ['第一产业', '一、农、林、牧、渔业', '1.农业', '3.畜牧业', '4.渔业', '5.农、林、牧、渔专业及辅助性活动', '(一)采矿业', '5.非金属矿采选业', '6.其他采矿业', '1.农副食品加工业', '2.食品制造业', '3.酒、饮料及精制茶制造业', '8.木材加工和木、竹、藤、棕、草制品业', '9.家具制造业', '11.印刷和记录媒介复制业', '25.铁路、船舶、航空航天和其他运输设备制造业', '28.仪器仪表制造业', '30.废弃资源综合利用业', '1.房屋建筑业', '4.建筑装饰、装修和其他建筑业', '1.铁路运输业', '2.道路运输业', '3.水上运输业', '7.装卸搬运和仓储业', '2.互联网和相关服务', '3.软件和信息技术服务业', '八、金融业', '1.科学研究和技术服务业', '3.居民服务、修理和其他服务业']
list_5 = ['城镇居民', '6.纺织服装、服饰业', '7.皮革、毛皮、羽毛及其制品和制鞋业', '10.造纸和纸制品业', '12.文教、工美、体育和娱乐用品制造业', '13.石油、煤炭及其他燃料加工业', '14.化学原料和化学制品制造业', '15.医药制造业', '16.化学纤维制造业', '17.橡胶和塑料制品业', '18.非金属矿物制品业', '19.黑色金属冶炼和压延加工业', '20.有色金属冶炼和压延加工业', '21.金属制品业', '22.通用设备制造业', '23.专用设备制造业', '24.汽车制造业', '26.电气机械和器材制造业', '27.计算机、通信和其他电子设备制造业', '29.其他制造业', '(三)电力、热力、燃气及水的生产和供应业', '1.电力、热力生产和供应业', '3.水的生产和供应业', '三、建筑业', '2.土木工程建筑业', '四、交通运输、仓储和邮政业', '五、信息传输、软件和信息技术服务业', '1.电信、广播电视和卫星传输服务', '六、批发和零售业', '七、住宿和餐饮业', '九、房地产业', '十、租赁和商务服务业', '十一、公共服务及管理组织', '2.水利、环境和公共设施管理业', '4.教育、文化、体育和娱乐业', '5.卫生和社会工作', '6.公共管理和社会组织、国际组织']
list_6 = ['全社会用电总计', 'a、全行业用电合计', '第二产业', '第三产业', 'b、城乡居民生活用电合计', '乡村居民', '二、工业', '(二)制造业', '5.纺织业']
# list_1
industry = '1.煤炭开采和洗选业'
df_industry = df[['地市', 'stat_date', industry]]
dataset_x1,dataset_y1 = create_data(df_industry,industry)
print('list_1:',np.max(dataset_x1),np.min(dataset_x1))
dataset_x1,dataset_y1 = normalise(dataset_x1,dataset_y1)[:2]
# list_2
industry = '2.石油和天然气开采业'
df_industry = df[['地市', 'stat_date', industry]]
dataset_x2,dataset_y2 = create_data(df_industry,industry)
for i in list_2[1:]:
df_industry = df[['地市', 'stat_date', i]]
x, y = create_data(df_industry, i)
dataset_x2 = np.concatenate([dataset_x2, x])
dataset_y2 = np.concatenate([dataset_y2, y])
print('list_2:',np.max(dataset_x2),np.min(dataset_x2))
dataset_x2,dataset_y2 = normalise(dataset_x2,dataset_y2)[:2]
# list_3
industry = '2.林业'
df_industry = df[['地市', 'stat_date', industry]]
dataset_x3,dataset_y3 = create_data(df_industry,industry)
for i in list_3[1:]:
df_industry = df[['地市', 'stat_date', i]]
x, y = create_data(df_industry, i)
dataset_x3 = np.concatenate([dataset_x3, x])
dataset_y3 = np.concatenate([dataset_y3, y])
print('list_3:',np.max(dataset_x3),np.min(dataset_x3))
dataset_x3,dataset_y3 = normalise(dataset_x3,dataset_y3)[:2]
# list_4
industry = list_4[0]
df_industry = df[['地市', 'stat_date', industry]]
dataset_x4,dataset_y4 = create_data(df_industry,industry)
for i in list_4[1:]:
df_industry = df[['地市', 'stat_date', i]]
x, y = create_data(df_industry, i)
dataset_x4 = np.concatenate([dataset_x4, x])
dataset_y4 = np.concatenate([dataset_y4, y])
print('list_4:',np.max(dataset_x4),np.min(dataset_x4))
dataset_x4,dataset_y4 = normalise(dataset_x4,dataset_y4)[:2]
# list_5
industry = list_5[0]
df_industry = df[['地市', 'stat_date', industry]]
dataset_x5,dataset_y5 = create_data(df_industry,industry)
for i in list_5[1:]:
df_industry = df[['地市', 'stat_date', i]]
x, y = create_data(df_industry, i)
dataset_x5 = np.concatenate([dataset_x5, x])
dataset_y5 = np.concatenate([dataset_y5, y])
print('list_5:',np.max(dataset_x5),np.min(dataset_x5))
dataset_x5,dataset_y5 = normalise(dataset_x5,dataset_y5)[:2]
# list_6
industry = list_6[0]
df_industry = df[['地市', 'stat_date', industry]]
dataset_x6,dataset_y6 = create_data(df_industry,industry)
for i in list_6[1:]:
df_industry = df[['地市', 'stat_date', i]]
x, y = create_data(df_industry, i)
dataset_x6 = np.concatenate([dataset_x6, x])
dataset_y6 = np.concatenate([dataset_y6, y])
print('list_6:',np.max(dataset_x6),np.min(dataset_x6))
dataset_x6,dataset_y6 = normalise(dataset_x6,dataset_y6)[:2]
dataset_x = np.concatenate([dataset_x1,dataset_x2,dataset_x3,dataset_x4,dataset_x5,dataset_x6])
dataset_y = np.concatenate([dataset_y1,dataset_y2,dataset_y3,dataset_y4,dataset_y5,dataset_y6])
print(dataset_y.shape,dataset_x.shape)

@@ -4,6 +4,7 @@ import torch
from torch import nn
import os
import time
+ import matplotlib.pyplot as plt
t1 = time.time()
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
@@ -50,7 +51,6 @@ def data_preprocessing(data):
data = data.astype(float)
for col in data.columns:
data[col] = normal(data[col])
return data
# Concatenate the datasets
@@ -86,10 +86,16 @@ for excel in os.listdir(file_dir)[1:]:
dataset_y = np.concatenate((dataset_y,y))
df_x_10 = pd.DataFrame(dataset_x)
df_y_10 = pd.DataFrame(dataset_y)
df_x_10.to_csv('df_x_10.csv',index=False)
df_y_10.to_csv('df_y_10.csv',index=False)
dataset_x = pd.read_csv('df_x_10.csv').values
dataset_y = pd.read_csv('df_y_10.csv').values
print(dataset_x.shape,dataset_y.shape)
# # Training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#
# Normalize to the 0~1 range
max_value = np.max(dataset_x)
min_value = np.min(dataset_x)
@@ -108,47 +114,52 @@ train_y = train_y.reshape(-1, 1, 3)
# # Convert to PyTorch tensor objects
train_x = torch.from_numpy(train_x).to(device).type(torch.float32)
train_y = torch.from_numpy(train_y).to(device).type(torch.float32)
print('=====================================',train_x.shape)
model = LSTM_Regression(DAYS_FOR_TRAIN, 32, output_size=3, num_layers=2).to(device) # instantiate the model: input/output sizes, hidden size, number of layers
#
train_loss = []
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
- for i in range(1500):
+ min_loss = 1
+ for i in range(500):
train_x,train_y = train_x.to(device),train_y.to(device)
out = model(train_x)
print(out.shape)
loss = loss_function(out, train_y)
loss.backward()
optimizer.step()
optimizer.zero_grad()
train_loss.append(loss.item())
+ if loss <= min_loss:
+ min_loss = loss
+ best_para = model.state_dict()
+ if i % 100 == 0:
+ print(f'epoch {i+1}: loss:{loss}')
- #
- # # Save/load the model
- # torch.save(model.state_dict(),'hy5.pth')
- # model.load_state_dict(torch.load('hy5.pth'))
- # # for test
- # model = model.eval() # switch to evaluation mode
- # # model.load_state_dict(torch.load(os.path.join(model_save_dir,model_file))) # load parameters
- # dataset_x = dataset_x.reshape(-1, 1, DAYS_FOR_TRAIN) # (seq_size, batch_size, feature_size)
- # dataset_x = torch.from_numpy(dataset_x).to(device).type(torch.float32)
- #
- # pred_test = model(dataset_x) # full training set
- # # Model output: (seq_size, batch_size, output_size)
- # pred_test = pred_test.view(-1)
- # pred_test = np.concatenate((np.zeros(DAYS_FOR_TRAIN), pred_test.cpu().detach().numpy()))
- # plt.plot(pred_test.reshape(-1), 'r', label='prediction')
- # plt.plot(dataset_y.reshape(-1), 'b', label='real')
- # plt.plot((train_size*5, train_size*5), (0, 1), 'g--') # divider: training data left, test output right
- # plt.legend(loc='best')
- # plt.show()
- model.load_state_dict(torch.load('hy3.pth',map_location=torch.device('cpu')))
- max_value = 354024930.8
- min_value = 0.0
+ # Save/load the model
+ torch.save(best_para,'hy3.pth')
+ model = LSTM_Regression(DAYS_FOR_TRAIN, 32, output_size=3, num_layers=2).to(device)
+ model.load_state_dict(torch.load('hy3.pth'))
+ # Test
+ model = model.eval()
+ dataset_x = dataset_x.reshape(-1, 1, DAYS_FOR_TRAIN) # (seq_size, batch_size, feature_size)
+ dataset_x = torch.from_numpy(dataset_x).to(device).type(torch.float32)
+ pred_test = model(dataset_x) # full training set
+ pred_test = pred_test.view(-1)
+ pred_test = np.concatenate((np.zeros(DAYS_FOR_TRAIN), pred_test.cpu().detach().numpy()))
+ plt.plot(pred_test.reshape(-1), 'r', label='prediction')
+ plt.plot(dataset_y.reshape(-1), 'b', label='real')
+ plt.plot((train_size*3, train_size*3), (0, 1), 'g--')
+ plt.legend(loc='best')
+ plt.show()
+ # model.load_state_dict(torch.load('hy3.pth',map_location=torch.device('cpu')))
+ # max_value = 354024930.8
+ # min_value = 0.0
# Create the test set
file_dir = './浙江各地市行业电量数据'
@@ -176,22 +187,22 @@ print(time.time()-t1)
print(result_dict)
# Inverse normalization
- # pred = pred * (max_value - min_value) + min_value
- # df = df * (max_value - min_value) + min_value
- # # Print metrics
- # print(abs(pred - df[-3:]).mean() / df[-3:].mean())
- # result_eight = pd.DataFrame({'pred': np.round(pred,1),'real': df[-3:]})
- # target = (result_eight['pred'].sum() - result_eight['real'].sum()) / df[-31:].sum()
- # result_eight['loss_rate'] = round(target, 5)
- # result_eight['level'] = level
- # list_app.append(result_eight)
- # print(target)
- # print(result_eight)
- # final_df = pd.concat(list_app,ignore_index=True)
- # final_df.to_csv('市行业电量.csv',encoding='gbk')
- # print(final_df)
+ pred = pred * (max_value - min_value) + min_value
+ df = df * (max_value - min_value) + min_value
+ # Print metrics
+ print(abs(pred - df[-3:]).mean() / df[-3:].mean())
+ result_eight = pd.DataFrame({'pred': np.round(pred,1),'real': df[-3:]})
+ target = (result_eight['pred'].sum() - result_eight['real'].sum()) / df[-31:].sum()
+ result_eight['loss_rate'] = round(target, 5)
+ result_eight['level'] = level
+ list_app.append(result_eight)
+ print(target)
+ print(result_eight)
+ final_df = pd.concat(list_app,ignore_index=True)
+ final_df.to_csv('市行业电量.csv',encoding='gbk')
+ print(final_df)
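The loss_rate computed above is the signed gap between the predicted and actual 3-day totals, relative to the trailing month's total consumption. A toy computation with hypothetical numbers:

# Toy computation of the loss_rate metric above.
import numpy as np

pred = np.array([105.0, 98.0, 110.0])    # predicted last 3 days
real = np.array([100.0, 100.0, 100.0])   # actual last 3 days
month_total = 3100.0                     # sum of the last 31 days
print(round((pred.sum() - real.sum()) / month_total, 5))   # 0.00419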
