Output prediction results

main
鸽子 1 year ago
parent b1915bb35c
commit 1064682854

@@ -2,8 +2,50 @@ from prophet import Prophet
import pandas as pd
import os
import datetime
import numpy as np
def normal(data):
    high = data.describe()['75%'] + 1.5 * (data.describe()['75%'] - data.describe()['25%'])
    low = data.describe()['25%'] - 1.5 * (data.describe()['75%'] - data.describe()['25%'])
    return data[(data <= high) & (data >= low)]
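# Illustrative note (not part of the original script): normal() keeps only the
# values inside the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR], e.g.
# normal(pd.Series([1, 2, 3, 4, 1000]))  # drops 1000, keeps 1-4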
file_dir = './浙江各地市行业电量数据'
city = os.listdir(file_dir)[0]
df_city = pd.read_excel(os.path.join(file_dir,city))
print(df_city.columns)
for city in os.listdir(file_dir):
    df_city = pd.read_excel(os.path.join(file_dir, city))
    df_city['stat_date'] = df_city['stat_date'].map(lambda x: str(x).strip()[:10])
    df_city['stat_date'] = pd.to_datetime(df_city['stat_date'])
    list_goal = []
    list_industry = []
    for industry in df_city.columns[2:]:
        s1 = df_city[['stat_date', industry]]
        s1 = s1[(s1['stat_date'] >= '2022-09-30') & (s1['stat_date'] <= '2023-10-31')]
        s1 = s1.loc[normal(s1[industry]).index]
        s1.rename(columns={'stat_date': 'ds', industry: 'y'}, inplace=True)
        df_train = s1[(s1['ds'] >= '2022-08-31') & (s1['ds'] <= '2023-10-31')].sort_values(by='ds')
        df_test = s1[(s1['ds'] >= '2022-08-31') & (s1['ds'] <= '2023-10-31')].sort_values(by='ds')
        # Fit a Prophet model per industry (with CN holidays) and predict the next 3 days
        model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True)
        model.add_country_holidays(country_name="CN")
        model.fit(df_train)
        future = model.make_future_dataframe(periods=3, freq='D')
        predict = model.predict(future)
        predict = predict[['ds', 'yhat']].set_index('ds')
        print(city, industry)
        print(predict.loc['2023-10'])
        # df = predict.join(s1.set_index('ds')).loc['2023-8']
        # df['偏差率'] = (df['y'] - df['yhat']) / df['y']
        # df['goal'] = (df['y'] - df['yhat'])[-3:].sum() / df['y'].sum()
        # list_goal.append((df['y'] - df['yhat'])[-3:].sum() / df['y'].sum())
        # list_industry.append(industry)
    # df = pd.DataFrame({'industry': list_industry, 'goal': list_goal})
    # df.to_csv(fr'C:\Users\鸽子\Desktop\行业8月偏差\{city[:2]}_goal.csv', index=False, encoding='gbk')
    #
    # with open(r'C:\Users\鸽子\Desktop\goal_8.txt', 'a') as f:
    #     f.write(f'{city[:2]}\n')
    #     df['goal'].value_counts(bins=[-np.inf, -0.05, -0.01, -0.005, 0, 0.005, 0.01, 0.02, 0.05, np.inf], sort=False).to_csv(f, header=False, sep='\t')

@@ -53,30 +53,62 @@ def normal(df):
        pass
    return df
file_dir = './浙江各地市行业电量数据'
city1 = os.listdir(file_dir)[0]
df_city = pd.read_excel(os.path.join(file_dir, city1))
df_city = normal(df_city)
df_city = df_city.drop(columns='地市')
df_city[df_city.columns[1:]] /= 10000
df_city['stat_date'] = df_city['stat_date'].map(lambda x: str(x).strip()[:10])
df_city.stat_date = pd.to_datetime(df_city.stat_date)
print(df_city.describe())
# file_dir = './浙江各地市行业电量数据'
# city1 = os.listdir(file_dir)[0]
# df_city = pd.read_excel(os.path.join(file_dir, city1))
# df_city = normal(df_city)
# df_city = df_city.drop(columns='地市')
# df_city[df_city.columns[1:]] /= 10000
# df_city['stat_date'] = df_city['stat_date'].map(lambda x: str(x).strip()[:10])
# df_city.stat_date = pd.to_datetime(df_city.stat_date)
# print(df_city.describe())
#
# list_1000 = []
# list_100 = []
# list_10 = []
# list_1 = []
# for i in df_city.columns[1:]:
#     if df_city[i].describe()['mean'] >= 1000:
#         list_1000.append(i)
#     if df_city[i].describe()['mean'] < 1000 and df_city[i].describe()['mean'] >= 100:
#         list_100.append(i)
#     if df_city[i].describe()['mean'] < 100 and df_city[i].describe()['mean'] >= 10:
#         list_10.append(i)
#     else:
#         list_1.append(i)
# print('list_1:',list_1)
# print('list_10:',list_10)
# print('list_100:',list_100)
# print('list_1000:',list_1000)
import pandas as pd
# Create a simple DataFrame
data = pd.DataFrame({'A': [1, 2, 3000, 4, 500],
                     'B': [10, 20, 30, 40, 50]})
Q1 = data['A'].quantile(0.25)
Q3 = data['A'].quantile(0.75)
IQR = Q3 - Q1
lower_threshold = Q1 - 1.5 * IQR
upper_threshold = Q3 + 1.5 * IQR
# Flag values outside the 1.5 * IQR fences as outliers
outliers = (data['A'] < lower_threshold) | (data['A'] > upper_threshold)
print(outliers)
# Shift the column down by one row (each row then holds the previous row's value)
print(data['A'].shift(1))
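# One way to combine the two ideas above (illustrative sketch, not in the
# original commit): replace the flagged outliers with the previous row's value.
cleaned = data['A'].where(~outliers, data['A'].shift(1))
print(cleaned)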
# Replace outliers with a neighbouring value
data = {'A': [1, 2, 3, 4, 5],
        'B': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)
# Keep elements where the condition holds; replace the rest with -1
condition = df['A'] > 3
df_new = df.where(condition, other=-1)
print("Original data:")
print(df)
list_1000 = []
list_100 = []
list_10 = []
list_1 = []
for i in df_city.columns[1:]:
    if df_city[i].describe()['mean'] >= 1000:
        list_1000.append(i)
    if df_city[i].describe()['mean'] < 1000 and df_city[i].describe()['mean'] >= 100:
        list_100.append(i)
    if df_city[i].describe()['mean'] < 100 and df_city[i].describe()['mean'] >= 10:
        list_10.append(i)
    else:
        list_1.append(i)
print('list_1:',list_1)
print('list_10:',list_10)
print('list_100:',list_100)
print('list_1000:',list_1000)
print("\nData after conditional replacement:")
print(df_new)
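# Note (illustrative, not part of the original commit): in the magnitude-binning
# loop above, the final `else` pairs only with the last `if`, so a column whose
# mean is >= 100 is appended to list_1 as well. An elif chain avoids that:
# for i in df_city.columns[1:]:
#     m = df_city[i].mean()
#     if m >= 1000:
#         list_1000.append(i)
#     elif m >= 100:
#         list_100.append(i)
#     elif m >= 10:
#         list_10.append(i)
#     else:
#         list_1.append(i)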

@@ -0,0 +1,76 @@
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
def normal(data):
    # Boolean mask: True where the value lies inside the 1.5 * IQR fences
    high = data.describe()['75%'] + 1.5 * (data.describe()['75%'] - data.describe()['25%'])
    low = data.describe()['25%'] - 1.5 * (data.describe()['75%'] - data.describe()['25%'])
    return (data >= low) & (data <= high)
# file_dir = './浙江各地市行业电量数据'
#
# # Merge the 11 cities
# df = pd.DataFrame({})
# for city in os.listdir(file_dir):
#
#     df_city = pd.read_excel(os.path.join(file_dir, city))
#
#     # For each city, back-fill the outliers in every industry column
#     for industry in df_city.columns[2:]:
#         outliers_index = normal(df_city[industry]).index
#         df_city[industry] = df_city[industry].where(normal(df_city[industry]), other=np.nan).bfill()
#         df_city[industry].fillna(method='ffill', inplace=True)
#     df = pd.concat([df, df_city])
#     print(df.shape)
#
# df.to_csv('11市行业数据(已处理异常).csv', index=False, encoding='GBK')
df = pd.read_csv('11市行业数据(已处理异常).csv', encoding='gbk')
print(sum(df.isnull().sum()))
print(df.describe())
# Min-max normalise every industry column of df, remembering each column's min/max
column_params = {}
for column in df.columns[2:]:
    scaler = MinMaxScaler()
    df[column] = scaler.fit_transform(df[[column]])
    column_params[column] = {'min': scaler.data_min_[0], 'max': scaler.data_max_[0]}
print(column_params)
print(df.head())
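# Illustrative sketch (not part of the original file): the stored column_params
# can later be used to map normalised values or model outputs back to the
# original scale, e.g. for one column:
# params = column_params[df.columns[2]]
# restored = df[df.columns[2]] * (params['max'] - params['min']) + params['min']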
def create_dataset(data, days_for_train=10) -> (np.array, np.array):
    dataset_x, dataset_y = [], []
    for i in range(len(data) - days_for_train - 3):
        dataset_x.append(data[i:(i + days_for_train)])
        dataset_y.append(data[i + days_for_train:i + days_for_train + 3])
    return (np.array(dataset_x), np.array(dataset_y))
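# Quick shape check (illustrative, not in the original file): with a length-20
# series and the default 10-day window this yields 7 samples of 10 inputs and
# 3 targets each:
# x, y = create_dataset(np.arange(20))
# print(x.shape, y.shape)  # (7, 10) (7, 3)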
# Build x/y samples with a 10-day window; the basic unit is a single industry of a single city.
# Start with the first industry and concatenate the samples of all cities.
industry = df.columns[2:][0]
city = df['地市'].drop_duplicates()[0]
df_city_industry = df[df['地市'] == city][industry]
dataset_x, dataset_y = create_dataset(df_city_industry)
for city in df['地市'].drop_duplicates()[1:]:
    df_city_industry = df[df['地市'] == city][industry]
    x, y = create_dataset(df_city_industry)
    dataset_x, dataset_y = np.concatenate([dataset_x, x]), np.concatenate([dataset_y, y])
for industry in df.columns[2:][1:]:
    for city in df['地市'].drop_duplicates():
        df_city_industry = df[df['地市'] == city][industry]
        x, y = create_dataset(df_city_industry)
        dataset_x, dataset_y = np.concatenate([dataset_x, x]), np.concatenate([dataset_y, y])
print(dataset_x.shape, dataset_y.shape)
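# Illustrative sketch (assumptions, not in this file): before training, the LSTM
# script below reshapes such arrays to (samples, 1, window) and (samples, 1, 3);
# the 80/20 split ratio here is only an example.
# train_size = int(len(dataset_x) * 0.8)
# train_x = dataset_x[:train_size].reshape(-1, 1, 10)
# train_y = dataset_y[:train_size].reshape(-1, 1, 3)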

@@ -28,9 +28,9 @@ class LSTM_Regression(nn.Module):
def create_dataset(data, days_for_train=5) -> (np.array, np.array):
    dataset_x, dataset_y = [], []
-    for i in range(len(data) - days_for_train - 5):
+    for i in range(len(data) - days_for_train - 3):
        dataset_x.append(data[i:(i + days_for_train)])
-        dataset_y.append(data[i + days_for_train:i + days_for_train + 5])
+        dataset_y.append(data[i + days_for_train:i + days_for_train + 3])
        # print(dataset_x, dataset_y)
    return (np.array(dataset_x), np.array(dataset_y))
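The body of LSTM_Regression is outside this diff. A minimal sketch that would be consistent with how it is constructed and used below (input_size, hidden_size, output_size, num_layers; input shaped (seq_size, batch_size, feature_size)) might look like this; the layer layout is an assumption, not the repository's actual implementation:

import torch
from torch import nn

class LSTM_Regression(nn.Module):
    def __init__(self, input_size, hidden_size, output_size=1, num_layers=2):
        super().__init__()
        # Stacked LSTM followed by a linear head mapping the hidden state
        # to `output_size` future values (3 days in this commit).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)   # out: (seq_size, batch_size, hidden_size)
        return self.fc(out)     # (seq_size, batch_size, output_size)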
@@ -103,13 +103,13 @@ train_y = dataset_y[:train_size]
# # Reshape the data: the RNN expects input of shape (seq_size, batch_size, feature_size)
train_x = train_x.reshape(-1, 1, DAYS_FOR_TRAIN)
-train_y = train_y.reshape(-1, 1, 5)
+train_y = train_y.reshape(-1, 1, 3)
# # Convert to PyTorch tensor objects
train_x = torch.from_numpy(train_x).to(device).type(torch.float32)
train_y = torch.from_numpy(train_y).to(device).type(torch.float32)
print('=====================================', train_x.shape)
-model = LSTM_Regression(DAYS_FOR_TRAIN, 32, output_size=5, num_layers=2).to(device)  # Instantiate the model and set its input/output and hidden layer sizes
+model = LSTM_Regression(DAYS_FOR_TRAIN, 32, output_size=3, num_layers=2).to(device)  # Instantiate the model and set its input/output and hidden layer sizes
#
train_loss = []
loss_function = nn.MSELoss()
@@ -146,7 +146,7 @@ for i in range(1500):
# plt.legend(loc='best')
# plt.show()
-model.load_state_dict(torch.load('hy5.pth', map_location=torch.device('cpu')))
+model.load_state_dict(torch.load('hy3.pth', map_location=torch.device('cpu')))
max_value = 354024930.8
min_value = 0.0
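# Illustrative sketch (assumption, not shown in this diff): max_value / min_value
# appear to be the bounds used to undo the min-max scaling on the model output,
# e.g. with a hypothetical tensor `test_x` of test windows:
# pred = model(test_x).detach().cpu().numpy()
# pred_original_scale = pred * (max_value - min_value) + min_value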