You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

111 lines
4.9 KiB
Python

import numpy as np
import pandas as pd
from prophet import Prophet
import math
import matplotlib.pyplot as plt
import os
from openpyxl import Workbook
# Set pandas' `display.width` option to None; per the pandas options docs this
# lets pandas auto-detect the terminal width when printing frames (affects the
# print() calls further down in this script).
pd.set_option('display.width',None)
def normal(x: pd.Series, whisker: float = 1.5) -> pd.Series:
    """Return the values of *x* that lie inside the Tukey (IQR) fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR`` where
    ``Q1``/``Q3`` are the 25th/75th percentiles of *x* and ``IQR = Q3 - Q1``.
    Both fences are inclusive.

    Args:
        x: Numeric series to filter.
        whisker: Multiplier applied to the inter-quartile range. The default
            of 1.5 reproduces the previous hard-coded behaviour.

    Returns:
        The subset of *x* within the fences, keeping the original index
        labels. NaN entries are dropped because they compare false against
        both fences.
    """
    # Compute both quartiles in one pass instead of calling describe() four
    # times; describe() reports the same linearly interpolated percentiles.
    q1, q3 = x.quantile([0.25, 0.75])
    spread = whisker * (q3 - q1)
    low, high = q1 - spread, q3 + spread
    return x[(x <= high) & (x >= low)]
# Load the raw table from a machine-specific path.
df = pd.read_csv(r'C:\Users\鸽子\Desktop\浙江各区县数据(2).csv')

# Strip surrounding whitespace from every header so columns can be addressed
# by their exact names below.
df.columns = df.columns.map(str.strip)

# Columns that are not used by the rest of the script.
unused_columns = ['500kv(含330kv)及以上', '220kv', '110kv(含66kv)', '20kv', 'power_sal']
df = df.drop(columns=unused_columns)
print(df.columns)

# Diagnostic: fraction of rows equal to 0 in each remaining column.
row_count = len(df)
zero_share = {col: (df[col] == 0).sum() / row_count for col in df.columns}
print(zero_share)

# Accumulators filled by the forecasting loop:
#   yc_org_list - organisations that were skipped or could not be evaluated
#   list_fl     - per-organisation 'goal' values
#   list_org    - organisations that went through forecasting
yc_org_list, list_fl, list_org = [], [], []
# Per-organisation forecast of the '0.4kv及以下' column.
#
# For every city, and every organisation in it whose name ends in '供电公司',
# fit a Prophet model on the organisation's history, forecast the held-out
# trailing days and record a relative error ("goal") for July 2023.
# Organisations with too little data, or without any July 2023 rows, are
# collected in yc_org_list instead.
HOLDOUT_DAYS = 3        # trailing rows kept out of training and forecast by the model
MIN_TRAIN_ROWS = 180    # organisations with this many training rows or fewer are skipped
WINDOW_START, WINDOW_END = '2022-01-01', '2023-07-31'

# NOTE(review): the grouping column is addressed by an empty-string name;
# presumably the city column has a blank/whitespace-only header in the CSV
# (it would become '' after the header strip) - confirm against the file.
for city in df[''].drop_duplicates():
    df_ct = df[df[''] == city]
    # Optional per-city workbook for the (disabled) Excel export below:
    # wb = Workbook()
    # wb.save(fr'C:\Users\鸽子\Desktop\9月0.4kv区县预测\{city}.xlsx')
    for org in df_ct['org_name'].drop_duplicates():
        # Keep only names ending in '供电公司'. Non-string names (e.g. NaN
        # from an empty cell) are skipped instead of raising on .strip().
        if not isinstance(org, str) or not org.strip().endswith('供电公司'):
            continue

        # .copy() gives an independent frame, so the in-place scaling below
        # neither warns (SettingWithCopyWarning) nor risks touching df_ct.
        df_org = df_ct[df_ct['org_name'] == org].copy()
        df_org['1-10kv'] /= 10000
        df_org['35kv'] /= 10000
        df_org['0.4kv及以下'] /= 10000

        # Treat zeros as missing, drop incomplete rows, and rename to the
        # column names Prophet requires ('ds' for the date, 'y' for the value).
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        dd = (
            df_org[['日期', '0.4kv及以下']]
            .replace(0, np.nan)
            .dropna(how='any')
            .rename(columns={'日期': 'ds', '0.4kv及以下': 'y'})
        )
        dd['ds'] = pd.to_datetime(dd['ds'])
        # The hold-out split below is positional, so make chronological order
        # explicit rather than relying on the row order of the CSV.
        dd = dd.sort_values('ds', kind='mergesort')
        # Debug plot of the cleaned series:
        # plt.plot(range(len(dd)), dd['y'])
        # plt.show()

        # Train on everything in the window except the last HOLDOUT_DAYS rows,
        # with IQR outliers removed from the training target.
        in_window = dd[(dd['ds'] >= WINDOW_START) & (dd['ds'] <= WINDOW_END)]
        df_train = in_window.iloc[:-HOLDOUT_DAYS]
        df_train = df_train.loc[normal(df_train['y']).index]
        if df_train.shape[0] <= MIN_TRAIN_ROWS:
            yc_org_list.append(org)
            continue

        # Enable yearly, weekly and daily seasonality and add the built-in
        # Chinese holiday calendar; all other Prophet parameters stay default.
        # NOTE(review): the forecast below uses freq='D'; if the data has one
        # row per day, confirm daily_seasonality=True is intended.
        model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True)
        model.add_country_holidays(country_name="CN")
        model.fit(df_train)

        # make_future_dataframe extends the training dates by HOLDOUT_DAYS
        # daily timestamps; predict() returns fitted + forecast values.
        # With additive components,
        #   yhat = trend + weekly + yearly (+ holidays when configured).
        future = model.make_future_dataframe(periods=HOLDOUT_DAYS, freq='D')
        forecast = model.predict(future)[['ds', 'yhat']].set_index('ds')

        # Align predictions with the observed values by date; dropna() removes
        # dates that have a prediction but no observation.
        df_all = forecast.join(dd.set_index('ds')).dropna()
        df_all['org_name'] = org
        df_all['偏差率'] = (df_all['y'] - df_all['yhat']) / df_all['y']
        df_all.rename(columns={'y': '真实值', 'yhat': '预测值'}, inplace=True)
        df_all = df_all[['org_name', '真实值', '预测值', '偏差率']]

        # Partial-string indexing raises KeyError when there are no July 2023
        # rows. Only that case is treated as "cannot evaluate"; any other
        # error is a real bug and is allowed to surface.
        try:
            result = df_all.loc['2023-7'].copy()
        except KeyError:
            yc_org_list.append(org)
            continue

        # goal = error summed over the last HOLDOUT_DAYS July rows, relative
        # to the total observed value over all July rows.
        error = result['真实值'] - result['预测值']
        goal = error.iloc[-HOLDOUT_DAYS:].sum() / result['真实值'].sum()
        result['goal'] = goal

        # Record the organisation and its goal together so list_org and
        # list_fl always have the same length (required by the summary frame).
        list_org.append(org)
        list_fl.append(goal)
        # Optional per-organisation sheet export:
        # with pd.ExcelWriter(fr'C:\Users\鸽子\Desktop\9月0.4kv区县预测\{city}.xlsx',mode='a',engine='openpyxl',if_sheet_exists='replace') as writer:
        #     result.to_excel(writer,sheet_name=f'{org}')
# Summarise the per-organisation goal values collected by the loop above.
# NOTE: this rebinds `df`, replacing the raw input table loaded earlier.
goal_bins = [-0.05, -0.01, -0.005, 0, 0.005, 0.01, 0.02, 0.05]
df = pd.DataFrame(dict(org=list_org, goal=list_fl))
print(df)

# Count how many organisations fall into each goal interval, in bin order.
goal_histogram = df['goal'].value_counts(bins=goal_bins, sort=False)
print(goal_histogram)

# print(yc_org_list)
# # Create an ExcelWriter object
# with pd.ExcelWriter(r'C:\Users\鸽子\Desktop\output.xlsx',mode='a',if_sheet_exists='replace') as writer:
#     # Write the separate sub-frames to different sheets of the same Excel file
#     df_all.to_excel(writer, sheet_name=f'Sheet{i+1}')
# df_all.plot()
# # Set the legend labels
# plt.legend(['true', 'yhat'])
# plt.show()