You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

94 lines
4.2 KiB
Python

import datetime
import math
from pathlib import Path

import chinese_calendar as cc
import pandas as pd
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
def holiday_work(x):
    """Encode a date as a workday/holiday flag using ``chinese_calendar``.

    Args:
        x: The date to classify; passed unchanged to ``cc.is_workday`` and
            ``cc.is_holiday``.

    Returns:
        0 if ``cc.is_workday(x)`` is truthy, otherwise 1 if
        ``cc.is_holiday(x)`` is truthy, otherwise ``None``.
    """
    # The workday check is made first and wins if both were ever true.
    if cc.is_workday(x):
        return 0
    return 1 if cc.is_holiday(x) else None
def normal(nd):
    """Remove outliers from a numeric Series with the 1.5 * IQR rule.

    The quartiles are computed once via ``quantile`` instead of calling
    ``describe()`` repeatedly, which recomputed the full summary statistics
    four times just to read two numbers.

    Args:
        nd: Numeric ``pandas.Series``.

    Returns:
        The elements of ``nd`` that lie strictly between
        ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, with their original
        index labels. Values exactly on a fence, and NaN values, are dropped.
    """
    q1, q3 = nd.quantile(0.25), nd.quantile(0.75)
    high = q3 + 1.5 * (q3 - q1)
    low = q1 - 1.5 * (q3 - q1)
    # Strict comparisons: a value equal to a fence is treated as an outlier.
    return nd[(nd < high) & (nd > low)]
def jq(y, x):
    """Approximate the calendar date of a solar term.

    Args:
        y: Gregorian year.
        x: Zero-based position of the solar term within the year.

    Returns:
        A ``datetime.date`` obtained by adding the truncated day count from
        the closed-form formula below to the 1899-12-31 epoch.
    """
    # Linear part: mean year length times years since 1900, plus a fixed
    # offset and an even 15.22-day spacing between consecutive terms.
    linear_days = 365.242 * (y - 1900) + 6.2 + 15.22 * x
    # Periodic correction to the even spacing.
    correction = 1.9 * math.sin(0.262 * x)
    day_count = int(linear_days - correction)
    epoch = datetime.date(1899, 12, 31)
    return epoch + datetime.timedelta(days=day_count)
# Names of the 24 solar terms; the position in this list is the term index
# passed to jq().
jq_list = [
    '小寒', '大寒', '立春', '雨水', '惊蛰', '春分',
    '清明', '谷雨', '立夏', '小满', '芒种', '夏至',
    '小暑', '大暑', '立秋', '处暑', '白露', '秋分',
    '寒露', '霜降', '立冬', '小雪', '大雪', '冬至',
]

# Lookup from 'YYYY-MM-DD' date string to the solar term that jq() places on
# that day, for the years 2023 and 2024.
jq_dict = {}
for j in range(2023, 2025):
    for i, term_name in enumerate(jq_list):
        term_day = jq(j, i).strftime('%Y-%m-%d')
        jq_dict[term_day] = term_name
# Calendar workbook: provides a per-date 'holiday' code keyed by 'dtdate'.
ys_df = pd.read_excel(r'C:\python-project\p1031\入模数据\杭州.xlsx')
ys_df['dtdate'] = pd.to_datetime(ys_df['dtdate'])
# Timestamp -> holiday code lookup, used later to label each record.
ys_dict = dict(zip(ys_df['dtdate'], ys_df['holiday']))

# Do not wrap wide DataFrames when printing.
pd.set_option('display.width', None)

# Electricity data is read from the fourth sheet (sheet_name is 0-based).
df_qy_ah = pd.read_excel(r'C:\python-project\p1031\北京安徽\北京安徽电量数据\北京安徽分压区域.xlsx', sheet_name=3)
# Keep only rows without a county (presumably the city-level rows - TODO
# confirm). The explicit copy makes the column assignment below operate on an
# independent frame instead of a filtered slice of the sheet, which would
# trigger pandas' SettingWithCopyWarning.
df_qy_ah = df_qy_ah[df_qy_ah['county_name'].isnull()].copy()
df_qy_ah['pt_date'] = pd.to_datetime(df_qy_ah['pt_date'])
# Discard everything dated after 2023-12-31.
df_qy_ah = df_qy_ah[df_qy_ah['pt_date'] <= '2023-12-31']
# Integer codes for the solar-term names, used as a model feature.
# Defined once because the mapping is the same for every city.
label_dict = {'冬至': 0, '处暑': 1, '夏至': 2, '大寒': 3, '大暑': 4, '大雪': 5, '寒露': 6, '小寒': 7, '小暑': 8,
              '小满': 9, '小雪': 10, '惊蛰': 11, '春分': 12, '清明': 13, '白露': 14, '秋分': 15, '立冬': 16,
              '立夏': 17,
              '立春': 18, '立秋': 19, '芒种': 20, '谷雨': 21, '雨水': 22, '霜降': 23}
# Columns that are identifiers or the target and therefore not model inputs.
non_feature_columns = ['city_name', 'county_name', 'power_sal']
# Workbook that receives one result sheet per city.
output_path = r'C:\Users\鸽子\Desktop\北京区域电量_12月.xlsx'

# For every city: build features, fit an XGBoost regressor on recent history,
# predict the last three retained days and export December actual vs. predicted.
for city in df_qy_ah['city_name'].drop_duplicates():
    # Independent, de-duplicated copy so the column assignments below never
    # write into a slice of df_qy_ah.
    df_ah_city = df_qy_ah[df_qy_ah['city_name'] == city].drop_duplicates().copy()

    # Mark the days on which a solar term starts. The date is formatted with
    # the same '%Y-%m-%d' pattern used for the jq_dict keys, so the match does
    # not depend on how pandas chooses to stringify the column.
    df_ah_city['24ST'] = df_ah_city['pt_date'].dt.strftime('%Y-%m-%d').map(jq_dict)
    # Forward-fill every column: each day inherits the most recent solar term,
    # and gaps in any other column are filled from the previous row as well.
    df_ah_city = df_ah_city.ffill()
    # Rows before the first solar-term date found in the data have nothing to
    # inherit and are labelled '冬至'; then names are replaced by integer codes.
    df_ah_city['24ST'] = df_ah_city['24ST'].fillna('冬至').map(label_dict)

    # Holiday code from the calendar workbook where available ...
    df_ah_city['holiday'] = df_ah_city['pt_date'].map(ys_dict)
    # ... and from holiday_work() for dates the workbook does not cover.
    holiday_null_s = df_ah_city[df_ah_city['holiday'].isnull()]['pt_date']
    holiday_null_s = holiday_null_s.map(holiday_work)
    # NOTE(review): the last two uncovered dates are forced to code 3;
    # presumably a special holiday class - confirm against the calendar data.
    holiday_null_s.iloc[-2:] = 3
    # Assign the filled column back instead of a chained in-place fillna,
    # which is not guaranteed to update the frame.
    df_ah_city['holiday'] = df_ah_city['holiday'].fillna(holiday_null_s.to_dict())

    df_ah_city = df_ah_city.set_index('pt_date')
    # December 2023 actuals are captured before outliers are removed.
    df_ = df_ah_city.loc['2023-12']['power_sal']
    # Keep only the rows whose target value passes the outlier filter.
    df_ah_city = df_ah_city.loc[normal(df_ah_city['power_sal']).index]
    print(city)
    print(df_ah_city)

    # Build the feature/target views once and slice them for both splits.
    features = df_ah_city.drop(columns=non_feature_columns)
    target = df_ah_city['power_sal']
    # Fit on the window [-180, -3); the final three rows are held out.
    X, y = features.iloc[-180:-3], target.iloc[-180:-3]
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    eval_x, eval_y = features.iloc[-3:], target.iloc[-3:]

    model = xgb.XGBRegressor(max_depth=6, learning_rate=0.05, n_estimators=150)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    result_test = pd.DataFrame({'test': y_test, 'pred': y_pred}, index=y_test.index)
    # Mean absolute percentage error and R^2 on the random test split.
    print((abs(result_test['pred'] - result_test['test']) / result_test['test']).mean())
    print(r2_score(y_test, y_pred))

    # December report: actual values, with the last three entries of the
    # prediction column replaced by the model output for the held-out rows.
    december_actual = list(df_.values)
    final_df = pd.DataFrame({'真实值': december_actual,
                             '预测值': december_actual[:-3] + list(model.predict(eval_x))},
                            index=df_.index)
    # One overall deviation rate for the month, repeated on every row.
    deviation_rate = (final_df['真实值'] - final_df['预测值']).sum() / final_df['真实值'].sum()
    final_df['偏差率'] = "{:.5%}".format(deviation_rate)
    print(final_df)

    # Append mode (and if_sheet_exists) only works on an existing workbook, so
    # create the file on first use and append/replace sheets afterwards.
    if Path(output_path).exists():
        writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_options = {'mode': 'w'}
    with pd.ExcelWriter(output_path, engine='openpyxl', **writer_options) as writer:
        final_df.to_excel(writer, sheet_name=f'{city}')