You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
pytorch/浙江行业电量/杭州日电量数据预处理.py

79 lines
2.5 KiB
Python

11 months ago
import pandas as pd
import datetime
from sklearn.preprocessing import LabelEncoder
import os
import math
import chinese_calendar as cc
# Quick check of the chinese_calendar API on one sample date; the result is
# only printed.
# NOTE(review): the variable is called `is_holiday` but it stores the result
# of cc.is_workday(), i.e. True means "working day". Consider renaming.
is_holiday = cc.is_workday(datetime.date(2023, 10, 7))
print(is_holiday)
def holiday_work(x):
    """Encode a date as a workday/holiday flag using chinese_calendar.

    Args:
        x: the date to classify; it is passed unchanged to
            ``cc.is_workday`` and, if needed, ``cc.is_holiday``.

    Returns:
        0 when ``cc.is_workday(x)`` is truthy, 1 when ``cc.is_holiday(x)``
        is truthy, and None when neither check succeeds.
    """
    flag = None
    if cc.is_workday(x):
        flag = 0
    elif cc.is_holiday(x):
        # Only consulted when the date is not a working day.
        flag = 1
    return flag
# Load the first sheet of the Hangzhou weather workbook.
tq_df = pd.read_excel(r'C:\Users\鸽子\Desktop\杭州\杭州气象数据.xlsx', sheet_name=0)
# Let pandas auto-detect the display width when printing frames.
pd.set_option('display.width', None)
# Strip surrounding whitespace from the column names before selecting by name.
tq_df.columns = tq_df.columns.map(lambda x: x.strip())

# Columns kept for the model input.
weather_columns = [
    'city_name', 'dtdate', 'tem_max', 'tem_min', 'rh', 'rh_max', 'rh_min',
    'pre', 'prs', 'prs_max', 'prs_min', 'win_s_max', 'win_s_min',
]
# Keep one row per date, ordered by date, with a fresh 0..n-1 index.
# The steps are chained without inplace=True: an in-place drop_duplicates on
# the column subset mutates a derived frame and can raise
# SettingWithCopyWarning.
tq_df = (
    tq_df[weather_columns]
    .drop_duplicates(subset='dtdate')
    .sort_values(by='dtdate', ascending=True)
    .reset_index(drop=True)
)

# dtdate is parsed with the compact YYYYMMDD format.
tq_df['dtdate'] = pd.to_datetime(tq_df['dtdate'], format='%Y%m%d')
# 0 = working day, 1 = holiday (see holiday_work).
tq_df['holiday'] = tq_df['dtdate'].apply(holiday_work)

print(tq_df.columns)
print(tq_df.head())
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
tq_df.info()
def jq(y, x):
    """Return the date computed by the script's solar-term formula.

    The script calls this with ``x`` in 0..23 and pairs the result with the
    ``x``-th solar-term name.

    Args:
        y: calendar year.
        x: term index within the year.

    Returns:
        A ``datetime.date`` obtained by adding the truncated day offset to
        1899-12-31.
    """
    year_offset = 365.242 * (y - 1900)
    term_offset = 15.22 * x
    correction = 1.9 * math.sin(0.262 * x)
    # Same left-to-right arithmetic as the one-line formula.
    day_offset = year_offset + 6.2 + term_offset - correction
    base_date = datetime.date(1899, 12, 31)
    # int() truncates the fractional part of the offset.
    return base_date + datetime.timedelta(days=int(day_offset))
# Names of the 24 solar terms; the list position is the term index passed
# to jq().
jq_list=['小寒', '大寒', '立春', '雨水', '惊蛰', '春分', '清明', '谷雨', '立夏', '小满', '芒种', '夏至', '小暑', '大暑', '立秋', '处暑', '白露', '秋分', '寒露', '霜降', '立冬', '小雪', '大雪','冬至']

# Map 'YYYY-MM-DD' -> solar-term name for every term date in 2019..2023.
jq_dict = {}
for j in range(2019, 2024):
    for i, term_name in enumerate(jq_list):
        jq_dict[jq(j, i).strftime('%Y-%m-%d')] = term_name
print(jq_dict)

# Label each row with its solar term. Dates are formatted explicitly with the
# same pattern as the jq_dict keys; only rows that fall exactly on a term
# date get a value here.
tq_df['24ST'] = tq_df['dtdate'].dt.strftime('%Y-%m-%d').map(jq_dict)
# Carry each term forward until the next term date. Rows before the first
# matched date have nothing to carry forward and are labelled '冬至'.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated and does not
# update the frame under pandas Copy-on-Write, and fillna(method='ffill')
# is deprecated in favour of ffill().
tq_df['24ST'] = tq_df['24ST'].ffill().fillna('冬至')
# Replace the solar-term names in '24ST' with integer class codes.
# NOTE(review): LabelEncoder assigns codes by the sorted order of the labels,
# not by calendar order — confirm this is acceptable for the model.
le = LabelEncoder()
tq_df['24ST'] = le.fit_transform(tq_df['24ST'])
print(tq_df)

# Write the cleaned weather table into the current working directory.
file2 = os.path.join(os.getcwd(), '气象数据.csv')
try:
    tq_df.to_csv(file2, encoding='gbk')
except UnicodeEncodeError:
    # A value cannot be represented in GBK: rewrite the file as UTF-8.
    # Only encoding failures trigger the fallback; other errors (for example
    # a locked or unwritable file) propagate instead of being masked by a
    # bare except.
    tq_df.to_csv(file2, encoding='utf-8')
# Load the daily electricity workbook (read_excel's default sheet).
elec_df = pd.read_excel(r'C:\Users\鸽子\Desktop\杭州\杭州日电量.xlsx')
# Divide the sold-electricity column by 10000 and round to two decimals.
elec_df['售电量'] = (elec_df['售电量'] / 10000).map(lambda v: round(v, 2))
print(elec_df.columns)

# NOTE(review): the date column is addressed by an empty-string name, exactly
# as in the original script. Check it against the columns printed above; if
# the real header differs, this is the single place to change it.
date_col = ''
elec_df = elec_df.sort_values(by=date_col, ascending=True)
# The date column is parsed with the compact YYYYMMDD format.
elec_df[date_col] = pd.to_datetime(elec_df[date_col], format='%Y%m%d')
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print().
elec_df.info()
print(elec_df)

# Join weather and electricity rows on the date (pd.merge defaults to an
# inner join), drop the duplicate date column and index the result by dtdate.
df = pd.merge(tq_df, elec_df, left_on='dtdate', right_on=date_col)
df = df.drop(columns=date_col).set_index('dtdate', drop=True)
print(df)

# Build the output path with os.path.join: the previous '\入模数据.csv' literal
# relied on an invalid escape sequence and on the Windows path separator.
df.to_csv(os.path.join(os.getcwd(), '入模数据.csv'), encoding='gbk')