From 106468285413b4004d70fdd6a27505abe4d23ff0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=B8=BD=E5=AD=90?= <2316994765@qq.com>
Date: Thu, 23 Nov 2023 18:16:23 +0800
Subject: [PATCH] Output prediction results
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 浙江行业电量/prophet_行业电量.py | 48 ++++++++++-
 浙江行业电量/test1.py | 84 +++++++++++++------
 浙江行业电量/分类归一化.py | 76 +++++++++++++++++
 ...py => 行业电量_输出为3_27步长.py} |  0
 ...=> 行业电量_输出为3_步长为10.py} | 10 +-
 5 files changed, 184 insertions(+), 34 deletions(-)
 create mode 100644 浙江行业电量/分类归一化.py
 rename 浙江行业电量/{行业电量_输出为3.py => 行业电量_输出为3_27步长.py} (100%)
 rename 浙江行业电量/{行业电量_输出为5.py => 行业电量_输出为3_步长为10.py} (96%)

diff --git a/浙江行业电量/prophet_行业电量.py b/浙江行业电量/prophet_行业电量.py
index 6047954..1ff4ac2 100644
--- a/浙江行业电量/prophet_行业电量.py
+++ b/浙江行业电量/prophet_行业电量.py
@@ -2,8 +2,50 @@ from prophet import Prophet
 import pandas as pd
 import os
 import datetime
+import numpy as np
+
+
+def normal(data):  # keep only values inside the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR)
+    high = data.describe()['75%'] + 1.5 * (data.describe()['75%'] - data.describe()['25%'])
+    low = data.describe()['25%'] - 1.5 * (data.describe()['75%'] - data.describe()['25%'])
+    return data[(data <= high) & (data >= low)]
+
 
 file_dir = './浙江各地市行业电量数据'
-city = os.listdir(file_dir)[0]
-df_city = pd.read_excel(os.path.join(file_dir,city))
-print(df_city.columns)
\ No newline at end of file
+for city in os.listdir(file_dir):
+    df_city = pd.read_excel(os.path.join(file_dir, city))
+    df_city['stat_date'] = df_city['stat_date'].map(lambda x: str(x).strip()[:10])
+    df_city['stat_date'] = pd.to_datetime(df_city['stat_date'])
+    list_goal = []
+    list_industry = []
+    for industry in df_city.columns[2:]:
+        s1 = df_city[['stat_date', industry]]
+        s1 = s1[(s1['stat_date'] >= '2022-09-30') & (s1['stat_date'] <= '2023-10-31')]
+        s1 = s1.loc[normal(s1[industry]).index]
+        s1.rename(columns={'stat_date': 'ds', industry: 'y'}, inplace=True)
+
+        df_train = s1[(s1['ds'] >= '2022-08-31') & (s1['ds'] <= '2023-10-31')].sort_values(by='ds')
+        df_test = s1[(s1['ds'] >= '2022-08-31') & (s1['ds'] <= '2023-10-31')].sort_values(by='ds')  # identical slice to df_train
+
+        model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True)
+        model.add_country_holidays(country_name="CN")
+        model.fit(df_train)
+        future = model.make_future_dataframe(periods=3, freq='D')
+
+        predict = model.predict(future)
+        predict = predict[['ds', 'yhat']].set_index('ds')
+        print(city, industry)
+        print(predict.loc['2023-10'])
+
+        # df = predict.join(s1.set_index('ds')).loc['2023-8']
+        # df['偏差率'] = (df['y'] - df['yhat']) / df['y']
+        # df['goal'] = (df['y'] - df['yhat'])[-3:].sum() / df['y'].sum()
+        # list_goal.append((df['y'] - df['yhat'])[-3:].sum() / df['y'].sum())
+        # list_industry.append(industry)
+
+    # df = pd.DataFrame({'industry': list_industry, 'goal': list_goal})
+    # df.to_csv(fr'C:\Users\鸽子\Desktop\行业8月偏差\{city[:2]}_goal.csv', index=False, encoding='gbk')
+    #
+    # with open(r'C:\Users\鸽子\Desktop\goal_8.txt','a') as f:
+    #     f.write(f'{city[:2]}\n')
+    #     df['goal'].value_counts(bins=[-np.inf,-0.05, -0.01, -0.005, 0, 0.005, 0.01, 0.02, 0.05,np.inf], sort=False).to_csv(f,header=False,sep='\t')
diff --git a/浙江行业电量/test1.py b/浙江行业电量/test1.py
index 7e0be8e..96023a3 100644
--- a/浙江行业电量/test1.py
+++ b/浙江行业电量/test1.py
@@ -53,30 +53,62 @@ def normal(df):
     pass
     return df
 
-file_dir = './浙江各地市行业电量数据'
-city1 = os.listdir(file_dir)[0]
-df_city = pd.read_excel(os.path.join(file_dir, city1))
-df_city = normal(df_city)
-df_city = df_city.drop(columns='地市')
-df_city[df_city.columns[1:]] /= 10000
-df_city['stat_date'] = df_city['stat_date'].map(lambda x: str(x).strip()[:10])
-df_city.stat_date = pd.to_datetime(df_city.stat_date)
-print(df_city.describe())
+# file_dir = './浙江各地市行业电量数据'
+# city1 = os.listdir(file_dir)[0]
+# df_city = pd.read_excel(os.path.join(file_dir, city1))
+# df_city = normal(df_city)
+# df_city = df_city.drop(columns='地市')
+# df_city[df_city.columns[1:]] /= 10000
+# df_city['stat_date'] = df_city['stat_date'].map(lambda x: str(x).strip()[:10])
+# df_city.stat_date = pd.to_datetime(df_city.stat_date)
+# print(df_city.describe())
+#
+# list_1000 = []
+# list_100 = []
+# list_10 = []
+# list_1 = []
+# for i in df_city.columns[1:]:
+#     if df_city[i].describe()['mean']>=1000:
+#         list_1000.append(i)
+#     if df_city[i].describe()['mean'] < 1000 and df_city[i].describe()['mean'] >= 100:
+#         list_100.append(i)
+#     if df_city[i].describe()['mean'] < 100 and df_city[i].describe()['mean'] >= 10:
+#         list_10.append(i)
+#     else:
+#         list_1.append(i)
+# print('list_1:',list_1)
+# print('list_10:',list_10)
+# print('list_100:',list_100)
+# print('list_1000:',list_1000)
+import pandas as pd
+
+# build a small example DataFrame
+data = pd.DataFrame({'A': [1, 2, 3000, 4, 500],
+                     'B': [10, 20, 30, 40, 50]})
+
+
+Q1 = data['A'].quantile(0.25)
+Q3 = data['A'].quantile(0.75)
+IQR = Q3 - Q1
+
+lower_threshold = Q1 - 1.5 * IQR
+upper_threshold = Q3 + 1.5 * IQR
+# flag values outside the Tukey fences as outliers
+outliers = (data['A'] < lower_threshold) | (data['A'] > upper_threshold)
+print(outliers)
+print(data['A'].shift(1))  # the series shifted down one row (each value's predecessor)
+# replace outliers with a neighbouring value
+
+data = {'A': [1, 2, 3, 4, 5],
+        'B': [10, 20, 30, 40, 50]}
+df = pd.DataFrame(data)
+
+# keep rows where the condition holds; replace everything else with a new value
+condition = df['A'] > 3
+df_new = df.where(condition, other=-1)
+
+print("Original data:")
+print(df)
 
-list_1000 = []
-list_100 = []
-list_10 = []
-list_1 = []
-for i in df_city.columns[1:]:
-    if df_city[i].describe()['mean']>=1000:
-        list_1000.append(i)
-    if df_city[i].describe()['mean'] < 1000 and df_city[i].describe()['mean'] >= 100:
-        list_100.append(i)
-    if df_city[i].describe()['mean'] < 100 and df_city[i].describe()['mean'] >= 10:
-        list_10.append(i)
-    else:
-        list_1.append(i)
-print('list_1:',list_1)
-print('list_10:',list_10)
-print('list_100:',list_100)
-print('list_1000:',list_1000)
+print("\nData after conditional replacement:")
+print(df_new)
diff --git a/浙江行业电量/分类归一化.py b/浙江行业电量/分类归一化.py
new file mode 100644
index 0000000..e61021f
--- /dev/null
+++ b/浙江行业电量/分类归一化.py
@@ -0,0 +1,76 @@
+import os
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import MinMaxScaler
+
+
+def normal(data):  # boolean mask, True where the value lies inside the Tukey fences
+    high = data.describe()['75%'] + 1.5 * (data.describe()['75%'] - data.describe()['25%'])
+    low = data.describe()['25%'] - 1.5 * (data.describe()['75%'] - data.describe()['25%'])
+    return (data >= low) & (data <= high)
+
+
+# file_dir = './浙江各地市行业电量数据'
+#
+# # merge the 11 cities
+# df = pd.DataFrame({})
+# for city in os.listdir(file_dir):
+#
+#     df_city = pd.read_excel(os.path.join(file_dir, city))
+#
+#     # for each industry in each city, mask outliers and back-fill them
+#     for industry in df_city.columns[2:]:
+#         outliers_index = normal(df_city[industry]).index
+#         df_city[industry] = df_city[industry].where(normal(df_city[industry]), other=np.nan).bfill()
+#         df_city[industry].fillna(method='ffill',inplace=True)
+#     df = pd.concat([df,df_city])
+# print(df.shape)
+#
+# df.to_csv('11市行业数据(已处理异常).csv',index=False,encoding='GBK')
+df = pd.read_csv('11市行业数据(已处理异常).csv', encoding='gbk')
+print(sum(df.isnull().sum()))
+print(df.describe())
+# min-max normalise each industry column of df
+column_params = {}
+for column in df.columns[2:]:
+    scaler = MinMaxScaler()
+
+    df[column] = scaler.fit_transform(df[[column]])
+
+    column_params[column] = {'min': scaler.data_min_[0], 'max': scaler.data_max_[0]}
+
+print(column_params)
+print(df.head())
+
+
+def create_dataset(data, days_for_train=10) -> (np.array, np.array):
+    dataset_x, dataset_y = [], []
+    for i in range(len(data) - days_for_train - 3):
+        dataset_x.append(data[i:(i + days_for_train)])
+        dataset_y.append(data[i + days_for_train:i + days_for_train + 3])
+
+    return (np.array(dataset_x), np.array(dataset_y))
+
+
+# Build the x/y samples with a window of 10; the smallest unit is one industry of one city.
+# Start with the first industry and concatenate across all cities.
+
+industry = df.columns[2:][0]
+city = df['地市'].drop_duplicates()[0]
+df_city_industry = df[df['地市'] == city][industry]
+dataset_x, dataset_y = create_dataset(df_city_industry)
+
+for city in df['地市'].drop_duplicates()[1:]:
+    df_city_industry = df[df['地市'] == city][industry]
+    x, y = create_dataset(df_city_industry)
+    dataset_x, dataset_y = np.concatenate([dataset_x, x]), np.concatenate([dataset_y, y])
+
+for industry in df.columns[2:][1:]:
+    for city in df['地市'].drop_duplicates():
+        df_city_industry = df[df['地市'] == city][industry]
+        x, y = create_dataset(df_city_industry)
+        dataset_x, dataset_y = np.concatenate([dataset_x, x]), np.concatenate([dataset_y, y])
+
+print(dataset_x.shape, dataset_y.shape)
+
+
diff --git a/浙江行业电量/行业电量_输出为3.py b/浙江行业电量/行业电量_输出为3_27步长.py
similarity index 100%
rename from 浙江行业电量/行业电量_输出为3.py
rename to 浙江行业电量/行业电量_输出为3_27步长.py
diff --git a/浙江行业电量/行业电量_输出为5.py b/浙江行业电量/行业电量_输出为3_步长为10.py
similarity index 96%
rename from 浙江行业电量/行业电量_输出为5.py
rename to 浙江行业电量/行业电量_输出为3_步长为10.py
index 95b7bc5..7e51b96 100644
--- a/浙江行业电量/行业电量_输出为5.py
+++ b/浙江行业电量/行业电量_输出为3_步长为10.py
@@ -28,9 +28,9 @@ class LSTM_Regression(nn.Module):
 
 def create_dataset(data, days_for_train=5) -> (np.array, np.array):
     dataset_x, dataset_y = [], []
-    for i in range(len(data) - days_for_train-5):
+    for i in range(len(data) - days_for_train-3):
         dataset_x.append(data[i:(i + days_for_train)])
-        dataset_y.append(data[i + days_for_train:i + days_for_train+5])
+        dataset_y.append(data[i + days_for_train:i + days_for_train+3])
     # print(dataset_x,dataset_y)
 
     return (np.array(dataset_x), np.array(dataset_y))
@@ -103,13 +103,13 @@ train_y = dataset_y[:train_size]
 
 # # reshape the data; the RNN reads input of shape (seq_size, batch_size, feature_size)
 train_x = train_x.reshape(-1, 1, DAYS_FOR_TRAIN)
-train_y = train_y.reshape(-1, 1, 5)
+train_y = train_y.reshape(-1, 1, 3)
 # # convert to PyTorch tensors
 train_x = torch.from_numpy(train_x).to(device).type(torch.float32)
 train_y = torch.from_numpy(train_y).to(device).type(torch.float32)
 
 print('=====================================',train_x.shape)
-model = LSTM_Regression(DAYS_FOR_TRAIN, 32, output_size=5, num_layers=2).to(device)  # build the model: input/output sizes, hidden size, number of layers
+model = LSTM_Regression(DAYS_FOR_TRAIN, 32, output_size=3, num_layers=2).to(device)  # build the model: input/output sizes, hidden size, number of layers
 #
 train_loss = []
 loss_function = nn.MSELoss()
@@ -146,7 +146,7 @@ for i in range(1500):
 #     plt.legend(loc='best')
 #     plt.show()
 
-model.load_state_dict(torch.load('hy5.pth',map_location=torch.device('cpu')))
+model.load_state_dict(torch.load('hy3.pth',map_location=torch.device('cpu')))
 
 max_value = 354024930.8
 min_value = 0.0
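
A minimal, self-contained shape check for the sliding window introduced in 分类归一化.py and reused in 行业电量_输出为3_步长为10.py (10 days of history in, 3 days out). The toy series below is illustrative only; create_dataset is copied from the patch:

    import numpy as np

    def create_dataset(data, days_for_train=10):
        # same windowing as in the patch: 10 days of history -> next 3 days as target
        dataset_x, dataset_y = [], []
        for i in range(len(data) - days_for_train - 3):
            dataset_x.append(data[i:i + days_for_train])
            dataset_y.append(data[i + days_for_train:i + days_for_train + 3])
        return np.array(dataset_x), np.array(dataset_y)

    series = np.arange(20.0)        # toy daily series of length 20
    x, y = create_dataset(series)
    print(x.shape, y.shape)         # (7, 10) (7, 3)

Note that range(len(data) - days_for_train - 3) stops one window early, so the last day of each series never appears in a target; range(len(data) - days_for_train - 3 + 1) would use it, if that is intended.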
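
A sketch of mapping the normalised 3-step predictions back to the original scale with the per-column 'min'/'max' stored in column_params, rather than the hard-coded max_value/min_value at the bottom of 行业电量_输出为3_步长为10.py. The dictionary key and the pred array below are placeholders (the 'max' value reuses the max_value constant from that script purely as an example):

    import numpy as np

    def denormalize(pred, params):
        # invert MinMaxScaler with feature_range=(0, 1): x = x_scaled * (max - min) + min
        return pred * (params['max'] - params['min']) + params['min']

    column_params = {'some_industry': {'min': 0.0, 'max': 354024930.8}}  # placeholder entry
    pred = np.array([[0.12, 0.10, 0.11]])                                # dummy model output, shape (1, 3)
    print(denormalize(pred, column_params['some_industry']))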