|
|
import os
|
|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
|
|
|
|
|
|
|
def normal(data: pd.Series, k: float = 1.5) -> pd.Series:
    """Flag the values of *data* that lie inside the Tukey IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles of *data* and ``IQR = Q3 - Q1``.

    Args:
        data: Numeric series to inspect.
        k: Fence multiplier applied to the IQR; the default of 1.5 keeps the
            behaviour of the original hard-coded constant.

    Returns:
        A boolean series aligned with *data*: ``True`` where the value is
        within the fences (inclusive), ``False`` for outliers. Missing values
        compare as ``False``.
    """
    # Compute each quartile once instead of re-running describe() per use.
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    spread = k * (q3 - q1)
    low, high = q1 - spread, q3 + spread
    return (data >= low) & (data <= high)
|
|
|
|
|
|
|
|
|
# file_dir = './浙江各地市行业电量数据'
|
|
|
#
|
|
|
# # Merge the 11 cities
|
|
|
# df = pd.DataFrame({})
|
|
|
# for city in os.listdir(file_dir):
|
|
|
#
|
|
|
# df_city = pd.read_excel(os.path.join(file_dir, city))
|
|
|
#
|
|
|
# # For every industry of each city, back-fill the outlier values
|
|
|
# for industry in df_city.columns[2:]:
|
|
|
# outliers_index = normal(df_city[industry]).index
|
|
|
# df_city[industry] = df_city[industry].where(normal(df_city[industry]), other=np.nan).bfill()
|
|
|
# df_city[industry].fillna(method='ffill',inplace=True)
|
|
|
# df = pd.concat([df,df_city])
|
|
|
# print(df.shape)
|
|
|
#
|
|
|
# df.to_csv('11市行业数据(已处理异常).csv',index=False,encoding='GBK')
|
|
|
# Load the merged city/industry table from the GBK-encoded CSV.
df = pd.read_csv('11市行业数据(已处理异常).csv', encoding='gbk')

# Sanity checks: total number of missing cells, then summary statistics.
print(int(df.isnull().sum().sum()))
print(df.describe())

# Min-max normalise every industry column (third column onward), one scaler
# per column, and remember each column's original min/max in column_params.
# NOTE(review): presumably kept so predictions can be scaled back — confirm.
column_params = {}
for column in df.columns[2:]:
    scaler = MinMaxScaler().fit(df[[column]])
    df[column] = scaler.transform(df[[column]])
    column_params[column] = dict(min=scaler.data_min_[0], max=scaler.data_max_[0])

print(column_params)
print(df.head())
|
|
|
|
|
|
|
|
|
def create_dataset(data, days_for_train=10, days_to_predict=3) -> "tuple[np.ndarray, np.ndarray]":
    """Cut a sequence into sliding (input window, target window) samples.

    Sample ``i`` pairs ``data[i : i + days_for_train]`` with the
    ``days_to_predict`` values that immediately follow it.

    Args:
        data: Ordered sequence (list, array or pandas Series); it is read
            positionally, so a Series index is ignored.
        days_for_train: Length of each input window.
        days_to_predict: Length of each target window; the default of 3 is
            the horizon that used to be hard-coded.

    Returns:
        ``(dataset_x, dataset_y)`` with shapes ``(n, days_for_train, ...)``
        and ``(n, days_to_predict, ...)``, where
        ``n = len(data) - days_for_train - days_to_predict + 1``. When the
        sequence is too short for a single sample, ``n`` is 0 and the arrays
        are empty but keep their window dimensions, so they can still be
        concatenated with non-empty results.
    """
    values = np.asarray(data)
    # "+ 1" keeps the last complete window, which the previous range bound
    # silently dropped.
    n_samples = len(values) - days_for_train - days_to_predict + 1
    if n_samples <= 0:
        tail = values.shape[1:]
        return (
            np.empty((0, days_for_train) + tail, dtype=values.dtype),
            np.empty((0, days_to_predict) + tail, dtype=values.dtype),
        )

    dataset_x = [values[i:i + days_for_train] for i in range(n_samples)]
    dataset_y = [
        values[i + days_for_train:i + days_for_train + days_to_predict]
        for i in range(n_samples)
    ]
    return (np.array(dataset_x), np.array(dataset_y))
|
|
|
|
|
|
|
|
|
# Split the table into x / y sample sets using create_dataset's default
# window length of 10. The smallest unit is one industry of one city: each
# such series is windowed on its own, so no sample spans two cities or two
# industries. Samples are ordered industry by industry and, within an
# industry, city by city in order of first appearance.
cities = df['地市'].drop_duplicates()
# Slice each city's rows once instead of re-filtering for every industry.
city_frames = [(name, df[df['地市'] == name]) for name in cities]

# Collect the per-series arrays and concatenate once at the end; growing the
# result with np.concatenate inside the loops re-copies it on every pass.
x_parts, y_parts = [], []
for industry in df.columns[2:]:
    for city, city_frame in city_frames:
        df_city_industry = city_frame[industry]
        x, y = create_dataset(df_city_industry)
        x_parts.append(x)
        y_parts.append(y)

dataset_x, dataset_y = np.concatenate(x_parts), np.concatenate(y_parts)

print(dataset_x.shape, dataset_y.shape)
|
|
|
|
|
|
|