"""Prepare sliding-window samples from per-city, per-industry series.

The script loads an already-cleaned CSV, min-max scales every column from the
third one onwards, and cuts each (city, industry) series into fixed-length
input windows with the values that immediately follow them as targets.
"""
import os
from typing import Tuple

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


def normal(data):
    """Return a boolean mask that is True where *data* lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, with the quartiles
    taken from ``data.describe()``.

    Args:
        data: A numeric pandas Series (anything whose ``describe()`` exposes
            the ``'25%'`` and ``'75%'`` entries).

    Returns:
        Element-wise boolean mask, True for values within the fences.
    """
    # describe() is computed once instead of once per quartile lookup.
    stats = data.describe()
    q1, q3 = stats['25%'], stats['75%']
    high = q3 + 1.5 * (q3 - q1)
    low = q1 - 1.5 * (q3 - q1)
    return (data >= low) & (data <= high)


# Reference only: the one-off preprocessing that produced the CSV loaded below.
#
# file_dir = './浙江各地市行业电量数据'
#
# # Merge the 11 cities
# df = pd.DataFrame({})
# for city in os.listdir(file_dir):
#
#     df_city = pd.read_excel(os.path.join(file_dir, city))
#
#     # For every industry of every city, replace outliers by back-filling
#     for industry in df_city.columns[2:]:
#         outliers_index = normal(df_city[industry]).index
#         df_city[industry] = df_city[industry].where(normal(df_city[industry]), other=np.nan).bfill()
#         df_city[industry].fillna(method='ffill',inplace=True)
#     df = pd.concat([df,df_city])
# print(df.shape)
#
# df.to_csv('11市行业数据(已处理异常).csv',index=False,encoding='GBK')

df = pd.read_csv('11市行业数据(已处理异常).csv', encoding='gbk')
print(sum(df.isnull().sum()))
print(df.describe())

# Min-max normalise each industry column of df, remembering the fitted
# min/max per column so the scaling can be inverted later.
column_params = {}
for column in df.columns[2:]:
    scaler = MinMaxScaler()
    df[column] = scaler.fit_transform(df[[column]])
    column_params[column] = {'min': scaler.data_min_[0], 'max': scaler.data_max_[0]}
print(column_params)
print(df.head())


def create_dataset(data, days_for_train=10, days_for_predict=3) -> Tuple[np.ndarray, np.ndarray]:
    """Cut a sequence into (input window, following target window) pairs.

    Sample ``i`` uses ``data[i : i + days_for_train]`` as input and the next
    ``days_for_predict`` values as target. Every window that fits entirely
    inside *data* is produced, including the last one.

    Args:
        data: One-dimensional sequence (e.g. a pandas Series or a list).
        days_for_train: Length of each input window.
        days_for_predict: Length of each target window.

    Returns:
        ``(dataset_x, dataset_y)`` with shapes ``(n, days_for_train)`` and
        ``(n, days_for_predict)``. When *data* is too short for a single
        sample, ``n`` is 0 and the arrays keep their second dimension so they
        can still be concatenated with non-empty results.
    """
    values = np.asarray(data)
    # +1 so that the window ending exactly at the last element is included.
    n_samples = len(values) - days_for_train - days_for_predict + 1
    if n_samples <= 0:
        trailing = values.shape[1:]
        return (np.empty((0, days_for_train) + trailing, dtype=values.dtype),
                np.empty((0, days_for_predict) + trailing, dtype=values.dtype))

    dataset_x = np.array(
        [values[i:i + days_for_train] for i in range(n_samples)])
    dataset_y = np.array(
        [values[i + days_for_train:i + days_for_train + days_for_predict]
         for i in range(n_samples)])
    return (dataset_x, dataset_y)


# Split into x / y datasets with a window length of 10. The smallest unit is a
# single industry of a single city: samples are ordered industry by industry,
# and within each industry city by city (in order of first appearance).
city_frames = [df[df['地市'] == city] for city in df['地市'].unique()]

windows_x, windows_y = [], []
for industry in df.columns[2:]:
    for df_city in city_frames:
        x, y = create_dataset(df_city[industry])
        windows_x.append(x)
        windows_y.append(y)

# Concatenate once at the end instead of re-copying the arrays on every step.
dataset_x = np.concatenate(windows_x)
dataset_y = np.concatenate(windows_y)
print(dataset_x.shape, dataset_y.shape)