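# NOTE: the following lines are repository web-page residue, not part of the program.
# You cannot select more than 25 topics. Topics must start with a letter or number,
# can include dashes ('-') and can be up to 35 characters long.
#
# 77 lines
# 2.6 KiB
# Python
#
# 10 months ago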
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
def normal(data):
    """Return a boolean mask marking the values of *data* that are not outliers.

    A value counts as normal when it lies inside the Tukey fences
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``, where ``Q1``/``Q3`` are the 25th and
    75th percentiles of *data* and ``IQR = Q3 - Q1``.

    Args:
        data: A numeric ``pandas.Series``.

    Returns:
        A boolean Series aligned with *data*; ``True`` for in-range values.
        Missing values compare as ``False``.
    """
    # Compute each quartile once instead of calling ``describe()`` repeatedly;
    # ``describe()`` reports the same linearly-interpolated percentiles.
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    high = q3 + 1.5 * iqr
    low = q1 - 1.5 * iqr
    return (data >= low) & (data <= high)
# One-off preprocessing, kept commented out for reference: it writes the CSV
# file that is loaded right after this block.
#
# file_dir = './浙江各地市行业电量数据'
#
# # Merge the data of the 11 cities
# df = pd.DataFrame({})
# for city in os.listdir(file_dir):
#
#     df_city = pd.read_excel(os.path.join(file_dir, city))
#
#     # For every industry of every city, blank out outliers and back-fill them
#     for industry in df_city.columns[2:]:
#         outliers_index = normal(df_city[industry]).index
#         df_city[industry] = df_city[industry].where(normal(df_city[industry]), other=np.nan).bfill()
#         df_city[industry].fillna(method='ffill',inplace=True)
#     df = pd.concat([df,df_city])
# print(df.shape)
#
# df.to_csv('11市行业数据(已处理异常).csv',index=False,encoding='GBK')
# Load the merged per-city industry data (GBK-encoded CSV).
df = pd.read_csv('11市行业数据(已处理异常).csv', encoding='gbk')

# Total number of missing cells in the whole frame.
print(df.isnull().sum().sum())
print(df.describe())

# Min-max normalise every industry column (all columns after the first two)
# and remember each column's original min/max so the scaling can be inverted.
column_params = {}
for column in df.columns[2:]:
    scaler = MinMaxScaler()
    # fit_transform returns an (n, 1) array; flatten it for column assignment.
    df[column] = scaler.fit_transform(df[[column]]).ravel()
    column_params[column] = {'min': scaler.data_min_[0], 'max': scaler.data_max_[0]}
# NOTE(review): the original indentation was lost; this print is assumed to
# run once after the loop rather than on every iteration - confirm.
print(column_params)
print(df.head())
def create_dataset(data, days_for_train=10, days_for_predict=3) -> "tuple[np.ndarray, np.ndarray]":
    """Cut a sequence into sliding (input window, target window) pairs.

    Window ``i`` uses ``data[i : i + days_for_train]`` as input and the
    ``days_for_predict`` values that immediately follow it as target.

    Args:
        data: Ordered observations (list, ndarray or pandas Series).
        days_for_train: Length of each input window.
        days_for_predict: Length of each target window (defaults to 3, the
            horizon that used to be hard-coded).

    Returns:
        ``(dataset_x, dataset_y)`` with shapes ``(n, days_for_train)`` and
        ``(n, days_for_predict)`` for 1-D input. When *data* is too short for
        a single pair, ``n`` is 0 but the trailing dimensions are kept so the
        result can still be concatenated with other results.
    """
    # Work on a plain array so slicing is always positional.
    values = np.asarray(data)

    # The ``+ 1`` keeps the final complete window, whose target ends exactly
    # at the last observation; the previous bound silently dropped it.
    n_windows = len(values) - days_for_train - days_for_predict + 1
    if n_windows <= 0:
        tail = values.shape[1:]
        return (
            np.empty((0, days_for_train) + tail, dtype=values.dtype),
            np.empty((0, days_for_predict) + tail, dtype=values.dtype),
        )

    dataset_x = np.array(
        [values[i:i + days_for_train] for i in range(n_windows)]
    )
    dataset_y = np.array(
        [
            values[i + days_for_train:i + days_for_train + days_for_predict]
            for i in range(n_windows)
        ]
    )
    return (dataset_x, dataset_y)
# Build the x/y training windows (create_dataset's default window length is 10).
# The smallest unit is one industry of one city: windows never span two cities
# or two industries. Results are stacked industry by industry, and within an
# industry city by city.
cities = df['地市'].drop_duplicates().tolist()

x_parts, y_parts = [], []
for industry in df.columns[2:]:
    for city in cities:
        df_city_industry = df.loc[df['地市'] == city, industry]
        x, y = create_dataset(df_city_industry)
        # A series too short to yield a single window contributes nothing.
        if len(x) == 0:
            continue
        x_parts.append(x)
        y_parts.append(y)

# Concatenate once at the end instead of re-copying the arrays on every step.
dataset_x = np.concatenate(x_parts) if x_parts else np.array([])
dataset_y = np.concatenate(y_parts) if y_parts else np.array([])
print(dataset_x.shape, dataset_y.shape)