import pandas as pd
import datetime
import os
import numpy as np
import torch
from torch import nn

# Every sample is a fixed-length input window plus the last days of the month.
_WINDOW_LEN = 27
_TARGET_LEN = 3


def normalise(dataset_x, dataset_y):
    """Min-max scale both arrays using the minimum and maximum of ``dataset_x``.

    Args:
        dataset_x: Array of input windows; its min and max define the scale.
        dataset_y: Array of targets, scaled with the same min and max.

    Returns:
        Tuple ``(scaled_x, scaled_y, max_value, min_value)``.
    """
    max_value = np.max(dataset_x)
    min_value = np.min(dataset_x)
    span = max_value - min_value
    if span == 0:
        # Constant input: avoid 0/0 producing NaN; everything maps to 0.
        span = 1
    dataset_x = (dataset_x - min_value) / span
    dataset_y = (dataset_y - min_value) / span
    return dataset_x, dataset_y, max_value, min_value


def create_data(df_industry, industry):
    """Build one (27-value window, last-3-values target) sample per month.

    Rows are split by the '地市' column and then by calendar month of
    'stat_date'. The window depends on how many rows the month has:

    * 31 rows: values[1:28]
    * 30 rows: values[:27]
    * 28 rows: first value repeated 3 times + values[1:25]
    * otherwise: first value repeated twice + values[1:26]; the month is
      skipped when that does not yield exactly 27 values.

    The target is always the last 3 values of the month.

    Args:
        df_industry: DataFrame with columns '地市', 'stat_date' (datetime)
            and ``industry``.
        industry: Name of the value column to read.

    Returns:
        Tuple ``(x, y)`` of arrays shaped (n, 27) and (n, 3).
    """
    dataset_x = []
    dataset_y = []
    for city in df_industry['地市'].drop_duplicates():
        data = df_industry[df_industry['地市'] == city]
        grouped = data.groupby(data['stat_date'].dt.to_period('M'))
        # Iterate over each month's data.
        for _, group in grouped:
            values = group[industry].values
            n_rows = len(values)
            # elif chain: each month must contribute exactly one sample.
            if n_rows == 31:
                window = list(values[1:28])
            elif n_rows == 30:
                window = list(values[:27])
            elif n_rows == 28:
                first = values[0]
                window = [first, first, first] + list(values[1:25])
            else:
                first = values[0]
                window = [first, first] + list(values[1:26])
                if len(window) != _WINDOW_LEN:
                    # Unusable month: skip it but keep the later months.
                    continue
            dataset_x.append(window)
            dataset_y.append(list(values[-_TARGET_LEN:]))
    # reshape keeps empty results 2-D so they can still be concatenated.
    return (
        np.array(dataset_x).reshape(-1, _WINDOW_LEN),
        np.array(dataset_y).reshape(-1, _TARGET_LEN),
    )


def _build_group_dataset(frame, industries, label):
    """Stack samples of every industry in a group, print the raw range, normalise."""
    xs = []
    ys = []
    for name in industries:
        x, y = create_data(frame[['地市', 'stat_date', name]], name)
        xs.append(x)
        ys.append(y)
    group_x = np.concatenate(xs)
    group_y = np.concatenate(ys)
    # Report the raw range before scaling.
    print(label, np.max(group_x), np.min(group_x))
    return normalise(group_x, group_y)[:2]


df = pd.read_csv('合并行业数据.csv')
df[df.columns[2:]] /= 10000
df['stat_date'] = df['stat_date'].map(lambda x: str(x).strip()[:10])
df['stat_date'] = pd.to_datetime(df['stat_date'], format='%Y-%m-%d')

# Industry columns split into six groups; each group is normalised with its
# own shared min/max.
list_1 = ['1.煤炭开采和洗选业']
list_2 = ['2.石油和天然气开采业', '3.黑色金属矿采选业']
list_3 = [
    '2.林业', '4.有色金属矿采选业', '4.烟草制品业', '31.金属制品、机械和设备修理业',
    '2.燃气生产和供应业', '3.建筑安装业', '4.航空运输业', '5.管道运输业',
    '6.多式联运和运输代理业', '8.邮政业',
]
list_4 = [
    '第一产业', '一、农、林、牧、渔业', '1.农业', '3.畜牧业', '4.渔业',
    '5.农、林、牧、渔专业及辅助性活动', '(一)采矿业', '5.非金属矿采选业',
    '6.其他采矿业', '1.农副食品加工业', '2.食品制造业', '3.酒、饮料及精制茶制造业',
    '8.木材加工和木、竹、藤、棕、草制品业', '9.家具制造业', '11.印刷和记录媒介复制业',
    '25.铁路、船舶、航空航天和其他运输设备制造业', '28.仪器仪表制造业',
    '30.废弃资源综合利用业', '1.房屋建筑业',
    '4.建筑装饰、装修和其他建筑业', '1.铁路运输业', '2.道路运输业', '3.水上运输业',
    '7.装卸搬运和仓储业', '2.互联网和相关服务', '3.软件和信息技术服务业', '八、金融业',
    '1.科学研究和技术服务业', '3.居民服务、修理和其他服务业',
]
list_5 = [
    '城镇居民', '6.纺织服装、服饰业', '7.皮革、毛皮、羽毛及其制品和制鞋业',
    '10.造纸和纸制品业', '12.文教、工美、体育和娱乐用品制造业',
    '13.石油、煤炭及其他燃料加工业', '14.化学原料和化学制品制造业', '15.医药制造业',
    '16.化学纤维制造业', '17.橡胶和塑料制品业', '18.非金属矿物制品业',
    '19.黑色金属冶炼和压延加工业', '20.有色金属冶炼和压延加工业', '21.金属制品业',
    '22.通用设备制造业', '23.专用设备制造业', '24.汽车制造业',
    '26.电气机械和器材制造业', '27.计算机、通信和其他电子设备制造业', '29.其他制造业',
    '(三)电力、热力、燃气及水的生产和供应业', '1.电力、热力生产和供应业',
    '3.水的生产和供应业', '三、建筑业', '2.土木工程建筑业',
    '四、交通运输、仓储和邮政业', '五、信息传输、软件和信息技术服务业',
    '1.电信、广播电视和卫星传输服务', '六、批发和零售业', '七、住宿和餐饮业',
    '九、房地产业', '十、租赁和商务服务业', '十一、公共服务及管理组织',
    '2.水利、环境和公共设施管理业', '4.教育、文化、体育和娱乐业', '5.卫生和社会工作',
    '6.公共管理和社会组织、国际组织',
]
list_6 = [
    '全社会用电总计', 'a、全行业用电合计', '第二产业', '第三产业',
    'b、城乡居民生活用电合计', '乡村居民', '二、工业', '(二)制造业', '5.纺织业',
]

# Each group now iterates over every industry it lists (the earlier loops
# re-read the group's first column on every iteration).
dataset_x1, dataset_y1 = _build_group_dataset(df, list_1, 'list_1:')
dataset_x2, dataset_y2 = _build_group_dataset(df, list_2, 'list_2:')
dataset_x3, dataset_y3 = _build_group_dataset(df, list_3, 'list_3:')
dataset_x4, dataset_y4 = _build_group_dataset(df, list_4, 'list_4:')
dataset_x5, dataset_y5 = _build_group_dataset(df, list_5, 'list_5:')
dataset_x6, dataset_y6 = _build_group_dataset(df, list_6, 'list_6:')

dataset_x = np.concatenate(
    [dataset_x1, dataset_x2, dataset_x3, dataset_x4, dataset_x5, dataset_x6]
)
dataset_y = np.concatenate(
    [dataset_y1, dataset_y2, dataset_y3, dataset_y4, dataset_y5, dataset_y6]
)
print(dataset_y.shape, dataset_x.shape)