You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

128 lines
7.4 KiB
Python

import pandas as pd
import datetime
import os
import numpy as np
import torch
from torch import nn
def normalise(dataset_x, dataset_y):
    """Min-max scale inputs and targets using the range of the inputs.

    Both arrays are shifted by the minimum of ``dataset_x`` and divided by
    the range of ``dataset_x``. ``dataset_x`` therefore ends up in [0, 1];
    ``dataset_y`` is expressed on the same scale and may fall outside it.

    Args:
        dataset_x: Array-like of input values; it alone defines the range.
        dataset_y: Array-like of target values scaled with that same range.

    Returns:
        Tuple ``(scaled_x, scaled_y, max_value, min_value)`` where
        ``max_value`` and ``min_value`` are the extremes of ``dataset_x``.

    Raises:
        ValueError: If ``dataset_x`` is empty (raised by ``np.max``).
    """
    max_value = np.max(dataset_x)
    min_value = np.min(dataset_x)
    value_range = max_value - min_value
    if value_range == 0:
        # Constant input: dividing by the zero range would turn every value
        # into NaN. Use a unit divisor so the data is only shifted.
        value_range = 1
    dataset_x = (dataset_x - min_value) / value_range
    dataset_y = (dataset_y - min_value) / value_range
    return dataset_x, dataset_y, max_value, min_value
def create_data(df_industry, industry):
    """Build one (27-value input, 3-value target) sample per city and month.

    Rows are split by the '地市' column and then by calendar month of
    'stat_date'. For each month the target is the last three values of the
    ``industry`` column, and the input is a 27-value window chosen by the
    number of rows in that month:

    * 31 rows: values[1:28]
    * 30 rows: values[:27]
    * 28 rows: the first value three times, then values[1:25]
    * any other count (e.g. a 29-day February): the first value twice, then
      values[1:26]; if that does not give 27 values, the remaining months
      of the current city are skipped.

    Exactly one window is produced per month. The branches are mutually
    exclusive, so 30- and 31-day months do not also fall into the fallback.

    Args:
        df_industry: DataFrame with a '地市' column, a datetime-like
            'stat_date' column and the ``industry`` column.
            Assumes one row per day in chronological order - TODO confirm.
        industry: Name of the value column to sample.

    Returns:
        Tuple ``(dataset_x, dataset_y)`` of NumPy arrays built from the
        collected 27-value windows and 3-value targets.
    """
    dataset_x = []
    dataset_y = []
    for city in df_industry['地市'].drop_duplicates():
        city_rows = df_industry[df_industry['地市'] == city]
        monthly = city_rows.groupby(city_rows['stat_date'].dt.to_period('M'))
        # Iterate over each month's data.
        for _, group in monthly:
            values = group[industry].values
            n_rows = len(group)
            if n_rows == 31:
                window = list(values[1:28])
            elif n_rows == 30:
                window = list(values[:27])
            elif n_rows == 28:
                first = values[0]
                window = [first, first, first] + list(values[1:25])
            else:
                first = values[0]
                window = [first, first] + list(values[1:26])
                if len(window) != 27:
                    # Too few rows to fill the window: stop processing this
                    # city's remaining months (kept from the original logic).
                    break
            dataset_x.append(window)
            dataset_y.append(list(values[-3:]))
    return np.array(dataset_x), np.array(dataset_y)
# Load the merged industry table and divide every column from the third
# onward by 10000.
df = pd.read_csv('合并行业数据.csv')
df[df.columns[2:]] = df[df.columns[2:]] / 10000
# Keep the first 10 characters of each stripped date string, then parse
# them as year-month-day timestamps.
df['stat_date'] = pd.to_datetime(
    df['stat_date'].map(lambda raw: str(raw).strip()[:10]),
    format='%Y-%m-%d',
)
# Column names of `df`, partitioned into six groups. Further down, the
# samples of each group are pooled and min-max scaled with one shared range.
# NOTE(review): the grouping criterion is not visible here - presumably the
# columns are grouped by the magnitude of their values; confirm.
list_1 = ['1.煤炭开采和洗选业']
list_2 = ['2.石油和天然气开采业', '3.黑色金属矿采选业']
list_3 = ['2.林业', '4.有色金属矿采选业', '4.烟草制品业', '31.金属制品、机械和设备修理业', '2.燃气生产和供应业', '3.建筑安装业', '4.航空运输业', '5.管道运输业', '6.多式联运和运输代理业', '8.邮政业']
list_4 = ['第一产业', '一、农、林、牧、渔业', '1.农业', '3.畜牧业', '4.渔业', '5.农、林、牧、渔专业及辅助性活动', '(一)采矿业', '5.非金属矿采选业', '6.其他采矿业', '1.农副食品加工业', '2.食品制造业', '3.酒、饮料及精制茶制造业', '8.木材加工和木、竹、藤、棕、草制品业', '9.家具制造业', '11.印刷和记录媒介复制业', '25.铁路、船舶、航空航天和其他运输设备制造业', '28.仪器仪表制造业', '30.废弃资源综合利用业', '1.房屋建筑业', '4.建筑装饰、装修和其他建筑业', '1.铁路运输业', '2.道路运输业', '3.水上运输业', '7.装卸搬运和仓储业', '2.互联网和相关服务', '3.软件和信息技术服务业', '八、金融业', '1.科学研究和技术服务业', '3.居民服务、修理和其他服务业']
list_5 = ['城镇居民', '6.纺织服装、服饰业', '7.皮革、毛皮、羽毛及其制品和制鞋业', '10.造纸和纸制品业', '12.文教、工美、体育和娱乐用品制造业', '13.石油、煤炭及其他燃料加工业', '14.化学原料和化学制品制造业', '15.医药制造业', '16.化学纤维制造业', '17.橡胶和塑料制品业', '18.非金属矿物制品业', '19.黑色金属冶炼和压延加工业', '20.有色金属冶炼和压延加工业', '21.金属制品业', '22.通用设备制造业', '23.专用设备制造业', '24.汽车制造业', '26.电气机械和器材制造业', '27.计算机、通信和其他电子设备制造业', '29.其他制造业', '(三)电力、热力、燃气及水的生产和供应业', '1.电力、热力生产和供应业', '3.水的生产和供应业', '三、建筑业', '2.土木工程建筑业', '四、交通运输、仓储和邮政业', '五、信息传输、软件和信息技术服务业', '1.电信、广播电视和卫星传输服务', '六、批发和零售业', '七、住宿和餐饮业', '九、房地产业', '十、租赁和商务服务业', '十一、公共服务及管理组织', '2.水利、环境和公共设施管理业', '4.教育、文化、体育和娱乐业', '5.卫生和社会工作', '6.公共管理和社会组织、国际组织']
list_6 = ['全社会用电总计', 'a、全行业用电合计', '第二产业', '第三产业', 'b、城乡居民生活用电合计', '乡村居民', '二、工业', '(二)制造业', '5.纺织业']
# Group 1: a single industry column, sampled and then min-max scaled on
# its own.
industry = '1.煤炭开采和洗选业'
df_industry = df[['地市', 'stat_date', industry]]
dataset_x1, dataset_y1 = create_data(df_industry=df_industry, industry=industry)
# Report the raw value range before scaling.
print('list_1:', dataset_x1.max(), dataset_x1.min())
# Only the scaled arrays are kept; the returned max/min are discarded.
dataset_x1, dataset_y1 = normalise(dataset_x=dataset_x1, dataset_y=dataset_y1)[:2]
# list_2: pool the samples of every industry column in list_2, then min-max
# scale the pooled arrays with one shared range.
industry = '2.石油和天然气开采业'
df_industry = df[['地市', 'stat_date', industry]]
dataset_x2, dataset_y2 = create_data(df_industry, industry)
for i in list_2[1:]:
    # Select the loop's own column `i`; using `industry` here would re-append
    # the first industry's samples on every pass and skip the other columns.
    df_industry = df[['地市', 'stat_date', i]]
    x, y = create_data(df_industry, i)
    dataset_x2 = np.concatenate([dataset_x2, x])
    dataset_y2 = np.concatenate([dataset_y2, y])
# Report the raw value range of the pooled inputs before scaling.
print('list_2:', np.max(dataset_x2), np.min(dataset_x2))
# Only the scaled arrays are kept; the returned max/min are discarded.
dataset_x2, dataset_y2 = normalise(dataset_x2, dataset_y2)[:2]
# list_3: pool the samples of every industry column in list_3, then min-max
# scale the pooled arrays with one shared range.
industry = '2.林业'
df_industry = df[['地市', 'stat_date', industry]]
dataset_x3, dataset_y3 = create_data(df_industry, industry)
for i in list_3[1:]:
    # Select the loop's own column `i`; using `industry` here would re-append
    # the first industry's samples on every pass and skip the other columns.
    df_industry = df[['地市', 'stat_date', i]]
    x, y = create_data(df_industry, i)
    dataset_x3 = np.concatenate([dataset_x3, x])
    dataset_y3 = np.concatenate([dataset_y3, y])
# Report the raw value range of the pooled inputs before scaling.
print('list_3:', np.max(dataset_x3), np.min(dataset_x3))
# Only the scaled arrays are kept; the returned max/min are discarded.
dataset_x3, dataset_y3 = normalise(dataset_x3, dataset_y3)[:2]
# list_4: pool the samples of every industry column in list_4, then min-max
# scale the pooled arrays with one shared range.
industry = list_4[0]
df_industry = df[['地市', 'stat_date', industry]]
dataset_x4, dataset_y4 = create_data(df_industry, industry)
for i in list_4[1:]:
    # Select the loop's own column `i`; using `industry` here would re-append
    # the first industry's samples on every pass and skip the other columns.
    df_industry = df[['地市', 'stat_date', i]]
    x, y = create_data(df_industry, i)
    dataset_x4 = np.concatenate([dataset_x4, x])
    dataset_y4 = np.concatenate([dataset_y4, y])
# Report the raw value range of the pooled inputs before scaling.
print('list_4:', np.max(dataset_x4), np.min(dataset_x4))
# Only the scaled arrays are kept; the returned max/min are discarded.
dataset_x4, dataset_y4 = normalise(dataset_x4, dataset_y4)[:2]
# list_5: pool the samples of every industry column in list_5, then min-max
# scale the pooled arrays with one shared range.
industry = list_5[0]
df_industry = df[['地市', 'stat_date', industry]]
dataset_x5, dataset_y5 = create_data(df_industry, industry)
for i in list_5[1:]:
    # Select the loop's own column `i`; using `industry` here would re-append
    # the first industry's samples on every pass and skip the other columns.
    df_industry = df[['地市', 'stat_date', i]]
    x, y = create_data(df_industry, i)
    dataset_x5 = np.concatenate([dataset_x5, x])
    dataset_y5 = np.concatenate([dataset_y5, y])
# Report the raw value range of the pooled inputs before scaling.
print('list_5:', np.max(dataset_x5), np.min(dataset_x5))
# Only the scaled arrays are kept; the returned max/min are discarded.
dataset_x5, dataset_y5 = normalise(dataset_x5, dataset_y5)[:2]
# list_6: pool the samples of every industry column in list_6, then min-max
# scale the pooled arrays with one shared range.
industry = list_6[0]
df_industry = df[['地市', 'stat_date', industry]]
dataset_x6, dataset_y6 = create_data(df_industry, industry)
for i in list_6[1:]:
    # Select the loop's own column `i`; using `industry` here would re-append
    # the first industry's samples on every pass and skip the other columns.
    df_industry = df[['地市', 'stat_date', i]]
    x, y = create_data(df_industry, i)
    dataset_x6 = np.concatenate([dataset_x6, x])
    dataset_y6 = np.concatenate([dataset_y6, y])
# Report the raw value range of the pooled inputs before scaling.
print('list_6:', np.max(dataset_x6), np.min(dataset_x6))
# Only the scaled arrays are kept; the returned max/min are discarded.
dataset_x6, dataset_y6 = normalise(dataset_x6, dataset_y6)[:2]
# Join the six separately scaled groups into the final input and target
# arrays, then show their shapes (targets first).
dataset_x = np.concatenate((
    dataset_x1,
    dataset_x2,
    dataset_x3,
    dataset_x4,
    dataset_x5,
    dataset_x6,
))
dataset_y = np.concatenate((
    dataset_y1,
    dataset_y2,
    dataset_y3,
    dataset_y4,
    dataset_y5,
    dataset_y6,
))
print(dataset_y.shape, dataset_x.shape)