删除重复数据集
parent
b782ac9193
commit
6308d6d555
@ -1,4 +1,4 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project version="4">
|
<project version="4">
|
||||||
<component name="ProjectRootManager" version="2" project-jdk-name="C:\anaconda\envs\pytorch" project-jdk-type="Python SDK" />
|
<component name="ProjectRootManager" version="2" project-jdk-name="pytorch_gpu" project-jdk-type="Python SDK" />
|
||||||
</project>
|
</project>
|
@ -0,0 +1,128 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import datetime
|
||||||
|
import os
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from torch import nn
|
||||||
|
|
||||||
|
def normalise(dataset_x, dataset_y):
    """Min-max scale ``dataset_x`` and apply the same scaling to ``dataset_y``.

    The minimum and maximum are taken from ``dataset_x`` only, so values of
    ``dataset_y`` may fall outside ``[0, 1]`` after scaling.

    Args:
        dataset_x: Array-like of input values; defines the scaling range.
        dataset_y: Array-like of target values scaled with the same range.

    Returns:
        Tuple ``(dataset_x, dataset_y, max_value, min_value)`` holding the
        scaled arrays and the statistics needed to undo the scaling.

    Raises:
        ValueError: If ``dataset_x`` is empty (raised by ``np.max``).
    """
    max_value = np.max(dataset_x)
    min_value = np.min(dataset_x)

    # A constant input has a zero range; dividing by it would fill the result
    # with NaN/inf. Use a unit range instead so the data is only shifted.
    value_range = max_value - min_value
    if value_range == 0:
        value_range = 1

    dataset_x = (dataset_x - min_value) / value_range
    dataset_y = (dataset_y - min_value) / value_range
    return dataset_x, dataset_y, max_value, min_value
|
||||||
|
def create_data(df_industry, industry):
    """Build 27-value input / 3-value target samples, one per city and month.

    Rows are split by the '地市' column and then by calendar month of
    'stat_date'. Each month contributes one sample whose target is the last
    three values of the month and whose input window depends on the number
    of rows in the month:

    * 31 rows: ``values[1:28]`` (the first value is dropped);
    * 30 rows: ``values[:27]``;
    * 28 rows: the first value repeated three times, then ``values[1:25]``;
    * any other count: the first value repeated twice, then ``values[1:26]``.

    The cases are mutually exclusive, so every month yields at most one
    sample. (With independent ``if`` statements the fallback case would also
    run for 30- and 31-row months and append a second, misaligned sample.)

    Args:
        df_industry: DataFrame with a '地市' column, a datetime-like
            'stat_date' column (the ``.dt`` accessor is used) and the
            ``industry`` column.
        industry: Name of the value column to sample.

    Returns:
        Tuple ``(dataset_x, dataset_y)`` of NumPy arrays built from the
        collected input windows and targets.
    """
    dataset_x = []
    dataset_y = []
    for city in df_industry['地市'].drop_duplicates():
        data = df_industry[df_industry['地市'] == city]
        grouped = data.groupby(data['stat_date'].dt.to_period('M'))

        # Iterate over each month's data.
        # NOTE(review): rows are used in their existing order within a month;
        # presumably the input is already sorted by date — confirm.
        for _, group in grouped:
            values = group[industry].values
            n_rows = len(values)

            if n_rows == 31:
                window = list(values[1:28])
            elif n_rows == 30:
                window = list(values[:27])
            elif n_rows == 28:
                window = [values[0]] * 3 + list(values[1:25])
            else:
                window = [values[0]] * 2 + list(values[1:26])
                if len(window) != 27:
                    # Too few rows to fill the window. This stops processing
                    # the remaining months of the current city as well.
                    # NOTE(review): `continue` may have been intended — confirm.
                    break

            dataset_x.append(window)
            dataset_y.append(list(values[-3:]))

    return np.array(dataset_x), np.array(dataset_y)
|
||||||
|
|
||||||
|
|
||||||
|
# Load the merged per-industry table.
df = pd.read_csv('合并行业数据.csv')

# Divide every column after the first two by 10000.
# NOTE(review): presumably the first two columns are '地市' and 'stat_date'
# and the rest are numeric readings — confirm against the CSV.
df[df.columns[2:]] = df[df.columns[2:]] / 10000

# Keep only the first 10 characters of each stripped date string
# (the 'YYYY-MM-DD' part) and parse it as a datetime.
date_text = df['stat_date'].map(lambda raw: str(raw).strip()[:10])
df['stat_date'] = pd.to_datetime(date_text, format='%Y-%m-%d')
|
||||||
|
# Column names of the table, partitioned into six groups. The samples of each
# group are pooled and normalised together further down in this script.
# NOTE(review): the grouping criterion is not visible here — presumably
# columns of similar magnitude; confirm.
list_1 = [
    '1.煤炭开采和洗选业',
]
list_2 = [
    '2.石油和天然气开采业',
    '3.黑色金属矿采选业',
]
list_3 = [
    '2.林业',
    '4.有色金属矿采选业',
    '4.烟草制品业',
    '31.金属制品、机械和设备修理业',
    '2.燃气生产和供应业',
    '3.建筑安装业',
    '4.航空运输业',
    '5.管道运输业',
    '6.多式联运和运输代理业',
    '8.邮政业',
]
list_4 = [
    '第一产业',
    '一、农、林、牧、渔业',
    '1.农业',
    '3.畜牧业',
    '4.渔业',
    '5.农、林、牧、渔专业及辅助性活动',
    '(一)采矿业',
    '5.非金属矿采选业',
    '6.其他采矿业',
    '1.农副食品加工业',
    '2.食品制造业',
    '3.酒、饮料及精制茶制造业',
    '8.木材加工和木、竹、藤、棕、草制品业',
    '9.家具制造业',
    '11.印刷和记录媒介复制业',
    '25.铁路、船舶、航空航天和其他运输设备制造业',
    '28.仪器仪表制造业',
    '30.废弃资源综合利用业',
    '1.房屋建筑业',
    '4.建筑装饰、装修和其他建筑业',
    '1.铁路运输业',
    '2.道路运输业',
    '3.水上运输业',
    '7.装卸搬运和仓储业',
    '2.互联网和相关服务',
    '3.软件和信息技术服务业',
    '八、金融业',
    '1.科学研究和技术服务业',
    '3.居民服务、修理和其他服务业',
]
list_5 = [
    '城镇居民',
    '6.纺织服装、服饰业',
    '7.皮革、毛皮、羽毛及其制品和制鞋业',
    '10.造纸和纸制品业',
    '12.文教、工美、体育和娱乐用品制造业',
    '13.石油、煤炭及其他燃料加工业',
    '14.化学原料和化学制品制造业',
    '15.医药制造业',
    '16.化学纤维制造业',
    '17.橡胶和塑料制品业',
    '18.非金属矿物制品业',
    '19.黑色金属冶炼和压延加工业',
    '20.有色金属冶炼和压延加工业',
    '21.金属制品业',
    '22.通用设备制造业',
    '23.专用设备制造业',
    '24.汽车制造业',
    '26.电气机械和器材制造业',
    '27.计算机、通信和其他电子设备制造业',
    '29.其他制造业',
    '(三)电力、热力、燃气及水的生产和供应业',
    '1.电力、热力生产和供应业',
    '3.水的生产和供应业',
    '三、建筑业',
    '2.土木工程建筑业',
    '四、交通运输、仓储和邮政业',
    '五、信息传输、软件和信息技术服务业',
    '1.电信、广播电视和卫星传输服务',
    '六、批发和零售业',
    '七、住宿和餐饮业',
    '九、房地产业',
    '十、租赁和商务服务业',
    '十一、公共服务及管理组织',
    '2.水利、环境和公共设施管理业',
    '4.教育、文化、体育和娱乐业',
    '5.卫生和社会工作',
    '6.公共管理和社会组织、国际组织',
]
list_6 = [
    '全社会用电总计',
    'a、全行业用电合计',
    '第二产业',
    '第三产业',
    'b、城乡居民生活用电合计',
    '乡村居民',
    '二、工业',
    '(二)制造业',
    '5.纺织业',
]
|
||||||
|
|
||||||
|
|
||||||
|
# list_1
# Single-industry group: build its samples, report the raw value range, then
# keep only the scaled arrays returned by normalise().
industry = '1.煤炭开采和洗选业'
df_industry = df[['地市', 'stat_date', industry]]
raw_x1, raw_y1 = create_data(df_industry, industry)
print('list_1:', np.max(raw_x1), np.min(raw_x1))
scaled_1 = normalise(raw_x1, raw_y1)
dataset_x1, dataset_y1 = scaled_1[0], scaled_1[1]
|
||||||
|
|
||||||
|
# list_2
# Build samples for every industry in list_2 and pool them, so the whole
# group is normalised with one shared min/max. The loop variable itself
# selects the column; reusing the first industry's name inside the loop would
# only append copies of the first industry's samples.
group_x, group_y = [], []
for industry in list_2:
    df_industry = df[['地市', 'stat_date', industry]]
    x, y = create_data(df_industry, industry)
    group_x.append(x)
    group_y.append(y)
dataset_x2 = np.concatenate(group_x)
dataset_y2 = np.concatenate(group_y)
print('list_2:', np.max(dataset_x2), np.min(dataset_x2))
dataset_x2, dataset_y2 = normalise(dataset_x2, dataset_y2)[:2]
|
||||||
|
|
||||||
|
# list_3
# Build samples for every industry in list_3 and pool them, so the whole
# group is normalised with one shared min/max. The loop variable itself
# selects the column; reusing the first industry's name inside the loop would
# only append copies of the first industry's samples.
group_x, group_y = [], []
for industry in list_3:
    df_industry = df[['地市', 'stat_date', industry]]
    x, y = create_data(df_industry, industry)
    group_x.append(x)
    group_y.append(y)
dataset_x3 = np.concatenate(group_x)
dataset_y3 = np.concatenate(group_y)
print('list_3:', np.max(dataset_x3), np.min(dataset_x3))
dataset_x3, dataset_y3 = normalise(dataset_x3, dataset_y3)[:2]
|
||||||
|
|
||||||
|
# list_4
# Build samples for every industry in list_4 and pool them, so the whole
# group is normalised with one shared min/max. The loop variable itself
# selects the column; reusing the first industry's name inside the loop would
# only append copies of the first industry's samples.
group_x, group_y = [], []
for industry in list_4:
    df_industry = df[['地市', 'stat_date', industry]]
    x, y = create_data(df_industry, industry)
    group_x.append(x)
    group_y.append(y)
dataset_x4 = np.concatenate(group_x)
dataset_y4 = np.concatenate(group_y)
print('list_4:', np.max(dataset_x4), np.min(dataset_x4))
dataset_x4, dataset_y4 = normalise(dataset_x4, dataset_y4)[:2]
|
||||||
|
|
||||||
|
# list_5
# Build samples for every industry in list_5 and pool them, so the whole
# group is normalised with one shared min/max. The loop variable itself
# selects the column; reusing the first industry's name inside the loop would
# only append copies of the first industry's samples.
group_x, group_y = [], []
for industry in list_5:
    df_industry = df[['地市', 'stat_date', industry]]
    x, y = create_data(df_industry, industry)
    group_x.append(x)
    group_y.append(y)
dataset_x5 = np.concatenate(group_x)
dataset_y5 = np.concatenate(group_y)
print('list_5:', np.max(dataset_x5), np.min(dataset_x5))
dataset_x5, dataset_y5 = normalise(dataset_x5, dataset_y5)[:2]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# list_6
# Build samples for every industry in list_6 and pool them, so the whole
# group is normalised with one shared min/max. The loop variable itself
# selects the column; reusing the first industry's name inside the loop would
# only append copies of the first industry's samples.
group_x, group_y = [], []
for industry in list_6:
    df_industry = df[['地市', 'stat_date', industry]]
    x, y = create_data(df_industry, industry)
    group_x.append(x)
    group_y.append(y)
dataset_x6 = np.concatenate(group_x)
dataset_y6 = np.concatenate(group_y)
print('list_6:', np.max(dataset_x6), np.min(dataset_x6))
dataset_x6, dataset_y6 = normalise(dataset_x6, dataset_y6)[:2]
|
||||||
|
|
||||||
|
# Stack the six separately normalised groups into the final input and target
# arrays, then report their shapes (targets first, inputs second).
all_inputs = (
    dataset_x1,
    dataset_x2,
    dataset_x3,
    dataset_x4,
    dataset_x5,
    dataset_x6,
)
all_targets = (
    dataset_y1,
    dataset_y2,
    dataset_y3,
    dataset_y4,
    dataset_y5,
    dataset_y6,
)
dataset_x = np.concatenate(all_inputs)
dataset_y = np.concatenate(all_targets)
print(dataset_y.shape, dataset_x.shape)
|
Loading…
Reference in New Issue