import os

import numpy as np
import pandas as pd

# --- NumPy scratch checks ---------------------------------------------------
n1 = np.array([[1, 1, 1]])
n2 = np.array([1, 1, 1]).reshape(1, -1)
print(n2)
n2 = np.array([]).reshape(3, -1)
print(np.max([[1, 2, 3], [4, 5, 6]]))

# --- Load the first workbook found in the data directory ---------------------
file_dir = r'../浙江电压等级电量/浙江各地市分电压日电量数据'
# NOTE(review): os.listdir() returns entries in arbitrary order, so "[0]" is
# not guaranteed to pick the same workbook on every machine — confirm intent.
excel = os.listdir(file_dir)[0]
df = pd.read_excel(os.path.join(file_dir, excel), sheet_name=0, index_col='stat_date')
df.columns = df.columns.map(lambda name: name.strip())
df.index = pd.to_datetime(df.index)
df = df.sort_index()
# Keep the 2021-01 .. 2023-08 window, then discard its last three rows.
df = df.loc['2021-01':'2023-08'][:-3]
print(df.tail())

# Disabled experiment: split the frame per city and export one workbook each.
# for city in df['地市'].drop_duplicates():
#     df_city = df[df['地市']== city]
#     df_city['stat_date'] = df_city['stat_date'].map(lambda x:x.strip())
#     df_city['stat_date'] = pd.to_datetime(df_city['stat_date'],format='%Y-%m-%d')
#     df_city = df_city[df_city.columns[:-1]]
#     df_city.sort_values(by='stat_date',ascending=True,inplace=True)
#     df_city['stat_date'] = df_city['stat_date'].astype('str')
#     df_city.to_excel(fr'C:\Users\user\Desktop\浙江各地市分电压日电量数据\{city}.xlsx',index=False)

# Disabled experiment: append the 2023-10-28 rows to each per-city workbook.
# file_Dir = r'C:\Users\鸽子\Desktop\浙江各地市行业电量数据'
# for excel in os.listdir(file_Dir):
#     df1 = pd.read_excel(r'C:\Users\鸽子\Desktop\浙江各地市日电量数据-27-28).xlsx',sheet_name=1)
#     df1.columns = df1.columns.map(lambda x:x.strip())
#     df2 = pd.read_excel(os.path.join(file_Dir,excel))
#     df2['地市'] = df2['地市'].map(lambda x:x.strip())
#     city = df2['地市'].iloc[0]
#     col_list = df2.columns
#     df1 = df1[col_list]
#     df1 = df1[(df1['stat_date']==20231028)&(df1['地市']==city)]
#     df1['stat_date'] = pd.to_datetime(df1['stat_date'],format='%Y%m%d')
#     df2 = pd.concat((df2,df1),ignore_index=True)
#     df2.to_excel(fr'C:\Users\鸽子\Desktop\浙江各地市行业电量数据\{city}.xlsx')

pd.set_option('display.width', None)


def normal(df):
    """Drop mostly-zero columns and smooth IQR outliers, modifying ``df`` in place.

    Steps:

    1. Every column in which at least half of the rows equal ``0`` is dropped.
    2. For every remaining column for which ``describe()`` reports quartiles,
       values strictly outside the Tukey fences
       ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are replaced by the previous
       valid value (forward fill), falling back to the next valid value
       (backward fill) for leading gaps. Pre-existing NaNs are filled the
       same way.

    Columns without quartiles (e.g. text columns) are left untouched.

    Args:
        df: The DataFrame to clean. It is mutated in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to inspect; the zero-ratio test below would divide by zero.
        return df

    mostly_zero = [
        col for col in df.columns
        if (df[col] == 0).sum() / n_rows >= 0.5
    ]
    df.drop(columns=mostly_zero, inplace=True)

    for col in df.columns:
        stats = df[col].describe()
        try:
            q1, q3 = stats['25%'], stats['75%']
        except KeyError:
            # describe() has no quartile entries for this column, so the
            # outlier rule does not apply. Any other error now propagates
            # instead of being silently swallowed.
            continue

        fence = 1.5 * (q3 - q1)
        low, high = q1 - fence, q3 + fence

        # Strict comparisons: values sitting exactly on a fence are kept.
        # With inclusive comparisons a column whose quartiles coincide
        # (IQR == 0) had every typical value blanked out, leaving the whole
        # column NaN.
        series = df[col]
        cleaned = series.mask((series > high) | (series < low))
        df[col] = cleaned.ffill().bfill()

    return df


# Disabled experiment: clean one per-city workbook and bucket its columns by
# the magnitude of their mean.
# file_dir = './浙江各地市行业电量数据'
# city1 = os.listdir(file_dir)[0]
# df_city = pd.read_excel(os.path.join(file_dir, city1))
# df_city = normal(df_city)
# df_city = df_city.drop(columns='地市')
# df_city[df_city.columns[1:]] /= 10000
# df_city['stat_date'] = df_city['stat_date'].map(lambda x: str(x).strip()[:10])
# df_city.stat_date = pd.to_datetime(df_city.stat_date)
# print(df_city.describe())
#
# list_1000 = []
# list_100 = []
# list_10 = []
# list_1 = []
# for i in df_city.columns[1:]:
#     if df_city[i].describe()['mean']>=1000:
#         list_1000.append(i)
#     if df_city[i].describe()['mean'] < 1000 and df_city[i].describe()['mean'] >= 100:
#         list_100.append(i)
#     if df_city[i].describe()['mean'] < 100 and df_city[i].describe()['mean'] >= 10:
#         list_10.append(i)
#     else:
#         list_1.append(i)
# print('list_1:',list_1)
# print('list_10:',list_10)
# print('list_100:',list_100)
# print('list_1000:',list_1000)

# --- Outlier-detection demo ---------------------------------------------------
# Build a small DataFrame with two obvious outliers in column 'A'.
data = pd.DataFrame({'A': [1, 2, 3000, 4, 500], 'B': [10, 20, 30, 40, 50]})
Q1 = data['A'].quantile(0.25)
Q3 = data['A'].quantile(0.75)
IQR = Q3 - Q1
lower_threshold = Q1 - 1.5 * IQR
upper_threshold = Q3 + 1.5 * IQR
# Flag the values that fall outside the Tukey fences.
outliers = (data['A'] < lower_threshold) | (data['A'] > upper_threshold)
print(outliers)
# shift(1) moves every value down one row, i.e. exposes the previous value
# that could be used to replace an outlier with its neighbour.
print(data['A'].shift(1))

# --- DataFrame.where demo -----------------------------------------------------
data = {'A': [1, 2, 3, 4, 5], 'B': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)
# where() keeps entries where the condition holds and replaces the others
# with `other` (-1 here).
condition = df['A'] > 3
df_new = df.where(condition, other=-1)
print("原始数据:")
print(df)
print("\n根据条件替换后的数据:")
print(df_new)