pandas库学习基础和业务经验（一）--重点推荐

xiaoxiao2022-07-05 181

近期用到的函数总结：

# Grouped sorting / per-group aggregation helper
import pandas as pd


def test_f(df, column, istopn=False, n=1):
    """Summarise or rank the rows of one group.

    Meant to be passed to ``DataFrame.groupby(...).apply``.

    Args:
        df: Data frame to process (a single group when used via ``apply``).
        column: Name of the column to aggregate or sort by.
        istopn: When truthy, return the rows sorted by ``column`` in
            descending order with an added 0-based ``row`` rank column.
            When falsy, return a one-row summary frame.
        n: Accepted for interface compatibility but not used; the
            "keep the first n rows" slice was never enabled in these notes.

    Returns:
        pandas.DataFrame: the ranked rows, or a single-row frame with the
        columns ``count``, ``distinct_count``, ``sum`` and ``avg``.
    """
    if istopn:
        # Flip ``ascending`` to rank in ascending order instead.
        # Append ``[:n]`` to the sort to keep only the first n rows.
        ranked = df.sort_values(by=column, ascending=False)
        ranked['row'] = range(len(ranked))
        return ranked

    values = df[column]
    return pd.DataFrame({
        'count': [len(df)],
        'distinct_count': [values.nunique()],
        # Computed inline so the builtin ``sum`` is not shadowed by a local.
        'sum': [values.sum()],
        'avg': [values.mean()],
    })


# Applying the function per group
# https://blog.csdn.net/wendaomudong_l2d4/article/details/84818133
df_sort = df.groupby(['user']).apply(test_f, column='amount', istopn=True)

# =============================== 2019-5-21 summary ===============================
df_sort
df_sort.index = range(len(df_sort))
df_sort

# Group, aggregate, then sort
# https://mp.weixin.qq.com/s?src=11&timestamp=1558517199&ver=1622&signature=
# ewVTWFGVfhJknoV9tNKp6LU3gTg8qy7whwHziCriX1k0IH5sjcmlkuMutaFbQKH*xWX7*jpJQ7UGt8roQAY7PsLobAojY1rAltgvWSG04y-J55iXkU634aehmUqOtmCn&new=1
best_rating_per_price = reviews.groupby('price')['points'].max().sort_index()
best_rating_per_price.head()

# https://blog.csdn.net/Li_qf/article/details/84852633
price_extremes = reviews.groupby('variety')['price'].agg(['max', 'min'])
price_extremes.head()

# Key pattern: filter + group + sort together
# http://www.cnblogs.com/fatcici2017/p/6634910.html
# Keep the rows whose 'from' column contains 'iphone 6s plus', group them,
# and sort the result in descending order.
# Roughly equivalent to SQL: WHERE + GROUP BY + ORDER BY ... DESC
# Named aggregation replaces the dict-renaming form agg({'uv': 'count'}),
# which raises SpecificationError on a SeriesGroupBy since pandas 1.0.
(
    df[df['from'].str.contains('iphone 6s plus')]
    .groupby(['from', 'to'])['uid']
    .agg(uv='count')
    .sort_values(by='uv', ascending=False)
)

# Split one field into several columns and attach them to the original frame
demo = pd.merge(
    df_6b,
    df_6b["index"].str.split('-', expand=True),  # expand=True already yields a DataFrame
    how='left',
    left_index=True,
    right_index=True,
)
demo.head()

# Concatenate several fields into one new field. To add a constant value,
# build it as a separate field first and then join it in.
df_5["index"] = df_5["端口"].str.cat(
    [df_5["类型"], df_5["店铺"], df_5["month"]], sep='-'
)

# Drop (hide) a field
# df.columns is an Index and cannot be looked up by label with [];
# name the column to drop directly instead.
df.drop(columns=["index"], inplace=True)

# ``subset`` drops every row that has a missing value in age or sex
df4 = df4.dropna(subset=["age", "sex"])

# Rename columns
a.rename(columns={'A': 'a', 'B': 'b', 'C': 'c'}, inplace=True)

# String replacement
# The vectorised .str accessor leaves missing values as NaN instead of
# raising AttributeError the way a per-element lambda would.
df_1['店铺'] = df_1['店铺'].str.replace("较前一月", "", regex=False)

# Joins: http://www.cnblogs.com/keye/p/10791705.html
# Inner join on the shared key columns
df7 = pd.merge(df_6a, df_6b, on=["端口", "类型", "店铺", "month"], how='inner')

# Create a DataFrame
df = pd.DataFrame([1, 2, 3, 4, 5], columns=['cols'], index=['a', 'b', 'c', 'd', 'e'])

# Add a column to a DataFrame (the list length must match the row count)
df2['col4'] = ['cnn', 'rnn']

# ================================ 2019-5-23 summary ================================
# Inspecting column dtypes
# 1. Few columns
df.dtypes

# 2. Many columns
# The f-string also works when a column label is not a string.
for col, dtype in df.dtypes.items():
    print(f"{col} : {dtype}")

# 3. Only columns of a given dtype
for col, dtype in df.dtypes.items():
    if str(dtype) == 'object':
        print(col)

# 4. Per-column type overview
df.info()

# 5. Parse strings as dates and use them as the index
df['date'] = pd.to_datetime(df['date'])
df.set_index("date", inplace=True)

# Alternative: set the index first, convert its type afterwards
df2.set_index("date", inplace=True)
# Convert df2's own index; the notes originally passed df.index here.
df2.index = pd.DatetimeIndex(df2.index)
# or: df2.index = pd.DatetimeIndex(df2["date"])

# astype() returns a new object; assign the result back to change the frame
df["字段名"] = df["Customer Number"].astype("int")


# Cleaning data with a custom function
def convert_currency(var):
    """Convert a currency-formatted string to a float.

    Removes every ``,`` and ``$`` from ``var`` and passes the remainder
    to ``float``.

    Args:
        var: String such as ``"$1,234.50"``.

    Returns:
        float: The numeric value.

    Raises:
        ValueError: If the cleaned string is not a valid float literal.
    """
    new_value = var.replace(",", "").replace("$", "")
    return float(new_value)


# Strip "$" and "," with replace(), then convert the string to a float;
# in this example the resulting column dtype is float64.
# Use apply() to convert every value in the "2016" column.
df["2016"].apply(convert_currency)

# A function this simple can also be written inline as a lambda
df["2016"].apply(lambda x: x.replace(",", "").replace("$", "")).astype("float64")

# The same lambda approach cleans the "Percent Growth" column
df["Percent Growth"].apply(lambda x: x.replace("%", "")).astype("float") / 100
# A custom function would give the same result.

# Finally, np.where() turns the "Active" column into booleans.
import numpy as np  # np is used below but was never imported in these notes

df["Active"] = np.where(df["Active"] == "Y", True, False)

# Reading files
# ``io`` stands for the path or file object to read.
# ``sheetname`` / ``skip_footer`` are the old spellings; current pandas
# only accepts ``sheet_name`` / ``skipfooter``.
pd.read_excel(io, sheet_name=0, header=0, skiprows=None, skipfooter=0,
              index_col=None, names=None)

# ###### Splitting and concatenating fields
# 1. Concatenate (string columns)
df_5["index"] = df_5["端口"].str.cat(
    [df_5["类型"], df_5["店铺"], df_5["month"]], sep='-'
)
df_5["index"] = df_5["端口"].str.cat(df_5["类型"], sep='-')

# 2. Concatenate (when numeric values are involved)
# Numeric columns must be cast to str before they can be concatenated.
# NOTE(review): the column cast here ("index") is overwritten on the next
# line; presumably a numeric source column was meant -- confirm.
df_5["index"] = df_5["index"].astype(str)
df_5["index"] = df_5["端口"].str.cat(df_5["类型"], sep='-')

# 3. Split one field into several columns and attach them to the frame
demo = pd.merge(
    df_6b,
    df_6b["index"].str.split('-', expand=True),  # expand=True already yields a DataFrame
    how='left',
    left_index=True,
    right_index=True,
)

# Binning: there must be one more bin edge than labels (e.g. 6 edges for
# 5 labels). Intervals are left-open / right-closed by default; pass
# right=False to make them left-closed / right-open.
# The "阈值" entries are placeholders for real thresholds and label names.
df["字段"] = pd.cut(df["字段"], ["阈值", "阈值", "阈值", "阈值"],
                  labels=["阈值", "阈值", "阈值"])

# Drop rows in which every value is NaN (returns a new frame)
df.dropna(axis=0, how='all')
# Drop rows that contain any NaN value (returns a new frame)
df.dropna(axis=0, how='any')

# ==============================================================================
# When astype() works:
#   - every value in the column can be read directly as a number (2, 2.12, ...)
#   - every value is numeric and is being converted to string (object)
# astype() may fail when the data has missing values or special characters.

# Converting types with a custom function
# Option 1
def convert_currency(value):
    """Convert a yuan-formatted string to a float.

    Removes every ``,`` and ``￥`` from ``value`` and passes the remainder
    to ``float``.

    Args:
        value: String such as ``"￥1,234.50"``.

    Returns:
        float: The numeric value.

    Raises:
        ValueError: If the cleaned string is not a valid float literal.
    """
    new_value = value.replace(',', '').replace('￥', '')
    # np.float was only an alias of the builtin and was removed in NumPy 1.24.
    return float(new_value)


data['2016'].apply(convert_currency)

# Option 2
data['2016'].apply(lambda x: x.replace('￥', '').replace(',', '')).astype('float')

# Date parsing (needs: import pandas as pd)
dates = ['2017-01-05', 'Jan 5, 2017', '01/05/2017', '2017.01.05', '2017/01/05', '20170105']
pd.to_datetime(dates)

# ========== Reference blog: http://www.cnblogs.com/onemorepoint/p/9404753.html ==========

# Convert a currency-formatted string to a number
# (this redefinition replaces the "￥" variant above with a "$" variant)
def convert_currency(var):
    """Convert a dollar-formatted string to a float.

    Removes every ``,`` and ``$`` from ``var`` and passes the remainder
    to ``float``.

    Args:
        var: String such as ``"$1,234.50"``.

    Returns:
        float: The numeric value.

    Raises:
        ValueError: If the cleaned string is not a valid float literal.
    """
    new_value = var.replace(",", "").replace("$", "")
    return float(new_value)


# Convert a percentage string to a number
def convert_percent(value):
    """Convert a percentage string to a fractional float.

    Removes every ``%`` from ``value``, converts the remainder with
    ``float`` and divides by 100.

    Args:
        value: String such as ``"12.5%"``.

    Returns:
        float: The percentage as a fraction (``"12.5%"`` -> ``0.125``).

    Raises:
        ValueError: If the cleaned string is not a valid float literal.
    """
    new_value = value.replace('%', '')
    return float(new_value) / 100


# Apply all conversions while loading the data
data2 = pd.read_csv(
    "data.csv",
    converters={
        '客户编号': str,
        '2016': convert_currency,
        '2017': convert_currency,
        '增长率': convert_percent,
        '所属组': lambda x: pd.to_numeric(x, errors='coerce'),
        # A converter receives one cell at a time; a plain comparison gives
        # a bool, whereas np.where on a scalar returns a 0-d array.
        '状态': lambda x: x == "Y",
    },
    encoding='gbk',
)

# Pivot table: the value to aggregate, row axis, column axis, aggregation
mean_demo = temp_demo.pivot_table("2019年消费", index="年", columns="活跃度",
                                  aggfunc="mean")
mean_demo
# tips.pivot_table(values=['tip_pct', 'size'], index=['sex', 'day'], columns='smoker')

# Select columns and apply a function to them
a = temp_demo.columns
columns = ["2018年消费", "2019年消费", "百分占比", "单元"]
temp_demo[columns].max()

# Blog: blog.csdn.net/liuhehe123/article/details/85921930
# blog.csdn.net/Li_qf/article/details/84852633 -- wine review exercises
# Sorting
sorted_varieties = price_extremes.sort_values(by=['min', 'max'], ascending=False)
sorted_varieties
# Grouping
price_extremes = reviews.groupby("variety").price.agg(['min', 'max'])
price_extremes

# 1. Load the data
import pandas as pd
reviews = pd.read_csv('./winemag-data-130k-v2.csv', index_col=0)

# 2. Who reviewed the most wines? Build a Series indexed by
#    taster_twitter_handle whose values are each person's review count.
reviews_written = reviews.groupby('taster_twitter_handle').size()
reviews_written

# 3. Build a Series indexed by price whose values are the highest points
#    at that price, sorted by ascending price.
best_rating_per_price = reviews.groupby('price')['points'].max().sort_index()
best_rating_per_price.head()

# 4. Build a DataFrame indexed by variety holding each variety's maximum
#    and minimum price.
price_extremes = reviews.groupby('variety')['price'].agg(['max', 'min'])
price_extremes.head()

# 5. Sort price_extremes by min descending, then by max descending.
sorted_varieties = price_extremes.sort_values(['min', 'max'], ascending=False)
sorted_varieties.head()

# 6. Build a Series indexed by taster_name whose values are the mean of
#    all points given by that taster.
reviewer_mean_ratings = reviews.groupby('taster_name').points.mean()
reviewer_mean_ratings

# 7. Build a Series with a (country, variety) MultiIndex whose values are
#    the counts, sorted in descending order.
country_variety_counts = (
    reviews.groupby(['country', 'variety']).size().sort_values(ascending=False)
)
country_variety_counts.head()

# 8. Drop rows with missing values
#    blog.csdn.net/houyanhua1/article/details/87855228
df4 = df4.dropna(subset=['age', 'body', 'home.dest'])
# inplace=True modifies df itself instead of returning a copy (default False)
df.dropna(axis=0, inplace=True)

# 9. Second way of handling NaN: filling
df2 = df.fillna(100)  # fill with 100
# A single column can be filled on its own (the result here is a Series)
df4 = df["YY"].fillna(df["YY"].mean())
# Fill with the column means. df.mean() returns a Series of per-column
# means; numeric_only=True skips non-numeric columns, which would
# otherwise raise TypeError on pandas >= 2.0. df.median() gives medians.
df3 = df.fillna(df.mean(numeric_only=True))

# 9. Basics tutorial:
# blog.csdn.net/claroja/article/details/65661826

# 10. Create a new Title column by extracting a substring from each cell.
# The pattern captures the run of letters immediately before a literal ".".
# A raw string keeps "\." from being read as an invalid escape sequence;
# expand=False returns a Series, which is what a single column expects.
df['Title'] = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

最新回复(0)