输出结果: 使用字典的方式创建:
import numpy as np
import pandas as pd

# Build a DataFrame from a dict: the scalar 1 is repeated for every row of
# column 'B', and column 'C' is a Categorical with three values.
a = {'B': 1, 'C': pd.Categorical(['aaa', 'bbb', 'ccc'])}
b = pd.DataFrame(a)
print(b)
# Output: (not reproduced in this text)

# Basic operations:
import numpy as np
import pandas as pd

a = {'B': 1, 'C': pd.Categorical(['aaa', 'bbb', 'ccc'])}
b = pd.DataFrame(a)
print("打印每一列的类型:", b.dtypes)
print("行的索引:", b.index)
print("列的索引:", b.columns)
print("打印值:", b.values)
# describe() summarises only the numeric columns by default.
print("获取参数(只会计算数值类型):", b.describe())
print("将数据看作矩阵transpose:", b.T)
# axis=1 sorts by column label; ascending=False gives descending order.
print("根据索引排序(横向 倒叙):", b.sort_index(axis=1, ascending=False))
print("对里边的值进行排序:", b.sort_values(by='B'))

# Selecting data:
import numpy as np
import pandas as pd

dates = pd.date_range('2015-5-25', periods=6)
d = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
# A column can be read either as an attribute or with [] indexing.
print("输出某一列:", d.a, d['a'])
# Slicing with [] selects rows, by position or by index label.
print("切片输出:", d[0:3], d['2015-05-25':'2015-05-27'])
# loc selects by label: rows before the comma, columns after it.
print("根据标签选择loc(以逗号做分割,前边为行,后边为列的选择):", d.loc[:, ['a', 'b']])
# iloc selects by integer position.
print("根据索引选择iloc(index local )", d.iloc[3])
# The .ix indexer was removed in pandas 1.0. To mix positional rows with
# labelled columns, translate the positions to labels and use loc.
print("标签和索引混合筛选ix(mixed selection):", d.loc[d.index[:3], ['a', 'b']])
# Boolean mask: keep the rows whose value in column 'a' is greater than 8.
print("条件筛选,本例中筛选a中大于8的DataFrame:", d[d.a > 8])

# Modifying values in a DataFrame:
import numpy as np
import pandas as pd

dates = pd.date_range('2015-05-25', periods=6)
d = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
# Modify by position: set the cell at row 2, column 2 to 111.
d.iloc[2, 2] = 111
# Modify by label: set column 'a' of the 2015-05-25 row to 1111.
# (Mixed positional/label assignment can be written the same way with
# d.loc[d.index[i], 'col'], since d.ix no longer exists.)
d.loc['2015-05-25', 'a'] = 1111
# Set every value in column 'a' that is greater than 10 to 0. A single loc
# call is used instead of the chained form d.a[d.a > 10] = 0, which may
# assign to a temporary copy rather than to d itself.
d.loc[d.a > 10, 'a'] = 0
# Add a new column filled with NaN.
d['f'] = np.nan

# Handling missing data (NaN):
import numpy as np
import pandas as pd

dates = pd.date_range('2015-5-25', periods=6)
d = pd.DataFrame(
    np.arange(24).reshape(6, 4),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
# Column 'c' starts out as integers; cast it to float first so it can hold
# NaN without relying on implicit upcasting during assignment (recent pandas
# versions warn about that upcast).
d['c'] = d['c'].astype(float)
d.iloc[2, 2] = np.nan
d.iloc[3, 2] = np.nan
# how is 'any' or 'all': drop a row if it has any NaN / only if it is all NaN.
c = d.dropna(axis=0, how='any')
print("填充nan:", d.fillna(value=0))
print("检查是否缺失(返回True/False的DataFrame,缺失则为True):", d.isnull())
# Reduce the boolean mask to a single flag instead of printing the whole frame.
print("数据较大时,检查是否有=空数据:", d.isnull().values.any())

# Importing and exporting data - official documentation:
# http://pandas.pydata.org/pandas-docs/stable/user_guide/io.html
基本读、写操作:
import numpy as np
import pandas as pd

dates = pd.date_range('2015-5-25', periods=6)
d = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
d.to_csv('student.csv')
# read_csv is called without index_col, so the saved index is read back as
# an ordinary column.
data = pd.read_csv('student.csv')

# Combining several DataFrames:
# concat
import numpy as np
import pandas as pd

f1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
f2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
f3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
# ignore_index=True discards the original indexes and renumbers the rows
# 0, 1, 2, ...
# join='inner' keeps only the labels shared by all frames; the default is
# 'outer', which keeps every label and fills the missing positions with NaN.
# axis selects the direction of concatenation.
# The join_axes argument was removed in pandas 1.0. To concatenate side by
# side (axis=1) while keeping only f1's index, use
# pd.concat([f1, f2], axis=1).reindex(f1.index) instead.
res = pd.concat([f1, f2, f3], ignore_index=True, join='inner', axis=0)
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
# DataFrame.append was removed in pandas 2.0; turn the Series into a one-row
# frame and concatenate it to add it as a new row.
print("添加一个DataFrame:", pd.concat([f1, s1.to_frame().T], ignore_index=True))

# merge
import numpy as np
import pandas as pd

left = pd.DataFrame({
    'key1': ['K4', 'K1', 'K2', 'K3'],
    'key2': ['K0', 'K1', 'K2', 'K3'],
    'a': [1, 2, 3, 4],
    'b': [5, 6, 7, 8],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K2', 'K3'],
    'key2': ['K0', 'K1', 'K2', 'K3'],
    'a': [8, 9, 10, 11],
    'd': [0, 1, 3, 4],
})
# Merge on the key columns. how is one of 'inner', 'outer', 'left', 'right';
# the default 'inner' keeps only the key combinations present in both frames.
# indicator=True adds a column showing where each row came from; pass a
# string instead (indicator='name') to choose that column's name.
res = pd.merge(left, right, on=['key1', 'key2'], how='outer', indicator=True)
# Merge on the row index instead of on key columns. right_index must be a
# bool; the original passed the DataFrame `right` here by mistake.
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
# suffixes distinguishes columns that have the same name in both frames.
res = pd.merge(left, right, on='key1', suffixes=['_boy', '_girl'], how='inner')

# Plotting with plot:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list('ABCD'),
)
data = data.cumsum()
data.plot()
plt.show()
# Result: (figure not reproduced in this text)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list('ABCD'),
)
data = data.cumsum()
# Plot methods: bar, hist, box, kde, area, scatter, hexbin, pie.
# Scatter plot with column A on the x-axis and column B on the y-axis.
ax = data.plot.scatter(x='A', y='B', color='red', label='Class 1')
# Draw the blue points on the same axes by passing ax.
data.plot.scatter(x='C', y='D', color='blue', ax=ax, label='Class 2')
plt.show()
# Result: (figure not reproduced in this text)
matplotlib其他见菜鸟教程:https://www.runoob.com/numpy/numpy-matplotlib.html