Pandas

pandas API

Pandas使用教程


Pandas之csv文件对列行的相关操作

Python pandas用法

panda数据处理

Pandas之csv文件对列行的相关数据库操作

pandas进行工作相关数据清洗

Pandas数据分析实战项目

数据分析应用

map、apply、applymap详解

Series

data = [1,2,3,4]
index = list('abcd')
s1 = pd.Series(data, index=index, name='books')

追加
s2 = s1.append(pd.Series({"e": 100, "a":200}))   #pandas 2.0起append已移除, 改用 pd.concat([s1, pd.Series({"e": 100, "a":200})])

相同索引相加
s3 = s1.add(pd.Series({"e": 100, "a":200}))

排序
s2.sort_index()
s2.sort_values(ascending=False)   #降序

去掉索引
s4 = s2.reset_index(drop=True)

更换索引
idx = "this is a test".split()
s3.reindex(idx)

s1.max()
s1.idxmax()             #最大值索引
s1.count()
s1.sum()
s1.var()
s1.mean()
s2.describe()           #各类统计列表

DataFrame

pd.DataFrame()
l = [[1,2,3], [4,5,6]]
df = pd.DataFrame(l)

s = pd.Series({"x":1, "y": 2}, name='a')
df = df.append(s, ignore_index=True)        #自动加索引, 避免索引相同 (pandas 2.0起改用 pd.concat)


data = np.array([[5, 5, 3, 3, 4], [3, 4, 5, 5, 4],
                 [3, 4, 3, 4, 5], [5, 5, 3, 4, 4]])
df = pd.DataFrame(data, columns=['The Shawshank Redemption',
                                 'Forrest Gump', 'Avengers: Endgame',
                                 'Iron Man', 'Titanic'],
                  index=['user1', 'user2', 'user3', 'user4'])
                  
print(df['Iron Man'])                   #返回Series
print(df[['Iron Man', 'Titanic']])     #返回DataFrame

#改列名
df3.rename(columns={'value_x':'valueX', 'value_y':'valueY'},inplace=True)

print('增加一列,两种方式')
colnames.append('memo')
df5 = df3.reindex(columns=colnames)
df5.eval('memo=lkey+rkey',inplace=True)
df5['total'] = df5['valueX']+df5['valueY']

分组和统计
print(df2.groupby('grade').count())
print(df2.groupby('grade').sum())
print(df2.groupby('grade')['Iron Man'].count())
print(df2.groupby('grade').count()['Iron Man'])

索引list
df1.index[:3].tolist()

截取部分行
df1.iloc[:3]

更新一列的所有值
df2['grade'] = 2020

更新一行一列值
df2.loc['user1','grade']=2019
df2.at['user2','grade']=2019
df2.at['user3','grade']=2021
df2.at['user5','grade']=2021

选取一行
df1.loc['user1']

选取多行
df1.loc[['user1', 'user2']]

选取一列
df1['Iron Man']

选取多列
df1[['Iron Man', 'Titanic']]

类SQL(select name,age from staff where genda = 'm')
df1[df1.genda == 'm'][['name', 'age']]

类SQL(update staff set age = age + 1 where genda = 'm' and age = 21)
df1.loc[(df1.genda == 'm') & (df1.age == 21), 'age'] = df1['age'] + 1

增加一个统计行
df1.loc['sum'] = df.apply(lambda x: x.sum())

增加一个统计列
df1['sum'] = df.apply(lambda x: x.sum(), axis=1)