numpy & pandas Notes

Notes on Python's data-processing tools, following the 莫烦PYTHON (MorvanZhou) tutorials.

1. numpy

1.1 numpy attributes

  • ndim: number of dimensions
  • shape: shape (rows and columns)
  • size: total number of elements
import numpy as np
array = np.array([[1,2,3],[2,3,4]])
# number of dimensions
array.ndim
# shape (rows and columns)
array.shape
# total number of elements
array.size
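"""
For this array: ndim -> 2, shape -> (2, 3), size -> 6
"""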

1.2 Creating arrays

  • array: create an array (a short sketch follows this list)
  • dtype: specify the data type
  • zeros: array of all zeros
  • ones: array of all ones
  • empty: uninitialized array (whatever happens to be in memory, often values near 0)
  • arange: values over a given range with a step
  • linspace: evenly spaced values over an interval
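The block below starts from zeros, so here is a minimal sketch of array with an explicit dtype (the values are arbitrary):

a = np.array([2,23,4], dtype=np.int64)
a.dtype
"""
dtype('int64')
"""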
# Array of all zeros
a = np.zeros((3,4))
"""
array([[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
"""
# All ones, with an explicit dtype (np.int was removed in newer NumPy, so plain int is used here)
a = np.ones((3,4),dtype=int)
"""
array([[1, 1, 1, 1],
[1, 1, 1, 1],
[1, 1, 1, 1]])
"""
# "Empty" means uninitialized: the values are whatever was in memory, often tiny numbers near 0
a = np.empty((3,4))
"""
array([[ 1.49166815e-154, -2.32036217e+077, 2.96439388e-323,
0.00000000e+000],
[ 0.00000000e+000, 6.82116729e-043, 7.78937022e-091,
3.93892987e-062],
[ 3.61353389e+174, 4.01062531e+174, 3.99910963e+252,
8.34404819e-309]])
"""
# Range from 10 to 20 (exclusive) with step 2
a = np.arange(10,20,2)
"""
array([10, 12, 14, 16, 18])
"""
# Reshape
a = np.arange(12).reshape((3,4))
"""
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
"""
# Evenly spaced values: give the start, the stop and the number of points; the result is 1-D
a = np.linspace(1,10,20)
"""
array([ 1. , 1.47368421, 1.94736842, 2.42105263, 2.89473684,
3.36842105, 3.84210526, 4.31578947, 4.78947368, 5.26315789,
5.73684211, 6.21052632, 6.68421053, 7.15789474, 7.63157895,
8.10526316, 8.57894737, 9.05263158, 9.52631579, 10. ])
"""
# linspace combined with reshape
a = np.linspace(1,10,20).reshape((5,4))
"""
array([[ 1. , 1.47368421, 1.94736842, 2.42105263],
[ 2.89473684, 3.36842105, 3.84210526, 4.31578947],
[ 4.78947368, 5.26315789, 5.73684211, 6.21052632],
[ 6.68421053, 7.15789474, 7.63157895, 8.10526316],
[ 8.57894737, 9.05263158, 9.52631579, 10. ]])
"""

1.3 Basic numpy operations

# Start with two arrays a and b
a = np.array([10,20,30,40])
"""
array([10, 20, 30, 40])
"""
b = np.arange(4)
"""
array([0, 1, 2, 3])
"""
# +, -, *, / and ** all operate element-wise
# Note that the dot product uses np.dot
c = a*b
"""
array([ 0, 20, 60, 120])
"""
d = np.dot(a,b) # a.dot(b)
"""
200
"""
# np.sum / np.min / np.max work the same way
# Note the axis argument: axis=0 aggregates down each column (one result per column), axis=1 across each row (one result per row); see the np.sum check below
a = np.arange(2,14).reshape(3,4)
"""
array([[ 2, 3, 4, 5],
[ 6, 7, 8, 9],
[10, 11, 12, 13]])
"""
# Index of the minimum value (in the flattened array)
np.argmin(a)
"""
0
"""
# Index of the maximum value (in the flattened array)
np.argmax(a)
"""
11
"""
# np.mean(a) / np.average(a) / a.mean() give the mean
# np.median(a) gives the median (there is no a.median() method on an ndarray)
# Cumulative sum
np.cumsum(a)
"""
array([ 2, 5, 9, 14, 20, 27, 35, 44, 54, 65, 77, 90])
"""
# Differences between consecutive elements in each row, so a 3*4 array yields a 3*3 result
np.diff(a)
"""
array([[1, 1, 1],
[1, 1, 1],
[1, 1, 1]])
"""
# Coordinates of the non-zero elements: the first array holds row indices, the second column indices
np.nonzero(a)
"""
(array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]),
array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]))
"""

1.4 numpy indexing

A numpy array can be indexed and sliced much like a Python list; just pay attention when iterating (a short slicing sketch follows the block below).

# Iterate by row
a = np.arange(3,15).reshape(3,4)
for row in a:
    print(row)
# Iterate by column: iterate over the transpose
for column in a.T:
    print(column)
# Flatten to 1-D
a.flatten()
"""
[ 3 4 5 6 7 8 9 10 11 12 13 14]
"""

1.5 Merging arrays

# Vertical stacking
a = np.array([1,1,1])
b = np.array([2,2,2])
np.vstack((a,b))
"""
array([[1, 1, 1],
[2, 2, 2]])
"""
# Horizontal stacking
np.hstack((a,b))
"""
array([1, 1, 1, 2, 2, 2])
"""
# Adding an axis with np.newaxis
# A plain .T only works for 2-D arrays; a here is 1-D, so even a.T keeps shape (3,)
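# Check: transposing the 1-D a changes nothing
a.T.shape
"""
(3,)
"""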
# Turn the 3-element array into a 1*3 row
a[np.newaxis,:]
"""
array([[1, 1, 1]])
"""
a[np.newaxis,:].shape
"""
(1, 3)
"""
# Turn the 3-element array into a 3*1 column
a[:,np.newaxis]
"""
array([[1],
[1],
[1]])
"""
a[:,np.newaxis].shape
"""
(3, 1)
"""
# Combining newaxis with h/vstack
a = np.array([1,1,1])[:,np.newaxis]
b = np.array([2,2,2])[:,np.newaxis]
np.vstack((a,b))
"""
array([[1],
[1],
[1],
[2],
[2],
[2]])
"""
np.hstack((a,b))
"""
array([[1, 2],
[1, 2],
[1, 2]])
"""
# h/vstack are single-purpose; concatenate is the general form
np.concatenate((a,b),axis=0)
"""
array([[1],
[1],
[1],
[2],
[2],
[2]])
"""
np.concatenate((a,b),axis=1)
"""
array([[1, 2],
[1, 2],
[1, 2]])
"""

1.6 Splitting arrays

a = np.arange(12).reshape((3,4))
"""
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
"""
# Split along axis=1 (columns) into 2 pieces
np.split(a,2,axis=1)
"""
[array([[0, 1],
[4, 5],
[8, 9]]), array([[ 2, 3],
[ 6, 7],
[10, 11]])]
"""
# Split along axis=0 (rows); asking for 2 pieces would raise an error because 3 is not divisible by 2
np.split(a,3,axis=0)
"""
[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8, 9, 10, 11]])]
"""
# Unequal split with array_split
np.array_split(a,2,axis=0)
"""
[array([[0, 1, 2, 3],
[4, 5, 6, 7]]), array([[ 8, 9, 10, 11]])]
"""
# There are also specialized forms, analogous to h/vstack above
# Vertical split, equivalent to np.split(a, 3, axis=0)
np.vsplit(a,3)
"""
[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8, 9, 10, 11]])]
"""
# Horizontal split, equivalent to np.split(a, 4, axis=1)
np.hsplit(a,4)
"""
[array([[0],
[4],
[8]]), array([[1],
[5],
[9]]), array([[ 2],
[ 6],
[10]]), array([[ 3],
[ 7],
[11]])]
"""

1.7 numpy copy & deep copy

a = np.arange(4)
b = a
c = a
d = b
"""
array([0, 1, 2, 3])
"""
# b, c and d are references to the same array, so changing any of the four changes them all
a[0]=11
d
"""
array([11, 1, 2, 3])
"""
# copy() makes a deep copy of the data rather than another reference
a = np.arange(4)
b = a.copy()
a[0]=11
b
"""
array([0, 1, 2, 3])
"""

2. pandas

If numpy is like a list, then pandas is like a dictionary.

2.1 pandas basics

import pandas as pd
import numpy as np
# A Series prints with the index on the left and the values on the right.
a = pd.Series([1,3,6,np.nan,44,1])
"""
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64
"""
# A DataFrame is a table with row and column indexes; it holds an ordered set of columns, and each column can have its own value type (numeric, string, boolean, ...).
dates = pd.date_range('20200101',periods=6)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
"""
a b c d
2020-01-01 1.364291 -1.316093 -1.534013 0.167940
2020-01-02 0.406588 -0.485523 -0.033501 -0.689241
2020-01-03 1.222131 0.029937 -0.862602 2.134857
2020-01-04 1.514998 -1.980169 -0.274395 -0.349385
2020-01-05 -0.279809 0.023722 0.313403 0.027728
2020-01-06 0.263307 -0.100027 -0.462106 -1.664943
"""
# Select a column by its label
df['b']
"""
2020-01-01 -1.316093
2020-01-02 -0.485523
2020-01-03 0.029937
2020-01-04 -1.980169
2020-01-05 0.023722
2020-01-06 -0.100027
Freq: D, Name: b, dtype: float64
"""
# With no index given, a default integer index is used
pd.DataFrame(np.arange(12).reshape((3,4)))
"""
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
"""
# A DataFrame can also be built column by column
df2 = pd.DataFrame({'A' : 1.,
'B' : pd.Timestamp('20130102'),
'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
'D' : np.array([3]*4,dtype='int32'),
'E' : pd.Categorical(["test","train","test","train"]),
'F' : 'foo'})
"""
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
"""
# So each column can have its own dtype
df2.dtypes
"""
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
"""
# Inspect the index
df2.index
"""
Int64Index([0, 1, 2, 3], dtype='int64')
"""
# Inspect the column labels
df2.columns
"""
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
"""
# Just the values
df2.values
"""
array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
dtype=object)
"""
# Summary statistics (numeric columns only)
df2.describe()
"""
A C D
count 4.0 4.0 4.0
mean 1.0 1.0 3.0
std 0.0 0.0 0.0
min 1.0 1.0 3.0
25% 1.0 1.0 3.0
50% 1.0 1.0 3.0
75% 1.0 1.0 3.0
max 1.0 1.0 3.0
"""
# Transposing works too
df2.T
"""
0 1 2 3
A 1 1 1 1
B 2013-01-02 00:00:00 2013-01-02 00:00:00 2013-01-02 00:00:00 2013-01-02 00:00:00
C 1 1 1 1
D 3 3 3 3
E test train test train
F foo foo foo foo
"""
# Both the row labels and the column labels are indexes; this sorts by index
df2.sort_index(axis=0,ascending=False)
"""
A B C D E F
3 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
0 1.0 2013-01-02 1.0 3 test foo
"""
# The above sorts by index; this sorts by the values of a column
df2.sort_values(by='B')
"""
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
"""

2.2 Selecting data with pandas

dates = pd.date_range('20200101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)), index=dates, columns=['A','B','C','D'])
"""
A B C D
2020-01-01 0 1 2 3
2020-01-02 4 5 6 7
2020-01-03 8 9 10 11
2020-01-04 12 13 14 15
2020-01-05 16 17 18 19
2020-01-06 20 21 22 23
"""
# Simple selection
# A single column
df['A']
df.A
"""
2020-01-01 0
2020-01-02 4
2020-01-03 8
2020-01-04 12
2020-01-05 16
2020-01-06 20
Freq: D, Name: A, dtype: int64
"""
# Rows, by position slice or by label slice
df[0:3]
df['20200101':'20200103']
"""
A B C D
2020-01-01 0 1 2 3
2020-01-02 4 5 6 7
2020-01-03 8 9 10 11
"""
# Select by label with loc
# A single row
df.loc['20200101']
"""
A 0
B 1
C 2
D 3
Name: 2020-01-01 00:00:00, dtype: int64
"""
# All rows, selected columns
df.loc[:,['A','B']]
"""
A B
2020-01-01 0 1
2020-01-02 4 5
2020-01-03 8 9
2020-01-04 12 13
2020-01-05 16 17
2020-01-06 20 21
"""
# One row, several columns
df.loc['20200101',['A','B']]
"""
A 0
B 1
Name: 2020-01-01 00:00:00, dtype: int64
"""
# Select by position with iloc
df.iloc[3,1]
"""
13
"""
df.iloc[3:5,1:3]
"""
B C
2020-01-04 13 14
2020-01-05 17 18
"""
df.iloc[[1,3,5],1:3]
"""
B C
2020-01-02 5 6
2020-01-04 13 14
2020-01-06 21 22
"""
# Boolean filtering
df[df.A>8]
"""
A B C D
2020-01-04 12 13 14 15
2020-01-05 16 17 18 19
2020-01-06 20 21 22 23
"""

2.3 Setting values with pandas

# Set values through iloc and loc
df.iloc[2,2]=1111
df.loc['20200101','B']=2222
"""
A B C D
2020-01-01 0 2222 2 3
2020-01-02 4 5 6 7
2020-01-03 8 9 1111 11
2020-01-04 12 13 14 15
2020-01-05 16 17 18 19
2020-01-06 20 21 22 23
"""
# Set values by condition
# (df.loc[df.A>4,'B']=0 is the safer equivalent; chained indexing like df.B[...] can raise a SettingWithCopyWarning)
df.B[df.A>4]=0
"""
A B C D
2020-01-01 0 2222 2 3
2020-01-02 4 5 6 7
2020-01-03 8 0 1111 11
2020-01-04 12 0 14 15
2020-01-05 16 0 18 19
2020-01-06 20 0 22 23
"""
# Add a whole new column filled with NaN
df['F']=np.nan
"""
A B C D F
2020-01-01 0 2222 2 3 NaN
2020-01-02 4 5 6 7 NaN
2020-01-03 8 0 1111 11 NaN
2020-01-04 12 0 14 15 NaN
2020-01-05 16 0 18 19 NaN
2020-01-06 20 0 22 23 NaN
"""
# Add a column from a Series (aligned on the index)
df['E']=pd.Series([1,2,3,4,5,6],index=pd.date_range('20200101',periods=6))
"""
A B C D F E
2020-01-01 0 2222 2 3 NaN 1
2020-01-02 4 5 6 7 NaN 2
2020-01-03 8 0 1111 11 NaN 3
2020-01-04 12 0 14 15 NaN 4
2020-01-05 16 0 18 19 NaN 5
2020-01-06 20 0 22 23 NaN 6
"""

2.4 Handling missing data with pandas

# A DataFrame containing NaN
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates, columns=['A','B','C','D'])
df.iloc[0,1]=np.nan
df.iloc[1,2]=np.nan
"""
A B C D
2013-01-01 0 NaN 2.0 3
2013-01-02 4 5.0 NaN 7
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23
"""
# Drop rows or columns that contain NaN: axis=0 drops rows, axis=1 drops columns; how='any' drops when any value is NaN, how='all' only when all values are NaN
df.dropna(axis=0,how='any')
"""
A B C D
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23
"""
# Replace NaN with a given value
df.fillna(value=0)
"""
A B C D
2013-01-01 0 0.0 2.0 3
2013-01-02 4 5.0 0.0 7
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23
"""
# Check which entries are missing
df.isnull()
"""
A B C D
2013-01-01 False True False False
2013-01-02 False False True False
2013-01-03 False False False False
2013-01-04 False False False False
2013-01-05 False False False False
2013-01-06 False False False False
"""
# Check whether any value at all is missing
np.any(df.isnull())
"""
True
"""

2.5 pandas import and export

# Read a csv file
data=pd.read_csv('student.csv')
"""
Student ID name age gender
0 1100 Kelly 22 Female
1 1101 Clo 21 Female
2 1102 Tilly 22 Female
3 1103 Tony 24 Male
4 1104 David 20 Male
5 1105 Catty 22 Female
6 1106 M 3 Female
7 1107 N 43 Male
8 1108 A 13 Male
9 1109 S 12 Male
10 1110 David 33 Male
11 1111 Dw 3 Female
12 1112 Q 23 Male
13 1113 W 21 Female
"""
# Save the data as a pickle file
data.to_pickle('student.pickle')
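For completeness, the pickle can be read back and the frame written out again; a minimal sketch assuming the file names above (student_copy.csv is an arbitrary output name):

data2 = pd.read_pickle('student.pickle')
# write back to csv without the integer index (output filename is arbitrary)
data2.to_csv('student_copy.csv', index=False)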

2.6 Merging with pandas concat

import pandas as pd
import numpy as np
# Sample data
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
# Vertical concatenation with concat
res = pd.concat([df1,df2,df3])
"""
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
"""
# Reset the index with ignore_index
res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
"""
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
"""
# Vertical inner join of df1 and df2: only the shared columns are kept
# (df2 is redefined here with columns b-e so the intersection is b, c, d, matching the output)
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'])
res = pd.concat([df1, df2], axis=0, join='inner', ignore_index=True)
"""
b c d
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
5 1.0 1.0 1.0
"""
# Horizontal concatenation of two dataframes, aligned on the index
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
res = pd.concat([df1, df2], axis=1)
"""
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
"""
# append adds rows but not columns (note: DataFrame.append was later deprecated and removed; a concat equivalent follows this block)
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
s1 = pd.Series([1,2,3,4], index=['a','b','c','d'])
res = df1.append(df2, ignore_index=True)
"""
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
"""
res = df1.append([df2, df3], ignore_index=True)
"""
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 1.0 1.0 1.0 1.0
7 1.0 1.0 1.0 1.0
8 1.0 1.0 1.0 1.0
"""
res = df1.append(s1, ignore_index=True)
"""
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 2.0 3.0 4.0
"""

2.7 Merging with pandas merge

# Merge on a single key
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
res = pd.merge(left,right,on='key')
"""
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
"""
# Merge on two keys; how= selects the join type (inner / outer / left / right)
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
res = pd.merge(left,right,on=['key1','key2'],how='inner')
"""
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
"""
res = pd.merge(left,right,on=['key1','key2'],how='outer')
"""
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
5 K2 K0 NaN NaN C3 D3
"""
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
"""
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
"""
res = pd.merge(left, right, on=['key1', 'key2'], how='right')
"""
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
3 K2 K0 NaN NaN C3 D3
"""
# indicator=True adds a column showing the source of each row
df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
"""
col1 col_left col_right _merge
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
"""
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
"""
col1 col_left col_right indicator_column
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
"""
# Merge on the index
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
"""
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
"""
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
"""
A B C D
K0 A0 B0 C0 D0
K2 A2 B2 C2 D2
"""
# Use suffixes to resolve overlapping column names
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
"""
k age_boy age_girl
0 K0 1 4
1 K0 1 5
"""