Pandas数据处理踩坑记录：Series和DataFrame实战笔记

2023-01-08

Python, 实战

Pandas数据处理踩坑记录

做数据分析时Pandas用得最多，记录一下Series和DataFrame的常用操作，主要是数据选择、清洗和合并这些高频需求。

Series：带标签的一维数组

import pandas as pd
import numpy as np

# Build a Series from a plain list; pandas supplies a default integer index
s = pd.Series(data=[1, 3, 6, np.nan, 44, 1])
print(s)
"""
0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64
"""

特点：左边是自动生成的整数索引，右边是数据值；dtype由pandas自动推断——这里因为数据中含有NaN，整数被统一提升成了float64。

DataFrame：带标签的二维表格

# Six consecutive daily timestamps, used below as the row index
dates = pd.date_range(start='20160101', periods=6)

# 6x4 frame of random normal samples: dates label the rows, letters the columns
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
print(df)
"""
                   A         B         C         D
2016-01-01 -0.253065 -2.071051 -0.640515  0.613663
2016-01-02 -1.147178  1.532470  0.989255 -0.499761
2016-01-03  1.221656 -2.390171  1.862914  0.778070
2016-01-04  1.473877 -0.046419  0.610046  0.204672
2016-01-05 -1.584752 -0.700592  1.487264 -1.778293
2016-01-06  0.633675 -1.414157 -0.277066 -0.442545
"""

DataFrame既有行索引(index)也有列索引(columns)，可以看成多个Series组成的字典。

多种创建方式

# Option 1: wrap a 2-D NumPy array (row and column labels default to 0..n-1)
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))
print(df1)
"""
   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
"""

# Option 2: build from a dict, one entry per column (mixed dtypes).
# Scalars are broadcast to the length set by the array-like entries.
df2 = pd.DataFrame({
    'A': 1.0,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(["test", "train"] * 2),
    'F': 'foo',
})
print(df2)
"""
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
"""

数据属性查看

# Per-column data types
print(df2.dtypes)

# Row index
print(df2.index)

# Column labels
print(df2.columns)

# All values (returned as a NumPy array)
print(df2.values)

# Summary statistics
print(df2.describe())
"""
         A    C    D
count  4.0  4.0  4.0
mean   1.0  1.0  3.0
std    0.0  0.0  0.0
min    1.0  1.0  3.0
25%    1.0  1.0  3.0
50%    1.0  1.0  3.0
75%    1.0  1.0  3.0
max    1.0  1.0  3.0
"""

数据转置

# Transpose: swap the row and column axes
print(df2.T)

排序操作

# Sort by column labels (axis=1 means along the columns), descending
print(df2.sort_index(axis=1, ascending=False))

# Sort rows by the values in column 'B'
print(df2.sort_values(by='B'))

数据选择

选列

# A single column (returned as a Series)
print(df['A'])
# or, equivalently, via attribute access
print(df.A)

选行

# First 3 rows (positional slice)
print(df[0:3])

# Slice by label range (both endpoints are included)
print(df['20160102':'20160104'])

loc标签选择

# A single row, looked up by its label
print(df.loc['20160102'])

# All rows, columns A and B
print(df.loc[:, ['A', 'B']])
# One row, columns A and B
print(df.loc['20160102', ['A', 'B']])

iloc位置选择

# A single cell by integer position (row 3, column 1)
print(df.iloc[3, 1])

# A rectangular region: rows 3-4, columns 1-2 (end positions are exclusive)
print(df.iloc[3:5, 1:3])

# Non-contiguous rows (1, 3, 5) with a column slice
print(df.iloc[[1, 3, 5], 1:3])

条件筛选

# Boolean indexing: keep only the rows where column A is positive
print(df[df.A > 0])

数据清洗

设置值

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])

# Set a cell by integer position (row 2, column 2)
df.iloc[2, 2] = 1111

# Set a cell by label (row '20130101', column 'B')
df.loc['20130101', 'B'] = 2222

# Conditional set: select the rows and the column in ONE .loc call.
# The chained form df.B[df.A > 4] = 0 assigns through an intermediate object,
# which triggers SettingWithCopyWarning and is not guaranteed to update df.
df.loc[df.A > 4, 'B'] = 0

# Add new columns: a scalar is broadcast; a Series is aligned on the index
df['F'] = np.nan
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=dates)

处理缺失值

# NaN cannot be stored in an integer column, so make B and C float first
# (newer pandas releases warn about, or refuse, the implicit upcast).
df = df.astype({'B': 'float64', 'C': 'float64'})

# Inject a couple of missing values to work with
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan

# Drop rows/columns containing NaN. dropna returns a NEW frame and leaves df
# untouched, so the result has to be printed or assigned to be of any use.
# (Column F was filled with NaN earlier, so how='any' removes every row here.)
print(df.dropna(
    axis=0,     # 0: operate on rows; 1: operate on columns
    how='any'   # 'any': drop if any NaN is present; 'all': only if all are NaN
))

# Fill NaN with a value (also returns a new frame)
print(df.fillna(value=0))

# Element-wise mask of missing values
print(df.isnull())

# Is there at least one missing value anywhere in the frame?
print(df.isnull().values.any())

数据合并

concat纵向合并

# Three 3x4 frames filled with 0.0, 1.0 and 2.0, sharing the same columns
df1 = pd.DataFrame(np.zeros((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.ones((3, 4)), columns=list('abcd'))
df3 = pd.DataFrame(np.full((3, 4), 2.0), columns=list('abcd'))

# Stack vertically; each frame keeps its own 0..2 row labels
res = pd.concat(objs=[df1, df2, df3], axis=0)
print(res)

# Same stack, but renumber the rows 0..8
res = pd.concat(objs=[df1, df2, df3], axis=0, ignore_index=True)

merge横向合并

# Two frames that share a 'key' column with identical values K0..K3
left = pd.DataFrame({
    'key': [f'K{i}' for i in range(4)],
    'A': [f'A{i}' for i in range(4)],
    'B': [f'B{i}' for i in range(4)],
})
right = pd.DataFrame({
    'key': [f'K{i}' for i in range(4)],
    'C': [f'C{i}' for i in range(4)],
    'D': [f'D{i}' for i in range(4)],
})

# Join side by side, matching rows on 'key'
res = left.merge(right, on='key')
print(res)

不同合并方式

# Two frames keyed on the pair (key1, key2); only some pairs occur in both
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': [f'A{i}' for i in range(4)],
    'B': [f'B{i}' for i in range(4)],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0'] * 4,
    'C': [f'C{i}' for i in range(4)],
    'D': [f'D{i}' for i in range(4)],
})

# inner: keep only key pairs present on both sides
res_inner = left.merge(right, on=['key1', 'key2'], how='inner')

# outer: keep every key pair; unmatched cells become NaN
res_outer = left.merge(right, on=['key1', 'key2'], how='outer')

# left: keep every row of the left frame
res_left = left.merge(right, on=['key1', 'key2'], how='left')

# right: keep every row of the right frame
res_right = left.merge(right, on=['key1', 'key2'], how='right')

添加数据

# Append rows vertically (pandas 1.4+: use pd.concat for this)
res = pd.concat(objs=[df1, df2], ignore_index=True)

# Append a Series as one extra row: turn it into a 1-row frame first
s1 = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))
s1_row = s1.to_frame().T
res = pd.concat(objs=[df1, s1_row], ignore_index=True)

数据导入导出

import pandas as pd

# Read a CSV file into a DataFrame
data = pd.read_csv('student.csv')
print(data)

# Save as pickle (faster, and keeps the column dtypes)
data.to_pickle('student.pickle')

# Load it back from pickle
# NOTE(review): unpickling can execute arbitrary code - only load files you trust
data = pd.read_pickle('student.pickle')

数据可视化

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Series plot: cumsum() returns a NEW Series, so the result must be kept;
# otherwise the raw noise is plotted instead of the cumulative sum.
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
data.plot()
plt.show()

# DataFrame plot: same rule - rebind to the cumulative sums before plotting
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list("ABCD")
)
data = data.cumsum()
data.plot()
plt.show()

# Scatter plots: draw the second series onto the axes returned by the first
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class1')
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class2', ax=ax)
plt.show()

性能优化建议

操作	推荐方式	避免方式
数据选择	`df.loc[]`, `df.iloc[]`	`df[][]` 链式索引
迭代	向量化操作	`for` 循环
添加数据	`pd.concat()`	`append()` 循环
缺失值处理	`fillna()`, `dropna()`	手动遍历

常见错误

SettingWithCopyWarning：用.loc[]或.iloc[]赋值，别用链式索引
链式索引：避免df[condition][column]，用df.loc[condition, column]
数据类型：注意自动类型推断可能带来的问题

与其他库配合

# Pandas + NumPy: a NumPy reduction applied to a two-column selection
ab_columns = df[['A', 'B']]
df['sum'] = np.sum(ab_columns, axis=1)

# Pandas + Matplotlib: bar chart of the frame, then add a title
df.plot(kind='bar')
plt.title('数据分析图表')
plt.show()

# Pandas + Scikit-learn: hold out 20% of the rows as a test set
from sklearn.model_selection import train_test_split
features, target = df[['A', 'B']], df['C']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

核心就是这些：数据结构的创建和查看、各种选择方法、缺失值处理、数据合并、导入导出和可视化。代码都是实际用过并验证过的。

后端开发Python