TensorBoard使用笔记:训练过程可视化踩坑记录

TensorBoard使用笔记

记录TensorBoard的安装配置和常用功能,TensorFlow和PyTorch都能用。

安装与启动

1
2
3
4
5
6
7
8
# 使用conda环境
conda activate pythonProject2

# 安装TensorBoard
pip install tensorboard -i https://pypi.tuna.tsinghua.edu.cn/simple

# 如果需要PyTorch版本
pip install torch torchvision -i https://pypi.tuna.tsinghua.edu.cn/simple

启动TensorBoard:

1
2
3
4
5
6
7
# 指定日志目录
tensorboard --logdir f:\logs

# 指定端口
tensorboard --logdir f:\logs --port 6006

# 浏览器打开 http://localhost:6006/

TensorFlow中的使用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import tensorflow as tf
import os

# Work around the duplicate-OpenMP-runtime crash on some setups (optional).
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Load MNIST and scale pixel values into [0, 1].
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train, X_test = X_train / 255.0, X_test / 255.0

# A small fully-connected classifier: flatten -> dense -> dropout -> softmax.
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# The TensorBoard callback writes loss/metric event files to log_dir.
tf_callback = tf.keras.callbacks.TensorBoard(log_dir="f:/logs")

# Train with TensorBoard logging enabled via the callback.
model.fit(X_train, y_train, epochs=5, callbacks=[tf_callback])

PyTorch中的使用

基础用法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import os

# Work around the duplicate-OpenMP-runtime crash on some setups (optional).
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# SummaryWriter is a context manager: close() is called automatically
# when the `with` block exits.
with SummaryWriter(log_dir="f:/logs") as writer:
    # Record one loss value and one accuracy value at global step 0.
    writer.add_scalar('training/loss', 0.5, 0)
    writer.add_scalar('training/accuracy', 0.9, 0)

单条曲线(Scalar)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import os

# Work around the duplicate-OpenMP-runtime crash on some setups (optional).
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

writer = SummaryWriter(log_dir="f:/logs")

# Record two curves over 100 steps: a linear one and an exponential one.
# (Loop body must be indented — the original snippet had lost its
# indentation and would raise IndentationError.)
for x in range(1, 101):
    writer.add_scalar('y = 2x', x * 2, x)
    writer.add_scalar('y = pow(2, x)', 2 ** x, x)

writer.close()

add_scalar参数

  • tag:数据标识符
  • scalar_value:要保存的数值
  • global_step:训练步数
  • walltime:时间戳(可选)

多条曲线对比(Scalars)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import os

# Work around the duplicate-OpenMP-runtime crash on some setups (optional).
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

writer = SummaryWriter(log_dir="f:/logs")

# add_scalars logs several related curves under one main tag so they can
# be compared in a single TensorBoard chart. (Loop body re-indented —
# the original snippet had lost its indentation and would not parse.)
r = 5
for x in range(1, 101):
    writer.add_scalars('run_14h', {
        'xsinx': x * np.sin(x / r),
        'xcosx': x * np.cos(x / r),
        'xtanx': x * np.tan(x / r),
    }, x)

writer.close()

直方图(Histogram)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import os

# Work around the duplicate-OpenMP-runtime crash on some setups (optional).
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

writer = SummaryWriter(log_dir="f:/logs")

# Log a histogram of 1000 Gaussian samples at each step so TensorBoard
# can display how the distribution evolves over time. (Loop body
# re-indented — the original snippet had lost its indentation.)
for step in range(10):
    x = np.random.randn(1000)
    writer.add_histogram('distribution_of_gaussian', x, step)

writer.close()

应用场景:监控权重分布、检查梯度消失/爆炸、观察激活值分布。

图像可视化(Image)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import cv2 as cv
import torch
import os

# Work around the duplicate-OpenMP-runtime crash on some setups (optional).
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

writer = SummaryWriter(log_dir="f:/logs")

# Read the image; OpenCV returns an (H, W, C) array in BGR channel order.
# NOTE(review): cv.imread returns None when the file is missing — no check here.
img = cv.imread('image.jpg', cv.IMREAD_COLOR)
img = cv.cvtColor(img, cv.COLOR_BGR2RGB)

# Convert to the (C, H, W) tensor layout that add_image expects by default.
img = torch.tensor(img.transpose(2, 0, 1))

# Log the image under the 'sample_image' tag at global step 0.
writer.add_image('sample_image', img, 0)

writer.close()

注意:OpenCV读取的是(H, W, C)即HWC格式,TensorBoard需要(C, H, W)即CHW格式,要transpose转换。

图形可视化(Figure)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import matplotlib.pyplot as plt
import os

# Work around the duplicate-OpenMP-runtime crash on some setups (optional).
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

writer = SummaryWriter(log_dir="f:/logs")

# Build a matplotlib figure: one period-ish of a sine wave on [0, 10].
x = np.linspace(0, 10, 1000)
y = np.sin(x)

figure1 = plt.figure()
plt.plot(x, y, 'r-')
plt.title('Sine Wave')

# add_figure renders the matplotlib figure and logs it as an image.
writer.add_figure('my_figure', figure1, 0)

writer.close()

网络结构可视化(Graph)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import torch
import torch.nn as nn
import os

# Work around the duplicate-OpenMP-runtime crash on some setups (optional).
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

writer = SummaryWriter(log_dir="f:/logs")


# Simple 784 -> 512 -> 128 -> 10 fully-connected classifier.
# (Class and method bodies re-indented — the original snippet had lost
# its indentation and would not parse.)
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.Net = nn.Sequential(
            nn.Linear(784, 512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 10),
        )

    def forward(self, input):
        # Flatten (N, 1, 28, 28) images into (N, 784) before the linear stack.
        input = input.view(-1, 28 * 28)
        return self.Net(input)


model = MLP()

# A dummy batch of 32 MNIST-shaped inputs, used only to trace the graph.
input_tensor = torch.FloatTensor(np.random.rand(32, 1, 28, 28))

# add_graph runs one forward pass to record the network topology.
writer.add_graph(model, input_tensor)

writer.close()

嵌入向量可视化(Embedding)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import os

# Work around the duplicate-OpenMP-runtime crash on some setups (optional).
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

writer = SummaryWriter(log_dir="f:/logs")

# Synthetic high-dimensional data: 100 points, 50 features each, with a
# cyclic class label ('class_0' .. 'class_9') attached to every point.
mat = np.random.randn(100, 50)
metadata = [f'class_{i % 10}' for i in range(100)]

# add_embedding stores the matrix for the Projector tab, which reduces it
# to 2D/3D (PCA / t-SNE) and colors points by their metadata label.
writer.add_embedding(
mat,
metadata=metadata,
tag='embedding_viz'
)

writer.close()

参数说明

  • mat:特征矩阵,每行是一个数据点的特征向量
  • metadata:标签列表
  • label_img:对应图像(可选)
  • tag:嵌入名称

完整训练监控示例

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import os

# Work around the duplicate-OpenMP-runtime crash on some setups (optional).
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


# 784 -> 256 -> 64 -> 10 MLP used to demonstrate training monitoring.
# (Class and loop bodies re-indented — the original snippet had lost its
# indentation and would not parse.)
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 64)
        self.fc3 = nn.Linear(64, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.view(-1, 784)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# Synthetic data standing in for a real dataset.
X = torch.randn(1000, 784)
y = torch.randint(0, 10, (1000,))
dataset = TensorDataset(X, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

writer = SummaryWriter('runs/experiment_1')

# Record the network topology once, traced with a single dummy sample.
sample_input = torch.randn(1, 784)
writer.add_graph(model, sample_input)

for epoch in range(10):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(dataloader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Log every 10 mini-batches to keep the event file small.
        if i % 10 == 9:
            step = epoch * len(dataloader) + i
            avg_loss = running_loss / 10
            writer.add_scalar('training loss', avg_loss, step)
            writer.add_histogram('fc1 weights', model.fc1.weight, step)
            running_loss = 0.0

print('训练完成')
writer.close()

日志目录组织

1
2
3
4
5
6
7
logs/
├── experiment_1/
│ ├── events.out.tfevents.*
├── experiment_2/
│ ├── events.out.tfevents.*
└── baseline/
└── ...

多实验对比:

1
2
3
4
5
6
# 不同log_dir区分实验
writer_exp1 = SummaryWriter('logs/experiment_1')
writer_exp2 = SummaryWriter('logs/experiment_2')

# 启动TensorBoard时指定父目录
tensorboard --logdir logs/

性能优化

  1. 频繁写入优化:别每一步都写,每N步写一次
  2. 文件大小:定期清理旧日志
  3. 内存管理:训练结束调用writer.close()
1
2
3
# Write to TensorBoard only every 100 steps to limit event-file growth.
# (Body re-indented — the original snippet had lost its indentation.)
if step % 100 == 0:
    writer.add_scalar('loss', loss, step)

TensorBoard界面功能

| 标签页 | 功能 |
| --- | --- |
| Scalars | 查看损失、准确率等曲线 |
| Images | 查看训练过程中保存的图像 |
| Audio | 播放音频数据 |
| Graphs | 可视化模型结构图 |
| Distributions | 查看张量分布随时间变化 |
| Histograms | 查看张量直方图 |
| Embeddings | 可视化高维数据降维结果 |
| Text | 显示文本数据 |
| HParams | 超参数调优比较 |

常见问题

无法访问TensorBoard

1
2
# 指定host和port
tensorboard --logdir logs --host 0.0.0.0 --port 6006

数据不更新

  1. 检查日志路径是否正确
  2. 确保writer调用了flush()或close()
  3. 刷新TensorBoard页面

多实验对比

1
2
3
# 所有实验放在logs目录下不同子目录
tensorboard --logdir logs/
# TensorBoard会自动识别子目录并区分实验

核心功能就这些:标量曲线、直方图、图像、网络图、嵌入向量可视化。配合PyTorch或TensorFlow使用都很方便。

参考