Using and Debugging Mixed Precision Training in PyTorch
1
『Introduction』
# Creates a GradScaler once at the beginning of training.
scaler = GradScaler()

for epoch in epochs:
    for input, target in data:
        optimizer.zero_grad()
        output = model(input)
        loss = loss_fn(output, target)

        # Scales loss. Calls backward() on scaled loss to create scaled gradients.
        scaler.scale(loss).backward()

        # scaler.step() first unscales gradients of the optimizer's params.
        # If gradients don't contain infs/NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)

        # Updates the scale for next iteration.
        scaler.update()
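The snippet above only shows gradient scaling; in practice it is paired with autocast so the forward pass actually runs in fp16. A minimal sketch of the usual combination from the PyTorch AMP examples page (linked in the references), reusing the names above:

from torch.cuda.amp import GradScaler, autocast

scaler = GradScaler()

for epoch in epochs:
    for input, target in data:
        optimizer.zero_grad()

        # Run the forward pass and the loss under autocast so eligible ops execute in fp16.
        with autocast():
            output = model(input)
            loss = loss_fn(output, target)

        # backward() runs outside the autocast block, on the scaled loss.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()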
2
『NaN during the forward pass』
v_length = values.size(1)
values = values / v_length  # prevent fp16 overflow
KV = torch.einsum("nshd,nshv->nhdv", K, values)  # (S,D)' @ S,V
Z = 1 / (torch.einsum("nlhd,nhd->nlh", Q, K.sum(dim=1)) + self.eps)  # NaN appears here
queried_values = torch.einsum("nlhd,nhdv,nlh->nlhv", Q, KV, Z) * v_length

proj = torch.matmul(intrinsic, extrinsic)  # NaN when multiplying the camera intrinsic and extrinsic matrices
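The root cause is the narrow fp16 range: torch.finfo(torch.float16).max is 65504, so large products and reductions inside these einsum/matmul calls overflow to inf, and subsequent arithmetic on inf (inf - inf, inf * 0) turns into NaN. That is why the fix below forces these ops back to fp32. A tiny illustration, not part of the original code:

import torch

print(torch.finfo(torch.float16).max)  # 65504.0
a = torch.tensor(300.0, dtype=torch.float16)
print(a * a)          # tensor(inf, dtype=torch.float16): 90000 exceeds the fp16 range
print(1 / (a * a))    # tensor(0., dtype=torch.float16): 1/inf collapses to 0
print((a * a) * 0.0)  # tensor(nan, dtype=torch.float16): inf * 0 is NaN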
with torch.cuda.amp.autocast(enabled=False):  # keep AMP from autocasting inside this block
    # The upstream outputs are most likely fp16; cast back to fp32.
    Q = Q.to(torch.float32)
    K = K.to(torch.float32)
    values = values.to(torch.float32)

    v_length = values.size(1)
    values = values / v_length  # prevent fp16 overflow
    KV = torch.einsum("nshd,nshv->nhdv", K, values)  # (S,D)' @ S,V
    Z = 1 / (torch.einsum("nlhd,nhd->nlh", Q, K.sum(dim=1)) + self.eps)  # no overflow any more
    queried_values = torch.einsum("nlhd,nhdv,nlh->nlhv", Q, KV, Z) * v_length

with torch.cuda.amp.autocast(enabled=False):  # keep AMP from autocasting inside this block
    proj = torch.matmul(intrinsic.to(torch.float32), extrinsic.to(torch.float32))
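To locate which module first emits NaN/Inf during the forward pass (and therefore where such an autocast(enabled=False) guard belongs), one option is to attach forward hooks to every submodule. A minimal sketch, assuming model is the network being debugged; add_nan_hooks and the printed message are illustrative names, not part of the original code:

import torch

def add_nan_hooks(model):
    def make_hook(name):
        def hook(module, inputs, output):
            # Only check plain tensor outputs; tuples/dicts are skipped for brevity.
            if torch.is_tensor(output) and not torch.isfinite(output).all():
                print(f"NaN/Inf in output of {name} ({module.__class__.__name__})")
        return hook

    for name, module in model.named_modules():
        module.register_forward_hook(make_hook(name))

Call add_nan_hooks(model) once before training; the first module reported is where the fp16 forward starts to overflow.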
3
『Loss scale in free fall』
scaler = torch.cuda.amp.GradScaler()

# Log the current loss scale periodically; a steadily shrinking value means inf/NaN gradients keep appearing.
current_loss_scale = scaler.get_scale()
if step % log_iter == 0:
    print('scale:', current_loss_scale)
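GradScaler halves the scale (backoff_factor=0.5) every time it finds inf/NaN gradients and only doubles it (growth_factor=2.0) after growth_interval consecutive clean steps, so a scale in free fall means bad gradients on many steps in a row. The constructor exposes these knobs; the values below are simply the documented defaults written out:

scaler = torch.cuda.amp.GradScaler(
    init_scale=2.**16,     # starting loss scale (65536)
    growth_factor=2.0,     # multiply the scale by this after enough clean steps
    backoff_factor=0.5,    # multiply the scale by this whenever inf/NaN gradients appear
    growth_interval=2000,  # clean steps required before the scale grows again
)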
# Put each parameter in its own param group so the offending layer_name can be reported later.
param_groups = []
for n, p in model.named_parameters():
    if p.requires_grad:
        param_groups.append({'params': [p],
                             'lr': opt_args['lr'],
                             'weight_decay': opt_args['weight_decay'],
                             'layer_name': n})
optimizer = torch.optim.AdamW(param_groups, lr=opt_args['lr'], weight_decay=opt_args['weight_decay'])
...
# Creates a GradScaler once at the beginning of training.
scaler = GradScaler()

for epoch in epochs:
    for input, target in data:
        optimizer.zero_grad()
        output = model(input)
        loss = loss_fn(output, target)

        # Scales loss. Calls backward() on scaled loss to create scaled gradients.
        scaler.scale(loss).backward()

        # Check the (still scaled) gradients and report which layer produced inf/NaN.
        with torch.no_grad():
            for group in optimizer.param_groups:
                for param in group["params"]:
                    if param.grad is None:
                        continue
                    if param.grad.is_sparse:
                        if param.grad.dtype is torch.float16:
                            param.grad = param.grad.coalesce()
                        to_unscale = param.grad._values()
                    else:
                        to_unscale = param.grad
                    v = to_unscale.clone().abs().max()
                    if torch.isinf(v) or torch.isnan(v):
                        print('INF in', group['layer_name'], 'of step', global_step, '!!!')

        # scaler.step() first unscales gradients of the optimizer's params.
        # If gradients don't contain infs/NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)

        # Updates the scale for next iteration.
        scaler.update()
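If a layer keeps producing oversized gradients, the PyTorch AMP examples page (see References) also shows how to clip gradients under AMP: unscale them explicitly first, then clip, then step. A sketch of how that would slot into the loop above; max_norm=1.0 is an arbitrary example value:

scaler.scale(loss).backward()

# Unscale in place so the gradients are clipped at their true (unscaled) magnitude.
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

# scaler.step() detects that unscale_ was already called and will not unscale again.
scaler.step(optimizer)
scaler.update()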
References
https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html
https://pytorch.org/docs/stable/notes/amp_examples.html
https://github.com/NVIDIA/apex/blob/082f999a6e18a3d02306e27482cc7486dab71a50/apex/amp/lists/functional_overrides.py