A Deep Dive into Tensor Automatic Differentiation
©Author | Qingchuan
Affiliation | PhD student, Shanghai Jiao Tong University
Research focus | Federated learning, device-cloud collaborative inference
Backpropagation
The Chain Rule
import numpy as np

class Tensor:
    def __init__(self, values, requires_grad=False, dependency=None):
        self._values = np.array(values)
        self.shape = self.values.shape
        self.grad = None
        if requires_grad:
            self.zero_grad()
        self.requires_grad = requires_grad
        if dependency is None:
            dependency = []
        self.dependency = dependency

    @property
    def values(self):
        return self._values

    @values.setter
    def values(self, new_values):
        # overwriting the values invalidates any previously computed gradient
        self._values = np.array(new_values)
        self.grad = None
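A quick sanity check of the constructor and the values setter (a minimal sketch; it assumes the zero_grad method defined just below):

t = Tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)
print(t.shape)  # (2, 2)
print(t.grad)   # a (2, 2) array of zeros, allocated by zero_grad()
t.values = np.zeros((2, 2))
print(t.grad)   # None: assigning new values resets the gradient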
class Tensor:
    # ...
    def zero_grad(self):
        self.grad = np.zeros(self.shape)
4.2 Implementing Backpropagation
class Tensor:
    # ...
    def backward(self, grad=None):
        assert self.requires_grad, "Call backward() on a non-requires-grad tensor."
        assert not (grad is None and self.values.size > 1), \
            "grad can be implicitly created only for scalar outputs"
        grad = 1.0 if grad is None else grad
        grad = np.array(grad)
        # accumulate the upstream gradient, then propagate it recursively
        self.grad += grad
        for dep in self.dependency:
            grad_for_dep = dep["grad_fn"](grad)
            dep["tensor"].backward(grad_for_dep)
RuntimeError: grad can be implicitly created only for scalar outputs
# ...
for dep in self.dependency:
    grad_for_dep = dep["grad_fn"](grad)
    dep["tensor"].backward(grad_for_dep)
def as_tensor(obj):
    if not isinstance(obj, Tensor):
        obj = Tensor(obj)
    return obj
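Non-Tensor operands get wrapped so that every operator can assume Tensor inputs; existing Tensors pass through unchanged (a minimal sketch):

a = as_tensor([1.0, 2.0])  # wrapped into a Tensor (requires_grad=False)
b = as_tensor(a)           # already a Tensor, returned as-is
print(a is b)              # True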
class Tensor:
    # ...
    def __matmul__(self, other):
        # 0. make sure other is a Tensor
        other = as_tensor(other)
        # 1. calculate the forward values
        values = self.values @ other.values
        # 2. the output requires grad if either operand does
        requires_grad = self.requires_grad or other.requires_grad
        # 3. build the dependency list
        dependency = []
        if self.requires_grad:
            def grad_fn1(grad):
                pass  # TODO HERE
            dependency.append(dict(tensor=self, grad_fn=grad_fn1))
        if other.requires_grad:
            def grad_fn2(grad):
                pass  # TODO HERE
            dependency.append(dict(tensor=other, grad_fn=grad_fn2))
        return Tensor(values, requires_grad, dependency)
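The two TODOs are the local gradients of matrix multiplication. For $Z = XY$ with upstream gradient $G = \partial L / \partial Z$, the standard results are

$$\frac{\partial L}{\partial X} = G\,Y^{\top}, \qquad \frac{\partial L}{\partial Y} = X^{\top} G,$$

which is exactly what grad_fn_1 and grad_fn_2 below compute.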
def grad_fn_1(grad):
    return np_matmul(grad, other.values.T)

def grad_fn_2(grad):
    return np_matmul(self.values.T, grad)
def build_binary_ops(this, that, values, grad_fn_1, grad_fn_2):
    requires_grad = this.requires_grad or that.requires_grad
    dependency = []
    if this.requires_grad:
        dependency.append(dict(tensor=this, grad_fn=grad_fn_1))
    if that.requires_grad:
        dependency.append(dict(tensor=that, grad_fn=grad_fn_2))
    return this.__class__(values, requires_grad, dependency)
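With this helper in place, __matmul__ shrinks to its operator-specific parts. A sketch of the refactored version (using np_matmul from the Caveats section; plain @ also works for 2-D inputs):

class Tensor:
    # ...
    def __matmul__(self, other):
        other = as_tensor(other)
        values = self.values @ other.values

        def grad_fn_1(grad):
            return np_matmul(grad, other.values.T)

        def grad_fn_2(grad):
            return np_matmul(self.values.T, grad)

        return build_binary_ops(self, other, values, grad_fn_1, grad_fn_2)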
5.2 The Mean Operator
class Tensor:
    # ...
    def reduce_mean(self, axis=None):
        values = self.values.mean(axis=axis)

        def grad_fn(grad):
            grad = grad / self.values.size * np.ones_like(self.values)
            return grad

        return build_unary_ops(self, values, grad_fn)
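Dividing by self.values.size is only correct when axis is None. A quick check of what the grad_fn above would compute for a mean over axis 0 of a (2, 3) array:

x = np.arange(6.0).reshape(2, 3)
grad = np.ones(3)                       # upstream gradient, shape (3,)
print(grad / x.size * np.ones_like(x))  # every entry is 1/6, but the true
                                        # local gradient of a mean over
                                        # axis 0 is 1/2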
class Tensor:
    # ...
    def reduce_mean(self, axis=None):
        values = self.values.mean(axis=axis)
        if axis is not None:
            repeat = self.values.shape[axis]

        def grad_fn(grad):
            if axis is None:
                grad = grad / self.values.size * np.ones_like(self.values)
            else:
                # restore the reduced axis, then tile the gradient along it
                grad = np.expand_dims(grad / repeat, axis)
                grad = np.repeat(grad, repeat, axis)
            return grad

        return build_unary_ops(self, values, grad_fn)
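A numeric check of the corrected operator (it assumes build_unary_ops, the unary analogue of build_binary_ops, is defined as elsewhere in the article):

x = Tensor(np.arange(6.0).reshape(2, 3), requires_grad=True)
y = x.reduce_mean(axis=0)  # shape (3,)
y.backward(np.ones(3))     # supply an upstream gradient of ones
print(x.grad)              # every entry is 0.5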
5.3 Broadcasting and the Addition Operator
>>> x = np.zeros((2, 3, 4))
>>> x
array([[[0., 0., 0., 0.],   # 3x4
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],   # 3x4
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]]])
# Adding a (1,) vector or a scalar to x adds it to every element
>>> x + np.random.RandomState(0).rand(1,)
array([[[0.55, 0.55, 0.55, 0.55],
        [0.55, 0.55, 0.55, 0.55],
        [0.55, 0.55, 0.55, 0.55]],

       [[0.55, 0.55, 0.55, 0.55],
        [0.55, 0.55, 0.55, 0.55],
        [0.55, 0.55, 0.55, 0.55]]])
# Adding a (4,) vector to x adds it elementwise to each of the 2x3
# subarrays of x of shape (4,)
>>> x + np.random.RandomState(0).rand(4,)
array([[[0.55, 0.72, 0.6 , 0.54],
        [0.55, 0.72, 0.6 , 0.54],
        [0.55, 0.72, 0.6 , 0.54]],

       [[0.55, 0.72, 0.6 , 0.54],
        [0.55, 0.72, 0.6 , 0.54],
        [0.55, 0.72, 0.6 , 0.54]]])
# Adding a (3, 1) array to x adds elementwise along axis 1 and repeats
# along the remaining axes
>>> x + np.random.RandomState(0).rand(3, 1)
array([[[0.55, 0.55, 0.55, 0.55],
        [0.72, 0.72, 0.72, 0.72],
        [0.6 , 0.6 , 0.6 , 0.6 ]],

       [[0.55, 0.55, 0.55, 0.55],
        [0.72, 0.72, 0.72, 0.72],
        [0.6 , 0.6 , 0.6 , 0.6 ]]])
# Adding a (3, 4) array to x adds it elementwise to each 3x4 subarray of x
>>> x + np.random.RandomState(0).rand(3, 4)
array([[[0.55, 0.72, 0.6 , 0.54],
        [0.42, 0.65, 0.44, 0.89],
        [0.96, 0.38, 0.79, 0.53]],

       [[0.55, 0.72, 0.6 , 0.54],
        [0.42, 0.65, 0.44, 0.89],
        [0.96, 0.38, 0.79, 0.53]]])
# Adding a (2, 3, 4) array to x adds elementwise at matching positions
>>> x + np.random.RandomState(0).rand(2, 3, 4)
array([[[0.55, 0.72, 0.6 , 0.54],
        [0.42, 0.65, 0.44, 0.89],
        [0.96, 0.38, 0.79, 0.53]],

       [[0.57, 0.93, 0.07, 0.09],
        [0.02, 0.83, 0.78, 0.87],
        [0.98, 0.8 , 0.46, 0.78]]])
# Adding a (2,) vector to x raises a shape-mismatch error
>>> x + np.random.RandomState(0).rand(2,)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
ValueError: operands could not be broadcast together with shapes (2,3,4) (2,)
def __add__(ts1, ts2):
    ts2 = as_tensor(ts2)
    values = ts1.values + ts2.values
    # ...
>>> y = np.random.RandomState(0).rand(3, 4)
>>> y
array([[0.55, 0.72, 0.6 , 0.54],
       [0.42, 0.65, 0.44, 0.89],
       [0.96, 0.38, 0.79, 0.53]])
>>> y = np.expand_dims(y, 0)
>>> y
array([[[0.55, 0.72, 0.6 , 0.54],
        [0.42, 0.65, 0.44, 0.89],
        [0.96, 0.38, 0.79, 0.53]]])
>>> y = np.repeat(y, 2, axis=0)
>>> y
array([[[0.55, 0.72, 0.6 , 0.54],
        [0.42, 0.65, 0.44, 0.89],
        [0.96, 0.38, 0.79, 0.53]],

       [[0.55, 0.72, 0.6 , 0.54],
        [0.42, 0.65, 0.44, 0.89],
        [0.96, 0.38, 0.79, 0.53]]])
def grad_fn_ts1(grad):
    # handle broadcasting (5, 3) + (3,) -> (5, 3)
    for _ in range(grad.ndim - ts1.values.ndim):
        grad = grad.sum(axis=0)
    # handle broadcasting (5, 3) + (1, 3) -> (5, 3)
    for i, dim in enumerate(ts1.shape):
        if dim == 1:
            grad = grad.sum(axis=i, keepdims=True)
    return grad
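The same two-step reduction handles any broadcast pattern: sum out the prepended axes, then sum (keeping dims) over axes that were stretched from size 1. A standalone sketch, with the helper name reduce_broadcast made up here purely for illustration:

def reduce_broadcast(grad, shape):
    # 1. sum out the leading axes that broadcasting prepended
    for _ in range(grad.ndim - len(shape)):
        grad = grad.sum(axis=0)
    # 2. sum over axes of size 1 that broadcasting stretched
    for i, dim in enumerate(shape):
        if dim == 1:
            grad = grad.sum(axis=i, keepdims=True)
    return grad

print(reduce_broadcast(np.ones((5, 3)), (3,)))       # [5. 5. 5.]
print(reduce_broadcast(np.ones((2, 3, 4)), (3, 1)))  # [[8.] [8.] [8.]]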
5.4 User-Defined Operators
# unary operation
def unary_operation(operand, *args, **kwargs):
    # forward
    values = unary_operation_forward(operand)

    # backward
    def grad_fn(grad):
        # grad = ...
        return grad

    return build_unary_ops(operand, values, grad_fn)

# binary operation (n-ary operations follow the same pattern)
def binary_operation(operand_1, operand_2, *args, **kwargs):
    # forward
    values = binary_operation_forward(operand_1, operand_2)

    # backward
    def grad_fn_1(grad):
        # grad = ...
        return grad

    def grad_fn_2(grad):
        # grad = ...
        return grad

    return build_binary_ops(
        operand_1, operand_2, values, grad_fn_1, grad_fn_2)
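As a concrete instance of the unary template, here is a sketch of an exp operator (not part of the original article; it assumes build_unary_ops is available):

def exp(operand):
    # forward: elementwise exponential
    values = np.exp(operand.values)

    # backward: d(exp(x))/dx = exp(x), so the forward values are reused
    def grad_fn(grad):
        return grad * values

    return build_unary_ops(operand, values, grad_fn)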
Caveats
def np_matmul(arr1, arr2):
    # when both inputs are 1-D, promote them to a column and a row vector
    # so the product is an outer product rather than a scalar
    if arr1.ndim == 1 and arr2.ndim == 1:
        arr1 = np.mat(arr1).T
        arr2 = np.mat(arr2)
    return arr1 @ arr2
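A quick check of why the special case matters (np.mat is a legacy NumPy class, kept here to match the code above):

a = np.array([1.0, 2.0])
b = np.array([3.0, 4.0])
print(a @ b)            # 11.0: plain @ on two 1-D arrays is an inner product
print(np_matmul(a, b))  # matrix([[3., 4.], [6., 8.]]): the outer product
                        # that the matmul backward pass needs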
Appendix: Full Code