Source code for clumsygrad.grad

"""
This module contains backward functions for various tensor operations in a computational graph.

All functions have a generic signature that takes a tensor and the any/tensor's gradient as inputs, 
and returns a tuple of gradients for each parent tensor.

Args:
    tensor: Result tensor from the forward operation
    grad: Gradient flowing back from the next layer
    
Returns:
    Tuple of gradients for each parent tensor
"""

from __future__ import annotations

from typing import Tuple

import numpy as np

from .tensor import Tensor

GradientTuple = Tuple[np.ndarray, ...]
"""
A tuple of gradients for each parent tensor.
"""

"""
Elemetary backward functions for tensor operations.
"""

[docs]def transpose_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for transpose operation. For :math:`z = x^T`: .. math:: \frac{\partial z}{\partial x} = \text{grad}^T """ return (grad.T,)
[docs]def add_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for addition operation. For :math:`z = x + y`: .. math:: \frac{\partial z}{\partial x} = 1, \quad \frac{\partial z}{\partial y} = 1 """ return (grad, grad)
[docs]def add_scalar_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for tensor + scalar operation. For :math:`z = x + c` where :math:`c` is scalar: .. math:: \frac{\partial z}{\partial x} = 1 """ return (grad,)
[docs]def sub_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for subtraction operation. For :math:`z = x - y`: .. math:: \frac{\partial z}{\partial x} = 1, \quad \frac{\partial z}{\partial y} = -1 """ return (grad, -grad)
[docs]def sub_scalar_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for tensor - scalar operation. For :math:`z = x - c` where :math:`c` is scalar: .. math:: \frac{\partial z}{\partial x} = 1 """ return (grad,)
[docs]def mul_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for element-wise multiplication. For :math:`z = x \odot y`: .. math:: \frac{\partial z}{\partial x} = y, \quad \frac{\partial z}{\partial y} = x """ x, y = tensor._parents return (grad * y._data, grad * x._data)
[docs]def mul_scalar_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for tensor * scalar operation. For :math:`z = x \cdot c` where :math:`c` is scalar: .. math:: \frac{\partial z}{\partial x} = c """ scalar = tensor._extra.get('scalar_value', 1) return (grad * scalar,)
[docs]def matmul_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for matrix multiplication. For :math:`Z = X \cdot Y`: .. math:: \frac{\partial Z}{\partial X} = \text{grad} \cdot Y^T, \quad \frac{\partial Z}{\partial Y} = X^T \cdot \text{grad} """ x, y = tensor._parents return (grad @ y._data.T, x._data.T @ grad)
[docs]def power_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for power operation. For :math:`z = x^n`: .. math:: \frac{\partial z}{\partial x} = n \cdot x^{n-1} """ x = tensor._parents[0] power = tensor._extra.get('power', 1) return (grad * (power * (x._data ** (power - 1))),)
[docs]def negate_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for negation operation. For :math:`z = -x`: .. math:: \frac{\partial z}{\partial x} = -1 """ return (-grad,)
[docs]def abs_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for absolute value operation. For :math:`z = |x|`: .. math:: \frac{\partial z}{\partial x} = \text{sign}(x) = \begin{cases} 1 & \text{if } x > 0 \\ -1 & \text{if } x < 0 \\ 0 & \text{if } x = 0 \end{cases} """ x = tensor._parents[0] return (grad * np.sign(x._data),)
[docs]def reshape_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for reshape operation. For :math:`z = \text{reshape}(x, \text{new\_shape})`: .. math:: \frac{\partial z}{\partial x} = \text{reshape}(\text{grad}, \text{original\_shape}) Since reshape only changes the view of the data without changing values, the gradient is simply reshaped back to the original shape. """ input_shape = tensor._extra.get('original_shape') return (grad.reshape(input_shape),)
""" Backward functions for reduction operations. """
[docs]def sum_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for sum operation. For :math:`z = \sum_{i \in \text{axis}} x_i`: .. math:: \frac{\partial z}{\partial x_i} = \begin{cases} 1 & \text{if } i \in \text{axis} \\ 0 & \text{otherwise} \end{cases} This function handles dimension reduction by broadcasting gradients back to the original shape. """ input_shape = tensor._extra.get('input_shape') axis = tensor._extra.get('axis') keepdims = tensor._extra.get('keepdims', False) if axis is not None and not keepdims: if isinstance(axis, int): axis = (axis,) elif axis is None: axis = tuple(range(len(input_shape))) expanded_grad = grad for ax in sorted(axis): expanded_grad = np.expand_dims(expanded_grad, axis=ax) else: expanded_grad = grad broadcasted_grad = np.broadcast_to(expanded_grad, input_shape) return (broadcasted_grad,)
[docs]def mean_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for mean operation. For :math:`z = \frac{1}{n}\sum_{i \in \text{axis}} x_i`: .. math:: \frac{\partial z}{\partial x_i} = \begin{cases} \frac{1}{n} & \text{if } i \in \text{axis} \\ 0 & \text{otherwise} \end{cases} where :math:`n` is the number of elements being averaged. """ input_shape = tensor._extra.get('input_shape') axis = tensor._extra.get('axis') keepdims = tensor._extra.get('keepdims', False) if axis is None: n = np.prod(input_shape) broadcasted_grad = np.broadcast_to(grad / n, input_shape) else: if isinstance(axis, int): axis = (axis,) n = np.prod([input_shape[ax] for ax in axis]) expanded_grad = grad / n if not keepdims: for ax in sorted(axis): expanded_grad = np.expand_dims(expanded_grad, axis=ax) broadcasted_grad = np.broadcast_to(expanded_grad, input_shape) return (broadcasted_grad,)
""" Backward functions for mathematical operations. """
[docs]def exp_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for exponential operation. For :math:`z = e^x`: .. math:: \frac{\partial z}{\partial x} = e^x """ return (grad * tensor._data,)
[docs]def log_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for natural logarithm operation. For :math:`z = \ln(x)`: .. math:: \frac{\partial z}{\partial x} = \frac{1}{x} """ x = tensor._parents[0] return (grad / x._data,)
[docs]def sqrt_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for square root operation. For :math:`z = \sqrt{x}`: .. math:: \frac{\partial z}{\partial x} = \frac{1}{2\sqrt{x}} """ x = tensor._parents[0] return (grad / (2 * np.sqrt(x._data)),)
[docs]def sin_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for sine operation. For :math:`z = \sin(x)`: .. math:: \frac{\partial z}{\partial x} = \cos(x) """ x = tensor._parents[0] return (grad * np.cos(x._data),)
[docs]def cos_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for cosine operation. For :math:`z = \cos(x)`: .. math:: \frac{\partial z}{\partial x} = -\sin(x) """ x = tensor._parents[0] return (grad * -np.sin(x._data),)
[docs]def tan_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for tangent operation. For :math:`z = \tan(x)`: .. math:: \frac{\partial z}{\partial x} = \sec^2(x) """ x = tensor._parents[0] return (grad * (1 / np.cos(x._data) ** 2),)
""" Backward functions for activation functions. """
[docs]def relu_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for ReLU activation. For :math:`z = \text{ReLU}(x) = \max(0, x)`: .. math:: \frac{\partial z}{\partial x} = \begin{cases} 1 & \text{if } x > 0 \\ 0 & \text{otherwise} \end{cases} """ return (grad * (tensor._data > 0).astype(np.float32),)
[docs]def sigmoid_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for Sigmoid activation. For :math:`z = \sigma(x) = \frac{1}{1 + e^{-x}}`: .. math:: \frac{\partial z}{\partial x} = \sigma(x) \cdot (1 - \sigma(x)) """ sigmoid_output = 1 / (1 + np.exp(-tensor._data)) return (grad * sigmoid_output * (1 - sigmoid_output),)
[docs]def tanh_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for hyperbolic tangent activation. For :math:`z = \tanh(x)`: .. math:: \frac{\partial z}{\partial x} = 1 - \tanh^2(x) """ tanh_output = tensor._data return (grad * (1 - tanh_output ** 2),)
[docs]def softmax_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for softmax activation. For :math:`z_i = \frac{e^{x_i}}{\sum_{j} e^{x_j}}`: .. math:: \frac{\partial z_i}{\partial x_j} = \begin{cases} z_i(1 - z_i) & \text{if } i = j \\ -z_i z_j & \text{if } i \neq j \end{cases} The gradient computation simplifies to: .. math:: \frac{\partial L}{\partial x} = z \odot \left(\text{grad} - \sum(z \odot \text{grad})\right) """ softmax_output = tensor._data axis = tensor._extra.get('axis', -1) grad_input = softmax_output * (grad - np.sum(softmax_output * grad, axis=axis, keepdims=True)) return (grad_input,)
""" Backward functions for loss functions. """
[docs]def mse_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for Mean Squared Error loss. For :math:`L = (\text{pred} - \text{target})^2`: .. math:: \frac{\partial L}{\partial \text{pred}} = 2(\text{pred} - \text{target}) \frac{\partial L}{\partial \text{target}} = -2(\text{pred} - \text{target}) """ pred, target = tensor._parents diff = pred._data - target._data return (2 * diff * grad, -2 * diff * grad)
[docs]def mae_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for Mean Absolute Error loss. For :math:`L = |\text{pred} - \text{target}|`: .. math:: \frac{\partial L}{\partial \text{pred}} = \frac{1}{n} \cdot \text{sign}(\text{pred} - \text{target}) \frac{\partial L}{\partial \text{target}} = -\frac{1}{n} \cdot \text{sign}(\text{pred} - \text{target}) where :math:`n` is the number of elements. """ pred, target = tensor._parents diff = pred._data - target._data n = pred._data.size sign_diff = np.sign(diff) return (grad * sign_diff / n, grad * (-sign_diff) / n)
""" Backward functions fro broadcasting operations. """
[docs]def add_broadcast_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for broadcasted addition operation. For :math:`z = x + y` where :math:`x` and :math:`y` have different but broadcastable shapes: .. math:: \frac{\partial z}{\partial x} = \text{reduce}(\text{grad}, \text{left\_shape}) \frac{\partial z}{\partial y} = \text{reduce}(\text{grad}, \text{right\_shape}) The gradients are reduced (summed) along broadcasted dimensions and reshaped to match the original tensor shapes. """ left_shape = tensor._extra.get('left_shape') right_shape = tensor._extra.get('right_shape') left_grad = _reduce_gradient_to_shape(grad, left_shape) right_grad = _reduce_gradient_to_shape(grad, right_shape) return (left_grad, right_grad)
[docs]def sub_broadcast_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for broadcasted subtraction operation. For :math:`z = x - y` where :math:`x` and :math:`y` have different but broadcastable shapes: .. math:: \frac{\partial z}{\partial x} = \text{reduce}(\text{grad}, \text{left\_shape}) \frac{\partial z}{\partial y} = \text{reduce}(-\text{grad}, \text{right\_shape}) The gradient for the first operand is positive, for the second operand is negative. Both are reduced to match the original tensor shapes. """ left_shape = tensor._extra.get('left_shape') right_shape = tensor._extra.get('right_shape') left_grad = _reduce_gradient_to_shape(grad, left_shape) right_grad = _reduce_gradient_to_shape(-grad, right_shape) return (left_grad, right_grad)
[docs]def mul_broadcast_backward(tensor: Tensor, grad: np.ndarray) -> GradientTuple: r""" Backward function for broadcasted element-wise multiplication. For :math:`z = x \odot y` where :math:`x` and :math:`y` have different but broadcastable shapes: .. math:: \frac{\partial z}{\partial x} = \text{reduce}(\text{grad} \odot \text{broadcast}(y), \text{left\_shape}) \frac{\partial z}{\partial y} = \text{reduce}(\text{grad} \odot \text{broadcast}(x), \text{right\_shape}) Each tensor's gradient is the gradient times the other tensor's values, then reduced to match the original tensor shapes. """ left_shape = tensor._extra.get('left_shape') right_shape = tensor._extra.get('right_shape') x, y = tensor._parents y_broadcasted = np.broadcast_to(y._data, grad.shape) x_broadcasted = np.broadcast_to(x._data, grad.shape) left_grad = _reduce_gradient_to_shape(grad * y_broadcasted, left_shape) right_grad = _reduce_gradient_to_shape(grad * x_broadcasted, right_shape) return (left_grad, right_grad)
def _reduce_gradient_to_shape(grad: np.ndarray, target_shape: tuple) -> np.ndarray: r""" Reduce gradient from broadcasted shape back to target shape. Args: grad: Gradient with broadcasted shape target_shape: Original tensor shape to reduce back to Returns: Gradient reduced to target shape """ if target_shape == (): return np.sum(grad) result = grad ndim_added = result.ndim - len(target_shape) if ndim_added > 0: axes_to_sum = tuple(range(ndim_added)) result = np.sum(result, axis=axes_to_sum) for i, (result_dim, target_dim) in enumerate(zip(result.shape, target_shape)): if target_dim == 1 and result_dim > 1: result = np.sum(result, axis=i, keepdims=True) result = result.reshape(target_shape) return result