Compute Node (ComputeNode) + Compute Graph (ComputeGraph)
This is an automatic differentiation pipeline built on a compute graph. Its core consists of:
Compute node (ComputeNode): holds the operation type, forward-pass value, gradient value, and input nodes
Compute graph (ComputeGraph): manages all compute nodes and runs the forward and backward passes
Operation type (OpType): covers input, addition, subtraction, multiplication, division, matrix multiplication, and various activation functions
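As a rough picture of what these pieces look like, here is a minimal sketch of the node and graph declarations. The field and member names (op, value, grad, inputs, nodes) are assumptions based on the description above, not the exact definitions in the repository; Matrix_CU is the matrix type used by the kernels later in this post.
#include <vector>

enum class OpType { Input, Add, Sub, Mul, Div, MatMul, ReLU, Sigmoid };

struct ComputeNode {
    OpType op;                          // operation that produced this node
    Matrix_CU value;                    // forward-pass result
    Matrix_CU grad;                     // gradient filled in during the backward pass
    std::vector<ComputeNode*> inputs;   // nodes this one was computed from
    void zeroGrad();                    // reset grad to all zeros
};

struct ComputeGraph {
    std::vector<ComputeNode*> nodes;    // nodes stored in creation (topological) order
    // addInput / addMatMul / addReLU append nodes; forward() and backward() walk `nodes`
};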
The backward pass in the code follows the chain rule: for a composite function f(g(x)), the derivative is df/dx = df/dg * dg/dx. In deep learning this is extended to matrix form.
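As a concrete instance of the matrix form (a standard identity, not code from this repository): for a linear layer Y = XW with a scalar loss L,
Y = XW \;\Rightarrow\;
\frac{\partial L}{\partial W} = X^{\top}\,\frac{\partial L}{\partial Y},
\qquad
\frac{\partial L}{\partial X} = \frac{\partial L}{\partial Y}\,W^{\top}
These are exactly the identities the MatMul backward kernels below implement.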
For C = A + B, the gradient backpropagation is:
void addBackwardCUDA(const Matrix_CU& outGrad, Matrix_CU& inGradA, Matrix_CU& inGradB) {
    // Addition passes the output gradient straight through to both inputs
    const int size = outGrad.row * outGrad.col * sizeof(float);
    std::memcpy(inGradA.data, outGrad.data, size);
    std::memcpy(inGradB.data, outGrad.data, size);
}
This reflects the derivative of addition: ∂C/∂A = ∂C/∂B = 1, so the output gradient is passed directly through to the input gradients.
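Note that this function copies with std::memcpy, which assumes outGrad and the input gradients live in host memory. If the .data pointers referred to device memory instead, the same pass could use cudaMemcpy; a sketch under that assumption (not the repository's implementation):
// Sketch: the same gradient copy when the matrices live in device memory (an assumption).
void addBackwardDevice(const Matrix_CU& outGrad, Matrix_CU& inGradA, Matrix_CU& inGradB) {
    const size_t bytes = static_cast<size_t>(outGrad.row) * outGrad.col * sizeof(float);
    cudaMemcpy(inGradA.data, outGrad.data, bytes, cudaMemcpyDeviceToDevice);
    cudaMemcpy(inGradB.data, outGrad.data, bytes, cudaMemcpyDeviceToDevice);
}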
For C = A - B, the gradient backpropagation is:
void subBackwardCUDA(const Matrix_CU& outGrad, Matrix_CU& inGradA, Matrix_CU& inGradB) {
    // For A: copy the output gradient directly
    const int size = outGrad.row * outGrad.col * sizeof(float);
    std::memcpy(inGradA.data, outGrad.data, size);
    // For B: negate the output gradient on the device
    negGradKernel<<<numBlocks, blockSize>>>(d_outGrad, d_inGradB, totalElements);
}
This reflects the derivative of subtraction: ∂C/∂A = 1 and ∂C/∂B = -1, so A's gradient equals the output gradient while B's gradient is its negation.
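The negGradKernel referenced above is not shown in the excerpt; a minimal sketch of such an element-wise negation kernel (the exact signature in the repository may differ):
// Sketch: negate the incoming gradient element-wise, since dC/dB = -1.
__global__ void negGradKernel(const float* outGrad, float* inGradB, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        inGradB[idx] = -outGrad[idx];
    }
}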
For C = A * B (element-wise multiplication), the gradients are computed as follows. Differentiating the loss with respect to A goes through C first and then A, and likewise for B:
__global__ void mulBackwardKernel(const float* outGrad, const float* A, const float* B,
                                  float* inGradA, float* inGradB, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        inGradA[idx] = outGrad[idx] * B[idx];   // dC/dA = B
        inGradB[idx] = outGrad[idx] * A[idx];   // dC/dB = A
    }
}
Chain rule: A's gradient is the output gradient multiplied element-wise by B, and B's gradient is the output gradient multiplied element-wise by A.
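A typical 1-D launch of this kernel might look like the following (the block size and the d_* device-pointer names are assumptions for illustration):
// Sketch: launch mulBackwardKernel over `size` elements.
const int blockSize = 256;
const int numBlocks = (size + blockSize - 1) / blockSize;
mulBackwardKernel<<<numBlocks, blockSize>>>(d_outGrad, d_A, d_B, d_inGradA, d_inGradB, size);
cudaDeviceSynchronize();   // make sure gradients are ready before they are read back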
For matrix multiplication C = A * B, the gradient computation is more involved:
// Gradient of A
matmulBackwardAKernel<<<gridSizeA, blockSizeA>>>(d_outGrad, d_B, d_inGradA,
                                                 outGrad_rows, B_rows, B_cols);
// Gradient of B
matmulBackwardBKernel<<<gridSizeB, blockSizeB>>>(d_A, d_outGrad, d_inGradB,
                                                 A_rows, A_cols, outGrad_cols);
Suppose:
• A has shape m × n
• B has shape n × p
• C = A * B has shape m × p
• the loss function L depends on C, with upstream gradient dL/dC of shape m × p
By the rules of matrix differentiation:
dL/dA = dL/dC * B^T
dL/dB = A^T * dL/dC
Dimension check: dL/dC is m × p and B^T is p × n, so dL/dA is m × n, exactly the shape of A; likewise A^T is n × m and dL/dC is m × p, so dL/dB is n × p, the shape of B.
The concrete CUDA kernel implementations are:
// Gradient of A: dL/dA = dL/dC * B^T
__global__ void matmulBackwardAKernel(const float* outGrad, const float* B, float* inGradA,
                                      int outGrad_rows, int B_rows, int B_cols) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < outGrad_rows && col < B_rows) {
        float sum = 0.0f;
        for (int k = 0; k < B_cols; ++k) {
            sum += outGrad[row * B_cols + k] * B[col * B_cols + k];
        }
        inGradA[row * B_rows + col] = sum;
    }
}
// Gradient of B: dL/dB = dL/dC * dC/dB = A^T * dL/dC
__global__ void matmulBackwardBKernel(const float* A, const float* outGrad, float* inGradB,
                                      int A_rows, int A_cols, int outGrad_cols) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < A_cols && col < outGrad_cols) {
        float sum = 0.0f;
        for (int k = 0; k < A_rows; ++k) {
            sum += A[k * A_cols + row] * outGrad[k * outGrad_cols + col];
        }
        inGradB[row * outGrad_cols + col] = sum;
    }
}
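A plausible 2-D launch configuration for these kernels, matching the shapes of the two gradient matrices (a sketch; the actual gridSizeA/blockSizeA values in the repository may differ):
// Sketch: one thread per element of each gradient matrix.
dim3 blockSizeA(16, 16);
dim3 gridSizeA((B_rows + blockSizeA.x - 1) / blockSizeA.x,          // columns of dL/dA (= A_cols)
               (outGrad_rows + blockSizeA.y - 1) / blockSizeA.y);   // rows of dL/dA (= A_rows)
matmulBackwardAKernel<<<gridSizeA, blockSizeA>>>(d_outGrad, d_B, d_inGradA,
                                                 outGrad_rows, B_rows, B_cols);

dim3 blockSizeB(16, 16);
dim3 gridSizeB((outGrad_cols + blockSizeB.x - 1) / blockSizeB.x,    // columns of dL/dB (= B_cols)
               (A_cols + blockSizeB.y - 1) / blockSizeB.y);         // rows of dL/dB (= A_cols)
matmulBackwardBKernel<<<gridSizeB, blockSizeB>>>(d_A, d_outGrad, d_inGradB,
                                                 A_rows, A_cols, outGrad_cols);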
For the ReLU function:
__global__ void reluBackwardKernel(const float* outGrad, const float* input, float* inGrad, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        // ReLU derivative: 1 where the input is positive, 0 otherwise
        inGrad[idx] = (input[idx] > 0.0f) ? outGrad[idx] : 0.0f;
    }
}
For the Sigmoid function:
__global__ void sigmoidBackwardKernel(const float* outGrad, const float* output, float* inGrad, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        // The derivative of sigmoid is sigmoid(x) * (1 - sigmoid(x))
        inGrad[idx] = outGrad[idx] * output[idx] * (1.0f - output[idx]);
    }
}
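Note that this backward kernel reads the cached forward output (the output argument) rather than the raw input. A matching forward kernel would look like this sketch (not necessarily the repository's version):
// Sketch: forward sigmoid whose output is cached and reused by sigmoidBackwardKernel.
__global__ void sigmoidForwardKernel(const float* input, float* output, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx] = 1.0f / (1.0f + expf(-input[idx]));
    }
}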
These implementations reflect the derivatives of the activation functions:
ReLU: the derivative is 1 when the input > 0, and 0 otherwise
Sigmoid: the derivative is sigmoid(x) * (1 - sigmoid(x))
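The sigmoid identity follows from a short calculation:
\sigma(x) = \frac{1}{1 + e^{-x}}
\quad\Rightarrow\quad
\sigma'(x) = \frac{e^{-x}}{\left(1 + e^{-x}\right)^{2}}
           = \sigma(x)\,\bigl(1 - \sigma(x)\bigr)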
Backpropagation starts from the output node and proceeds in the reverse order of the compute graph:
void backward(ComputeNode* outputNode) {
    // Reset the gradients of all nodes
    for (ComputeNode* node : nodes) {
        node->zeroGrad();
    }
    // Initialize the output node's gradient to 1
    std::fill(outputNode->grad.data, outputNode->grad.data + outputNode->grad.row * outputNode->grad.col, 1.0f);
    // Backpropagate starting from the output node
    for (int i = nodes.size() - 1; i >= 0; --i) {
        ComputeNode* node = nodes[i];
        // Dispatch the backward pass for this node's operation type
        switch (node->op) {
            // backward pass for each operation
        }
    }
}
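To make the elided switch concrete, the per-node dispatch could look like the following sketch. The enumerator and helper names such as OpType::Add and matmulBackwardCUDA are assumptions; only addBackwardCUDA appears verbatim earlier in this post.
// Sketch of the dispatch inside backward(); names other than addBackwardCUDA are assumptions.
switch (node->op) {
    case OpType::Add:
        // both inputs receive a copy of this node's gradient
        addBackwardCUDA(node->grad, node->inputs[0]->grad, node->inputs[1]->grad);
        break;
    case OpType::MatMul:
        // dL/dA = dL/dC * B^T,  dL/dB = A^T * dL/dC
        matmulBackwardCUDA(node->grad,
                           node->inputs[0]->value, node->inputs[1]->value,
                           node->inputs[0]->grad,  node->inputs[1]->grad);
        break;
    default:
        break;   // Input nodes have nothing to propagate
}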
Key steps:
Initialize all gradients to 0
Set the output node's gradient to 1 (assuming the loss is the sum of the outputs); the output node, i.e. the value of the loss function, has gradient 1.0 with respect to itself because ∂L/∂L = 1
Traverse the nodes in the reverse order of the compute graph and run each node's backward pass
The main function builds a simple forward network:
ComputeNode* input = graph.addInput(input_rows, input_cols, batch_size);
ComputeNode* weight = graph.addInput(input_cols, output_cols);
ComputeNode* fc1 = graph.addMatMul(input, weight);   // fully connected layer
ComputeNode* output_node = fc1;
if (use_relu) {
    output_node = graph.addReLU(fc1);
}
This builds the compute graph: input -> matmul (with weight) -> (optional) relu -> output
The backward pass then proceeds as follows (a consolidated derivation is given after this list):
Output-layer initialization:
If ReLU is used, the gradient of the ReLU output node is set to 1, since with use_relu = true the graph becomes input -> matmul -> relu and the ReLU node is the output.
Otherwise, the gradient of the MatMul output node is set to 1.
ReLU gradient propagation (if used):
Compute the gradient of the fc1 node: fc1_grad = relu_grad * (input > 0 ? 1 : 0), i.e. by the ReLU derivative rule, only the gradient at positions where the input is positive is kept.
MatMul gradient propagation:
Compute the weight gradient: weight_grad = input^T * fc1_grad
Compute the input gradient: input_grad = fc1_grad * weight^T
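Putting the whole example together, with the loss taken as the sum of the output entries (as in the gradient initialization above), the quantities the program computes are:
Y = \mathrm{ReLU}(XW), \qquad L = \sum_{i,j} Y_{ij}
\;\Rightarrow\;
\frac{\partial L}{\partial Y} = \mathbf{1},\qquad
\frac{\partial L}{\partial (XW)} = \mathbf{1} \odot \mathbb{1}[XW > 0],\qquad
\frac{\partial L}{\partial W} = X^{\top}\frac{\partial L}{\partial (XW)},\qquad
\frac{\partial L}{\partial X} = \frac{\partial L}{\partial (XW)}\,W^{\top}
Since the random inputs and weights in this run are all positive, the ReLU mask is all ones; that is why every row of the printed input gradient is identical (each entry is a row sum of the weight matrix) and every column of the printed weight gradient is identical (each entry is the corresponding input column summed over the batch). The program output, including a CPU cross-check, is: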
====== Configuration ======
Input matrix: 100 x 100
Weight matrix: 100 x 50
Batch size: 32
Activation: ReLU
====== CUDA Results ======
Input first 5x5:
0.840188 0.394383 0.783099 0.79844 0.911647
0.482491 0.215825 0.950252 0.920128 0.14766
0.979434 0.743811 0.903366 0.983596 0.66688
0.297288 0.904932 0.909643 0.873979 0.498144
0.119111 0.589637 0.578635 0.529899 0.595045
Weight first 5x5:
0.212602 0.10695 0.462352 0.157342 0.110305
0.153127 0.651945 0.305722 0.0621202 0.965235
0.897866 0.783286 0.952307 0.0414321 0.72427
0.438603 0.141648 0.258162 0.285166 0.896459
0.0949283 0.838575 0.910116 0.679985 0.466453
Output first 5x5:
27.2103 28.0387 29.5829 26.7451 28.159
26.2911 25.4754 26.7194 25.0945 25.6666
25.6583 26.8631 29.3364 24.3494 27.4707
23.2694 24.5181 26.6447 22.6948 26.0994
23.6185 24.6267 26.4008 22.2588 23.9217
Input gradient first 5x5:
25.3451 24.5174 28.2764 27.9038 24.9504
25.3451 24.5174 28.2764 27.9038 24.9504
25.3451 24.5174 28.2764 27.9038 24.9504
25.3451 24.5174 28.2764 27.9038 24.9504
25.3451 24.5174 28.2764 27.9038 24.9504
Weight gradient first 5x5:
1594.88 1594.88 1594.88 1594.88 1594.88
1582.36 1582.36 1582.36 1582.36 1582.36
1608.85 1608.85 1608.85 1608.85 1608.85
1620.08 1620.08 1620.08 1620.08 1620.08
1592.11 1592.11 1592.11 1592.11 1592.11
====== CPU Verification ======
CPU Output first 5x5:
27.2103 28.0387 29.5829 26.7451 28.159
26.2911 25.4754 26.7194 25.0945 25.6666
25.6582 26.8631 29.3364 24.3494 27.4707
23.2694 24.5181 26.6447 22.6948 26.0994
23.6185 24.6267 26.4008 22.2588 23.9217
CPU Input gradient first 5x5:
25.3451 24.5174 28.2764 27.9038 24.9504
25.3451 24.5174 28.2764 27.9038 24.9504
25.3451 24.5174 28.2764 27.9038 24.9504
25.3451 24.5174 28.2764 27.9038 24.9504
25.3451 24.5174 28.2764 27.9038 24.9504
CPU Weight gradient first 5x5:
1594.88 1594.88 1594.88 1594.88 1594.88
1582.36 1582.36 1582.36 1582.36 1582.36
1608.85 1608.85 1608.85 1608.85 1608.85
1620.08 1620.08 1620.08 1620.08 1620.08
1592.11 1592.11 1592.11 1592.11 1592.11
====== Verification Results ======
Average output difference: 7.33018e-07
Average input gradient difference: 0
Average weight gradient difference: 0
====== Performance Comparison ======
CUDA computation time: 3.0699 ms
CPU computation time: 285.632 ms
Speedup: 93.0426x
Full code:
https://github.com/AllenZYJ/Edge-Computing-Engine/blob/master/cuda_mat/mat_grad.cu