matrix_mul_benchmark.cu — 513 lines | CUDA source file
// matrix_mul_benchmark.cu
//
// Matrix multiplication benchmark comparing:
//   Part 1: CPU (OpenMP) vs GPU (cuBLAS FP32)
//   Part 2: Naive CUDA kernel vs tiled shared-memory kernel vs
//           cuBLAS on CUDA Cores (FP32) / Tensor Cores (TF32, FP16)
//
// Build example:
//   nvcc -O3 -Xcompiler -fopenmp matrix_mul_benchmark.cu -lcublas -o bench
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cublas_v2.h>
#include <omp.h>

// ============== Configuration ==============
int SIZES[] = {1024, 2048, 4096, 8192, 16384};
#define NUM_SIZES (sizeof(SIZES) / sizeof(SIZES[0]))
#define TILE_SIZE 32

// ============== CPU Matrix Multiplication (OpenMP) ==============
// C = A * B for square N x N row-major matrices.
// The (i, j) loops are collapsed so OpenMP can balance work across threads.
void matrixMulCPU(float *A, float *B, float *C, int N, int num_threads) {
    #pragma omp parallel for num_threads(num_threads) collapse(2)
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++) {
                sum += A[i * N + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
}

// ============== 1. Naive Implementation (CUDA Core) ==============
// One thread computes one element of C; every operand is read from global
// memory N times (no data reuse). Expects a 2D launch covering N x N.
__global__ void matrixMulNaive(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < N && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < N; k++) {
            sum += A[row * N + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}

// ============== 2. Tiled Implementation (Shared Memory, CUDA Core) ==============
// Classic tiled GEMM: each block stages a TILE_SIZE x TILE_SIZE tile of A and
// B in shared memory, so each global element is loaded once per tile instead
// of once per output element. Expects blockDim == (TILE_SIZE, TILE_SIZE).
// Out-of-range tile slots are zero-filled so edge blocks need no special case.
__global__ void matrixMulTiled(float *A, float *B, float *C, int N) {
    __shared__ float tileA[TILE_SIZE][TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][TILE_SIZE];
    int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x;
    float sum = 0.0f;
    for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; t++) {
        if (row < N && t * TILE_SIZE + threadIdx.x < N)
            tileA[threadIdx.y][threadIdx.x] = A[row * N + t * TILE_SIZE + threadIdx.x];
        else
            tileA[threadIdx.y][threadIdx.x] = 0.0f;
        if (col < N && t * TILE_SIZE + threadIdx.y < N)
            tileB[threadIdx.y][threadIdx.x] = B[(t * TILE_SIZE + threadIdx.y) * N + col];
        else
            tileB[threadIdx.y][threadIdx.x] = 0.0f;
        __syncthreads();  // tiles fully loaded before anyone reads them
        for (int k = 0; k < TILE_SIZE; k++) {
            sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
        }
        __syncthreads();  // everyone done reading before the next load pass
    }
    if (row < N && col < N) {
        C[row * N + col] = sum;
    }
}

// ============== Benchmark CPU ==============
// Times matrixMulCPU on N x N inputs filled with 1.0f.
// Returns elapsed milliseconds, or -1 if skipped (N > 4096 with skip_large)
// or if a host allocation fails.
float benchmarkCPU(int N, int num_threads, bool skip_large) {
    if (skip_large && N > 4096) return -1;  // Skip for large matrices (too slow)
    size_t size = (size_t)N * N * sizeof(float);
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    float *h_C = (float*)malloc(size);
    if (!h_A || !h_B || !h_C) {
        free(h_A); free(h_B); free(h_C);
        return -1;
    }
    // Initialize with simple pattern
    for (int i = 0; i < N * N; i++) {
        h_A[i] = 1.0f;
        h_B[i] = 1.0f;
    }
    double start = omp_get_wtime();
    matrixMulCPU(h_A, h_B, h_C, N, num_threads);
    double end = omp_get_wtime();
    free(h_A);
    free(h_B);
    free(h_C);
    return (float)((end - start) * 1000.0);  // Return milliseconds
}

// ============== Benchmark GPU (cuBLAS) ==============
// Times one cublasSgemm on N x N ones-matrices (kernel time only, via CUDA
// events; excludes host<->device transfer). A warmup call is made first so
// cuBLAS one-time setup is not timed. Returns ms, or -1 on allocation failure.
// Note: operands are passed as (B, A) because cuBLAS is column-major; swapping
// them computes the row-major product A*B into row-major C.
float benchmarkGPU(int N, cublasHandle_t handle) {
    size_t size = (size_t)N * N * sizeof(float);
    float *d_A, *d_B, *d_C;
    if (cudaMalloc(&d_A, size) != cudaSuccess) return -1;
    if (cudaMalloc(&d_B, size) != cudaSuccess) { cudaFree(d_A); return -1; }
    if (cudaMalloc(&d_C, size) != cudaSuccess) { cudaFree(d_A); cudaFree(d_B); return -1; }
    // Initialize with ones
    float *h_init = (float*)malloc(size);
    if (!h_init) {  // ~1 GB at N = 16384; host allocation can genuinely fail
        cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
        return -1;
    }
    for (int i = 0; i < N * N; i++) h_init[i] = 1.0f;
    cudaMemcpy(d_A, h_init, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_init, size, cudaMemcpyHostToDevice);
    free(h_init);
    const float alpha = 1.0f;
    const float beta = 0.0f;
    // Warmup
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                &alpha, d_B, N, d_A, N, &beta, d_C, N);
    cudaDeviceSynchronize();
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                &alpha, d_B, N, d_A, N, &beta, d_C, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return ms;
}

// ============== Benchmark Naive ==============
// Times one launch of matrixMulNaive on zero-filled inputs (values don't
// affect timing). Warmup launch first; returns ms, or -1 on allocation or
// launch failure (previously launch errors were silently ignored and a
// near-zero time was reported).
float benchmarkNaive(int N, bool skip_large) {
    if (skip_large && N > 4096) return -1;
    size_t size = (size_t)N * N * sizeof(float);
    float *d_A, *d_B, *d_C;
    if (cudaMalloc(&d_A, size) != cudaSuccess) return -1;
    if (cudaMalloc(&d_B, size) != cudaSuccess) { cudaFree(d_A); return -1; }
    if (cudaMalloc(&d_C, size) != cudaSuccess) { cudaFree(d_A); cudaFree(d_B); return -1; }
    cudaMemset(d_A, 0, size);
    cudaMemset(d_B, 0, size);
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((N + 15) / 16, (N + 15) / 16);
    // Warmup (also surfaces launch-configuration errors before timing)
    matrixMulNaive<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
    if (cudaGetLastError() != cudaSuccess) {
        cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
        return -1;
    }
    cudaDeviceSynchronize();
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    matrixMulNaive<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return ms;
}

// ============== Benchmark Tiled ==============
// Same protocol as benchmarkNaive, but launching matrixMulTiled with
// TILE_SIZE x TILE_SIZE blocks. Returns ms, or -1 on failure.
float benchmarkTiled(int N, bool skip_large) {
    if (skip_large && N > 4096) return -1;
    size_t size = (size_t)N * N * sizeof(float);
    float *d_A, *d_B, *d_C;
    if (cudaMalloc(&d_A, size) != cudaSuccess) return -1;
    if (cudaMalloc(&d_B, size) != cudaSuccess) { cudaFree(d_A); return -1; }
    if (cudaMalloc(&d_C, size) != cudaSuccess) { cudaFree(d_A); cudaFree(d_B); return -1; }
    cudaMemset(d_A, 0, size);
    cudaMemset(d_B, 0, size);
    dim3 threadsPerBlock(TILE_SIZE, TILE_SIZE);
    dim3 numBlocks((N + TILE_SIZE - 1) / TILE_SIZE, (N + TILE_SIZE - 1) / TILE_SIZE);
    // Warmup (also surfaces launch-configuration errors before timing)
    matrixMulTiled<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
    if (cudaGetLastError() != cudaSuccess) {
        cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
        return -1;
    }
    cudaDeviceSynchronize();
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    matrixMulTiled<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return ms;
}

// ============== Benchmark cuBLAS FP32 ==============
// Times one cublasSgemm using whatever math mode the caller configured on
// `handle` (main passes a CUBLAS_DEFAULT_MATH handle here). Warmup first.
// Returns ms, or -1 on allocation failure.
float benchmarkCuBLAS_FP32(int N, cublasHandle_t handle) {
    size_t size = (size_t)N * N * sizeof(float);
    float *d_A, *d_B, *d_C;
    if (cudaMalloc(&d_A, size) != cudaSuccess) return -1;
    if (cudaMalloc(&d_B, size) != cudaSuccess) { cudaFree(d_A); return -1; }
    if (cudaMalloc(&d_C, size) != cudaSuccess) { cudaFree(d_A); cudaFree(d_B); return -1; }
    cudaMemset(d_A, 0, size);
    cudaMemset(d_B, 0, size);
    const float alpha = 1.0f;
    const float beta = 0.0f;
    // Warmup so one-time cuBLAS setup is not included in the timing
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                &alpha, d_B, N, d_A, N, &beta, d_C, N);
    cudaDeviceSynchronize();
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                &alpha, d_B, N, d_A, N, &beta, d_C, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return ms;
}

// ============== Benchmark cuBLAS TF32 ==============
// Identical call sequence to benchmarkCuBLAS_FP32 by design: the TF32 vs
// CUDA-Core difference comes entirely from the math mode main() sets on the
// handle (CUBLAS_TF32_TENSOR_OP_MATH here). Returns ms, or -1 on failure.
float benchmarkCuBLAS_TF32(int N, cublasHandle_t handle) {
    size_t size = (size_t)N * N * sizeof(float);
    float *d_A, *d_B, *d_C;
    if (cudaMalloc(&d_A, size) != cudaSuccess) return -1;
    if (cudaMalloc(&d_B, size) != cudaSuccess) { cudaFree(d_A); return -1; }
    if (cudaMalloc(&d_C, size) != cudaSuccess) { cudaFree(d_A); cudaFree(d_B); return -1; }
    cudaMemset(d_A, 0, size);
    cudaMemset(d_B, 0, size);
    const float alpha = 1.0f;
    const float beta = 0.0f;
    // Warmup so one-time cuBLAS setup is not included in the timing
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                &alpha, d_B, N, d_A, N, &beta, d_C, N);
    cudaDeviceSynchronize();
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                &alpha, d_B, N, d_A, N, &beta, d_C, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return ms;
}

// ============== Benchmark cuBLAS FP16 ==============
// Times cublasGemmEx with FP16 (half) inputs, FP32 accumulation/output, and
// the tensor-op algorithm selector — the maximum-throughput path. Inputs are
// zero-filled halves (timing is value-independent). Returns ms, or -1 on
// allocation failure.
float benchmarkCuBLAS_FP16(int N, cublasHandle_t handle) {
    size_t size_fp16 = (size_t)N * N * sizeof(half);
    size_t size_fp32 = (size_t)N * N * sizeof(float);
    half *d_A, *d_B;
    float *d_C;
    if (cudaMalloc(&d_A, size_fp16) != cudaSuccess) return -1;
    if (cudaMalloc(&d_B, size_fp16) != cudaSuccess) { cudaFree(d_A); return -1; }
    if (cudaMalloc(&d_C, size_fp32) != cudaSuccess) { cudaFree(d_A); cudaFree(d_B); return -1; }
    cudaMemset(d_A, 0, size_fp16);
    cudaMemset(d_B, 0, size_fp16);
    const float alpha = 1.0f;
    const float beta = 0.0f;
    // Warmup so one-time cuBLAS setup / algo selection is not timed
    cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                 &alpha, d_B, CUDA_R_16F, N, d_A, CUDA_R_16F, N,
                 &beta, d_C, CUDA_R_32F, N,
                 CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
    cudaDeviceSynchronize();
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                 &alpha, d_B, CUDA_R_16F, N, d_A, CUDA_R_16F, N,
                 &beta, d_C, CUDA_R_32F, N,
                 CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return ms;
}

// ============== Main ==============
// Runs both benchmark suites and prints formatted result tables.
int main() {
    // Get CPU thread count
    int num_threads = omp_get_max_threads();
    // Print GPU info
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    // ==================== Part 1: CPU vs GPU Comparison ====================
    printf("+==============================================================================+\n");
    printf("|            Matrix Multiplication Benchmark: CPU vs GPU                       |\n");
    printf("+==============================================================================+\n\n");
    printf("GPU: %s\n", prop.name);
    // NOTE(review): 128 cores/SM assumed — correct for many recent consumer
    // architectures but not all; estimate only.
    printf("CUDA Cores: %d\n", prop.multiProcessorCount * 128);
    printf("GPU Memory: %.0f MB\n", prop.totalGlobalMem / (1024.0 * 1024.0));
    printf("CPU Threads: %d (auto-detected)\n\n", num_threads);
    // Initialize cuBLAS for GPU benchmark
    cublasHandle_t handle_gpu;
    cublasCreate(&handle_gpu);
    printf("+----------+--------------+--------------+--------------+----------------------+\n");
    printf("|  Size    |   GPU (ms)   |   CPU (ms)   |   Speedup    |    GFLOPS (GPU)      |\n");
    printf("+----------+--------------+--------------+--------------+----------------------+\n");
    for (size_t i = 0; i < NUM_SIZES; i++) {  // size_t: NUM_SIZES is sizeof-derived
        int N = SIZES[i];
        float time_gpu = benchmarkGPU(N, handle_gpu);
        float time_cpu = benchmarkCPU(N, num_threads, true);
        if (time_gpu < 0) {
            printf("| %5d    | GPU memory allocation failed\n", N);
            continue;
        }
        double ops = 2.0 * (double)N * N * N;  // N^3 multiply-adds
        double gflops_gpu = ops / (time_gpu * 1e6);
        if (time_cpu > 0) {
            float speedup = time_cpu / time_gpu;
            printf("| %5d    | %12.2f | %12.2f | %10.1fx  | %20.2f |\n",
                   N, time_gpu, time_cpu, speedup, gflops_gpu);
        } else {
            printf("| %5d    | %12.2f |   (skipped)  |      -       | %20.2f |\n",
                   N, time_gpu, gflops_gpu);
        }
    }
    printf("+----------+--------------+--------------+--------------+----------------------+\n");
    printf("\nNotes:\n");
    printf("- GPU time = kernel execution only (excludes data transfer)\n");
    printf("- CPU uses %d threads\n", num_threads);
    printf("- CPU benchmark skipped for N > 4096 (too slow)\n");
    printf("- GFLOPS = Giga Floating Point Operations Per Second\n");
    printf("- Matrix multiply: 2*N^3 operations\n\n");
    cublasDestroy(handle_gpu);
    // ==================== Part 2: CUDA Core vs Tensor Core ====================
    printf("\n");
    printf("+==============================================================================+\n");
    printf("|     Matrix Multiplication: CUDA Core vs Tensor Core Comparison              |\n");
    printf("|     (Memory released after each test to support larger matrices)            |\n");
    printf("+==============================================================================+\n\n");
    printf("GPU: %s\n", prop.name);
    printf("CUDA Cores: %d\n", prop.multiProcessorCount * 128);
    printf("Tensor Cores: %d (estimated)\n", prop.multiProcessorCount * 4);
    printf("GPU Memory: %.0f MB\n", prop.totalGlobalMem / (1024.0 * 1024.0));
    printf("Compute Capability: %d.%d\n\n", prop.major, prop.minor);
    // Initialize cuBLAS handles — one per math mode so each benchmark helper
    // stays mode-agnostic
    cublasHandle_t handle_cuda_core;
    cublasHandle_t handle_tensor_tf32;
    cublasHandle_t handle_tensor_fp16;
    cublasCreate(&handle_cuda_core);
    cublasCreate(&handle_tensor_tf32);
    cublasCreate(&handle_tensor_fp16);
    // Set math modes
    cublasSetMathMode(handle_cuda_core, CUBLAS_DEFAULT_MATH);
    cublasSetMathMode(handle_tensor_tf32, CUBLAS_TF32_TENSOR_OP_MATH);
    cublasSetMathMode(handle_tensor_fp16, CUBLAS_DEFAULT_MATH);
    printf("Math Modes:\n");
    printf("- CUDA Core:    CUBLAS_DEFAULT_MATH (FP32 input, CUDA Cores only)\n");
    printf("- TensorCore TF32: CUBLAS_TF32_TENSOR_OP_MATH (FP32 input, TF32 compute)\n");
    printf("- TensorCore FP16: cublasGemmEx (FP16 input, FP32 output, max throughput)\n\n");
    printf("Memory per matrix:\n");
    for (size_t i = 0; i < NUM_SIZES; i++) {
        int N = SIZES[i];
        printf("  %5d x %5d: FP32 = %4zu MB, FP16 = %4zu MB\n",
               N, N, (size_t)N*N*4/(1024*1024), (size_t)N*N*2/(1024*1024));
    }
    printf("\n");
    printf("+-------+----------+----------+----------+----------+----------+----------+-----------+\n");
    printf("| Size  | Naive    | Tiled    | CUDA Core| TC TF32  | TC FP16  | Speedup  | FP16      |\n");
    printf("|       | (ms)     | (ms)     | (ms)     | (ms)     | (ms)     | FP16/CUDA| GFLOPS    |\n");
    printf("+-------+----------+----------+----------+----------+----------+----------+-----------+\n");
    for (size_t i = 0; i < NUM_SIZES; i++) {
        int N = SIZES[i];
        // Test each method separately, releasing memory after each
        float time_naive = benchmarkNaive(N, true);
        float time_tiled = benchmarkTiled(N, true);
        float time_cublas_cuda = benchmarkCuBLAS_FP32(N, handle_cuda_core);
        float time_cublas_tf32 = benchmarkCuBLAS_TF32(N, handle_tensor_tf32);
        float time_cublas_fp16 = benchmarkCuBLAS_FP16(N, handle_tensor_fp16);
        // Check if FP16 test succeeded
        if (time_cublas_fp16 < 0) {
            printf("| %5d | GPU memory allocation failed (need more VRAM)\n", N);
            continue;
        }
        // Calculate GFLOPS for FP16
        double ops = 2.0 * (double)N * N * N;
        double gflops_fp16 = ops / (time_cublas_fp16 * 1e6);
        // Speedup calculation
        float speedup_fp16 = (time_cublas_cuda > 0) ? time_cublas_cuda / time_cublas_fp16 : 0;
        // Print results
        if (time_naive > 0 && time_tiled > 0) {
            printf("| %5d | %8.2f | %8.2f | %8.2f | %8.2f | %8.2f | %8.2fx | %9.1f |\n",
                   N, time_naive, time_tiled, time_cublas_cuda, time_cublas_tf32,
                   time_cublas_fp16, speedup_fp16, gflops_fp16);
        } else {
            // Large matrix - skip Naive and Tiled (too slow)
            printf("| %5d |  skipped |  skipped | %8.2f | %8.2f | %8.2f | %8.2fx | %9.1f |\n",
                   N, time_cublas_cuda, time_cublas_tf32,
                   time_cublas_fp16, speedup_fp16, gflops_fp16);
        }
    }
    printf("+-------+----------+----------+----------+----------+----------+----------+-----------+\n");
    printf("\n");
    printf("+==============================================================================+\n");
    printf("|                              Legend                                          |\n");
    printf("+==============================================================================+\n");
    printf("| Naive      : Basic CUDA kernel, each thread computes one element            |\n");
    printf("| Tiled      : CUDA kernel with shared memory, tile size = %d x %d             |\n", TILE_SIZE, TILE_SIZE);
    printf("| CUDA Core  : cuBLAS FP32 with CUBLAS_DEFAULT_MATH (CUDA Cores only)         |\n");
    printf("| TC TF32    : cuBLAS FP32 with CUBLAS_TF32_TENSOR_OP_MATH (Tensor Cores)     |\n");
    printf("| TC FP16    : cuBLAS FP16 input with cublasGemmEx (Tensor Cores, max speed)  |\n");
    printf("| Speedup    : CUDA Core time / TC FP16 time                                  |\n");
    printf("| FP16 GFLOPS: Performance based on TC FP16 time                              |\n");
    printf("+==============================================================================+\n");
    printf("\nNote: Memory is released after each test to support larger matrices.\n");
    printf("      Naive and Tiled are skipped for N > 4096 (too slow).\n");
    // Cleanup cuBLAS
    cublasDestroy(handle_cuda_core);
    cublasDestroy(handle_tensor_tf32);
    cublasDestroy(handle_tensor_fp16);
    return 0;
}