#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cublas_v2.h>
#include <omp.h>

// ============== Configuration ==============
int SIZES[] = {1024, 2048, 4096, 8192, 16384};
#define NUM_SIZES (sizeof(SIZES) / sizeof(SIZES[0]))
#define TILE_SIZE 32
// CPU / naive / tiled paths are skipped above this N (they become too slow).
#define SLOW_PATH_MAX_N 4096

// ============== CPU Matrix Multiplication (OpenMP) ==============
// C = A * B for square N x N row-major matrices, parallelized over the
// (i, j) output elements with the requested number of OpenMP threads.
void matrixMulCPU(float *A, float *B, float *C, int N, int num_threads) {
    #pragma omp parallel for num_threads(num_threads) collapse(2)
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++) {
                sum += A[i * N + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
}

// ============== 1. Naive Implementation (CUDA Core) ==============
// One thread per output element; every operand is read from global memory.
// Expects a 2D launch whose grid covers at least N x N threads.
__global__ void matrixMulNaive(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < N && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < N; k++) {
            sum += A[row * N + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}

// ============== 2. Tiled Implementation (Shared Memory, CUDA Core) ==============
// Shared-memory tiling: each block stages one TILE_SIZE x TILE_SIZE tile of A
// and of B per step and accumulates partial dot products. Out-of-range tile
// entries are zero-filled so N need not be a multiple of TILE_SIZE.
// Requires blockDim == (TILE_SIZE, TILE_SIZE).
__global__ void matrixMulTiled(float *A, float *B, float *C, int N) {
    __shared__ float tileA[TILE_SIZE][TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][TILE_SIZE];

    int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x;
    float sum = 0.0f;

    for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; t++) {
        // Stage one tile of A and one tile of B (zero-padded at the edges).
        if (row < N && t * TILE_SIZE + threadIdx.x < N)
            tileA[threadIdx.y][threadIdx.x] = A[row * N + t * TILE_SIZE + threadIdx.x];
        else
            tileA[threadIdx.y][threadIdx.x] = 0.0f;

        if (col < N && t * TILE_SIZE + threadIdx.y < N)
            tileB[threadIdx.y][threadIdx.x] = B[(t * TILE_SIZE + threadIdx.y) * N + col];
        else
            tileB[threadIdx.y][threadIdx.x] = 0.0f;

        __syncthreads();  // tiles fully written before any thread reads them

        for (int k = 0; k < TILE_SIZE; k++) {
            sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
        }

        __syncthreads();  // all reads done before the next iteration overwrites
    }

    if (row < N && col < N) {
        C[row * N + col] = sum;
    }
}

// ============== Internal helpers ==============

// Allocate three device buffers of `bytes` each. Returns 0 on success.
// On failure, any partial allocations are released and -1 is returned.
static int allocDeviceTriple(float **d_A, float **d_B, float **d_C, size_t bytes) {
    *d_A = *d_B = *d_C = NULL;
    if (cudaMalloc((void**)d_A, bytes) != cudaSuccess) return -1;
    if (cudaMalloc((void**)d_B, bytes) != cudaSuccess) {
        cudaFree(*d_A);
        return -1;
    }
    if (cudaMalloc((void**)d_C, bytes) != cudaSuccess) {
        cudaFree(*d_A);
        cudaFree(*d_B);
        return -1;
    }
    return 0;
}

// Time one cublasSgemm (C = 1.0 * A * B + 0.0 * C, all N x N) on `handle`,
// returning elapsed milliseconds measured with CUDA events. A warmup call is
// issued first so one-time library/kernel setup cost is not included in the
// measurement. Whether Tensor Cores are used depends on the math mode the
// caller has set on `handle`.
static float timeSgemm(cublasHandle_t handle, float *d_A, float *d_B, float *d_C, int N) {
    const float alpha = 1.0f;
    const float beta = 0.0f;

    // Warmup (untimed).
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                &alpha, d_B, N, d_A, N, &beta, d_C, N);
    cudaDeviceSynchronize();

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                &alpha, d_B, N, d_A, N, &beta, d_C, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}

// ============== Benchmark CPU ==============
// Returns elapsed milliseconds of one CPU matmul, or -1 if skipped
// (N > SLOW_PATH_MAX_N with skip_large set) or host allocation failed.
float benchmarkCPU(int N, int num_threads, bool skip_large) {
    if (skip_large && N > SLOW_PATH_MAX_N) return -1;  // Skip for large matrices (too slow)

    size_t size = (size_t)N * N * sizeof(float);
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    float *h_C = (float*)malloc(size);
    if (!h_A || !h_B || !h_C) {
        free(h_A);
        free(h_B);
        free(h_C);
        return -1;
    }

    // Initialize with simple pattern
    for (int i = 0; i < N * N; i++) {
        h_A[i] = 1.0f;
        h_B[i] = 1.0f;
    }

    double start = omp_get_wtime();
    matrixMulCPU(h_A, h_B, h_C, N, num_threads);
    double end = omp_get_wtime();

    free(h_A);
    free(h_B);
    free(h_C);
    return (float)((end - start) * 1000.0);  // Return milliseconds
}

// ============== Benchmark GPU (cuBLAS) ==============
// Times one cuBLAS SGEMM (with warmup) on matrices filled with ones.
// Returns milliseconds, or -1 on allocation failure.
float benchmarkGPU(int N, cublasHandle_t handle) {
    size_t size = (size_t)N * N * sizeof(float);
    float *d_A, *d_B, *d_C;
    if (allocDeviceTriple(&d_A, &d_B, &d_C, size) != 0) return -1;

    // Initialize with ones
    float *h_init = (float*)malloc(size);
    if (!h_init) {  // host OOM: release device buffers instead of crashing
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
        return -1;
    }
    for (int i = 0; i < N * N; i++) h_init[i] = 1.0f;
    cudaMemcpy(d_A, h_init, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_init, size, cudaMemcpyHostToDevice);
    free(h_init);

    float ms = timeSgemm(handle, d_A, d_B, d_C, N);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return ms;
}

// ============== Benchmark Naive ==============
// Times one launch of matrixMulNaive (after an untimed warmup launch).
// Returns milliseconds, or -1 if skipped or allocation/launch failed.
float benchmarkNaive(int N, bool skip_large) {
    if (skip_large && N > SLOW_PATH_MAX_N) return -1;

    size_t size = (size_t)N * N * sizeof(float);
    float *d_A, *d_B, *d_C;
    if (allocDeviceTriple(&d_A, &d_B, &d_C, size) != 0) return -1;
    cudaMemset(d_A, 0, size);  // contents don't affect timing
    cudaMemset(d_B, 0, size);

    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((N + 15) / 16, (N + 15) / 16);

    // Warmup launch (untimed) + launch error check.
    matrixMulNaive<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
    if (cudaGetLastError() != cudaSuccess ||
        cudaDeviceSynchronize() != cudaSuccess) {
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
        return -1;
    }

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    matrixMulNaive<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return ms;
}

// ============== Benchmark Tiled ==============
// Times one launch of matrixMulTiled (after an untimed warmup launch).
// Returns milliseconds, or -1 if skipped or allocation/launch failed.
float benchmarkTiled(int N, bool skip_large) {
    if (skip_large && N > SLOW_PATH_MAX_N) return -1;

    size_t size = (size_t)N * N * sizeof(float);
    float *d_A, *d_B, *d_C;
    if (allocDeviceTriple(&d_A, &d_B, &d_C, size) != 0) return -1;
    cudaMemset(d_A, 0, size);
    cudaMemset(d_B, 0, size);

    dim3 threadsPerBlock(TILE_SIZE, TILE_SIZE);
    dim3 numBlocks((N + TILE_SIZE - 1) / TILE_SIZE, (N + TILE_SIZE - 1) / TILE_SIZE);

    // Warmup launch (untimed) + launch error check.
    matrixMulTiled<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
    if (cudaGetLastError() != cudaSuccess ||
        cudaDeviceSynchronize() != cudaSuccess) {
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
        return -1;
    }

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    matrixMulTiled<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return ms;
}

// ============== Benchmark cuBLAS FP32 ==============
// Times SGEMM on a handle configured (by the caller) with
// CUBLAS_DEFAULT_MATH, i.e. CUDA Cores only. Returns ms or -1.
float benchmarkCuBLAS_FP32(int N, cublasHandle_t handle) {
    size_t size = (size_t)N * N * sizeof(float);
    float *d_A, *d_B, *d_C;
    if (allocDeviceTriple(&d_A, &d_B, &d_C, size) != 0) return -1;
    cudaMemset(d_A, 0, size);
    cudaMemset(d_B, 0, size);

    float ms = timeSgemm(handle, d_A, d_B, d_C, N);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return ms;
}

// ============== Benchmark cuBLAS TF32 ==============
// Identical SGEMM call path to benchmarkCuBLAS_FP32; TF32 Tensor Core usage
// comes solely from the handle's math mode (CUBLAS_TF32_TENSOR_OP_MATH),
// which the caller sets in main(). Returns ms or -1.
float benchmarkCuBLAS_TF32(int N, cublasHandle_t handle) {
    size_t size = (size_t)N * N * sizeof(float);
    float *d_A, *d_B, *d_C;
    if (allocDeviceTriple(&d_A, &d_B, &d_C, size) != 0) return -1;
    cudaMemset(d_A, 0, size);
    cudaMemset(d_B, 0, size);

    float ms = timeSgemm(handle, d_A, d_B, d_C, N);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return ms;
}

// ============== Benchmark cuBLAS FP16 ==============
// Times cublasGemmEx with FP16 inputs and FP32 output/compute
// (CUBLAS_GEMM_DEFAULT_TENSOR_OP), after an untimed warmup call.
// Returns ms or -1 on allocation failure.
float benchmarkCuBLAS_FP16(int N, cublasHandle_t handle) {
    size_t size_fp16 = (size_t)N * N * sizeof(half);
    size_t size_fp32 = (size_t)N * N * sizeof(float);
    half *d_A, *d_B;
    float *d_C;
    if (cudaMalloc(&d_A, size_fp16) != cudaSuccess) return -1;
    if (cudaMalloc(&d_B, size_fp16) != cudaSuccess) {
        cudaFree(d_A);
        return -1;
    }
    if (cudaMalloc(&d_C, size_fp32) != cudaSuccess) {
        cudaFree(d_A);
        cudaFree(d_B);
        return -1;
    }
    cudaMemset(d_A, 0, size_fp16);
    cudaMemset(d_B, 0, size_fp16);

    // alpha/beta are FP32 because the compute type is CUBLAS_COMPUTE_32F.
    const float alpha = 1.0f;
    const float beta = 0.0f;

    // Warmup (untimed).
    cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                 &alpha, d_B, CUDA_R_16F, N, d_A, CUDA_R_16F, N,
                 &beta, d_C, CUDA_R_32F, N,
                 CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
    cudaDeviceSynchronize();

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                 &alpha, d_B, CUDA_R_16F, N, d_A, CUDA_R_16F, N,
                 &beta, d_C, CUDA_R_32F, N,
                 CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return ms;
}

// ============== Main ==============
// Runs two report sections: (1) CPU vs GPU SGEMM, (2) CUDA Core vs Tensor
// Core (naive / tiled / cuBLAS FP32 / TF32 / FP16). Device memory is
// allocated and released per test so large matrices fit.
int main() {
    // Get CPU thread count
    int num_threads = omp_get_max_threads();

    // Print GPU info
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        fprintf(stderr, "No CUDA device available\n");
        return 1;
    }

    // ==================== Part 1: CPU vs GPU Comparison ====================
    printf("+==============================================================================+\n");
    printf("| Matrix Multiplication Benchmark: CPU vs GPU                                  |\n");
    printf("+==============================================================================+\n\n");
    printf("GPU: %s\n", prop.name);
    // Rough estimate: assumes 128 FP32 CUDA cores per SM (Ampere-class).
    printf("CUDA Cores: %d\n", prop.multiProcessorCount * 128);
    printf("GPU Memory: %.0f MB\n", prop.totalGlobalMem / (1024.0 * 1024.0));
    printf("CPU Threads: %d (auto-detected)\n\n", num_threads);

    // Initialize cuBLAS for GPU benchmark
    cublasHandle_t handle_gpu;
    cublasCreate(&handle_gpu);

    printf("+----------+--------------+--------------+--------------+----------------------+\n");
    printf("| Size     | GPU (ms)     | CPU (ms)     | Speedup      | GFLOPS (GPU)         |\n");
    printf("+----------+--------------+--------------+--------------+----------------------+\n");
    for (size_t i = 0; i < NUM_SIZES; i++) {
        int N = SIZES[i];
        float time_gpu = benchmarkGPU(N, handle_gpu);
        float time_cpu = benchmarkCPU(N, num_threads, true);
        if (time_gpu < 0) {
            printf("| %5d | GPU memory allocation failed\n", N);
            continue;
        }
        // Matrix multiply costs 2*N^3 floating-point operations.
        double ops = 2.0 * (double)N * N * N;
        double gflops_gpu = ops / (time_gpu * 1e6);
        if (time_cpu > 0) {
            float speedup = time_cpu / time_gpu;
            printf("| %5d | %12.2f | %12.2f | %10.1fx | %20.2f |\n",
                   N, time_gpu, time_cpu, speedup, gflops_gpu);
        } else {
            printf("| %5d | %12.2f |    (skipped) |            - | %20.2f |\n",
                   N, time_gpu, gflops_gpu);
        }
    }
    printf("+----------+--------------+--------------+--------------+----------------------+\n");
    printf("\nNotes:\n");
    printf("- GPU time = kernel execution only (excludes data transfer)\n");
    printf("- CPU uses %d threads\n", num_threads);
    printf("- CPU benchmark skipped for N > 4096 (too slow)\n");
    printf("- GFLOPS = Giga Floating Point Operations Per Second\n");
    printf("- Matrix multiply: 2*N^3 operations\n\n");
    cublasDestroy(handle_gpu);

    // ==================== Part 2: CUDA Core vs Tensor Core ====================
    printf("\n");
    printf("+==============================================================================+\n");
    printf("| Matrix Multiplication: CUDA Core vs Tensor Core Comparison                   |\n");
    printf("| (Memory released after each test to support larger matrices)                 |\n");
    printf("+==============================================================================+\n\n");
    printf("GPU: %s\n", prop.name);
    printf("CUDA Cores: %d\n", prop.multiProcessorCount * 128);
    // Rough estimate: assumes 4 Tensor Cores per SM.
    printf("Tensor Cores: %d (estimated)\n", prop.multiProcessorCount * 4);
    printf("GPU Memory: %.0f MB\n", prop.totalGlobalMem / (1024.0 * 1024.0));
    printf("Compute Capability: %d.%d\n\n", prop.major, prop.minor);

    // Initialize cuBLAS handles, one per math mode
    cublasHandle_t handle_cuda_core;
    cublasHandle_t handle_tensor_tf32;
    cublasHandle_t handle_tensor_fp16;
    cublasCreate(&handle_cuda_core);
    cublasCreate(&handle_tensor_tf32);
    cublasCreate(&handle_tensor_fp16);

    // Set math modes (the TF32 handle is what makes benchmarkCuBLAS_TF32
    // actually use Tensor Cores).
    cublasSetMathMode(handle_cuda_core, CUBLAS_DEFAULT_MATH);
    cublasSetMathMode(handle_tensor_tf32, CUBLAS_TF32_TENSOR_OP_MATH);
    cublasSetMathMode(handle_tensor_fp16, CUBLAS_DEFAULT_MATH);

    printf("Math Modes:\n");
    printf("- CUDA Core: CUBLAS_DEFAULT_MATH (FP32 input, CUDA Cores only)\n");
    printf("- TensorCore TF32: CUBLAS_TF32_TENSOR_OP_MATH (FP32 input, TF32 compute)\n");
    printf("- TensorCore FP16: cublasGemmEx (FP16 input, FP32 output, max throughput)\n\n");

    printf("Memory per matrix:\n");
    for (size_t i = 0; i < NUM_SIZES; i++) {
        int N = SIZES[i];
        printf(" %5d x %5d: FP32 = %4zu MB, FP16 = %4zu MB\n",
               N, N, (size_t)N * N * 4 / (1024 * 1024), (size_t)N * N * 2 / (1024 * 1024));
    }
    printf("\n");

    printf("+-------+----------+----------+----------+----------+----------+----------+-----------+\n");
    printf("| Size  | Naive    | Tiled    | CUDA Core| TC TF32  | TC FP16  | Speedup  | FP16      |\n");
    printf("|       | (ms)     | (ms)     | (ms)     | (ms)     | (ms)     | FP16/CUDA| GFLOPS    |\n");
    printf("+-------+----------+----------+----------+----------+----------+----------+-----------+\n");
    for (size_t i = 0; i < NUM_SIZES; i++) {
        int N = SIZES[i];
        // Test each method separately, releasing memory after each
        float time_naive = benchmarkNaive(N, true);
        float time_tiled = benchmarkTiled(N, true);
        float time_cublas_cuda = benchmarkCuBLAS_FP32(N, handle_cuda_core);
        float time_cublas_tf32 = benchmarkCuBLAS_TF32(N, handle_tensor_tf32);
        float time_cublas_fp16 = benchmarkCuBLAS_FP16(N, handle_tensor_fp16);

        // Check if FP16 test succeeded
        if (time_cublas_fp16 < 0) {
            printf("| %5d | GPU memory allocation failed (need more VRAM)\n", N);
            continue;
        }

        // Calculate GFLOPS for FP16
        double ops = 2.0 * (double)N * N * N;
        double gflops_fp16 = ops / (time_cublas_fp16 * 1e6);

        // Speedup calculation (guard against a failed CUDA-core run)
        float speedup_fp16 = (time_cublas_cuda > 0) ? time_cublas_cuda / time_cublas_fp16 : 0;

        // Print results
        if (time_naive > 0 && time_tiled > 0) {
            printf("| %5d | %8.2f | %8.2f | %8.2f | %8.2f | %8.2f | %8.2fx | %9.1f |\n",
                   N, time_naive, time_tiled, time_cublas_cuda, time_cublas_tf32,
                   time_cublas_fp16, speedup_fp16, gflops_fp16);
        } else {
            // Large matrix - skip Naive and Tiled (too slow)
            printf("| %5d |  skipped |  skipped | %8.2f | %8.2f | %8.2f | %8.2fx | %9.1f |\n",
                   N, time_cublas_cuda, time_cublas_tf32, time_cublas_fp16,
                   speedup_fp16, gflops_fp16);
        }
    }
    printf("+-------+----------+----------+----------+----------+----------+----------+-----------+\n");
    printf("\n");
    printf("+==============================================================================+\n");
    printf("| Legend                                                                       |\n");
    printf("+==============================================================================+\n");
    printf("| Naive      : Basic CUDA kernel, each thread computes one element             |\n");
    printf("| Tiled      : CUDA kernel with shared memory, tile size = %d x %d             |\n", TILE_SIZE, TILE_SIZE);
    printf("| CUDA Core  : cuBLAS FP32 with CUBLAS_DEFAULT_MATH (CUDA Cores only)          |\n");
    printf("| TC TF32    : cuBLAS FP32 with CUBLAS_TF32_TENSOR_OP_MATH (Tensor Cores)      |\n");
    printf("| TC FP16    : cuBLAS FP16 input with cublasGemmEx (Tensor Cores, max speed)   |\n");
    printf("| Speedup    : CUDA Core time / TC FP16 time                                   |\n");
    printf("| FP16 GFLOPS: Performance based on TC FP16 time                               |\n");
    printf("+==============================================================================+\n");
    printf("\nNote: Memory is released after each test to support larger matrices.\n");
    printf("      Naive and Tiled are skipped for N > 4096 (too slow).\n");

    // Cleanup cuBLAS
    cublasDestroy(handle_cuda_core);
    cublasDestroy(handle_tensor_tf32);
    cublasDestroy(handle_tensor_fp16);
    return 0;
}