1 /////////////////////////////////////////////////////////////////////////// 2 // matirxmulCUBLAS.cpp 3 // 4 // Copyright 1993-2013 NVIDIA Corporation. All rights reserved. 5 // 6 // Please refer to the NVIDIA end user license agreement (EULA) associated 7 // with this source code for terms and conditions that govern your use of 8 // this software. Any use, reproduction, disclosure, or distribution of 9 // this software and related documentation outside the terms of the EULA 10 // is strictly prohibited. 11 // 12 //////////////////////////////////////////////////////////////////////////// 13 14 // 15 // Matrix multiplication: C = A * B. 16 // Host code. 17 // 18 // This sample implements matrix multiplication as described in Chapter 3 19 // of the programming guide and uses the CUBLAS library to demonstrate 20 // the best performance. 21 22 // SOME PRECAUTIONS: 23 // IF WE WANT TO CALCULATE ROW-MAJOR MATRIX MULTIPLY C = A * B, 24 // WE JUST NEED CALL CUBLAS API IN A REVERSE ORDER: cublasSegemm(B, A)! 25 // The reason is explained as follows: 26 27 // CUBLAS library uses column-major storage, but C/C++ use row-major storage. 28 // When passing the matrix pointer to CUBLAS, the memory layout alters from 29 // row-major to column-major, which is equivalent to an implict transpose. 30 31 // In the case of row-major C/C++ matrix A, B, and a simple matrix multiplication 32 // C = A * B, we can‘t use the input order like cublasSgemm(A, B) because of 33 // implict transpose. The actual result of cublasSegemm(A, B) is A(T) * B(T). 34 // If col(A(T)) != row(B(T)), equal to row(A) != col(B), A(T) and B(T) are not 35 // multipliable. Moreover, even if A(T) and B(T) are multipliable, the result C 36 // is a column-based cublas matrix, which means C(T) in C/C++, we need extra 37 // transpose code to convert it to a row-based C/C++ matrix. 38 39 // To solve the problem, let‘s consider our desired result C, a row-major matrix. 40 // In cublas format, it is C(T) actually (becuase of the implict transpose). 41 // C = A * B, so C(T) = (A * B) (T) = B(T) * A(T). Cublas matrice B(T) and A(T) 42 // happen to be C/C++ matrice B and A (still becuase of the implict transpose)! 43 // We don‘t need extra transpose code, we only need alter the input order! 44 // 45 // CUBLAS provides high-performance matrix multiplication. 46 // See also: 47 // V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra," 48 // in Proc. 2008 ACM/IEEE Conf. on Superconducting (SC ‘08), 49 // Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11. 50 // 51 52 // Utilities and system includes 53 #include <assert.h> 54 #include <helper_string.h> // helper for shared functions common to CUDA SDK samples 55 56 // CUDA runtime 57 #include <cuda_runtime.h> 58 #include <cublas_v2.h> 59 60 // CUDA and CUBLAS functions 61 #include <helper_functions.h> 62 63 #ifndef min 64 #define min(a,b) ((a < b) ? a : b) 65 #endif 66 #ifndef max 67 #define max(a,b) ((a > b) ? a : b) 68 #endif 69 70 typedef struct _matrixSize // Optional Command-line multiplier for matrix sizes 71 { 72 unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC; 73 } sMatrixSize; 74 75 //////////////////////////////////////////////////////////////////////////////// 76 //! Compute reference data set matrix multiply on CPU 77 //! C = A * B 78 //! @param C reference data, computed but preallocated 79 //! @param A matrix A as provided to device 80 //! @param B matrix B as provided to device 81 //! @param hA height of matrix A 82 //! @param wB width of matrix B 83 //////////////////////////////////////////////////////////////////////////////// 84 void 85 matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB) 86 { 87 for (unsigned int i = 0; i < hA; ++i) 88 for (unsigned int j = 0; j < wB; ++j) 89 { 90 double sum = 0; 91 92 for (unsigned int k = 0; k < wA; ++k) 93 { 94 double a = A[i * wA + k]; 95 double b = B[k * wB + j]; 96 sum += a * b; 97 } 98 99 C[i * wB + j] = (float)sum; 100 } 101 } 102 103 //////////////////////////////////////////////////////////////////////////////// 104 // These are CUDA Helper functions (in addition to helper_cuda.h) 105 106 void inline checkError(cublasStatus_t status, const char *msg) 107 { 108 if (status != CUBLAS_STATUS_SUCCESS) 109 { 110 printf("%s", msg); 111 exit(EXIT_FAILURE); 112 } 113 } 114 // end of CUDA Helper Functions 115 116 // Allocates a matrix with random float entries. 117 void randomInit(float *data, int size) 118 { 119 for (int i = 0; i < size; ++i) 120 data[i] = rand() / (float)RAND_MAX; 121 } 122 123 void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol) 124 { 125 printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol); 126 int i,j,k; 127 int error_count=0; 128 129 for (j = 0; j < height; j++) 130 { 131 if (error_count < iListLength) 132 { 133 printf("\n Row %d:\n", j); 134 } 135 136 for (i = 0; i < width; i++) 137 { 138 k = j * width + i; 139 float fDiff = fabs(data1[k] - data2[k]); 140 141 if (fDiff > fListTol) 142 { 143 if (error_count < iListLength) 144 { 145 printf(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff); 146 } 147 148 error_count++; 149 } 150 } 151 } 152 153 printf(" \n Total Errors = %d\n", error_count); 154 } 155 156 void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size) 157 { 158 // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line 159 cudaError_t error; 160 devID = 0; 161 162 if (checkCmdLineFlag(argc, (const char **)argv, "device")) 163 { 164 devID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); 165 error = cudaSetDevice(devID); 166 167 if (error != cudaSuccess) 168 { 169 printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__); 170 exit(EXIT_FAILURE); 171 } 172 } 173 174 // get number of SMs on this GPU 175 error = cudaGetDevice(&devID); 176 177 if (error != cudaSuccess) 178 { 179 printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__); 180 exit(EXIT_FAILURE); 181 } 182 183 184 if (checkCmdLineFlag(argc, (const char **)argv, "sizemult")) 185 { 186 iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult"); 187 } 188 189 iSizeMultiple = min(iSizeMultiple, 10); 190 iSizeMultiple = max(iSizeMultiple, 1); 191 192 cudaDeviceProp deviceProp; 193 194 error = cudaGetDeviceProperties(&deviceProp, devID); 195 196 if (error != cudaSuccess) 197 { 198 printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__); 199 exit(EXIT_FAILURE); 200 } 201 202 printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); 203 204 // use a larger block size for Fermi and above 205 int block_size = (deviceProp.major < 2) ? 16 : 32; 206 207 matrix_size.uiWA = 2 * block_size * iSizeMultiple; 208 matrix_size.uiHA = 4 * block_size * iSizeMultiple; 209 matrix_size.uiWB = 2 * block_size * iSizeMultiple; 210 matrix_size.uiHB = 4 * block_size * iSizeMultiple; 211 matrix_size.uiWC = 2 * block_size * iSizeMultiple; 212 matrix_size.uiHC = 4 * block_size * iSizeMultiple; 213 214 printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n", 215 matrix_size.uiWA, matrix_size.uiHA, 216 matrix_size.uiWB, matrix_size.uiHB, 217 matrix_size.uiWC, matrix_size.uiHC); 218 } 219 220 //////////////////////////////////////////////////////////////////////////////// 221 //! Run a simple test matrix multiply using CUBLAS 222 //////////////////////////////////////////////////////////////////////////////// 223 int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) 224 { 225 cudaDeviceProp deviceProp; 226 cudaError_t error; 227 228 error = cudaGetDeviceProperties(&deviceProp, devID); 229 230 if (error != cudaSuccess) 231 { 232 printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__); 233 exit(EXIT_FAILURE); 234 } 235 236 // use a larger block size for Fermi and above 237 int block_size = (deviceProp.major < 2) ? 16 : 32; 238 239 // set seed for rand() 240 srand(2006); 241 242 // allocate host memory for matrices A and B 243 unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA; 244 unsigned int mem_size_A = sizeof(float) * size_A; 245 float *h_A = (float *)malloc(mem_size_A); 246 unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB; 247 unsigned int mem_size_B = sizeof(float) * size_B; 248 float *h_B = (float *)malloc(mem_size_B); 249 250 // set seed for rand() 251 srand(2006); 252 253 // initialize host memory 254 randomInit(h_A, size_A); 255 randomInit(h_B, size_B); 256 257 // allocate device memory 258 float *d_A, *d_B, *d_C; 259 unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC; 260 unsigned int mem_size_C = sizeof(float) * size_C; 261 262 // allocate host memory for the result 263 float *h_C = (float *) malloc(mem_size_C); 264 float *h_CUBLAS = (float *) malloc(mem_size_C); 265 266 error = cudaMalloc((void **) &d_A, mem_size_A); 267 268 if (error != cudaSuccess) 269 { 270 printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__); 271 exit(EXIT_FAILURE); 272 } 273 274 error = cudaMalloc((void **) &d_B, mem_size_B); 275 276 if (error != cudaSuccess) 277 { 278 printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__); 279 exit(EXIT_FAILURE); 280 } 281 282 // copy host memory to device 283 error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); 284 285 if (error != cudaSuccess) 286 { 287 printf("cudaMemcpy d_A h_A returned error code %d, line(%d)\n", error, __LINE__); 288 exit(EXIT_FAILURE); 289 } 290 291 error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); 292 293 if (error != cudaSuccess) 294 { 295 printf("cudaMemcpy d_B h_B returned error code %d, line(%d)\n", error, __LINE__); 296 exit(EXIT_FAILURE); 297 } 298 299 error = cudaMalloc((void **) &d_C, mem_size_C); 300 301 if (error != cudaSuccess) 302 { 303 printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__); 304 exit(EXIT_FAILURE); 305 } 306 307 // setup execution parameters 308 dim3 threads(block_size, block_size); 309 dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y); 310 311 // create and start timer 312 printf("Computing result using CUBLAS..."); 313 314 // execute the kernel 315 int nIter = 30; 316 317 // CUBLAS version 2.0 318 { 319 cublasHandle_t handle; 320 321 cublasStatus_t ret; 322 323 ret = cublasCreate(&handle); 324 325 if (ret != CUBLAS_STATUS_SUCCESS) 326 { 327 printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__); 328 exit(EXIT_FAILURE); 329 } 330 331 const float alpha = 1.0f; 332 const float beta = 0.0f; 333 //Perform warmup operation with cublas 334 ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA); 335 336 if (ret != CUBLAS_STATUS_SUCCESS) 337 { 338 printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__); 339 exit(EXIT_FAILURE); 340 } 341 342 // Allocate CUDA events that we‘ll use for timing 343 cudaEvent_t start; 344 error = cudaEventCreate(&start); 345 346 if (error != cudaSuccess) 347 { 348 fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error)); 349 exit(EXIT_FAILURE); 350 } 351 352 cudaEvent_t stop; 353 error = cudaEventCreate(&stop); 354 355 if (error != cudaSuccess) 356 { 357 fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error)); 358 exit(EXIT_FAILURE); 359 } 360 361 // Record the start event 362 error = cudaEventRecord(start, NULL); 363 364 if (error != cudaSuccess) 365 { 366 fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error)); 367 exit(EXIT_FAILURE); 368 } 369 370 for (int j = 0; j < nIter; j++) 371 { 372 //note cublas is column primary! 373 //need to transpose the order 374 ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA); 375 376 if (ret != CUBLAS_STATUS_SUCCESS) 377 { 378 printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__); 379 exit(EXIT_FAILURE); 380 } 381 } 382 383 printf("done.\n"); 384 385 // Record the stop event 386 error = cudaEventRecord(stop, NULL); 387 388 if (error != cudaSuccess) 389 { 390 fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error)); 391 exit(EXIT_FAILURE); 392 } 393 394 // Wait for the stop event to complete 395 error = cudaEventSynchronize(stop); 396 397 if (error != cudaSuccess) 398 { 399 fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error)); 400 exit(EXIT_FAILURE); 401 } 402 403 float msecTotal = 0.0f; 404 error = cudaEventElapsedTime(&msecTotal, start, stop); 405 406 if (error != cudaSuccess) 407 { 408 fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error)); 409 exit(EXIT_FAILURE); 410 } 411 412 // Compute and print the performance 413 float msecPerMatrixMul = msecTotal / nIter; 414 double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA * (double)matrix_size.uiHA * (double)matrix_size.uiWB; 415 double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); 416 printf( 417 "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n", 418 gigaFlops, 419 msecPerMatrixMul, 420 flopsPerMatrixMul); 421 422 // copy result from device to host 423 error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost); 424 425 if (error != cudaSuccess) 426 { 427 printf("cudaMemcpy h_CUBLAS d_C returned error code %d, line(%d)\n", error, __LINE__); 428 exit(EXIT_FAILURE); 429 } 430 431 checkError(cublasDestroy(handle), "cublasDestroy() error!\n"); 432 } 433 434 // compute reference solution 435 printf("Computing result using host CPU..."); 436 float *reference = (float *)malloc(mem_size_C); 437 matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB); 438 printf("done.\n"); 439 440 // check result (CUBLAS) 441 bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f); 442 443 if (resCUBLAS != true) 444 { 445 printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f); 446 } 447 448 printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL"); 449 450 // clean up memory 451 free(h_A); 452 free(h_B); 453 free(h_C); 454 free(reference); 455 cudaFree(d_A); 456 cudaFree(d_B); 457 cudaFree(d_C); 458 459 cudaDeviceReset(); 460 461 if (resCUBLAS == true) 462 { 463 return EXIT_SUCCESS; // return value = 1 464 } 465 else 466 { 467 return EXIT_FAILURE; // return value = 0 468 } 469 } 470 471 //////////////////////////////////////////////////////////////////////////////// 472 // Program main 473 //////////////////////////////////////////////////////////////////////////////// 474 int main(int argc, char **argv) 475 { 476 printf("[Matrix Multiply CUBLAS] - Starting...\n"); 477 478 int devID = 0, sizeMult = 5; 479 sMatrixSize matrix_size; 480 481 initializeCUDA(argc, argv, devID, sizeMult, matrix_size); 482 483 int matrix_result = matrixMultiply(argc, argv, devID, matrix_size); 484 485 exit(matrix_result); 486 }
1 CUDA_PATH ?=/usr/local/cuda-7.0 2 NVCC :=$(CUDA_PATH)/bin/nvcc -ccbin g++ 3 INCLUDE :=-I/usr/local/cuda/include/ 4 -I/usr/local/cuda/samples/common/inc 5 -I/usr/include/c++ 6 -I./ 7 8 LIBRARIES :=-L/usr/local/cuda/lib64 -lcudart -lcublas 9 TARGETS :=matrixMulCUBLAS 10 OBJECTS :=$(addsuffix .o, $(TARGETS)) 11 12 .SUFFIXES:.o .cu .cpp 13 .cu.o: 14 $(NVCC) -arch=sm_20 $(INCLUDE) -c -o [email protected] $< $(LIBRARIES) 15 .cpp.o: 16 g++ $(INCLUDE) -c -o [email protected] $< $(LIBRARYIES) 17 18 all:$(OBJECTS) 19 #sudo cp /usr/local/cuda/lib64/libcublas.so.7.0 /usr/lib 20 ln -s libcudart.so.7.0 libcudart.so 21 ln -s libcudart.so.7.0 libcudart.so.7 22 ln -s libcublas.so.7.0 libcublas.so 23 ln -s libcublas.so.7.0 libcublas.so.7 24 g++ $(INCLUDE) -o $(TARGETS) $^ $(LIBRARIES) 25 run: 26 ./$(TARGETS) 27 clean: 28 rm -rf *.o libcudart.so libcudart.so.729 libcublas.so libcublas.so.7 $(TARGETS)
./matrixMulCUBLAS
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "GeForce GTX 650 Ti" with compute capability 3.0
MatrixA(320,640), MatrixB(320,640), MatrixC(320,640)
Computing result using CUBLAS...done.
Performance= 612.46 GFlop/s, Time= 0.214 msec, Size= 131072000 Ops
Computing result using host CPU...done.
Comparing CUBLAS Matrix Multiply with CPU results: PASS
时间: 2024-10-12 12:28:03