CUDA matrixMulCUBLAS

///////////////////////////////////////////////////////////////////////////
//  matrixMulCUBLAS.cpp
//
// Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
//
// Please refer to the NVIDIA end user license agreement (EULA) associated
// with this source code for terms and conditions that govern your use of
// this software. Any use, reproduction, disclosure, or distribution of
// this software and related documentation outside the terms of the EULA
// is strictly prohibited.
//
////////////////////////////////////////////////////////////////////////////

//
// Matrix multiplication: C = A * B.
// Host code.
//
// This sample implements matrix multiplication as described in Chapter 3
// of the programming guide and uses the CUBLAS library to demonstrate
// the best performance.

// SOME PRECAUTIONS:
// IF WE WANT TO CALCULATE A ROW-MAJOR MATRIX MULTIPLY C = A * B,
// WE JUST NEED TO CALL THE CUBLAS API IN REVERSE ORDER: cublasSgemm(B, A)!
// The reason is explained as follows:

// The CUBLAS library uses column-major storage, but C/C++ use row-major
// storage. When a matrix pointer is passed to CUBLAS, the memory layout is
// reinterpreted from row-major to column-major, which is equivalent to an
// implicit transpose.

// For row-major C/C++ matrices A, B and a simple matrix multiplication
// C = A * B, we can't use the natural input order cublasSgemm(A, B) because
// of the implicit transpose: the actual result of cublasSgemm(A, B) is
// A(T) * B(T). If col(A(T)) != row(B(T)), i.e. row(A) != col(B), then A(T)
// and B(T) are not even multipliable. Moreover, even when A(T) and B(T) are
// multipliable, the result C is a column-major CUBLAS matrix, which means
// C(T) in C/C++ terms, so extra transpose code would be needed to convert
// it to a row-major C/C++ matrix.

// To solve the problem, consider our desired result C, a row-major matrix.
// In CUBLAS format it is actually C(T) (because of the implicit transpose).
// C = A * B, so C(T) = (A * B)(T) = B(T) * A(T). The CUBLAS matrices B(T)
// and A(T) happen to be the C/C++ matrices B and A (again because of the
// implicit transpose)! We don't need extra transpose code; we only need to
// swap the input order!
//
// CUBLAS provides high-performance matrix multiplication.
// See also:
// V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
// in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
// Piscataway, NJ: IEEE Press, 2008, Art. 31:1-11.
//

// Utilities and system includes
#include <assert.h>
#include <math.h>    // fabs(), used by printDiff below
#include <stdio.h>   // printf(), fprintf()
#include <stdlib.h>  // malloc(), rand(), srand(), exit()
#include <helper_string.h>  // helper for shared functions common to CUDA SDK samples

// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>

// CUDA and CUBLAS functions
#include <helper_functions.h>

#ifndef min
#define min(a,b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a,b) (((a) > (b)) ? (a) : (b))
#endif

typedef struct _matrixSize      // matrix dimensions, scaled by the optional command-line size multiplier
{
    unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
} sMatrixSize;

////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set matrix multiply on CPU
//! C = A * B
//! @param C          reference data, computed but preallocated
//! @param A          matrix A as provided to device
//! @param B          matrix B as provided to device
//! @param hA         height of matrix A
//! @param wA         width of matrix A (= height of matrix B)
//! @param wB         width of matrix B
////////////////////////////////////////////////////////////////////////////////
void
matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
{
    for (unsigned int i = 0; i < hA; ++i)
        for (unsigned int j = 0; j < wB; ++j)
        {
            double sum = 0;

            for (unsigned int k = 0; k < wA; ++k)
            {
                double a = A[i * wA + k];
                double b = B[k * wB + j];
                sum += a * b;
            }

            C[i * wB + j] = (float)sum;
        }
}

////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)

inline void checkError(cublasStatus_t status, const char *msg)
{
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("%s", msg);
        exit(EXIT_FAILURE);
    }
}
// end of CUDA Helper Functions

// Fills a preallocated matrix with random float entries in [0, 1].
void randomInit(float *data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = rand() / (float)RAND_MAX;
}

void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol)
{
    printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
    int i, j, k;
    int error_count = 0;

    for (j = 0; j < height; j++)
    {
        if (error_count < iListLength)
        {
            printf("\n  Row %d:\n", j);
        }

        for (i = 0; i < width; i++)
        {
            k = j * width + i;
            float fDiff = fabs(data1[k] - data2[k]);

            if (fDiff > fListTol)
            {
                if (error_count < iListLength)
                {
                    printf("    Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff);
                }

                error_count++;
            }
        }
    }

    printf(" \n  Total Errors = %d\n", error_count);
}

void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
{
    // By default, we use device 0; otherwise we override the device ID based on what is provided at the command line
    cudaError_t error;
    devID = 0;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
        error = cudaSetDevice(devID);

        if (error != cudaSuccess)
        {
            printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
    }

    // get the currently selected device
    error = cudaGetDevice(&devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }


    if (checkCmdLineFlag(argc, (const char **)argv, "sizemult"))
    {
        iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
    }

    // clamp the size multiplier to [1, 10]
    iSizeMultiple = min(iSizeMultiple, 10);
    iSizeMultiple = max(iSizeMultiple, 1);

    cudaDeviceProp deviceProp;

    error = cudaGetDeviceProperties(&deviceProp, devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);

    // use a larger block size for Fermi and above
    int block_size = (deviceProp.major < 2) ? 16 : 32;

    matrix_size.uiWA = 2 * block_size * iSizeMultiple;
    matrix_size.uiHA = 4 * block_size * iSizeMultiple;
    matrix_size.uiWB = 2 * block_size * iSizeMultiple;
    matrix_size.uiHB = 4 * block_size * iSizeMultiple;
    matrix_size.uiWC = 2 * block_size * iSizeMultiple;
    matrix_size.uiHC = 4 * block_size * iSizeMultiple;

    printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
           matrix_size.uiWA, matrix_size.uiHA,
           matrix_size.uiWB, matrix_size.uiHB,
           matrix_size.uiWC, matrix_size.uiHC);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
{
    cudaDeviceProp deviceProp;
    cudaError_t error;

    error = cudaGetDeviceProperties(&deviceProp, devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // use a larger block size for Fermi and above
    int block_size = (deviceProp.major < 2) ? 16 : 32;

    // set seed for rand()
    srand(2006);

    // allocate host memory for matrices A and B
    unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);

    // initialize host memory
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    // allocate device memory
    float *d_A, *d_B, *d_C;
    unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;

    // allocate host memory for the result
    float *h_C      = (float *) malloc(mem_size_C);
    float *h_CUBLAS = (float *) malloc(mem_size_C);

    error = cudaMalloc((void **) &d_A, mem_size_A);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_B, mem_size_B);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // copy host memory to device
    error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy d_A h_A returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy d_B h_B returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_C, mem_size_C);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // setup execution parameters (not used by the CUBLAS path, which launches no kernel of its own)
    dim3 threads(block_size, block_size);
    dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);

    // create and start timer
    printf("Computing result using CUBLAS...");

    // number of timed iterations
    int nIter = 30;

    // CUBLAS version 2.0
    {
        cublasHandle_t handle;

        cublasStatus_t ret;

        ret = cublasCreate(&handle);

        if (ret != CUBLAS_STATUS_SUCCESS)
        {
            printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
            exit(EXIT_FAILURE);
        }

        const float alpha = 1.0f;
        const float beta  = 0.0f;
        // Perform warmup operation with cublas. Note the swapped operand
        // order (B first) and that ldc is the width of the row-major result
        // (= uiWB = uiWC here; the original listing passed uiWA, which only
        // worked because uiWA == uiWB in this sample).
        ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);

        if (ret != CUBLAS_STATUS_SUCCESS)
        {
            printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
            exit(EXIT_FAILURE);
        }

        // Allocate CUDA events that we'll use for timing
        cudaEvent_t start;
        error = cudaEventCreate(&start);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        cudaEvent_t stop;
        error = cudaEventCreate(&stop);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        // Record the start event
        error = cudaEventRecord(start, NULL);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        for (int j = 0; j < nIter; j++)
        {
            // note cublas is column-major, so the operand order is swapped
            // (see the explanation at the top of this file)
            ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);

            if (ret != CUBLAS_STATUS_SUCCESS)
            {
                printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
                exit(EXIT_FAILURE);
            }
        }

        printf("done.\n");

        // Record the stop event
        error = cudaEventRecord(stop, NULL);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        // Wait for the stop event to complete
        error = cudaEventSynchronize(stop);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        float msecTotal = 0.0f;
        error = cudaEventElapsedTime(&msecTotal, start, stop);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        // Compute and print the performance
        float msecPerMatrixMul = msecTotal / nIter;
        double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA * (double)matrix_size.uiHA * (double)matrix_size.uiWB;
        double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
        printf(
            "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
            gigaFlops,
            msecPerMatrixMul,
            flopsPerMatrixMul);

        // copy result from device to host
        error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);

        if (error != cudaSuccess)
        {
            printf("cudaMemcpy h_CUBLAS d_C returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }

        checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
    }

    // compute reference solution
    printf("Computing result using host CPU...");
    float *reference = (float *)malloc(mem_size_C);
    matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
    printf("done.\n");

    // check result (CUBLAS)
    bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);

    if (resCUBLAS != true)
    {
        printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f);
    }

    printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_CUBLAS);
    free(reference);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    cudaDeviceReset();

    if (resCUBLAS == true)
    {
        return EXIT_SUCCESS;    // exit code 0
    }
    else
    {
        return EXIT_FAILURE;    // exit code 1
    }
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("[Matrix Multiply CUBLAS] - Starting...\n");

    int devID = 0, sizeMult = 5;
    sMatrixSize matrix_size;

    initializeCUDA(argc, argv, devID, sizeMult, matrix_size);

    int matrix_result = matrixMultiply(argc, argv, devID, matrix_size);

    exit(matrix_result);
}
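
The swapped-order trick described at the top of the listing generalizes to any row-major GEMM. Below is a minimal standalone sketch, written for this article rather than taken from the NVIDIA sample (the file name, the gemmRowMajor helper, and the tiny 2x3 * 3x2 test matrices are all illustrative). It computes row-major C = A * B by asking cuBLAS for the column-major product B * A, which lands in memory exactly as row-major C:

// rowmajor_gemm_sketch.cpp
// Build (assuming a standard CUDA install): nvcc rowmajor_gemm_sketch.cpp -lcublas
#include <cstdio>
#include <cuda_runtime.h>
#include <cublas_v2.h>

// Row-major C (hA x wB) = A (hA x wA) * B (wA x wB).
// Column-major cuBLAS sees our buffers as A(T) and B(T), so we request
// B(T) * A(T) = (A * B)(T) = C(T), whose column-major layout is row-major C.
static void gemmRowMajor(cublasHandle_t handle,
                         const float *d_A, const float *d_B, float *d_C,
                         int hA, int wA, int wB)
{
    const float alpha = 1.0f, beta = 0.0f;
    // m = wB, n = hA, k = wA; each leading dimension is a row-major width.
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                wB, hA, wA,
                &alpha,
                d_B, wB,    // B goes first!
                d_A, wA,
                &beta,
                d_C, wB);   // ldc = width of the row-major result
}

int main()
{
    const int hA = 2, wA = 3, wB = 2;
    // Row-major test matrices.
    float h_A[hA * wA] = { 1,  2,  3,
                           4,  5,  6 };
    float h_B[wA * wB] = { 7,  8,
                           9, 10,
                          11, 12 };
    float h_C[hA * wB];

    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, sizeof(h_A));
    cudaMalloc(&d_B, sizeof(h_B));
    cudaMalloc(&d_C, sizeof(h_C));
    cudaMemcpy(d_A, h_A, sizeof(h_A), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, sizeof(h_B), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    gemmRowMajor(handle, d_A, d_B, d_C, hA, wA, wB);
    cudaMemcpy(h_C, d_C, sizeof(h_C), cudaMemcpyDeviceToHost);

    // Expected row-major result: 58 64 / 139 154.
    for (int i = 0; i < hA; ++i)
    {
        for (int j = 0; j < wB; ++j)
            printf("%6.1f ", h_C[i * wB + j]);
        printf("\n");
    }

    cublasDestroy(handle);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}

No transpose is performed on either the inputs or the output; only the argument order and the leading dimensions change. The Makefile used to build the sample follows.
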
CUDA_PATH ?=/usr/local/cuda-7.0
NVCC      :=$(CUDA_PATH)/bin/nvcc -ccbin g++
INCLUDE   :=-I/usr/local/cuda/include/ \
            -I/usr/local/cuda/samples/common/inc \
            -I/usr/include/c++ \
            -I./

LIBRARIES :=-L/usr/local/cuda/lib64 -lcudart -lcublas
TARGETS   :=matrixMulCUBLAS
OBJECTS   :=$(addsuffix .o, $(TARGETS))

.SUFFIXES: .o .cu .cpp
.cu.o:
	$(NVCC) -arch=sm_20 $(INCLUDE) -c -o $@ $< $(LIBRARIES)
.cpp.o:
	g++ $(INCLUDE) -c -o $@ $< $(LIBRARIES)

all: $(OBJECTS)
	#sudo cp /usr/local/cuda/lib64/libcublas.so.7.0 /usr/lib
	ln -s libcudart.so.7.0 libcudart.so
	ln -s libcudart.so.7.0 libcudart.so.7
	ln -s libcublas.so.7.0 libcublas.so
	ln -s libcublas.so.7.0 libcublas.so.7
	g++ $(INCLUDE) -o $(TARGETS) $^ $(LIBRARIES)
run:
	./$(TARGETS)
clean:
	rm -rf *.o libcudart.so libcudart.so.7 libcublas.so libcublas.so.7 $(TARGETS)
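
Assuming the CUDA 7.0 paths in the Makefile match the local installation, a typical build-and-run session (commands only; not captured from the original machine) is:

    make
    make run
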

./matrixMulCUBLAS
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "GeForce GTX 650 Ti" with compute capability 3.0

MatrixA(320,640), MatrixB(320,640), MatrixC(320,640)
Computing result using CUBLAS...done.
Performance= 612.46 GFlop/s, Time= 0.214 msec, Size= 131072000 Ops
Computing result using host CPU...done.
Comparing CUBLAS Matrix Multiply with CPU results: PASS
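
As a quick check on the printed figures: with the default sizeMult = 5 and block_size = 32, uiWA = uiWB = 320 and uiHA = 640, so flopsPerMatrixMul = 2 × 320 × 640 × 320 = 131,072,000, matching "Size= 131072000 Ops", and 131072000 × 1e-9 / (0.214 × 1e-3) ≈ 612 GFlop/s, matching the reported performance.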

