#include "cuda_runtime.h"  // CUDAVectorAdd.cu
#include "device_launch_parameters.h"
#include "IML_PrecisionTimer.h"

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define MEM_SIZE (2048*1024)

// Launch configuration used by main().
#define THREADS_PER_BLOCK 256
// 65535 is the largest gridDim.x accepted by compute capability 2.x devices;
// the grid-stride loop in addKernel keeps results correct when the grid is capped.
#define MAX_BLOCKS 65535

// Evaluates a CUDA runtime call and terminates the program with a readable
// message if it fails. Continuing after a failed allocation or copy would only
// produce misleading results later, so the error is treated as fatal.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error at %s:%d: %s failed: %s\n",         \
                    __FILE__, __LINE__, #call,                              \
                    cudaGetErrorString(status_));                           \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Element-wise vector addition: c[i] = a[i] + b[i] for 0 <= i < N.
//
// Expects a 1D grid of 1D blocks. Written as a grid-stride loop, so any
// launch configuration (including fewer threads than elements) covers all N
// elements. c, a and b must be device pointers to at least N floats.
// Uses no shared memory. N <= 0 is a no-op.
__global__ void addKernel(float *c, float *a, float *b, int N)
{
    // Work in size_t so index arithmetic cannot overflow a 32-bit int.
    const size_t count  = (N > 0) ? (size_t)N : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < count;
         i += stride)
    {
        c[i] = a[i] + b[i];
    }
}

// Adds two MEM_SIZE-element vectors on the GPU, times the device round trip
// (allocation, host-to-device copies, kernel, device-to-host copy) and checks
// the result on the host.
//
// Returns 0 when the result is correct and EXIT_FAILURE when host allocation
// or verification fails; any CUDA failure terminates via CUDA_CHECK.
int main()
{
    PrecisionTimer g_timer;
    float gfFrametime;
    qPrecisionTimer_Init(&g_timer);

    const int size = MEM_SIZE;
    const size_t bytes = (size_t)size * sizeof(float);
    int i;

    float *a = (float*)malloc(bytes);
    float *b = (float*)malloc(bytes);
    float *c = (float*)malloc(bytes);
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "host malloc failed!\n");
        free(a);  // free(NULL) is a no-op, so releasing all three is safe
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    // Start at 0: the verification below reads c[0], so element 0 of both
    // inputs must be initialised as well.
    for (i = 0; i < size; i++)
    {
        a[i] = (float)i;
        b[i] = (float)i;
    }
    g_timer.Start( &g_timer );

    float *dev_a = NULL;
    float *dev_b = NULL;
    float *dev_c = NULL;

    // Allocate GPU buffers for three vectors (two input, one output).
    CUDA_CHECK(cudaMalloc((void**)&dev_c, bytes));
    CUDA_CHECK(cudaMalloc((void**)&dev_a, bytes));
    CUDA_CHECK(cudaMalloc((void**)&dev_b, bytes));

    // Copy input vectors from host memory to GPU buffers.
    CUDA_CHECK(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));

    // Launch enough blocks for one thread per element (ceil-div), capped at
    // MAX_BLOCKS; the kernel's stride loop picks up any remainder.
    int blocks = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    if (blocks > MAX_BLOCKS)
    {
        blocks = MAX_BLOCKS;
    }
    addKernel<<<blocks, THREADS_PER_BLOCK>>>(dev_c, dev_a, dev_b, size);
    // A launch does not return a status; invalid configurations show up here.
    CUDA_CHECK(cudaGetLastError());

    // Copy output vector from GPU buffer to host memory. This blocking copy
    // also waits for the kernel and reports any error raised while it ran.
    CUDA_CHECK(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost));

    gfFrametime = g_timer.End( &g_timer );
    printf("Time = %f Sec \n", gfFrametime);

    // Every expected sum is an integer below 2^24 and therefore exactly
    // representable in float, so this tiny tolerance is effectively an
    // exact comparison.
    const float eps = 1e-16f;
    bool bflag = true;
    for (i = 0; i < size; i++)
    {
        if (fabsf(c[i] - 2.0f * (float)i) > eps)
        {
            bflag = false;
            break;
        }
    }
    if (bflag)
    {
        printf("Result OK!\n");
    }
    else
    {
        printf("Result ERROR!\n");
    }
#if 0
    for (i = 0; i < 5; i++)
    {
        printf("c[%d] = %f\n", i, c[i]);
    }
    for (i = size - 5; i < size; i++)
    {
        printf("c[%d] = %f\n", i, c[i]);
    }
#endif

    CUDA_CHECK(cudaFree(dev_c));
    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));

    free(a);
    free(b);
    free(c);

    return bflag ? 0 : EXIT_FAILURE;
}
# Build rules for the CUDAVectorAdd sample.
# CUDA_PATH may be overridden from the environment or the command line.
CUDA_PATH   ?=/usr/local/cuda-7.0
NVCC        :=$(CUDA_PATH)/bin/nvcc -ccbin g++
INCLUDE     :=-I$(CUDA_PATH)/include/ \
              -I/usr/local/cuda/samples/common/inc \
              -I/usr/include/c++ \
              -I../shareIMLTimer \
              -I./
LINKPATH    :=/usr/lib/
LIBRARIES   :=-L/usr/local/cuda/lib64 -lcudart
TARGETS     :=CUDAVectorAdd
OBJECTS     :=$(addsuffix .o, $(TARGETS))
TIMEOBJECTS :=IML_PrecisionTimer.o \
              stimer.o

# These targets do not name files they produce.
.PHONY: all run clean

# Suffix rules: $@ is the object being built, $< the source it is built from.
.SUFFIXES:.o .cu .cpp
.cu.o:
	$(NVCC) -arch=sm_20 $(INCLUDE) -c -g -o $@ $< $(LIBRARIES)
.cpp.o:
	$(CXX) $(INCLUDE) -c -g -o $@ $< $(LIBRARIES)

#sudo cp /usr/local/cuda/lib64/libcufft.so.7.0 /usr/lib
# -f lets the symlinks be re-created on every build instead of failing with
# "File exists" the second time `make` runs.
all:$(OBJECTS) $(TIMEOBJECTS)
	ln -sf $(LINKPATH)libcudart.so.7.0 libcudart.so
	ln -sf $(LINKPATH)libcudart.so.7.0 libcudart.so.7
	g++ $(INCLUDE) -o $(TARGETS) $^ $(LIBRARIES)

# The timer objects are built in the shared timer directory and copied here.
# $(MAKE) propagates flags and the jobserver to the sub-make.
$(TIMEOBJECTS):
	cd ../shareIMLTimer && $(MAKE) && cp *.o ../$(TARGETS)
run:
	./$(TARGETS)
clean:
	rm -rf *.o kernel libcudart.so libcudart.so.7 $(TARGETS)
$ ./CUDAVectorAdd
Time = 0.089507 Sec
Result OK!
时间: 2024-10-27 18:06:48