多GPU设备处理点积示例,项目打包下载
/*
 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */


#include "../common/book.h"
#include "cuda.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include "cuda_runtime.h"

#define imin(a,b) (a<b?a:b)

#define N (33*1024*1024)
const int threadsPerBlock = 256;
const int blocksPerGrid =
    imin(32, (N / 2 + threadsPerBlock - 1) / threadsPerBlock);


/*
 * Partial dot-product kernel.
 *
 * Each thread accumulates a private partial sum over a grid-stride loop,
 * the block then folds its threads' partials with a shared-memory tree
 * reduction, and c[blockIdx.x] receives one partial sum per block.
 * The host finishes by summing the blocksPerGrid partials.
 *
 * Preconditions: blockDim.x == threadsPerBlock, and threadsPerBlock is a
 * power of two (required by the halving reduction below).
 */
__global__ void dot(int size, float *a, float *b, float *c) {
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    // Grid-stride loop: correct for any grid size and any `size`.
    float temp = 0;
    while (tid < size) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize before any thread reads another thread's partial
    __syncthreads();

    // In-block tree reduction (requires power-of-two blockDim.x).
    // Note __syncthreads() sits OUTSIDE the divergent `if` so every
    // thread in the block reaches the barrier.
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        __syncthreads();
        i /= 2;
    }

    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}


// Work descriptor for one host thread / one GPU: compute the dot product
// of `size` elements starting at a/b on device `deviceID`, and deposit
// the scalar result in `returnValue`.
struct DataStruct {
    int deviceID;
    int size;
    float *a;
    float *b;
    float returnValue;
};

/*
 * Host-thread entry point: binds the calling thread to data->deviceID,
 * ships that thread's slice to the GPU, launches the partial-dot kernel,
 * and finishes the reduction on the CPU. The result lands in
 * data->returnValue; the return value itself is unused (thread-API
 * convention only).
 */
unsigned WINAPI routine(void *pvoidData)
//void* routine(void *pvoidData)
{
    DataStruct *data = (DataStruct*)pvoidData;
    HANDLE_ERROR(cudaSetDevice(data->deviceID));

    int size = data->size;
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the CPU side
    a = data->a;
    b = data->b;
    partial_c = (float*)malloc(blocksPerGrid * sizeof(float));
    HANDLE_NULL(partial_c);  // was unchecked; be consistent with main()

    // allocate the memory on the GPU
    HANDLE_ERROR(cudaMalloc((void**)&dev_a,
                            size * sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b,
                            size * sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_partial_c,
                            blocksPerGrid * sizeof(float)));

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR(cudaMemcpy(dev_a, a, size * sizeof(float),
                            cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(dev_b, b, size * sizeof(float),
                            cudaMemcpyHostToDevice));

    dot<<<blocksPerGrid, threadsPerBlock>>>(size, dev_a, dev_b,
                                            dev_partial_c);
    // Kernel launches do not return errors directly: bad launch configs
    // surface only via cudaGetLastError(), and asynchronous execution
    // faults surface at the next blocking call (the memcpy below).
    HANDLE_ERROR(cudaGetLastError());

    // copy the array 'c' back from the GPU to the CPU
    HANDLE_ERROR(cudaMemcpy(partial_c, dev_partial_c,
                            blocksPerGrid * sizeof(float),
                            cudaMemcpyDeviceToHost));

    // finish up on the CPU side: sum the per-block partials
    c = 0;
    for (int i = 0; i < blocksPerGrid; i++) {
        c += partial_c[i];
    }

    HANDLE_ERROR(cudaFree(dev_a));
    HANDLE_ERROR(cudaFree(dev_b));
    HANDLE_ERROR(cudaFree(dev_partial_c));

    // free memory on the CPU side
    free(partial_c);

    data->returnValue = c;
    return 0;
}


int main(void) {
    int deviceCount;
    HANDLE_ERROR(cudaGetDeviceCount(&deviceCount));
    // this sample drives two GPUs, so require at least two devices
    if (deviceCount < 2) {
        printf("We need at least two compute 1.0 or greater "
               "devices, but only found %d\n", deviceCount);
        return 0;
    }

    float *a = (float*)malloc(sizeof(float) * N);
    HANDLE_NULL(a);
    float *b = (float*)malloc(sizeof(float) * N);
    HANDLE_NULL(b);

    // fill in the host memory with data
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i * 2;
    }

    /*
     * Prepare for multithreading: each DataStruct describes one half of
     * the data set, to be processed by its own host thread on its own
     * device (device 1 gets the second half via pointer offset).
     */
    DataStruct data[2];
    data[0].deviceID = 0;
    data[0].size = N / 2;
    data[0].a = a;
    data[0].b = b;

    data[1].deviceID = 1;
    data[1].size = N / 2;
    data[1].a = a + N / 2;
    data[1].b = b + N / 2;

    // Device 0 runs on a helper thread; device 1 on the main thread.
    // end_thread() joins, so both results are ready afterwards.
    CUTThread thread = start_thread(routine, &(data[0]));
    routine(&(data[1]));
    end_thread(thread);


    // free memory on the CPU side
    free(a);
    free(b);

    printf("Value calculated: %f\n",
           data[0].returnValue + data[1].returnValue);

    return 0;
}
时间: 2024-10-09 17:49:53