Several modes of CUDA parallel computing

  #include "cuda_runtime.h"
  #include "device_launch_parameters.h"
  #include <stdio.h>
  #include <time.h>
  #include <stdlib.h>

  #define MAX 120
  #define MIN 0

  cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size,
          float* etime);
  cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,
          float* etime, int type);

  // Block-parallel kernel: one single-thread block per element, indexed by blockIdx.x.
  __global__ void addKernel(int *c, const int *a, const int *b) {
      int i = blockIdx.x;
      c[i] = a[i] + b[i];
  }

  // Thread-parallel kernel: one thread per element within a single block,
  // indexed by threadIdx.x.
  __global__ void addKernelThread(int *c, const int *a, const int *b) {
      int i = threadIdx.x;
      c[i] = a[i] + b[i];
  }
  int main() {
      const int arraySize = 800;
      srand((unsigned) time(NULL));
      int a[arraySize] = { 1, 2, 3, 4, 5 };
      int b[arraySize] = { 10, 20, 30, 40, 50 };

      // Overwrite the initializers with random values in [MIN, MAX].
      for (int i = 0; i < arraySize; i++) {
          a[i] = rand() % (MAX + 1 - MIN) + MIN;
          b[i] = rand() % (MAX + 1 - MIN) + MIN;
      }
      int c[arraySize] = { 0 };
      // Add vectors in parallel.
      cudaError_t cudaStatus;
      int num = 0;
      cudaDeviceProp prop;
      cudaStatus = cudaGetDeviceCount(&num);
      for (int i = 0; i < num; i++) {
          cudaGetDeviceProperties(&prop, i);
      }

      float time;
      cudaStatus = addWithCudaStream(c, a, b, arraySize, &time);
      printf("Elapsed time of stream is : %f \n", time);
      // Print the last five elements of each vector.
      printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
              a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2],
              a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0],
              b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3],
              b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1],
              c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "addWithCudaStream failed!");
          return 1;
      }
      cudaStatus = addWithCuda(c, a, b, arraySize, &time, 0);
      printf("Elapsed time of Block is : %f \n", time);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "addWithCuda failed!");
          return 1;
      }
      printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
              a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2],
              a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0],
              b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3],
              b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1],
              c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]);

      cudaStatus = addWithCuda(c, a, b, arraySize, &time, 1);
      printf("Elapsed time of thread is : %f \n", time);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "addWithCuda failed!");
          return 1;
      }
      printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
              a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2],
              a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0],
              b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3],
              b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1],
              c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]);

      cudaStatus = addWithCudaStream(c, a, b, arraySize, &time);
      printf("Elapsed time of stream is : %f \n", time);
      printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
              a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2],
              a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0],
              b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3],
              b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1],
              c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "addWithCudaStream failed!");
          return 1;
      }
      // cudaThreadExit must be called before exiting in order for profiling and
      // tracing tools such as Nsight and Visual Profiler to show complete traces.
      // (cudaThreadExit is deprecated in newer CUDA releases; cudaDeviceReset
      // is its replacement.)
      cudaStatus = cudaThreadExit();
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaThreadExit failed!");
          return 1;
      }
      return 0;
  }
  // Helper function for using CUDA to add vectors in parallel via streams.
  // Note: as written, this launches five single-element kernels (elements
  // 0..4 only), so most of dev_c is never written before being copied back.
  cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size,
          float* etime) {
      int *dev_a = 0;
      int *dev_b = 0;
      int *dev_c = 0;
      clock_t start, stop;
      float time;
      cudaError_t cudaStatus;

      // Choose which GPU to run on; change this on a multi-GPU system.
      cudaStatus = cudaSetDevice(0);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr,
                  "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
          goto Error;
      }
      // Allocate GPU buffers for three vectors (two input, one output).
      cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int));
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMalloc failed!");
          goto Error;
      }
      cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int));
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMalloc failed!");
          goto Error;
      }
      cudaStatus = cudaMalloc((void**) &dev_b, size * sizeof(int));
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMalloc failed!");
          goto Error;
      }
      // Copy input vectors from host memory to GPU buffers.
      cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int),
              cudaMemcpyHostToDevice);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMemcpy failed!");
          goto Error;
      }
      cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int),
              cudaMemcpyHostToDevice);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMemcpy failed!");
          goto Error;
      }
      cudaStream_t stream[5];
      for (int i = 0; i < 5; i++) {
          cudaStreamCreate(&stream[i]);   // create the streams
      }
      // Launch one single-thread kernel per stream, each handling one element.
      for (int i = 0; i < 5; i++) {
          addKernel<<<1, 1, 0, stream[i]>>>(dev_c + i, dev_a + i, dev_b + i);
      }
      // Launches are asynchronous; this interval therefore times the
      // synchronization that follows them, not the launches themselves.
      start = clock();
      cudaDeviceSynchronize();
      stop = clock();
      time = (float) (stop - start) / CLOCKS_PER_SEC;
      *etime = time;
      // cudaThreadSynchronize waits for the kernel to finish, and returns
      // any errors encountered during the launch. (It is deprecated; newer
      // code should call cudaDeviceSynchronize instead.)
      cudaStatus = cudaThreadSynchronize();
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr,
                  "cudaThreadSynchronize returned error code %d after launching addKernel!\n",
                  cudaStatus);
          goto Error;
      }
      // Copy output vector from GPU buffer to host memory.
      cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int),
              cudaMemcpyDeviceToHost);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMemcpy failed!");
          goto Error;
      }
  Error:
      for (int i = 0; i < 5; i++) {
          cudaStreamDestroy(stream[i]);   // destroy the streams
      }
      cudaFree(dev_c);
      cudaFree(dev_a);
      cudaFree(dev_b);
      return cudaStatus;
  }
  cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,
          float * etime, int type) {
      int *dev_a = 0;
      int *dev_b = 0;
      int *dev_c = 0;
      clock_t start, stop;
      float time;
      cudaError_t cudaStatus;

      // Choose which GPU to run on; change this on a multi-GPU system.
      cudaStatus = cudaSetDevice(0);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr,
                  "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
          goto Error;
      }
      // Allocate GPU buffers for three vectors (two input, one output).
      cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int));
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMalloc failed!");
          goto Error;
      }
      cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int));
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMalloc failed!");
          goto Error;
      }
      cudaStatus = cudaMalloc((void**) &dev_b, size * sizeof(int));
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMalloc failed!");
          goto Error;
      }
      // Copy input vectors from host memory to GPU buffers.
      cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int),
              cudaMemcpyHostToDevice);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMemcpy failed!");
          goto Error;
      }
      cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int),
              cudaMemcpyHostToDevice);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMemcpy failed!");
          goto Error;
      }

      if (type == 0) {
          start = clock();
          addKernel<<<size, 1>>>(dev_c, dev_a, dev_b);       // block-parallel
      } else {
          start = clock();
          addKernelThread<<<1, size>>>(dev_c, dev_a, dev_b); // thread-parallel
      }
      // The launch returns immediately (it is asynchronous), so this interval
      // captures launch overhead rather than kernel execution time.
      stop = clock();
      time = (float) (stop - start) / CLOCKS_PER_SEC;
      *etime = time;
      // cudaThreadSynchronize waits for the kernel to finish, and returns
      // any errors encountered during the launch. (Deprecated; use
      // cudaDeviceSynchronize in newer code.)
      cudaStatus = cudaThreadSynchronize();
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr,
                  "cudaThreadSynchronize returned error code %d after launching addKernel!\n",
                  cudaStatus);
          goto Error;
      }
      // Copy output vector from GPU buffer to host memory.
      cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int),
              cudaMemcpyDeviceToHost);
      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMemcpy failed!");
          goto Error;
      }
  Error:
      cudaFree(dev_c);
      cudaFree(dev_a);
      cudaFree(dev_b);
      return cudaStatus;
  }
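One caveat about the timings the program prints: kernel launches are asynchronous, so wrapping clock() around a launch (as addWithCuda does) measures only launch overhead, and timing just the cudaDeviceSynchronize call (as addWithCudaStream does) measures only the wait. Below is a minimal sketch of event-based timing around the block-parallel launch in addWithCuda; the event API is the standard CUDA runtime one, but where to place the markers is an assumption about what one wants to measure:

      // Hypothetical replacement for the clock() timing in addWithCuda:
      cudaEvent_t evStart, evStop;
      cudaEventCreate(&evStart);
      cudaEventCreate(&evStop);
      cudaEventRecord(evStart, 0);                 // enqueue a start marker
      addKernel<<<size, 1>>>(dev_c, dev_a, dev_b);
      cudaEventRecord(evStop, 0);                  // enqueue a stop marker
      cudaEventSynchronize(evStop);                // wait for kernel and markers
      float ms = 0.0f;
      cudaEventElapsedTime(&ms, evStart, evStop);  // elapsed time in milliseconds
      *etime = ms / 1000.0f;                       // keep the seconds convention
      cudaEventDestroy(evStart);
      cudaEventDestroy(evStop);

Because the events are recorded in the same stream as the kernel, the measured interval brackets the kernel's actual execution on the device rather than the host-side launch call.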

The implementation above uses three kinds of parallelism: thread, block, and stream. Running five computations across the three methods, I found that the first stream computation produces wrong results even though the subroutine being called never changes. I haven't figured out why (a possible explanation follows the output below).

Elapsed time of stream is : 0.000006
{47,86,67,35,16} + {114,39,110,20,101} = {158,123,92,107,127}
Elapsed time of Block is : 0.000006
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elapsed time of stream is : 0.000008
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elapsed time of thread is : 0.000004
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elapsed time of stream is : 0.000007
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
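A likely explanation for the anomaly: addWithCudaStream launches only five single-element kernels, so it computes c[0..4] and never writes the elements that get printed (indices 795..799). On the first call those device bytes are uninitialized, hence the wrong sums; on later calls cudaMalloc apparently returns the just-freed allocation still holding the previous run's correct results, so the printout happens to match. Below is a sketch of a launch loop that covers the whole array, intended as a drop-in replacement for the five-launch loop inside addWithCudaStream; the contiguous-chunk split is my own choice and assumes size is divisible by the stream count (true for size = 800):

      // Give each of the five streams one contiguous chunk of the vector,
      // so every element of dev_c is actually computed.
      size_t chunk = size / 5;                     // assumes size % 5 == 0
      for (int i = 0; i < 5; i++) {
          size_t offset = i * chunk;
          // one single-thread block per element within this chunk
          addKernel<<<(unsigned int) chunk, 1, 0, stream[i]>>>(
                  dev_c + offset, dev_a + offset, dev_b + offset);
      }
      cudaDeviceSynchronize();                     // wait for all five streams

With this change the first stream run should produce the same sums as the block and thread versions, since every printed element is computed rather than inherited from whatever the freshly allocated device memory happened to contain.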

