I have recently been working on parallel computing, using the classic fast Fourier transform (FFT) as the test application. The development environment is Windows 7 with a GTX 660 Ti, and the software stack is CUDA 6.0, OpenCL 1.2, and VC2005.
I set out to crack the secret of cuFFT's speed. Unfortunately I did not succeed, but the exploration itself turned out to be quite interesting, so I am writing it up to share with everyone.
The experiment is designed as follows: perform 512 FFTs, each of size 2048, and check the program's output against MATLAB to confirm correctness. With correctness established, measure the time the program needs to compute the 512 size-2048 FFTs: first with CUDA's cuFFT, then with OpenCL in two ways, an ordinary parallel butterfly implementation and a matrix-based FFT algorithm. This gives a comparison of both the running time and the accuracy of CUDA and OpenCL.
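Here the MATLAB check is just abs(fft(x)) on the same ramp input. For a self-contained reference that needs no MATLAB, a naive O(N²) double-precision DFT of a single 2048-point ramp row serves the same purpose; below is a minimal sketch (the names are illustrative, not taken from the actual test program):

#include <math.h>
#include <stdio.h>

#define N 2048   /* FFT length, matching the GPU tests */

/* Naive O(N^2) DFT magnitude of the ramp x[j] = j, in double precision,
   used only as a reference for spot-checking the GPU results. */
int main(void)
{
    const double PI = 3.14159265358979323846;
    static double mag[N / 2];

    for (int k = 0; k < N / 2; k++) {
        double re = 0.0, im = 0.0;
        for (int j = 0; j < N; j++) {
            double angle = -2.0 * PI * (double)k * (double)j / (double)N;
            re += j * cos(angle);
            im += j * sin(angle);
        }
        mag[k] = sqrt(re * re + im * im);
    }

    /* print a few bins for comparison against the GPU output files */
    for (int k = 0; k < 8; k++)
        printf("bin %d: %f\n", k, mag[k]);
    return 0;
}

With a reference in hand, first the cuFFT version: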
1 #include "cuda_runtime.h" 2 #include "device_launch_parameters.h" 3 #include <stdio.h> 4 #include "cufft.h" 5 6 #define Nx 2048 // FFT 7 #define Ny 512 // FFT?? 8 #define pi 3.1415926 9 10 int main(void) 11 { 12 Mytime g_timer; 13 14 float *h_fk; 15 h_fk =(float*)malloc(sizeof(float)*Nx*Ny); 16 for (int j=0; j< Nx*Ny; j++) 17 { 18 h_fk[j] = j%Nx; 19 } 20 21 #if 0 22 FILE *fid_h_fk; 23 fid_h_fk = fopen("h_fk.txt","wt"); 24 if(fid_h_fk==NULL) 25 { 26 //cout<<"error"<<endl; 27 printf( "Error : file open failed \n " ); 28 system("pause"); 29 } 30 31 for (int j = 0; j < Nx; j++) 32 { 33 fprintf(fid_h_fk, "%f \n" , h_fk[j]); 34 } 35 fclose(fid_h_fk); 36 #endif 37 ///////////////////////////////////////////////////////////////////////// 38 /////////cufft 39 int i,j; 40 cufftComplex *h_hatCH; 41 h_hatCH=(cufftComplex*)malloc(sizeof(cufftComplex)*(Nx/2+1)*Ny); 42 43 cufftReal *d_fk; 44 cufftComplex *d_cufft_CH; 45 46 cudaMalloc((void**)&d_fk, Nx * Ny * sizeof(cufftReal)); 47 cudaMalloc((void**)&d_cufft_CH, (Nx/2+1) * Ny * sizeof(cufftComplex)); 48 49 cufftHandle plan; 50 51 cufftPlan1d(&plan, Nx, CUFFT_R2C,Ny); // Real float To Complex 52 53 g_timer.Start( &g_timer ); 54 55 cufftExecR2C(plan, (cufftReal*)d_fk, (cufftComplex*)d_cufft_CH); 56 57 cudaMemcpy(d_fk, h_fk, Nx * Ny * sizeof(cufftReal), cudaMemcpyHostToDevice); 58 59 //////////////////////////////////////////////////////////////////////////////////// 60 /////////// d_cufft_CH output 61 62 cudaMemcpy(h_hatCH, d_cufft_CH, sizeof(cufftComplex)*(Nx/2+1)*Ny, cudaMemcpyDeviceToHost); 63 64 gfFrametime = g_timer.End( &g_timer ); 65 printf("Time = %f Sec \n", gfFrametime); 66 #if 0 67 FILE *fid_h_hatCH; 68 fid_h_hatCH = fopen("h_hatCH1.txt","wt"); 69 if(fid_h_hatCH==NULL) 70 { 71 printf( "Error : file open failed \n " ); 72 system("pause"); 73 } 74 75 for (int j = 0; j < (Nx/2+1)*Ny; j++) 76 { 77 if (h_hatCH[j].y == 0) 78 fprintf(fid_h_hatCH, "%f\n" , h_hatCH[j].x); 79 else if (h_hatCH[j].y > 0) 80 fprintf(fid_h_hatCH, "%f+%fi\n" , h_hatCH[j].x ,h_hatCH[j].y ); 81 else fprintf(fid_h_hatCH, "%f%fi\n" , h_hatCH[j].x ,h_hatCH[j].y ); 82 } 83 fclose(fid_h_hatCH); 84 #endif 85 86 FILE *fid_h_hatCH; 87 fid_h_hatCH = fopen("h_hatCH1.txt","wt"); 88 if(fid_h_hatCH==NULL) 89 { 90 printf( "Error : file open failed \n " ); 91 system("pause"); 92 } 93 94 for (int j = 0; j < Nx/2; j++) 95 { 96 fprintf(fid_h_hatCH, "%f\n" , sqrt(pow(h_hatCH[j].x,2)+pow(h_hatCH[j].y,2))); 97 } 98 fclose(fid_h_hatCH); 99 100 cufftDestroy(plan); 101 cudaFree(d_fk); 102 cudaFree(d_cufft_CH); 103 free(h_hatCH); 104 free(h_fk); 105 106 return 0; 107 108 }
Running time: 0.003841 sec
With the OpenCL butterfly method, the device kernel source is:
#define PI   3.14159265358979323846
#define PI_2 1.57079632679489661923

/* twiddle factors */
__kernel void spinFact(__global float2* w, int n)
{
    unsigned int i = get_global_id(0);

    /* w[i] = (cos(2*pi*i/n), cos(2*pi*i/n + pi/2)) = exp(-j*2*pi*i/n) */
    float2 angle = (float2)(2*i*PI/(float)n, (2*i*PI/(float)n)+PI_2);
    w[i] = cos(angle);
}

/* bit-reversal reordering */
__kernel void bitReverse(__global float2 *dst, __global float2 *src, int m, int n)
{
    unsigned int gid = get_global_id(0);
    unsigned int nid = get_global_id(1);

    /* reverse the lower m bits of gid */
    unsigned int j = gid;
    j = (j & 0x55555555) << 1 | (j & 0xAAAAAAAA) >> 1;
    j = (j & 0x33333333) << 2 | (j & 0xCCCCCCCC) >> 2;
    j = (j & 0x0F0F0F0F) << 4 | (j & 0xF0F0F0F0) >> 4;
    j = (j & 0x00FF00FF) << 8 | (j & 0xFF00FF00) >> 8;
    j = (j & 0x0000FFFF) << 16 | (j & 0xFFFF0000) >> 16;

    j >>= (32-m);

    dst[nid*n+j] = src[nid*n+gid];
}

/* butterfly stage */
__kernel void butterfly(__global float2 *x, __global float2* w, int m, int n, int iter, uint flag)
{
    unsigned int gid = get_global_id(0);
    unsigned int nid = get_global_id(1);

    int butterflySize      = 1 << (iter-1);
    int butterflyGrpDist   = 1 << iter;
    int butterflyGrpNum    = n >> iter;
    int butterflyGrpBase   = (gid >> (iter-1)) * butterflyGrpDist;
    int butterflyGrpOffset = gid & (butterflySize-1);

    int a = nid * n + butterflyGrpBase + butterflyGrpOffset;
    int b = a + butterflySize;

    int l = butterflyGrpNum * butterflyGrpOffset;

    float2 xa, xb, xbxx, xbyy, wab, wayx, wbyx, resa, resb;

    xa = x[a];
    xb = x[b];
    xbxx = xb.xx;
    xbyy = xb.yy;

    /* flag toggles the sign bit of the twiddle's imaginary part (forward vs inverse) */
    wab  = as_float2(as_uint2(w[l]) ^ (uint2)(0x0, flag));
    wayx = as_float2(as_uint2(wab.yx) ^ (uint2)(0x80000000, 0x0));
    wbyx = as_float2(as_uint2(wab.yx) ^ (uint2)(0x0, 0x80000000));

    /* resa = xa + w*xb, resb = xa - w*xb, via component swizzles */
    resa = xa + xbxx*wab + xbyy*wayx;
    resb = xa - xbxx*wab + xbyy*wbyx;

    x[a] = resa;
    x[b] = resb;
}
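These kernels are driven from the host: spinFact fills an n/2-entry twiddle table once, bitReverse permutes each row into bit-reversed order, and butterfly is then enqueued log2(n) times. A minimal sketch of that host-side sequence, assuming the kernels and buffers have already been created (all handle names here are hypothetical, and error checking is omitted):

#include <CL/cl.h>

/* Hypothetical driver for the kernels above: one batched radix-2 FFT of
   length n (a power of two, m = log2(n)) over `batch` rows. kspin, kbrev
   and kbfly are the compiled spinFact / bitReverse / butterfly kernels;
   src, dst and twiddle are float2 buffers of n*batch, n*batch and n/2
   elements respectively. */
static void enqueue_fft(cl_command_queue queue,
                        cl_kernel kspin, cl_kernel kbrev, cl_kernel kbfly,
                        cl_mem src, cl_mem dst, cl_mem twiddle,
                        cl_int n, cl_int m, cl_int batch, cl_uint flag)
{
    size_t gws_spin    = (size_t)n / 2;                      /* one item per twiddle   */
    size_t gws_rev[2]  = { (size_t)n,     (size_t)batch };   /* one item per element   */
    size_t gws_bfly[2] = { (size_t)n / 2, (size_t)batch };   /* one item per butterfly */

    /* fill the n/2-entry twiddle table */
    clSetKernelArg(kspin, 0, sizeof(cl_mem), &twiddle);
    clSetKernelArg(kspin, 1, sizeof(cl_int), &n);
    clEnqueueNDRangeKernel(queue, kspin, 1, NULL, &gws_spin, NULL, 0, NULL, NULL);

    /* bit-reverse reorder: src -> dst */
    clSetKernelArg(kbrev, 0, sizeof(cl_mem), &dst);
    clSetKernelArg(kbrev, 1, sizeof(cl_mem), &src);
    clSetKernelArg(kbrev, 2, sizeof(cl_int), &m);
    clSetKernelArg(kbrev, 3, sizeof(cl_int), &n);
    clEnqueueNDRangeKernel(queue, kbrev, 2, NULL, gws_rev, NULL, 0, NULL, NULL);

    /* m in-place butterfly stages; flag = 0 selects the forward transform */
    for (cl_int iter = 1; iter <= m; iter++) {
        clSetKernelArg(kbfly, 0, sizeof(cl_mem),  &dst);
        clSetKernelArg(kbfly, 1, sizeof(cl_mem),  &twiddle);
        clSetKernelArg(kbfly, 2, sizeof(cl_int),  &m);
        clSetKernelArg(kbfly, 3, sizeof(cl_int),  &n);
        clSetKernelArg(kbfly, 4, sizeof(cl_int),  &iter);
        clSetKernelArg(kbfly, 5, sizeof(cl_uint), &flag);
        clEnqueueNDRangeKernel(queue, kbfly, 2, NULL, gws_bfly, NULL, 0, NULL, NULL);
    }

    clFinish(queue);   /* make the timing measurement meaningful */
}

With n = 2048, this means 13 kernel enqueues per batch; that launch overhead is one plausible contributor to the gap with cuFFT.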
Computation time: 0.060381 sec
The source for the matrix-algorithm version is:
/* BLOCK_SIZE and the tile-access macros are normally supplied through the
   program build options; typical definitions are: */
#define BLOCK_SIZE 16
#define AS(i, j)  As [(i) * BLOCK_SIZE + (j)]
#define BSr(i, j) Bsr[(i) * BLOCK_SIZE + (j)]
#define BSi(i, j) Bsi[(i) * BLOCK_SIZE + (j)]

///////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! uiWA is A's width and uiWB is B's width
///////////////////////////////////////////////////////////////////////////////
__kernel void
matrixMul( __global float* C, __global float* A, __global float2* B,
           __local float* As, __local float* Bsr, __local float* Bsi,
           int uiWA, int uiWB)
{
    // Block index
    int bx = get_group_id(0);
    int by = get_group_id(1);

    // Thread index
    int tx = get_local_id(0);
    int ty = get_local_id(1);

    // Index of the first sub-matrix of A processed by the block
    int aBegin = uiWA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + uiWA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * uiWB;

    // Csub accumulates the complex element of the block sub-matrix
    // computed by this thread (A is real, B is complex)
    float2 Csub = (float2)(0.0f, 0.0f);

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
         a <= aEnd;
         a += aStep, b += bStep) {

        // Load the matrices from device memory to local memory;
        // each thread loads one element of each matrix
        AS(ty, tx)  = A[a + uiWA * ty + tx];
        BSr(ty, tx) = B[b + uiWB * ty + tx].x;
        BSi(ty, tx) = B[b + uiWB * ty + tx].y;

        // Synchronize to make sure the tiles are loaded
        barrier(CLK_LOCAL_MEM_FENCE);

        // Multiply the two tiles; each thread computes one element
        // of the block sub-matrix
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += (float2)(AS(ty, k) * BSr(k, tx), AS(ty, k) * BSi(k, tx));

        // Synchronize to make sure that the preceding computation is done
        // before loading two new tiles in the next iteration
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Write the power of the complex result in dB;
    // each thread writes one element
    C[get_global_id(1) * get_global_size(0) + get_global_id(0)] =
        10 * log10(Csub.x * Csub.x + Csub.y * Csub.y);
}
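The "matrix algorithm" is in effect a direct DFT: pack the 512 input rows into a real matrix A of size 512 x 2048, let B be the 2048 x 2048 complex DFT coefficient matrix, and C = A * B yields all 512 spectra in one multiplication (the kernel above additionally converts each output to power in dB). A sketch of how B could be filled on the host, matching the kernel's interleaved float2 layout (the function name and layout choice are assumptions, not the author's host code):

#include <math.h>
#include <stdlib.h>

#define N 2048   /* FFT length; B is N x N */

/* Fill the N x N complex DFT matrix as row-major interleaved (re, im)
   pairs, i.e. the float2 layout the kernel reads:
   B[k][j] = exp(-2*pi*i*k*j/N). */
static float *make_dft_matrix(void)
{
    const double PI = 3.14159265358979323846;
    float *B = (float*)malloc(sizeof(float) * 2 * (size_t)N * N);

    for (int k = 0; k < N; k++) {
        for (int j = 0; j < N; j++) {
            double angle = -2.0 * PI * (double)k * (double)j / (double)N;
            B[2 * ((size_t)k * N + j)]     = (float)cos(angle);  /* real part      */
            B[2 * ((size_t)k * N + j) + 1] = (float)sin(angle);  /* imaginary part */
        }
    }
    return B;
}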
Computation time: 0.024020 sec
In terms of accuracy, however, the OpenCL results are clearly better than CUDA's.