OpenACC 书上的范例代码(Jacobi 迭代),part 2

? 使用Jacobi 迭代求泊松方程的数值解

● 首次使用 OpenACC 进行加速,使用动态数组,去掉了误差控制

 1 #include <stdio.h>
 2 #include <stdlib.h>
 3 #include <math.h>
 4 #include <openacc.h>
 5
 6 #if defined(_WIN32) || defined(_WIN64)
 7     #include <C:\Program Files (x86)\Windows Kits\10\Include\10.0.10150.0\ucrt\sys\timeb.h>
 8     #define gettime(a)  _ftime(a)
 9     #define usec(t1,t2) ((((t2).time - (t1).time) * 1000 + (t2).millitm - (t1).millitm))
10     typedef struct _timeb timestruct;
11 #else
12     #include <sys/time.h>
13     #define gettime(a)  gettimeofday(a, NULL)
14     #define usec(t1,t2) (((t2).tv_sec - (t1).tv_sec) * 1000 + ((t2).tv_usec - (t1).tv_usec)/1000)
15     typedef struct timeval timestruct;
16 #endif
17
18 #define ROW 8191
19 #define COL 1023
20
21 inline float uval(float x, float y)
22 {
23     return x * x + y * y;
24 }
25
26 int main()
27 {
28     const float width = 2.0, height = 1.0, hx = height / ROW, hy = width / COL, hx2 = hx * hx, hy2 = hy * hy;
29     const float fij = -4.0f;
30     const float maxIter = 100, errtol = 0.0f;
31     const int COLp1 = COL + 1;
32     const float c1 = hx2 * hy2 * fij, c2 = 1.0f / (2.0 * (hx2 + hy2));
33     float *restrict u0 = (float *)malloc(sizeof(float)*(ROW + 1) * COLp1);
34     float *restrict u1 = (float *)malloc(sizeof(float)*(ROW + 1) * COLp1);
35
36     // 初始化
37     int ix, jy;
38     for (ix = 0; ix <= ROW; ix++)
39     {
40         u0[ix * COLp1 + 0] = u1[ix * COLp1 + 0] = uval(ix * hx, 0.0f);
41         u0[ix * COLp1 + COL] = u1[ix * COLp1 + COL] = uval(ix * hx, COL * hy);
42     }
43     for (jy = 0; jy <= COL; jy++)
44     {
45         u0[jy] = u1[jy] = uval(0.0f, jy * hy);
46         u0[ROW * COLp1 + jy] = u1[ROW * COLp1 + jy] = uval(ROW * hx, jy * hy);
47     }
48     for (ix = 1; ix < ROW; ix++)
49     {
50         for (jy = 1; jy < COL; jy++)
51             u0[ix * COLp1 + jy] = 0.0f;
52     }
53
54     // 计算
55     timestruct t1, t2;
56     float /*uerr, temp,*/ *tempp;
57     acc_init(4);        // 初始化设备,以便准确计时,4 代表 nVidia 设备
58     gettime(&t1);
59
60     for (int iter = 1; iter < maxIter; iter++)
61     {
62 #pragma acc kernels copy(u0[0:(ROW + 1) * COLp1], u1[0:(ROW + 1) * COLp1])
63         //uerr = 0.0f;
64 #pragma acc loop independent
65         for (ix = 1; ix < ROW; ix++)
66         {
67             for (jy = 1; jy < COL; jy++)
68             {
69                 u1[ix*COLp1 + jy] = c2 *
70                     (
71                         hy2 * (u0[(ix - 1) * COLp1 + jy] + u0[(ix + 1) * COLp1 + jy]) +
72                         hx2 * (u0[ix * COLp1 + (jy - 1)] + u0[ix * COLp1 + (jy + 1)]) +
73                         c1
74                         );
75                 //temp = fabs(u0[ix * COLp1 + jy] - u1[ix * COLp1 + jy]);
76                 //uerr = max(temp, uerr);
77             }
78         }
79         //printf("\niter = %d, uerr = %e\n", iter, uerr);
80         //if (uerr < errtol)
81         //    break;
82         tempp = u0, u0 = u1, u1 = tempp;
83     }
84     gettime(&t2);
85     acc_shutdown(4);    // 关闭设备,以便准确计时
86     printf("\nElapsed time: %13ld ms.\n", usec(t1, t2));
87     free(u0);
88     free(u1);
89     //getchar();
90     return 0;
91 }

● 运行结果,在 WSL 上跑时间为 797 ms。使用了环境变量 PGI_ACC_TIME=1,输出运行时间情况

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -c99 -acc -Minfo main.c -o main_acc.exe
main:
     50, Memory zero idiom, loop replaced by call to __c_mzero4
     60, Generating copy(u1[:COLp1*8192],u0[:COLp1*8192])
     65, Loop is parallelizable
     67, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         65, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
         67, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     67, FMA (fused multiply-add) instruction(s) generated
uval:
     23, FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=67 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=67 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4

... // 中间省略 97 行,都是一样的内容,相当于重复 kernel 100 次

launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=67 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4

Elapsed time:          2271 ms.

PGI: "acc_shutdown" not detected, performance results might be incomplete.
 Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.

Accelerator Kernel Timing data
D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
  main  NVIDIA  devicenum=0
    time(us): 1,019,022
    60: data region reached 198 times
        60: data copyin transfers: 396
             device time(us): total=510,057 max=1,347 min=1,279 avg=1,288
        82: data copyout transfers: 594
             device time(us): total=508,965 max=1,415 min=2 avg=856
    62: compute region reached 99 times
        67: kernel launched 99 times
            grid: [32x1024]  block: [32x4]
            elapsed time(us): total=49,000 max=1000 min=0 avg=494

● 中间曾出现报错 “PGC-S-0155-Cannot determine bounds for array u0 (main.c: 60)”,“PGC-S-0155-Cannot determine bounds for array u1 (main.c: 60)”,报错出现在改行循环上,而不是后面的导语上,尝试各种方法均无果,包括参考(https://stackoverflow.com/questions/38470116/openacc-alias-for-an-array-results-in-cannot-determine-bounds-for-array-erro)。后来先尝试把数组 c1 和 c2 改成静态数组(下面的代码),成功后换回动态数组版本,发现不再报错。原因不明,可能与 PGI 编译器、C99 标准(关键字 restrict)有关。

● 使用 Nvvp 分析,发现密密麻麻的都是内存拷贝,计算反而是瞬间完成:

● 使用静态数组,去掉了误差控制

  1 #include <stdio.h>
  2 #include <stdlib.h>
  3 #include <math.h>
  4 #include <openacc.h>
  5
  6 #if defined(_WIN32) || defined(_WIN64)
  7 #include <C:\Program Files (x86)\Windows Kits\10\Include\10.0.10150.0\ucrt\sys\timeb.h>
  8 #define gettime(a)  _ftime(a)
  9 #define usec(t1,t2) ((((t2).time - (t1).time) * 1000 + (t2).millitm - (t1).millitm))
 10 typedef struct _timeb timestruct;
 11 #else
 12 #include <sys/time.h>
 13 #define gettime(a)  gettimeofday(a, NULL)
 14 #define usec(t1,t2) (((t2).tv_sec - (t1).tv_sec) * 1000 + ((t2).tv_usec - (t1).tv_usec)/1000)
 15 typedef struct timeval timestruct;
 16 #endif
 17
 18 #define ROW 8191
 19 #define COL 1023
 20
 21 inline float uval(float x, float y)
 22 {
 23     return x * x + y * y;
 24 }
 25
 26 int main()
 27 {
 28     const float width = 2.0, height = 1.0, hx = height / ROW, hy = width / COL, hx2 = hx * hx, hy2 = hy * hy;
 29     const float fij = -4.0f;
 30     const float maxIter = 100, errtol = 0.0f;
 31     const int COLp1 = COL + 1;
 32     const float c1 = hx2 * hy2 * fij, c2 = 1.0f / (2.0 * (hx2 + hy2));
 33     float u0[(ROW + 1) * COLp1];
 34     float u1[(ROW + 1) * COLp1];
 35
 36     // 初始化
 37     int ix, jy;
 38     for (ix = 0; ix <= ROW; ix++)
 39     {
 40         u0[ix * COLp1 + 0] = u1[ix * COLp1 + 0] = uval(ix * hx, 0.0f);
 41         u0[ix * COLp1 + COL] = u1[ix * COLp1 + COL] = uval(ix * hx, COL * hy);
 42     }
 43     for (jy = 0; jy <= COL; jy++)
 44     {
 45         u0[jy] = u1[jy] = uval(0.0f, jy * hy);
 46         u0[ROW * COLp1 + jy] = u1[ROW * COLp1 + jy] = uval(ROW * hx, jy * hy);
 47     }
 48     for (ix = 1; ix < ROW; ix++)
 49     {
 50         for (jy = 1; jy < COL; jy++)
 51             u0[ix * COLp1 + jy] = 0.0f;
 52     }
 53
 54     // 计算
 55     timestruct t1, t2;
 56     float /*uerr, temp,*/ *tempp;
 57     acc_init(4);        // 初始化设备,以便准确计时
 58     gettime(&t1);
 59 #pragma acc kernels copy(u0[0:(ROW + 1) * COLp1], u1[0:(ROW + 1) * COLp1])
 60     for (int iter = 1; iter < maxIter; iter++)
 61     {
 62         //uerr = 0.0f;
 63         if (iter % 2)   // 分 iter 奇偶性考虑写入目标
 64         {
 65 #pragma acc loop independent
 66             for (ix = 1; ix < ROW; ix++)
 67             {
 68                 for (jy = 1; jy < COL; jy++)
 69                 {
 70                     u1[ix*COLp1 + jy] = c2 *
 71                         (
 72                             hy2 * (u0[(ix - 1) * COLp1 + jy] + u0[(ix + 1) * COLp1 + jy]) +
 73                             hx2 * (u0[ix * COLp1 + (jy - 1)] + u0[ix * COLp1 + (jy + 1)]) +
 74                             c1
 75                             );
 76                     //temp = fabs(u0[ix * COLp1 + jy] - u1[ix * COLp1 + jy]);
 77                     //uerr = max(temp, uerr);
 78                 }
 79             }
 80         }
 81         else
 82         {
 83 #pragma acc loop independent
 84             for (ix = 1; ix < ROW; ix++)
 85             {
 86                 for (jy = 1; jy < COL; jy++)
 87                 {
 88                     u0[ix*COLp1 + jy] = c2 *
 89                         (
 90                             hy2 * (u1[(ix - 1) * COLp1 + jy] + u1[(ix + 1) * COLp1 + jy]) +
 91                             hx2 * (u1[ix * COLp1 + (jy - 1)] + u1[ix * COLp1 + (jy + 1)]) +
 92                             c1
 93                             );
 94                     //temp = fabs(u0[ix * COLp1 + jy] - u1[ix * COLp1 + jy]);
 95                     //uerr = max(temp, uerr);
 96                 }
 97             }
 98         }
 99         //printf("\niter = %d, uerr = %e\n", iter, uerr);
100         //if (uerr < errtol)
101         //    break;
102     }
103     gettime(&t2);
104     acc_shutdown(4);    // 关闭设备,以便准确计时
105     printf("\nElapsed time: %13ld ms.\n", usec(t1, t2));
106     //getchar();
107     return 0;
108 }

● 输出结果,在 WSL 上跑时间为 849 ms

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -c99 -acc -Minfo main.c -o main_acc.exe
main:
     50, Memory zero idiom, loop replaced by call to __c_mzero4
     59, Generating copy(u1[:COLp1*8192],u0[:COLp1*8192])
     60, Loop carried dependence due to exposed use of u1[:COLp1*8192],u0[:COLp1*8192] prevents parallelization
         Accelerator kernel generated
         Generating Tesla code
         60, #pragma acc loop seq
         66, #pragma acc loop seq
         68, #pragma acc loop vector(128) /* threadIdx.x */
         84, #pragma acc loop seq
         86, #pragma acc loop vector(128) /* threadIdx.x */
     66, Loop is parallelizable
     68, Loop is parallelizable
         FMA (fused multiply-add) instruction(s) generated
     84, Loop is parallelizable
     86, Loop is parallelizable
         FMA (fused multiply-add) instruction(s) generated
uval:
     23, FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main line=60 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=128 grid=1 block=128

Elapsed time:          1987 ms.

PGI: "acc_shutdown" not detected, performance results might be incomplete.
 Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.

Accelerator Kernel Timing data
D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
  main  NVIDIA  devicenum=0
    time(us): 10,461
    59: compute region reached 1 time
        60: kernel launched 1 time
            grid: [1]  block: [128]
            elapsed time(us): total=3,950,000 max=3,950,000 min=3,950,000 avg=3,950,000
    59: data region reached 2 times
        59: data copyin transfers: 4
             device time(us): total=5,133 max=1,287 min=1,282 avg=1,283
        104: data copyout transfers: 6
             device time(us): total=5,328 max=1,369 min=2 avg=888

● 使用 Nvvp 分析,发现内存拷贝只有首尾两次,中间全在 GPU 上计算:

原文地址:https://www.cnblogs.com/cuancuancuanhao/p/9416237.html

时间: 2024-08-30 11:41:27

OpenACC 书上的范例代码(Jacobi 迭代),part 2的相关文章

OpenACC 书上的范例代码(Jacobi 迭代),part 3

? 使用Jacobi 迭代求泊松方程的数值解 ● 使用 data 构件,强行要求 u0 仅拷入和拷出 GPU 各一次,u1 仅拷入GPU 一次 1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <math.h> 4 #include <openacc.h> 5 6 #if defined(_WIN32) || defined(_WIN64) 7 #include <C:\Program File

数据结构书上的纵横图代码

#include <stdio.h> #include <stdlib.h> #define MAXSIZE 20 int main() { int m, magic[MAXSIZE][MAXSIZE] = {0}, i = 0,j, x, y; scanf("%d",&m); magic[i][m/2] = 1; x = 0;y =m/2; for(i = 2;i <= m*m ;i++)//我进行的优化及改进 { if(magic[(x-1+m

uva 213 - Message Decoding (我觉得我的方法要比书上少很多代码,不保证好……)

#include<stdio.h> #include<math.h> #include<string.h> char s[250]; char a[10][250]; int a1[4]; int a2[250]; char ch; int init(int len) { int tt=0; for(int i=1;i<=7;i++) { for(int j=0;j<(int)pow(2,i)-1;j++) { a[i][j]=s[tt++]; if(tt&

C#高级编程(第9版) -C#5.0&amp;.Net4.5.1 书上的示例代码下载链接

http://www.wrox.com/WileyCDA/WroxTitle/Professional-C-5-0-and-NET-4-5-1.productCd-1118833031,descCd-DOWNLOAD.html http://www.cnblogs.com/zhouyinhui/archive/2007/11/08/952020.html   //中文简易版 https://msdn.microsoft.com/en-us/library/ms788718.aspx  英文版本的

Java 初学 第一弹--编译并运行书上的简单程序(猜数字小游戏)

(博主原创) 首先说明一下,博主是大一上学期结束寒假时自己看的Java,然后我看的是Head First Java的中文版,因为大一学了c,所以里面的一些基本思想还是了解的,在看这本书时就浏览了一下(就是那种光看没有自己动手去敲代码的),然后看到书上的一个猜数字小游戏,就想手动敲一下,熟悉熟悉Java的语法,但是真正去做时,发现比看起来要困难一些. 首先是Java在建立一个源码文件之前要先建一个package,然后我用的Eclipse写的Java(感觉和pycharm风格差不多),再新建一个文件

数据结构--书上代码用栈求解迷宫问题存在BUG(非最优解)

数据结构第四版p79页迷宫问题我觉得存在BUG,下图盗用贺老师就会的QAQ,也希望贺老师能看到帮忙解答一下啦. BUG:  程序从起始点(1,1)开始寻找路径,在当前点进行判断其上下左右是否存在可走点,如果从(1,1)点开始判断如图那么它的右(1,2)下(2,1)都是可走点那么将右边的格子坐标进栈呢还是将下边的格子坐标进栈?书本上给的代码是先判断上边格子再判断右边格子再判断下边格子再判断左边格子,这就造成了一个问题:(1,2)则个点会被进栈(因为(1,2)点位于(1,1)点的右边被先判断进栈),

用zrender实现工作流图形化设计(附范例代码)

公司研发的管理系统有工作流图形化设计和查看功能,这个功能的开发历史比较久远.在那个暗无天日的年月里,IE几乎一统江湖,所以顺理成章地采用了当时红极一时的VML技术. 后来的事情大家都知道了,IE开始走下坡路,VML这个技术现在早已灭绝,导致原来的工作流图形化功能完全不能使用,所以需要采用新技术来重写工作流图形化功能. 多方对比之后,决定采用zrender库来实现(关于zrender库的介绍,请看http://ecomfe.github.io/zrender/),花了一天的时间,终于做出了一个大致

OK 开始实践书上的项目一:即使标记

OK 开始实践书上的项目一:及时标记 然而....又得往前面看啦! ----------------------我是分割线------------------------ 代码改变世界

#1 如何在 HTML页面上显示HTML代码

今天把数据库里面的文章内容输出到界面上,遇到了一个问题.文章内容没有全部书出来,在某个地方被阶段了,纠结了好久,后来发现问题. 问题出现在:“<meta charset="utf-8″>” 在数据库文章表里的 文章中有 “<meta charset="utf-8″>”   这个东西,然后查询出来到服务端.查询出来的结果是没问题的. 但是我用 response.write(item.Content);   // item.Content 是文章表里面内容字段. 输