OpenACC 书上的范例代码（Jacobi 迭代），part 2 / 憋错料

? 使用Jacobi 迭代求泊松方程的数值解

● 首次使用 OpenACC 进行加速，使用动态数组，去掉了误差控制

 1 #include <stdio.h>
 2 #include <stdlib.h>
 3 #include <math.h>
 4 #include <openacc.h>
 5
 6 #if defined(_WIN32) || defined(_WIN64)
 7     #include <C:\Program Files (x86)\Windows Kits\10\Include\10.0.10150.0\ucrt\sys\timeb.h>
 8     #define gettime(a)  _ftime(a)
 9     #define usec(t1,t2) ((((t2).time - (t1).time) * 1000 + (t2).millitm - (t1).millitm))
10     typedef struct _timeb timestruct;
11 #else
12     #include <sys/time.h>
13     #define gettime(a)  gettimeofday(a, NULL)
14     #define usec(t1,t2) (((t2).tv_sec - (t1).tv_sec) * 1000 + ((t2).tv_usec - (t1).tv_usec)/1000)
15     typedef struct timeval timestruct;
16 #endif
17
18 #define ROW 8191
19 #define COL 1023
20
21 inline float uval(float x, float y)
22 {
23     return x * x + y * y;
24 }
25
26 int main()
27 {
28     const float width = 2.0, height = 1.0, hx = height / ROW, hy = width / COL, hx2 = hx * hx, hy2 = hy * hy;
29     const float fij = -4.0f;
30     const float maxIter = 100, errtol = 0.0f;
31     const int COLp1 = COL + 1;
32     const float c1 = hx2 * hy2 * fij, c2 = 1.0f / (2.0 * (hx2 + hy2));
33     float *restrict u0 = (float *)malloc(sizeof(float)*(ROW + 1) * COLp1);
34     float *restrict u1 = (float *)malloc(sizeof(float)*(ROW + 1) * COLp1);
35
36     // 初始化
37     int ix, jy;
38     for (ix = 0; ix <= ROW; ix++)
39     {
40         u0[ix * COLp1 + 0] = u1[ix * COLp1 + 0] = uval(ix * hx, 0.0f);
41         u0[ix * COLp1 + COL] = u1[ix * COLp1 + COL] = uval(ix * hx, COL * hy);
42     }
43     for (jy = 0; jy <= COL; jy++)
44     {
45         u0[jy] = u1[jy] = uval(0.0f, jy * hy);
46         u0[ROW * COLp1 + jy] = u1[ROW * COLp1 + jy] = uval(ROW * hx, jy * hy);
47     }
48     for (ix = 1; ix < ROW; ix++)
49     {
50         for (jy = 1; jy < COL; jy++)
51             u0[ix * COLp1 + jy] = 0.0f;
52     }
53
54     // 计算
55     timestruct t1, t2;
56     float /*uerr, temp,*/ *tempp;
57     acc_init(4);        // 初始化设备，以便准确计时，4 代表 nVidia 设备
58     gettime(&t1);
59
60     for (int iter = 1; iter < maxIter; iter++)
61     {
62 #pragma acc kernels copy(u0[0:(ROW + 1) * COLp1], u1[0:(ROW + 1) * COLp1])
63         //uerr = 0.0f;
64 #pragma acc loop independent
65         for (ix = 1; ix < ROW; ix++)
66         {
67             for (jy = 1; jy < COL; jy++)
68             {
69                 u1[ix*COLp1 + jy] = c2 *
70                     (
71                         hy2 * (u0[(ix - 1) * COLp1 + jy] + u0[(ix + 1) * COLp1 + jy]) +
72                         hx2 * (u0[ix * COLp1 + (jy - 1)] + u0[ix * COLp1 + (jy + 1)]) +
73                         c1
74                         );
75                 //temp = fabs(u0[ix * COLp1 + jy] - u1[ix * COLp1 + jy]);
76                 //uerr = max(temp, uerr);
77             }
78         }
79         //printf("\niter = %d, uerr = %e\n", iter, uerr);
80         //if (uerr < errtol)
81         //    break;
82         tempp = u0, u0 = u1, u1 = tempp;
83     }
84     gettime(&t2);
85     acc_shutdown(4);    // 关闭设备，以便准确计时
86     printf("\nElapsed time: %13ld ms.\n", usec(t1, t2));
87     free(u0);
88     free(u1);
89     //getchar();
90     return 0;
91 }

● 运行结果，在 WSL 上跑时间为 797 ms。使用了环境变量 PGI_ACC_TIME=1，输出运行时间情况

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -c99 -acc -Minfo main.c -o main_acc.exe
main:
     50, Memory zero idiom, loop replaced by call to __c_mzero4
     60, Generating copy(u1[:COLp1*8192],u0[:COLp1*8192])
     65, Loop is parallelizable
     67, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         65, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
         67, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     67, FMA (fused multiply-add) instruction(s) generated
uval:
     23, FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=67 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=67 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4

... // 中间省略 97 行，都是一样的内容，相当于重复 kernel 100 次

launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=67 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4

Elapsed time:          2271 ms.

PGI: "acc_shutdown" not detected, performance results might be incomplete.
 Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.

Accelerator Kernel Timing data
D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
  main  NVIDIA  devicenum=0
    time(us): 1,019,022
    60: data region reached 198 times
        60: data copyin transfers: 396
             device time(us): total=510,057 max=1,347 min=1,279 avg=1,288
        82: data copyout transfers: 594
             device time(us): total=508,965 max=1,415 min=2 avg=856
    62: compute region reached 99 times
        67: kernel launched 99 times
            grid: [32x1024]  block: [32x4]
            elapsed time(us): total=49,000 max=1000 min=0 avg=494

● 中间曾出现报错 “PGC-S-0155-Cannot determine bounds for array u0 (main.c: 60)”，“PGC-S-0155-Cannot determine bounds for array u1 (main.c: 60)”，报错出现在改行循环上，而不是后面的导语上，尝试各种方法均无果，包括参考（https://stackoverflow.com/questions/38470116/openacc-alias-for-an-array-results-in-cannot-determine-bounds-for-array-erro）。后来先尝试把数组 c1 和 c2 改成静态数组（下面的代码），成功后换回动态数组版本，发现不再报错。原因不明，可能与 PGI 编译器、C99 标准（关键字 restrict）有关。

● 使用 Nvvp 分析，发现密密麻麻的都是内存拷贝，计算反而是瞬间完成：

● 使用静态数组，去掉了误差控制

  1 #include <stdio.h>
  2 #include <stdlib.h>
  3 #include <math.h>
  4 #include <openacc.h>
  5
  6 #if defined(_WIN32) || defined(_WIN64)
  7 #include <C:\Program Files (x86)\Windows Kits\10\Include\10.0.10150.0\ucrt\sys\timeb.h>
  8 #define gettime(a)  _ftime(a)
  9 #define usec(t1,t2) ((((t2).time - (t1).time) * 1000 + (t2).millitm - (t1).millitm))
 10 typedef struct _timeb timestruct;
 11 #else
 12 #include <sys/time.h>
 13 #define gettime(a)  gettimeofday(a, NULL)
 14 #define usec(t1,t2) (((t2).tv_sec - (t1).tv_sec) * 1000 + ((t2).tv_usec - (t1).tv_usec)/1000)
 15 typedef struct timeval timestruct;
 16 #endif
 17
 18 #define ROW 8191
 19 #define COL 1023
 20
 21 inline float uval(float x, float y)
 22 {
 23     return x * x + y * y;
 24 }
 25
 26 int main()
 27 {
 28     const float width = 2.0, height = 1.0, hx = height / ROW, hy = width / COL, hx2 = hx * hx, hy2 = hy * hy;
 29     const float fij = -4.0f;
 30     const float maxIter = 100, errtol = 0.0f;
 31     const int COLp1 = COL + 1;
 32     const float c1 = hx2 * hy2 * fij, c2 = 1.0f / (2.0 * (hx2 + hy2));
 33     float u0[(ROW + 1) * COLp1];
 34     float u1[(ROW + 1) * COLp1];
 35
 36     // 初始化
 37     int ix, jy;
 38     for (ix = 0; ix <= ROW; ix++)
 39     {
 40         u0[ix * COLp1 + 0] = u1[ix * COLp1 + 0] = uval(ix * hx, 0.0f);
 41         u0[ix * COLp1 + COL] = u1[ix * COLp1 + COL] = uval(ix * hx, COL * hy);
 42     }
 43     for (jy = 0; jy <= COL; jy++)
 44     {
 45         u0[jy] = u1[jy] = uval(0.0f, jy * hy);
 46         u0[ROW * COLp1 + jy] = u1[ROW * COLp1 + jy] = uval(ROW * hx, jy * hy);
 47     }
 48     for (ix = 1; ix < ROW; ix++)
 49     {
 50         for (jy = 1; jy < COL; jy++)
 51             u0[ix * COLp1 + jy] = 0.0f;
 52     }
 53
 54     // 计算
 55     timestruct t1, t2;
 56     float /*uerr, temp,*/ *tempp;
 57     acc_init(4);        // 初始化设备，以便准确计时
 58     gettime(&t1);
 59 #pragma acc kernels copy(u0[0:(ROW + 1) * COLp1], u1[0:(ROW + 1) * COLp1])
 60     for (int iter = 1; iter < maxIter; iter++)
 61     {
 62         //uerr = 0.0f;
 63         if (iter % 2)   // 分 iter 奇偶性考虑写入目标
 64         {
 65 #pragma acc loop independent
 66             for (ix = 1; ix < ROW; ix++)
 67             {
 68                 for (jy = 1; jy < COL; jy++)
 69                 {
 70                     u1[ix*COLp1 + jy] = c2 *
 71                         (
 72                             hy2 * (u0[(ix - 1) * COLp1 + jy] + u0[(ix + 1) * COLp1 + jy]) +
 73                             hx2 * (u0[ix * COLp1 + (jy - 1)] + u0[ix * COLp1 + (jy + 1)]) +
 74                             c1
 75                             );
 76                     //temp = fabs(u0[ix * COLp1 + jy] - u1[ix * COLp1 + jy]);
 77                     //uerr = max(temp, uerr);
 78                 }
 79             }
 80         }
 81         else
 82         {
 83 #pragma acc loop independent
 84             for (ix = 1; ix < ROW; ix++)
 85             {
 86                 for (jy = 1; jy < COL; jy++)
 87                 {
 88                     u0[ix*COLp1 + jy] = c2 *
 89                         (
 90                             hy2 * (u1[(ix - 1) * COLp1 + jy] + u1[(ix + 1) * COLp1 + jy]) +
 91                             hx2 * (u1[ix * COLp1 + (jy - 1)] + u1[ix * COLp1 + (jy + 1)]) +
 92                             c1
 93                             );
 94                     //temp = fabs(u0[ix * COLp1 + jy] - u1[ix * COLp1 + jy]);
 95                     //uerr = max(temp, uerr);
 96                 }
 97             }
 98         }
 99         //printf("\niter = %d, uerr = %e\n", iter, uerr);
100         //if (uerr < errtol)
101         //    break;
102     }
103     gettime(&t2);
104     acc_shutdown(4);    // 关闭设备，以便准确计时
105     printf("\nElapsed time: %13ld ms.\n", usec(t1, t2));
106     //getchar();
107     return 0;
108 }

● 输出结果，在 WSL 上跑时间为 849 ms

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -c99 -acc -Minfo main.c -o main_acc.exe
main:
     50, Memory zero idiom, loop replaced by call to __c_mzero4
     59, Generating copy(u1[:COLp1*8192],u0[:COLp1*8192])
     60, Loop carried dependence due to exposed use of u1[:COLp1*8192],u0[:COLp1*8192] prevents parallelization
         Accelerator kernel generated
         Generating Tesla code
         60, #pragma acc loop seq
         66, #pragma acc loop seq
         68, #pragma acc loop vector(128) /* threadIdx.x */
         84, #pragma acc loop seq
         86, #pragma acc loop vector(128) /* threadIdx.x */
     66, Loop is parallelizable
     68, Loop is parallelizable
         FMA (fused multiply-add) instruction(s) generated
     84, Loop is parallelizable
     86, Loop is parallelizable
         FMA (fused multiply-add) instruction(s) generated
uval:
     23, FMA (fused multiply-add) instruction(s) generated

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main line=60 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=128 grid=1 block=128

Elapsed time:          1987 ms.

PGI: "acc_shutdown" not detected, performance results might be incomplete.
 Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.

Accelerator Kernel Timing data
D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
  main  NVIDIA  devicenum=0
    time(us): 10,461
    59: compute region reached 1 time
        60: kernel launched 1 time
            grid: [1]  block: [128]
            elapsed time(us): total=3,950,000 max=3,950,000 min=3,950,000 avg=3,950,000
    59: data region reached 2 times
        59: data copyin transfers: 4
             device time(us): total=5,133 max=1,287 min=1,282 avg=1,283
        104: data copyout transfers: 6
             device time(us): total=5,328 max=1,369 min=2 avg=888

● 使用 Nvvp 分析，发现内存拷贝只有首尾两次，中间全在 GPU 上计算：

原文地址：https://www.cnblogs.com/cuancuancuanhao/p/9416237.html

时间： 2024-11-08 15:03:14

OpenACC 书上的范例代码（Jacobi 迭代），part 2

OpenACC 书上的范例代码（Jacobi 迭代），part 2的相关文章

OpenACC 书上的范例代码（Jacobi 迭代），part 3

数据结构书上的纵横图代码

uva 213 - Message Decoding (我觉得我的方法要比书上少很多代码，不保证好……)

C#高级编程（第9版） -C#5.0&.Net4.5.1 书上的示例代码下载链接

Java 初学第一弹--编译并运行书上的简单程序（猜数字小游戏）

数据结构--书上代码用栈求解迷宫问题存在BUG（非最优解）

用zrender实现工作流图形化设计（附范例代码）

OK 开始实践书上的项目一：即使标记

#1 如何在 HTML页面上显示HTML代码