OpenACC 书上的范例代码（Jacobi 迭代），part 3

? 使用Jacobi 迭代求泊松方程的数值解

● 使用 data 构件，强行要求 u0 仅拷入和拷出 GPU 各一次，u1 仅拷入GPU 一次

 1 #include <stdio.h>
 2 #include <stdlib.h>
 3 #include <math.h>
 4 #include <openacc.h>
 5
 6 #if defined(_WIN32) || defined(_WIN64)
 7 #include <C:\Program Files (x86)\Windows Kits\10\Include\10.0.10150.0\ucrt\sys\timeb.h>
 8 #define gettime(a)  _ftime(a)
 9 #define usec(t1,t2) ((((t2).time - (t1).time) * 1000 + (t2).millitm - (t1).millitm))
10 typedef struct _timeb timestruct;
11 #else
12 #include <sys/time.h>
13 #define gettime(a)  gettimeofday(a, NULL)
14 #define usec(t1,t2) (((t2).tv_sec - (t1).tv_sec) * 1000 + ((t2).tv_usec - (t1).tv_usec)/1000)
15 typedef struct timeval timestruct;
16 #endif
17
18 #define ROW 8191
19 #define COL 1023
20
21 inline float uval(float x, float y)
22 {
23     return x * x + y * y;
24 }
25
26 int main()
27 {
28     const float width = 2.0, height = 1.0, hx = height / ROW, hy = width / COL, hx2 = hx * hx, hy2 = hy * hy;
29     const float fij = -4.0f;
30     const float maxIter = 100, errtol = 0.0f;
31     const int COLp1 = COL + 1;
32     const float c1 = hx2 * hy2 * fij, c2 = 1.0f / (2.0 * (hx2 + hy2));
33     float *restrict u0 = (float *)malloc(sizeof(float)*(ROW + 1) * COLp1);
34     float *restrict u1 = (float *)malloc(sizeof(float)*(ROW + 1) * COLp1);
35
36     // 初始化
37     int ix, jy;
38     for (ix = 0; ix <= ROW; ix++)
39     {
40         u0[ix * COLp1 + 0] = u1[ix * COLp1 + 0] = uval(ix * hx, 0.0f);
41         u0[ix * COLp1 + COL] = u1[ix * COLp1 + COL] = uval(ix * hx, COL * hy);
42     }
43     for (jy = 0; jy <= COL; jy++)
44     {
45         u0[jy] = u1[jy] = uval(0.0f, jy * hy);
46         u0[ROW * COLp1 + jy] = u1[ROW * COLp1 + jy] = uval(ROW * hx, jy * hy);
47     }
48     for (ix = 1; ix < ROW; ix++)
49     {
50         for (jy = 1; jy < COL; jy++)
51             u0[ix * COLp1 + jy] = 0.0f;
52     }
53
54     // 计算
55     timestruct t1, t2;
56     float /*uerr, temp,*/ *tempp;
57     acc_init(4);
58     gettime(&t1);
59 #pragma acc data copy(u0[0:(ROW + 1) * COLp1]) copyin(u1[0:(ROW + 1) * COLp1])      // 在 迭代的外侧添加 data 构件，跨迭代（内核）构造数据空间
60     {
61         for (int iter = 1; iter < maxIter; iter++)
62         {
63 #pragma acc kernels present(u0[0:((ROW + 1) * COLp1)], u1[0:((ROW + 1) * COLp1)])   // 每次调用内核时声明 u0 和 u1 已经存在，不要再拷贝
64             {
65                 //uerr = 0.0f;
66 #pragma acc loop independent
67                 for (ix = 1; ix < ROW; ix++)
68                 {
69                     for (jy = 1; jy < COL; jy++)
70                     {
71                         u1[ix*COLp1 + jy] = c2 *
72                             (
73                                 hy2 * (u0[(ix - 1) * COLp1 + jy] + u0[(ix + 1) * COLp1 + jy]) +
74                                 hx2 * (u0[ix * COLp1 + (jy - 1)] + u0[ix * COLp1 + (jy + 1)]) +
75                                 c1
76                                 );
77                         //temp = fabs(u0[ix * COLp1 + jy] - u1[ix * COLp1 + jy]);
78                         //uerr = max(temp, uerr);
79                     }
80                 }
81             }
82             //printf("\niter = %d, uerr = %e\n", iter, uerr);
83             //if (uerr < errtol)
84             //    break;
85             tempp = u0, u0 = u1, u1 = tempp;    // 代码块层数比较多，注意摆放位置时每次迭代的末尾
86         }
87     }
88     gettime(&t2);
89     acc_shutdown(4);
90     printf("\nElapsed time: %13ld ms.\n", usec(t1, t2));
91     free(u0);
92     free(u1);
93     //getchar();
94     return 0;
95 }

● 输出结果，在 Windows 里运行结果如下，在 WSL 里运行时间为 959 ms

 1 D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc.exe
 2 main:
 3      50, Memory zero idiom, loop replaced by call to __c_mzero4
 4      59, Generating copy(u0[:COLp1*8192])
 5          Generating copyin(u1[:COLp1*8192])
 6      61, Generating present(u1[:COLp1*8192],u0[:COLp1*8192])
 7      67, Loop is parallelizable
 8      69, Loop is parallelizable
 9          Accelerator kernel generated
10          Generating Tesla code
11          67, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
12          69, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
13      69, FMA (fused multiply-add) instruction(s) generated
14 uval:
15      23, FMA (fused multiply-add) instruction(s) generated
16
17 D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
18 launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
19 line=69 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4
20 launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
21 line=69 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4
22
23 ... // 省略 97 行
24
25 launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
26 line=69 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4
27
28 Elapsed time:            78 ms.
29 PGI: "acc_shutdown" not detected, performance results might be incomplete.
30  Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.
31
32 Accelerator Kernel Timing data
33 D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
34   main  NVIDIA  devicenum=0
35     time(us): 7,704
36     59: data region reached 2 times
37         59: data copyin transfers: 4
38              device time(us): total=5,141 max=1,288 min=1,284 avg=1,285
39         88: data copyout transfers: 3
40              device time(us): total=2,563 max=1,278 min=7 avg=854
41     61: data region reached 198 times
42     63: compute region reached 99 times
43         69: kernel launched 99 times
44             grid: [32x1024]  block: [32x4]
45             elapsed time(us): total=48,000 max=1000 min=0 avg=484

● 在 Nvvp 中的结果，非常清晰了

● 将 tempp 放到了更里一层循环，产生如下的运行时错误，关于这个错误 715，参考https://stackoverflow.com/questions/41366915/openacc-create-data-while-running-inside-a-kernels，大意是关于内存泄露

 1 D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc main.c -o main_acc.exe -Minfo
 2 main:
 3       0, Accelerator kernel generated
 4          Generating Tesla code
 5      50, Memory zero idiom, loop replaced by call to __c_mzero4
 6      59, Generating copyin(u1[:COLp1*8192])
 7          Generating copy(u0[:COLp1*8192])
 8      67, Loop is parallelizable
 9      69, Loop is parallelizable
10          Accelerator kernel generated
11          Generating Tesla code
12          67, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
13          69, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
14      69, FMA (fused multiply-add) instruction(s) generated
15      80, Accelerator serial kernel generated
16 uval:
17      23, FMA (fused multiply-add) instruction(s) generated
18
19 D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
20 launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
21 line=69 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4
22 launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
23 line=80 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=1 grid=1 block=1
24 call to cuStreamSynchronize returned error 715: Illegal instruction
25
26 call to cuMemFreeHost returned error 715: Illegal instruction

● 另外，将 PGI_ACC_TIME 打开会强行分析运行时间，而不报运行时错误。如上错误的基础上，把 PGI_ACC_TIME 打开会获得如下的结果：

D:\Code\OpenACC\OpenACCProject\OpenACCProject>set PGI_ACC_TIME=1

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe
launch CUDA kernel  file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main
line=69 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4
PGI: "acc_shutdown" not detected, performance results might be incomplete.
 Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete.

Accelerator Kernel Timing data
D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c
  main  NVIDIA  devicenum=0
    time(us): 5,129
    59: data region reached 1 time
        59: data copyin transfers: 4
             device time(us): total=5,129 max=1,286 min=1,280 avg=1,282
    63: compute region reached 1 time
        69: kernel launched 1 time
            grid: [32x1024]  block: [32x4]
             device time(us): total=0 max=0 min=0 avg=0

原文地址：https://www.cnblogs.com/cuancuancuanhao/p/9417047.html

时间： 2024-11-01 23:52:00

OpenACC 书上的范例代码（Jacobi 迭代），part 3的相关文章

OpenACC 书上的范例代码（Jacobi 迭代），part 2

? 使用Jacobi 迭代求泊松方程的数值解 ● 首次使用 OpenACC 进行加速,使用动态数组,去掉了误差控制 1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <math.h> 4 #include <openacc.h> 5 6 #if defined(_WIN32) || defined(_WIN64) 7 #include <C:\Program Files (x86)\Window

数据结构书上的纵横图代码

#include <stdio.h> #include <stdlib.h> #define MAXSIZE 20 int main() { int m, magic[MAXSIZE][MAXSIZE] = {0}, i = 0,j, x, y; scanf("%d",&m); magic[i][m/2] = 1; x = 0;y =m/2; for(i = 2;i <= m*m ;i++)//我进行的优化及改进 { if(magic[(x-1+m

uva 213 - Message Decoding (我觉得我的方法要比书上少很多代码，不保证好……)

#include<stdio.h> #include<math.h> #include<string.h> char s[250]; char a[10][250]; int a1[4]; int a2[250]; char ch; int init(int len) { int tt=0; for(int i=1;i<=7;i++) { for(int j=0;j<(int)pow(2,i)-1;j++) { a[i][j]=s[tt++]; if(tt&

C#高级编程（第9版） -C#5.0&.Net4.5.1 书上的示例代码下载链接

http://www.wrox.com/WileyCDA/WroxTitle/Professional-C-5-0-and-NET-4-5-1.productCd-1118833031,descCd-DOWNLOAD.html http://www.cnblogs.com/zhouyinhui/archive/2007/11/08/952020.html //中文简易版 https://msdn.microsoft.com/en-us/library/ms788718.aspx 英文版本的

Java 初学第一弹--编译并运行书上的简单程序（猜数字小游戏）

(博主原创) 首先说明一下,博主是大一上学期结束寒假时自己看的Java,然后我看的是Head First Java的中文版,因为大一学了c,所以里面的一些基本思想还是了解的,在看这本书时就浏览了一下(就是那种光看没有自己动手去敲代码的),然后看到书上的一个猜数字小游戏,就想手动敲一下,熟悉熟悉Java的语法,但是真正去做时,发现比看起来要困难一些. 首先是Java在建立一个源码文件之前要先建一个package,然后我用的Eclipse写的Java(感觉和pycharm风格差不多),再新建一个文件

数据结构--书上代码用栈求解迷宫问题存在BUG（非最优解）

数据结构第四版p79页迷宫问题我觉得存在BUG,下图盗用贺老师就会的QAQ,也希望贺老师能看到帮忙解答一下啦. BUG: 程序从起始点(1,1)开始寻找路径,在当前点进行判断其上下左右是否存在可走点,如果从(1,1)点开始判断如图那么它的右(1,2)下(2,1)都是可走点那么将右边的格子坐标进栈呢还是将下边的格子坐标进栈?书本上给的代码是先判断上边格子再判断右边格子再判断下边格子再判断左边格子,这就造成了一个问题:(1,2)则个点会被进栈(因为(1,2)点位于(1,1)点的右边被先判断进栈),

用zrender实现工作流图形化设计（附范例代码）

公司研发的管理系统有工作流图形化设计和查看功能,这个功能的开发历史比较久远.在那个暗无天日的年月里,IE几乎一统江湖,所以顺理成章地采用了当时红极一时的VML技术. 后来的事情大家都知道了,IE开始走下坡路,VML这个技术现在早已灭绝,导致原来的工作流图形化功能完全不能使用,所以需要采用新技术来重写工作流图形化功能. 多方对比之后,决定采用zrender库来实现(关于zrender库的介绍,请看http://ecomfe.github.io/zrender/),花了一天的时间,终于做出了一个大致

OK 开始实践书上的项目一：即使标记

OK 开始实践书上的项目一:及时标记然而....又得往前面看啦! ----------------------我是分割线------------------------ 代码改变世界

#1 如何在 HTML页面上显示HTML代码

今天把数据库里面的文章内容输出到界面上,遇到了一个问题.文章内容没有全部书出来,在某个地方被阶段了,纠结了好久,后来发现问题. 问题出现在:“<meta charset="utf-8″>” 在数据库文章表里的文章中有 “<meta charset="utf-8″>” 这个东西,然后查询出来到服务端.查询出来的结果是没问题的. 但是我用 response.write(item.Content); // item.Content 是文章表里面内容字段. 输