? 使用Jacobi 迭代求泊松方程的数值解
● 首次使用 OpenACC 进行加速,使用动态数组,去掉了误差控制
1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <math.h> 4 #include <openacc.h> 5 6 #if defined(_WIN32) || defined(_WIN64) 7 #include <C:\Program Files (x86)\Windows Kits\10\Include\10.0.10150.0\ucrt\sys\timeb.h> 8 #define gettime(a) _ftime(a) 9 #define usec(t1,t2) ((((t2).time - (t1).time) * 1000 + (t2).millitm - (t1).millitm)) 10 typedef struct _timeb timestruct; 11 #else 12 #include <sys/time.h> 13 #define gettime(a) gettimeofday(a, NULL) 14 #define usec(t1,t2) (((t2).tv_sec - (t1).tv_sec) * 1000 + ((t2).tv_usec - (t1).tv_usec)/1000) 15 typedef struct timeval timestruct; 16 #endif 17 18 #define ROW 8191 19 #define COL 1023 20 21 inline float uval(float x, float y) 22 { 23 return x * x + y * y; 24 } 25 26 int main() 27 { 28 const float width = 2.0, height = 1.0, hx = height / ROW, hy = width / COL, hx2 = hx * hx, hy2 = hy * hy; 29 const float fij = -4.0f; 30 const float maxIter = 100, errtol = 0.0f; 31 const int COLp1 = COL + 1; 32 const float c1 = hx2 * hy2 * fij, c2 = 1.0f / (2.0 * (hx2 + hy2)); 33 float *restrict u0 = (float *)malloc(sizeof(float)*(ROW + 1) * COLp1); 34 float *restrict u1 = (float *)malloc(sizeof(float)*(ROW + 1) * COLp1); 35 36 // 初始化 37 int ix, jy; 38 for (ix = 0; ix <= ROW; ix++) 39 { 40 u0[ix * COLp1 + 0] = u1[ix * COLp1 + 0] = uval(ix * hx, 0.0f); 41 u0[ix * COLp1 + COL] = u1[ix * COLp1 + COL] = uval(ix * hx, COL * hy); 42 } 43 for (jy = 0; jy <= COL; jy++) 44 { 45 u0[jy] = u1[jy] = uval(0.0f, jy * hy); 46 u0[ROW * COLp1 + jy] = u1[ROW * COLp1 + jy] = uval(ROW * hx, jy * hy); 47 } 48 for (ix = 1; ix < ROW; ix++) 49 { 50 for (jy = 1; jy < COL; jy++) 51 u0[ix * COLp1 + jy] = 0.0f; 52 } 53 54 // 计算 55 timestruct t1, t2; 56 float /*uerr, temp,*/ *tempp; 57 acc_init(4); // 初始化设备,以便准确计时,4 代表 nVidia 设备 58 gettime(&t1); 59 60 for (int iter = 1; iter < maxIter; iter++) 61 { 62 #pragma acc kernels copy(u0[0:(ROW + 1) * COLp1], u1[0:(ROW + 1) * COLp1]) 63 //uerr = 0.0f; 64 #pragma acc loop independent 65 for (ix = 1; ix < ROW; ix++) 66 { 67 for (jy = 1; jy < COL; jy++) 68 { 69 u1[ix*COLp1 + jy] = c2 * 70 ( 71 hy2 * (u0[(ix - 1) * COLp1 + jy] + u0[(ix + 1) * COLp1 + jy]) + 72 hx2 * (u0[ix * COLp1 + (jy - 1)] + u0[ix * COLp1 + (jy + 1)]) + 73 c1 74 ); 75 //temp = fabs(u0[ix * COLp1 + jy] - u1[ix * COLp1 + jy]); 76 //uerr = max(temp, uerr); 77 } 78 } 79 //printf("\niter = %d, uerr = %e\n", iter, uerr); 80 //if (uerr < errtol) 81 // break; 82 tempp = u0, u0 = u1, u1 = tempp; 83 } 84 gettime(&t2); 85 acc_shutdown(4); // 关闭设备,以便准确计时 86 printf("\nElapsed time: %13ld ms.\n", usec(t1, t2)); 87 free(u0); 88 free(u1); 89 //getchar(); 90 return 0; 91 }
● 运行结果,在 WSL 上跑时间为 797 ms。使用了环境变量 PGI_ACC_TIME=1,输出运行时间情况
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -c99 -acc -Minfo main.c -o main_acc.exe main: 50, Memory zero idiom, loop replaced by call to __c_mzero4 60, Generating copy(u1[:COLp1*8192],u0[:COLp1*8192]) 65, Loop is parallelizable 67, Loop is parallelizable Accelerator kernel generated Generating Tesla code 65, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */ 67, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ 67, FMA (fused multiply-add) instruction(s) generated uval: 23, FMA (fused multiply-add) instruction(s) generated D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main line=67 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4 launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main line=67 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4 ... // 中间省略 97 行,都是一样的内容,相当于重复 kernel 100 次 launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main line=67 device=0 threadid=1 num_gangs=32768 num_workers=4 vector_length=32 grid=32x1024 block=32x4 Elapsed time: 2271 ms. PGI: "acc_shutdown" not detected, performance results might be incomplete. Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete. Accelerator Kernel Timing data D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c main NVIDIA devicenum=0 time(us): 1,019,022 60: data region reached 198 times 60: data copyin transfers: 396 device time(us): total=510,057 max=1,347 min=1,279 avg=1,288 82: data copyout transfers: 594 device time(us): total=508,965 max=1,415 min=2 avg=856 62: compute region reached 99 times 67: kernel launched 99 times grid: [32x1024] block: [32x4] elapsed time(us): total=49,000 max=1000 min=0 avg=494
● 中间曾出现报错 “PGC-S-0155-Cannot determine bounds for array u0 (main.c: 60)”,“PGC-S-0155-Cannot determine bounds for array u1 (main.c: 60)”,报错出现在改行循环上,而不是后面的导语上,尝试各种方法均无果,包括参考(https://stackoverflow.com/questions/38470116/openacc-alias-for-an-array-results-in-cannot-determine-bounds-for-array-erro)。后来先尝试把数组 c1 和 c2 改成静态数组(下面的代码),成功后换回动态数组版本,发现不再报错。原因不明,可能与 PGI 编译器、C99 标准(关键字 restrict)有关。
● 使用 Nvvp 分析,发现密密麻麻的都是内存拷贝,计算反而是瞬间完成:
● 使用静态数组,去掉了误差控制
1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <math.h> 4 #include <openacc.h> 5 6 #if defined(_WIN32) || defined(_WIN64) 7 #include <C:\Program Files (x86)\Windows Kits\10\Include\10.0.10150.0\ucrt\sys\timeb.h> 8 #define gettime(a) _ftime(a) 9 #define usec(t1,t2) ((((t2).time - (t1).time) * 1000 + (t2).millitm - (t1).millitm)) 10 typedef struct _timeb timestruct; 11 #else 12 #include <sys/time.h> 13 #define gettime(a) gettimeofday(a, NULL) 14 #define usec(t1,t2) (((t2).tv_sec - (t1).tv_sec) * 1000 + ((t2).tv_usec - (t1).tv_usec)/1000) 15 typedef struct timeval timestruct; 16 #endif 17 18 #define ROW 8191 19 #define COL 1023 20 21 inline float uval(float x, float y) 22 { 23 return x * x + y * y; 24 } 25 26 int main() 27 { 28 const float width = 2.0, height = 1.0, hx = height / ROW, hy = width / COL, hx2 = hx * hx, hy2 = hy * hy; 29 const float fij = -4.0f; 30 const float maxIter = 100, errtol = 0.0f; 31 const int COLp1 = COL + 1; 32 const float c1 = hx2 * hy2 * fij, c2 = 1.0f / (2.0 * (hx2 + hy2)); 33 float u0[(ROW + 1) * COLp1]; 34 float u1[(ROW + 1) * COLp1]; 35 36 // 初始化 37 int ix, jy; 38 for (ix = 0; ix <= ROW; ix++) 39 { 40 u0[ix * COLp1 + 0] = u1[ix * COLp1 + 0] = uval(ix * hx, 0.0f); 41 u0[ix * COLp1 + COL] = u1[ix * COLp1 + COL] = uval(ix * hx, COL * hy); 42 } 43 for (jy = 0; jy <= COL; jy++) 44 { 45 u0[jy] = u1[jy] = uval(0.0f, jy * hy); 46 u0[ROW * COLp1 + jy] = u1[ROW * COLp1 + jy] = uval(ROW * hx, jy * hy); 47 } 48 for (ix = 1; ix < ROW; ix++) 49 { 50 for (jy = 1; jy < COL; jy++) 51 u0[ix * COLp1 + jy] = 0.0f; 52 } 53 54 // 计算 55 timestruct t1, t2; 56 float /*uerr, temp,*/ *tempp; 57 acc_init(4); // 初始化设备,以便准确计时 58 gettime(&t1); 59 #pragma acc kernels copy(u0[0:(ROW + 1) * COLp1], u1[0:(ROW + 1) * COLp1]) 60 for (int iter = 1; iter < maxIter; iter++) 61 { 62 //uerr = 0.0f; 63 if (iter % 2) // 分 iter 奇偶性考虑写入目标 64 { 65 #pragma acc loop independent 66 for (ix = 1; ix < ROW; ix++) 67 { 68 for (jy = 1; jy < COL; jy++) 69 { 70 u1[ix*COLp1 + jy] = c2 * 71 ( 72 hy2 * (u0[(ix - 1) * COLp1 + jy] + u0[(ix + 1) * COLp1 + jy]) + 73 hx2 * (u0[ix * COLp1 + (jy - 1)] + u0[ix * COLp1 + (jy + 1)]) + 74 c1 75 ); 76 //temp = fabs(u0[ix * COLp1 + jy] - u1[ix * COLp1 + jy]); 77 //uerr = max(temp, uerr); 78 } 79 } 80 } 81 else 82 { 83 #pragma acc loop independent 84 for (ix = 1; ix < ROW; ix++) 85 { 86 for (jy = 1; jy < COL; jy++) 87 { 88 u0[ix*COLp1 + jy] = c2 * 89 ( 90 hy2 * (u1[(ix - 1) * COLp1 + jy] + u1[(ix + 1) * COLp1 + jy]) + 91 hx2 * (u1[ix * COLp1 + (jy - 1)] + u1[ix * COLp1 + (jy + 1)]) + 92 c1 93 ); 94 //temp = fabs(u0[ix * COLp1 + jy] - u1[ix * COLp1 + jy]); 95 //uerr = max(temp, uerr); 96 } 97 } 98 } 99 //printf("\niter = %d, uerr = %e\n", iter, uerr); 100 //if (uerr < errtol) 101 // break; 102 } 103 gettime(&t2); 104 acc_shutdown(4); // 关闭设备,以便准确计时 105 printf("\nElapsed time: %13ld ms.\n", usec(t1, t2)); 106 //getchar(); 107 return 0; 108 }
● 输出结果,在 WSL 上跑时间为 849 ms
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -c99 -acc -Minfo main.c -o main_acc.exe main: 50, Memory zero idiom, loop replaced by call to __c_mzero4 59, Generating copy(u1[:COLp1*8192],u0[:COLp1*8192]) 60, Loop carried dependence due to exposed use of u1[:COLp1*8192],u0[:COLp1*8192] prevents parallelization Accelerator kernel generated Generating Tesla code 60, #pragma acc loop seq 66, #pragma acc loop seq 68, #pragma acc loop vector(128) /* threadIdx.x */ 84, #pragma acc loop seq 86, #pragma acc loop vector(128) /* threadIdx.x */ 66, Loop is parallelizable 68, Loop is parallelizable FMA (fused multiply-add) instruction(s) generated 84, Loop is parallelizable 86, Loop is parallelizable FMA (fused multiply-add) instruction(s) generated uval: 23, FMA (fused multiply-add) instruction(s) generated D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc.exe launch CUDA kernel file=D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c function=main line=60 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=128 grid=1 block=128 Elapsed time: 1987 ms. PGI: "acc_shutdown" not detected, performance results might be incomplete. Please add the call "acc_shutdown(acc_device_nvidia)" to the end of your application to ensure that the performance results are complete. Accelerator Kernel Timing data D:\Code\OpenACC\OpenACCProject\OpenACCProject\main.c main NVIDIA devicenum=0 time(us): 10,461 59: compute region reached 1 time 60: kernel launched 1 time grid: [1] block: [128] elapsed time(us): total=3,950,000 max=3,950,000 min=3,950,000 avg=3,950,000 59: data region reached 2 times 59: data copyin transfers: 4 device time(us): total=5,133 max=1,287 min=1,282 avg=1,283 104: data copyout transfers: 6 device time(us): total=5,328 max=1,369 min=2 avg=888
● 使用 Nvvp 分析,发现内存拷贝只有首尾两次,中间全在 GPU 上计算:
原文地址:https://www.cnblogs.com/cuancuancuanhao/p/9416237.html
时间: 2024-11-08 15:03:14