4.2 Reduction Optimization

First, here is the code from the Reduction lab of the Heterogeneous Parallel Programming class:

myReduction.c

// MP Reduction
// Given a list (lst) of length n
// Output its sum = lst[0] + lst[1] + ... + lst[n-1];

#include <wb.h>

#define BLOCK_SIZE 512 //@@ You can change this

#define wbCheck(stmt) do {                                                 \
        cudaError_t err = stmt;                                            \
        if (err != cudaSuccess) {                                          \
            wbLog(ERROR, "Failed to run stmt ", #stmt);                    \
            wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err)); \
            return -1;                                                     \
        }                                                                  \
    } while(0)

__global__ void reduction(float *g_idata, float *g_odata, unsigned int n){

    __shared__ float sdata[BLOCK_SIZE];

    // load shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;

    sdata[tid] = (i < n) ? g_idata[i] : 0;

    __syncthreads();

    // do reduction in shared memory, halving the stride each iteration
    for (unsigned int s=blockDim.x/2; s>0; s>>=1)
    {
        //__syncthreads();
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }

        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];

}

__global__ void total(float * input, float * output, int len) {
    //@@ Load a segment of the input vector into shared memory
    __shared__ float partialSum[2 * BLOCK_SIZE];  // blockDim.x cannot be used here: a statically declared shared array needs a compile-time constant size
    unsigned int t = threadIdx.x;
    unsigned int start = 2 * blockIdx.x * blockDim.x;
    if (start + t < len)
       partialSum[t] = input[start + t];
    else
       partialSum[t] = 0;

    if (start + blockDim.x + t < len)
       partialSum[blockDim.x + t] = input[start + blockDim.x + t];
    else
       partialSum[blockDim.x + t] = 0;

    //@@ Traverse the reduction tree
    for (unsigned int stride = blockDim.x; stride >= 1; stride >>= 1) {
       __syncthreads();
       if (t < stride)
          partialSum[t] += partialSum[t+stride];
    }
    //@@ Write the computed sum of the block to the output vector at the
    //@@ correct index
    if (t == 0)
       output[blockIdx.x] = partialSum[0];
}

int main(int argc, char ** argv) {
    int ii;
    wbArg_t args;
    float * hostInput; // The input 1D list
    float * hostOutput; // The output list
    float * deviceInput;
    float * deviceOutput;
    int numInputElements; // number of elements in the input list
    int numOutputElements; // number of elements in the output list

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostInput = (float *) wbImport(wbArg_getInputFile(args, 0), &numInputElements);

    numOutputElements = numInputElements / (BLOCK_SIZE);
    if (numInputElements % (BLOCK_SIZE)) {
        numOutputElements++;
    }

    // Grid sizing for the 'total' kernel, which processes 2 * BLOCK_SIZE elements per block:
    /*numOutputElements = numInputElements / (BLOCK_SIZE << 1);
    if (numInputElements % (BLOCK_SIZE << 1)) {
        numOutputElements++;
    } */
    hostOutput = (float*) malloc(numOutputElements * sizeof(float));

    wbTime_stop(Generic, "Importing data and creating memory on host");

    wbLog(TRACE, "The number of input elements in the input is ", numInputElements);
    wbLog(TRACE, "The number of output elements in the input is ", numOutputElements);

    wbTime_start(GPU, "Allocating GPU memory.");
    //@@ Allocate GPU memory here
    cudaMalloc((void **) &deviceInput, numInputElements * sizeof(float));
    cudaMalloc((void **) &deviceOutput, numOutputElements * sizeof(float));

    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    //@@ Copy memory to the GPU here
    cudaMemcpy(deviceInput,
               hostInput,
               numInputElements * sizeof(float),
               cudaMemcpyHostToDevice);

    wbTime_stop(GPU, "Copying input memory to the GPU.");
    //@@ Initialize the grid and block dimensions here
    dim3 dimGrid(numOutputElements, 1, 1);
    dim3 dimBlock(BLOCK_SIZE, 1, 1);

    wbTime_start(Compute, "Performing CUDA computation");
    //@@ Launch the GPU Kernel here
    reduction<<<dimGrid,dimBlock>>>(deviceInput, deviceOutput, numInputElements);
    //total<<<dimGrid, dimBlock>>>(deviceInput, deviceOutput, numInputElements);
    cudaDeviceSynchronize();
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    //@@ Copy the GPU memory back to the CPU here
    cudaMemcpy(hostOutput, deviceOutput, sizeof(float) * numOutputElements, cudaMemcpyDeviceToHost);
    wbTime_stop(Copy, "Copying output memory to the CPU");

    /********************************************************************
     * Reduce output vector on the host
     * NOTE: One could also perform the reduction of the output vector
     * recursively and support any size input. For simplicity, we do not
     * require that for this lab.
     ********************************************************************/
    for (ii = 1; ii < numOutputElements; ii++) {
        hostOutput[0] += hostOutput[ii];
    }

    wbTime_start(GPU, "Freeing GPU Memory");
    //@@ Free the GPU memory here
    cudaFree(deviceInput);
    cudaFree(deviceOutput);

    wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostOutput, 1);

    free(hostInput);
    free(hostOutput);

    return 0;
}
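
The two kernels above are the standard shared-memory tree reductions from the lab: reduction handles one element per thread, while total halves the number of blocks by adding two elements per thread during the load. A common further optimization keeps that first add during the load but performs most of the tree with warp shuffle intrinsics, so the inner steps need neither shared-memory traffic nor __syncthreads(). The sketch below is not part of the lab code; the kernel name reduceShfl is illustrative, and it assumes CUDA 9 or later (for __shfl_down_sync) and that BLOCK_SIZE is a multiple of the 32-thread warp size.

__global__ void reduceShfl(float *g_idata, float *g_odata, unsigned int n) {
    __shared__ float warpSums[BLOCK_SIZE / 32];   // one partial sum per warp

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blockDim.x * 2) + tid;

    // first add during the global load: each block covers 2 * BLOCK_SIZE inputs
    float sum = (i < n) ? g_idata[i] : 0.0f;
    if (i + blockDim.x < n) sum += g_idata[i + blockDim.x];

    // intra-warp tree reduction in registers via shuffles
    for (int offset = 16; offset > 0; offset >>= 1)
        sum += __shfl_down_sync(0xffffffff, sum, offset);

    // lane 0 of each warp stores that warp's partial sum
    if ((tid & 31) == 0) warpSums[tid >> 5] = sum;
    __syncthreads();

    // the first warp reduces the per-warp partial sums
    if (tid < 32) {
        sum = (tid < blockDim.x / 32) ? warpSums[tid] : 0.0f;
        for (int offset = 16; offset > 0; offset >>= 1)
            sum += __shfl_down_sync(0xffffffff, sum, offset);
        if (tid == 0) g_odata[blockIdx.x] = sum;
    }
}

Launching it would use the same grid sizing as the commented-out variant for total, i.e. one block per 2 * BLOCK_SIZE input elements, with the host-side final pass over hostOutput unchanged.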
