【CUDA并行编程之四】矩阵相乘

前面介绍了基本的Cuda编程的相关知识，那么这一篇在此基础之上来看看GPU在处理数据计算上的高效能，我们拿矩阵相乘来作为例子。

1.CPU上执行矩阵相乘以及性能。

在CPU上进行矩阵相乘运算的代码：

mat_mul.cc:

<span style="font-family:Microsoft YaHei;font-size:18px;">//a[i]*b[i] + c[i] = d[i]
#include<iostream>
#include<vector>
#include<map>
#include<fstream>
#include"wtime.h" 

using namespace std;

const int N = 320;

//矩阵有两种表达的方法用二维矩阵或者用一维矩阵表示
int a[N+1][N+1],b[N+1][N+1],c[N+1][N+1],d[N+1][N+1];
int aa[(N+1)*(N+1)],bb[(N+1)*(N+1)],cc[(N+1)*(N+1)],dd[(N+1)*(N+1)];

void init()
{
	for(int i=0;i<N;i++)
		for(int j=0;j<N;j++)
		{
			a[i][j] = 1;
			b[i][j] = 2;
			c[i][j] = 3;
		}
}

void init1()
{
	for(int i=0;i<N;i++)
		for(int j=0;j<N;j++)
		{
			aa[i*N+j] = 1;
			bb[i*N+j] = 2;
			cc[i*N+j] = 3;
		}
}

void mul()
{
	for(int i=0;i<N;i++)
	  for(int j=0;j<N;j++)
	  {
		for(int k=0;k<N;k++)
		{
			d[i][j] += a[i][k] * b[k][j];
		}
		d[i][j] += c[i][j];
	  }
}

void mul1()
{
	for(int i=0;i<N;i++)
	  for(int j=0;j<N;j++)
	  {
		for(int k=0;k<N;k++)
		{
			dd[i*N+j] += aa[i*N+k] * bb[k*N+j];
		}
		dd[N*i+j] += cc[N*i+j];
	  }
}

void print()
{
	ofstream fout;
	fout.open("result.txt");
	if(!fout)
	{
		perror("can not open the file");
	}
	for(int i=0;i<N;i++)
	{
	  for(int j=0;j<N;j++)
	  {
	  	  fout<<d[i][j]<<" ";
	  }
	  fout<<endl;
	}
	fout.close();
}

int main()
{
	init1();	

	double t = wtime();
	mul1();
	t = wtime()-t;
	printf("computation timing = %10.10f sec\n",t);

	//print();

	return 0;
}<strong>
</strong></span>

wtime.h:

<span style="font-family:Microsoft YaHei;font-size:18px;">#ifndef _WTIME_
#define _WTIME_

double wtime();

#endif
</span>

wtime.cc:

<span style="font-family:Microsoft YaHei;font-size:18px;">#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>

double wtime(void)
{
	double now_time;
	struct timeval etstart;
	struct timezone tzp;

	if(gettimeofday(&etstart,&tzp)==-1)
	{
		perror("Error:calling gettimeofday() not successfully.\n");
	}

	now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;

	return now_time;
}

#if 0
int main()
{
	double time;
	time = wtime();

	printf("time of day = %10.4f\n",time);

	return 0;
}
#endif
</span>

makefile:

<span style="font-family:Microsoft YaHei;font-size:18px;">target:
	g++ mat_mul.cc wtime.cc
	./a.out</span>

结果：

2.GPU上执行矩阵相乘以及性能。

代码：

cuda_mat_mul_v1.cu:

<span style="font-family:Microsoft YaHei;font-size:18px;">//matrix multiplication with global memory
#include<iostream>
#include<fstream>
#include "wtime.h"

using namespace std;

const int BLOCK_SIZE = 16;
const int GRID_SIZE = 20;

<strong>//D = A * B + C;
__global__ </strong>void mat_mul(int *da,int *db,int *dc,int *dd,int N)
{
	int row = blockIdx.y * blockDim.y + threadIdx.y;
	int col = blockIdx.x * blockDim.x + threadIdx.x;

	int sum = 0;
	for(int i=0;i<N;i++)
	{
		sum += da[row*N + i] * db[row*i+col];
	}
	dd[row*N + col] = sum + dc[row*N + col];
}

int main()
{
	int N = BLOCK_SIZE * GRID_SIZE;
	int *ha,*hb,*hc,*hd;
	int *da,*db,*dc,*dd;
	double time;
	ha = new int[N*N];
	hb = new int[N*N];
	hc = new int[N*N];
	hd = new int[N*N];
	cudaError_t err;

	//initialize
	for(int i=0;i<N;i++)
		for(int j=0;j<N;j++)
		{
			ha[i*N+j] = 1;
			hb[i*N+j] = 2;
			hc[i*N+j] = 3;
		}

	<strong>//malloc</strong>
	cudaMalloc(&da,N*N*sizeof(int));
	cudaMalloc(&db,N*N*sizeof(int));
	cudaMalloc(&dc,N*N*sizeof(int));
	err = cudaMalloc(&dd,N*N*sizeof(int));
	printf("Cuda Malloc C : %s\n",cudaGetErrorString(err));

	<strong>//host to device</strong>
	cudaMemcpy(da,ha,N*N*sizeof(int),cudaMemcpyHostToDevice);
	cudaMemcpy(db,hb,N*N*sizeof(int),cudaMemcpyHostToDevice);
	cudaMemcpy(dc,hc,N*N*sizeof(int),cudaMemcpyHostToDevice);
	cudaMemcpy(dd,hd,N*N*sizeof(int),cudaMemcpyHostToDevice);

	<strong>dim3 threadBlock(BLOCK_SIZE,BLOCK_SIZE);
	dim3 grid(GRID_SIZE,GRID_SIZE);
	//kernel</strong>
	time = wtime();
	mat_mul<<<grid,threadBlock>>>(da,db,dc,dd,N);
	printf("Computation time is %10.10f\n",wtime()-time);

	<strong>//device to host</strong>
	cudaMemcpy(hd,dd,N*N*sizeof(int),cudaMemcpyDeviceToHost);

	<strong>//print result to file</strong>
	ofstream fout;
	fout.open("result_v1.txt");
	if(!fout)
	{
		cerr<<"open the file error"<<endl;
		exit(-1);
	}
	for(int i=0;i<N;i++)
	{
		for(int j=0;j<N;j++)
		{
			fout<<hd[i*N+j]<<" ";
		}
		fout<<endl;
	}

	delete []ha;delete []hb;delete []hc;delete []hd;
	cudaFree(da);cudaFree(db);cudaFree(dc);cudaFree(dd);

	return 0;
}
</span>

cuda_wtime.cu:

<span style="font-family:Microsoft YaHei;font-size:18px;">#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>

double wtime(void)
{
	double now_time;
	struct timeval etstart;
	struct timezone tzp;

	if(gettimeofday(&etstart,&tzp)==-1)
	{
		perror("Error:calling gettimeofday() not successfully.\n");
	}

	now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;

	return now_time;
}

#if 0
int main()
{
	double time;
	time = wtime();

	printf("time of day = %10.4f\n",time);

	return 0;
}
#endif<strong>
</strong></span>

wtime.h:

<span style="font-family:Microsoft YaHei;font-size:18px;">#ifndef _WTIME_
#define _WTIME_

double wtime();

#endif
</span>

cuda_wtime.cu:

<span style="font-family:Microsoft YaHei;font-size:18px;">#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>

double wtime(void)
{
	double now_time;
	struct timeval etstart;
	struct timezone tzp;

	if(gettimeofday(&etstart,&tzp)==-1)
	{
		perror("Error:calling gettimeofday() not successfully.\n");
	}

	now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;

	return now_time;
}

#if 0
int main()
{
	double time;
	time = wtime();

	printf("time of day = %10.4f\n",time);

	return 0;
}
#endif
</span>

makefile:

<span style="font-family:Microsoft YaHei;font-size:18px;">cu:
	nvcc cuda_mat_mul_v1.cu cuda_wtime.cu
	./a.out</span>

结果：

3.计算性能对比：

矩阵大小	*16001600**	1200*1200	800*800	320*320
串行时间/s	30.9	11.49865	2.597987	0.162311
并行时间	grid=100/block=16	grid=75/block=16	grid=50/block=16	grid=20/block=16
kernel执行时间/s	0.0000319	0.0000309944	0.0000309944	0.0000231266
并行计算总时间（分配内存加+数据拷贝+计算）/s	0.70796	0.439213	0.310214	0.237676

可见，在矩阵规模大的时候非常明显的体现出了GPU强大的计算能力。

注明出处：http://blog.csdn.net/lavorange/article/details/41896591

时间： 2024-12-28 20:58:29

【CUDA并行编程之四】矩阵相乘的相关文章

【CUDA并行编程之三】Cuda矢量求和运算

本文将通过矢量求和运算来说明基本的Cuda并行编程的基本概念.所谓矢量求和运算,就是两个数组数据中对应的元素两两相加,并将结果保存在第三个数组中.如下图所示: 1.基于CPU的矢量求和: 代码非常简单: #include<iostream> using namespace std; const int N =10; void add( int *a ,int *b , int *c) { int tid = 0; while(tid < N) { c[tid] = a[tid] + b[

CUDA 并行编程简介

前言并行就是让计算中相同或不同阶段的各个处理同时进行.目前有很多种实现并行的手段,如多核处理器,分布式系统等.本专题的文章将主要介绍使用 GPU 实现并行的方法.参考本专题文章前请务必搭建好 CUDA 开发平台,搭建方法可以参考上一篇文章. GPU 并行的优缺点优点: 1. 显存具有更大的内存带宽 2. GPU 具有更大量的执行单元 3. 价格低廉缺点: 1. 对于不能高度并行化的工作,能带来帮助不大. 2. 对于绝大多数显卡型号,CUDA 仅支持 float 类型而不支持 double

【CUDA并行编程之七】数组元素之和

现在需要求得一个数组的所有元素之和,之前感觉似乎不太可能,因为每个线程只处理一个元素,无法将所有元素联系起来,但是最近学习了一段代码可以实现,同时也对shared memory有了进一步的理解. 一.C++串行实现串行实现的方法非常之简单,只要将所有元素依次相加就能够得到相应的结果,实际上我们注重的不是结果,而是运行的效率.那么代码如下: array_sum.cc: #include<iostream> #include<stdio.h> #include "kmean

【CUDA并行编程之五】计算向量的欧式距离

本文将介绍如何用cuda来计算两个向量之间的欧式距离,其中涉及到了如果将二维矩阵传入到核函数进行计算的问题,并且介绍两个内存分配和拷贝的API:cudaMallocPitch以及cudaMemcpy2D. 一.需求分析现在我们要解决这么一个问题:计算一个D维的向量A[D]到二维矩阵B[N][D]的每一行的欧式距离,并且将每一组距离保存在一个向量dis[N]中并返回.我们还是通过串行和并行两种方式来进行实现. 二.串行实现实现方法就是用一个二重循环进行相乘,然后将结果保存.上代码: dis_c

【CUDA并行编程之八】Cuda实现Kmeans算法

本文主要介绍如何使用CUDA并行计算框架编程实现机器学习中的Kmeans算法,Kmeans算法的详细介绍在这里,本文重点在并行实现的过程. 当然还是简单的回顾一下kmeans算法的串行过程: 伪代码: 创建k个点作为起始质心(经常是随机选择) 当任意一个点的簇分配结果发生改变时对数据集中的每个数据点对每个质心计算质心与数据点之间的距离将数据点分配到距其最近的簇对每一个簇,计算簇中所有点的均值并将均值作为质心我们可以观察到有两个部分可以并行优化: ①line03-04:将每个数据点到多

【Cuda并行编程之二】Cuda Memory Hierarchy_Cuda内存层次结构

要想编写高效的程序,那么一定要对内存结构有比较深刻的认识,就像C/C++里面的堆内存,栈内存,全局存储区,静态存储区,常量区等.Cuda是并行计算框架,而GPU的内存有限,那么如果想编写高效的Cuda程序,首先要对其内存结构有一个简单的认识. 首先我们先上一张图,然后通过解释一些名词和代码来进行解释. 各种存储器比较: 存储器位置拥有缓存访问权限变量生存周期 register GPU片内 N/A device可读/写与thread相同 local memory 板载显存无 devi

【CUDA并行编程之六】KNN算法的并行实现

之前写了两篇文章一个是KNN算法的C++串行实现,另一个是CUDA计算向量的欧氏距离.那么这篇文章就可以说是前两篇文章的一个简单的整合.在看这篇文章之前可以先阅读前两篇文章. 一.生成数据集现在需要生成一个N个D维的数据,没在一组数据都有一个类标,这个类标根据第一维的正负来进行标识样本数据的类标:Positive and Negative. #!/usr/bin/python import re import sys import random import os filename = "in

CUDA并行编程思维过程

1)确定应用程序中需要且可以并行化的部分 2)将并行化代码中需要用到的数据分离出来,具体方法是用API函数在并行技术设备上分配内存空间 3)用API函数将数据传输到并行计算设备上 4)在并行化部分开发一个kernel函数,该函数由其中个别线程执行 5)并行线程执行且启动kernel函数 6)最后调用API函数将数据传回主机处理器

cuda并行编程之求解ConjugateGradient(共轭梯度迭代)丢失dll解决方案

在进行图像处理过程中,我们经常会用到梯度迭代求解大型现在方程组:今天在对奇异矩阵进行求解的时候,出现了缺少dll的情况: 报错如下图: 缺少cusparse32_60.dll 缺失cublas32_60.dll 解决方案: (1)将cusparse32_60.dll和cublas32_60.dll直接拷贝到C:\Windows目录,但这样在一直的时候,还会出现同样错误,为了避免麻烦,最好采用方法(2) (2)将cusparse32_60.dll和cublas32_60.dll拷贝到你所在项目的文