C++ AMP Matrix Multiplication

Reference: https://msdn.microsoft.com/en-us/library/hh873134.aspx

The listing below walks through three versions of the same matrix product: a plain CPU triple loop, a C++ AMP version based on array_view, and a tiled C++ AMP version that stages sub-blocks in tile_static memory.

#include <iostream>
#include <cstdio>   // for getchar()
#include <amp.h>
using namespace concurrency;

// Plain CPU matrix multiplication
void MultiplyWithOutAMP()
{
	int aMatrix[3][2] = { { 1, 4 }, { 2, 5 }, { 3, 6 } };
	int bMatrix[2][3] = { { 7, 8, 9 }, { 10, 11, 12 } };
	int product[3][3] = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } };

	for (int row = 0; row < 3; row++)
	{
		for (int col = 0; col < 3; col++)
		{
			// Multiply the row of A by the column of B to get the row, column of product.
			for (int inner = 0; inner < 2; inner++)
			{
				product[row][col] += aMatrix[row][inner] * bMatrix[inner][col];
			}
			std::cout << product[row][col] << " ";
		}
		std::cout << "\n";
	}
}
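
// For the 3 x 2 and 2 x 3 matrices defined above, the expected product (printed by
// this CPU version and by the C++ AMP versions that follow) is:
//   47 52 57
//   64 71 78
//   81 90 99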

// Matrix multiplication using C++ AMP and array_view
void MultiplyWithAmp()
{
	int aMatrix[] = { 1, 2, 3, 4, 5, 6 };
	int bMatrix[] = { 7, 8, 9, 10, 11, 12 };
	int productMatrix[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };

	array_view<int, 2> a(3, 2, aMatrix);
	array_view<int, 2> b(2, 3, bMatrix);
	array_view<int, 2> product(3, 3, productMatrix);

	parallel_for_each(
		product.extent,
		[=](index<2> idx) restrict(amp)
		{
			int row = idx[0];
			int col = idx[1];
			for (int inner = 0; inner < 2; inner++)
			{
				product[idx] += a(row, inner) * b(inner, col);
			}
		}
	);

	// Copy the values of the product variable back to the productMatrix variable.
	product.synchronize();

	for (int row = 0; row < 3; row++)
	{
		for (int col = 0; col < 3; col++)
		{
			std::cout << productMatrix[row * 3 + col] << " ";
			//std::cout << product(row, col) << " ";
		}
		std::cout << "\n";
	}
}
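
// The next function is a minimal sketch (not part of the original MSDN sample) that
// performs the same 3 x 2 * 2 x 3 multiplication with concurrency::array instead of
// array_view: the inputs are copied to the accelerator when the arrays are constructed,
// and the result is copied back explicitly with concurrency::copy. The function name
// MultiplyWithAmpArray is illustrative.
void MultiplyWithAmpArray()
{
	int aMatrix[] = { 1, 2, 3, 4, 5, 6 };
	int bMatrix[] = { 7, 8, 9, 10, 11, 12 };
	int productMatrix[9] = { 0 };

	// Construct device-side arrays from the host data; product is left uninitialized
	// and is written exactly once per element below.
	array<int, 2> a(3, 2, aMatrix);
	array<int, 2> b(2, 3, bMatrix);
	array<int, 2> product(3, 3);

	// array objects must be captured by reference in the amp-restricted lambda.
	parallel_for_each(
		product.extent,
		[&a, &b, &product](index<2> idx) restrict(amp)
		{
			int row = idx[0];
			int col = idx[1];
			int sum = 0;
			for (int inner = 0; inner < 2; inner++)
			{
				sum += a(row, inner) * b(inner, col);
			}
			product[idx] = sum;
		}
	);

	// Unlike array_view, array does not synchronize implicitly; copy the result back.
	copy(product, productMatrix);

	for (int row = 0; row < 3; row++)
	{
		for (int col = 0; col < 3; col++)
		{
			std::cout << productMatrix[row * 3 + col] << " ";
		}
		std::cout << "\n";
	}
}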

// Tiled matrix multiplication using C++ AMP
void MultiplyWithTiling()
{
	// The tile size is 2.
	static const int TS = 2;

	// The raw data.
	int aMatrix[] = { 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 };
	int bMatrix[] = { 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 };
	int productMatrix[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

	// Create the array_view objects.
	array_view<int, 2> a(4, 4, aMatrix);
	array_view<int, 2> b(4, 4, bMatrix);
	array_view<int, 2> product(4, 4, productMatrix);

	// Call parallel_for_each by using 2 x 2 tiles.
	parallel_for_each(product.extent.tile< TS, TS>(),
		[=](tiled_index<TS, TS> t_idx) restrict(amp)
		{
			// Get the location of the thread relative to the tile (row, col) and the
			// entire array_view(rowGlobal, colGlobal).
			int row = t_idx.local[0];
			int col = t_idx.local[1];
			int rowGlobal = t_idx.global[0];
			int colGlobal = t_idx.global[1];
			int sum = 0;

			// Given a 4 x 4 matrix and a 2 x 2 tile size, this loop executes twice for each thread.
			// For the first tile and the first iteration, it copies a into locA and e into locB.
			// For the first tile and the second iteration, it copies b into locA and g into locB.
			for (int i = 0; i < 4; i += TS)
			{
				tile_static int locA[TS][TS];
				tile_static int locB[TS][TS];
				locA[row][col] = a(rowGlobal, col + i);
				locB[row][col] = b(row + i, colGlobal);
				// The threads in the tile all wait here until locA and locB are filled.
				t_idx.barrier.wait();

				// Accumulate the partial product for this thread. The sum is retained across
				// both iterations of the outer loop, in effect adding the two block products
				// together, for example, a * e + b * g.
				for (int k = 0; k < TS; k++)
				{
					sum += locA[row][k] * locB[k][col];
				}

				// All threads must wait until the sums are calculated. If any threads
				// moved ahead, the values in locA and locB would change.
				t_idx.barrier.wait();
			}

			// After both iterations of the loop, copy the sum to the product variable by
			// using the global location.
			product[t_idx.global] = sum;
		}
	);

	// Copy the contents of product back to the productMatrix variable.
	product.synchronize();

	for (int row = 0; row < 4; row++)
	{
		for (int col = 0; col < 4; col++)
		{
			// The results are available from both the product and productMatrix variables.
			//std::cout << productMatrix[row * 4 + col] << " ";
			std::cout << product(row, col) << " ";
		}
		std::cout << "\n";
	}
}
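
// For the 4 x 4 matrices defined in MultiplyWithTiling (both equal to the matrix with
// rows 1 2 3 4 / 5 6 7 8 / 1 2 3 4 / 5 6 7 8), the expected product is:
//    34  44  54  64
//    82 108 134 160
//    34  44  54  64
//    82 108 134 160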

int main()
{
	/*MultiplyWithOutAMP();
	MultiplyWithAmp();*/
	MultiplyWithTiling();

	getchar();
	return 0;
}
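
Assuming a Visual C++ toolchain with C++ AMP support (Visual Studio 2012 through 2019; the amp.h headers are deprecated starting with Visual Studio 2022), the whole listing should build as a single translation unit, for example:

cl /EHsc matrix_multiply.cpp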
