2014.09.05
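CUDA implementation of training for a multi-hidden-layer BP (backpropagation) neural network: the per-layer matrix products go through cuBLAS, and the element-wise steps (sigmoid, error terms, MSE reduction) are hand-written kernels. Constants such as SAMPLE_NUM, SAMPLE_ALL, SAMPLE_TEST, BLOCK_NUM, THREAD_NUM, ITER_MAX, ONE_ROUND and ALFA come from MultiHide_BP.cuh.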

#include "MultiHide_BP.cuh"

void NN_TRAIN(float* Samples, float* Targets, int LayersNum, int* Layers, float** W, float** B, float* mse)
{
float* gpuSamples = nullptr;
float* gpuTargets = nullptr;
float* gpuMse = nullptr;
float* cpuMse = (float*)malloc(sizeof(float)*BLOCK_NUM);

CUDA_CALL(cudaMalloc((void**)&gpuSamples, sizeof(float)*SAMPLE_ALL*Layers[0]));
CUDA_CALL(cudaMalloc((void**)&gpuTargets, sizeof(float)*SAMPLE_ALL*Layers[LayersNum-1]));
CUDA_CALL(cudaMalloc((void**)&gpuMse, sizeof(float)*BLOCK_NUM));
CUDA_CALL(cudaMemcpy(gpuSamples, Samples, sizeof(float)*SAMPLE_ALL*Layers[0], cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(gpuTargets, Targets, sizeof(float)*SAMPLE_ALL*Layers[LayersNum-1], cudaMemcpyHostToDevice));

float* Ones = (float*)malloc(sizeof(float)*SAMPLE_NUM);
float* gpuOnes = nullptr;
for(int i=0; i<SAMPLE_NUM; ++i) Ones[i] = 1.0f; // memset writes bytes and cannot produce float 1.0f
CUDA_CALL(cudaMalloc((void**)&gpuOnes, sizeof(float)*SAMPLE_NUM));
CUDA_CALL(cudaMemcpy(gpuOnes, Ones, sizeof(float)*SAMPLE_NUM, cudaMemcpyHostToDevice));

// host-side arrays of device pointers: gpuW[i] etc. are dereferenced on the
// host (cudaMemcpy and cuBLAS calls), so the pointer tables themselves must
// live in host memory, not device memory
float** gpuW = (float**)malloc(sizeof(float*)*(LayersNum-1));
for(int i=0; i<LayersNum-1; ++i)
{
CUDA_CALL(cudaMalloc((void**)&gpuW[i], sizeof(float)*Layers[i]*Layers[i+1]));
CUDA_CALL(cudaMemcpy(gpuW[i], W[i], sizeof(float)*Layers[i]*Layers[i+1], cudaMemcpyHostToDevice));
}

float** gpuB = (float**)malloc(sizeof(float*)*(LayersNum-1));
for(int i=0; i<LayersNum-1; ++i)
{
CUDA_CALL(cudaMalloc((void**)&gpuB[i], sizeof(float)*Layers[i+1]));
CUDA_CALL(cudaMemcpy(gpuB[i], B[i], sizeof(float)*Layers[i+1], cudaMemcpyHostToDevice));
}

float** gpuErr = (float**)malloc(sizeof(float*)*(LayersNum-1));
for(int i=0; i<LayersNum-1; ++i)
{
CUDA_CALL(cudaMalloc((void**)&gpuErr[i], sizeof(float)*SAMPLE_NUM*Layers[i+1]));
}

float** gpuO = (float**)malloc(sizeof(float*)*LayersNum);
for(int i=0; i<LayersNum; ++i)
{
CUDA_CALL(cudaMalloc((void**)&gpuO[i], sizeof(float)*SAMPLE_NUM*Layers[i]));
}

int iter = 0;
while(iter<ITER_MAX)
{
printf("iter = %d:\n",iter);
int iter_1round = 0;
while(iter_1round<ONE_ROUND)
{

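// one mini-batch: forward pass, accumulate the batch MSE, then backpropagate and update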
NNFF(gpuW, gpuB, gpuO, LayersNum, Layers, gpuSamples + iter_1round*Layers[0]*SAMPLE_NUM);
MSE(mse, cpuMse, gpuMse, gpuTargets + iter_1round*Layers[LayersNum-1]*SAMPLE_NUM, gpuO[LayersNum-1], Layers[LayersNum-1], iter, iter_1round);
NNBP(gpuW, gpuB, gpuErr, gpuO, gpuTargets + iter_1round*Layers[LayersNum-1]*SAMPLE_NUM, gpuOnes, LayersNum, Layers, iter);

iter_1round++;
}
iter++;
}

for(int i=0; i<LayersNum-1; ++i)
{
cudaMemcpy(W[i],gpuW[i],sizeof(float)*Layers[i]*Layers[i+1],cudaMemcpyDeviceToHost);
cudaMemcpy(B[i],gpuB[i],sizeof(float)*Layers[i+1],cudaMemcpyDeviceToHost);
}

free(cpuMse);
free(Ones);
cudaFree(gpuMse);
cudaFree(gpuOnes);

for(int i=0; i<LayersNum-1; ++i)
{
cudaFree(gpuW[i]);
cudaFree(gpuB[i]);
cudaFree(gpuErr[i]);
cudaFree(gpuO[i]);
}
cudaFree(gpuO[LayersNum-1]);
free(gpuW);
free(gpuB);
free(gpuErr);
free(gpuO);
cudaFree(gpuSamples);
cudaFree(gpuTargets);
}

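// BP_Test: runs one forward pass over the test set and sets Record[i] = 1 for
// every misclassified sample (predicted label = argmax of the output layer).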
void BP_Test(float* Samples, uchar* Targets, int LayersNum, int* Layers, float** W, float** B, int* Record)
{
float* gpuSamples = nullptr;
uchar* gpuTargets = nullptr;
int * gpuRecord = nullptr;

CUDA_CALL(cudaMalloc((void**)&gpuSamples, sizeof(float)*SAMPLE_TEST*Layers[0]));
CUDA_CALL(cudaMalloc((void**)&gpuTargets, sizeof(uchar)*SAMPLE_TEST));
CUDA_CALL(cudaMalloc((void**)&gpuRecord,sizeof(int)*SAMPLE_TEST));
CUDA_CALL(cudaMemcpy(gpuSamples, Samples, sizeof(float)*SAMPLE_TEST*Layers[0], cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(gpuTargets, Targets, sizeof(uchar)*SAMPLE_TEST, cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(gpuRecord, Record, sizeof(int)*SAMPLE_TEST, cudaMemcpyHostToDevice));

float** gpuW = (float**)malloc(sizeof(float*)*(LayersNum-1));
for(int i=0; i<LayersNum-1; ++i)
{
CUDA_CALL(cudaMalloc((void**)&gpuW[i], sizeof(float)*Layers[i]*Layers[i+1]));
CUDA_CALL(cudaMemcpy(gpuW[i], W[i], sizeof(float)*Layers[i]*Layers[i+1], cudaMemcpyHostToDevice));
}

float** gpuB = (float**)malloc(sizeof(float*)*(LayersNum-1));
for(int i=0; i<LayersNum-1; ++i)
{
CUDA_CALL(cudaMalloc((void**)&gpuB[i], sizeof(float)*Layers[i+1]));
CUDA_CALL(cudaMemcpy(gpuB[i], B[i], sizeof(float)*Layers[i+1], cudaMemcpyHostToDevice));
}

// NNFF processes SAMPLE_NUM samples per call, so SAMPLE_TEST is assumed to
// equal SAMPLE_NUM here
float** gpuO = (float**)malloc(sizeof(float*)*LayersNum);
for(int i=0; i<LayersNum; ++i)
{
CUDA_CALL(cudaMalloc((void**)&gpuO[i], sizeof(float)*SAMPLE_NUM*Layers[i]));
}

/***************************/

NNFF(gpuW, gpuB, gpuO, LayersNum, Layers, gpuSamples);
GetResult<<<BLOCK_NUM,THREAD_NUM>>>(gpuO[LayersNum-1], gpuTargets, Layers[LayersNum-1], gpuRecord);
CUDA_CALL(cudaMemcpy(Record, gpuRecord, sizeof(int)*SAMPLE_TEST, cudaMemcpyDeviceToHost));

/***************************/
for(int i=0; i<LayersNum-1; ++i)
{
cudaFree(gpuW[i]);
cudaFree(gpuB[i]);
cudaFree(gpuO[i]);
}
cudaFree(gpuO[LayersNum-1]);
free(gpuW);
free(gpuB);
free(gpuO);
cudaFree(gpuSamples);
cudaFree(gpuTargets);
cudaFree(gpuRecord);
}

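// NNFF: forward pass for one batch; gpuO[i] holds the activations of layer i
// as a column-major Layers[i] x SAMPLE_NUM matrix (one column per sample).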
void NNFF(float** gpuW, float** gpuB, float** gpuO, int LayersNum, int* Layers, float* gpuSampleSub)
{
const float alpha_Nor = 1.0f;
const float beta_Nor = 0.0f;

CUDA_CALL(cudaMemcpy(gpuO[0], gpuSampleSub, sizeof(float)*SAMPLE_NUM*Layers[0], cudaMemcpyDeviceToDevice));

cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

for(int i=1; i<LayersNum; ++i)
{
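// column-major GEMM: O_i (Layers[i] x SAMPLE_NUM) = W_{i-1} (Layers[i] x Layers[i-1]) * O_{i-1};
// SIGMOD then adds the bias and applies the sigmoid element-wise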
ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, Layers[i], SAMPLE_NUM, Layers[i-1], &alpha_Nor, gpuW[i-1], Layers[i], gpuO[i-1], Layers[i-1], &beta_Nor, gpuO[i], Layers[i]);
SIGMOD<<<BLOCK_NUM,THREAD_NUM>>>(gpuO[i], gpuB[i-1], Layers[i], SAMPLE_NUM);
}

ret = cublasDestroy(handle);
if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasDestroy returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

}

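// MSE: reduces 0.5*(target-output)^2 over the batch on the GPU (one partial
// sum per block), finishes the sum on the CPU and stores the per-batch mean.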
void MSE(float* mse, float* cpuMse, float* gpuMse, float* gpuTargetSub, float* gpuOLast, const int OutNum, const int iter, const int iter_1round)
{

MSE_Kernel<<<BLOCK_NUM,THREAD_NUM>>>(gpuMse, gpuTargetSub, gpuOLast, OutNum);
CUDA_CALL(cudaMemcpy(cpuMse,gpuMse,sizeof(float)*BLOCK_NUM,cudaMemcpyDeviceToHost));
for(int m =0; m<BLOCK_NUM;++m)
{
mse[iter*ONE_ROUND + iter_1round] += cpuMse[m];
}
mse[iter*ONE_ROUND + iter_1round] /= SAMPLE_NUM;

}

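// NNBP: backpropagates the batch error and applies the gradient updates in
// place; the effective learning rate is ALFA/SAMPLE_NUM, decayed late in training.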
void NNBP(float** gpuW, float** gpuB, float** gpuErr, float** gpuO, float* gpuTargetSub, float* gpuOnes, int LayersNum, int* Layers, const int iter)
{
static float alpha = ALFA/SAMPLE_NUM; // static: the decay below compounds across batches
const float beta = 1.0f;
const float alpha_Nor = 1.0f;
const float beta_Nor = 0.0f;

ErroLastlayer<<<BLOCK_NUM,THREAD_NUM>>>(gpuErr[LayersNum-2], gpuTargetSub, gpuO[LayersNum-1], Layers[LayersNum-1]);

if(iter>THREAD) // decay the learning rate once the epoch count passes the THREAD constant from the header
{
alpha *= 0.92f;
}

cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

// compute the error terms of the hidden layers, walking backwards through the network
for(int i= LayersNum-3; i>=0; i--)
{
// W[i+1] is stored Layers[i+2] x Layers[i+1], so its leading dimension under CUBLAS_OP_T is Layers[i+2]
ret = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, Layers[i+1], SAMPLE_NUM, Layers[i+2], &alpha_Nor, gpuW[i+1], Layers[i+2], gpuErr[i+1], Layers[i+2], &beta_Nor, gpuErr[i], Layers[i+1]);
Erro<<<BLOCK_NUM,THREAD_NUM>>>(gpuErr[i], gpuO[i+1], Layers[i+1]);
}

// apply the weight and bias updates
for(int i=0; i<LayersNum-1;++i)
{
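// gpuB[i] += alpha * Err_i * ones   (row sums of the deltas: the bias gradient)
// gpuW[i] += alpha * Err_i * O_i^T  (accumulated over the batch via beta = 1)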
ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, Layers[i+1], 1, SAMPLE_NUM, &alpha, gpuErr[i], Layers[i+1], gpuOnes, SAMPLE_NUM, &beta, gpuB[i], Layers[i+1]);
ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, Layers[i+1],Layers[i],SAMPLE_NUM, &alpha,gpuErr[i], Layers[i+1],gpuO[i], Layers[i], &beta, gpuW[i], Layers[i+1]);
}

ret = cublasDestroy(handle);
if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasDestroy returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

}

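// SIGMOD: grid-stride loop applying the logistic function with a per-node bias
// to a NodeNum x SampleNum activation matrix stored column-major.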
__global__ void SIGMOD(float* IO, float* Bias, const int NodeNum, const int SampleNum)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
for(int i = idx; i < NodeNum*SampleNum; i += BLOCK_NUM*THREAD_NUM)
{
int row = i % NodeNum; // node index within the layer selects the bias entry
IO[i] = 1.0f/(1.0f + expf(-IO[i] - Bias[row]));
}
}

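// MSE_Kernel: each thread accumulates 0.5*(t-o)^2 over a strided range, then a
// shared-memory tree reduction leaves one partial sum per block in gpuMse.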
__global__ void MSE_Kernel(float* gpuMse, float* targets, float* output, const int OutNum)
{
const size_t thID = threadIdx.x;
const size_t bloID = blockIdx.x;
__shared__ float sharedData[THREAD_NUM];
sharedData[thID] = 0;

for(size_t i = bloID*THREAD_NUM + thID; i < SAMPLE_NUM*OutNum; i += BLOCK_NUM*THREAD_NUM)
{
sharedData[thID] += 0.5f*(targets[i]-output[i])*(targets[i]-output[i]);
}
__syncthreads();

// tree reduction in shared memory (THREAD_NUM must be a power of two)
for(size_t s = THREAD_NUM/2; s > 0; s >>= 1)
{
if(thID < s) sharedData[thID] += sharedData[thID + s];
__syncthreads();
}

if(thID == 0) // thread 0 writes the block's partial sum
{
gpuMse[bloID] = sharedData[0];
}
}

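// ErroLastlayer: output-layer delta = (target - output) * sigmoid'(output),
// with sigmoid'(o) = o*(1-o).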
__global__ void ErroLastlayer(float*gpuErrLast, float* gpuTargetsSub, float* gpuOLast, const int NodeNum)
{
size_t idx = blockIdx.x*THREAD_NUM + threadIdx.x;

for(size_t i = idx; i < NodeNum*SAMPLE_NUM; i += BLOCK_NUM*THREAD_NUM)
{
gpuErrLast[i] = (gpuTargetsSub[i]-gpuOLast[i])*gpuOLast[i]*(1.0f-gpuOLast[i]);
}
}

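// Erro: multiplies the backpropagated error by the sigmoid derivative o*(1-o).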
__global__ void Erro(float* Err, float* O, const int NodeNum)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
for(int i = idx; i < NodeNum*SAMPLE_NUM; i += BLOCK_NUM*THREAD_NUM)
{
Err[i] = Err[i]*O[i]*(1.0f-O[i]);
}
}

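// GetResult: argmax over the OutNum outputs of each test sample; flags a
// mismatch against the uchar target label in Record.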
__global__ void GetResult(float* OLast, uchar* Targets, const int OutNum, int * Record)
{
size_t idx = blockIdx.x*THREAD_NUM + threadIdx.x;

for(int i = idx; i < SAMPLE_TEST; i = i+THREAD_NUM*BLOCK_NUM )
{
float value = OLast[OutNum*i]; // use OutNum, not a hard-coded 10, to index sample i's outputs
uchar label = 0;

for(int j=1; j<OutNum; ++j)
{
if(OLast[OutNum*i + j] > value)
{
value = OLast[OutNum*i + j];
label = (uchar)j;
}
}

if(Targets[i] != label)
{
Record[i] = 1;
}
}
}
