20140822_BP Neural Network (back-propagation)

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include "math.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cassert>
#include <time.h>
/* fallback so the listing also builds with pre-C++11 compilers that lack the nullptr keyword */
#ifndef nullptr
#define nullptr 0
#endif

#define BLOCK_NUM 6
#define THREAD_NUM 256

//#define SAMPLE_NUM 300
#define LAYERS 3
#define Num_In 30    // number of input nodes (the first weight layer is Num_In x Num_In)
#define Num_Hide 50  // number of hidden nodes
#define Num_Out 5    // number of output nodes

#define ALFA 0.85f   // learning rate
#define Beta 0       // momentum term (unused in this snippet)
#define ACCURACY 0.005f  // required network accuracy
#define ITER_MAX 500     // maximum number of training iterations

#define CUDA_CALL(x) {const cudaError_t a=(x); if(a!= cudaSuccess) {printf("\nCUDA ERROR:%s(err_num = %d)\n",cudaGetErrorString(a),a); cudaDeviceReset(); assert(0);}}

// Randomly initialise the three weight matrices with values in [0, 1).
void Init(float* weight0, const int w0_Num, float* weight1, const int w1_Num, float* weight2, const int w2_Num)
{
assert(weight0!=nullptr);
assert(weight1!=nullptr);
assert(weight2!=nullptr);

srand( (unsigned)time(NULL));

for(size_t i = 0; i<w0_Num; ++i)
{
weight0[i] = (float)(rand()%1000)/1000.0f;
}

for(size_t i = 0; i<w1_Num; ++i)
{
weight1[i] = (float)(rand()%1000)/1000.0f;
}

for(size_t i = 0; i<w2_Num; ++i)
{
weight2[i] = (float)(rand()%1000)/1000.0f;
}
}

// Forward declaration so main can invoke training (BP_Train is defined below).
void BP_Train( float* Samples, int Samples_Num, float* Targets,
float* weight0,float* weight1, float* weight2,
float* bias0,float* bias1,float* bias2);

int main()
{
float* Samples = nullptr;   // Samples_Num * Num_In input values (host)
int Samples_Num = 0;
float* Targets = nullptr;   // Samples_Num * Num_Out target values (host)
/*
read Data: fill Samples, Samples_Num and Targets here
(an illustrative dummy-data loader is sketched after main below)
*/
float* weight0 = (float*) malloc(sizeof(float)*Num_In*Num_In);
float* weight1 = (float*) malloc(sizeof(float)*Num_In*Num_Hide);
float* weight2 = (float*) malloc(sizeof(float)*Num_Hide*Num_Out);

float* bias0 = (float*)malloc(sizeof(float)*Num_In);
float* bias1 = (float*)malloc(sizeof(float)*Num_Hide);
float* bias2 = (float*)malloc(sizeof(float)*Num_Out);

memset( bias0,0,sizeof(float)*Num_In);
memset( bias1,0,sizeof(float)*Num_Hide);
memset( bias2,0,sizeof(float)*Num_Out);

Init(weight0, Num_In*Num_In , weight1, Num_In*Num_Hide, weight2, Num_Hide*Num_Out);

if(Samples != nullptr && Samples_Num > 0)
{
BP_Train(Samples, Samples_Num, Targets, weight0, weight1, weight2, bias0, bias1, bias2);
}

free(weight0);
free(weight1);
free(weight2);

free(bias0);
free(bias1);
free(bias2);

return 0;
}
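The read Data step in main is left empty in the original post. As a stand-in so the program can be run end to end, here is a minimal, purely illustrative helper (the name LoadDummyData and the sample count are assumptions, not part of the original) that fills Samples and Targets with random values in the layout the rest of the code expects: Samples_Num * Num_In inputs and Samples_Num * Num_Out targets, one sample per row.

// Illustrative helper only: allocate and fill dummy training data in the
// layout expected by BP_Train and MSE.
void LoadDummyData(float** Samples, int* Samples_Num, float** Targets)
{
    *Samples_Num = 300; // e.g. 300 samples (matches the commented-out SAMPLE_NUM)
    *Samples = (float*)malloc(sizeof(float) * (*Samples_Num) * Num_In);
    *Targets = (float*)malloc(sizeof(float) * (*Samples_Num) * Num_Out);

    for (int i = 0; i < (*Samples_Num) * Num_In; ++i)
        (*Samples)[i] = (float)(rand() % 1000) / 1000.0f; // inputs in [0, 1)

    for (int i = 0; i < (*Samples_Num) * Num_Out; ++i)
        (*Targets)[i] = (rand() % 2) ? 1.0f : 0.0f;       // binary targets
}

In main this would be called where the read Data comment sits (with a forward declaration, or with the helper moved above main), and the two buffers freed before main returns.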

// Train the network on Samples/Targets (host buffers). This listing only sets up
// and tears down the device copies; the epoch loop itself is missing (see the
// note before the cudaFree calls below).
void BP_Train( float* Samples, int Samples_Num, float* Targets,
float* weight0,float* weight1, float* weight2,
float* bias0,float* bias1,float* bias2)
{
float mse = FLT_MAX; // network error, initialised to "infinity" before training

float* gpuSamples = nullptr;
float* gpuTargets = nullptr;
CUDA_CALL(cudaMalloc((void**)&gpuSamples,sizeof(float)*Samples_Num*Num_In));
CUDA_CALL(cudaMalloc((void**)&gpuTargets,sizeof(float)*Samples_Num*Num_Out)); // Num_Out targets per sample

float* gpuWeight0 = nullptr;
float* gpuWeight1 = nullptr;
float* gpuWeight2 = nullptr;
CUDA_CALL(cudaMalloc((void**)&gpuWeight0,sizeof(float)*Num_In*Num_In));
CUDA_CALL(cudaMalloc((void**)&gpuWeight1,sizeof(float)*Num_In*Num_Hide));
CUDA_CALL(cudaMalloc((void**)&gpuWeight2,sizeof(float)*Num_Hide*Num_Out));

float* gpuBias0 = nullptr;
float* gpuBias1 = nullptr;
float* gpuBias2 = nullptr;
CUDA_CALL(cudaMalloc((void**)&gpuBias0,sizeof(float)*Num_In));
CUDA_CALL(cudaMalloc((void**)&gpuBias1,sizeof(float)*Num_Hide));
CUDA_CALL(cudaMalloc((void**)&gpuBias2,sizeof(float)*Num_Out));

float* gpuErr0 = nullptr;
float* gpuErr1 = nullptr;
float* gpuErr2 = nullptr;
CUDA_CALL(cudaMalloc((void**)&gpuErr0,sizeof(float)*Num_In));
CUDA_CALL(cudaMalloc((void**)&gpuErr1,sizeof(float)*Num_Hide));
CUDA_CALL(cudaMalloc((void**)&gpuErr2,sizeof(float)*Num_Out));

float* O0 = nullptr;
float* O1 = nullptr;
float* O2 = nullptr;
CUDA_CALL(cudaMalloc((void**)&O0,sizeof(float)*Num_In));
CUDA_CALL(cudaMalloc((void**)&O1,sizeof(float)*Num_Hide));
CUDA_CALL(cudaMalloc((void**)&O2,sizeof(float)*Num_Out));

CUDA_CALL(cudaMemcpy(gpuSamples, Samples, sizeof(float)*Samples_Num*Num_In, cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(gpuTargets, Targets, sizeof(float)*Samples_Num*Num_Out, cudaMemcpyHostToDevice));

CUDA_CALL(cudaMemcpy(gpuWeight0, weight0, sizeof(float)*Num_In*Num_In, cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(gpuWeight1, weight1, sizeof(float)*Num_In*Num_Hide, cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(gpuWeight2, weight2, sizeof(float)*Num_Hide*Num_Out, cudaMemcpyHostToDevice));

CUDA_CALL(cudaMemcpy(gpuBias0,bias0,sizeof(float)*Num_In,cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(gpuBias1,bias1,sizeof(float)*Num_Hide,cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(gpuBias2,bias2,sizeof(float)*Num_Out,cudaMemcpyHostToDevice));
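// NOTE: the original snippet ends here without the actual training loop
// (per-sample forward pass, BackPropagation launch, MSE check per epoch),
// and the updated weights/biases are never copied back to the host.
// An illustrative sketch of the missing loop is given after MSE() below.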

cudaFree(gpuSamples);
cudaFree(gpuTargets);

cudaFree(gpuWeight0);
cudaFree(gpuWeight1);
cudaFree(gpuWeight2);

cudaFree(gpuBias0);
cudaFree(gpuBias1);
cudaFree(gpuBias2);

cudaFree(gpuErr0);
cudaFree(gpuErr1);
cudaFree(gpuErr2);

cudaFree(O0);
cudaFree(O1);
cudaFree(O2);
}

// Element-wise sigmoid activation: IO[idx] = sigmoid(IO[idx] + Bias[idx]).
__global__ void SIGMOD(float* IO, float*Bias, const int NodeNum)
{
int idx = threadIdx.x;
if(idx < NodeNum) // NodeNum < THREAD_NUM, so a single block covers every node
{
IO[idx] = 1.0f/(1.0f+expf(-(IO[idx]+Bias[idx])));
}
}

// Forward-propagate one sample pSamples and compute the layer outputs O0, O1, O2.
// All pointer arguments are device pointers (cuBLAS and the SIGMOD kernel operate on device memory).
void FeedForward( float* O0,float* O1,float* O2, float* pSamples,
float* weight0, float* weight1, float* weight2,
float* bias0, float* bias1, float* bias2)
{
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

const float alpha = 1.0f;
const float beta = 0.0f;

//Input layer to first layer: add bias0 and apply the activation function to get O0
ret = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, Num_In, 1, Num_In, &alpha, weight0, Num_In, pSamples, 1, &beta, O0, Num_In);
//ret = cublasSaxpy(handle, Num_In,&alpha,bias0,1,O0,1);
SIGMOD<<<1,THREAD_NUM>>>(O0,bias0,Num_In);

//First layer to second (hidden) layer: add bias1 and apply the activation function to get O1
ret = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, Num_Hide, 1, Num_In, &alpha, weight1, Num_In, O0, Num_In, &beta, O1, Num_Hide);
//ret = cublasSaxpy(handle, Num_Hide,&alpha,bias1,1,O1,1);
SIGMOD<<<1,THREAD_NUM>>>(O1,bias1,Num_Hide);

//Second layer to third (output) layer: add bias2 and apply the activation function to get O2
ret = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, Num_Out, 1, Num_Hide, &alpha, weight2, Num_Hide, O1, Num_Hide, &beta, O2, Num_Out);
//ret = cublasSaxpy(handle, Num_Out,&alpha,bias2,1,O2,1);
SIGMOD<<<1,THREAD_NUM>>>(O2,bias2,Num_Out);

ret = cublasDestroy(handle);
if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasDestroy returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

}

// Back-propagate one sample: given the sample's input pSample, the forward-pass
// outputs O0/O1/O2 and the target values, update weight0..2 and bias0..2.
// Note: __syncthreads() only synchronises threads within one block, while the
// index ranges used below (up to Num_Hide*Num_In) require a multi-block launch,
// so a fully race-free version would split the error and weight updates into
// separate kernel launches.
__global__ void BackPropagation( float* pSample, float* O0,float* O1,float* O2, float* Targets,
float* weight0, float* weight1, float* weight2,
float* bias0, float* bias1, float* bias2,
float* err0,float* err1, float* err2)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;

//Update the output layer's error terms and biases
if(idx<Num_Out)
{
err2[idx] = (Targets[idx]- O2[idx])*O2[idx]*(1-O2[idx]);
bias2[idx] += ALFA*err2[idx];
}
__syncthreads(); // needed even though Num_Out < 32: the weight2 update below runs on threads outside warp 0

//Update the output layer's weights
if(idx< Num_Out*Num_Hide)
{
int r = (int) (idx/Num_Hide);
weight2[idx] += ALFA*err2[r]*O1[idx - r*Num_Hide];
}
__syncthreads();

//Update the hidden layer's error terms and biases
if(idx<Num_Hide)
{
err1[idx] = 0;
for(int i =0 ; i< Num_Out; i++)
{
err1[idx] += err2[i]*weight2[i*Num_Hide+idx];
}

err1[idx] *= O1[idx]*(1-O1[idx]); // multiply by the sigmoid derivative first
bias1[idx] += ALFA*err1[idx];     // then update the bias with the full error term
}
__syncthreads();

//Update the hidden layer's weights (weight1 maps O0 to the hidden layer)
if(idx < Num_Hide*Num_In)
{
int r = (int)(idx/Num_In);
weight1[idx] += ALFA*err1[r]*O0[idx - r*Num_In];
}
__syncthreads();

//Update the first layer's error terms and biases
if(idx<Num_In)
{
err0[idx] = 0;
for(int i=0; i<Num_Hide; ++i)
{
err0[idx] += err1[i]*weight1[i*Num_In+ idx];
}
err0[idx] *= O0[idx]*(1-O0[idx]); // multiply by the sigmoid derivative first
bias0[idx] += ALFA*err0[idx];     // then update the bias with the full error term
}
__syncthreads(); // needed: the weight0 update below reads err0 from threads outside warp 0

//Update the first layer's weights (weight0 maps the raw sample input to O0)
if(idx < Num_In*Num_In)
{
int r = (int)(idx/Num_In);
weight0[idx] += ALFA*err0[r]*pSample[idx - r*Num_In];
}
__syncthreads();
}
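For readers who want the update rules without the thread-index bookkeeping, the kernel above implements the standard stochastic back-propagation steps for a sigmoid network with a squared-error loss. The small helpers below (illustrative only, not used elsewhere in the listing) restate them in plain C:

// delta_k for an output node: (target - output) * sigmoid'(net), with the
// derivative expressed through the output as output * (1 - output).
inline float OutputDelta(float target, float output)
{
    return (target - output) * output * (1.0f - output);
}

// delta_j for a hidden node: the error fed back from the next layer
// (sum over k of delta_k * w_jk) times the same sigmoid derivative.
inline float HiddenDelta(float weightedErrSum, float output)
{
    return weightedErrSum * output * (1.0f - output);
}

// weight and bias steps: dw_ij = ALFA * delta_j * o_i, db_j = ALFA * delta_j.
inline float WeightStep(float delta, float input)
{
    return ALFA * delta * input;
}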

// Accumulate the squared error of one sample's output into *mse (a device scalar).
__global__ void Add_MSE(float* mse, float* O, float * Targets)
{
int idx = threadIdx.x;
if(idx<Num_Out)
{
float diff = O[idx]-Targets[idx];
atomicAdd(mse, diff*diff); // squared error, not sqrt of the difference
}
}

// Compute the network's mean squared error over all samples using the current
// model (weight0..2, bias0..2). All pointer arguments are device pointers.
float MSE( float* Samples,int Samples_Num, float* Targets,
float* weight0, float* weight1, float* weight2,
float* bias0, float* bias1, float* bias2)
{
float mse = 0.0f;

float* gpuMse = nullptr; // device-side accumulator filled by Add_MSE
float* O0 = nullptr;
float* O1 = nullptr;
float* O2 = nullptr;
CUDA_CALL(cudaMalloc((void**)&gpuMse,sizeof(float)));
CUDA_CALL(cudaMemset(gpuMse,0,sizeof(float)));
CUDA_CALL(cudaMalloc((void**)&O0,sizeof(float)*Num_In));
CUDA_CALL(cudaMalloc((void**)&O1,sizeof(float)*Num_Hide));
CUDA_CALL(cudaMalloc((void**)&O2,sizeof(float)*Num_Out));

cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

const float alpha = 1.0f;
const float beta = 0.0f;

for(int i = 0; i<Samples_Num; ++i)
{
ret = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, Num_In, 1, Num_In, &alpha, weight0, Num_In, Samples+ i*Num_In, 1, &beta, O0, Num_In);
SIGMOD<<<1,THREAD_NUM>>>(O0,bias0,Num_In);
ret = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, Num_Hide, 1, Num_In, &alpha, weight1, Num_In, O0, Num_In, &beta, O1, Num_Hide);
SIGMOD<<<1,THREAD_NUM>>>(O1,bias1,Num_Hide);
ret = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, Num_Out, 1, Num_Hide, &alpha, weight2, Num_Hide, O1, Num_Hide, &beta, O2, Num_Out);
SIGMOD<<<1,THREAD_NUM>>>(O2,bias2,Num_Out);

// accumulate this sample's squared error into the device-side sum
Add_MSE<<<1,THREAD_NUM>>>(gpuMse, O2, Targets + i*Num_Out);
}

ret = cublasDestroy(handle);
if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasDestroy returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

CUDA_CALL(cudaMemcpy(&mse, gpuMse, sizeof(float), cudaMemcpyDeviceToHost));

cudaFree(gpuMse);
cudaFree(O0);
cudaFree(O1);
cudaFree(O2);

return mse / Samples_Num; // mean squared error per sample
}
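BP_Train above sets up and tears down the device buffers, but the epoch loop itself never appears in the post. The sketch below shows one way it might be wired together, under the assumption that it sits between the cudaMemcpy calls and the cudaFree calls in BP_Train (the function name TrainEpochs is hypothetical): each epoch performs a forward pass and one BackPropagation launch per sample, then evaluates the network-wide MSE against ACCURACY.

// Illustrative sketch of the missing epoch loop (not from the original post).
// All arguments are the device pointers already allocated in BP_Train.
void TrainEpochs(float* gpuSamples, int Samples_Num, float* gpuTargets,
                 float* gpuWeight0, float* gpuWeight1, float* gpuWeight2,
                 float* gpuBias0, float* gpuBias1, float* gpuBias2,
                 float* gpuErr0, float* gpuErr1, float* gpuErr2,
                 float* O0, float* O1, float* O2)
{
    float mse = FLT_MAX;
    for (int iter = 0; iter < ITER_MAX && mse > ACCURACY; ++iter)
    {
        for (int i = 0; i < Samples_Num; ++i)
        {
            // forward pass for sample i ...
            FeedForward(O0, O1, O2, gpuSamples + i*Num_In,
                        gpuWeight0, gpuWeight1, gpuWeight2,
                        gpuBias0, gpuBias1, gpuBias2);
            // ... then one stochastic weight/bias update
            // (see the synchronisation caveat noted above BackPropagation)
            BackPropagation<<<BLOCK_NUM, THREAD_NUM>>>(
                        gpuSamples + i*Num_In, O0, O1, O2, gpuTargets + i*Num_Out,
                        gpuWeight0, gpuWeight1, gpuWeight2,
                        gpuBias0, gpuBias1, gpuBias2,
                        gpuErr0, gpuErr1, gpuErr2);
            CUDA_CALL(cudaDeviceSynchronize());
        }
        // network-wide error after this epoch
        mse = MSE(gpuSamples, Samples_Num, gpuTargets,
                  gpuWeight0, gpuWeight1, gpuWeight2,
                  gpuBias0, gpuBias1, gpuBias2);
        printf("epoch %d: mse = %f\n", iter, mse);
    }
}

After the loop returns, BP_Train would copy gpuWeight0..2 and gpuBias0..2 back into the host buffers (cudaMemcpyDeviceToHost) before the cudaFree calls, so the trained model is not lost.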

/*
cudaEvent_t start,stop;
CUDA_CALL( cudaEventCreate(&start));
CUDA_CALL( cudaEventCreate(&stop));
CUDA_CALL(cudaEventRecord(start, NULL));
*/
