#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <cassert>
#define TEST
#define uchar unsigned char
#define ITER_MAX 20
#define BLOCK_NUM 32
#define THREAD_NUM 256
#define SAMPLE_NUM 100
#define SAMPLE_ALL 60000
#define ONE_ROUND SAMPLE_ALL/SAMPLE_NUM
#define ALFA 0.85
#define In 784
#define Hide 1000
#define Out 10
#define FILE_PATH "E:\\CUDA\\"
#define CUDA_CALL(x) {const cudaError_t a=(x); if(a!= cudaSuccess) {printf("\nCUDA ERROR:%s(err_num = %d)\n",cudaGetErrorString(a),a); cudaDeviceReset(); assert(0);}}
__global__ void MSE(float*mse, float*targets, float*output)
{
const size_t thID = threadIdx.x;
const size_t bloID = blockIdx.x;
__shared__ float sharedData[THREAD_NUM];
for(size_t i = bloID*THREAD_NUM + thID ; i < SAMPLE_NUM*Out ;i = i+BLOCK_NUM*THREAD_NUM )
{
sharedData[thID] += 0.5*(targets[i]-output[i])*(targets[i]-output[i]);
}
__syncthreads( );
if(thID<128) sharedData[thID] += sharedData[thID+128];
__syncthreads( );
if ( thID < 64 ) sharedData[thID] += sharedData[thID + 64];
__syncthreads( );
if ( thID < 32 ) sharedData[thID] += sharedData[thID + 32];
if ( thID < 16 ) sharedData[thID] += sharedData[thID + 16];
if ( thID < 8 ) sharedData[thID]+= sharedData[thID + 8];
if ( thID < 4 ) sharedData[thID]+= sharedData[thID + 4];
if ( thID < 2 ) sharedData[thID]+= sharedData[thID + 2];
if ( thID < 1 ) sharedData[thID]+= sharedData[thID + 1];
if ( thID == 0 )// 如果线程ID为0,那么计算结果
{
mse[bloID] = sharedData[0];
}
}
__global__ void ErroLastlayer(float*Err2, float* TargetsSub, float* O2, const int NodeNum)
{
size_t idx = blockIdx.x*THREAD_NUM + threadIdx.x;
for(int i = idx; i< NodeNum*SAMPLE_NUM; i = i+BLOCK_NUM*THREAD_NUM)
{
Err2[i] = (TargetsSub[i]-O2[i])*O2[i]*(1-O2[i]);
}
}
__global__ void Erro(float* Err, float* O, const int NodeNum)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
for(int i = idx; i<NodeNum*SAMPLE_NUM; i=i+BLOCK_NUM*THREAD_NUM)
{
Err[i] = Err[i]*O[i]*(1-O[i]);
}
}
__global__ void Bias(float* Bias, float*Err, const int NodeNum)
{
size_t idx = blockIdx.x*THREAD_NUM + threadIdx.x;
if(idx<NodeNum)
{
for(int i=0; i<SAMPLE_NUM; ++i)
{
Bias[idx] += ALFA*Err[idx+i*NodeNum]/SAMPLE_NUM;
}
}
}
//并行的+bias& SIGMOD激活函数
__global__ void SIGMOD(float* IO, float*Bias, const int NodeNum)
{
int idx = blockIdx.x*blockDim.x +threadIdx.x ;
for(int i = idx; i<NodeNum*SAMPLE_NUM; i=i+BLOCK_NUM*THREAD_NUM)
{
int row = i%NodeNum;
IO[i] = 1/(1+exp(-IO[i]-Bias[row]));
}
__syncthreads();
}
void Save_BP( float* weight1, float* weight2, float* bias1, float* bias2);
void Save_MSE( float* mse);
void BP_Train(float* Samples, float* Targets, float* weight1,float* weight2, float* bias1, float* bias2, float* mse)
{
#ifdef TEST
float* cpuWeight1 = nullptr;
float* cpuWeight2 = nullptr;
float* cpuBias1 = nullptr;
float* cpuBias2 = nullptr;
float* cpuO1 = nullptr;
float* cpuO2 = nullptr;
float* cpuErr1 = nullptr;
float* cpuErr2 = nullptr;
cpuWeight1 = (float*) malloc(sizeof(float)*In*Hide);
cpuWeight2 = (float*) malloc(sizeof(float)*Hide*Out);
cpuBias1 = (float*)malloc(sizeof(float)*Hide);
cpuBias2 = (float*)malloc(sizeof(float)*Out);
cpuO1 = (float*)malloc(sizeof(float)*Hide*SAMPLE_NUM);
cpuO2 = (float*)malloc(sizeof(float)*Out*SAMPLE_NUM);
cpuErr1 = (float*)malloc( sizeof(float)*Hide*SAMPLE_NUM);
cpuErr2 = (float*)malloc( sizeof(float)*Out*SAMPLE_NUM);
#endif
/****************************/
float* gpuSamples = nullptr;
float* gpuTargets = nullptr;
float* gpuMse = nullptr;
float* cpuMse = (float*)malloc(sizeof(float)*BLOCK_NUM);
memset(cpuMse,0,sizeof(float)*BLOCK_NUM);
CUDA_CALL(cudaMalloc((void**)&gpuSamples,sizeof(float)*SAMPLE_ALL*In));
CUDA_CALL(cudaMalloc((void**)&gpuTargets,sizeof(float)*SAMPLE_ALL*Out));
CUDA_CALL(cudaMalloc((void**)&gpuMse,sizeof(float)*BLOCK_NUM));
float* gpuWeight1 = nullptr;
float* gpuWeight2 = nullptr;
CUDA_CALL(cudaMalloc((void**)&gpuWeight1,sizeof(float)*In*Hide));
CUDA_CALL(cudaMalloc((void**)&gpuWeight2,sizeof(float)*Hide*Out));
float* gpuBias1 = nullptr;
float* gpuBias2 = nullptr;
CUDA_CALL(cudaMalloc((void**)&gpuBias1,sizeof(float)*Hide));
CUDA_CALL(cudaMalloc((void**)&gpuBias2,sizeof(float)*Out));
float* gpuErr1 = nullptr;
float* gpuErr2 = nullptr;
CUDA_CALL(cudaMalloc((void**)&gpuErr1,sizeof(float)*Hide*SAMPLE_NUM));
CUDA_CALL(cudaMalloc((void**)&gpuErr2,sizeof(float)*Out*SAMPLE_NUM));
float* O1 = nullptr;
float* O2 = nullptr;
CUDA_CALL(cudaMalloc((void**)&O1,sizeof(float)*Hide*SAMPLE_NUM));
CUDA_CALL(cudaMalloc((void**)&O2,sizeof(float)*Out*SAMPLE_NUM));
CUDA_CALL(cudaMemcpy(gpuSamples, Samples, sizeof(float)*In*SAMPLE_ALL, cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(gpuTargets, Targets, sizeof(float)*Out*SAMPLE_ALL, cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(gpuWeight1, weight1, sizeof(float)*In*Hide, cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(gpuWeight2, weight2, sizeof(float)*Hide*Out, cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(gpuBias1,bias1,sizeof(float)*Hide,cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(gpuBias2,bias2,sizeof(float)*Out,cudaMemcpyHostToDevice));
const float alpha = ALFA/SAMPLE_NUM;
const float beta = 1.0f;
const float alpha_Nor = 1.0f;
const float beta_Nor = 0.0f;
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}
int iter = 0;
while(iter<ITER_MAX)
{
printf("iter = %d:\n",iter);
int iter_1round = 0;
while(iter_1round<ONE_ROUND)
{ /*O1,Hide*/
ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, Hide, SAMPLE_NUM, In, &alpha_Nor, gpuWeight1, Hide, gpuSamples + iter_1round*In*SAMPLE_NUM, In, &beta_Nor, O1, Hide);
#ifdef TEST1
printf("I1:\n");
CUDA_CALL(cudaMemcpy(cpuO1,O1,sizeof(float)*Hide*SAMPLE_NUM,cudaMemcpyDeviceToHost));
for(int j=0; j<1000; ++j)
{
printf("%8.6f ",cpuO1[j]);
}
printf("\n");
#endif
SIGMOD<<<BLOCK_NUM,THREAD_NUM>>>(O1,gpuBias1,Hide);
#ifdef TEST
CUDA_CALL(cudaMemcpy(cpuO1,O1,sizeof(float)*Hide*SAMPLE_NUM,cudaMemcpyDeviceToHost));
printf("GPU O1:\n");
for(int j=0; j<Hide; ++j)
{
printf("%8.6f ",cpuO1[j]);
}
printf("\n");
#endif
/*O2,Out*/
ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, Out, SAMPLE_NUM, Hide, &alpha_Nor, gpuWeight2, Out, O1, Hide, &beta_Nor, O2, Out);
#ifdef TEST1
printf("\n\n");
printf("I2:\n");
CUDA_CALL(cudaMemcpy(cpuO2,O2,sizeof(float)*Out*SAMPLE_NUM,cudaMemcpyDeviceToHost));
for(int j=0; j<Out; ++j)
{
printf("%8.6f ",cpuO2[j]);
}
printf("\n");
#endif
SIGMOD<<<BLOCK_NUM,THREAD_NUM>>>(O2,gpuBias2,Out);
#ifdef TEST
CUDA_CALL(cudaMemcpy(cpuO2,O2,sizeof(float)*Out*SAMPLE_NUM,cudaMemcpyDeviceToHost));
for(int j=0; j<Out; ++j)
{
printf("%8.6f ",cpuO2[j]);
}
printf("\n");
#endif
/*MSE*/
MSE<<<BLOCK_NUM,THREAD_NUM>>>(gpuMse, gpuTargets + iter_1round*Out*SAMPLE_NUM, O2);
CUDA_CALL(cudaMemcpy(cpuMse,gpuMse,sizeof(float)*BLOCK_NUM,cudaMemcpyDeviceToHost));
#ifdef TEST1
for(int i=0; i<BLOCK_NUM; ++i)
{
printf("%8.6f ",cpuMse[i]);
}
printf("\n");
#endif
for(int m =0; m<BLOCK_NUM;++m)
{
mse[iter*ONE_ROUND + iter_1round] += cpuMse[m];
}
mse[iter*ONE_ROUND + iter_1round] /= SAMPLE_NUM;
#ifdef TEST1
printf("mse[%4d] = %f\n",iter*ONE_ROUND + iter_1round,mse[iter*ONE_ROUND + iter_1round]);
#endif
/*err2,bias2,weight2*/
//ErroBiasLastlayer<<<BLOCK_NUM, THREAD_NUM>>>(gpuErr2, gpuBias2, gpuTargets + iter_1round*Out*SAMPLE_NUM, O2,Out);
ErroLastlayer<<<BLOCK_NUM,THREAD_NUM>>>(gpuErr2, gpuTargets + iter_1round*Out*SAMPLE_NUM, O2, Out);
Bias<<<BLOCK_NUM,THREAD_NUM>>>(gpuBias2, gpuErr2, Out);
#ifdef TEST1
printf("ErroBiasLastlayer:\nBias2:\n");
CUDA_CALL(cudaMemcpy(cpuBias2,gpuBias2,sizeof(float)*Out,cudaMemcpyDeviceToHost));
for(int i=0; i<Out;++i)
{
printf("%8.6f ",cpuBias2[i]);
}
printf("\n\n");
printf("ErroBiasLastlayer:\nErr2:\n");
CUDA_CALL(cudaMemcpy(cpuErr2,gpuErr2,sizeof(float)*SAMPLE_NUM*Out,cudaMemcpyDeviceToHost));
for(int i=0; i<SAMPLE_NUM; ++i)
{
for(int j=0; j<Out; ++j)
{
printf("%8.6f ",cpuErr2[i*Out + j]);
}
printf("\n");
}
printf("\n\n");
#endif
ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, Out, Hide, SAMPLE_NUM, &alpha, gpuErr2, Out, O1, Hide, &beta, gpuWeight2, Out);
#ifdef TEST1
printf("After Weight2:\n");
CUDA_CALL(cudaMemcpy(cpuWeight2,gpuWeight2,sizeof(float)*Out*Hide,cudaMemcpyDeviceToHost));
for(int i=0; i<Hide; ++i)
{
for(int j=0; j<Out;++j)
{
printf("%8.6f ",cpuWeight2[i*Out+j]);
}
printf("\n");
}
printf("\n\n");
#endif
/*err1,bias1,weight1*/
ret = cublasSgemm(handle,CUBLAS_OP_T,CUBLAS_OP_N,Hide,SAMPLE_NUM, Out, &alpha_Nor,gpuWeight2,Out,gpuErr2,Out,&beta_Nor,gpuErr1,Hide);
//ErroBias<<<BLOCK_NUM, THREAD_NUM>>>(gpuErr1, gpuBias1, O1, Hide);
Erro<<<BLOCK_NUM,THREAD_NUM>>>(gpuErr1, O1, Hide);
Bias<<<BLOCK_NUM,THREAD_NUM>>>(gpuBias1, gpuErr1, Out);
#ifdef TEST1
printf("Bias1:\n");
CUDA_CALL(cudaMemcpy(cpuBias1,gpuBias1,sizeof(float)*Hide,cudaMemcpyDeviceToHost));
for(int i=0; i<Hide;++i)
{
printf("%8.6f ",cpuBias1[i]);
}
printf("\n\n");
printf("Err1:\n");
CUDA_CALL(cudaMemcpy(cpuErr1,gpuErr1,sizeof(float)*Hide*SAMPLE_NUM,cudaMemcpyDeviceToHost));
for(int j=0; j<Hide; ++j)
{
printf("%8.6f ",cpuErr1[j]);
}
printf("\n\n");
#endif
ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, Hide, In, SAMPLE_NUM, &alpha, gpuErr1, Hide, gpuSamples + iter_1round*In*SAMPLE_NUM, In, &beta, gpuWeight1, Hide);
#ifdef TEST1
printf("After Weight1:\n");
CUDA_CALL(cudaMemcpy(cpuWeight1,gpuWeight1,sizeof(float)*In*Hide,cudaMemcpyDeviceToHost));
for(int j=0; j<Hide;++j)
{
printf("%8.6f ",cpuWeight1[j]);
}
printf("\n\n");
#endif
iter_1round++;
}
iter++;
}
ret = cublasDestroy(handle);
if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasDestroy returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}
cudaMemcpy(weight1, gpuWeight1, sizeof(int)*Hide*In, cudaMemcpyDeviceToHost);
cudaMemcpy(weight2, gpuWeight2, sizeof(int)*Hide*Out, cudaMemcpyDeviceToHost);
cudaMemcpy(bias1, gpuBias1, sizeof(int)*Hide, cudaMemcpyDeviceToHost);
cudaMemcpy(bias2, gpuBias2, sizeof(int)*Out, cudaMemcpyDeviceToHost);
#ifdef TEST
free(cpuWeight1);
free(cpuWeight2);
free(cpuBias1);
free(cpuBias2);
free(cpuO1);
free(cpuO2);;
free(cpuErr1);
free(cpuErr2);
#endif
free(cpuMse);
cudaFree(gpuSamples);
cudaFree(gpuTargets);
cudaFree(gpuWeight1);
cudaFree(gpuWeight2);
cudaFree(gpuBias1);
cudaFree(gpuBias2);
cudaFree(gpuErr1);
cudaFree(gpuErr2);
cudaFree(gpuMse);
cudaFree(O1);
cudaFree(O2);
}
int main()
{
int magic[4], magic_l[2];
uchar *s = (uchar*)malloc(sizeof(uchar)*SAMPLE_ALL*In);
uchar *t = (uchar*)malloc(sizeof(uchar)*SAMPLE_ALL);
float* fSamples = (float*)malloc(sizeof(float)*SAMPLE_ALL*In);
float* fTargets = (float*)malloc(sizeof(float)*SAMPLE_ALL*Out);
memset(fTargets,0,sizeof(float)*SAMPLE_ALL*Out);
FILE* fp = fopen("train-images.idx3-ubyte","rb");
fread(magic,sizeof(int),4, fp);
fread(s,sizeof(uchar),SAMPLE_ALL*In,fp);
fclose(fp);
fp = fopen("train-labels.idx1-ubyte","rb");
fread(magic_l,sizeof(int),2,fp);
fread(t,sizeof(uchar),SAMPLE_ALL,fp);
fclose(fp);
for(int i=0; i<SAMPLE_ALL*In; i++)
{
fSamples[i] = (float)s[i]/255.f;
}
for(int i=0; i<SAMPLE_ALL; i++)
{
fTargets[i*Out + t[i]]= 1.0f;
}
free(s);
free(t);
float* weight1 = (float*)malloc(sizeof(float)*Hide*In);
float* weight2 = (float*)malloc(sizeof(float)*Hide*Out);
float* bias1 = (float*)malloc(sizeof(float)*Hide);
float* bias2 = (float*)malloc(sizeof(float)*Out);
fp = fopen("netInit.bin","rb");
fread(weight1,sizeof(float),Hide*In,fp);
fread(weight2,sizeof(float),Hide*Out,fp);
fread(bias1,sizeof(float),Hide,fp);
fread(bias2,sizeof(float),Out,fp);
fclose(fp);
float* mse = (float*)malloc(sizeof(float)*ONE_ROUND*ITER_MAX);
memset(mse,0,sizeof(float)*ONE_ROUND*ITER_MAX);
/****************************************/
BP_Train(fSamples, fTargets, weight1, weight2, bias1, bias2, mse);
/****************************************/
CUDA_CALL(cudaDeviceReset());
Save_BP(weight1, weight2, bias1, bias2);
Save_MSE(mse);
free(fSamples);
free(fTargets);
free(weight1);
free(weight2);
free(bias1);
free(bias2);
return 0;
}
void Save_BP( float* weight1, float* weight2, float* bias1, float* bias2)
{
char bpFile[100];
sprintf(bpFile,"%s%s%s",FILE_PATH,"bp",".txt");
FILE* fpBP = fopen(bpFile,"wb");
for(int i=0; i<In*Hide;++i)
{
fprintf(fpBP,"%8.6f ",weight1[i]);
}
fprintf(fpBP,"\n\n ");
for(int i=0; i<Hide*Out;++i)
{
fprintf(fpBP,"%8.6f ",weight2[i]);
}
fprintf(fpBP,"\n\n ");
for(int i =0; i<Hide; ++i)
{
fprintf(fpBP,"%8.6f ",bias1[i]);
}
fprintf(fpBP,"\n\n ");
for(int i =0; i<Out; ++i)
{
fprintf(fpBP,"%8.6f ",bias2[i]);
}
fprintf(fpBP,"\n\n ");
fclose(fpBP);
}
void Save_MSE( float* mse)
{
char mseFile[100];
sprintf(mseFile,"%s%s%s",FILE_PATH,"mse",".txt");
FILE* fpMSE = fopen(mseFile,"wb");
for(int i =0; i<ITER_MAX*ONE_ROUND; ++i)
{
fprintf(fpMSE,"%8.6f ",mse[i]);
if(i%10 == 0)
fprintf(fpMSE,"\n ");
}
fprintf(fpMSE,"\n ");
fclose(fpMSE);
}