#include "MultiHide_BP.cuh"
/*
 * Trains the network on the GPU with mini-batch back-propagation and copies
 * the trained parameters back to the host.
 *
 * Samples   : host buffer of SAMPLE_ALL * Layers[0] floats (network inputs).
 * Targets   : host buffer of SAMPLE_ALL * Layers[LayersNum-1] floats.
 * LayersNum : number of layers (input, hidden..., output); must be >= 2.
 * Layers    : host array of LayersNum node counts.
 * W, B      : host arrays of LayersNum-1 buffers; W[i] holds
 *             Layers[i]*Layers[i+1] floats and B[i] holds Layers[i+1] floats.
 *             They provide the initial parameters and are overwritten with
 *             the trained ones before returning.
 * mse       : host buffer; MSE() stores one value per mini-batch at index
 *             iter*ONE_ROUND + iter_1round.
 *
 * NOTE(review): each round reads ONE_ROUND consecutive mini-batches of
 * SAMPLE_NUM samples, so this presumably requires
 * ONE_ROUND*SAMPLE_NUM <= SAMPLE_ALL - confirm against the macro definitions.
 */
void NN_TRAIN(float* Samples, float* Targets, int LayersNum, int* Layers, float**W, float**B, float* mse)
{
    if (LayersNum < 2)
    {
        printf("NN_TRAIN: at least 2 layers are required, got %d\n", LayersNum);
        return;
    }

    const int linkNum = LayersNum - 1;          // number of weight/bias sets
    const int inNum   = Layers[0];
    const int outNum  = Layers[LayersNum-1];

    // Host-side scratch buffers and pointer tables. The tables must live in
    // host memory: their elements are read on the host (passed to cuBLAS,
    // cudaMemcpy and kernel launches), so they cannot be cudaMalloc'ed.
    float*  cpuMse = (float*)malloc(sizeof(float)*BLOCK_NUM);
    float*  Ones   = (float*)malloc(sizeof(float)*SAMPLE_NUM);
    float** gpuW   = (float**)calloc(linkNum,   sizeof(float*));
    float** gpuB   = (float**)calloc(linkNum,   sizeof(float*));
    float** gpuErr = (float**)calloc(linkNum,   sizeof(float*));
    float** gpuO   = (float**)calloc(LayersNum, sizeof(float*));
    if (!cpuMse || !Ones || !gpuW || !gpuB || !gpuErr || !gpuO)
    {
        printf("NN_TRAIN: host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }

    // Vector of ones used by NNBP to sum the error columns into the bias.
    // memset(…, 1, …) would set every *byte* to 0x01 (each float ~2.4e-38),
    // so the values are written explicitly.
    for (int s = 0; s < SAMPLE_NUM; ++s)
    {
        Ones[s] = 1.0f;
    }

    float* gpuSamples = nullptr;
    float* gpuTargets = nullptr;
    float* gpuMse     = nullptr;
    float* gpuOnes    = nullptr;
    CUDA_CALL(cudaMalloc((void**)&gpuSamples, sizeof(float)*SAMPLE_ALL*inNum));
    CUDA_CALL(cudaMalloc((void**)&gpuTargets, sizeof(float)*SAMPLE_ALL*outNum));
    CUDA_CALL(cudaMalloc((void**)&gpuMse,     sizeof(float)*BLOCK_NUM));
    CUDA_CALL(cudaMalloc((void**)&gpuOnes,    sizeof(float)*SAMPLE_NUM));
    CUDA_CALL(cudaMemcpy(gpuSamples, Samples, sizeof(float)*SAMPLE_ALL*inNum,  cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(gpuTargets, Targets, sizeof(float)*SAMPLE_ALL*outNum, cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(gpuOnes,    Ones,    sizeof(float)*SAMPLE_NUM,        cudaMemcpyHostToDevice));

    // Per-layer device buffers: weights, biases and back-propagated errors.
    for (int i = 0; i < linkNum; ++i)
    {
        const size_t wBytes = sizeof(float)*Layers[i]*Layers[i+1];
        const size_t bBytes = sizeof(float)*Layers[i+1];
        CUDA_CALL(cudaMalloc((void**)&gpuW[i], wBytes));
        CUDA_CALL(cudaMemcpy(gpuW[i], W[i], wBytes, cudaMemcpyHostToDevice));
        CUDA_CALL(cudaMalloc((void**)&gpuB[i], bBytes));
        CUDA_CALL(cudaMemcpy(gpuB[i], B[i], bBytes, cudaMemcpyHostToDevice));
        CUDA_CALL(cudaMalloc((void**)&gpuErr[i], sizeof(float)*SAMPLE_NUM*Layers[i+1]));
    }
    // Per-layer activations for one mini-batch (gpuO[0] is the input copy).
    for (int i = 0; i < LayersNum; ++i)
    {
        CUDA_CALL(cudaMalloc((void**)&gpuO[i], sizeof(float)*SAMPLE_NUM*Layers[i]));
    }

    for (int iter = 0; iter < ITER_MAX; ++iter)
    {
        printf("iter = %d:\n",iter);
        for (int iter_1round = 0; iter_1round < ONE_ROUND; ++iter_1round)
        {
            // Offsets of the current mini-batch inside the full data set.
            float* sampleSub = gpuSamples + (size_t)iter_1round*inNum*SAMPLE_NUM;
            float* targetSub = gpuTargets + (size_t)iter_1round*outNum*SAMPLE_NUM;

            NNFF(gpuW, gpuB, gpuO, LayersNum, Layers, sampleSub);
            MSE(mse, cpuMse, gpuMse, targetSub, gpuO[LayersNum-1], outNum, iter, iter_1round);
            // NNBP reads Layers[] on the host, so it gets the host array.
            NNBP(gpuW, gpuB, gpuErr, gpuO, targetSub, gpuOnes, LayersNum, Layers, iter);
        }
    }

    // Hand the trained parameters back to the caller.
    for (int i = 0; i < linkNum; ++i)
    {
        CUDA_CALL(cudaMemcpy(W[i], gpuW[i], sizeof(float)*Layers[i]*Layers[i+1], cudaMemcpyDeviceToHost));
        CUDA_CALL(cudaMemcpy(B[i], gpuB[i], sizeof(float)*Layers[i+1],           cudaMemcpyDeviceToHost));
    }

    for (int i = 0; i < linkNum; ++i)
    {
        cudaFree(gpuW[i]);
        cudaFree(gpuB[i]);
        cudaFree(gpuErr[i]);
    }
    for (int i = 0; i < LayersNum; ++i)
    {
        cudaFree(gpuO[i]);
    }
    cudaFree(gpuMse);
    cudaFree(gpuOnes);
    cudaFree(gpuSamples);
    cudaFree(gpuTargets);

    free(gpuW);
    free(gpuB);
    free(gpuErr);
    free(gpuO);
    free(cpuMse);
    free(Ones);
}
/*
 * Runs the trained network over the test set and flags misclassified samples.
 *
 * Samples   : host buffer of SAMPLE_TEST * Layers[0] floats.
 * Targets   : host buffer of SAMPLE_TEST class labels (one uchar per sample).
 * LayersNum : number of layers; must be >= 2.
 * Layers    : host array of LayersNum node counts.
 * W, B      : host arrays of LayersNum-1 weight / bias buffers
 *             (W[i]: Layers[i]*Layers[i+1] floats, B[i]: Layers[i+1] floats).
 * Record    : host array of SAMPLE_TEST ints. It is uploaded, GetResult sets
 *             Record[i] = 1 for every sample whose predicted label differs
 *             from Targets[i], and it is downloaded again. Entries of
 *             correctly classified samples keep the caller's value.
 *
 * NNFF always processes exactly SAMPLE_NUM samples, so the test set is fed
 * through in mini-batches of SAMPLE_NUM (the sample buffer is zero-padded up
 * to a whole number of batches) and the outputs are gathered into one
 * contiguous buffer before GetResult is launched.
 */
void BP_Test(float* Samples, uchar* Targets,int LayersNum, int* Layers, float**W, float**B, int* Record)
{
    if (LayersNum < 2)
    {
        printf("BP_Test: at least 2 layers are required, got %d\n", LayersNum);
        return;
    }

    const int    linkNum   = LayersNum - 1;
    const int    inNum     = Layers[0];
    const int    outNum    = Layers[LayersNum-1];
    const size_t testNum   = SAMPLE_TEST;
    const size_t batchSize = SAMPLE_NUM;
    const size_t batchNum  = (testNum + batchSize - 1) / batchSize;
    const size_t paddedNum = batchNum * batchSize;

    // Pointer tables live in host memory because their elements are read on
    // the host (cuBLAS arguments, cudaMemcpy, kernel launch arguments).
    float** gpuW = (float**)calloc(linkNum,   sizeof(float*));
    float** gpuB = (float**)calloc(linkNum,   sizeof(float*));
    float** gpuO = (float**)calloc(LayersNum, sizeof(float*));
    if (!gpuW || !gpuB || !gpuO)
    {
        printf("BP_Test: host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }

    float* gpuSamples = nullptr;
    uchar* gpuTargets = nullptr;
    int*   gpuRecord  = nullptr;
    float* gpuOutAll  = nullptr;   // outputs of all test samples, outNum per sample
    CUDA_CALL(cudaMalloc((void**)&gpuSamples, sizeof(float)*paddedNum*inNum));
    CUDA_CALL(cudaMemset(gpuSamples, 0,       sizeof(float)*paddedNum*inNum));
    CUDA_CALL(cudaMalloc((void**)&gpuTargets, sizeof(uchar)*testNum));
    CUDA_CALL(cudaMalloc((void**)&gpuRecord,  sizeof(int)*testNum));
    CUDA_CALL(cudaMalloc((void**)&gpuOutAll,  sizeof(float)*testNum*outNum));
    CUDA_CALL(cudaMemcpy(gpuSamples, Samples, sizeof(float)*testNum*inNum, cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(gpuTargets, Targets, sizeof(uchar)*testNum,       cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(gpuRecord,  Record,  sizeof(int)*testNum,         cudaMemcpyHostToDevice));

    for (int i = 0; i < linkNum; ++i)
    {
        const size_t wBytes = sizeof(float)*Layers[i]*Layers[i+1];
        const size_t bBytes = sizeof(float)*Layers[i+1];
        CUDA_CALL(cudaMalloc((void**)&gpuW[i], wBytes));
        CUDA_CALL(cudaMemcpy(gpuW[i], W[i], wBytes, cudaMemcpyHostToDevice));
        CUDA_CALL(cudaMalloc((void**)&gpuB[i], bBytes));
        CUDA_CALL(cudaMemcpy(gpuB[i], B[i], bBytes, cudaMemcpyHostToDevice));
    }
    for (int i = 0; i < LayersNum; ++i)
    {
        CUDA_CALL(cudaMalloc((void**)&gpuO[i], sizeof(float)*SAMPLE_NUM*Layers[i]));
    }

    /***************************/
    for (size_t b = 0; b < batchNum; ++b)
    {
        const size_t first = b * batchSize;
        const size_t left  = testNum - first;
        const size_t count = left < batchSize ? left : batchSize;   // real samples in this batch

        NNFF(gpuW, gpuB, gpuO, LayersNum, Layers, gpuSamples + first*inNum);
        // Keep only the outputs that belong to real (non-padding) samples.
        CUDA_CALL(cudaMemcpy(gpuOutAll + first*outNum, gpuO[LayersNum-1],
                             sizeof(float)*count*outNum, cudaMemcpyDeviceToDevice));
    }
    GetResult<<<BLOCK_NUM,THREAD_NUM>>>(gpuOutAll, gpuTargets, outNum, gpuRecord);
    CUDA_CALL(cudaGetLastError());
    CUDA_CALL(cudaMemcpy(Record, gpuRecord, sizeof(int)*testNum, cudaMemcpyDeviceToHost));
    /***************************/

    for (int i = 0; i < linkNum; ++i)
    {
        cudaFree(gpuW[i]);
        cudaFree(gpuB[i]);
    }
    for (int i = 0; i < LayersNum; ++i)
    {
        cudaFree(gpuO[i]);
    }
    cudaFree(gpuOutAll);
    cudaFree(gpuRecord);
    cudaFree(gpuSamples);
    cudaFree(gpuTargets);

    free(gpuW);
    free(gpuB);
    free(gpuO);
}
/*
 * Forward pass for one mini-batch of SAMPLE_NUM samples.
 *
 * gpuW, gpuB   : host arrays of device pointers; gpuW[i] is used as a
 *                column-major Layers[i+1] x Layers[i] matrix and gpuB[i] as a
 *                vector of Layers[i+1] biases.
 * gpuO         : host array of LayersNum device buffers; gpuO[i] receives the
 *                Layers[i] x SAMPLE_NUM activations of layer i.
 * Layers       : host array of node counts (indexed on the host here).
 * gpuSampleSub : device pointer to SAMPLE_NUM * Layers[0] input floats.
 *
 * Any cuBLAS failure prints a message and terminates the process.
 * A cuBLAS handle is created and destroyed on every call.
 */
void NNFF(float**gpuW, float** gpuB, float**gpuO, int LayersNum, int* Layers, float* gpuSampleSub)
{
    const float alpha_Nor = 1.0f;
    const float beta_Nor = 0.0f;

    // Layer 0 "activations" are just the input samples.
    CUDA_CALL(cudaMemcpy(gpuO[0], gpuSampleSub, sizeof(float)*SAMPLE_NUM*Layers[0], cudaMemcpyDeviceToDevice));

    cublasHandle_t handle;
    cublasStatus_t ret = cublasCreate(&handle);
    if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

    for(int i=1; i<LayersNum; ++i)
    {
        // gpuO[i] = gpuW[i-1] * gpuO[i-1]
        //   (Layers[i] x SAMPLE_NUM) = (Layers[i] x Layers[i-1]) * (Layers[i-1] x SAMPLE_NUM)
        ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                          Layers[i], SAMPLE_NUM, Layers[i-1],
                          &alpha_Nor, gpuW[i-1], Layers[i],
                          gpuO[i-1], Layers[i-1],
                          &beta_Nor, gpuO[i], Layers[i]);
        if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

        // Add the bias and apply the sigmoid in place.
        SIGMOD<<<BLOCK_NUM,THREAD_NUM>>>(gpuO[i], gpuB[i-1], Layers[i], SAMPLE_NUM);
        CUDA_CALL(cudaGetLastError());
    }

    ret = cublasDestroy(handle);
    if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasDestroy returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}
}
/*
 * Computes the error of the current mini-batch and stores it in
 * mse[iter*ONE_ROUND + iter_1round].
 *
 * mse          : host output array (one slot per mini-batch).
 * cpuMse       : host scratch buffer of BLOCK_NUM floats.
 * gpuMse       : device scratch buffer of BLOCK_NUM floats (per-block sums).
 * gpuTargetSub : device targets of the mini-batch.
 * gpuOLast     : device activations of the output layer.
 * OutNum       : number of output nodes.
 *
 * The stored value is the sum of the per-block partial sums produced by
 * MSE_Kernel, divided by SAMPLE_NUM. The slot is overwritten, so the caller
 * does not have to zero-initialise mse beforehand.
 */
void MSE(float*mse, float*cpuMse, float*gpuMse, float*gpuTargetSub, float*gpuOLast, const int OutNum, const int iter, const int iter_1round)
{
    MSE_Kernel<<<BLOCK_NUM,THREAD_NUM>>>(gpuMse, gpuTargetSub, gpuOLast, OutNum);
    CUDA_CALL(cudaGetLastError());
    // Blocking copy: also waits for the kernel to finish.
    CUDA_CALL(cudaMemcpy(cpuMse,gpuMse,sizeof(float)*BLOCK_NUM,cudaMemcpyDeviceToHost));

    float total = 0.0f;
    for(int m = 0; m < BLOCK_NUM; ++m)
    {
        total += cpuMse[m];
    }
    mse[iter*ONE_ROUND + iter_1round] = total / SAMPLE_NUM;
}
/*
 * Back-propagation step for one mini-batch: computes the per-layer error
 * terms and applies the weight / bias updates in place.
 *
 * gpuW, gpuB   : host arrays of device pointers; gpuW[i] is a column-major
 *                Layers[i+1] x Layers[i] matrix, gpuB[i] has Layers[i+1] floats.
 * gpuErr       : host array of device buffers; gpuErr[i] receives the
 *                Layers[i+1] x SAMPLE_NUM error term of layer i+1.
 * gpuO         : per-layer activations produced by the forward pass.
 * gpuTargetSub : device targets of the mini-batch.
 * gpuOnes      : device vector of SAMPLE_NUM ones (used to sum error columns
 *                into the bias).
 * Layers       : node counts; indexed on the HOST, so this must be a host
 *                pointer.
 * iter         : outer iteration index; once iter > THREAD the learning rate
 *                is multiplied by 0.92 on every call.
 *
 * The learning rate is a function-local static, so its decay persists across
 * calls for the lifetime of the process.
 * NOTE(review): the initial value is ALFA/SAMPLE_NUM evaluated as written; if
 * both macros are integers this is an integer division - confirm.
 *
 * Any cuBLAS failure prints a message and terminates the process.
 */
void NNBP( float** gpuW, float** gpuB, float**gpuErr, float**gpuO, float*gpuTargetSub, float*gpuOnes, int LayersNum, int* Layers, const int iter)
{
    static float alpha = ALFA/SAMPLE_NUM;
    const float beta = 1.0f;        // accumulate into the existing parameters
    const float alpha_Nor = 1.0f;
    const float beta_Nor = 0.0f;

    // Error term of the output layer.
    ErroLastlayer<<<BLOCK_NUM,THREAD_NUM>>>(gpuErr[LayersNum-2], gpuTargetSub, gpuO[LayersNum-1], Layers[LayersNum-1]);
    CUDA_CALL(cudaGetLastError());

    if(iter>THREAD)
    {
        alpha *=0.92;
    }

    cublasHandle_t handle;
    cublasStatus_t ret = cublasCreate(&handle);
    if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

    // Propagate the error backwards through the hidden layers.
    for(int i= LayersNum-3; i>=0; i--)
    {
        // gpuErr[i] = gpuW[i+1]^T * gpuErr[i+1]
        //   (Layers[i+1] x SAMPLE_NUM) = (Layers[i+2] x Layers[i+1])^T * (Layers[i+2] x SAMPLE_NUM)
        // gpuW[i+1] is stored with Layers[i+2] rows, so that is its leading
        // dimension even though it is used transposed.
        ret = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                          Layers[i+1], SAMPLE_NUM, Layers[i+2],
                          &alpha_Nor, gpuW[i+1], Layers[i+2],
                          gpuErr[i+1], Layers[i+2],
                          &beta_Nor, gpuErr[i], Layers[i+1]);
        if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

        // Multiply by the sigmoid derivative of layer i+1.
        Erro<<<BLOCK_NUM,THREAD_NUM>>>(gpuErr[i], gpuO[i+1], Layers[i+1]);
        CUDA_CALL(cudaGetLastError());
    }

    // Update weights and biases.
    for(int i=0; i<LayersNum-1;++i)
    {
        // gpuB[i] += alpha * gpuErr[i] * ones   (sum of the error over the batch)
        ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                          Layers[i+1], 1, SAMPLE_NUM,
                          &alpha, gpuErr[i], Layers[i+1],
                          gpuOnes, SAMPLE_NUM,
                          &beta, gpuB[i], Layers[i+1]);
        if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

        // gpuW[i] += alpha * gpuErr[i] * gpuO[i]^T
        ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T,
                          Layers[i+1], Layers[i], SAMPLE_NUM,
                          &alpha, gpuErr[i], Layers[i+1],
                          gpuO[i], Layers[i],
                          &beta, gpuW[i], Layers[i+1]);
        if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}
    }

    ret = cublasDestroy(handle);
    if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasDestroy returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}
}
/*
 * In-place bias add + logistic sigmoid: IO[i] = 1 / (1 + exp(-(IO[i] + Bias[i % NodeNum]))).
 *
 * IO        : NodeNum * SampleNum floats; element i belongs to node i % NodeNum.
 * Bias      : NodeNum floats.
 *
 * Launch: 1-D grid, 1-D blocks, any size. The loop stride is taken from the
 * actual launch configuration, so every element is visited exactly once
 * regardless of the grid/block dimensions. No shared memory is used and no
 * barrier is needed because each thread touches only its own elements.
 */
__global__ void SIGMOD(float* IO, float*Bias, const int NodeNum,const int SampleNum)
{
    const int total  = NodeNum*SampleNum;
    const int stride = gridDim.x*blockDim.x;
    for(int i = blockIdx.x*blockDim.x + threadIdx.x; i < total; i += stride)
    {
        const int row = i%NodeNum;
        IO[i] = 1.0f/(1.0f+expf(-IO[i]-Bias[row]));
    }
}
/*
 * Per-block partial sums of 0.5 * (target - output)^2 over the
 * SAMPLE_NUM * OutNum outputs of one mini-batch.
 *
 * gpuMse  : one float per block; gpuMse[blockIdx.x] receives that block's sum.
 * targets : SAMPLE_NUM * OutNum expected values.
 * output  : SAMPLE_NUM * OutNum network outputs.
 *
 * Launch: expects <<<BLOCK_NUM, THREAD_NUM>>> (indexing and the shared array
 * are sized by those macros). Uses THREAD_NUM floats of static shared memory.
 * The reduction works for any THREAD_NUM, not only 256, and synchronises the
 * whole block between steps, so it does not rely on implicit warp-synchronous
 * execution.
 */
__global__ void MSE_Kernel(float*gpuMse, float*targets, float*output, const int OutNum)
{
    const size_t thID = threadIdx.x;
    const size_t bloID = blockIdx.x;
    __shared__ float sharedData[THREAD_NUM];

    // Each thread first accumulates its own elements in a register.
    float partial = 0.0f;
    for(size_t i = bloID*THREAD_NUM + thID ; i < SAMPLE_NUM*OutNum ; i = i+BLOCK_NUM*THREAD_NUM )
    {
        const float diff = targets[i]-output[i];
        partial += 0.5f*diff*diff;
    }
    sharedData[thID] = partial;
    __syncthreads( );

    // Tree reduction in shared memory. `active` is the number of live slots;
    // each step folds the upper part onto the lower part. The loop bounds are
    // uniform across the block, so every thread reaches every barrier.
    for(unsigned int active = THREAD_NUM; active > 1; )
    {
        const unsigned int half = (active + 1) / 2;
        if(thID + half < active)
        {
            sharedData[thID] += sharedData[thID + half];
        }
        __syncthreads( );
        active = half;
    }

    if ( thID == 0 ) // thread 0 publishes the block's result
    {
        gpuMse[bloID] = sharedData[0];
    }
}
/*
 * Error term of the output layer for one mini-batch:
 *   gpuErrLast[k] = (target[k] - out[k]) * out[k] * (1 - out[k])
 * for every k in [0, NodeNum * SAMPLE_NUM).
 *
 * Launch: indexing and stride are built from the BLOCK_NUM / THREAD_NUM
 * macros, so the kernel expects to be launched as <<<BLOCK_NUM, THREAD_NUM>>>.
 */
__global__ void ErroLastlayer(float*gpuErrLast, float* gpuTargetsSub, float* gpuOLast, const int NodeNum)
{
    size_t first = blockIdx.x*THREAD_NUM + threadIdx.x;
    int pos = first;
    while(pos < NodeNum*SAMPLE_NUM)
    {
        const float out  = gpuOLast[pos];
        const float diff = gpuTargetsSub[pos]-out;
        gpuErrLast[pos] = diff*out*(1-out);
        pos = pos+BLOCK_NUM*THREAD_NUM;
    }
}
/*
 * Scales the back-propagated error of a hidden layer by the sigmoid
 * derivative, in place:
 *   Err[k] = Err[k] * O[k] * (1 - O[k])
 * for every k in [0, NodeNum * SAMPLE_NUM).
 *
 * NOTE(review): the start index uses blockDim.x while the stride is
 * BLOCK_NUM*THREAD_NUM; the two agree only when the kernel is launched as
 * <<<BLOCK_NUM, THREAD_NUM>>>.
 */
__global__ void Erro(float* Err, float* O, const int NodeNum)
{
    int pos = blockIdx.x*blockDim.x + threadIdx.x;
    while(pos < NodeNum*SAMPLE_NUM)
    {
        const float err = Err[pos];
        const float act = O[pos];
        Err[pos] = err*act*(1-act);
        pos = pos+BLOCK_NUM*THREAD_NUM;
    }
}
/*
 * Classifies every test sample by the index of its largest output and flags
 * the misclassified ones.
 *
 * OLast   : SAMPLE_TEST * OutNum floats; the OutNum outputs of sample i start
 *           at OLast[i * OutNum].
 * Targets : SAMPLE_TEST expected labels.
 * OutNum  : number of output nodes per sample (also the per-sample stride).
 * Record  : SAMPLE_TEST ints; Record[i] is set to 1 when the predicted label
 *           differs from Targets[i] and is left untouched otherwise, so the
 *           caller is responsible for its initial contents.
 *
 * On ties the lowest index wins (strict '>' comparison). The label is stored
 * in a uchar.
 *
 * Launch: indexing and stride are built from the BLOCK_NUM / THREAD_NUM
 * macros, so the kernel expects <<<BLOCK_NUM, THREAD_NUM>>>.
 */
__global__ void GetResult(float* OLast, uchar* Targets, const int OutNum, int * Record)
{
    size_t idx = blockIdx.x*THREAD_NUM + threadIdx.x;
    for(int i = idx; i < SAMPLE_TEST; i = i+THREAD_NUM*BLOCK_NUM )
    {
        // Outputs of sample i; the stride is the real output count rather
        // than a hard-coded 10.
        const float* scores = OLast + (size_t)i*OutNum;

        float value = scores[0];
        uchar label = 0;
        for(int j=1; j<OutNum; ++j)
        {
            if(scores[j]>value)
            {
                value = scores[j];
                label =(uchar) j;
            }
        }
        if(Targets[i] != label)
        {
            Record[i] = 1;
        }
    }
}