// cuda: host
/*
* 电子科技大学 电子工程学院 张舒 编写,未经允许不得用于商业用途
* BP neural network research
* Host code.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil.h>
#include "cublas.h"
#include <nerve_kernel.cu>
// Element counts of the four host/device matrices.
// NOTE: these macros take no arguments; they expand to expressions over the
// identifiers SamNum, InDim, OutDim and HiddenUnitNum, so variables with
// exactly those names must be in scope wherever a macro is used.
// Extended sample input: SamNum x (InDim+1) floats.
#define size_SamInEx ((int)(SamNum*(InDim+1)))
// Sample target output: SamNum x OutDim floats.
#define size_SamOut ((int)(SamNum*OutDim))
// Extended first-layer weights: HiddenUnitNum x (InDim+1) floats.
#define size_W1Ex ((int)(HiddenUnitNum*(InDim+1)))
// Extended second-layer weights: OutDim x (HiddenUnitNum+1) floats.
#define size_W2Ex ((int)(OutDim*(HiddenUnitNum+1)))
// Forward declarations of the host functions defined below in this file.
int iDivUp(int, int);
void InitSample(int, int, int, int, float*, float*, float*, float*);
void train(float, int, int, int, int, int, float*, float*, float*, float*);
// NOTE(review): logsig is declared here but is neither defined nor called in
// the visible part of this file -- confirm whether it is still needed.
void logsig(float*, float*, float*, int, int);
/*
 * Writes `count` floats from `data` to the binary file `path`.
 * Failures (cannot open, short write, close error) are reported on stderr
 * instead of crashing, so the caller can still release its resources.
 */
static void SaveWeights(const char* path, const float* data, int count)
{
    FILE* fp = fopen(path, "wb");
    if (fp == NULL)
    {
        fprintf(stderr, "Cannot open %s for writing\n", path);
        return;
    }
    if (fwrite(data, sizeof(float), (size_t)count, fp) != (size_t)count)
    {
        fprintf(stderr, "Short write to %s\n", path);
    }
    if (fclose(fp) != 0)
    {
        fprintf(stderr, "Error closing %s\n", path);
    }
}

extern "C"
////////////////////////////////////////////////////////////////////////////////
// Program main
//
// Allocates pinned host buffers for the samples and both weight matrices,
// fills them via InitSample, trains on the GPU via train, then prints the
// trained weights and stores them in D:\0_W1Ex.dat and D:\0_W2Ex.dat.
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char** argv)
{
    // Training hyper-parameters and network dimensions.
    // The names below are required by the size_* macros.
    const float lr = 0.1f;
    const int MaxEpochs = 20000;
    const int HiddenUnitNum = 10;
    const int InDim = 2;
    const int OutDim = 3;
    const int SamNum = 200;

    float* h_SamInEx = NULL;
    float* h_SamOut = NULL;
    float* h_W1Ex = NULL;
    float* h_W2Ex = NULL;

    CUT_DEVICE_INIT();
    if (cublasInit() != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "cublasInit failed\n");
        return EXIT_FAILURE;
    }

    // Page-locked host memory for all buffers exchanged with the device.
    CUDA_SAFE_CALL( cudaMallocHost((void**) &h_SamInEx, sizeof(float)*size_SamInEx));
    CUDA_SAFE_CALL( cudaMallocHost((void**) &h_SamOut, sizeof(float)*size_SamOut));
    CUDA_SAFE_CALL( cudaMallocHost((void**) &h_W1Ex, sizeof(float)*size_W1Ex));
    CUDA_SAFE_CALL( cudaMallocHost((void**) &h_W2Ex, sizeof(float)*size_W2Ex));

    InitSample(SamNum, InDim, OutDim, HiddenUnitNum, h_SamInEx, h_SamOut, h_W1Ex, h_W2Ex);
    train(lr, MaxEpochs, HiddenUnitNum, InDim, OutDim, SamNum, h_SamInEx, h_SamOut, h_W1Ex, h_W2Ex);

    // Dump the trained weights to stdout.
    printf("\n");
    for (int i = 0; i < size_W1Ex; i++)
    {
        printf("%3.3f ", h_W1Ex[i]);
    }
    printf("\n");
    for (int i = 0; i < size_W2Ex; i++)
    {
        printf("%3.3f ", h_W2Ex[i]);
    }

    // Persist the trained weights as raw float arrays.
    SaveWeights("D:\\0_W1Ex.dat", h_W1Ex, size_W1Ex);
    SaveWeights("D:\\0_W2Ex.dat", h_W2Ex, size_W2Ex);

    CUDA_SAFE_CALL( cudaFreeHost((h_SamInEx)));
    CUDA_SAFE_CALL( cudaFreeHost((h_SamOut)));
    CUDA_SAFE_CALL( cudaFreeHost((h_W1Ex)));
    CUDA_SAFE_CALL( cudaFreeHost((h_W2Ex)));
    h_SamInEx = NULL;
    h_SamOut = NULL;
    h_W1Ex = NULL;
    h_W2Ex = NULL;

    cublasShutdown();
    CUT_EXIT(argc, argv);
}
/*
 * Returns a / b, incremented by one when the division leaves a remainder.
 * For non-negative a and positive b this is the quotient rounded up.
 */
int
iDivUp(int a, int b){
    const int quotient = a / b;
    const int remainder = a % b;
    if (remainder == 0)
    {
        return quotient;
    }
    return quotient + 1;
}
/*
 * Reads rowCount records of colCount floats each from the binary file `path`
 * and stores record i, field j at dst[j*rowCount + i].
 * Prints a message to stderr and terminates the program if the file cannot be
 * opened or holds fewer values than requested.
 */
static void ReadSamplesOrDie(const char* path, float* dst, int rowCount, int colCount)
{
    FILE* fp = fopen(path, "rb");
    if (fp == NULL)
    {
        fprintf(stderr, "InitSample: cannot open %s\n", path);
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < rowCount; i++)
    {
        for (int j = 0; j < colCount; j++)
        {
            if (fread(&dst[j*rowCount + i], sizeof(float), 1, fp) != 1)
            {
                fprintf(stderr, "InitSample: %s ended early at record %d, field %d\n", path, i, j);
                fclose(fp);
                exit(EXIT_FAILURE);
            }
        }
    }
    fclose(fp);
}

/*
 * Fills the host buffers used for training.
 *
 *   h_SamInEx : size_SamInEx floats. Every element is first set to 1.0f, then
 *               the first InDim columns (SamNum rows, element (i, j) at
 *               [j*SamNum + i]) are overwritten from D:\samin.dat, so the
 *               last column keeps the value 1.0f.
 *   h_SamOut  : size_SamOut floats read from D:\samout.dat with the same
 *               layout (SamNum rows, OutDim columns).
 *   h_W1Ex, h_W2Ex : size_W1Ex / size_W2Ex floats, each drawn from rand()
 *               and scaled into [-0.1, 0.1].
 *
 * The loaded input buffer is echoed to stdout. The program terminates with an
 * error message if either sample file is missing or too short.
 * Parameter names must stay as they are: the size_* macros refer to them.
 */
void InitSample(int SamNum, int InDim, int OutDim, int HiddenUnitNum, float* h_SamInEx, float* h_SamOut, float* h_W1Ex,float* h_W2Ex)
{
    srand((unsigned int)clock());

    // Pre-fill with ones; only the first InDim columns are replaced below.
    for (int i = 0; i < size_SamInEx; i++)
    {
        h_SamInEx[i] = 1.0f;
    }
    ReadSamplesOrDie("D:\\samin.dat", h_SamInEx, SamNum, InDim);

    for (int i = 0; i < size_SamInEx; i++)
    {
        printf("%3.4f\t", h_SamInEx[i]);
    }
    printf("\n");

    ReadSamplesOrDie("D:\\samout.dat", h_SamOut, SamNum, OutDim);
    /* Disabled debug dump of the target outputs:
    for(int i=0; i <size_SamOut; i++)
    {
    printf("%3.5f\t",h_SamOut[i]);
    }*/
    printf("\n");

    // Random initial weights in [-0.1, 0.1].
    for (int i = 0; i < size_W1Ex; i++)
    {
        h_W1Ex[i] = 0.2f*rand()/(float)RAND_MAX - 0.1f;
    }
    for (int i = 0; i < size_W2Ex; i++)
    {
        h_W2Ex[i] = 0.2f*rand()/(float)RAND_MAX - 0.1f;
    }

    /* Disabled alternative: load the initial weights from files instead of
       using random values.
    p = fopen("D:\\W1Ex.dat", "rb");
    for(int i=0; i <(InDim+1); i++)
    {
    for(int j=0; j <HiddenUnitNum;j++)
    {
    fread(&h_W1Ex[j*(InDim+1)+i],sizeof(float),1,p);
    }
    }
    for(int i=0; i <size_W1Ex; i++)
    {
    printf("%3.3f ",h_W1Ex[i]);
    }
    fclose(p);
    printf("\n");
    p = fopen("D:\\W2Ex.dat", "rb");
    for(int i=0; i <(HiddenUnitNum+1); i++)
    {
    for(int j=0; j <OutDim;j++)
    {
    fread(&h_W2Ex[j*(HiddenUnitNum+1)+i],sizeof(float),1,p);
    }
    }
    for(int i=0; i <size_W2Ex; i++)
    {
    printf("%3.3f ",h_W2Ex[i]);
    }
    fclose(p);
    */
    printf("\n");
}
/*
 * Runs MaxEpochs iterations of batch training on the GPU and copies the
 * resulting weights back into h_W1Ex / h_W2Ex.
 *
 *   lr            : factor applied to each weight update.
 *   MaxEpochs     : number of training iterations.
 *   HiddenUnitNum, InDim, OutDim, SamNum : network / data dimensions; the
 *                   names are required by the size_* macros.
 *   h_SamInEx     : SamNum x (InDim+1) input matrix (host, column-major).
 *   h_SamOut      : SamNum x OutDim target matrix (host, column-major).
 *   h_W1Ex        : (InDim+1) x HiddenUnitNum weights, read and overwritten.
 *   h_W2Ex        : (HiddenUnitNum+1) x OutDim weights, read and overwritten.
 *
 * All matrices are passed to cuBLAS, which uses column-major storage.
 * The kernels logsig1, logsig2, dotsub and getdelta and the constant
 * BLOCK_SIZE come from nerve_kernel.cu; their exact behavior is not visible
 * here (presumably sigmoid activation, element-wise difference and delta
 * scaling -- confirm against the kernel file).
 * Prints the elapsed time of the training loop in milliseconds.
 */
void
train(float lr, int MaxEpochs, int HiddenUnitNum, int InDim,int OutDim,int SamNum, float* h_SamInEx, float* h_SamOut, float* h_W1Ex, float* h_W2Ex)
{
    float* d_W1Ex = NULL;
    float* d_W2Ex = NULL;
    float* d_W2 = NULL;
    float* d_SamInEx = NULL;
    float* d_SamOut = NULL;
    float* HiddenOutEx = NULL;
    float* NetworkOut = NULL;
    float* Delta1 = NULL;
    float* Delta2 = NULL;
    unsigned int timer = 0;

    // 2D launch configuration: x covers the samples, y covers the units.
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 HiddenOutExgrid(iDivUp(SamNum, BLOCK_SIZE), iDivUp((HiddenUnitNum+1), BLOCK_SIZE));
    dim3 HiddenOutgrid(iDivUp(SamNum, BLOCK_SIZE), iDivUp((HiddenUnitNum), BLOCK_SIZE));
    dim3 SamOutgrid(iDivUp(SamNum, BLOCK_SIZE), iDivUp(OutDim, BLOCK_SIZE));

    CUDA_SAFE_CALL(cudaMalloc((void**) &d_W1Ex, sizeof(float)*size_W1Ex));
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_W2Ex, sizeof(float)*size_W2Ex));
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_W2, sizeof(float)*OutDim*HiddenUnitNum));
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_SamInEx, sizeof(float)*size_SamInEx));
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_SamOut, sizeof(float)*size_SamOut));
    CUDA_SAFE_CALL(cudaMalloc((void**) &HiddenOutEx, sizeof(float)*(HiddenUnitNum+1)*SamNum));
    CUDA_SAFE_CALL(cudaMalloc((void**) &NetworkOut, sizeof(float)*size_SamOut));
    CUDA_SAFE_CALL(cudaMalloc((void**) &Delta1, sizeof(float)*HiddenUnitNum*SamNum));
    CUDA_SAFE_CALL(cudaMalloc((void**) &Delta2, sizeof(float)*size_SamOut));

    CUDA_SAFE_CALL(cudaMemcpy(d_W1Ex, h_W1Ex, sizeof(float)*size_W1Ex,cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_W2Ex, h_W2Ex, sizeof(float)*size_W2Ex,cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_SamInEx, h_SamInEx, sizeof(float)*size_SamInEx,cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_SamOut, h_SamOut, sizeof(float)*size_SamOut,cudaMemcpyHostToDevice));

    CUT_SAFE_CALL( cutCreateTimer( &timer));
    CUT_SAFE_CALL( cutStartTimer( timer));
    for (int l = 0; l < MaxEpochs; l++)
    {
        // Snapshot the first HiddenUnitNum rows of every column of d_W2Ex
        // into the densely packed d_W2. d_W2Ex is updated further down before
        // Delta1 is computed, so Delta1 must use this pre-update copy.
        for (int i = 0; i < OutDim; i++)
        {
            CUDA_SAFE_CALL(cudaMemcpy(d_W2 + HiddenUnitNum * i, d_W2Ex + (HiddenUnitNum + 1) * i, sizeof(float)*HiddenUnitNum,cudaMemcpyDeviceToDevice));
        }

        // Forward pass, hidden layer:
        // HiddenOutEx[:, 0..HiddenUnitNum) = SamInEx * W1Ex
        cublasSgemm('n','n',SamNum, HiddenUnitNum, (InDim+1), 1.0f, d_SamInEx, SamNum, d_W1Ex, (InDim+1), 0.0f, HiddenOutEx, SamNum );
        logsig1<<<HiddenOutExgrid, threads>>>(HiddenOutEx, SamNum, HiddenUnitNum);

        // Forward pass, output layer: NetworkOut = HiddenOutEx * W2Ex
        cublasSgemm('n','n',SamNum, OutDim, (HiddenUnitNum+1), 1.0f, HiddenOutEx, SamNum, d_W2Ex, (HiddenUnitNum+1), 0.0f, NetworkOut, SamNum );
        logsig2<<<SamOutgrid, threads>>>(NetworkOut, SamNum, OutDim);

        // Output-layer delta from the targets and the network output.
        dotsub<<<SamOutgrid, threads>>>(Delta2, d_SamOut, NetworkOut, SamNum, OutDim);
        getdelta<<<SamOutgrid, threads>>>(Delta2, NetworkOut, SamNum, OutDim);

        // W2Ex += lr * HiddenOutEx^T * Delta2
        cublasSgemm('t','n',(HiddenUnitNum+1), OutDim, SamNum, lr, HiddenOutEx, SamNum, Delta2, SamNum, 1.0f, d_W2Ex, (HiddenUnitNum+1) );

        // Hidden-layer delta: Delta1 = Delta2 * W2^T (pre-update weights)
        cublasSgemm('n','t', SamNum, HiddenUnitNum, OutDim, 1.0f, Delta2, SamNum, d_W2, HiddenUnitNum, 0.0f, Delta1, SamNum );
        getdelta<<<HiddenOutgrid, threads>>>(Delta1, HiddenOutEx, SamNum, HiddenUnitNum);

        // W1Ex += lr * SamInEx^T * Delta1
        cublasSgemm('t','n', (InDim+1), HiddenUnitNum, SamNum, lr, d_SamInEx, SamNum, Delta1, SamNum, 1.0f, d_W1Ex, (InDim+1));
    }

    // Kernel launches return before the device work finishes. Wait for the
    // device so the timer covers the actual computation; this also reports
    // any error raised by the kernels.
    CUDA_SAFE_CALL(cudaThreadSynchronize());
    CUT_SAFE_CALL( cutStopTimer( timer));

    // The legacy cuBLAS calls above return void; query the recorded status.
    if (cublasGetError() != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "train: a cuBLAS call reported an error\n");
    }

    printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));
    CUT_SAFE_CALL( cutDeleteTimer( timer));

    CUDA_SAFE_CALL(cudaMemcpy(h_W1Ex, d_W1Ex, sizeof(float)*size_W1Ex,cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaMemcpy(h_W2Ex, d_W2Ex, sizeof(float)*size_W2Ex,cudaMemcpyDeviceToHost));

    // Release every device buffer exactly once.
    CUDA_SAFE_CALL(cudaFree(d_SamInEx));
    CUDA_SAFE_CALL(cudaFree(d_SamOut));
    CUDA_SAFE_CALL(cudaFree(d_W1Ex));
    CUDA_SAFE_CALL(cudaFree(d_W2Ex));
    CUDA_SAFE_CALL(cudaFree(d_W2));
    CUDA_SAFE_CALL(cudaFree(HiddenOutEx));
    CUDA_SAFE_CALL(cudaFree(NetworkOut));
    CUDA_SAFE_CALL(cudaFree(Delta1));
    CUDA_SAFE_CALL(cudaFree(Delta2));
}