I ported this piece of code to CUDA:
if(_layersCount > 1)
{
    for(int i = _layersCount - 2; i >= 0; i--)
    {
        for(int j = 0; j < _neuronsPerLayerCount[i]; j++) // cuda kernel
        {
            localGradients[indexByLayerAndNeuron(i, j)] = 0;

            for(int k = 0; k < _neuronsPerLayerCount[i+1]; k++)
            {
                localGradients[indexByLayerAndNeuron(i, j)] +=
                    _neuronsInputsWeights[indexByLayerNeuronAndInput(i+1, k, j)]
                    * localGradients[indexByLayerAndNeuron(i+1, k)];
            }

            localGradients[indexByLayerAndNeuron(i, j)] *= derivatives[indexByLayerAndNeuron(i, j)];
        }
    }
}
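The indexing helpers are essentially prefix-sum lookups; a minimal sketch of what they do (the exact implementations are assumed here, inferred from how the kernel arguments below are computed):

// Sketch (assumed implementations): flat indices are built from per-layer
// prefix sums, which is also how the arguments passed to the kernel are formed.
int indexByLayerAndNeuron(int layer, int neuron)
{
    return _neuronsInPreviousLayers[layer] + neuron;
}

int indexByLayerNeuronAndInput(int layer, int neuron, int input)
{
    return _inputsInPreviousLayers[layer] + neuron * _inputsInCurrentLayer[layer] + input;
}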
The resulting host-side code:
if(_layersCount > 1)
{
    for(int i = _layersCount - 2; i >= 0; i--)
    {
        // calculateLocalGradientsForAnotherLayers
        blocksCount = floor((double) _neuronsPerLayerCount[i] / threads.x) + 1;
        blocks = dim3(blocksCount, 1);

        calculateLocalGradientsForAnotherLayers<<<blocks, threads>>>(
            deviceLocalGradients,
            _neuronsInputsWeights,
            deviceDerivatives,
            _neuronsPerLayerCount[i],
            _neuronsInPreviousLayers[i],
            _neuronsInPreviousLayers[i+1],
            _neuronsPerLayerCount[i+1],
            _inputsInPreviousLayers[i],
            _inputsInCurrentLayer[i]);
    }
}
Kernel calculateLocalGradientsForAnotherLayers:
__global__ void calculateLocalGradientsForAnotherLayers(double * localGradients,
                                                        double * neuronsInputsWeights,
                                                        double * derivatives,
                                                        int neuronsCount,
                                                        int neuronsInPreviousLayers,
                                                        int neuronsInPreviousLayersWithCurrent,
                                                        int neuronsInNextLayer,
                                                        int inputsInPreviousLayers,
                                                        int inputsInCurrentLayer)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if(idx < neuronsCount)
    {
        int neuron = neuronsInPreviousLayers + idx;

        localGradients[neuron] = 0;

        // TODO: move this loop into a kernel of its own, then reduce localGradients.
        for(int k = 0; k < neuronsInNextLayer; k++)
        {
            localGradients[neuron] += neuronsInputsWeights[inputsInPreviousLayers + k*inputsInCurrentLayer + idx]
                * localGradients[neuronsInPreviousLayersWithCurrent + k];
        }

        localGradients[neuron] *= derivatives[neuron];
    }
}
But the GPU results differ from the CPU results starting at the second decimal place. What could be the problem? All kernels except this one work fine.
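To localize the mismatch I check the launch for errors and compare the arrays element by element. A minimal sketch of that host-side check (totalNeurons, deviceLocalGradients and the CPU-side localGradients are assumed to be in scope):

// Sketch of the host-side comparison (requires <cstdio>, <cmath>, <vector>,
// <cuda_runtime.h>). totalNeurons is assumed to be the total neuron count.
cudaError_t err = cudaDeviceSynchronize();   // flush and catch asynchronous kernel errors
if (err != cudaSuccess)
    printf("kernel failed: %s\n", cudaGetErrorString(err));

std::vector<double> gpuGradients(totalNeurons);
cudaMemcpy(gpuGradients.data(), deviceLocalGradients,
           totalNeurons * sizeof(double), cudaMemcpyDeviceToHost);

for (int n = 0; n < totalNeurons; n++)
{
    if (fabs(gpuGradients[n] - localGradients[n]) > 1e-9)
    {
        printf("first mismatch at neuron %d: cpu=%.12f gpu=%.12f\n",
               n, localGradients[n], gpuGradients[n]);
        break;
    }
}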
My card is a GF555M; it supports double precision.
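For completeness, this is how the device can be queried to confirm that (a small self-contained sketch). As far as I know, the code also has to be compiled for an architecture that actually has double support (compute capability 1.3 or higher, e.g. nvcc -arch=sm_20 for this card); otherwise nvcc demotes double to float and only emits a warning.

// Sketch: print the device's compute capability. Doubles need >= 1.3 both on
// the device and in the -arch the code is compiled for.
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("%s: compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    return 0;
}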