diff options
Diffstat (limited to 'cuda/2d/util.cu')
-rw-r--r-- | cuda/2d/util.cu | 244 |
1 files changed, 244 insertions, 0 deletions
diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu new file mode 100644 index 0000000..06f6714 --- /dev/null +++ b/cuda/2d/util.cu @@ -0,0 +1,244 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include "util.h" + +namespace astraCUDA { + +bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch, + unsigned int width, unsigned int height, + float* outD_data, unsigned int out_pitch) +{ + // TODO: a full memset isn't necessary. Only the edges. + cudaError_t err; + err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, sizeof(float)*(width+2), height+2); + ASTRA_CUDA_ASSERT(err); + err = cudaMemcpy2D(outD_data + out_pitch + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); + ASTRA_CUDA_ASSERT(err); + assert(err == cudaSuccess); + return true; +} + +bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch, + unsigned int width, unsigned int height, + float* inD_data, unsigned int in_pitch) +{ + cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + (in_pitch + 1), sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); + ASTRA_CUDA_ASSERT(err); + return true; +} + + +bool copySinogramFromDevice(float* out_data, unsigned int out_pitch, + unsigned int width, unsigned int height, + float* inD_data, unsigned int in_pitch) +{ + cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + 1, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); + ASTRA_CUDA_ASSERT(err); + return true; +} + +bool copySinogramToDevice(const float* in_data, unsigned int in_pitch, + unsigned int width, unsigned int height, + float* outD_data, unsigned int out_pitch) +{ + // TODO: a full memset isn't necessary. Only the edges. + cudaError_t err; + err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, (width+2)*sizeof(float), height); + ASTRA_CUDA_ASSERT(err); + err = cudaMemcpy2D(outD_data + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); + ASTRA_CUDA_ASSERT(err); + return true; +} + + +bool allocateVolume(float*& ptr, unsigned int width, unsigned int height, unsigned int& pitch) +{ + size_t p; + cudaError_t ret = cudaMallocPitch((void**)&ptr, &p, sizeof(float)*width, height); + if (ret != cudaSuccess) { + reportCudaError(ret); + fprintf(stderr, "Failed to allocate %dx%d GPU buffer\n", width, height); + return false; + } + + assert(p % sizeof(float) == 0); + + pitch = p / sizeof(float); + + return true; +} + +void zeroVolume(float* data, unsigned int pitch, unsigned int width, unsigned int height) +{ + cudaError_t err; + err = cudaMemset2D(data, sizeof(float)*pitch, 0, sizeof(float)*width, height); + ASTRA_CUDA_ASSERT(err); +} + + +template <unsigned int blockSize> +__global__ void reduce1D(float *g_idata, float *g_odata, unsigned int n) +{ + extern __shared__ float sdata[]; + unsigned int tid = threadIdx.x; + + unsigned int i = blockIdx.x*(blockSize*2) + tid; + unsigned int gridSize = blockSize*gridDim.x; + sdata[tid] = 0; + while (i < n) { sdata[tid] += g_idata[i]; i += gridSize; } + __syncthreads(); + if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); } + if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); } + if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); } + if (tid < 32) { + volatile float* smem = sdata; + if (blockSize >= 64) smem[tid] += smem[tid + 32]; + if (blockSize >= 32) smem[tid] += smem[tid + 16]; + if (blockSize >= 16) smem[tid] += smem[tid + 8]; + if (blockSize >= 8) smem[tid] += smem[tid + 4]; + if (blockSize >= 4) smem[tid] += smem[tid + 2]; + if (blockSize >= 2) smem[tid] += smem[tid + 1]; + } + if (tid == 0) g_odata[blockIdx.x] = sdata[0]; +} + +__global__ void reduce2D(float *g_idata, float *g_odata, + unsigned int pitch, + unsigned int nx, unsigned int ny, + unsigned int padX, unsigned int padY) +{ + extern __shared__ float sdata[]; + const unsigned int tidx = threadIdx.x; + const unsigned int tidy = threadIdx.y; + const unsigned int tid = tidy * 16 + tidx; + + unsigned int x = blockIdx.x*16 + tidx; + unsigned int y = blockIdx.y*16 + tidy; + + sdata[tid] = 0; + + if (x >= padX && x < padX + nx) { + + while (y < padY + ny) { + if (y >= padY) + sdata[tid] += (g_idata[pitch*y+x] * g_idata[pitch*y+x]); + y += 16 * gridDim.y; + } + + } + + __syncthreads(); + + if (tid < 128) + sdata[tid] += sdata[tid + 128]; + __syncthreads(); + + if (tid < 64) + sdata[tid] += sdata[tid + 64]; + __syncthreads(); + + if (tid < 32) { // 32 is warp size + volatile float* smem = sdata; + smem[tid] += smem[tid + 32]; + smem[tid] += smem[tid + 16]; + smem[tid] += smem[tid + 8]; + smem[tid] += smem[tid + 4]; + smem[tid] += smem[tid + 2]; + smem[tid] += smem[tid + 1]; + } + + if (tid == 0) + g_odata[blockIdx.y * gridDim.x + blockIdx.x] = sdata[0]; +} + +float dotProduct2D(float* D_data, unsigned int pitch, + unsigned int width, unsigned int height, + unsigned int padX, unsigned int padY) +{ + unsigned int bx = ((width+padX) + 15) / 16; + unsigned int by = ((height+padY) + 127) / 128; + unsigned int shared_mem2 = sizeof(float) * 16 * 16; + + dim3 dimBlock2(16, 16); + dim3 dimGrid2(bx, by); + + float* D_buf; + cudaMalloc(&D_buf, sizeof(float) * (bx * by + 1) ); + + // Step 1: reduce 2D from image to a single vector, taking sum of squares + + reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height, padX, padY); + cudaTextForceKernelsCompletion(); + + // Step 2: reduce 1D: add up elements in vector + if (bx * by > 512) + reduce1D<512><<< 1, 512, sizeof(float)*512>>>(D_buf, D_buf+(bx*by), bx*by); + else if (bx * by > 128) + reduce1D<128><<< 1, 128, sizeof(float)*128>>>(D_buf, D_buf+(bx*by), bx*by); + else if (bx * by > 32) + reduce1D<32><<< 1, 32, sizeof(float)*32*2>>>(D_buf, D_buf+(bx*by), bx*by); + else if (bx * by > 8) + reduce1D<8><<< 1, 8, sizeof(float)*8*2>>>(D_buf, D_buf+(bx*by), bx*by); + else + reduce1D<1><<< 1, 1, sizeof(float)*1*2>>>(D_buf, D_buf+(bx*by), bx*by); + + float x; + cudaMemcpy(&x, D_buf+(bx*by), 4, cudaMemcpyDeviceToHost); + + cudaTextForceKernelsCompletion(); + + cudaFree(D_buf); + + return x; +} + + +bool cudaTextForceKernelsCompletion() +{ + cudaError_t returnedCudaError = cudaThreadSynchronize(); + + if(returnedCudaError != cudaSuccess) { + fprintf(stderr, "Failed to force completion of cuda kernels: %d: %s.\n", returnedCudaError, cudaGetErrorString(returnedCudaError)); + return false; + } + + return true; +} + +void reportCudaError(cudaError_t err) +{ + if(err != cudaSuccess) + fprintf(stderr, "CUDA error %d: %s.\n", err, cudaGetErrorString(err)); +} + + + +} |