diff options
Diffstat (limited to 'cuda/3d/util3d.cu')
-rw-r--r-- | cuda/3d/util3d.cu | 514 |
1 files changed, 514 insertions, 0 deletions
diff --git a/cuda/3d/util3d.cu b/cuda/3d/util3d.cu new file mode 100644 index 0000000..81ea823 --- /dev/null +++ b/cuda/3d/util3d.cu @@ -0,0 +1,514 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include "util3d.h" +#include "../2d/util.h" + +namespace astraCUDA3d { + + +cudaPitchedPtr allocateVolumeData(const SDimensions3D& dims) +{ + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPitchedPtr volData; + + cudaError err = cudaMalloc3D(&volData, extentV); + if (err != cudaSuccess) { + astraCUDA::reportCudaError(err); + fprintf(stderr, "Failed to allocate %dx%dx%d GPU buffer\n", dims.iVolX, dims.iVolY, dims.iVolZ); + volData.ptr = 0; + // TODO: return 0 somehow? + } + + return volData; +} +cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims) +{ + cudaExtent extentP; + extentP.width = dims.iProjU*sizeof(float); + extentP.height = dims.iProjAngles; + extentP.depth = dims.iProjV; + + cudaPitchedPtr projData; + + cudaError err = cudaMalloc3D(&projData, extentP); + if (err != cudaSuccess) { + astraCUDA::reportCudaError(err); + fprintf(stderr, "Failed to allocate %dx%dx%d GPU buffer\n", dims.iProjU, dims.iProjAngles, dims.iProjV); + projData.ptr = 0; + // TODO: return 0 somehow? + } + + return projData; +} +bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims) +{ + char* t = (char*)D_data.ptr; + cudaError err; + + for (unsigned int z = 0; z < dims.iVolZ; ++z) { + err = cudaMemset2D(t, D_data.pitch, 0, dims.iVolX*sizeof(float), dims.iVolY); + ASTRA_CUDA_ASSERT(err); + t += D_data.pitch * dims.iVolY; + } + return true; +} +bool zeroProjectionData(cudaPitchedPtr& D_data, const SDimensions3D& dims) +{ + char* t = (char*)D_data.ptr; + cudaError err; + + for (unsigned int z = 0; z < dims.iProjV; ++z) { + err = cudaMemset2D(t, D_data.pitch, 0, dims.iProjU*sizeof(float), dims.iProjAngles); + ASTRA_CUDA_ASSERT(err); + t += D_data.pitch * dims.iProjAngles; + } + + return true; +} +bool copyVolumeToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch) +{ + if (!pitch) + pitch = dims.iVolX; + + cudaPitchedPtr ptr; + ptr.ptr = (void*)data; // const cast away + ptr.pitch = pitch*sizeof(float); + ptr.xsize = dims.iVolX*sizeof(float); + ptr.ysize = dims.iVolY; + + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = D_data; + p.extent = extentV; + p.kind = cudaMemcpyHostToDevice; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} + +bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch) +{ + if (!pitch) + pitch = dims.iProjU; + + cudaPitchedPtr ptr; + ptr.ptr = (void*)data; // const cast away + ptr.pitch = pitch*sizeof(float); + ptr.xsize = dims.iProjU*sizeof(float); + ptr.ysize = dims.iProjAngles; + + cudaExtent extentV; + extentV.width = dims.iProjU*sizeof(float); + extentV.height = dims.iProjAngles; + extentV.depth = dims.iProjV; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = D_data; + p.extent = extentV; + p.kind = cudaMemcpyHostToDevice; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} + +bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch) +{ + if (!pitch) + pitch = dims.iVolX; + + cudaPitchedPtr ptr; + ptr.ptr = data; + ptr.pitch = pitch*sizeof(float); + ptr.xsize = dims.iVolX*sizeof(float); + ptr.ysize = dims.iVolY; + + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_data; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = ptr; + p.extent = extentV; + p.kind = cudaMemcpyDeviceToHost; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} +bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch) +{ + if (!pitch) + pitch = dims.iProjU; + + cudaPitchedPtr ptr; + ptr.ptr = data; + ptr.pitch = pitch*sizeof(float); + ptr.xsize = dims.iProjU*sizeof(float); + ptr.ysize = dims.iProjAngles; + + cudaExtent extentV; + extentV.width = dims.iProjU*sizeof(float); + extentV.height = dims.iProjAngles; + extentV.depth = dims.iProjV; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_data; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = ptr; + p.extent = extentV; + p.kind = cudaMemcpyDeviceToHost; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} + +bool duplicateVolumeData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims) +{ + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_src; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = D_dst; + p.extent = extentV; + p.kind = cudaMemcpyDeviceToDevice; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} +bool duplicateProjectionData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims) +{ + cudaExtent extentV; + extentV.width = dims.iProjU*sizeof(float); + extentV.height = dims.iProjAngles; + extentV.depth = dims.iProjV; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_src; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = D_dst; + p.extent = extentV; + p.kind = cudaMemcpyDeviceToDevice; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} + + + +// TODO: Consider using a single array of size max(proj,volume) (per dim) +// instead of allocating a new one each time + +// TODO: Figure out a faster way of zeroing the padding? + +cudaArray* allocateVolumeArray(const SDimensions3D& dims) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + cudaArray* cuArray; + cudaExtent extentA; + extentA.width = dims.iVolX+2; + extentA.height = dims.iVolY+2; + extentA.depth = dims.iVolZ+2; + cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA); + if (err != cudaSuccess) { + astraCUDA::reportCudaError(err); + fprintf(stderr, "Failed to allocate %dx%dx%d GPU array\n", dims.iVolX, dims.iVolY, dims.iVolZ); + return 0; + } + + zeroVolumeArray(cuArray, dims); + + return cuArray; +} +cudaArray* allocateProjectionArray(const SDimensions3D& dims) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + cudaArray* cuArray; + cudaExtent extentA; + extentA.width = dims.iProjU+2; + extentA.height = dims.iProjAngles; + extentA.depth = dims.iProjV+2; + cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA); + + if (err != cudaSuccess) { + astraCUDA::reportCudaError(err); + fprintf(stderr, "Failed to allocate %dx%dx%d GPU array\n", dims.iProjU, dims.iProjAngles, dims.iProjV); + return 0; + } + + zeroProjectionArray(cuArray, dims); + + return cuArray; +} +bool zeroVolumeArray(cudaArray* array, const SDimensions3D& dims) +{ + cudaPitchedPtr zeroBuf; + cudaExtent extentS; + extentS.width = sizeof(float)*(dims.iVolX+2); + extentS.height = dims.iVolY+2; + extentS.depth = 1; + + cudaExtent extentA; + extentA.width = dims.iVolX+2; + extentA.height = dims.iVolY+2; + extentA.depth = 1; + + + + cudaError err; + err = cudaMalloc3D(&zeroBuf, extentS); + ASTRA_CUDA_ASSERT(err); + err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iVolX+2), dims.iVolY+2); + ASTRA_CUDA_ASSERT(err); + + // zero array + for (unsigned int i = 0; i < dims.iVolZ+2; ++i) { + cudaMemcpy3DParms p; + cudaPos zp = {0, 0, 0}; + cudaPos dp = {0, 0, i}; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = zeroBuf; + p.dstArray = array; + p.dstPtr.ptr = 0; + p.dstPtr.pitch = 0; + p.dstPtr.xsize = 0; + p.dstPtr.ysize = 0; + p.dstPos = dp; + p.extent = extentA; + p.kind = cudaMemcpyDeviceToDevice; + + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + } + cudaFree(zeroBuf.ptr); + + // TODO: check errors + + return true; +} +bool zeroProjectionArray(cudaArray* array, const SDimensions3D& dims) +{ + cudaPitchedPtr zeroBuf; + cudaExtent extentS; + extentS.width = sizeof(float)*(dims.iProjU+2); + extentS.height = dims.iProjAngles; + extentS.depth = 1; + cudaExtent extentA; + extentA.width = dims.iProjU+2; + extentA.height = dims.iProjAngles; + extentA.depth = 1; + + + cudaError err; + err = cudaMalloc3D(&zeroBuf, extentS); + ASTRA_CUDA_ASSERT(err); + err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iProjU+2), dims.iProjAngles); + ASTRA_CUDA_ASSERT(err); + + for (unsigned int i = 0; i < dims.iProjV+2; ++i) { + cudaMemcpy3DParms p; + cudaPos zp = {0, 0, 0}; + cudaPos dp = {0, 0, i}; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = zeroBuf; + p.dstArray = array; + p.dstPtr.ptr = 0; + p.dstPtr.pitch = 0; + p.dstPtr.xsize = 0; + p.dstPtr.ysize = 0; + p.dstPos = dp; + p.extent = extentA; + p.kind = cudaMemcpyDeviceToDevice; + + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + } + cudaFree(zeroBuf.ptr); + + // TODO: check errors + return true; +} + + +bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const SDimensions3D& dims) +{ + cudaExtent extentA; + extentA.width = dims.iVolX; + extentA.height = dims.iVolY; + extentA.depth = dims.iVolZ; + + cudaMemcpy3DParms p; + cudaPos zp = {0, 0, 0}; + cudaPos dp = {1, 1, 1}; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_volumeData; + p.dstArray = array; + p.dstPtr.ptr = 0; + p.dstPtr.pitch = 0; + p.dstPtr.xsize = 0; + p.dstPtr.ysize = 0; + p.dstPos = dp; + p.extent = extentA; + p.kind = cudaMemcpyDeviceToDevice; + + cudaError err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + // TODO: check errors + + return true; +} +bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, const SDimensions3D& dims) +{ + cudaExtent extentA; + extentA.width = dims.iProjU; + extentA.height = dims.iProjAngles; + extentA.depth = dims.iProjV; + + cudaMemcpy3DParms p; + cudaPos zp = {0, 0, 0}; + cudaPos dp = {1, 0, 1}; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_projData; + p.dstArray = array; + p.dstPtr.ptr = 0; + p.dstPtr.pitch = 0; + p.dstPtr.xsize = 0; + p.dstPtr.ysize = 0; + p.dstPos = dp; + p.extent = extentA; + p.kind = cudaMemcpyDeviceToDevice; + + cudaError err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + // TODO: check errors + + return true; +} + + +float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y, + unsigned int z) +{ + return astraCUDA::dotProduct2D((float*)data.ptr, data.pitch/sizeof(float), x, y*z, 0, 0); +} + + +bool cudaTextForceKernelsCompletion() +{ + cudaError_t returnedCudaError = cudaThreadSynchronize(); + + if(returnedCudaError != cudaSuccess) { + fprintf(stderr, "Failed to force completion of cuda kernels: %d: %s.\n", returnedCudaError, cudaGetErrorString(returnedCudaError)); + return false; + } + + return true; +} + +int calcNextPowerOfTwo(int _iValue) +{ + int iOutput = 1; + while(iOutput < _iValue) + iOutput *= 2; + return iOutput; +} + +} |