diff options
Diffstat (limited to 'cuda/3d')
-rw-r--r-- | cuda/3d/algo3d.cu | 108 | ||||
-rw-r--r-- | cuda/3d/algo3d.h | 68 | ||||
-rw-r--r-- | cuda/3d/arith3d.cu | 610 | ||||
-rw-r--r-- | cuda/3d/arith3d.h | 79 | ||||
-rw-r--r-- | cuda/3d/astra3d.cu | 1620 | ||||
-rw-r--r-- | cuda/3d/astra3d.h | 450 | ||||
-rw-r--r-- | cuda/3d/cgls3d.cu | 428 | ||||
-rw-r--r-- | cuda/3d/cgls3d.h | 114 | ||||
-rw-r--r-- | cuda/3d/cone_bp.cu | 481 | ||||
-rw-r--r-- | cuda/3d/cone_bp.h | 45 | ||||
-rw-r--r-- | cuda/3d/cone_fp.cu | 513 | ||||
-rw-r--r-- | cuda/3d/cone_fp.h | 46 | ||||
-rw-r--r-- | cuda/3d/darthelper3d.cu | 229 | ||||
-rw-r--r-- | cuda/3d/darthelper3d.h | 46 | ||||
-rw-r--r-- | cuda/3d/dims3d.h | 84 | ||||
-rw-r--r-- | cuda/3d/fdk.cu | 646 | ||||
-rw-r--r-- | cuda/3d/fdk.h | 43 | ||||
-rw-r--r-- | cuda/3d/par3d_bp.cu | 464 | ||||
-rw-r--r-- | cuda/3d/par3d_bp.h | 45 | ||||
-rw-r--r-- | cuda/3d/par3d_fp.cu | 814 | ||||
-rw-r--r-- | cuda/3d/par3d_fp.h | 51 | ||||
-rw-r--r-- | cuda/3d/sirt3d.cu | 533 | ||||
-rw-r--r-- | cuda/3d/sirt3d.h | 118 | ||||
-rw-r--r-- | cuda/3d/util3d.cu | 514 | ||||
-rw-r--r-- | cuda/3d/util3d.h | 69 |
25 files changed, 8218 insertions, 0 deletions
diff --git a/cuda/3d/algo3d.cu b/cuda/3d/algo3d.cu new file mode 100644 index 0000000..20e7381 --- /dev/null +++ b/cuda/3d/algo3d.cu @@ -0,0 +1,108 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cassert> + +#include "algo3d.h" +#include "cone_fp.h" +#include "cone_bp.h" +#include "par3d_fp.h" +#include "par3d_bp.h" + +namespace astraCUDA3d { + +ReconAlgo3D::ReconAlgo3D() +{ + coneProjs = 0; + par3DProjs = 0; + shouldAbort = false; +} + +ReconAlgo3D::~ReconAlgo3D() +{ + reset(); +} + +void ReconAlgo3D::reset() +{ + delete[] coneProjs; + coneProjs = 0; + delete[] par3DProjs; + par3DProjs = 0; + shouldAbort = false; +} + +bool ReconAlgo3D::setConeGeometry(const SDimensions3D& _dims, const SConeProjection* _angles) +{ + dims = _dims; + + coneProjs = new SConeProjection[dims.iProjAngles]; + par3DProjs = 0; + + memcpy(coneProjs, _angles, sizeof(coneProjs[0]) * dims.iProjAngles); + + return true; +} + +bool ReconAlgo3D::setPar3DGeometry(const SDimensions3D& _dims, const SPar3DProjection* _angles) +{ + dims = _dims; + + par3DProjs = new SPar3DProjection[dims.iProjAngles]; + coneProjs = 0; + + memcpy(par3DProjs, _angles, sizeof(par3DProjs[0]) * dims.iProjAngles); + + return true; +} + + +bool ReconAlgo3D::callFP(cudaPitchedPtr& D_volumeData, + cudaPitchedPtr& D_projData, + float outputScale) +{ + if (coneProjs) { + return ConeFP(D_volumeData, D_projData, dims, coneProjs, outputScale); + } else { + return Par3DFP(D_volumeData, D_projData, dims, par3DProjs, outputScale); + } +} + +bool ReconAlgo3D::callBP(cudaPitchedPtr& D_volumeData, + cudaPitchedPtr& D_projData) +{ + if (coneProjs) { + return ConeBP(D_volumeData, D_projData, dims, coneProjs); + } else { + return Par3DBP(D_volumeData, D_projData, dims, par3DProjs); + } +} + + + +} diff --git a/cuda/3d/algo3d.h b/cuda/3d/algo3d.h new file mode 100644 index 0000000..2b44f6f --- /dev/null +++ b/cuda/3d/algo3d.h @@ -0,0 +1,68 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_ALGO_H +#define _CUDA_ALGO_H + +#include "dims3d.h" +#include "util3d.h" + +namespace astraCUDA3d { + +class _AstraExport ReconAlgo3D { +public: + ReconAlgo3D(); + ~ReconAlgo3D(); + + bool setConeGeometry(const SDimensions3D& dims, const SConeProjection* projs); + bool setPar3DGeometry(const SDimensions3D& dims, const SPar3DProjection* projs); + + void signalAbort() { shouldAbort = true; } + +protected: + void reset(); + + bool callFP(cudaPitchedPtr& D_volumeData, + cudaPitchedPtr& D_projData, + float outputScale); + bool callBP(cudaPitchedPtr& D_volumeData, + cudaPitchedPtr& D_projData); + + SDimensions3D dims; + SConeProjection* coneProjs; + SPar3DProjection* par3DProjs; + + volatile bool shouldAbort; + +}; + + +} + +#endif + diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu new file mode 100644 index 0000000..9a19be0 --- /dev/null +++ b/cuda/3d/arith3d.cu @@ -0,0 +1,610 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include "util3d.h" +#include "arith3d.h" +#include <cassert> + +namespace astraCUDA3d { + +struct opAddScaled { + __device__ void operator()(float& out, const float in, const float inp) { + out += in * inp; + } +}; +struct opScaleAndAdd { + __device__ void operator()(float& out, const float in, const float inp) { + out = in + out * inp; + } +}; +struct opAddMulScaled { + __device__ void operator()(float& out, const float in1, const float in2, const float inp) { + out += in1 * in2 * inp; + } +}; +struct opAddMul { + __device__ void operator()(float& out, const float in1, const float in2) { + out += in1 * in2; + } +}; +struct opMul { + __device__ void operator()(float& out, const float in) { + out *= in; + } +}; +struct opMul2 { + __device__ void operator()(float& out, const float in1, const float in2) { + out *= in1 * in2; + } +}; +struct opDividedBy { + __device__ void operator()(float& out, const float in) { + if (out > 0.000001f) // out is assumed to be positive + out = in / out; + else + out = 0.0f; + } +}; +struct opInvert { + __device__ void operator()(float& out) { + if (out > 0.000001f) // out is assumed to be positive + out = 1 / out; + else + out = 0.0f; + } +}; +struct opSet { + __device__ void operator()(float& out, const float inp) { + out = inp; + } +}; +struct opClampMin { + __device__ void operator()(float& out, const float inp) { + if (out < inp) + out = inp; + } +}; +struct opClampMax { + __device__ void operator()(float& out, const float inp) { + if (out > inp) + out = inp; + } +}; + + + + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off]); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], fParam); + off += pitch; + y++; + } +} + + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn[off]); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn[off], fParam); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn1[off], pfIn2[off]); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn1[off], pfIn2[off], fParam); + off += pitch; + y++; + } +} + + + + + + + + + +template<typename op, VolType t> +void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+511)/512); + + float *pfOut = (float*)out; + + devtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + float *pfOut = (float*)out; + + devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + float *pfOut = (float*)out; + const float *pfIn = (const float*)in; + + devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + float *pfOut = (float*)out; + const float *pfIn = (const float*)in; + + devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + float *pfOut = (float*)out; + const float *pfIn1 = (const float*)in1; + const float *pfIn2 = (const float*)in2; + + devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + float *pfOut = (float*)out; + const float *pfIn1 = (const float*)in1; + const float *pfIn2 = (const float*)in2; + + devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + + + + + + + + + + + + + + + + + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + + + + + + + + + + + + + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + + + + + + + + + + + + + + + + + + +#define INST_DFtoD(name) \ + template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); + +#define INST_DtoD(name) \ + template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); + +#define INST_DDtoD(name) \ + template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); + +#define INST_DDFtoD(name) \ + template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); + + +#define INST_toD(name) \ + template void processVol<name, VOL>(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); + +#define INST_FtoD(name) \ + template void processVol<name, VOL>(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); + + + +INST_DFtoD(opAddScaled) +INST_DFtoD(opScaleAndAdd) +INST_DDFtoD(opAddMulScaled) +INST_DDtoD(opAddMul) +INST_DDtoD(opMul2) +INST_DtoD(opMul) +INST_DtoD(opDividedBy) +INST_toD(opInvert) +INST_FtoD(opSet) +INST_FtoD(opClampMin) +INST_FtoD(opClampMax) + + +} diff --git a/cuda/3d/arith3d.h b/cuda/3d/arith3d.h new file mode 100644 index 0000000..53c9b79 --- /dev/null +++ b/cuda/3d/arith3d.h @@ -0,0 +1,79 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_ARITH3D_H +#define _CUDA_ARITH3D_H + +#include <cuda.h> + +namespace astraCUDA3d { + +struct opAddScaled; +struct opScaleAndAdd; +struct opAddMulScaled; +struct opAddMul; +struct opMul; +struct opMul2; +struct opDividedBy; +struct opInvert; +struct opSet; +struct opClampMin; +struct opClampMax; + +enum VolType { + SINO = 0, + VOL = 1 +}; + + +template<typename op, VolType t> void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); + +template<typename op> void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); + +template<typename op> void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); + + + +} + +#endif diff --git a/cuda/3d/astra3d.cu b/cuda/3d/astra3d.cu new file mode 100644 index 0000000..fd4b370 --- /dev/null +++ b/cuda/3d/astra3d.cu @@ -0,0 +1,1620 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> + +#include "cgls3d.h" +#include "sirt3d.h" +#include "util3d.h" +#include "cone_fp.h" +#include "cone_bp.h" +#include "par3d_fp.h" +#include "par3d_bp.h" +#include "fdk.h" +#include "arith3d.h" +#include "astra3d.h" + +#include <iostream> + +using namespace astraCUDA3d; + +namespace astra { + +enum CUDAProjectionType3d { + PROJ_PARALLEL, + PROJ_CONE +}; + + +static SConeProjection* genConeProjections(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + double fOriginSourceDistance, + double fOriginDetectorDistance, + double fDetUSize, + double fDetVSize, + const float *pfAngles) +{ + SConeProjection base; + base.fSrcX = 0.0f; + base.fSrcY = -fOriginSourceDistance; + base.fSrcZ = 0.0f; + + base.fDetSX = iProjU * fDetUSize * -0.5f; + base.fDetSY = fOriginDetectorDistance; + base.fDetSZ = iProjV * fDetVSize * -0.5f; + + base.fDetUX = fDetUSize; + base.fDetUY = 0.0f; + base.fDetUZ = 0.0f; + + base.fDetVX = 0.0f; + base.fDetVY = 0.0f; + base.fDetVZ = fDetVSize; + + SConeProjection* p = new SConeProjection[iProjAngles]; + +#define ROTATE0(name,i,alpha) do { p[i].f##name##X = base.f##name##X * cos(alpha) - base.f##name##Y * sin(alpha); p[i].f##name##Y = base.f##name##X * sin(alpha) + base.f##name##Y * cos(alpha); p[i].f##name##Z = base.f##name##Z; } while(0) + + for (unsigned int i = 0; i < iProjAngles; ++i) { + ROTATE0(Src, i, pfAngles[i]); + ROTATE0(DetS, i, pfAngles[i]); + ROTATE0(DetU, i, pfAngles[i]); + ROTATE0(DetV, i, pfAngles[i]); + } + +#undef ROTATE0 + + return p; +} + +static SPar3DProjection* genPar3DProjections(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + double fDetUSize, + double fDetVSize, + const float *pfAngles) +{ + SPar3DProjection base; + base.fRayX = 0.0f; + base.fRayY = 1.0f; + base.fRayZ = 0.0f; + + base.fDetSX = iProjU * fDetUSize * -0.5f; + base.fDetSY = 0.0f; + base.fDetSZ = iProjV * fDetVSize * -0.5f; + + base.fDetUX = fDetUSize; + base.fDetUY = 0.0f; + base.fDetUZ = 0.0f; + + base.fDetVX = 0.0f; + base.fDetVY = 0.0f; + base.fDetVZ = fDetVSize; + + SPar3DProjection* p = new SPar3DProjection[iProjAngles]; + +#define ROTATE0(name,i,alpha) do { p[i].f##name##X = base.f##name##X * cos(alpha) - base.f##name##Y * sin(alpha); p[i].f##name##Y = base.f##name##X * sin(alpha) + base.f##name##Y * cos(alpha); p[i].f##name##Z = base.f##name##Z; } while(0) + + for (unsigned int i = 0; i < iProjAngles; ++i) { + ROTATE0(Ray, i, pfAngles[i]); + ROTATE0(DetS, i, pfAngles[i]); + ROTATE0(DetU, i, pfAngles[i]); + ROTATE0(DetV, i, pfAngles[i]); + } + +#undef ROTATE0 + + return p; +} + + + + +class AstraSIRT3d_internal { +public: + SDimensions3D dims; + CUDAProjectionType3d projType; + + float* angles; + float fOriginSourceDistance; + float fOriginDetectorDistance; + float fSourceZ; + float fDetSize; + + SConeProjection* projs; + SPar3DProjection* parprojs; + + float fPixelSize; + + bool initialized; + bool setStartReconstruction; + + bool useVolumeMask; + bool useSinogramMask; + + // Input/output + cudaPitchedPtr D_projData; + cudaPitchedPtr D_volumeData; + cudaPitchedPtr D_maskData; + cudaPitchedPtr D_smaskData; + + SIRT sirt; +}; + +AstraSIRT3d::AstraSIRT3d() +{ + pData = new AstraSIRT3d_internal(); + + pData->angles = 0; + pData->D_projData.ptr = 0; + pData->D_volumeData.ptr = 0; + pData->D_maskData.ptr = 0; + pData->D_smaskData.ptr = 0; + + pData->dims.iVolX = 0; + pData->dims.iVolY = 0; + pData->dims.iVolZ = 0; + pData->dims.iProjAngles = 0; + pData->dims.iProjU = 0; + pData->dims.iProjV = 0; + pData->dims.iRaysPerDetDim = 1; + pData->dims.iRaysPerVoxelDim = 1; + + pData->projs = 0; + + pData->initialized = false; + pData->setStartReconstruction = false; + + pData->useVolumeMask = false; + pData->useSinogramMask = false; +} + +AstraSIRT3d::~AstraSIRT3d() +{ + delete[] pData->angles; + pData->angles = 0; + + delete[] pData->projs; + pData->projs = 0; + + cudaFree(pData->D_projData.ptr); + pData->D_projData.ptr = 0; + + cudaFree(pData->D_volumeData.ptr); + pData->D_volumeData.ptr = 0; + + cudaFree(pData->D_maskData.ptr); + pData->D_maskData.ptr = 0; + + cudaFree(pData->D_smaskData.ptr); + pData->D_smaskData.ptr = 0; + + delete pData; + pData = 0; +} + +bool AstraSIRT3d::setReconstructionGeometry(unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ/*, + float fPixelSize = 1.0f*/) +{ + if (pData->initialized) + return false; + + pData->dims.iVolX = iVolX; + pData->dims.iVolY = iVolY; + pData->dims.iVolZ = iVolZ; + + return (iVolX > 0 && iVolY > 0 && iVolZ > 0); +} + + +bool AstraSIRT3d::setPar3DGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection* projs) +{ + if (pData->initialized) + return false; + + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjU = iProjU; + pData->dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || projs == 0) + return false; + + pData->parprojs = new SPar3DProjection[iProjAngles]; + memcpy(pData->parprojs, projs, iProjAngles * sizeof(projs[0])); + + pData->projType = PROJ_PARALLEL; + + return true; +} + +bool AstraSIRT3d::setPar3DGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fDetUSize, + float fDetVSize, + const float *pfAngles) +{ + if (pData->initialized) + return false; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + SPar3DProjection* p = genPar3DProjections(iProjAngles, + iProjU, iProjV, + fDetUSize, fDetVSize, + pfAngles); + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjU = iProjU; + pData->dims.iProjV = iProjV; + + pData->parprojs = p; + pData->projType = PROJ_PARALLEL; + + return true; +} + + + +bool AstraSIRT3d::setConeGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SConeProjection* projs) +{ + if (pData->initialized) + return false; + + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjU = iProjU; + pData->dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || projs == 0) + return false; + + pData->projs = new SConeProjection[iProjAngles]; + memcpy(pData->projs, projs, iProjAngles * sizeof(projs[0])); + + pData->projType = PROJ_CONE; + + return true; +} + +bool AstraSIRT3d::setConeGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fDetUSize, + float fDetVSize, + const float *pfAngles) +{ + if (pData->initialized) + return false; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + SConeProjection* p = genConeProjections(iProjAngles, + iProjU, iProjV, + fOriginSourceDistance, + fOriginDetectorDistance, + fDetUSize, fDetVSize, + pfAngles); + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjU = iProjU; + pData->dims.iProjV = iProjV; + + pData->projs = p; + pData->projType = PROJ_CONE; + + return true; +} + +bool AstraSIRT3d::enableSuperSampling(unsigned int iVoxelSuperSampling, + unsigned int iDetectorSuperSampling) +{ + if (pData->initialized) + return false; + + if (iVoxelSuperSampling == 0 || iDetectorSuperSampling == 0) + return false; + + pData->dims.iRaysPerVoxelDim = iVoxelSuperSampling; + pData->dims.iRaysPerDetDim = iDetectorSuperSampling; + + return true; +} + +bool AstraSIRT3d::enableVolumeMask() +{ + if (pData->initialized) + return false; + + bool ok = pData->sirt.enableVolumeMask(); + pData->useVolumeMask = ok; + + return ok; +} + +bool AstraSIRT3d::enableSinogramMask() +{ + if (pData->initialized) + return false; + + bool ok = pData->sirt.enableSinogramMask(); + pData->useSinogramMask = ok; + + return ok; +} + +bool AstraSIRT3d::setGPUIndex(int index) +{ + cudaSetDevice(index); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + return true; +} + +bool AstraSIRT3d::init() +{ + fprintf(stderr, "001: %d\n", true); + if (pData->initialized) + return false; + fprintf(stderr, "002: %d\n", true); + + if (pData->dims.iVolX == 0 || pData->dims.iProjAngles == 0) + return false; + fprintf(stderr, "003: %d\n", true); + + bool ok; + + if (pData->projType == PROJ_PARALLEL) { + ok = pData->sirt.setPar3DGeometry(pData->dims, pData->parprojs); + } else { + ok = pData->sirt.setConeGeometry(pData->dims, pData->projs); + } + fprintf(stderr, "004: %d\n", ok); + + if (!ok) + return false; + + ok = pData->sirt.init(); + if (!ok) + return false; + fprintf(stderr, "005: %d\n", ok); + + pData->D_volumeData = allocateVolumeData(pData->dims); + ok = pData->D_volumeData.ptr; + if (!ok) + return false; + fprintf(stderr, "006: %d\n", ok); + + fprintf(stderr, "proj: %d %d %d\n", pData->dims.iProjAngles, pData->dims.iProjU, pData->dims.iProjV); + pData->D_projData = allocateProjectionData(pData->dims); + ok = pData->D_projData.ptr; + if (!ok) { + cudaFree(pData->D_volumeData.ptr); + pData->D_volumeData.ptr = 0; + return false; + } + fprintf(stderr, "007: %d\n", ok); + + if (pData->useVolumeMask) { + pData->D_maskData = allocateVolumeData(pData->dims); + ok = pData->D_maskData.ptr; + if (!ok) { + cudaFree(pData->D_volumeData.ptr); + cudaFree(pData->D_projData.ptr); + pData->D_volumeData.ptr = 0; + pData->D_projData.ptr = 0; + return false; + } + } + + if (pData->useSinogramMask) { + pData->D_smaskData = allocateProjectionData(pData->dims); + ok = pData->D_smaskData.ptr; + if (!ok) { + cudaFree(pData->D_volumeData.ptr); + cudaFree(pData->D_projData.ptr); + cudaFree(pData->D_maskData.ptr); + pData->D_volumeData.ptr = 0; + pData->D_projData.ptr = 0; + pData->D_maskData.ptr = 0; + return false; + } + } + fprintf(stderr, "008: %d\n", ok); + + pData->initialized = true; + + return true; +} + +bool AstraSIRT3d::setMinConstraint(float fMin) +{ + if (!pData->initialized) + return false; + return pData->sirt.setMinConstraint(fMin); +} + +bool AstraSIRT3d::setMaxConstraint(float fMax) +{ + if (!pData->initialized) + return false; + return pData->sirt.setMaxConstraint(fMax); +} + +bool AstraSIRT3d::setSinogram(const float* pfSinogram, + unsigned int iSinogramPitch) +{ + if (!pData->initialized) + return false; + if (!pfSinogram) + return false; + + bool ok = copyProjectionsToDevice(pfSinogram, pData->D_projData, pData->dims, iSinogramPitch); + + if (!ok) + return false; + + ok = pData->sirt.setBuffers(pData->D_volumeData, pData->D_projData); + if (!ok) + return false; + + pData->setStartReconstruction = false; + + return true; +} + +bool AstraSIRT3d::setVolumeMask(const float* pfMask, unsigned int iMaskPitch) +{ + if (!pData->initialized) + return false; + if (!pData->useVolumeMask) + return false; + if (!pfMask) + return false; + + bool ok = copyVolumeToDevice(pfMask, pData->D_maskData, + pData->dims, iMaskPitch); + if (!ok) + return false; + + ok = pData->sirt.setVolumeMask(pData->D_maskData); + if (!ok) + return false; + + return true; +} + +bool AstraSIRT3d::setSinogramMask(const float* pfMask, unsigned int iMaskPitch) +{ + if (!pData->initialized) + return false; + if (!pData->useSinogramMask) + return false; + if (!pfMask) + return false; + + bool ok = copyProjectionsToDevice(pfMask, pData->D_smaskData, pData->dims, iMaskPitch); + + if (!ok) + return false; + + ok = pData->sirt.setSinogramMask(pData->D_smaskData); + if (!ok) + return false; + + return true; +} + +bool AstraSIRT3d::setStartReconstruction(const float* pfReconstruction, + unsigned int iReconstructionPitch) +{ + if (!pData->initialized) + return false; + if (!pfReconstruction) + return false; + + bool ok = copyVolumeToDevice(pfReconstruction, pData->D_volumeData, + pData->dims, iReconstructionPitch); + if (!ok) + return false; + + pData->setStartReconstruction = true; + + return true; +} + +bool AstraSIRT3d::iterate(unsigned int iIterations) +{ + if (!pData->initialized) + return false; + + if (!pData->setStartReconstruction) + zeroVolumeData(pData->D_volumeData, pData->dims); + + bool ok = pData->sirt.iterate(iIterations); + if (!ok) + return false; + + return true; +} + +bool AstraSIRT3d::getReconstruction(float* pfReconstruction, + unsigned int iReconstructionPitch) const +{ + if (!pData->initialized) + return false; + + bool ok = copyVolumeFromDevice(pfReconstruction, pData->D_volumeData, + pData->dims, iReconstructionPitch); + if (!ok) + return false; + + return true; +} + +void AstraSIRT3d::signalAbort() +{ + if (!pData->initialized) + return; + + pData->sirt.signalAbort(); +} + +float AstraSIRT3d::computeDiffNorm() +{ + if (!pData->initialized) + return 0.0f; // FIXME: Error? + + return pData->sirt.computeDiffNorm(); +} + + + + +class AstraCGLS3d_internal { +public: + SDimensions3D dims; + CUDAProjectionType3d projType; + + float* angles; + float fOriginSourceDistance; + float fOriginDetectorDistance; + float fSourceZ; + float fDetSize; + + SConeProjection* projs; + SPar3DProjection* parprojs; + + float fPixelSize; + + bool initialized; + bool setStartReconstruction; + + bool useVolumeMask; + bool useSinogramMask; + + // Input/output + cudaPitchedPtr D_projData; + cudaPitchedPtr D_volumeData; + cudaPitchedPtr D_maskData; + cudaPitchedPtr D_smaskData; + + CGLS cgls; +}; + +AstraCGLS3d::AstraCGLS3d() +{ + pData = new AstraCGLS3d_internal(); + + pData->angles = 0; + pData->D_projData.ptr = 0; + pData->D_volumeData.ptr = 0; + pData->D_maskData.ptr = 0; + pData->D_smaskData.ptr = 0; + + pData->dims.iVolX = 0; + pData->dims.iVolY = 0; + pData->dims.iVolZ = 0; + pData->dims.iProjAngles = 0; + pData->dims.iProjU = 0; + pData->dims.iProjV = 0; + pData->dims.iRaysPerDetDim = 1; + pData->dims.iRaysPerVoxelDim = 1; + + pData->projs = 0; + + pData->initialized = false; + pData->setStartReconstruction = false; + + pData->useVolumeMask = false; + pData->useSinogramMask = false; +} + +AstraCGLS3d::~AstraCGLS3d() +{ + delete[] pData->angles; + pData->angles = 0; + + delete[] pData->projs; + pData->projs = 0; + + cudaFree(pData->D_projData.ptr); + pData->D_projData.ptr = 0; + + cudaFree(pData->D_volumeData.ptr); + pData->D_volumeData.ptr = 0; + + cudaFree(pData->D_maskData.ptr); + pData->D_maskData.ptr = 0; + + cudaFree(pData->D_smaskData.ptr); + pData->D_smaskData.ptr = 0; + + delete pData; + pData = 0; +} + +bool AstraCGLS3d::setReconstructionGeometry(unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ/*, + float fPixelSize = 1.0f*/) +{ + if (pData->initialized) + return false; + + pData->dims.iVolX = iVolX; + pData->dims.iVolY = iVolY; + pData->dims.iVolZ = iVolZ; + + return (iVolX > 0 && iVolY > 0 && iVolZ > 0); +} + + +bool AstraCGLS3d::setPar3DGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection* projs) +{ + if (pData->initialized) + return false; + + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjU = iProjU; + pData->dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || projs == 0) + return false; + + pData->parprojs = new SPar3DProjection[iProjAngles]; + memcpy(pData->parprojs, projs, iProjAngles * sizeof(projs[0])); + + pData->projType = PROJ_PARALLEL; + + return true; +} + +bool AstraCGLS3d::setPar3DGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fDetUSize, + float fDetVSize, + const float *pfAngles) +{ + if (pData->initialized) + return false; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + SPar3DProjection* p = genPar3DProjections(iProjAngles, + iProjU, iProjV, + fDetUSize, fDetVSize, + pfAngles); + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjU = iProjU; + pData->dims.iProjV = iProjV; + + pData->parprojs = p; + pData->projType = PROJ_PARALLEL; + + return true; +} + + + +bool AstraCGLS3d::setConeGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SConeProjection* projs) +{ + if (pData->initialized) + return false; + + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjU = iProjU; + pData->dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || projs == 0) + return false; + + pData->projs = new SConeProjection[iProjAngles]; + memcpy(pData->projs, projs, iProjAngles * sizeof(projs[0])); + + pData->projType = PROJ_CONE; + + return true; +} + +bool AstraCGLS3d::setConeGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fDetUSize, + float fDetVSize, + const float *pfAngles) +{ + if (pData->initialized) + return false; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + SConeProjection* p = genConeProjections(iProjAngles, + iProjU, iProjV, + fOriginSourceDistance, + fOriginDetectorDistance, + fDetUSize, fDetVSize, + pfAngles); + + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjU = iProjU; + pData->dims.iProjV = iProjV; + + pData->projs = p; + pData->projType = PROJ_CONE; + + return true; +} + +bool AstraCGLS3d::enableSuperSampling(unsigned int iVoxelSuperSampling, + unsigned int iDetectorSuperSampling) +{ + if (pData->initialized) + return false; + + if (iVoxelSuperSampling == 0 || iDetectorSuperSampling == 0) + return false; + + pData->dims.iRaysPerVoxelDim = iVoxelSuperSampling; + pData->dims.iRaysPerDetDim = iDetectorSuperSampling; + + return true; +} + +bool AstraCGLS3d::enableVolumeMask() +{ + if (pData->initialized) + return false; + + bool ok = pData->cgls.enableVolumeMask(); + pData->useVolumeMask = ok; + + return ok; +} + +#if 0 +bool AstraCGLS3d::enableSinogramMask() +{ + if (pData->initialized) + return false; + + bool ok = pData->cgls.enableSinogramMask(); + pData->useSinogramMask = ok; + + return ok; +} +#endif + +bool AstraCGLS3d::setGPUIndex(int index) +{ + cudaSetDevice(index); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + return true; +} + +bool AstraCGLS3d::init() +{ + fprintf(stderr, "001: %d\n", true); + if (pData->initialized) + return false; + fprintf(stderr, "002: %d\n", true); + + if (pData->dims.iVolX == 0 || pData->dims.iProjAngles == 0) + return false; + fprintf(stderr, "003: %d\n", true); + + bool ok; + + if (pData->projType == PROJ_PARALLEL) { + ok = pData->cgls.setPar3DGeometry(pData->dims, pData->parprojs); + } else { + ok = pData->cgls.setConeGeometry(pData->dims, pData->projs); + } + fprintf(stderr, "004: %d\n", ok); + + if (!ok) + return false; + + ok = pData->cgls.init(); + if (!ok) + return false; + fprintf(stderr, "005: %d\n", ok); + + pData->D_volumeData = allocateVolumeData(pData->dims); + ok = pData->D_volumeData.ptr; + if (!ok) + return false; + fprintf(stderr, "006: %d\n", ok); + + fprintf(stderr, "proj: %d %d %d\n", pData->dims.iProjAngles, pData->dims.iProjU, pData->dims.iProjV); + pData->D_projData = allocateProjectionData(pData->dims); + ok = pData->D_projData.ptr; + if (!ok) { + cudaFree(pData->D_volumeData.ptr); + pData->D_volumeData.ptr = 0; + return false; + } + fprintf(stderr, "007: %d\n", ok); + + if (pData->useVolumeMask) { + pData->D_maskData = allocateVolumeData(pData->dims); + ok = pData->D_maskData.ptr; + if (!ok) { + cudaFree(pData->D_volumeData.ptr); + cudaFree(pData->D_projData.ptr); + pData->D_volumeData.ptr = 0; + pData->D_projData.ptr = 0; + return false; + } + } + + if (pData->useSinogramMask) { + pData->D_smaskData = allocateProjectionData(pData->dims); + ok = pData->D_smaskData.ptr; + if (!ok) { + cudaFree(pData->D_volumeData.ptr); + cudaFree(pData->D_projData.ptr); + cudaFree(pData->D_maskData.ptr); + pData->D_volumeData.ptr = 0; + pData->D_projData.ptr = 0; + pData->D_maskData.ptr = 0; + return false; + } + } + fprintf(stderr, "008: %d\n", ok); + + pData->initialized = true; + + return true; +} + +#if 0 +bool AstraCGLS3d::setMinConstraint(float fMin) +{ + if (!pData->initialized) + return false; + return pData->cgls.setMinConstraint(fMin); +} + +bool AstraCGLS3d::setMaxConstraint(float fMax) +{ + if (!pData->initialized) + return false; + return pData->cgls.setMaxConstraint(fMax); +} +#endif + +bool AstraCGLS3d::setSinogram(const float* pfSinogram, + unsigned int iSinogramPitch) +{ + if (!pData->initialized) + return false; + if (!pfSinogram) + return false; + + bool ok = copyProjectionsToDevice(pfSinogram, pData->D_projData, pData->dims, iSinogramPitch); + + if (!ok) + return false; + + ok = pData->cgls.setBuffers(pData->D_volumeData, pData->D_projData); + if (!ok) + return false; + + pData->setStartReconstruction = false; + + return true; +} + +bool AstraCGLS3d::setVolumeMask(const float* pfMask, unsigned int iMaskPitch) +{ + if (!pData->initialized) + return false; + if (!pData->useVolumeMask) + return false; + if (!pfMask) + return false; + + bool ok = copyVolumeToDevice(pfMask, pData->D_maskData, + pData->dims, iMaskPitch); + if (!ok) + return false; + + ok = pData->cgls.setVolumeMask(pData->D_maskData); + if (!ok) + return false; + + return true; +} + +#if 0 +bool AstraCGLS3d::setSinogramMask(const float* pfMask, unsigned int iMaskPitch) +{ + if (!pData->initialized) + return false; + if (!pData->useSinogramMask) + return false; + if (!pfMask) + return false; + + bool ok = copyProjectionsToDevice(pfMask, pData->D_smaskData, pData->dims, iMaskPitch); + + if (!ok) + return false; + + ok = pData->cgls.setSinogramMask(pData->D_smaskData); + if (!ok) + return false; + + return true; +} +#endif + +bool AstraCGLS3d::setStartReconstruction(const float* pfReconstruction, + unsigned int iReconstructionPitch) +{ + if (!pData->initialized) + return false; + if (!pfReconstruction) + return false; + + bool ok = copyVolumeToDevice(pfReconstruction, pData->D_volumeData, + pData->dims, iReconstructionPitch); + if (!ok) + return false; + + pData->setStartReconstruction = true; + + return true; +} + +bool AstraCGLS3d::iterate(unsigned int iIterations) +{ + if (!pData->initialized) + return false; + + if (!pData->setStartReconstruction) + zeroVolumeData(pData->D_volumeData, pData->dims); + + bool ok = pData->cgls.iterate(iIterations); + if (!ok) + return false; + + return true; +} + +bool AstraCGLS3d::getReconstruction(float* pfReconstruction, + unsigned int iReconstructionPitch) const +{ + if (!pData->initialized) + return false; + + bool ok = copyVolumeFromDevice(pfReconstruction, pData->D_volumeData, + pData->dims, iReconstructionPitch); + if (!ok) + return false; + + return true; +} + +void AstraCGLS3d::signalAbort() +{ + if (!pData->initialized) + return; + + pData->cgls.signalAbort(); +} + +float AstraCGLS3d::computeDiffNorm() +{ + if (!pData->initialized) + return 0.0f; // FIXME: Error? + + return pData->cgls.computeDiffNorm(); +} + + + +bool astraCudaConeFP(const float* pfVolume, float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iDetectorSuperSampling) +{ + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + SConeProjection* p = genConeProjections(iProjAngles, + iProjU, iProjV, + fOriginSourceDistance, + fOriginDetectorDistance, + fDetUSize, fDetVSize, + pfAngles); + + bool ok; + ok = astraCudaConeFP(pfVolume, pfProjections, iVolX, iVolY, iVolZ, + iProjAngles, iProjU, iProjV, p, iGPUIndex, iDetectorSuperSampling); + + delete[] p; + + return ok; +} + +bool astraCudaConeFP(const float* pfVolume, float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SConeProjection *pfAngles, + int iGPUIndex, int iDetectorSuperSampling) +{ + SDimensions3D dims; + + dims.iVolX = iVolX; + dims.iVolY = iVolY; + dims.iVolZ = iVolZ; + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + + dims.iProjAngles = iProjAngles; + dims.iProjU = iProjU; + dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + dims.iRaysPerDetDim = iDetectorSuperSampling; + + if (iDetectorSuperSampling == 0) + return false; + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + + cudaPitchedPtr D_volumeData = allocateVolumeData(dims); + bool ok = D_volumeData.ptr; + if (!ok) + return false; + + cudaPitchedPtr D_projData = allocateProjectionData(dims); + ok = D_projData.ptr; + if (!ok) { + cudaFree(D_volumeData.ptr); + return false; + } + + ok &= copyVolumeToDevice(pfVolume, D_volumeData, dims, dims.iVolX); + + ok &= zeroProjectionData(D_projData, dims); + + if (!ok) { + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + return false; + } + + ok &= ConeFP(D_volumeData, D_projData, dims, pfAngles, 1.0f); + + ok &= copyProjectionsFromDevice(pfProjections, D_projData, + dims, dims.iProjU); + + + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + + return ok; + +} + +bool astraCudaPar3DFP(const float* pfVolume, float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iDetectorSuperSampling, + Cuda3DProjectionKernel projKernel) +{ + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + SPar3DProjection* p = genPar3DProjections(iProjAngles, + iProjU, iProjV, + fDetUSize, fDetVSize, + pfAngles); + + bool ok; + ok = astraCudaPar3DFP(pfVolume, pfProjections, iVolX, iVolY, iVolZ, + iProjAngles, iProjU, iProjV, p, iGPUIndex, iDetectorSuperSampling, + projKernel); + + delete[] p; + + return ok; +} + + +bool astraCudaPar3DFP(const float* pfVolume, float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection *pfAngles, + int iGPUIndex, int iDetectorSuperSampling, + Cuda3DProjectionKernel projKernel) +{ + SDimensions3D dims; + + dims.iVolX = iVolX; + dims.iVolY = iVolY; + dims.iVolZ = iVolZ; + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + + dims.iProjAngles = iProjAngles; + dims.iProjU = iProjU; + dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + dims.iRaysPerDetDim = iDetectorSuperSampling; + + if (iDetectorSuperSampling == 0) + return false; + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + + + cudaPitchedPtr D_volumeData = allocateVolumeData(dims); + bool ok = D_volumeData.ptr; + if (!ok) + return false; + + cudaPitchedPtr D_projData = allocateProjectionData(dims); + ok = D_projData.ptr; + if (!ok) { + cudaFree(D_volumeData.ptr); + return false; + } + + ok &= copyVolumeToDevice(pfVolume, D_volumeData, dims, dims.iVolX); + + ok &= zeroProjectionData(D_projData, dims); + + if (!ok) { + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + return false; + } + + switch (projKernel) { + case ker3d_default: + ok &= Par3DFP(D_volumeData, D_projData, dims, pfAngles, 1.0f); + break; + case ker3d_sum_square_weights: + ok &= Par3DFP_SumSqW(D_volumeData, D_projData, dims, pfAngles, 1.0f); + break; + default: + assert(false); + } + + ok &= copyProjectionsFromDevice(pfProjections, D_projData, + dims, dims.iProjU); + + + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + + return ok; + +} + +bool astraCudaConeBP(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iVoxelSuperSampling) +{ + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + SConeProjection* p = genConeProjections(iProjAngles, + iProjU, iProjV, + fOriginSourceDistance, + fOriginDetectorDistance, + fDetUSize, fDetVSize, + pfAngles); + + bool ok; + ok = astraCudaConeBP(pfVolume, pfProjections, iVolX, iVolY, iVolZ, + iProjAngles, iProjU, iProjV, p, iGPUIndex, iVoxelSuperSampling); + + delete[] p; + + return ok; +} + +bool astraCudaConeBP(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SConeProjection *pfAngles, + int iGPUIndex, int iVoxelSuperSampling) +{ + SDimensions3D dims; + + dims.iVolX = iVolX; + dims.iVolY = iVolY; + dims.iVolZ = iVolZ; + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + + dims.iProjAngles = iProjAngles; + dims.iProjU = iProjU; + dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + dims.iRaysPerVoxelDim = iVoxelSuperSampling; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + + cudaPitchedPtr D_volumeData = allocateVolumeData(dims); + bool ok = D_volumeData.ptr; + if (!ok) + return false; + + cudaPitchedPtr D_projData = allocateProjectionData(dims); + ok = D_projData.ptr; + if (!ok) { + cudaFree(D_volumeData.ptr); + return false; + } + + ok &= copyProjectionsToDevice(pfProjections, D_projData, + dims, dims.iProjU); + + ok &= zeroVolumeData(D_volumeData, dims); + + if (!ok) { + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + return false; + } + + ok &= ConeBP(D_volumeData, D_projData, dims, pfAngles); + + ok &= copyVolumeFromDevice(pfVolume, D_volumeData, dims, dims.iVolX); + + + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + + return ok; + +} + +bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iVoxelSuperSampling) +{ + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + SPar3DProjection* p = genPar3DProjections(iProjAngles, + iProjU, iProjV, + fDetUSize, fDetVSize, + pfAngles); + + bool ok; + ok = astraCudaPar3DBP(pfVolume, pfProjections, iVolX, iVolY, iVolZ, + iProjAngles, iProjU, iProjV, p, iGPUIndex, iVoxelSuperSampling); + + delete[] p; + + return ok; +} + + +bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection *pfAngles, + int iGPUIndex, int iVoxelSuperSampling) +{ + SDimensions3D dims; + + dims.iVolX = iVolX; + dims.iVolY = iVolY; + dims.iVolZ = iVolZ; + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + + dims.iProjAngles = iProjAngles; + dims.iProjU = iProjU; + dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + dims.iRaysPerVoxelDim = iVoxelSuperSampling; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + + + cudaPitchedPtr D_volumeData = allocateVolumeData(dims); + bool ok = D_volumeData.ptr; + if (!ok) + return false; + + cudaPitchedPtr D_projData = allocateProjectionData(dims); + ok = D_projData.ptr; + if (!ok) { + cudaFree(D_volumeData.ptr); + return false; + } + + ok &= copyProjectionsToDevice(pfProjections, D_projData, + dims, dims.iProjU); + + ok &= zeroVolumeData(D_volumeData, dims); + + if (!ok) { + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + return false; + } + + ok &= Par3DBP(D_volumeData, D_projData, dims, pfAngles); + + ok &= copyVolumeFromDevice(pfVolume, D_volumeData, dims, dims.iVolX); + + + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + + return ok; + +} + + + +bool astraCudaFDK(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + bool bShortScan, + int iGPUIndex, int iVoxelSuperSampling) +{ + SDimensions3D dims; + + dims.iVolX = iVolX; + dims.iVolY = iVolY; + dims.iVolZ = iVolZ; + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + + dims.iProjAngles = iProjAngles; + dims.iProjU = iProjU; + dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + dims.iRaysPerVoxelDim = iVoxelSuperSampling; + + if (iVoxelSuperSampling == 0) + return false; + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + + + cudaPitchedPtr D_volumeData = allocateVolumeData(dims); + bool ok = D_volumeData.ptr; + if (!ok) + return false; + + cudaPitchedPtr D_projData = allocateProjectionData(dims); + ok = D_projData.ptr; + if (!ok) { + cudaFree(D_volumeData.ptr); + return false; + } + + ok &= copyProjectionsToDevice(pfProjections, D_projData, dims, dims.iProjU); + + ok &= zeroVolumeData(D_volumeData, dims); + + if (!ok) { + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + return false; + } + + // TODO: Offer interface for SrcZ, DetZ + ok &= FDK(D_volumeData, D_projData, fOriginSourceDistance, + fOriginDetectorDistance, 0, 0, fDetUSize, fDetVSize, + dims, pfAngles, bShortScan); + + ok &= copyVolumeFromDevice(pfVolume, D_volumeData, dims, dims.iVolX); + + + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + + return ok; + +} + + + + +} diff --git a/cuda/3d/astra3d.h b/cuda/3d/astra3d.h new file mode 100644 index 0000000..5712f89 --- /dev/null +++ b/cuda/3d/astra3d.h @@ -0,0 +1,450 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_ASTRA3D_H +#define _CUDA_ASTRA3D_H + +#include "dims3d.h" + +namespace astra { + + +// TODO: Switch to a class hierarchy as with the 2D algorithms + + +enum Cuda3DProjectionKernel { + ker3d_default = 0, + ker3d_sum_square_weights +}; + + +class AstraSIRT3d_internal; + + +class _AstraExport AstraSIRT3d { +public: + + AstraSIRT3d(); + ~AstraSIRT3d(); + + // Set the number of pixels in the reconstruction rectangle, + // and the length of the edge of a pixel. + // Volume pixels are assumed to be square. + // This must be called before setting the projection geometry. + bool setReconstructionGeometry(unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ/*, + float fPixelSize = 1.0f*/); + + bool setConeGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SConeProjection* projs); + bool setConeGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fSourceZ, + float fDetSize, + const float *pfAngles); + bool setPar3DGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection* projs); + bool setPar3DGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fSourceZ, + float fDetSize, + const float *pfAngles); + + // Enable supersampling. + // + // The number of rays used in FP is the square of iDetectorSuperSampling. + // The number of rays used in BP is the cube of iVoxelSuperSampling. + bool enableSuperSampling(unsigned int iVoxelSuperSampling, + unsigned int iDetectorSuperSampling); + + // Enable volume/sinogram masks + // + // This may optionally be called before init(). + // If it is called, setVolumeMask()/setSinogramMask() must be called between + // setSinogram() and iterate(). + bool enableVolumeMask(); + bool enableSinogramMask(); + + // Set GPU index + // + // This should be called before init(). Note that setting the GPU index + // in a thread which has already used the GPU may not work. + bool setGPUIndex(int index); + + // Allocate GPU buffers and + // precompute geometry-specific data. + // + // This must be called after calling setReconstructionGeometry() and + // setProjectionGeometry() or setFanProjectionGeometry(). + bool init(); + + // Setup input sinogram for a slice. + // pfSinogram must be a float array of size XXX + // NB: iSinogramPitch is measured in floats, not in bytes. + // + // This must be called after init(), and before iterate(). It may be + // called again after iterate()/getReconstruction() to start a new slice. + // + // pfSinogram will only be read from during this call. + bool setSinogram(const float* pfSinogram, unsigned int iSinogramPitch); + + // Setup volume mask for a slice. + // pfMask must be a float array of size XXX + // NB: iMaskPitch is measured in floats, not in bytes. + // + // It may only contain the exact values 0.0f and 1.0f. Only volume pixels + // for which pfMask[z] is 1.0f are processed. + bool setVolumeMask(const float* pfMask, unsigned int iMaskPitch); + + // Setup sinogram mask for a slice. + // pfMask must be a float array of size XXX + // NB: iMaskPitch is measured in floats, not in bytes. + // + // It may only contain the exact values 0.0f and 1.0f. Only sinogram pixels + // for which pfMask[z] is 1.0f are processed. + bool setSinogramMask(const float* pfMask, unsigned int iMaskPitch); + + // Set the starting reconstruction for SIRT. + // pfReconstruction must be a float array of size XXX + // NB: iReconstructionPitch is measured in floats, not in bytes. + // + // This may be called between setSinogram() and iterate(). + // If this function is not called before iterate(), SIRT will start + // from a zero reconstruction. + // + // pfReconstruction will only be read from during this call. + bool setStartReconstruction(const float* pfReconstruction, + unsigned int iReconstructionPitch); + + // Enable min/max constraint. + // + // These may optionally be called between init() and iterate() + bool setMinConstraint(float fMin); + bool setMaxConstraint(float fMax); + + // Perform a number of (additive) SIRT iterations. + // This must be called after setSinogram(). + // + // If called multiple times, without calls to setSinogram() or + // setStartReconstruction() in between, iterate() will continue from + // the result of the previous call. + // Calls to getReconstruction() are allowed between calls to iterate() and + // do not change the state. + bool iterate(unsigned int iIterations); + + // Get the reconstructed slice. + // pfReconstruction must be a float array of size XXX + // NB: iReconstructionPitch is measured in floats, not in bytes. + // + // This may be called after iterate(). + bool getReconstruction(float* pfReconstruction, + unsigned int iReconstructionPitch) const; + + // Compute the norm of the difference of the FP of the current + // reconstruction and the sinogram. (This performs one FP.) + // It can be called after iterate(). + float computeDiffNorm(); + + // Signal the algorithm that it should abort after the current iteration. + // This is intended to be called from another thread. + void signalAbort(); + +protected: + AstraSIRT3d_internal *pData; +}; + + +class AstraCGLS3d_internal; + + +class _AstraExport AstraCGLS3d { +public: + + AstraCGLS3d(); + ~AstraCGLS3d(); + + // Set the number of pixels in the reconstruction rectangle, + // and the length of the edge of a pixel. + // Volume pixels are assumed to be square. + // This must be called before setting the projection geometry. + bool setReconstructionGeometry(unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ/*, + float fPixelSize = 1.0f*/); + + bool setConeGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SConeProjection* projs); + bool setConeGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fSourceZ, + float fDetSize, + const float *pfAngles); + bool setPar3DGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection* projs); + bool setPar3DGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fSourceZ, + float fDetSize, + const float *pfAngles); + + // Enable supersampling. + // + // The number of rays used in FP is the square of iDetectorSuperSampling. + // The number of rays used in BP is the cube of iVoxelSuperSampling. + bool enableSuperSampling(unsigned int iVoxelSuperSampling, + unsigned int iDetectorSuperSampling); + + // Enable volume/sinogram masks + // + // This may optionally be called before init(). + // If it is called, setVolumeMask()/setSinogramMask() must be called between + // setSinogram() and iterate(). + bool enableVolumeMask(); + //bool enableSinogramMask(); + + // Set GPU index + // + // This should be called before init(). Note that setting the GPU index + // in a thread which has already used the GPU may not work. + bool setGPUIndex(int index); + + // Allocate GPU buffers and + // precompute geometry-specific data. + // + // This must be called after calling setReconstructionGeometry() and + // setProjectionGeometry() or setFanProjectionGeometry(). + bool init(); + + // Setup input sinogram for a slice. + // pfSinogram must be a float array of size XXX + // NB: iSinogramPitch is measured in floats, not in bytes. + // + // This must be called after init(), and before iterate(). It may be + // called again after iterate()/getReconstruction() to start a new slice. + // + // pfSinogram will only be read from during this call. + bool setSinogram(const float* pfSinogram, unsigned int iSinogramPitch); + + // Setup volume mask for a slice. + // pfMask must be a float array of size XXX + // NB: iMaskPitch is measured in floats, not in bytes. + // + // It may only contain the exact values 0.0f and 1.0f. Only volume pixels + // for which pfMask[z] is 1.0f are processed. + bool setVolumeMask(const float* pfMask, unsigned int iMaskPitch); + + // Setup sinogram mask for a slice. + // pfMask must be a float array of size XXX + // NB: iMaskPitch is measured in floats, not in bytes. + // + // It may only contain the exact values 0.0f and 1.0f. Only sinogram pixels + // for which pfMask[z] is 1.0f are processed. + //bool setSinogramMask(const float* pfMask, unsigned int iMaskPitch); + + // Set the starting reconstruction for SIRT. + // pfReconstruction must be a float array of size XXX + // NB: iReconstructionPitch is measured in floats, not in bytes. + // + // This may be called between setSinogram() and iterate(). + // If this function is not called before iterate(), SIRT will start + // from a zero reconstruction. + // + // pfReconstruction will only be read from during this call. + bool setStartReconstruction(const float* pfReconstruction, + unsigned int iReconstructionPitch); + + // Enable min/max constraint. + // + // These may optionally be called between init() and iterate() + //bool setMinConstraint(float fMin); + //bool setMaxConstraint(float fMax); + + // Perform a number of (additive) SIRT iterations. + // This must be called after setSinogram(). + // + // If called multiple times, without calls to setSinogram() or + // setStartReconstruction() in between, iterate() will continue from + // the result of the previous call. + // Calls to getReconstruction() are allowed between calls to iterate() and + // do not change the state. + bool iterate(unsigned int iIterations); + + // Get the reconstructed slice. + // pfReconstruction must be a float array of size XXX + // NB: iReconstructionPitch is measured in floats, not in bytes. + // + // This may be called after iterate(). + bool getReconstruction(float* pfReconstruction, + unsigned int iReconstructionPitch) const; + + // Compute the norm of the difference of the FP of the current + // reconstruction and the sinogram. (This performs one FP.) + // It can be called after iterate(). + float computeDiffNorm(); + + // Signal the algorithm that it should abort after the current iteration. + // This is intended to be called from another thread. + void signalAbort(); + +protected: + AstraCGLS3d_internal *pData; +}; + + + +_AstraExport bool astraCudaConeFP(const float* pfVolume, float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iDetectorSuperSampling); + +_AstraExport bool astraCudaConeFP(const float* pfVolume, float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SConeProjection *pfAngles, + int iGPUIndex, int iDetectorSuperSampling); + +_AstraExport bool astraCudaPar3DFP(const float* pfVolume, float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iDetectorSuperSampling, + Cuda3DProjectionKernel projKernel); + +_AstraExport bool astraCudaPar3DFP(const float* pfVolume, float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection *pfAngles, + int iGPUIndex, int iDetectorSuperSampling, + Cuda3DProjectionKernel projKernel); + + +_AstraExport bool astraCudaConeBP(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iVoxelSuperSampling); + +_AstraExport bool astraCudaConeBP(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SConeProjection *pfAngles, + int iGPUIndex, int iVoxelSuperSampling); + +_AstraExport bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iVoxelSuperSampling); + +_AstraExport bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection *pfAngles, + int iGPUIndex, int iVoxelSuperSampling); + +_AstraExport bool astraCudaFDK(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + bool bShortScan, + int iGPUIndex, int iVoxelSuperSampling); + +} + + +#endif diff --git a/cuda/3d/cgls3d.cu b/cuda/3d/cgls3d.cu new file mode 100644 index 0000000..72bb9cd --- /dev/null +++ b/cuda/3d/cgls3d.cu @@ -0,0 +1,428 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> + +#include "cgls3d.h" +#include "util3d.h" +#include "arith3d.h" +#include "cone_fp.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +namespace astraCUDA3d { + +CGLS::CGLS() : ReconAlgo3D() +{ + D_maskData.ptr = 0; + D_smaskData.ptr = 0; + + D_sinoData.ptr = 0; + D_volumeData.ptr = 0; + + D_r.ptr = 0; + D_w.ptr = 0; + D_z.ptr = 0; + D_p.ptr = 0; + + useVolumeMask = false; + useSinogramMask = false; +} + + +CGLS::~CGLS() +{ + reset(); +} + +void CGLS::reset() +{ + cudaFree(D_r.ptr); + cudaFree(D_w.ptr); + cudaFree(D_z.ptr); + cudaFree(D_p.ptr); + + D_maskData.ptr = 0; + D_smaskData.ptr = 0; + + D_sinoData.ptr = 0; + D_volumeData.ptr = 0; + + D_r.ptr = 0; + D_w.ptr = 0; + D_z.ptr = 0; + D_p.ptr = 0; + + useVolumeMask = false; + useSinogramMask = false; + + sliceInitialized = false; + + ReconAlgo3D::reset(); +} + +bool CGLS::enableVolumeMask() +{ + useVolumeMask = true; + return true; +} + +bool CGLS::enableSinogramMask() +{ + useSinogramMask = true; + return true; +} + + +bool CGLS::init() +{ + D_z = allocateVolumeData(dims); + D_p = allocateVolumeData(dims); + D_r = allocateProjectionData(dims); + D_w = allocateProjectionData(dims); + + // TODO: check if allocations succeeded + return true; +} + +bool CGLS::setVolumeMask(cudaPitchedPtr& _D_maskData) +{ + assert(useVolumeMask); + + D_maskData = _D_maskData; + + return true; +} + +bool CGLS::setSinogramMask(cudaPitchedPtr& _D_smaskData) +{ + return false; +#if 0 + // TODO: Implement this + assert(useSinogramMask); + + D_smaskData = _D_smaskData; + return true; +#endif +} + +bool CGLS::setBuffers(cudaPitchedPtr& _D_volumeData, + cudaPitchedPtr& _D_projData) +{ + D_volumeData = _D_volumeData; + D_sinoData = _D_projData; + + fprintf(stderr, "Reconstruction buffer: %p\n", (void*)D_volumeData.ptr); + + sliceInitialized = false; + + return true; +} + +bool CGLS::iterate(unsigned int iterations) +{ + shouldAbort = false; + + if (!sliceInitialized) { + + // copy sinogram + duplicateProjectionData(D_r, D_sinoData, dims); + + // r = sino - A*x + if (useVolumeMask) { + duplicateVolumeData(D_z, D_volumeData, dims); + processVol3D<opMul>(D_z, D_maskData, dims); + callFP(D_z, D_r, -1.0f); + } else { + callFP(D_volumeData, D_r, -1.0f); + } + + // p = A'*r + zeroVolumeData(D_p, dims); + callBP(D_p, D_r); + if (useVolumeMask) + processVol3D<opMul>(D_p, D_maskData, dims); + + gamma = dotProduct3D(D_p, dims.iVolX, dims.iVolY, dims.iVolZ); + + sliceInitialized = true; + + } + + + // iteration + for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) { + + // w = A*p + zeroProjectionData(D_w, dims); + callFP(D_p, D_w, 1.0f); + + // alpha = gamma / <w,w> + float ww = dotProduct3D(D_w, dims.iProjU, dims.iProjAngles, dims.iProjV); + float alpha = gamma / ww; + + // x += alpha*p + processVol3D<opAddScaled>(D_volumeData, D_p, alpha, dims); + + // r -= alpha*w + processSino3D<opAddScaled>(D_r, D_w, -alpha, dims); + + // z = A'*r + zeroVolumeData(D_z, dims); + callBP(D_z, D_r); + if (useVolumeMask) + processVol3D<opMul>(D_z, D_maskData, dims); + + float beta = 1.0f / gamma; + gamma = dotProduct3D(D_z, dims.iVolX, dims.iVolY, dims.iVolZ); + + beta *= gamma; + + // p = z + beta*p + processVol3D<opScaleAndAdd>(D_p, D_z, beta, dims); + } + + return true; +} + +float CGLS::computeDiffNorm() +{ + // We can use w and z as temporary storage here since they're not + // used outside of iterations. + + // copy sinogram to w + duplicateProjectionData(D_w, D_sinoData, dims); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + duplicateVolumeData(D_z, D_volumeData, dims); + processVol3D<opMul>(D_z, D_maskData, dims); + callFP(D_z, D_w, -1.0f); + } else { + callFP(D_volumeData, D_w, -1.0f); + } + + float s = dotProduct3D(D_w, dims.iProjU, dims.iProjAngles, dims.iProjV); + return sqrt(s); +} + + +bool doCGLS(cudaPitchedPtr& D_volumeData, + cudaPitchedPtr& D_sinoData, + cudaPitchedPtr& D_maskData, + const SDimensions3D& dims, const SConeProjection* angles, + unsigned int iterations) +{ + CGLS cgls; + bool ok = true; + + ok &= cgls.setConeGeometry(dims, angles); + if (D_maskData.ptr) + ok &= cgls.enableVolumeMask(); + + if (!ok) + return false; + + ok = cgls.init(); + if (!ok) + return false; + + if (D_maskData.ptr) + ok &= cgls.setVolumeMask(D_maskData); + + ok &= cgls.setBuffers(D_volumeData, D_sinoData); + if (!ok) + return false; + + ok = cgls.iterate(iterations); + + return ok; +} + +} + +#ifdef STANDALONE + +using namespace astraCUDA3d; + +int main() +{ + SDimensions3D dims; + dims.iVolX = 256; + dims.iVolY = 256; + dims.iVolZ = 256; + dims.iProjAngles = 100; + dims.iProjU = 512; + dims.iProjV = 512; + dims.iRaysPerDet = 1; + + SConeProjection angle[100]; + angle[0].fSrcX = -2905.6; + angle[0].fSrcY = 0; + angle[0].fSrcZ = 0; + + angle[0].fDetSX = 694.4; + angle[0].fDetSY = -122.4704; + angle[0].fDetSZ = -122.4704; + + angle[0].fDetUX = 0; + angle[0].fDetUY = .4784; + //angle[0].fDetUY = .5; + angle[0].fDetUZ = 0; + + angle[0].fDetVX = 0; + angle[0].fDetVY = 0; + angle[0].fDetVZ = .4784; + +#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0) + for (int i = 1; i < 100; ++i) { + angle[i] = angle[0]; + ROTATE0(Src, i, i*2*M_PI/100); + ROTATE0(DetS, i, i*2*M_PI/100); + ROTATE0(DetU, i, i*2*M_PI/100); + ROTATE0(DetV, i, i*2*M_PI/100); + } +#undef ROTATE0 + + + cudaPitchedPtr volData = allocateVolumeData(dims); + cudaPitchedPtr projData = allocateProjectionData(dims); + zeroProjectionData(projData, dims); + + float* pbuf = new float[100*512*512]; + copyProjectionsFromDevice(pbuf, projData, dims); + copyProjectionsToDevice(pbuf, projData, dims); + delete[] pbuf; + +#if 0 + float* slice = new float[256*256]; + cudaPitchedPtr ptr; + ptr.ptr = slice; + ptr.pitch = 256*sizeof(float); + ptr.xsize = 256*sizeof(float); + ptr.ysize = 256; + + for (unsigned int i = 0; i < 256; ++i) { + for (unsigned int y = 0; y < 256; ++y) + for (unsigned int x = 0; x < 256; ++x) + slice[y*256+x] = (i-127.5)*(i-127.5)+(y-127.5)*(y-127.5)+(x-127.5)*(x-127.5) < 4900 ? 1.0f : 0.0f; + + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, i }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaMemcpy3D(&p); + } + astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f); + +#else + + for (int i = 0; i < 100; ++i) { + char fname[32]; + sprintf(fname, "Tiffs/%04d.png", 4*i); + unsigned int w,h; + float* bufp = loadImage(fname, w,h); + + for (int j = 0; j < 512*512; ++j) { + float v = bufp[j]; + if (v > 236.0f) v = 236.0f; + v = logf(236.0f / v); + bufp[j] = 256*v; + } + + for (int j = 0; j < 512; ++j) { + cudaMemcpy(((float*)projData.ptr)+100*512*j+512*i, bufp+512*j, 512*sizeof(float), cudaMemcpyHostToDevice); + } + + delete[] bufp; + + } +#endif + +#if 0 + float* bufs = new float[100*512]; + + for (int i = 0; i < 512; ++i) { + cudaMemcpy(bufs, ((float*)projData.ptr)+100*512*i, 100*512*sizeof(float), cudaMemcpyDeviceToHost); + + printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize); + + char fname[20]; + sprintf(fname, "sino%03d.png", i); + saveImage(fname, 100, 512, bufs); + } + + float* bufp = new float[512*512]; + + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 512; ++j) { + cudaMemcpy(bufp+512*j, ((float*)projData.ptr)+100*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost); + } + + char fname[20]; + sprintf(fname, "proj%03d.png", i); + saveImage(fname, 512, 512, bufp); + } +#endif + + zeroVolumeData(volData, dims); + + cudaPitchedPtr maskData; + maskData.ptr = 0; + + astraCUDA3d::doCGLS(volData, projData, maskData, dims, angle, 50); +#if 1 + float* buf = new float[256*256]; + + for (int i = 0; i < 256; ++i) { + cudaMemcpy(buf, ((float*)volData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost); + + char fname[20]; + sprintf(fname, "vol%03d.png", i); + saveImage(fname, 256, 256, buf); + } +#endif + + return 0; +} +#endif + diff --git a/cuda/3d/cgls3d.h b/cuda/3d/cgls3d.h new file mode 100644 index 0000000..d16b571 --- /dev/null +++ b/cuda/3d/cgls3d.h @@ -0,0 +1,114 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_CGLS3D_H +#define _CUDA_CGLS3D_H + +#include "util3d.h" +#include "algo3d.h" + +namespace astraCUDA3d { + +class _AstraExport CGLS : public ReconAlgo3D { +public: + CGLS(); + ~CGLS(); + +// bool setConeGeometry(const SDimensions3D& dims, const SConeProjection* projs); + + + bool enableVolumeMask(); + bool enableSinogramMask(); + + // init should be called after setting all geometry + bool init(); + + // setVolumeMask should be called after init and before iterate, + // but only if enableVolumeMask was called before init. + // It may be called again after iterate. + bool setVolumeMask(cudaPitchedPtr& D_maskData); + + // setSinogramMask should be called after init and before iterate, + // but only if enableSinogramMask was called before init. + // It may be called again after iterate. + bool setSinogramMask(cudaPitchedPtr& D_smaskData); + + + // setBuffers should be called after init and before iterate. + // It may be called again after iterate. + bool setBuffers(cudaPitchedPtr& D_volumeData, + cudaPitchedPtr& D_projData); + + + // set Min/Max constraints. They may be called at any time, and will affect + // any iterate() calls afterwards. + bool setMinConstraint(float fMin) { return false; } + bool setMaxConstraint(float fMax) { return false; } + + // iterate should be called after init and setBuffers. + // It may be called multiple times. + bool iterate(unsigned int iterations); + + // Compute the norm of the difference of the FP of the current reconstruction + // and the sinogram. (This performs one FP.) + // It can be called after iterate. + float computeDiffNorm(); + +protected: + void reset(); + + bool useVolumeMask; + bool useSinogramMask; + + cudaPitchedPtr D_maskData; + cudaPitchedPtr D_smaskData; + + // Input/output + cudaPitchedPtr D_sinoData; + cudaPitchedPtr D_volumeData; + + // Temporary buffers + cudaPitchedPtr D_r; + cudaPitchedPtr D_w; + cudaPitchedPtr D_z; + cudaPitchedPtr D_p; + + float gamma; + + bool sliceInitialized; +}; + +_AstraExport bool doCGLS(cudaPitchedPtr D_volumeData, unsigned int volumePitch, + cudaPitchedPtr D_projData, unsigned int projPitch, + cudaPitchedPtr D_maskData, unsigned int maskPitch, + const SDimensions3D& dims, const SConeProjection* projs, + unsigned int iterations); + +} + +#endif diff --git a/cuda/3d/cone_bp.cu b/cuda/3d/cone_bp.cu new file mode 100644 index 0000000..7f8e320 --- /dev/null +++ b/cuda/3d/cone_bp.cu @@ -0,0 +1,481 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> +#include "util3d.h" + +#ifdef STANDALONE +#include "cone_fp.h" +#include "testutil.h" +#endif + +#include "dims3d.h" + +typedef texture<float, 3, cudaReadModeElementType> texture3D; + +static texture3D gT_coneProjTexture; + +namespace astraCUDA3d { + +static const unsigned int g_volBlockZ = 16; + +static const unsigned int g_anglesPerBlock = 64; +static const unsigned int g_volBlockX = 32; +static const unsigned int g_volBlockY = 16; + +static const unsigned g_MaxAngles = 1024; + +__constant__ float gC_Cux[g_MaxAngles]; +__constant__ float gC_Cuy[g_MaxAngles]; +__constant__ float gC_Cuz[g_MaxAngles]; +__constant__ float gC_Cuc[g_MaxAngles]; +__constant__ float gC_Cvx[g_MaxAngles]; +__constant__ float gC_Cvy[g_MaxAngles]; +__constant__ float gC_Cvz[g_MaxAngles]; +__constant__ float gC_Cvc[g_MaxAngles]; +__constant__ float gC_Cdx[g_MaxAngles]; +__constant__ float gC_Cdy[g_MaxAngles]; +__constant__ float gC_Cdz[g_MaxAngles]; +__constant__ float gC_Cdc[g_MaxAngles]; + + +bool bindProjDataTexture(const cudaArray* array) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + + gT_coneProjTexture.addressMode[0] = cudaAddressModeClamp; + gT_coneProjTexture.addressMode[1] = cudaAddressModeClamp; + gT_coneProjTexture.addressMode[2] = cudaAddressModeClamp; + gT_coneProjTexture.filterMode = cudaFilterModeLinear; + gT_coneProjTexture.normalized = false; + + cudaBindTextureToArray(gT_coneProjTexture, array, channelDesc); + + // TODO: error value? + + return true; +} + + +__global__ void dev_cone_BP(void* D_volData, unsigned int volPitch, int startAngle, const SDimensions3D dims) +{ + float* volData = (float*)D_volData; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + + // threadIdx: x = rel x + // y = rel y + + // blockIdx: x = x + y + // y = z + + + // TO TRY: precompute part of detector intersection formulas in shared mem? + // TO TRY: inner loop over z, gather ray values in shared mem + + const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; + const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + + if (X >= dims.iVolX) + return; + if (Y >= dims.iVolY) + return; + + const int startZ = blockIdx.y * g_volBlockZ; + int endZ = startZ + g_volBlockZ; + if (endZ > dims.iVolZ) + endZ = dims.iVolZ; + + float fX = X - 0.5f*dims.iVolX + 0.5f; + float fY = Y - 0.5f*dims.iVolY + 0.5f; + float fZ = startZ - 0.5f*dims.iVolZ + 0.5f; + + for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f) + { + + float fVal = 0.0f; + float fAngle = startAngle + 0.5f; + + for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) + { + + const float fCux = gC_Cux[angle]; + const float fCuy = gC_Cuy[angle]; + const float fCuz = gC_Cuz[angle]; + const float fCuc = gC_Cuc[angle]; + const float fCvx = gC_Cvx[angle]; + const float fCvy = gC_Cvy[angle]; + const float fCvz = gC_Cvz[angle]; + const float fCvc = gC_Cvc[angle]; + const float fCdx = gC_Cdx[angle]; + const float fCdy = gC_Cdy[angle]; + const float fCdz = gC_Cdz[angle]; + const float fCdc = gC_Cdc[angle]; + + const float fUNum = fCuc + fX * fCux + fY * fCuy + fZ * fCuz; + const float fVNum = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz; + const float fDen = fCdc + fX * fCdx + fY * fCdy + fZ * fCdz; + + const float fU = fUNum / fDen + 1.0f; + const float fV = fVNum / fDen + 1.0f; + + fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV); + + } + + volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal; + } + +} + +// supersampling version +__global__ void dev_cone_BP_SS(void* D_volData, unsigned int volPitch, int startAngle, const SDimensions3D dims) +{ + float* volData = (float*)D_volData; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + + // threadIdx: x = rel x + // y = rel y + + // blockIdx: x = x + y + // y = z + + + // TO TRY: precompute part of detector intersection formulas in shared mem? + // TO TRY: inner loop over z, gather ray values in shared mem + + const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; + const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + + if (X >= dims.iVolX) + return; + if (Y >= dims.iVolY) + return; + + const int startZ = blockIdx.y * g_volBlockZ; + int endZ = startZ + g_volBlockZ; + if (endZ > dims.iVolZ) + endZ = dims.iVolZ; + + float fX = X - 0.5f*dims.iVolX + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim; + float fY = Y - 0.5f*dims.iVolY + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim; + float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim; + const float fSubStep = 1.0f/dims.iRaysPerVoxelDim; + + for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f) + { + + float fVal = 0.0f; + float fAngle = startAngle + 0.5f; + + for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) + { + + const float fCux = gC_Cux[angle]; + const float fCuy = gC_Cuy[angle]; + const float fCuz = gC_Cuz[angle]; + const float fCuc = gC_Cuc[angle]; + const float fCvx = gC_Cvx[angle]; + const float fCvy = gC_Cvy[angle]; + const float fCvz = gC_Cvz[angle]; + const float fCvc = gC_Cvc[angle]; + const float fCdx = gC_Cdx[angle]; + const float fCdy = gC_Cdy[angle]; + const float fCdz = gC_Cdz[angle]; + const float fCdc = gC_Cdc[angle]; + + float fXs = fX; + for (int iSubX = 0; iSubX < dims.iRaysPerVoxelDim; ++iSubX) { + float fYs = fY; + for (int iSubY = 0; iSubY < dims.iRaysPerVoxelDim; ++iSubY) { + float fZs = fZ; + for (int iSubZ = 0; iSubZ < dims.iRaysPerVoxelDim; ++iSubZ) { + + const float fUNum = fCuc + fXs * fCux + fYs * fCuy + fZs * fCuz; + const float fVNum = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz; + const float fDen = fCdc + fXs * fCdx + fYs * fCdy + fZs * fCdz; + + const float fU = fUNum / fDen + 1.0f; + const float fV = fVNum / fDen + 1.0f; + + fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV); + + fZs += fSubStep; + } + fYs += fSubStep; + } + fXs += fSubStep; + } + + } + + volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal / (dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim); + } + +} + + +bool ConeBP_Array(cudaPitchedPtr D_volumeData, + cudaArray *D_projArray, + const SDimensions3D& dims, const SConeProjection* angles) +{ + bindProjDataTexture(D_projArray); + + + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(expr,name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = (expr) ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT( (angles[i].fDetSZ - angles[i].fSrcZ)*angles[i].fDetVY - (angles[i].fDetSY - angles[i].fSrcY)*angles[i].fDetVZ , Cux ); + TRANSFER_TO_CONSTANT( (angles[i].fDetSX - angles[i].fSrcX)*angles[i].fDetVZ -(angles[i].fDetSZ - angles[i].fSrcZ)*angles[i].fDetVX , Cuy ); + TRANSFER_TO_CONSTANT( (angles[i].fDetSY - angles[i].fSrcY)*angles[i].fDetVX - (angles[i].fDetSX - angles[i].fSrcX)*angles[i].fDetVY , Cuz ); + TRANSFER_TO_CONSTANT( (angles[i].fDetSY*angles[i].fDetVZ - angles[i].fDetSZ*angles[i].fDetVY)*angles[i].fSrcX - (angles[i].fDetSX*angles[i].fDetVZ - angles[i].fDetSZ*angles[i].fDetVX)*angles[i].fSrcY + (angles[i].fDetSX*angles[i].fDetVY - angles[i].fDetSY*angles[i].fDetVX)*angles[i].fSrcZ , Cuc ); + + TRANSFER_TO_CONSTANT( (angles[i].fDetSY - angles[i].fSrcY)*angles[i].fDetUZ-(angles[i].fDetSZ - angles[i].fSrcZ)*angles[i].fDetUY, Cvx ); + TRANSFER_TO_CONSTANT( (angles[i].fDetSZ - angles[i].fSrcZ)*angles[i].fDetUX - (angles[i].fDetSX - angles[i].fSrcX)*angles[i].fDetUZ , Cvy ); + TRANSFER_TO_CONSTANT((angles[i].fDetSX - angles[i].fSrcX)*angles[i].fDetUY-(angles[i].fDetSY - angles[i].fSrcY)*angles[i].fDetUX , Cvz ); + TRANSFER_TO_CONSTANT( -(angles[i].fDetSY*angles[i].fDetUZ - angles[i].fDetSZ*angles[i].fDetUY)*angles[i].fSrcX + (angles[i].fDetSX*angles[i].fDetUZ - angles[i].fDetSZ*angles[i].fDetUX)*angles[i].fSrcY - (angles[i].fDetSX*angles[i].fDetUY - angles[i].fDetSY*angles[i].fDetUX)*angles[i].fSrcZ , Cvc ); + + TRANSFER_TO_CONSTANT( angles[i].fDetUY*angles[i].fDetVZ - angles[i].fDetUZ*angles[i].fDetVY , Cdx ); + TRANSFER_TO_CONSTANT( angles[i].fDetUZ*angles[i].fDetVX - angles[i].fDetUX*angles[i].fDetVZ , Cdy ); + TRANSFER_TO_CONSTANT( angles[i].fDetUX*angles[i].fDetVY - angles[i].fDetUY*angles[i].fDetVX , Cdz ); + TRANSFER_TO_CONSTANT( -angles[i].fSrcX * (angles[i].fDetUY*angles[i].fDetVZ - angles[i].fDetUZ*angles[i].fDetVY) - angles[i].fSrcY * (angles[i].fDetUZ*angles[i].fDetVX - angles[i].fDetUX*angles[i].fDetVZ) - angles[i].fSrcZ * (angles[i].fDetUX*angles[i].fDetVY - angles[i].fDetUY*angles[i].fDetVX) , Cdc ); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + dim3 dimBlock(g_volBlockX, g_volBlockY); + + dim3 dimGrid(((dims.iVolX+g_volBlockX-1)/g_volBlockX)*((dims.iVolY+g_volBlockY-1)/g_volBlockY), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ); + + // timeval t; + // tic(t); + + for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) { + // printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr); + if (dims.iRaysPerVoxelDim == 1) + dev_cone_BP<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, dims); + else + dev_cone_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, dims); + } + + cudaTextForceKernelsCompletion(); + + // printf("%f\n", toc(t)); + + return true; +} + +bool ConeBP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SConeProjection* angles) +{ + // transfer projections to array + + cudaArray* cuArray = allocateProjectionArray(dims); + transferProjectionsToArray(D_projData, cuArray, dims); + + bool ret = ConeBP_Array(D_volumeData, cuArray, dims, angles); + + cudaFreeArray(cuArray); + + return ret; +} + + +} + +#ifdef STANDALONE +int main() +{ + SDimensions3D dims; + dims.iVolX = 256; + dims.iVolY = 256; + dims.iVolZ = 256; + dims.iProjAngles = 180; + dims.iProjU = 512; + dims.iProjV = 512; + dims.iRaysPerDet = 1; + + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPitchedPtr volData; // pitch, ptr, xsize, ysize + + cudaMalloc3D(&volData, extentV); + + cudaExtent extentP; + extentP.width = dims.iProjU*sizeof(float); + extentP.height = dims.iProjAngles; + extentP.depth = dims.iProjV; + + cudaPitchedPtr projData; // pitch, ptr, xsize, ysize + + cudaMalloc3D(&projData, extentP); + cudaMemset3D(projData, 0, extentP); + + float* slice = new float[256*256]; + cudaPitchedPtr ptr; + ptr.ptr = slice; + ptr.pitch = 256*sizeof(float); + ptr.xsize = 256*sizeof(float); + ptr.ysize = 256; + + for (unsigned int i = 0; i < 256*256; ++i) + slice[i] = 1.0f; + for (unsigned int i = 0; i < 256; ++i) { + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, i }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaMemcpy3D(&p); +#if 0 + if (i == 128) { + for (unsigned int j = 0; j < 256*256; ++j) + slice[j] = 0.0f; + } +#endif + } + + + SConeProjection angle[180]; + angle[0].fSrcX = -1536; + angle[0].fSrcY = 0; + angle[0].fSrcZ = 0; + + angle[0].fDetSX = 512; + angle[0].fDetSY = -256; + angle[0].fDetSZ = -256; + + angle[0].fDetUX = 0; + angle[0].fDetUY = 1; + angle[0].fDetUZ = 0; + + angle[0].fDetVX = 0; + angle[0].fDetVY = 0; + angle[0].fDetVZ = 1; + +#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0) + for (int i = 1; i < 180; ++i) { + angle[i] = angle[0]; + ROTATE0(Src, i, i*2*M_PI/180); + ROTATE0(DetS, i, i*2*M_PI/180); + ROTATE0(DetU, i, i*2*M_PI/180); + ROTATE0(DetV, i, i*2*M_PI/180); + } +#undef ROTATE0 + + astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f); +#if 0 + float* bufs = new float[180*512]; + + for (int i = 0; i < 512; ++i) { + cudaMemcpy(bufs, ((float*)projData.ptr)+180*512*i, 180*512*sizeof(float), cudaMemcpyDeviceToHost); + + printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize); + + char fname[20]; + sprintf(fname, "sino%03d.png", i); + saveImage(fname, 180, 512, bufs); + } + + float* bufp = new float[512*512]; + + for (int i = 0; i < 180; ++i) { + for (int j = 0; j < 512; ++j) { + cudaMemcpy(bufp+512*j, ((float*)projData.ptr)+180*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost); + } + + char fname[20]; + sprintf(fname, "proj%03d.png", i); + saveImage(fname, 512, 512, bufp); + } +#endif + for (unsigned int i = 0; i < 256*256; ++i) + slice[i] = 0.0f; + for (unsigned int i = 0; i < 256; ++i) { + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, i }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaMemcpy3D(&p); + } + + astraCUDA3d::ConeBP(volData, projData, dims, angle); +#if 0 + float* buf = new float[256*256]; + + for (int i = 0; i < 256; ++i) { + cudaMemcpy(buf, ((float*)volData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost); + + printf("%d %d %d\n", volData.pitch, volData.xsize, volData.ysize); + + char fname[20]; + sprintf(fname, "vol%03d.png", i); + saveImage(fname, 256, 256, buf); + } +#endif + +} +#endif diff --git a/cuda/3d/cone_bp.h b/cuda/3d/cone_bp.h new file mode 100644 index 0000000..c77714e --- /dev/null +++ b/cuda/3d/cone_bp.h @@ -0,0 +1,45 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_CONE_BP_H +#define _CUDA_CONE_BP_H + +namespace astraCUDA3d { + +_AstraExport bool ConeBP_Array(cudaPitchedPtr D_volumeData, + cudaArray *D_projArray, + const SDimensions3D& dims, const SConeProjection* angles); + +_AstraExport bool ConeBP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SConeProjection* angles); + + +} + +#endif diff --git a/cuda/3d/cone_fp.cu b/cuda/3d/cone_fp.cu new file mode 100644 index 0000000..40dca4f --- /dev/null +++ b/cuda/3d/cone_fp.cu @@ -0,0 +1,513 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> +#include "util3d.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +#include "dims3d.h" + +typedef texture<float, 3, cudaReadModeElementType> texture3D; + +static texture3D gT_coneVolumeTexture; + +namespace astraCUDA3d { + +static const unsigned int g_anglesPerBlock = 4; + +// thickness of the slices we're splitting the volume up into +static const unsigned int g_blockSlices = 64; +static const unsigned int g_detBlockU = 32; +static const unsigned int g_detBlockV = 32; + +static const unsigned g_MaxAngles = 1024; +__constant__ float gC_SrcX[g_MaxAngles]; +__constant__ float gC_SrcY[g_MaxAngles]; +__constant__ float gC_SrcZ[g_MaxAngles]; +__constant__ float gC_DetSX[g_MaxAngles]; +__constant__ float gC_DetSY[g_MaxAngles]; +__constant__ float gC_DetSZ[g_MaxAngles]; +__constant__ float gC_DetUX[g_MaxAngles]; +__constant__ float gC_DetUY[g_MaxAngles]; +__constant__ float gC_DetUZ[g_MaxAngles]; +__constant__ float gC_DetVX[g_MaxAngles]; +__constant__ float gC_DetVY[g_MaxAngles]; +__constant__ float gC_DetVZ[g_MaxAngles]; + + +bool bindVolumeDataTexture(const cudaArray* array) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + + gT_coneVolumeTexture.addressMode[0] = cudaAddressModeClamp; + gT_coneVolumeTexture.addressMode[1] = cudaAddressModeClamp; + gT_coneVolumeTexture.addressMode[2] = cudaAddressModeClamp; + gT_coneVolumeTexture.filterMode = cudaFilterModeLinear; + gT_coneVolumeTexture.normalized = false; + + cudaBindTextureToArray(gT_coneVolumeTexture, array, channelDesc); + + // TODO: error value? + + return true; +} + + // threadIdx: x = ??? detector (u?) + // y = relative angle + + // blockIdx: x = ??? detector (u+v?) + // y = angle block + + +#define CONE_FP_BODY(c0,c1,c2) \ + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \ + if (angle >= endAngle) \ + return; \ + \ + const float fSrcX = gC_SrcX[angle]; \ + const float fSrcY = gC_SrcY[angle]; \ + const float fSrcZ = gC_SrcZ[angle]; \ + const float fDetUX = gC_DetUX[angle]; \ + const float fDetUY = gC_DetUY[angle]; \ + const float fDetUZ = gC_DetUZ[angle]; \ + const float fDetVX = gC_DetVX[angle]; \ + const float fDetVY = gC_DetVY[angle]; \ + const float fDetVZ = gC_DetVZ[angle]; \ + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \ + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \ + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \ + \ + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \ + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \ + int endDetectorV = startDetectorV + g_detBlockV; \ + if (endDetectorV > dims.iProjV) \ + endDetectorV = dims.iProjV; \ + \ + int endSlice = startSlice + g_blockSlices; \ + if (endSlice > dims.iVol##c0) \ + endSlice = dims.iVol##c0; \ + \ + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \ + { \ + /* Trace ray from Src to (detectorU,detectorV) from */ \ + /* X = startSlice to X = endSlice */ \ + \ + const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; \ + const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; \ + const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; \ + \ + /* (x) ( 1) ( 0) */ \ + /* ray: (y) = (ay) * x + (by) */ \ + /* (z) (az) (bz) */ \ + \ + const float a##c1 = (fSrc##c1 - fDet##c1) / (fSrc##c0 - fDet##c0); \ + const float a##c2 = (fSrc##c2 - fDet##c2) / (fSrc##c0 - fDet##c0); \ + const float b##c1 = fSrc##c1 - a##c1 * fSrc##c0; \ + const float b##c2 = fSrc##c2 - a##c2 * fSrc##c0; \ + \ + const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \ + \ + float fVal = 0.0f; \ + \ + float f##c0 = startSlice + 1.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f; \ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f; \ + \ + for (int s = startSlice; s < endSlice; ++s) \ + { \ + fVal += tex3D(gT_coneVolumeTexture, fX, fY, fZ); \ + f##c0 += 1.0f; \ + f##c1 += a##c1; \ + f##c2 += a##c2; \ + } \ + \ + fVal *= fDistCorr; \ + \ + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; \ + } + +#define CONE_FP_SS_BODY(c0,c1,c2) \ + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \ + if (angle >= endAngle) \ + return; \ + \ + const float fSrcX = gC_SrcX[angle]; \ + const float fSrcY = gC_SrcY[angle]; \ + const float fSrcZ = gC_SrcZ[angle]; \ + const float fDetUX = gC_DetUX[angle]; \ + const float fDetUY = gC_DetUY[angle]; \ + const float fDetUZ = gC_DetUZ[angle]; \ + const float fDetVX = gC_DetVX[angle]; \ + const float fDetVY = gC_DetVY[angle]; \ + const float fDetVZ = gC_DetVZ[angle]; \ + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \ + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \ + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \ + \ + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \ + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \ + int endDetectorV = startDetectorV + g_detBlockV; \ + if (endDetectorV > dims.iProjV) \ + endDetectorV = dims.iProjV; \ + \ + int endSlice = startSlice + g_blockSlices; \ + if (endSlice > dims.iVolX) \ + endSlice = dims.iVolX; \ + \ + const float fSubStep = 1.0f/dims.iRaysPerDetDim; \ + \ + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \ + { \ + /* Trace ray from Src to (detectorU,detectorV) from */ \ + /* X = startSlice to X = endSlice */ \ + \ + float fV = 0.0f; \ + \ + float fdU = detectorU - 0.5f + 0.5f*fSubStep; \ + for (int iSubU = 0; iSubU < dims.iRaysPerDetDim; ++iSubU, fdU+=fSubStep) { \ + float fdV = detectorV - 0.5f + 0.5f*fSubStep; \ + for (int iSubV = 0; iSubV < dims.iRaysPerDetDim; ++iSubV, fdV+=fSubStep) { \ + \ + const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX; \ + const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY; \ + const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ; \ + \ + /* (x) ( 1) ( 0) */ \ + /* ray: (y) = (ay) * x + (by) */ \ + /* (z) (az) (bz) */ \ + \ + const float a##c1 = (fSrc##c1 - fDet##c1) / (fSrc##c0 - fDet##c0); \ + const float a##c2 = (fSrc##c2 - fDet##c2) / (fSrc##c0 - fDet##c0); \ + const float b##c1 = fSrc##c1 - a##c1 * fSrc##c0; \ + const float b##c2 = fSrc##c2 - a##c2 * fSrc##c0; \ + \ + const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \ + \ + float fVal = 0.0f; \ + \ + float f##c0 = startSlice + 1.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f; \ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f; \ + \ + for (int s = startSlice; s < endSlice; ++s) \ + { \ + fVal += tex3D(gT_coneVolumeTexture, fX, fY, fZ); \ + f##c0 += 1.0f; \ + f##c1 += a##c1; \ + f##c2 += a##c2; \ + } \ + \ + fVal *= fDistCorr; \ + fV += fVal; \ + \ + } \ + } \ + \ + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (dims.iRaysPerDetDim * dims.iRaysPerDetDim);\ + } + + + + + +__global__ void FP_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +CONE_FP_BODY(X,Y,Z) +} + +__global__ void FP_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +CONE_FP_BODY(Y,X,Z) +} + +__global__ void FP_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +CONE_FP_BODY(Z,X,Y) +} + + +__global__ void FP_SS_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +CONE_FP_SS_BODY(X,Y,Z) +} + +__global__ void FP_SS_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +CONE_FP_SS_BODY(Y,X,Z) +} + +__global__ void FP_SS_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +CONE_FP_SS_BODY(Z,X,Y) +} + + + +bool ConeFP_Array(cudaArray *D_volArray, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SConeProjection* angles, + float fOutputScale) +{ + bindVolumeDataTexture(D_volArray); + + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(SrcX); + TRANSFER_TO_CONSTANT(SrcY); + TRANSFER_TO_CONSTANT(SrcZ); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetSZ); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + TRANSFER_TO_CONSTANT(DetUZ); + TRANSFER_TO_CONSTANT(DetVX); + TRANSFER_TO_CONSTANT(DetVY); + TRANSFER_TO_CONSTANT(DetVZ); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + std::list<cudaStream_t> streams; + dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles + + // Run over all angles, grouping them into groups of the same + // orientation (roughly horizontal vs. roughly vertical). + // Start a stream of grids for each such group. + + unsigned int blockStart = 0; + unsigned int blockEnd = 0; + int blockDirection = 0; + + // timeval t; + // tic(t); + + for (unsigned int a = 0; a <= dims.iProjAngles; ++a) { + int dir; + if (a != dims.iProjAngles) { + float dX = fabsf(angles[a].fSrcX - (angles[a].fDetSX + dims.iProjU*angles[a].fDetUX*0.5f + dims.iProjV*angles[a].fDetVX*0.5f)); + float dY = fabsf(angles[a].fSrcY - (angles[a].fDetSY + dims.iProjU*angles[a].fDetUY*0.5f + dims.iProjV*angles[a].fDetVY*0.5f)); + float dZ = fabsf(angles[a].fSrcZ - (angles[a].fDetSZ + dims.iProjU*angles[a].fDetUZ*0.5f + dims.iProjV*angles[a].fDetVZ*0.5f)); + + if (dX >= dY && dX >= dZ) + dir = 0; + else if (dY >= dX && dY >= dZ) + dir = 1; + else + dir = 2; + } + + if (a == dims.iProjAngles || dir != blockDirection) { + // block done + + blockEnd = a; + if (blockStart != blockEnd) { + + dim3 dimGrid( + ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV), +(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock); + // TODO: check if we can't immediately + // destroy the stream after use + cudaStream_t stream; + cudaStreamCreate(&stream); + streams.push_back(stream); + + // printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y); + + if (blockDirection == 0) { + for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + FP_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else + FP_SS_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + } else if (blockDirection == 1) { + for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + FP_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else + FP_SS_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + } else if (blockDirection == 2) { + for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + FP_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else + FP_SS_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + } + + } + + blockDirection = dir; + blockStart = a; + } + } + + for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) + cudaStreamDestroy(*iter); + + streams.clear(); + + cudaTextForceKernelsCompletion(); + + // printf("%f\n", toc(t)); + + return true; +} + +bool ConeFP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SConeProjection* angles, + float fOutputScale) +{ + // transfer volume to array + + cudaArray* cuArray = allocateVolumeArray(dims); + transferVolumeToArray(D_volumeData, cuArray, dims); + + bool ret = ConeFP_Array(cuArray, D_projData, dims, angles, fOutputScale); + + cudaFreeArray(cuArray); + + return ret; +} + + +} + +#ifdef STANDALONE +int main() +{ + SDimensions3D dims; + dims.iVolX = 256; + dims.iVolY = 256; + dims.iVolZ = 256; + dims.iProjAngles = 32; + dims.iProjU = 512; + dims.iProjV = 512; + dims.iRaysPerDet = 1; + + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPitchedPtr volData; // pitch, ptr, xsize, ysize + + cudaMalloc3D(&volData, extentV); + + cudaExtent extentP; + extentP.width = dims.iProjU*sizeof(float); + extentP.height = dims.iProjV; + extentP.depth = dims.iProjAngles; + + cudaPitchedPtr projData; // pitch, ptr, xsize, ysize + + cudaMalloc3D(&projData, extentP); + cudaMemset3D(projData, 0, extentP); + + float* slice = new float[256*256]; + cudaPitchedPtr ptr; + ptr.ptr = slice; + ptr.pitch = 256*sizeof(float); + ptr.xsize = 256*sizeof(float); + ptr.ysize = 256; + + for (unsigned int i = 0; i < 256*256; ++i) + slice[i] = 1.0f; + for (unsigned int i = 0; i < 256; ++i) { + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, i }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaError err = cudaMemcpy3D(&p); + assert(!err); + } + + + SConeProjection angle[32]; + angle[0].fSrcX = -1536; + angle[0].fSrcY = 0; + angle[0].fSrcZ = 200; + + angle[0].fDetSX = 512; + angle[0].fDetSY = -256; + angle[0].fDetSZ = -256; + + angle[0].fDetUX = 0; + angle[0].fDetUY = 1; + angle[0].fDetUZ = 0; + + angle[0].fDetVX = 0; + angle[0].fDetVY = 0; + angle[0].fDetVZ = 1; + +#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0) + for (int i = 1; i < 32; ++i) { + angle[i] = angle[0]; + ROTATE0(Src, i, i*1*M_PI/180); + ROTATE0(DetS, i, i*1*M_PI/180); + ROTATE0(DetU, i, i*1*M_PI/180); + ROTATE0(DetV, i, i*1*M_PI/180); + } +#undef ROTATE0 + + astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f); + + float* buf = new float[512*512]; + + cudaMemcpy(buf, ((float*)projData.ptr)+512*512*8, 512*512*sizeof(float), cudaMemcpyDeviceToHost); + + printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize); + + saveImage("proj.png", 512, 512, buf); + + +} +#endif diff --git a/cuda/3d/cone_fp.h b/cuda/3d/cone_fp.h new file mode 100644 index 0000000..2a0463b --- /dev/null +++ b/cuda/3d/cone_fp.h @@ -0,0 +1,46 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_CONE_FP_H +#define _CUDA_CONE_FP_H + +namespace astraCUDA3d { + +_AstraExport bool ConeFP_Array(cudaArray *D_volArray, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SConeProjection* angles, + float fOutputScale); + +_AstraExport bool ConeFP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SConeProjection* angles, + float fOutputScale); + +} + +#endif diff --git a/cuda/3d/darthelper3d.cu b/cuda/3d/darthelper3d.cu new file mode 100644 index 0000000..68330a1 --- /dev/null +++ b/cuda/3d/darthelper3d.cu @@ -0,0 +1,229 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include "util3d.h" +#include "dims3d.h" +#include "darthelper3d.h" +#include <cassert> + +namespace astraCUDA3d { + + + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------- + __global__ void devDartSmoothing(cudaPitchedPtr out, cudaPitchedPtr in, float b, SDimensions3D dims) + { + unsigned int x = threadIdx.x + 16*blockIdx.x; + unsigned int y = threadIdx.y + 16*blockIdx.y; + + // Sacrifice the border pixels to simplify the implementation. + if (x > 0 && x < dims.iVolX - 1 && y > 0 && y < dims.iVolY - 1) { + + float* d = (float*)in.ptr; + float* m = (float*)out.ptr; + + unsigned int index; + unsigned int p = (out.pitch >> 2); + + for (unsigned int z = 0; z <= dims.iVolZ-1; z++) { + + float res = 0.0f; + + // bottom slice + if (z > 0) { + index = ((z-1)*dims.iVolY + y) * p + x; + res += d[index-p-1] + d[index-p] + d[index-p+1] + + d[index -1] + d[index ] + d[index +1] + + d[index+p-1] + d[index+p] + d[index+p+1]; + } + + // top slice + if (z < dims.iVolZ-1) { + index = ((z+1)*dims.iVolY + y) * p + x; + res += d[index-p-1] + d[index-p] + d[index-p+1] + + d[index -1] + d[index ] + d[index +1] + + d[index+p-1] + d[index+p] + d[index+p+1]; + } + + // same slice + index = (z*dims.iVolY + y) * p + x; + res += d[index-p-1] + d[index-p] + d[index-p+1] + + d[index -1] + d[index +1] + + d[index+p-1] + d[index+p] + d[index+p+1]; + + // result + m[index] = (1.0f-b) * d[index] + b * 0.038461538f * res; + + } + + } + } + + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------- + void dartSmoothing(float* out, const float* in, float b, unsigned int radius, SDimensions3D dims) + { + cudaPitchedPtr D_inData; + D_inData = allocateVolumeData(dims); + copyVolumeToDevice(in, D_inData, dims); + + cudaPitchedPtr D_outData; + D_outData = allocateVolumeData(dims); + copyVolumeToDevice(out, D_outData, dims); + + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+15)/16); + + devDartSmoothing<<<gridSize, blockSize>>>(D_outData, D_inData, b, dims); + + copyVolumeFromDevice(out, D_outData, dims); + + cudaFree(D_outData.ptr); + cudaFree(D_inData.ptr); + + } + + + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------- + // CUDA function for the masking of DART with a radius == 1 + __global__ void devDartMasking(cudaPitchedPtr mask, cudaPitchedPtr in, unsigned int conn, SDimensions3D dims) + { + unsigned int x = threadIdx.x + 16*blockIdx.x; + unsigned int y = threadIdx.y + 16*blockIdx.y; + + // Sacrifice the border pixels to simplify the implementation. + if (x > 0 && x < dims.iVolX - 1 && y > 0 && y < dims.iVolY - 1) { + + float* d = (float*)in.ptr; + float* m = (float*)mask.ptr; + + unsigned int index; + unsigned int p = (in.pitch >> 2); + + for (unsigned int z = 0; z <= dims.iVolZ-1; z++) { + + unsigned int o2 = (z*dims.iVolY + y) * p + x; + + m[o2] = 0.0f; + + // bottom slice + if (z > 0) { + index = ((z-1)*dims.iVolY + y) * p + x; + if ((conn == 26 && + (d[index-p-1] != d[o2] || d[index-p] != d[o2] || d[index-p+1] != d[o2] || + d[index -1] != d[o2] || d[index ] != d[o2] || d[index +1] != d[o2] || + d[index+p-1] != d[o2] || d[index+p] != d[o2] || d[index+p+1] != d[o2] )) + || + (conn == 6 && d[index] != d[o2])) + { + m[o2] = 1.0f; + continue; + } + } + + // top slice + if (z < dims.iVolZ-1) { + index = ((z+1)*dims.iVolY + y) * p + x; + if ((conn == 26 && + (d[index-p-1] != d[o2] || d[index-p] != d[o2] || d[index-p+1] != d[o2] || + d[index -1] != d[o2] || d[index ] != d[o2] || d[index +1] != d[o2] || + d[index+p-1] != d[o2] || d[index+p] != d[o2] || d[index+p+1] != d[o2] )) + || + (conn == 6 && d[index] != d[o2])) + { + m[o2] = 1.0f; + continue; + } + } + + // other slices + index = (z*dims.iVolY + y) * p + x; + if ((conn == 26 && + (d[index-p-1] != d[o2] || d[index-p] != d[o2] || d[index-p+1] != d[o2] || + d[index -1] != d[o2] || d[index +1] != d[o2] || + d[index+p-1] != d[o2] || d[index+p] != d[o2] || d[index+p+1] != d[o2] )) + || + (conn == 6 && + ( d[index-p] != d[o2] || + d[index -1] != d[o2] || d[index +1] != d[o2] || + d[index+p] != d[o2] ))) + { + m[o2] = 1.0f; + continue; + } + + } + + } + } + + + + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------- + void dartMasking(float* mask, const float* segmentation, unsigned int conn, unsigned int radius, unsigned int threshold, SDimensions3D dims) + { + cudaPitchedPtr D_maskData; + D_maskData = allocateVolumeData(dims); + copyVolumeToDevice(mask, D_maskData, dims); + + cudaPitchedPtr D_segmentationData; + D_segmentationData = allocateVolumeData(dims); + copyVolumeToDevice(segmentation, D_segmentationData, dims); + + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+15)/16); + + if (threshold == 1 && radius == 1) + devDartMasking<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, dims); + //else if (threshold > 1 && radius == 1) + // devADartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, threshold, pitch, width, height, 1, 1); + //else if (threshold == 1 && radius > 1) + // devDartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, pitch, width, height, 1, 1); + //else + // devADartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height, 1, 1); + + copyVolumeFromDevice(mask, D_maskData, dims); + + cudaFree(D_maskData.ptr); + cudaFree(D_segmentationData.ptr); + + } + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------- + + bool setGPUIndex(int iGPUIndex) + { + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + return true; + } + + +} diff --git a/cuda/3d/darthelper3d.h b/cuda/3d/darthelper3d.h new file mode 100644 index 0000000..7899629 --- /dev/null +++ b/cuda/3d/darthelper3d.h @@ -0,0 +1,46 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_DARTHELPER3_H +#define _CUDA_DARTHELPER3_H + +#include <cuda.h> +#include <driver_types.h> +#include "util3d.h" +#include "algo3d.h" + +namespace astraCUDA3d { + + void dartSmoothing(float* out, const float* in, float b, unsigned int radius, SDimensions3D dims); + void dartMasking(float* out, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, SDimensions3D dims); + + bool setGPUIndex(int index); + +} + +#endif diff --git a/cuda/3d/dims3d.h b/cuda/3d/dims3d.h new file mode 100644 index 0000000..ec3c4a3 --- /dev/null +++ b/cuda/3d/dims3d.h @@ -0,0 +1,84 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_CONE_DIMS_H +#define _CUDA_CONE_DIMS_H + +namespace astra { + +struct SConeProjection { + // the source + double fSrcX, fSrcY, fSrcZ; + + // the origin ("bottom left") of the (flat-panel) detector + double fDetSX, fDetSY, fDetSZ; + + // the U-edge of a detector pixel + double fDetUX, fDetUY, fDetUZ; + + // the V-edge of a detector pixel + double fDetVX, fDetVY, fDetVZ; +}; + +struct SPar3DProjection { + // the ray direction + double fRayX, fRayY, fRayZ; + + // the origin ("bottom left") of the (flat-panel) detector + double fDetSX, fDetSY, fDetSZ; + + // the U-edge of a detector pixel + double fDetUX, fDetUY, fDetUZ; + + // the V-edge of a detector pixel + double fDetVX, fDetVY, fDetVZ; +}; + +} + + +namespace astraCUDA3d { + +using astra::SConeProjection; +using astra::SPar3DProjection; + +struct SDimensions3D { + unsigned int iVolX; + unsigned int iVolY; + unsigned int iVolZ; + unsigned int iProjAngles; + unsigned int iProjU; // number of detectors in the U direction + unsigned int iProjV; // number of detectors in the V direction + unsigned int iRaysPerDetDim; + unsigned int iRaysPerVoxelDim; +}; + +} + +#endif + diff --git a/cuda/3d/fdk.cu b/cuda/3d/fdk.cu new file mode 100644 index 0000000..ad0604c --- /dev/null +++ b/cuda/3d/fdk.cu @@ -0,0 +1,646 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> +#include "util3d.h" + +#ifdef STANDALONE +#include "cone_fp.h" +#include "testutil.h" +#endif + +#include "dims3d.h" +#include "../2d/fft.h" + +typedef texture<float, 3, cudaReadModeElementType> texture3D; + +static texture3D gT_coneProjTexture; + +namespace astraCUDA3d { + +static const unsigned int g_volBlockZ = 16; + +static const unsigned int g_anglesPerBlock = 64; +static const unsigned int g_volBlockX = 32; +static const unsigned int g_volBlockY = 16; + +static const unsigned int g_anglesPerWeightBlock = 16; +static const unsigned int g_detBlockU = 32; +static const unsigned int g_detBlockV = 32; + +static const unsigned g_MaxAngles = 2048; + +__constant__ float gC_angle_sin[g_MaxAngles]; +__constant__ float gC_angle_cos[g_MaxAngles]; +__constant__ float gC_angle[g_MaxAngles]; + + +// per-detector u/v shifts? + +static bool bindProjDataTexture(const cudaArray* array) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + + gT_coneProjTexture.addressMode[0] = cudaAddressModeClamp; + gT_coneProjTexture.addressMode[1] = cudaAddressModeClamp; + gT_coneProjTexture.addressMode[2] = cudaAddressModeClamp; + gT_coneProjTexture.filterMode = cudaFilterModeLinear; + gT_coneProjTexture.normalized = false; + + cudaBindTextureToArray(gT_coneProjTexture, array, channelDesc); + + // TODO: error value? + + return true; +} + + +__global__ void devBP_FDK(void* D_volData, unsigned int volPitch, int startAngle, float fSrcOrigin, float fDetOrigin, float fSrcZ, float fDetZ, float fInvDetUSize, float fInvDetVSize, const SDimensions3D dims) +{ + float* volData = (float*)D_volData; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + + // threadIdx: x = rel x + // y = rel y + + // blockIdx: x = x + y + // y = z + + + // TO TRY: precompute part of detector intersection formulas in shared mem? + // TO TRY: inner loop over z, gather ray values in shared mem + + const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; + const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + + if (X > dims.iVolX) + return; + if (Y > dims.iVolY) + return; + + const int startZ = blockIdx.y * g_volBlockZ; + int endZ = startZ + g_volBlockZ; + if (endZ > dims.iVolZ) + endZ = dims.iVolZ; + + float fX = X - 0.5f*dims.iVolX + 0.5f; + float fY = Y - 0.5f*dims.iVolY + 0.5f; + float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - fSrcZ; + + const float fU_base = 0.5f*dims.iProjU - 0.5f + 1.5f; + const float fV_base = 0.5f*dims.iProjV - 0.5f + 1.5f + (fDetZ-fSrcZ); + + // Note re. fZ/rV_base: the computations below are all relative to the + // optical axis, so we do the Z-adjustments beforehand. + + for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f) + { + + float fVal = 0.0f; + float fAngle = startAngle + 0.5f; + + for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) + { + + const float cos_theta = gC_angle_cos[angle]; + const float sin_theta = gC_angle_sin[angle]; + + const float fR = fSrcOrigin; + const float fD = fR - fX * sin_theta + fY * cos_theta; + float fWeight = fR / fD; + fWeight *= fWeight; + + const float fScaleFactor = (fR + fDetOrigin) / fD; + const float fU = fU_base + (fX*cos_theta+fY*sin_theta) * fScaleFactor * fInvDetUSize; + const float fV = fV_base + fZ * fScaleFactor * fInvDetVSize; + + fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV); + + } + + volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal; +// projData[(angle*dims.iProjV+detectorV)*projPitch+detectorU] = 10.0f; +// if (threadIdx.x == 0 && threadIdx.y == 0) { printf("%d,%d,%d [%d / %d] -> %f\n", angle, detectorU, detectorV, (angle*dims.iProjV+detectorV)*projPitch+detectorU, projPitch, projData[(angle*dims.iProjV+detectorV)*projPitch+detectorU]); } + } + +} + + +bool FDK_BP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + float fSrcOrigin, float fDetOrigin, + float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize, + const SDimensions3D& dims, const float* angles) +{ + // transfer projections to array + + cudaArray* cuArray = allocateProjectionArray(dims); + transferProjectionsToArray(D_projData, cuArray, dims); + + bindProjDataTexture(cuArray); + + float* angle_sin = new float[dims.iProjAngles]; + float* angle_cos = new float[dims.iProjAngles]; + + for (unsigned int i = 0; i < dims.iProjAngles; ++i) { + angle_sin[i] = sinf(angles[i]); + angle_cos[i] = cosf(angles[i]); + } + cudaError_t e1 = cudaMemcpyToSymbol(gC_angle_sin, angle_sin, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + cudaError_t e2 = cudaMemcpyToSymbol(gC_angle_cos, angle_cos, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + assert(e1 == cudaSuccess); + assert(e2 == cudaSuccess); + + delete[] angle_sin; + delete[] angle_cos; + + dim3 dimBlock(g_volBlockX, g_volBlockY); + + dim3 dimGrid(((dims.iVolX+g_volBlockX-1)/g_volBlockX)*((dims.iVolY+g_volBlockY-1)/g_volBlockY), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ); + + // timeval t; + // tic(t); + + for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) { + devBP_FDK<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, 1.0f / fDetUSize, 1.0f / fDetVSize, dims); + } + + cudaTextForceKernelsCompletion(); + + cudaFreeArray(cuArray); + + // printf("%f\n", toc(t)); + + return true; +} + +__global__ void devFDK_preweight(void* D_projData, unsigned int projPitch, unsigned int startAngle, unsigned int endAngle, float fSrcOrigin, float fDetOrigin, float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize, const SDimensions3D dims) +{ + float* projData = (float*)D_projData; + int angle = startAngle + blockIdx.y * g_anglesPerWeightBlock + threadIdx.y; + if (angle >= endAngle) + return; + + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; + int endDetectorV = startDetectorV + g_detBlockV; + if (endDetectorV > dims.iProjV) + endDetectorV = dims.iProjV; + + // We need the length of the central ray and the length of the ray(s) to + // our detector pixel(s). + + const float fCentralRayLength = fSrcOrigin + fDetOrigin; + + const float fU = (detectorU - 0.5f*dims.iProjU + 0.5f) * fDetUSize; + + const float fT = fCentralRayLength * fCentralRayLength + fU * fU; + + float fV = (startDetectorV - 0.5f*dims.iProjV + 0.5f) * fDetVSize + fDetZ - fSrcZ; + + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) + { + const float fRayLength = sqrtf(fT + fV * fV); + + const float fWeight = fCentralRayLength / fRayLength; + + projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] *= fWeight; + + fV += 1.0f; + } +} + +__global__ void devFDK_ParkerWeight(void* D_projData, unsigned int projPitch, unsigned int startAngle, unsigned int endAngle, float fSrcOrigin, float fDetOrigin, float fSrcZ, float fDetZ, float fDetUSize, float fCentralFanAngle, const SDimensions3D dims) +{ + float* projData = (float*)D_projData; + int angle = startAngle + blockIdx.y * g_anglesPerWeightBlock + threadIdx.y; + if (angle >= endAngle) + return; + + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; + int endDetectorV = startDetectorV + g_detBlockV; + if (endDetectorV > dims.iProjV) + endDetectorV = dims.iProjV; + + // We need the length of the central ray and the length of the projection + // of our ray onto the central slice + + const float fCentralRayLength = fSrcOrigin + fDetOrigin; + + // TODO: Detector pixel size + const float fU = (detectorU - 0.5f*dims.iProjU + 0.5f) * fDetUSize; + + //const float fGamma = atanf(fU / fCentralRayLength); + //const float fBeta = gC_angle[angle]; + const float fGamma = atanf(fU / fCentralRayLength); + const float fBeta = -gC_angle[angle]; + + // compute the weight depending on the location in the central fan's radon + // space + float fWeight; + + if (fBeta <= 0.0f) { + fWeight = 0.0f; + } else if (fBeta <= 2.0f*(fCentralFanAngle + fGamma)) { + fWeight = sinf((M_PI / 4.0f) * fBeta / (fCentralFanAngle + fGamma)); + fWeight *= fWeight; + } else if (fBeta <= M_PI + 2*fGamma) { + fWeight = 1.0f; + } else if (fBeta <= M_PI + 2*fCentralFanAngle) { + fWeight = sinf((M_PI / 4.0f) * (M_PI + 2.0f*fCentralFanAngle - fBeta) / (fCentralFanAngle - fGamma)); + fWeight *= fWeight; + } else { + fWeight = 0.0f; + } + + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) + { + + projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] *= fWeight; + + } +} + + + +// Perform the FDK pre-weighting and filtering +bool FDK_Filter(cudaPitchedPtr D_projData, + cufftComplex * D_filter, + float fSrcOrigin, float fDetOrigin, + float fSrcZ, float fDetZ, + float fDetUSize, float fDetVSize, bool bShortScan, + const SDimensions3D& dims, const float* angles) +{ + // The pre-weighting factor for a ray is the cosine of the angle between + // the central line and the ray. + + dim3 dimBlock(g_detBlockU, g_anglesPerWeightBlock); + dim3 dimGrid( ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV), + (dims.iProjAngles+g_anglesPerWeightBlock-1)/g_anglesPerWeightBlock); + + int projPitch = D_projData.pitch/sizeof(float); + + devFDK_preweight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, fDetUSize, fDetVSize, dims); + + cudaTextForceKernelsCompletion(); + + if (bShortScan) { + // We do short-scan Parker weighting + + cudaError_t e1 = cudaMemcpyToSymbol(gC_angle, angles, + dims.iProjAngles*sizeof(float), 0, + cudaMemcpyHostToDevice); + assert(!e1); + + // TODO: detector pixel size! + float fCentralFanAngle = atanf((dims.iProjU*0.5f) / + (fSrcOrigin + fDetOrigin)); + + devFDK_ParkerWeight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, fDetUSize, fCentralFanAngle, dims); + + } + + cudaTextForceKernelsCompletion(); + + + // The filtering is a regular ramp filter per detector line. + + int iPaddedDetCount = calcNextPowerOfTwo(2 * dims.iProjU); + int iHalfFFTSize = calcFFTFourSize(iPaddedDetCount); + + + + // We process one sinogram at a time. + float* D_sinoData = (float*)D_projData.ptr; + + cufftComplex * D_sinoFFT = NULL; + allocateComplexOnDevice(dims.iProjAngles, iHalfFFTSize, &D_sinoFFT); + + bool ok = true; + + for (int v = 0; v < dims.iProjV; ++v) { + + ok = runCudaFFT(dims.iProjAngles, D_sinoData, projPitch, 0, + dims.iProjU, iPaddedDetCount, iHalfFFTSize, + D_sinoFFT); + + if (!ok) break; + + applyFilter(dims.iProjAngles, iHalfFFTSize, D_sinoFFT, D_filter); + + + ok = runCudaIFFT(dims.iProjAngles, D_sinoFFT, D_sinoData, projPitch, + 0, dims.iProjU, iPaddedDetCount, iHalfFFTSize); + + if (!ok) break; + + D_sinoData += (dims.iProjAngles * projPitch); + } + + freeComplexOnDevice(D_sinoFFT); + + return ok; +} + + +bool FDK(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + float fSrcOrigin, float fDetOrigin, + float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize, + const SDimensions3D& dims, const float* angles, bool bShortScan) +{ + bool ok; + // Generate filter + // TODO: Check errors + cufftComplex * D_filter; + int iPaddedDetCount = calcNextPowerOfTwo(2 * dims.iProjU); + int iHalfFFTSize = calcFFTFourSize(iPaddedDetCount); + + cufftComplex *pHostFilter = new cufftComplex[dims.iProjAngles * iHalfFFTSize]; + memset(pHostFilter, 0, sizeof(cufftComplex) * dims.iProjAngles * iHalfFFTSize); + + genFilter(FILTER_RAMLAK, 1.0f, dims.iProjAngles, pHostFilter, iPaddedDetCount, iHalfFFTSize); + + allocateComplexOnDevice(dims.iProjAngles, iHalfFFTSize, &D_filter); + uploadComplexArrayToDevice(dims.iProjAngles, iHalfFFTSize, pHostFilter, D_filter); + + delete [] pHostFilter; + + + // Perform filtering + + ok = FDK_Filter(D_projData, D_filter, fSrcOrigin, fDetOrigin, + fSrcZ, fDetZ, fDetUSize, fDetVSize, + bShortScan, dims, angles); + + // Clean up filter + freeComplexOnDevice(D_filter); + + + if (!ok) + return false; + + // Perform BP + + ok = FDK_BP(D_volumeData, D_projData, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, + fDetUSize, fDetVSize, dims, angles); + + if (!ok) + return false; + + return true; +} + + +} + +#ifdef STANDALONE +void dumpVolume(const char* filespec, const cudaPitchedPtr& data, const SDimensions3D& dims, float fMin, float fMax) +{ + float* buf = new float[dims.iVolX*dims.iVolY]; + unsigned int pitch = data.pitch / sizeof(float); + + for (int i = 0; i < dims.iVolZ; ++i) { + cudaMemcpy2D(buf, dims.iVolX*sizeof(float), ((float*)data.ptr)+pitch*dims.iVolY*i, data.pitch, dims.iVolX*sizeof(float), dims.iVolY, cudaMemcpyDeviceToHost); + + char fname[512]; + sprintf(fname, filespec, dims.iVolZ-i-1); + saveImage(fname, dims.iVolY, dims.iVolX, buf, fMin, fMax); + } +} + +void dumpSinograms(const char* filespec, const cudaPitchedPtr& data, const SDimensions3D& dims, float fMin, float fMax) +{ + float* bufs = new float[dims.iProjAngles*dims.iProjU]; + unsigned int pitch = data.pitch / sizeof(float); + + for (int i = 0; i < dims.iProjV; ++i) { + cudaMemcpy2D(bufs, dims.iProjU*sizeof(float), ((float*)data.ptr)+pitch*dims.iProjAngles*i, data.pitch, dims.iProjU*sizeof(float), dims.iProjAngles, cudaMemcpyDeviceToHost); + + char fname[512]; + sprintf(fname, filespec, i); + saveImage(fname, dims.iProjAngles, dims.iProjU, bufs, fMin, fMax); + } +} + +void dumpProjections(const char* filespec, const cudaPitchedPtr& data, const SDimensions3D& dims, float fMin, float fMax) +{ + float* bufp = new float[dims.iProjV*dims.iProjU]; + unsigned int pitch = data.pitch / sizeof(float); + + for (int i = 0; i < dims.iProjAngles; ++i) { + for (int j = 0; j < dims.iProjV; ++j) { + cudaMemcpy(bufp+dims.iProjU*j, ((float*)data.ptr)+pitch*dims.iProjAngles*j+pitch*i, dims.iProjU*sizeof(float), cudaMemcpyDeviceToHost); + } + + char fname[512]; + sprintf(fname, filespec, i); + saveImage(fname, dims.iProjV, dims.iProjU, bufp, fMin, fMax); + } +} + + + + +int main() +{ +#if 0 + SDimensions3D dims; + dims.iVolX = 512; + dims.iVolY = 512; + dims.iVolZ = 512; + dims.iProjAngles = 180; + dims.iProjU = 1024; + dims.iProjV = 1024; + dims.iRaysPerDet = 1; + + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPitchedPtr volData; // pitch, ptr, xsize, ysize + + cudaMalloc3D(&volData, extentV); + + cudaExtent extentP; + extentP.width = dims.iProjU*sizeof(float); + extentP.height = dims.iProjAngles; + extentP.depth = dims.iProjV; + + cudaPitchedPtr projData; // pitch, ptr, xsize, ysize + + cudaMalloc3D(&projData, extentP); + cudaMemset3D(projData, 0, extentP); + +#if 0 + float* slice = new float[256*256]; + cudaPitchedPtr ptr; + ptr.ptr = slice; + ptr.pitch = 256*sizeof(float); + ptr.xsize = 256*sizeof(float); + ptr.ysize = 256; + + for (unsigned int i = 0; i < 256*256; ++i) + slice[i] = 1.0f; + for (unsigned int i = 0; i < 256; ++i) { + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, i }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaMemcpy3D(&p); +#if 0 + if (i == 128) { + for (unsigned int j = 0; j < 256*256; ++j) + slice[j] = 0.0f; + } +#endif + } +#endif + + SConeProjection angle[180]; + angle[0].fSrcX = -1536; + angle[0].fSrcY = 0; + angle[0].fSrcZ = 0; + + angle[0].fDetSX = 1024; + angle[0].fDetSY = -512; + angle[0].fDetSZ = 512; + + angle[0].fDetUX = 0; + angle[0].fDetUY = 1; + angle[0].fDetUZ = 0; + + angle[0].fDetVX = 0; + angle[0].fDetVY = 0; + angle[0].fDetVZ = -1; + +#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0) + for (int i = 1; i < 180; ++i) { + angle[i] = angle[0]; + ROTATE0(Src, i, i*2*M_PI/180); + ROTATE0(DetS, i, i*2*M_PI/180); + ROTATE0(DetU, i, i*2*M_PI/180); + ROTATE0(DetV, i, i*2*M_PI/180); + } +#undef ROTATE0 + + astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f); + + //dumpSinograms("sino%03d.png", projData, dims, 0, 512); + //dumpProjections("proj%03d.png", projData, dims, 0, 512); + + astraCUDA3d::zeroVolumeData(volData, dims); + + float* angles = new float[dims.iProjAngles]; + for (int i = 0; i < 180; ++i) + angles[i] = i*2*M_PI/180; + + astraCUDA3d::FDK(volData, projData, 1536, 512, 0, 0, dims, angles); + + dumpVolume("vol%03d.png", volData, dims, -20, 100); + + +#else + + SDimensions3D dims; + dims.iVolX = 1000; + dims.iVolY = 999; + dims.iVolZ = 500; + dims.iProjAngles = 376; + dims.iProjU = 1024; + dims.iProjV = 524; + dims.iRaysPerDet = 1; + + float* angles = new float[dims.iProjAngles]; + for (int i = 0; i < dims.iProjAngles; ++i) + angles[i] = -i*(M_PI)/360; + + cudaPitchedPtr volData = astraCUDA3d::allocateVolumeData(dims); + cudaPitchedPtr projData = astraCUDA3d::allocateProjectionData(dims); + astraCUDA3d::zeroProjectionData(projData, dims); + astraCUDA3d::zeroVolumeData(volData, dims); + + timeval t; + tic(t); + + for (int i = 0; i < dims.iProjAngles; ++i) { + char fname[256]; + sprintf(fname, "/home/wpalenst/tmp/Elke/proj%04d.png", i); + unsigned int w,h; + float* bufp = loadImage(fname, w,h); + + int pitch = projData.pitch / sizeof(float); + for (int j = 0; j < dims.iProjV; ++j) { + cudaMemcpy(((float*)projData.ptr)+dims.iProjAngles*pitch*j+pitch*i, bufp+dims.iProjU*j, dims.iProjU*sizeof(float), cudaMemcpyHostToDevice); + } + + delete[] bufp; + } + printf("Load time: %f\n", toc(t)); + + //dumpSinograms("sino%03d.png", projData, dims, -8.0f, 256.0f); + //astraCUDA3d::FDK(volData, projData, 7350, 62355, 0, 10, dims, angles); + //astraCUDA3d::FDK(volData, projData, 7350, -380, 0, 10, dims, angles); + + tic(t); + + astraCUDA3d::FDK(volData, projData, 7383.29867, 0, 0, 10, dims, angles); + + printf("FDK time: %f\n", toc(t)); + tic(t); + + dumpVolume("vol%03d.png", volData, dims, -65.9f, 200.0f); + //dumpVolume("vol%03d.png", volData, dims, 0.0f, 256.0f); + printf("Save time: %f\n", toc(t)); + +#endif + + +} +#endif diff --git a/cuda/3d/fdk.h b/cuda/3d/fdk.h new file mode 100644 index 0000000..5443b19 --- /dev/null +++ b/cuda/3d/fdk.h @@ -0,0 +1,43 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_FDK_H +#define _CUDA_FDK_H + +namespace astraCUDA3d { + +bool FDK(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + float fSrcOrigin, float fDetOrigin, + float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize, + const SDimensions3D& dims, const float* angles, bool bShortScan); + + +} + +#endif diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu new file mode 100644 index 0000000..872b1eb --- /dev/null +++ b/cuda/3d/par3d_bp.cu @@ -0,0 +1,464 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> +#include "util3d.h" + +#ifdef STANDALONE +#include "par3d_fp.h" +#include "testutil.h" +#endif + +#include "dims3d.h" + +typedef texture<float, 3, cudaReadModeElementType> texture3D; + +static texture3D gT_par3DProjTexture; + +namespace astraCUDA3d { + +static const unsigned int g_volBlockZ = 16; + +static const unsigned int g_anglesPerBlock = 64; +static const unsigned int g_volBlockX = 32; +static const unsigned int g_volBlockY = 16; + +static const unsigned g_MaxAngles = 1024; + +__constant__ float gC_Cux[g_MaxAngles]; +__constant__ float gC_Cuy[g_MaxAngles]; +__constant__ float gC_Cuz[g_MaxAngles]; +__constant__ float gC_Cuc[g_MaxAngles]; +__constant__ float gC_Cvx[g_MaxAngles]; +__constant__ float gC_Cvy[g_MaxAngles]; +__constant__ float gC_Cvz[g_MaxAngles]; +__constant__ float gC_Cvc[g_MaxAngles]; + + +static bool bindProjDataTexture(const cudaArray* array) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + + gT_par3DProjTexture.addressMode[0] = cudaAddressModeClamp; + gT_par3DProjTexture.addressMode[1] = cudaAddressModeClamp; + gT_par3DProjTexture.addressMode[2] = cudaAddressModeClamp; + gT_par3DProjTexture.filterMode = cudaFilterModeLinear; + gT_par3DProjTexture.normalized = false; + + cudaBindTextureToArray(gT_par3DProjTexture, array, channelDesc); + + // TODO: error value? + + return true; +} + + +__global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, int startAngle, const SDimensions3D dims) +{ + float* volData = (float*)D_volData; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + + // threadIdx: x = rel x + // y = rel y + + // blockIdx: x = x + y + // y = z + + + // TO TRY: precompute part of detector intersection formulas in shared mem? + // TO TRY: inner loop over z, gather ray values in shared mem + + const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; + const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + + if (X >= dims.iVolX) + return; + if (Y >= dims.iVolY) + return; + + const int startZ = blockIdx.y * g_volBlockZ; + int endZ = startZ + g_volBlockZ; + if (endZ > dims.iVolZ) + endZ = dims.iVolZ; + + float fX = X - 0.5f*dims.iVolX + 0.5f; + float fY = Y - 0.5f*dims.iVolY + 0.5f; + float fZ = startZ - 0.5f*dims.iVolZ + 0.5f; + + for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f) + { + + float fVal = 0.0f; + float fAngle = startAngle + 0.5f; + + for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) + { + + const float fCux = gC_Cux[angle]; + const float fCuy = gC_Cuy[angle]; + const float fCuz = gC_Cuz[angle]; + const float fCuc = gC_Cuc[angle]; + const float fCvx = gC_Cvx[angle]; + const float fCvy = gC_Cvy[angle]; + const float fCvz = gC_Cvz[angle]; + const float fCvc = gC_Cvc[angle]; + + const float fUNum = fCuc + fX * fCux + fY * fCuy + fZ * fCuz; + const float fVNum = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz; + + const float fU = fUNum + 1.0f; + const float fV = fVNum + 1.0f; + + fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order + + } + + volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal; + } + +} + +// supersampling version +__global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, int startAngle, const SDimensions3D dims) +{ + float* volData = (float*)D_volData; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + + // threadIdx: x = rel x + // y = rel y + + // blockIdx: x = x + y + // y = z + + + // TO TRY: precompute part of detector intersection formulas in shared mem? + // TO TRY: inner loop over z, gather ray values in shared mem + + const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; + const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + + if (X >= dims.iVolX) + return; + if (Y >= dims.iVolY) + return; + + const int startZ = blockIdx.y * g_volBlockZ; + int endZ = startZ + g_volBlockZ; + if (endZ > dims.iVolZ) + endZ = dims.iVolZ; + + float fX = X - 0.5f*dims.iVolX + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim; + float fY = Y - 0.5f*dims.iVolY + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim; + float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim; + + const float fSubStep = 1.0f/dims.iRaysPerVoxelDim; + + for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f) + { + + float fVal = 0.0f; + float fAngle = startAngle + 0.5f; + + for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) + { + const float fCux = gC_Cux[angle]; + const float fCuy = gC_Cuy[angle]; + const float fCuz = gC_Cuz[angle]; + const float fCuc = gC_Cuc[angle]; + const float fCvx = gC_Cvx[angle]; + const float fCvy = gC_Cvy[angle]; + const float fCvz = gC_Cvz[angle]; + const float fCvc = gC_Cvc[angle]; + + float fXs = fX; + for (int iSubX = 0; iSubX < dims.iRaysPerVoxelDim; ++iSubX) { + float fYs = fY; + for (int iSubY = 0; iSubY < dims.iRaysPerVoxelDim; ++iSubY) { + float fZs = fZ; + for (int iSubZ = 0; iSubZ < dims.iRaysPerVoxelDim; ++iSubZ) { + + const float fUNum = fCuc + fXs * fCux + fYs * fCuy + fZs * fCuz; + const float fVNum = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz; + + const float fU = fUNum + 1.0f; + const float fV = fVNum + 1.0f; + + fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order + fZs += fSubStep; + } + fYs += fSubStep; + } + fXs += fSubStep; + } + + } + + volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal / (dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim); + } + +} + +bool Par3DBP_Array(cudaPitchedPtr D_volumeData, + cudaArray *D_projArray, + const SDimensions3D& dims, const SPar3DProjection* angles) +{ + bindProjDataTexture(D_projArray); + + + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(expr,name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = (expr) ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + +#define DENOM (angles[i].fRayX*angles[i].fDetUY*angles[i].fDetVZ - angles[i].fRayX*angles[i].fDetUZ*angles[i].fDetVY - angles[i].fRayY*angles[i].fDetUX*angles[i].fDetVZ + angles[i].fRayY*angles[i].fDetUZ*angles[i].fDetVX + angles[i].fRayZ*angles[i].fDetUX*angles[i].fDetVY - angles[i].fRayZ*angles[i].fDetUY*angles[i].fDetVX) + + + TRANSFER_TO_CONSTANT( ( - (angles[i].fRayY*angles[i].fDetVZ - angles[i].fRayZ*angles[i].fDetVY)) / DENOM , Cux ); + TRANSFER_TO_CONSTANT( ( (angles[i].fRayX*angles[i].fDetVZ - angles[i].fRayZ*angles[i].fDetVX)) / DENOM , Cuy ); + TRANSFER_TO_CONSTANT( (- (angles[i].fRayX*angles[i].fDetVY - angles[i].fRayY*angles[i].fDetVX) ) / DENOM , Cuz ); + TRANSFER_TO_CONSTANT( (-(angles[i].fDetSY*angles[i].fDetVZ - angles[i].fDetSZ*angles[i].fDetVY)*angles[i].fRayX + (angles[i].fRayY*angles[i].fDetVZ - angles[i].fRayZ*angles[i].fDetVY)*angles[i].fDetSX - (angles[i].fRayY*angles[i].fDetSZ - angles[i].fRayZ*angles[i].fDetSY)*angles[i].fDetVX) / DENOM , Cuc ); + + TRANSFER_TO_CONSTANT( ((angles[i].fRayY*angles[i].fDetUZ - angles[i].fRayZ*angles[i].fDetUY) ) / DENOM , Cvx ); + TRANSFER_TO_CONSTANT( (- (angles[i].fRayX*angles[i].fDetUZ - angles[i].fRayZ*angles[i].fDetUX) ) / DENOM , Cvy ); + TRANSFER_TO_CONSTANT( ((angles[i].fRayX*angles[i].fDetUY - angles[i].fRayY*angles[i].fDetUX) ) / DENOM , Cvz ); + TRANSFER_TO_CONSTANT( ((angles[i].fDetSY*angles[i].fDetUZ - angles[i].fDetSZ*angles[i].fDetUY)*angles[i].fRayX - (angles[i].fRayY*angles[i].fDetUZ - angles[i].fRayZ*angles[i].fDetUY)*angles[i].fDetSX + (angles[i].fRayY*angles[i].fDetSZ - angles[i].fRayZ*angles[i].fDetSY)*angles[i].fDetUX ) / DENOM , Cvc ); + +#undef TRANSFER_TO_CONSTANT +#undef DENOM + + delete[] tmp; + + dim3 dimBlock(g_volBlockX, g_volBlockY); + + dim3 dimGrid(((dims.iVolX+g_volBlockX-1)/g_volBlockX)*((dims.iVolY+g_volBlockY-1)/g_volBlockY), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ); + + // timeval t; + // tic(t); + + for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) { + // printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr); + if (dims.iRaysPerVoxelDim == 1) + dev_par3D_BP<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, dims); + else + dev_par3D_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, dims); + } + + cudaTextForceKernelsCompletion(); + + // printf("%f\n", toc(t)); + + return true; +} + +bool Par3DBP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles) +{ + // transfer projections to array + + cudaArray* cuArray = allocateProjectionArray(dims); + transferProjectionsToArray(D_projData, cuArray, dims); + + bool ret = Par3DBP_Array(D_volumeData, cuArray, dims, angles); + + cudaFreeArray(cuArray); + + return ret; +} + + +} + +#ifdef STANDALONE +int main() +{ + SDimensions3D dims; + dims.iVolX = 256; + dims.iVolY = 256; + dims.iVolZ = 256; + dims.iProjAngles = 180; + dims.iProjU = 512; + dims.iProjV = 512; + dims.iRaysPerDet = 1; + + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPitchedPtr volData; // pitch, ptr, xsize, ysize + + cudaMalloc3D(&volData, extentV); + + cudaExtent extentP; + extentP.width = dims.iProjU*sizeof(float); + extentP.height = dims.iProjAngles; + extentP.depth = dims.iProjV; + + cudaPitchedPtr projData; // pitch, ptr, xsize, ysize + + cudaMalloc3D(&projData, extentP); + cudaMemset3D(projData, 0, extentP); + + float* slice = new float[256*256]; + cudaPitchedPtr ptr; + ptr.ptr = slice; + ptr.pitch = 256*sizeof(float); + ptr.xsize = 256*sizeof(float); + ptr.ysize = 256; + + for (unsigned int i = 0; i < 256*256; ++i) + slice[i] = 1.0f; + for (unsigned int i = 0; i < 256; ++i) { + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, i }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaMemcpy3D(&p); +#if 0 + if (i == 128) { + for (unsigned int j = 0; j < 256*256; ++j) + slice[j] = 0.0f; + } +#endif + } + + + SPar3DProjection angle[180]; + angle[0].fRayX = 1; + angle[0].fRayY = 0; + angle[0].fRayZ = 0; + + angle[0].fDetSX = 512; + angle[0].fDetSY = -256; + angle[0].fDetSZ = -256; + + angle[0].fDetUX = 0; + angle[0].fDetUY = 1; + angle[0].fDetUZ = 0; + + angle[0].fDetVX = 0; + angle[0].fDetVY = 0; + angle[0].fDetVZ = 1; + +#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0) + for (int i = 1; i < 180; ++i) { + angle[i] = angle[0]; + ROTATE0(Ray, i, i*2*M_PI/180); + ROTATE0(DetS, i, i*2*M_PI/180); + ROTATE0(DetU, i, i*2*M_PI/180); + ROTATE0(DetV, i, i*2*M_PI/180); + } +#undef ROTATE0 + + astraCUDA3d::Par3DFP(volData, projData, dims, angle, 1.0f); +#if 1 + float* bufs = new float[180*512]; + + for (int i = 0; i < 512; ++i) { + cudaMemcpy(bufs, ((float*)projData.ptr)+180*512*i, 180*512*sizeof(float), cudaMemcpyDeviceToHost); + + printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize); + + char fname[20]; + sprintf(fname, "sino%03d.png", i); + saveImage(fname, 180, 512, bufs, 0, 512); + } + + float* bufp = new float[512*512]; + + for (int i = 0; i < 180; ++i) { + for (int j = 0; j < 512; ++j) { + cudaMemcpy(bufp+512*j, ((float*)projData.ptr)+180*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost); + } + + char fname[20]; + sprintf(fname, "proj%03d.png", i); + saveImage(fname, 512, 512, bufp, 0, 512); + } +#endif + for (unsigned int i = 0; i < 256*256; ++i) + slice[i] = 0.0f; + for (unsigned int i = 0; i < 256; ++i) { + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, i }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaMemcpy3D(&p); + } + + astraCUDA3d::Par3DBP(volData, projData, dims, angle); +#if 1 + float* buf = new float[256*256]; + + for (int i = 0; i < 256; ++i) { + cudaMemcpy(buf, ((float*)volData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost); + + printf("%d %d %d\n", volData.pitch, volData.xsize, volData.ysize); + + char fname[20]; + sprintf(fname, "vol%03d.png", i); + saveImage(fname, 256, 256, buf, 0, 60000); + } +#endif + +} +#endif diff --git a/cuda/3d/par3d_bp.h b/cuda/3d/par3d_bp.h new file mode 100644 index 0000000..399a3cb --- /dev/null +++ b/cuda/3d/par3d_bp.h @@ -0,0 +1,45 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_PAR3D_BP_H +#define _CUDA_PAR3D_BP_H + +namespace astraCUDA3d { + +_AstraExport bool Par3DBP_Array(cudaPitchedPtr D_volumeData, + cudaArray *D_projArray, + const SDimensions3D& dims, const SPar3DProjection* angles); + +_AstraExport bool Par3DBP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles); + + +} + +#endif diff --git a/cuda/3d/par3d_fp.cu b/cuda/3d/par3d_fp.cu new file mode 100644 index 0000000..6bf9037 --- /dev/null +++ b/cuda/3d/par3d_fp.cu @@ -0,0 +1,814 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> +#include "util3d.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +#include "dims3d.h" + +typedef texture<float, 3, cudaReadModeElementType> texture3D; + +static texture3D gT_par3DVolumeTexture; + +namespace astraCUDA3d { + +static const unsigned int g_anglesPerBlock = 4; + +// thickness of the slices we're splitting the volume up into +static const unsigned int g_blockSlices = 64; +static const unsigned int g_detBlockU = 32; +static const unsigned int g_detBlockV = 32; + +static const unsigned g_MaxAngles = 1024; +__constant__ float gC_RayX[g_MaxAngles]; +__constant__ float gC_RayY[g_MaxAngles]; +__constant__ float gC_RayZ[g_MaxAngles]; +__constant__ float gC_DetSX[g_MaxAngles]; +__constant__ float gC_DetSY[g_MaxAngles]; +__constant__ float gC_DetSZ[g_MaxAngles]; +__constant__ float gC_DetUX[g_MaxAngles]; +__constant__ float gC_DetUY[g_MaxAngles]; +__constant__ float gC_DetUZ[g_MaxAngles]; +__constant__ float gC_DetVX[g_MaxAngles]; +__constant__ float gC_DetVY[g_MaxAngles]; +__constant__ float gC_DetVZ[g_MaxAngles]; + + +static bool bindVolumeDataTexture(const cudaArray* array) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + + gT_par3DVolumeTexture.addressMode[0] = cudaAddressModeClamp; + gT_par3DVolumeTexture.addressMode[1] = cudaAddressModeClamp; + gT_par3DVolumeTexture.addressMode[2] = cudaAddressModeClamp; + gT_par3DVolumeTexture.filterMode = cudaFilterModeLinear; + gT_par3DVolumeTexture.normalized = false; + + cudaBindTextureToArray(gT_par3DVolumeTexture, array, channelDesc); + + // TODO: error value? + + return true; +} + + + +// threadIdx: x = u detector +// y = relative angle +// blockIdx: x = u/v detector +// y = angle block + +#define PAR3D_FP_BODY(c0,c1,c2) \ + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \ + if (angle >= endAngle) \ + return; \ + \ + const float fRayX = gC_RayX[angle]; \ + const float fRayY = gC_RayY[angle]; \ + const float fRayZ = gC_RayZ[angle]; \ + const float fDetUX = gC_DetUX[angle]; \ + const float fDetUY = gC_DetUY[angle]; \ + const float fDetUZ = gC_DetUZ[angle]; \ + const float fDetVX = gC_DetVX[angle]; \ + const float fDetVY = gC_DetVY[angle]; \ + const float fDetVZ = gC_DetVZ[angle]; \ + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \ + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \ + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \ + \ + \ + \ + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \ + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \ + int endDetectorV = startDetectorV + g_detBlockV; \ + if (endDetectorV > dims.iProjV) \ + endDetectorV = dims.iProjV; \ + \ + int endSlice = startSlice + g_blockSlices; \ + if (endSlice > dims.iVol##c0) \ + endSlice = dims.iVol##c0; \ + \ + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \ + { \ + /* Trace ray in direction Ray to (detectorU,detectorV) from */ \ + /* X = startSlice to X = endSlice */ \ + \ + const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; \ + const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; \ + const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; \ + \ + /* (x) ( 1) ( 0) */ \ + /* ray: (y) = (ay) * x + (by) */ \ + /* (z) (az) (bz) */ \ + \ + const float a##c1 = fRay##c1 / fRay##c0; \ + const float a##c2 = fRay##c2 / fRay##c0; \ + const float b##c1 = fDet##c1 - a##c1 * fDet##c0; \ + const float b##c2 = fDet##c2 - a##c2 * fDet##c0; \ + \ + const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \ + \ + float fVal = 0.0f; \ + \ + float f##c0 = startSlice + 1.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\ + \ + for (int s = startSlice; s < endSlice; ++s) \ + { \ + fVal += tex3D(gT_par3DVolumeTexture, fX, fY, fZ); \ + f##c0 += 1.0f; \ + f##c1 += a##c1; \ + f##c2 += a##c2; \ + } \ + \ + fVal *= fDistCorr; \ + \ + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; \ + } + + + +// Supersampling version +#define PAR3D_FP_SS_BODY(c0,c1,c2) \ + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \ + if (angle >= endAngle) \ + return; \ + \ + const float fRayX = gC_RayX[angle]; \ + const float fRayY = gC_RayY[angle]; \ + const float fRayZ = gC_RayZ[angle]; \ + const float fDetUX = gC_DetUX[angle]; \ + const float fDetUY = gC_DetUY[angle]; \ + const float fDetUZ = gC_DetUZ[angle]; \ + const float fDetVX = gC_DetVX[angle]; \ + const float fDetVY = gC_DetVY[angle]; \ + const float fDetVZ = gC_DetVZ[angle]; \ + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \ + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \ + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \ + \ + \ + \ + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \ + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \ + int endDetectorV = startDetectorV + g_detBlockV; \ + if (endDetectorV > dims.iProjV) \ + endDetectorV = dims.iProjV; \ + \ + int endSlice = startSlice + g_blockSlices; \ + if (endSlice > dims.iVol##c0) \ + endSlice = dims.iVol##c0; \ + \ + const float fSubStep = 1.0f/dims.iRaysPerDetDim; \ + \ + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \ + { \ + \ + float fV = 0.0f; \ + \ + float fdU = detectorU - 0.5f + 0.5f*fSubStep; \ + for (int iSubU = 0; iSubU < dims.iRaysPerDetDim; ++iSubU, fdU+=fSubStep) { \ + float fdV = detectorV - 0.5f + 0.5f*fSubStep; \ + for (int iSubV = 0; iSubV < dims.iRaysPerDetDim; ++iSubV, fdV+=fSubStep) { \ + \ + /* Trace ray in direction Ray to (detectorU,detectorV) from */ \ + /* X = startSlice to X = endSlice */ \ + \ + const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX; \ + const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY; \ + const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ; \ + \ + /* (x) ( 1) ( 0) */ \ + /* ray: (y) = (ay) * x + (by) */ \ + /* (z) (az) (bz) */ \ + \ + const float a##c1 = fRay##c1 / fRay##c0; \ + const float a##c2 = fRay##c2 / fRay##c0; \ + const float b##c1 = fDet##c1 - a##c1 * fDet##c0; \ + const float b##c2 = fDet##c2 - a##c2 * fDet##c0; \ + \ + const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \ + \ + float fVal = 0.0f; \ + \ + float f##c0 = startSlice + 1.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\ + \ + for (int s = startSlice; s < endSlice; ++s) \ + { \ + fVal += tex3D(gT_par3DVolumeTexture, fX, fY, fZ); \ + f##c0 += 1.0f; \ + f##c1 += a##c1; \ + f##c2 += a##c2; \ + } \ + \ + fVal *= fDistCorr; \ + fV += fVal; \ + \ + } \ + } \ + \ + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (dims.iRaysPerDetDim * dims.iRaysPerDetDim);\ + } + + + +__global__ void par3D_FP_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_BODY(X,Y,Z) +} + +__global__ void par3D_FP_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_BODY(Y,X,Z) +} + +__global__ void par3D_FP_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_BODY(Z,X,Y) +} + +__global__ void par3D_FP_SS_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SS_BODY(X,Y,Z) +} + +__global__ void par3D_FP_SS_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SS_BODY(Y,X,Z) +} + +__global__ void par3D_FP_SS_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SS_BODY(Z,X,Y) +} + + +__device__ float dirWeights(float fX, float fN) { + if (fX <= 0.5f) // outside image on left + return 0.0f; + if (fX <= 1.5f) // half outside image on left + return (fX - 0.5f) * (fX - 0.5f); + if (fX <= fN + 0.5f) { // inside image + float t = fX - 0.5f - floorf(fX - 0.5f); + return t*t + (1-t)*(1-t); + } + if (fX <= fN + 1.5f) // half outside image on right + return (fN + 1.5f - fX) * (fN + 1.5f - fX); + return 0.0f; // outside image on right +} + +#define PAR3D_FP_SUMSQW_BODY(c0,c1,c2) \ + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \ + if (angle >= endAngle) \ + return; \ + \ + const float fRayX = gC_RayX[angle]; \ + const float fRayY = gC_RayY[angle]; \ + const float fRayZ = gC_RayZ[angle]; \ + const float fDetUX = gC_DetUX[angle]; \ + const float fDetUY = gC_DetUY[angle]; \ + const float fDetUZ = gC_DetUZ[angle]; \ + const float fDetVX = gC_DetVX[angle]; \ + const float fDetVY = gC_DetVY[angle]; \ + const float fDetVZ = gC_DetVZ[angle]; \ + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \ + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \ + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \ + \ + \ + \ + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \ + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \ + int endDetectorV = startDetectorV + g_detBlockV; \ + if (endDetectorV > dims.iProjV) \ + endDetectorV = dims.iProjV; \ + \ + int endSlice = startSlice + g_blockSlices; \ + if (endSlice > dims.iVol##c0) \ + endSlice = dims.iVol##c0; \ + \ + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \ + { \ + /* Trace ray in direction Ray to (detectorU,detectorV) from */ \ + /* X = startSlice to X = endSlice */ \ + \ + const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; \ + const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; \ + const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; \ + \ + /* (x) ( 1) ( 0) */ \ + /* ray: (y) = (ay) * x + (by) */ \ + /* (z) (az) (bz) */ \ + \ + const float a##c1 = fRay##c1 / fRay##c0; \ + const float a##c2 = fRay##c2 / fRay##c0; \ + const float b##c1 = fDet##c1 - a##c1 * fDet##c0; \ + const float b##c2 = fDet##c2 - a##c2 * fDet##c0; \ + \ + const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \ + \ + float fVal = 0.0f; \ + \ + float f##c0 = startSlice + 1.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\ + \ + for (int s = startSlice; s < endSlice; ++s) \ + { \ + fVal += dirWeights(f##c1, dims.iVol##c1) * dirWeights(f##c2, dims.iVol##c2) * fDistCorr * fDistCorr; \ + f##c0 += 1.0f; \ + f##c1 += a##c1; \ + f##c2 += a##c2; \ + } \ + \ + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; \ + } + +// Supersampling version +// TODO + + +__global__ void par3D_FP_SumSqW_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SUMSQW_BODY(X,Y,Z) +} + +__global__ void par3D_FP_SumSqW_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SUMSQW_BODY(Y,X,Z) +} + +__global__ void par3D_FP_SumSqW_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SUMSQW_BODY(Z,X,Y) +} + + + +bool Par3DFP_Array(cudaArray *D_volArray, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + float fOutputScale) +{ + + bindVolumeDataTexture(D_volArray); + + + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(RayX); + TRANSFER_TO_CONSTANT(RayY); + TRANSFER_TO_CONSTANT(RayZ); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetSZ); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + TRANSFER_TO_CONSTANT(DetUZ); + TRANSFER_TO_CONSTANT(DetVX); + TRANSFER_TO_CONSTANT(DetVY); + TRANSFER_TO_CONSTANT(DetVZ); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + std::list<cudaStream_t> streams; + dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles + + // Run over all angles, grouping them into groups of the same + // orientation (roughly horizontal vs. roughly vertical). + // Start a stream of grids for each such group. + + unsigned int blockStart = 0; + unsigned int blockEnd = 0; + int blockDirection = 0; + + // timeval t; + // tic(t); + + for (unsigned int a = 0; a <= dims.iProjAngles; ++a) { + int dir; + if (a != dims.iProjAngles) { + float dX = fabsf(angles[a].fRayX); + float dY = fabsf(angles[a].fRayY); + float dZ = fabsf(angles[a].fRayZ); + + if (dX >= dY && dX >= dZ) + dir = 0; + else if (dY >= dX && dY >= dZ) + dir = 1; + else + dir = 2; + } + + if (a == dims.iProjAngles || dir != blockDirection) { + // block done + + blockEnd = a; + if (blockStart != blockEnd) { + + dim3 dimGrid( + ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV), +(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock); + // TODO: check if we can't immediately + // destroy the stream after use + cudaStream_t stream; + cudaStreamCreate(&stream); + streams.push_back(stream); + + // printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y); + + if (blockDirection == 0) { + for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else + par3D_FP_SS_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + } else if (blockDirection == 1) { + for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else + par3D_FP_SS_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + } else if (blockDirection == 2) { + for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else + par3D_FP_SS_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + } + + } + + blockDirection = dir; + blockStart = a; + } + } + + for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) + cudaStreamDestroy(*iter); + + streams.clear(); + + cudaTextForceKernelsCompletion(); + + + // printf("%f\n", toc(t)); + + return true; +} + +bool Par3DFP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + float fOutputScale) +{ + // transfer volume to array + cudaArray* cuArray = allocateVolumeArray(dims); + transferVolumeToArray(D_volumeData, cuArray, dims); + + bool ret = Par3DFP_Array(cuArray, D_projData, dims, angles, fOutputScale); + + cudaFreeArray(cuArray); + + return ret; +} + + + +bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + float fOutputScale) +{ + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(RayX); + TRANSFER_TO_CONSTANT(RayY); + TRANSFER_TO_CONSTANT(RayZ); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetSZ); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + TRANSFER_TO_CONSTANT(DetUZ); + TRANSFER_TO_CONSTANT(DetVX); + TRANSFER_TO_CONSTANT(DetVY); + TRANSFER_TO_CONSTANT(DetVZ); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + std::list<cudaStream_t> streams; + dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles + + // Run over all angles, grouping them into groups of the same + // orientation (roughly horizontal vs. roughly vertical). + // Start a stream of grids for each such group. + + unsigned int blockStart = 0; + unsigned int blockEnd = 0; + int blockDirection = 0; + + // timeval t; + // tic(t); + + for (unsigned int a = 0; a <= dims.iProjAngles; ++a) { + int dir; + if (a != dims.iProjAngles) { + float dX = fabsf(angles[a].fRayX); + float dY = fabsf(angles[a].fRayY); + float dZ = fabsf(angles[a].fRayZ); + + if (dX >= dY && dX >= dZ) + dir = 0; + else if (dY >= dX && dY >= dZ) + dir = 1; + else + dir = 2; + } + + if (a == dims.iProjAngles || dir != blockDirection) { + // block done + + blockEnd = a; + if (blockStart != blockEnd) { + + dim3 dimGrid( + ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV), +(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock); + // TODO: check if we can't immediately + // destroy the stream after use + cudaStream_t stream; + cudaStreamCreate(&stream); + streams.push_back(stream); + + // printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y); + + if (blockDirection == 0) { + for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_SumSqW_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else +#if 0 + par3D_FP_SS_SumSqW_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); +#else + assert(false); +#endif + } else if (blockDirection == 1) { + for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_SumSqW_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else +#if 0 + par3D_FP_SS_SumSqW_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); +#else + assert(false); +#endif + } else if (blockDirection == 2) { + for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_SumSqW_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else +#if 0 + par3D_FP_SS_SumSqW_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); +#else + assert(false); +#endif + } + + } + + blockDirection = dir; + blockStart = a; + } + } + + for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) + cudaStreamDestroy(*iter); + + streams.clear(); + + cudaTextForceKernelsCompletion(); + + + // printf("%f\n", toc(t)); + + return true; +} + + + + + + + +} + +#ifdef STANDALONE + +using namespace astraCUDA3d; + +int main() +{ + cudaSetDevice(1); + + + SDimensions3D dims; + dims.iVolX = 500; + dims.iVolY = 500; + dims.iVolZ = 81; + dims.iProjAngles = 241; + dims.iProjU = 600; + dims.iProjV = 100; + dims.iRaysPerDet = 1; + + SPar3DProjection base; + base.fRayX = 1.0f; + base.fRayY = 0.0f; + base.fRayZ = 0.1f; + + base.fDetSX = 0.0f; + base.fDetSY = -300.0f; + base.fDetSZ = -50.0f; + + base.fDetUX = 0.0f; + base.fDetUY = 1.0f; + base.fDetUZ = 0.0f; + + base.fDetVX = 0.0f; + base.fDetVY = 0.0f; + base.fDetVZ = 1.0f; + + SPar3DProjection angle[dims.iProjAngles]; + + cudaPitchedPtr volData; // pitch, ptr, xsize, ysize + + volData = allocateVolumeData(dims); + + cudaPitchedPtr projData; // pitch, ptr, xsize, ysize + + projData = allocateProjectionData(dims); + + unsigned int ix = 500,iy = 500; + + float* buf = new float[dims.iProjU*dims.iProjV]; + + float* slice = new float[dims.iVolX*dims.iVolY]; + for (int i = 0; i < dims.iVolX*dims.iVolY; ++i) + slice[i] = 1.0f; + + for (unsigned int a = 0; a < 241; a += dims.iProjAngles) { + + zeroProjectionData(projData, dims); + + for (int y = 0; y < iy; y += dims.iVolY) { + for (int x = 0; x < ix; x += dims.iVolX) { + + timeval st; + tic(st); + + for (int z = 0; z < dims.iVolZ; ++z) { +// char sfn[256]; +// sprintf(sfn, "/home/wpalenst/projects/cone_simulation/phantom_4096/mouse_fem_phantom_%04d.png", 30+z); +// float* slice = loadSubImage(sfn, x, y, dims.iVolX, dims.iVolY); + + cudaPitchedPtr ptr; + ptr.ptr = slice; + ptr.pitch = dims.iVolX*sizeof(float); + ptr.xsize = dims.iVolX*sizeof(float); + ptr.ysize = dims.iVolY; + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, z }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaError err = cudaMemcpy3D(&p); + assert(!err); +// delete[] slice; + } + + printf("Load: %f\n", toc(st)); + +#if 0 + + cudaPos zp = { 0, 0, 0 }; + + cudaPitchedPtr t; + t.ptr = new float[1024*1024]; + t.pitch = 1024*4; + t.xsize = 1024*4; + t.ysize = 1024; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = volData; + p.extent = extentS; + p.dstArray = 0; + p.dstPtr = t; + p.dstPos = zp; + p.kind = cudaMemcpyDeviceToHost; + cudaError err = cudaMemcpy3D(&p); + assert(!err); + + char fn[32]; + sprintf(fn, "t%d%d.png", x / dims.iVolX, y / dims.iVolY); + saveImage(fn, 1024, 1024, (float*)t.ptr); + saveImage("s.png", 4096, 4096, slice); + delete[] (float*)t.ptr; +#endif + + +#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = base.f##name##X * cos(alpha) - base.f##name##Y * sin(alpha); angle[i].f##name##Y = base.f##name##X * sin(alpha) + base.f##name##Y * cos(alpha); angle[i].f##name##Z = base.f##name##Z; } while(0) +#define SHIFT(name,i,x,y) do { angle[i].f##name##X += x; angle[i].f##name##Y += y; } while(0) + for (int i = 0; i < dims.iProjAngles; ++i) { + ROTATE0(Ray, i, (a+i)*.8*M_PI/180); + ROTATE0(DetS, i, (a+i)*.8*M_PI/180); + ROTATE0(DetU, i, (a+i)*.8*M_PI/180); + ROTATE0(DetV, i, (a+i)*.8*M_PI/180); + + +// SHIFT(Src, i, (-x+1536), (-y+1536)); +// SHIFT(DetS, i, (-x+1536), (-y+1536)); + } +#undef ROTATE0 +#undef SHIFT + tic(st); + + astraCUDA3d::Par3DFP(volData, projData, dims, angle, 1.0f); + + printf("FP: %f\n", toc(st)); + + } + } + for (unsigned int aa = 0; aa < dims.iProjAngles; ++aa) { + for (unsigned int v = 0; v < dims.iProjV; ++v) + cudaMemcpy(buf+v*dims.iProjU, ((float*)projData.ptr)+(v*dims.iProjAngles+aa)*(projData.pitch/sizeof(float)), dims.iProjU*sizeof(float), cudaMemcpyDeviceToHost); + + char fname[32]; + sprintf(fname, "proj%03d.png", a+aa); + saveImage(fname, dims.iProjV, dims.iProjU, buf, 0.0f, 1000.0f); + } + } + + delete[] buf; + +} +#endif diff --git a/cuda/3d/par3d_fp.h b/cuda/3d/par3d_fp.h new file mode 100644 index 0000000..7208361 --- /dev/null +++ b/cuda/3d/par3d_fp.h @@ -0,0 +1,51 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_PAR3D_FP_H +#define _CUDA_PAR3D_FP_H + +namespace astraCUDA3d { + +_AstraExport bool Par3DFP_Array(cudaArray *D_volArray, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + float fOutputScale); + +_AstraExport bool Par3DFP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + float fOutputScale); + +_AstraExport bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + float fOutputScale); + +} + +#endif diff --git a/cuda/3d/sirt3d.cu b/cuda/3d/sirt3d.cu new file mode 100644 index 0000000..f615204 --- /dev/null +++ b/cuda/3d/sirt3d.cu @@ -0,0 +1,533 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> + +#include "sirt3d.h" +#include "util3d.h" +#include "arith3d.h" +#include "cone_fp.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +namespace astraCUDA3d { + +SIRT::SIRT() : ReconAlgo3D() +{ + D_maskData.ptr = 0; + D_smaskData.ptr = 0; + + D_sinoData.ptr = 0; + D_volumeData.ptr = 0; + + D_projData.ptr = 0; + D_tmpData.ptr = 0; + + D_lineWeight.ptr = 0; + D_pixelWeight.ptr = 0; + + useVolumeMask = false; + useSinogramMask = false; + + useMinConstraint = false; + useMaxConstraint = false; +} + + +SIRT::~SIRT() +{ + reset(); +} + +void SIRT::reset() +{ + cudaFree(D_projData.ptr); + cudaFree(D_tmpData.ptr); + cudaFree(D_lineWeight.ptr); + cudaFree(D_pixelWeight.ptr); + + D_maskData.ptr = 0; + D_smaskData.ptr = 0; + + D_sinoData.ptr = 0; + D_volumeData.ptr = 0; + + D_projData.ptr = 0; + D_tmpData.ptr = 0; + + D_lineWeight.ptr = 0; + D_pixelWeight.ptr = 0; + + useVolumeMask = false; + useSinogramMask = false; + + ReconAlgo3D::reset(); +} + +bool SIRT::enableVolumeMask() +{ + useVolumeMask = true; + return true; +} + +bool SIRT::enableSinogramMask() +{ + useSinogramMask = true; + return true; +} + + +bool SIRT::init() +{ + D_pixelWeight = allocateVolumeData(dims); + zeroVolumeData(D_pixelWeight, dims); + + D_tmpData = allocateVolumeData(dims); + zeroVolumeData(D_tmpData, dims); + + D_projData = allocateProjectionData(dims); + zeroProjectionData(D_projData, dims); + + D_lineWeight = allocateProjectionData(dims); + zeroProjectionData(D_lineWeight, dims); + + // We can't precompute lineWeights and pixelWeights when using a mask + if (!useVolumeMask && !useSinogramMask) + precomputeWeights(); + + // TODO: check if allocations succeeded + return true; +} + +bool SIRT::setMinConstraint(float fMin) +{ + fMinConstraint = fMin; + useMinConstraint = true; + return true; +} + +bool SIRT::setMaxConstraint(float fMax) +{ + fMaxConstraint = fMax; + useMaxConstraint = true; + return true; +} + +bool SIRT::precomputeWeights() +{ + zeroProjectionData(D_lineWeight, dims); + if (useVolumeMask) { + callFP(D_maskData, D_lineWeight, 1.0f); + } else { + processVol3D<opSet>(D_tmpData, 1.0f, dims); + callFP(D_tmpData, D_lineWeight, 1.0f); + } + processSino3D<opInvert>(D_lineWeight, dims); + + if (useSinogramMask) { + // scale line weights with sinogram mask to zero out masked sinogram pixels + processSino3D<opMul>(D_lineWeight, D_smaskData, dims); + } + + zeroVolumeData(D_pixelWeight, dims); + + if (useSinogramMask) { + callBP(D_pixelWeight, D_smaskData); + } else { + processSino3D<opSet>(D_projData, 1.0f, dims); + callBP(D_pixelWeight, D_projData); + } +#if 0 + float* bufp = new float[512*512]; + + for (int i = 0; i < 180; ++i) { + for (int j = 0; j < 512; ++j) { + cudaMemcpy(bufp+512*j, ((float*)D_projData.ptr)+180*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost); + } + + char fname[20]; + sprintf(fname, "ray%03d.png", i); + saveImage(fname, 512, 512, bufp); + } +#endif + +#if 0 + float* buf = new float[256*256]; + + for (int i = 0; i < 256; ++i) { + cudaMemcpy(buf, ((float*)D_pixelWeight.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost); + + char fname[20]; + sprintf(fname, "pix%03d.png", i); + saveImage(fname, 256, 256, buf); + } +#endif + processVol3D<opInvert>(D_pixelWeight, dims); + + if (useVolumeMask) { + // scale pixel weights with mask to zero out masked pixels + processVol3D<opMul>(D_pixelWeight, D_maskData, dims); + } + + return true; +} + + +bool SIRT::setVolumeMask(cudaPitchedPtr& _D_maskData) +{ + assert(useVolumeMask); + + D_maskData = _D_maskData; + + return true; +} + +bool SIRT::setSinogramMask(cudaPitchedPtr& _D_smaskData) +{ + assert(useSinogramMask); + + D_smaskData = _D_smaskData; + + return true; +} + +bool SIRT::setBuffers(cudaPitchedPtr& _D_volumeData, + cudaPitchedPtr& _D_projData) +{ + D_volumeData = _D_volumeData; + D_sinoData = _D_projData; + + fprintf(stderr, "Reconstruction buffer: %p\n", (void*)D_volumeData.ptr); + + return true; +} + +bool SIRT::iterate(unsigned int iterations) +{ + shouldAbort = false; + + if (useVolumeMask || useSinogramMask) + precomputeWeights(); + +#if 0 + float* buf = new float[256*256]; + + for (int i = 0; i < 256; ++i) { + cudaMemcpy(buf, ((float*)D_pixelWeight.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost); + + char fname[20]; + sprintf(fname, "pix%03d.png", i); + saveImage(fname, 256, 256, buf); + } +#endif +#if 0 + float* bufp = new float[512*512]; + + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 512; ++j) { + cudaMemcpy(bufp+512*j, ((float*)D_lineWeight.ptr)+100*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost); + } + + char fname[20]; + sprintf(fname, "ray%03d.png", i); + saveImage(fname, 512, 512, bufp); + } +#endif + + + // iteration + for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) { + // copy sinogram to projection data + duplicateProjectionData(D_projData, D_sinoData, dims); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + duplicateVolumeData(D_tmpData, D_volumeData, dims); + processVol3D<opMul>(D_tmpData, D_maskData, dims); + callFP(D_tmpData, D_projData, -1.0f); + } else { + callFP(D_volumeData, D_projData, -1.0f); + } + + processSino3D<opMul>(D_projData, D_lineWeight, dims); + + zeroVolumeData(D_tmpData, dims); +#if 0 + float* bufp = new float[512*512]; + printf("Dumping projData: %p\n", (void*)D_projData.ptr); + for (int i = 0; i < 180; ++i) { + for (int j = 0; j < 512; ++j) { + cudaMemcpy(bufp+512*j, ((float*)D_projData.ptr)+180*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost); + } + + char fname[20]; + sprintf(fname, "diff%03d.png", i); + saveImage(fname, 512, 512, bufp); + } +#endif + + + callBP(D_tmpData, D_projData); +#if 0 + printf("Dumping tmpData: %p\n", (void*)D_tmpData.ptr); + float* buf = new float[256*256]; + + for (int i = 0; i < 256; ++i) { + cudaMemcpy(buf, ((float*)D_tmpData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost); + + char fname[20]; + sprintf(fname, "add%03d.png", i); + saveImage(fname, 256, 256, buf); + } +#endif + + + processVol3D<opAddMul>(D_volumeData, D_tmpData, D_pixelWeight, dims); + + if (useMinConstraint) + processVol3D<opClampMin>(D_volumeData, fMinConstraint, dims); + if (useMaxConstraint) + processVol3D<opClampMax>(D_volumeData, fMaxConstraint, dims); + } + + return true; +} + +float SIRT::computeDiffNorm() +{ + // copy sinogram to projection data + duplicateProjectionData(D_projData, D_sinoData, dims); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + duplicateVolumeData(D_tmpData, D_volumeData, dims); + processVol3D<opMul>(D_tmpData, D_maskData, dims); + callFP(D_tmpData, D_projData, -1.0f); + } else { + callFP(D_volumeData, D_projData, -1.0f); + } + + float s = dotProduct3D(D_projData, dims.iProjU, dims.iProjAngles, dims.iProjV); + return sqrt(s); +} + + +bool doSIRT(cudaPitchedPtr& D_volumeData, + cudaPitchedPtr& D_sinoData, + cudaPitchedPtr& D_maskData, + const SDimensions3D& dims, const SConeProjection* angles, + unsigned int iterations) +{ + SIRT sirt; + bool ok = true; + + ok &= sirt.setConeGeometry(dims, angles); + if (D_maskData.ptr) + ok &= sirt.enableVolumeMask(); + + if (!ok) + return false; + + ok = sirt.init(); + if (!ok) + return false; + + if (D_maskData.ptr) + ok &= sirt.setVolumeMask(D_maskData); + + ok &= sirt.setBuffers(D_volumeData, D_sinoData); + if (!ok) + return false; + + ok = sirt.iterate(iterations); + + return ok; +} + +} + +#ifdef STANDALONE + +using namespace astraCUDA3d; + +int main() +{ + SDimensions3D dims; + dims.iVolX = 256; + dims.iVolY = 256; + dims.iVolZ = 256; + dims.iProjAngles = 100; + dims.iProjU = 512; + dims.iProjV = 512; + dims.iRaysPerDet = 1; + + SConeProjection angle[100]; + angle[0].fSrcX = -2905.6; + angle[0].fSrcY = 0; + angle[0].fSrcZ = 0; + + angle[0].fDetSX = 694.4; + angle[0].fDetSY = -122.4704; + angle[0].fDetSZ = -122.4704; + + angle[0].fDetUX = 0; + angle[0].fDetUY = .4784; + //angle[0].fDetUY = .5; + angle[0].fDetUZ = 0; + + angle[0].fDetVX = 0; + angle[0].fDetVY = 0; + angle[0].fDetVZ = .4784; + +#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0) + for (int i = 1; i < 100; ++i) { + angle[i] = angle[0]; + ROTATE0(Src, i, i*2*M_PI/100); + ROTATE0(DetS, i, i*2*M_PI/100); + ROTATE0(DetU, i, i*2*M_PI/100); + ROTATE0(DetV, i, i*2*M_PI/100); + } +#undef ROTATE0 + + + cudaPitchedPtr volData = allocateVolumeData(dims); + cudaPitchedPtr projData = allocateProjectionData(dims); + zeroProjectionData(projData, dims); + + float* pbuf = new float[100*512*512]; + copyProjectionsFromDevice(pbuf, projData, dims); + copyProjectionsToDevice(pbuf, projData, dims); + delete[] pbuf; + +#if 0 + float* slice = new float[256*256]; + cudaPitchedPtr ptr; + ptr.ptr = slice; + ptr.pitch = 256*sizeof(float); + ptr.xsize = 256*sizeof(float); + ptr.ysize = 256; + + for (unsigned int i = 0; i < 256; ++i) { + for (unsigned int y = 0; y < 256; ++y) + for (unsigned int x = 0; x < 256; ++x) + slice[y*256+x] = (i-127.5)*(i-127.5)+(y-127.5)*(y-127.5)+(x-127.5)*(x-127.5) < 4900 ? 1.0f : 0.0f; + + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, i }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaMemcpy3D(&p); + } + astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f); + +#else + + for (int i = 0; i < 100; ++i) { + char fname[32]; + sprintf(fname, "Tiffs/%04d.png", 4*i); + unsigned int w,h; + float* bufp = loadImage(fname, w,h); + + for (int j = 0; j < 512*512; ++j) { + float v = bufp[j]; + if (v > 236.0f) v = 236.0f; + v = logf(236.0f / v); + bufp[j] = 256*v; + } + + for (int j = 0; j < 512; ++j) { + cudaMemcpy(((float*)projData.ptr)+100*512*j+512*i, bufp+512*j, 512*sizeof(float), cudaMemcpyHostToDevice); + } + + delete[] bufp; + + } +#endif + +#if 0 + float* bufs = new float[100*512]; + + for (int i = 0; i < 512; ++i) { + cudaMemcpy(bufs, ((float*)projData.ptr)+100*512*i, 100*512*sizeof(float), cudaMemcpyDeviceToHost); + + printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize); + + char fname[20]; + sprintf(fname, "sino%03d.png", i); + saveImage(fname, 100, 512, bufs); + } + + float* bufp = new float[512*512]; + + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 512; ++j) { + cudaMemcpy(bufp+512*j, ((float*)projData.ptr)+100*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost); + } + + char fname[20]; + sprintf(fname, "proj%03d.png", i); + saveImage(fname, 512, 512, bufp); + } +#endif + + zeroVolumeData(volData, dims); + + cudaPitchedPtr maskData; + maskData.ptr = 0; + + astraCUDA3d::doSIRT(volData, projData, maskData, dims, angle, 50); +#if 1 + float* buf = new float[256*256]; + + for (int i = 0; i < 256; ++i) { + cudaMemcpy(buf, ((float*)volData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost); + + char fname[20]; + sprintf(fname, "vol%03d.png", i); + saveImage(fname, 256, 256, buf); + } +#endif + + return 0; +} +#endif + diff --git a/cuda/3d/sirt3d.h b/cuda/3d/sirt3d.h new file mode 100644 index 0000000..c3752c2 --- /dev/null +++ b/cuda/3d/sirt3d.h @@ -0,0 +1,118 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_SIRT3D_H +#define _CUDA_SIRT3D_H + +#include "util3d.h" +#include "algo3d.h" + +namespace astraCUDA3d { + +class _AstraExport SIRT : public ReconAlgo3D { +public: + SIRT(); + ~SIRT(); + +// bool setConeGeometry(const SDimensions3D& dims, const SConeProjection* projs); + + + bool enableVolumeMask(); + bool enableSinogramMask(); + + // init should be called after setting all geometry + bool init(); + + // setVolumeMask should be called after init and before iterate, + // but only if enableVolumeMask was called before init. + // It may be called again after iterate. + bool setVolumeMask(cudaPitchedPtr& D_maskData); + + // setSinogramMask should be called after init and before iterate, + // but only if enableSinogramMask was called before init. + // It may be called again after iterate. + bool setSinogramMask(cudaPitchedPtr& D_smaskData); + + + // setBuffers should be called after init and before iterate. + // It may be called again after iterate. + bool setBuffers(cudaPitchedPtr& D_volumeData, + cudaPitchedPtr& D_projData); + + + // set Min/Max constraints. They may be called at any time, and will affect + // any iterate() calls afterwards. + bool setMinConstraint(float fMin); + bool setMaxConstraint(float fMax); + + // iterate should be called after init and setBuffers. + // It may be called multiple times. + bool iterate(unsigned int iterations); + + // Compute the norm of the difference of the FP of the current reconstruction + // and the sinogram. (This performs one FP.) + // It can be called after iterate. + float computeDiffNorm(); + +protected: + void reset(); + bool precomputeWeights(); + + bool useVolumeMask; + bool useSinogramMask; + + bool useMinConstraint; + bool useMaxConstraint; + float fMinConstraint; + float fMaxConstraint; + + cudaPitchedPtr D_maskData; + cudaPitchedPtr D_smaskData; + + // Input/output + cudaPitchedPtr D_sinoData; + cudaPitchedPtr D_volumeData; + + // Temporary buffers + cudaPitchedPtr D_projData; + cudaPitchedPtr D_tmpData; + + // Geometry-specific precomputed data + cudaPitchedPtr D_lineWeight; + cudaPitchedPtr D_pixelWeight; +}; + +bool doSIRT(cudaPitchedPtr D_volumeData, unsigned int volumePitch, + cudaPitchedPtr D_projData, unsigned int projPitch, + cudaPitchedPtr D_maskData, unsigned int maskPitch, + const SDimensions3D& dims, const SConeProjection* projs, + unsigned int iterations); + +} + +#endif diff --git a/cuda/3d/util3d.cu b/cuda/3d/util3d.cu new file mode 100644 index 0000000..81ea823 --- /dev/null +++ b/cuda/3d/util3d.cu @@ -0,0 +1,514 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include "util3d.h" +#include "../2d/util.h" + +namespace astraCUDA3d { + + +cudaPitchedPtr allocateVolumeData(const SDimensions3D& dims) +{ + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPitchedPtr volData; + + cudaError err = cudaMalloc3D(&volData, extentV); + if (err != cudaSuccess) { + astraCUDA::reportCudaError(err); + fprintf(stderr, "Failed to allocate %dx%dx%d GPU buffer\n", dims.iVolX, dims.iVolY, dims.iVolZ); + volData.ptr = 0; + // TODO: return 0 somehow? + } + + return volData; +} +cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims) +{ + cudaExtent extentP; + extentP.width = dims.iProjU*sizeof(float); + extentP.height = dims.iProjAngles; + extentP.depth = dims.iProjV; + + cudaPitchedPtr projData; + + cudaError err = cudaMalloc3D(&projData, extentP); + if (err != cudaSuccess) { + astraCUDA::reportCudaError(err); + fprintf(stderr, "Failed to allocate %dx%dx%d GPU buffer\n", dims.iProjU, dims.iProjAngles, dims.iProjV); + projData.ptr = 0; + // TODO: return 0 somehow? + } + + return projData; +} +bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims) +{ + char* t = (char*)D_data.ptr; + cudaError err; + + for (unsigned int z = 0; z < dims.iVolZ; ++z) { + err = cudaMemset2D(t, D_data.pitch, 0, dims.iVolX*sizeof(float), dims.iVolY); + ASTRA_CUDA_ASSERT(err); + t += D_data.pitch * dims.iVolY; + } + return true; +} +bool zeroProjectionData(cudaPitchedPtr& D_data, const SDimensions3D& dims) +{ + char* t = (char*)D_data.ptr; + cudaError err; + + for (unsigned int z = 0; z < dims.iProjV; ++z) { + err = cudaMemset2D(t, D_data.pitch, 0, dims.iProjU*sizeof(float), dims.iProjAngles); + ASTRA_CUDA_ASSERT(err); + t += D_data.pitch * dims.iProjAngles; + } + + return true; +} +bool copyVolumeToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch) +{ + if (!pitch) + pitch = dims.iVolX; + + cudaPitchedPtr ptr; + ptr.ptr = (void*)data; // const cast away + ptr.pitch = pitch*sizeof(float); + ptr.xsize = dims.iVolX*sizeof(float); + ptr.ysize = dims.iVolY; + + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = D_data; + p.extent = extentV; + p.kind = cudaMemcpyHostToDevice; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} + +bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch) +{ + if (!pitch) + pitch = dims.iProjU; + + cudaPitchedPtr ptr; + ptr.ptr = (void*)data; // const cast away + ptr.pitch = pitch*sizeof(float); + ptr.xsize = dims.iProjU*sizeof(float); + ptr.ysize = dims.iProjAngles; + + cudaExtent extentV; + extentV.width = dims.iProjU*sizeof(float); + extentV.height = dims.iProjAngles; + extentV.depth = dims.iProjV; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = D_data; + p.extent = extentV; + p.kind = cudaMemcpyHostToDevice; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} + +bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch) +{ + if (!pitch) + pitch = dims.iVolX; + + cudaPitchedPtr ptr; + ptr.ptr = data; + ptr.pitch = pitch*sizeof(float); + ptr.xsize = dims.iVolX*sizeof(float); + ptr.ysize = dims.iVolY; + + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_data; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = ptr; + p.extent = extentV; + p.kind = cudaMemcpyDeviceToHost; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} +bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch) +{ + if (!pitch) + pitch = dims.iProjU; + + cudaPitchedPtr ptr; + ptr.ptr = data; + ptr.pitch = pitch*sizeof(float); + ptr.xsize = dims.iProjU*sizeof(float); + ptr.ysize = dims.iProjAngles; + + cudaExtent extentV; + extentV.width = dims.iProjU*sizeof(float); + extentV.height = dims.iProjAngles; + extentV.depth = dims.iProjV; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_data; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = ptr; + p.extent = extentV; + p.kind = cudaMemcpyDeviceToHost; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} + +bool duplicateVolumeData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims) +{ + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_src; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = D_dst; + p.extent = extentV; + p.kind = cudaMemcpyDeviceToDevice; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} +bool duplicateProjectionData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims) +{ + cudaExtent extentV; + extentV.width = dims.iProjU*sizeof(float); + extentV.height = dims.iProjAngles; + extentV.depth = dims.iProjV; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_src; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = D_dst; + p.extent = extentV; + p.kind = cudaMemcpyDeviceToDevice; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} + + + +// TODO: Consider using a single array of size max(proj,volume) (per dim) +// instead of allocating a new one each time + +// TODO: Figure out a faster way of zeroing the padding? + +cudaArray* allocateVolumeArray(const SDimensions3D& dims) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + cudaArray* cuArray; + cudaExtent extentA; + extentA.width = dims.iVolX+2; + extentA.height = dims.iVolY+2; + extentA.depth = dims.iVolZ+2; + cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA); + if (err != cudaSuccess) { + astraCUDA::reportCudaError(err); + fprintf(stderr, "Failed to allocate %dx%dx%d GPU array\n", dims.iVolX, dims.iVolY, dims.iVolZ); + return 0; + } + + zeroVolumeArray(cuArray, dims); + + return cuArray; +} +cudaArray* allocateProjectionArray(const SDimensions3D& dims) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + cudaArray* cuArray; + cudaExtent extentA; + extentA.width = dims.iProjU+2; + extentA.height = dims.iProjAngles; + extentA.depth = dims.iProjV+2; + cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA); + + if (err != cudaSuccess) { + astraCUDA::reportCudaError(err); + fprintf(stderr, "Failed to allocate %dx%dx%d GPU array\n", dims.iProjU, dims.iProjAngles, dims.iProjV); + return 0; + } + + zeroProjectionArray(cuArray, dims); + + return cuArray; +} +bool zeroVolumeArray(cudaArray* array, const SDimensions3D& dims) +{ + cudaPitchedPtr zeroBuf; + cudaExtent extentS; + extentS.width = sizeof(float)*(dims.iVolX+2); + extentS.height = dims.iVolY+2; + extentS.depth = 1; + + cudaExtent extentA; + extentA.width = dims.iVolX+2; + extentA.height = dims.iVolY+2; + extentA.depth = 1; + + + + cudaError err; + err = cudaMalloc3D(&zeroBuf, extentS); + ASTRA_CUDA_ASSERT(err); + err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iVolX+2), dims.iVolY+2); + ASTRA_CUDA_ASSERT(err); + + // zero array + for (unsigned int i = 0; i < dims.iVolZ+2; ++i) { + cudaMemcpy3DParms p; + cudaPos zp = {0, 0, 0}; + cudaPos dp = {0, 0, i}; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = zeroBuf; + p.dstArray = array; + p.dstPtr.ptr = 0; + p.dstPtr.pitch = 0; + p.dstPtr.xsize = 0; + p.dstPtr.ysize = 0; + p.dstPos = dp; + p.extent = extentA; + p.kind = cudaMemcpyDeviceToDevice; + + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + } + cudaFree(zeroBuf.ptr); + + // TODO: check errors + + return true; +} +bool zeroProjectionArray(cudaArray* array, const SDimensions3D& dims) +{ + cudaPitchedPtr zeroBuf; + cudaExtent extentS; + extentS.width = sizeof(float)*(dims.iProjU+2); + extentS.height = dims.iProjAngles; + extentS.depth = 1; + cudaExtent extentA; + extentA.width = dims.iProjU+2; + extentA.height = dims.iProjAngles; + extentA.depth = 1; + + + cudaError err; + err = cudaMalloc3D(&zeroBuf, extentS); + ASTRA_CUDA_ASSERT(err); + err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iProjU+2), dims.iProjAngles); + ASTRA_CUDA_ASSERT(err); + + for (unsigned int i = 0; i < dims.iProjV+2; ++i) { + cudaMemcpy3DParms p; + cudaPos zp = {0, 0, 0}; + cudaPos dp = {0, 0, i}; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = zeroBuf; + p.dstArray = array; + p.dstPtr.ptr = 0; + p.dstPtr.pitch = 0; + p.dstPtr.xsize = 0; + p.dstPtr.ysize = 0; + p.dstPos = dp; + p.extent = extentA; + p.kind = cudaMemcpyDeviceToDevice; + + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + } + cudaFree(zeroBuf.ptr); + + // TODO: check errors + return true; +} + + +bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const SDimensions3D& dims) +{ + cudaExtent extentA; + extentA.width = dims.iVolX; + extentA.height = dims.iVolY; + extentA.depth = dims.iVolZ; + + cudaMemcpy3DParms p; + cudaPos zp = {0, 0, 0}; + cudaPos dp = {1, 1, 1}; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_volumeData; + p.dstArray = array; + p.dstPtr.ptr = 0; + p.dstPtr.pitch = 0; + p.dstPtr.xsize = 0; + p.dstPtr.ysize = 0; + p.dstPos = dp; + p.extent = extentA; + p.kind = cudaMemcpyDeviceToDevice; + + cudaError err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + // TODO: check errors + + return true; +} +bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, const SDimensions3D& dims) +{ + cudaExtent extentA; + extentA.width = dims.iProjU; + extentA.height = dims.iProjAngles; + extentA.depth = dims.iProjV; + + cudaMemcpy3DParms p; + cudaPos zp = {0, 0, 0}; + cudaPos dp = {1, 0, 1}; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_projData; + p.dstArray = array; + p.dstPtr.ptr = 0; + p.dstPtr.pitch = 0; + p.dstPtr.xsize = 0; + p.dstPtr.ysize = 0; + p.dstPos = dp; + p.extent = extentA; + p.kind = cudaMemcpyDeviceToDevice; + + cudaError err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + // TODO: check errors + + return true; +} + + +float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y, + unsigned int z) +{ + return astraCUDA::dotProduct2D((float*)data.ptr, data.pitch/sizeof(float), x, y*z, 0, 0); +} + + +bool cudaTextForceKernelsCompletion() +{ + cudaError_t returnedCudaError = cudaThreadSynchronize(); + + if(returnedCudaError != cudaSuccess) { + fprintf(stderr, "Failed to force completion of cuda kernels: %d: %s.\n", returnedCudaError, cudaGetErrorString(returnedCudaError)); + return false; + } + + return true; +} + +int calcNextPowerOfTwo(int _iValue) +{ + int iOutput = 1; + while(iOutput < _iValue) + iOutput *= 2; + return iOutput; +} + +} diff --git a/cuda/3d/util3d.h b/cuda/3d/util3d.h new file mode 100644 index 0000000..cf04a18 --- /dev/null +++ b/cuda/3d/util3d.h @@ -0,0 +1,69 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_UTIL3D_H +#define _CUDA_UTIL3D_H + +#include <cuda.h> +#include "dims3d.h" + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif +#include "../2d/util.h" + +namespace astraCUDA3d { + +cudaPitchedPtr allocateVolumeData(const SDimensions3D& dims); +cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims); +bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims); +bool zeroProjectionData(cudaPitchedPtr& D_data, const SDimensions3D& dims); +bool copyVolumeToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch = 0); +bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch = 0); +bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch = 0); +bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch = 0); +bool duplicateVolumeData(cudaPitchedPtr& D_dest, const cudaPitchedPtr& D_src, const SDimensions3D& dims); +bool duplicateProjectionData(cudaPitchedPtr& D_dest, const cudaPitchedPtr& D_src, const SDimensions3D& dims); + + +bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, const SDimensions3D& dims); +bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const SDimensions3D& dims); +bool zeroProjectionArray(cudaArray* array, const SDimensions3D& dims); +bool zeroVolumeArray(cudaArray* array, const SDimensions3D& dims); +cudaArray* allocateProjectionArray(const SDimensions3D& dims); +cudaArray* allocateVolumeArray(const SDimensions3D& dims); + +bool cudaTextForceKernelsCompletion(); + +float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y, unsigned int z); + +int calcNextPowerOfTwo(int _iValue); + +} + +#endif |