summaryrefslogtreecommitdiffstats
path: root/patches/astra-toolbox-approximate-projectors
diff options
context:
space:
mode:
Diffstat (limited to 'patches/astra-toolbox-approximate-projectors')
-rw-r--r--patches/astra-toolbox-approximate-projectors/cone_bp.cu397
-rw-r--r--patches/astra-toolbox-approximate-projectors/cone_fp.cu516
-rw-r--r--patches/astra-toolbox-approximate-projectors/par3d_bp.cu327
-rw-r--r--patches/astra-toolbox-approximate-projectors/par3d_fp.cu770
-rw-r--r--patches/astra-toolbox-approximate-projectors/rounding.h50
5 files changed, 2060 insertions, 0 deletions
diff --git a/patches/astra-toolbox-approximate-projectors/cone_bp.cu b/patches/astra-toolbox-approximate-projectors/cone_bp.cu
new file mode 100644
index 0000000..01edcb9
--- /dev/null
+++ b/patches/astra-toolbox-approximate-projectors/cone_bp.cu
@@ -0,0 +1,397 @@
+/*
+-----------------------------------------------------------------------
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+ 2014-2021, CWI, Amsterdam
+
+Contact: astra@astra-toolbox.com
+Website: http://www.astra-toolbox.com/
+
+This file is part of the ASTRA Toolbox.
+
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+*/
+
+#include "astra/cuda/3d/util3d.h"
+#include "astra/cuda/3d/dims3d.h"
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+
+namespace astraCUDA3d {
+
+static const unsigned int g_volBlockZ = 6;
+
+static const unsigned int g_anglesPerBlock = 32;
+static const unsigned int g_volBlockX = 16;
+static const unsigned int g_volBlockY = 32;
+
+static const unsigned g_MaxAngles = 1024;
+
+struct DevConeParams {
+ float4 fNumU;
+ float4 fNumV;
+ float4 fDen;
+};
+
+__constant__ DevConeParams gC_C[g_MaxAngles];
+
+#include "rounding.h"
+
+//__launch_bounds__(32*16, 4)
+template<bool FDKWEIGHT, unsigned int ZSIZE>
+__global__ void dev_cone_BP(void* D_volData, unsigned int volPitch,
+ cudaTextureObject_t tex,
+ int startAngle, int angleOffset,
+ const astraCUDA3d::SDimensions3D dims,
+ float fOutputScale)
+{
+ float* volData = (float*)D_volData;
+
+ int endAngle = startAngle + g_anglesPerBlock;
+ if (endAngle > dims.iProjAngles - angleOffset)
+ endAngle = dims.iProjAngles - angleOffset;
+
+ // threadIdx: x = rel x
+ // y = rel y
+
+ // blockIdx: x = x + y
+ // y = z
+
+
+
+ const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x;
+ const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y;
+
+ if (X >= dims.iVolX)
+ return;
+ if (Y >= dims.iVolY)
+ return;
+
+ const int startZ = blockIdx.y * g_volBlockZ;
+ const float fX = X - 0.5f*dims.iVolX + 0.5f;
+ const float fY = Y - 0.5f*dims.iVolY + 0.5f;
+ const float fZ = startZ - 0.5f*dims.iVolZ + 0.5f;
+
+ float Z[ZSIZE];
+ for(int i=0; i < ZSIZE; i++)
+ Z[i] = 0.0f;
+
+
+ {
+ float fAngle = startAngle + angleOffset + 0.5f;
+
+ for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f)
+ {
+ float4 fCu = gC_C[angle].fNumU;
+ float4 fCv = gC_C[angle].fNumV;
+ float4 fCd = gC_C[angle].fDen;
+
+ float fUNum = fCu.w + fX * fCu.x + fY * fCu.y + fZ * fCu.z;
+ float fVNum = fCv.w + fX * fCv.x + fY * fCv.y + fZ * fCv.z;
+ float fDen = (FDKWEIGHT ? 1.0f : fCd.w) + fX * fCd.x + fY * fCd.y + fZ * fCd.z;
+
+ float fU,fV, fr;
+
+ for (int idx = 0; idx < ZSIZE; idx++)
+ {
+ fr = __fdividef(1.0f, fDen);
+ fU = fUNum * fr;
+ fV = fVNum * fr;
+
+ float fUf = round(fU) - 0.5f;
+ float fVf = round(fV) - 0.5f;
+
+ textype fU_ = texto(fU);
+ textype fV_ = texto(fV);
+ textype fUf_ = texto(fUf);
+ textype fVf_ = texto(fVf);
+
+ textype fVal0_0; textocheck(fVal0_0, "bp", tex3D<float>(tex, fUf, fAngle, fVf));
+ textype fVal1_0; textocheck(fVal1_0, "bp", tex3D<float>(tex, fUf + 1.0f, fAngle, fVf));
+ textype fVal0_1; textocheck(fVal0_1, "bp", tex3D<float>(tex, fUf, fAngle, fVf + 1.0f));
+ textype fVal1_1; textocheck(fVal1_1, "bp", tex3D<float>(tex, fUf + 1.0f, fAngle, fVf + 1.0f));
+
+ textype fVal0 = interpolate(fVal0_0, fVal0_1, (fV_ - fVf_));
+ textype fVal1 = interpolate(fVal1_0, fVal1_1, (fV_ - fVf_));
+ float fVal = texfrom(interpolate(fVal0, fVal1, (fU_ - fUf_)));
+
+// float fVal = tex3D<float>(tex, fU, fAngle, fV);
+ Z[idx] += fr*fr*fVal;
+
+ fUNum += fCu.z;
+ fVNum += fCv.z;
+ fDen += fCd.z;
+ }
+ }
+ }
+
+ int endZ = ZSIZE;
+ if (endZ > dims.iVolZ - startZ)
+ endZ = dims.iVolZ - startZ;
+
+ for(int i=0; i < endZ; i++)
+ volData[((startZ+i)*dims.iVolY+Y)*volPitch+X] += Z[i] * fOutputScale;
+} //End kernel
+
+
+
+// supersampling version
+__global__ void dev_cone_BP_SS(void* D_volData, unsigned int volPitch, cudaTextureObject_t tex, int startAngle, int angleOffset, const SDimensions3D dims, int iRaysPerVoxelDim, float fOutputScale)
+{
+ float* volData = (float*)D_volData;
+
+ int endAngle = startAngle + g_anglesPerBlock;
+ if (endAngle > dims.iProjAngles - angleOffset)
+ endAngle = dims.iProjAngles - angleOffset;
+
+ // threadIdx: x = rel x
+ // y = rel y
+
+ // blockIdx: x = x + y
+ // y = z
+
+
+ // TO TRY: precompute part of detector intersection formulas in shared mem?
+ // TO TRY: inner loop over z, gather ray values in shared mem
+
+ const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x;
+ const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y;
+
+ if (X >= dims.iVolX)
+ return;
+ if (Y >= dims.iVolY)
+ return;
+
+ const int startZ = blockIdx.y * g_volBlockZ;
+ int endZ = startZ + g_volBlockZ;
+ if (endZ > dims.iVolZ)
+ endZ = dims.iVolZ;
+
+ float fX = X - 0.5f*dims.iVolX + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim;
+ float fY = Y - 0.5f*dims.iVolY + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim;
+ float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim;
+ const float fSubStep = 1.0f/iRaysPerVoxelDim;
+
+ fOutputScale /= (iRaysPerVoxelDim*iRaysPerVoxelDim*iRaysPerVoxelDim);
+
+
+ for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f)
+ {
+
+ float fVal = 0.0f;
+ float fAngle = startAngle + angleOffset + 0.5f;
+
+ for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f)
+ {
+ float4 fCu = gC_C[angle].fNumU;
+ float4 fCv = gC_C[angle].fNumV;
+ float4 fCd = gC_C[angle].fDen;
+
+ float fXs = fX;
+ for (int iSubX = 0; iSubX < iRaysPerVoxelDim; ++iSubX) {
+ float fYs = fY;
+ for (int iSubY = 0; iSubY < iRaysPerVoxelDim; ++iSubY) {
+ float fZs = fZ;
+ for (int iSubZ = 0; iSubZ < iRaysPerVoxelDim; ++iSubZ) {
+
+ const float fUNum = fCu.w + fXs * fCu.x + fYs * fCu.y + fZs * fCu.z;
+ const float fVNum = fCv.w + fXs * fCv.x + fYs * fCv.y + fZs * fCv.z;
+ const float fDen = fCd.w + fXs * fCd.x + fYs * fCd.y + fZs * fCd.z;
+
+ const float fr = __fdividef(1.0f, fDen);
+ const float fU = fUNum * fr;
+ const float fV = fVNum * fr;
+
+ fVal += tex3D<float>(tex, fU, fAngle, fV) * fr * fr;
+
+ fZs += fSubStep;
+ }
+ fYs += fSubStep;
+ }
+ fXs += fSubStep;
+ }
+
+ }
+
+ volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal * fOutputScale;
+ }
+}
+
+
+bool transferConstants(const SConeProjection* angles, unsigned int iProjAngles, const SProjectorParams3D& params)
+{
+ DevConeParams *p = new DevConeParams[iProjAngles];
+
+ // We need three things in the kernel:
+ // projected coordinates of pixels on the detector:
+
+ // u: || (x-s) v (s-d) || / || u v (s-x) ||
+ // v: -|| u (x-s) (s-d) || / || u v (s-x) ||
+
+ // ray density weighting factor for the adjoint
+ // || u v (s-d) ||^2 / ( |cross(u,v)| * || u v (s-x) ||^2 )
+
+ // FDK weighting factor
+ // ( || u v s || / || u v (s-x) || ) ^ 2
+
+ // Since u and v are ratios with the same denominator, we have
+ // a degree of freedom to scale the denominator. We use that to make
+ // the square of the denominator equal to the relevant weighting factor.
+
+
+ for (unsigned int i = 0; i < iProjAngles; ++i) {
+ Vec3 u(angles[i].fDetUX, angles[i].fDetUY, angles[i].fDetUZ);
+ Vec3 v(angles[i].fDetVX, angles[i].fDetVY, angles[i].fDetVZ);
+ Vec3 s(angles[i].fSrcX, angles[i].fSrcY, angles[i].fSrcZ);
+ Vec3 d(angles[i].fDetSX, angles[i].fDetSY, angles[i].fDetSZ);
+
+
+
+ double fScale;
+ if (!params.bFDKWeighting) {
+ // goal: 1/fDen^2 = || u v (s-d) ||^2 / ( |cross(u,v)| * || u v (s-x) ||^2 )
+ // fDen = ( sqrt(|cross(u,v)|) * || u v (s-x) || ) / || u v (s-d) ||
+ // i.e. scale = sqrt(|cross(u,v)|) * / || u v (s-d) ||
+
+
+ // NB: for cross(u,v) we invert the volume scaling (for the voxel
+ // size normalization) to get the proper dimensions for
+ // the scaling of the adjoint
+
+ fScale = sqrt(scaled_cross3(u,v,Vec3(params.fVolScaleX,params.fVolScaleY,params.fVolScaleZ)).norm()) / det3(u, v, s-d);
+ } else {
+ // goal: 1/fDen = || u v s || / || u v (s-x) ||
+ // fDen = || u v (s-x) || / || u v s ||
+ // i.e., scale = 1 / || u v s ||
+
+ fScale = 1.0 / det3(u, v, s);
+ }
+
+ p[i].fNumU.w = fScale * det3(s,v,d);
+ p[i].fNumU.x = fScale * det3x(v,s-d);
+ p[i].fNumU.y = fScale * det3y(v,s-d);
+ p[i].fNumU.z = fScale * det3z(v,s-d);
+ p[i].fNumV.w = -fScale * det3(s,u,d);
+ p[i].fNumV.x = -fScale * det3x(u,s-d);
+ p[i].fNumV.y = -fScale * det3y(u,s-d);
+ p[i].fNumV.z = -fScale * det3z(u,s-d);
+ p[i].fDen.w = fScale * det3(u, v, s); // == 1.0 for FDK
+ p[i].fDen.x = -fScale * det3x(u, v);
+ p[i].fDen.y = -fScale * det3y(u, v);
+ p[i].fDen.z = -fScale * det3z(u, v);
+ }
+
+ // TODO: Check for errors
+ cudaMemcpyToSymbol(gC_C, p, iProjAngles*sizeof(DevConeParams), 0, cudaMemcpyHostToDevice);
+
+ delete[] p;
+
+ return true;
+}
+
+
+bool ConeBP_Array(cudaPitchedPtr D_volumeData,
+ cudaArray *D_projArray,
+ const SDimensions3D& dims, const SConeProjection* angles,
+ const SProjectorParams3D& params)
+{
+ cudaTextureObject_t D_texObj;
+ if (!createTextureObject3D(D_projArray, D_texObj))
+ return false;
+
+ float fOutputScale;
+ if (params.bFDKWeighting) {
+ // NB: assuming cube voxels here
+ fOutputScale = params.fOutputScale / (params.fVolScaleX);
+ } else {
+ fOutputScale = params.fOutputScale * (params.fVolScaleX * params.fVolScaleY * params.fVolScaleZ);
+ }
+
+ bool ok = true;
+
+ for (unsigned int th = 0; th < dims.iProjAngles; th += g_MaxAngles) {
+ unsigned int angleCount = g_MaxAngles;
+ if (th + angleCount > dims.iProjAngles)
+ angleCount = dims.iProjAngles - th;
+
+ ok = transferConstants(angles, angleCount, params);
+ if (!ok)
+ break;
+
+ dim3 dimBlock(g_volBlockX, g_volBlockY);
+
+ dim3 dimGrid(((dims.iVolX/1+g_volBlockX-1)/(g_volBlockX))*((dims.iVolY/1+1*g_volBlockY-1)/(1*g_volBlockY)), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ);
+
+ // timeval t;
+ // tic(t);
+
+ for (unsigned int i = 0; i < angleCount; i += g_anglesPerBlock) {
+ // printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr);
+ if (params.bFDKWeighting) {
+ if (dims.iVolZ == 1) {
+ dev_cone_BP<true, 1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale);
+ } else {
+ dev_cone_BP<true, g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale);
+ }
+ } else if (params.iRaysPerVoxelDim == 1) {
+ if (dims.iVolZ == 1) {
+ dev_cone_BP<false, 1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale);
+ } else {
+ dev_cone_BP<false, g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale);
+ }
+ } else
+ dev_cone_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, params.iRaysPerVoxelDim, fOutputScale);
+ }
+
+ // TODO: Consider not synchronizing here, if possible.
+ ok = checkCuda(cudaThreadSynchronize(), "cone_bp");
+ if (!ok)
+ break;
+
+ angles = angles + angleCount;
+ // printf("%f\n", toc(t));
+
+ }
+
+ cudaDestroyTextureObject(D_texObj);
+
+ return ok;
+}
+
+bool ConeBP(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SConeProjection* angles,
+ const SProjectorParams3D& params)
+{
+ // transfer projections to array
+
+ cudaArray* cuArray = allocateProjectionArray(dims);
+ transferProjectionsToArray(D_projData, cuArray, dims);
+
+ bool ret = ConeBP_Array(D_volumeData, cuArray, dims, angles, params);
+
+ cudaFreeArray(cuArray);
+
+ return ret;
+}
+
+
+}
diff --git a/patches/astra-toolbox-approximate-projectors/cone_fp.cu b/patches/astra-toolbox-approximate-projectors/cone_fp.cu
new file mode 100644
index 0000000..f11813c
--- /dev/null
+++ b/patches/astra-toolbox-approximate-projectors/cone_fp.cu
@@ -0,0 +1,516 @@
+/*
+-----------------------------------------------------------------------
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+ 2014-2021, CWI, Amsterdam
+
+Contact: astra@astra-toolbox.com
+Website: http://www.astra-toolbox.com/
+
+This file is part of the ASTRA Toolbox.
+
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+*/
+
+#include "astra/cuda/3d/util3d.h"
+#include "astra/cuda/3d/dims3d.h"
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+
+namespace astraCUDA3d {
+
+static const unsigned int g_anglesPerBlock = 4;
+
+// thickness of the slices we're splitting the volume up into
+static const unsigned int g_blockSlices = 4;
+static const unsigned int g_detBlockU = 32;
+static const unsigned int g_detBlockV = 32;
+
+static const unsigned g_MaxAngles = 1024;
+__constant__ float gC_SrcX[g_MaxAngles];
+__constant__ float gC_SrcY[g_MaxAngles];
+__constant__ float gC_SrcZ[g_MaxAngles];
+__constant__ float gC_DetSX[g_MaxAngles];
+__constant__ float gC_DetSY[g_MaxAngles];
+__constant__ float gC_DetSZ[g_MaxAngles];
+__constant__ float gC_DetUX[g_MaxAngles];
+__constant__ float gC_DetUY[g_MaxAngles];
+__constant__ float gC_DetUZ[g_MaxAngles];
+__constant__ float gC_DetVX[g_MaxAngles];
+__constant__ float gC_DetVY[g_MaxAngles];
+__constant__ float gC_DetVZ[g_MaxAngles];
+
+
+// x=0, y=1, z=2
+struct DIR_X {
+ __device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolX; }
+ __device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolY; }
+ __device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolZ; }
+ __device__ float c0(float x, float y, float z) const { return x; }
+ __device__ float c1(float x, float y, float z) const { return y; }
+ __device__ float c2(float x, float y, float z) const { return z; }
+ __device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f0, f1, f2); }
+ __device__ float x(float f0, float f1, float f2) const { return f0; }
+ __device__ float y(float f0, float f1, float f2) const { return f1; }
+ __device__ float z(float f0, float f1, float f2) const { return f2; }
+};
+
+// y=0, x=1, z=2
+struct DIR_Y {
+ __device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolY; }
+ __device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolX; }
+ __device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolZ; }
+ __device__ float c0(float x, float y, float z) const { return y; }
+ __device__ float c1(float x, float y, float z) const { return x; }
+ __device__ float c2(float x, float y, float z) const { return z; }
+ __device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f1, f0, f2); }
+ __device__ float x(float f0, float f1, float f2) const { return f1; }
+ __device__ float y(float f0, float f1, float f2) const { return f0; }
+ __device__ float z(float f0, float f1, float f2) const { return f2; }
+};
+
+// z=0, x=1, y=2
+struct DIR_Z {
+ __device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolZ; }
+ __device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolX; }
+ __device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolY; }
+ __device__ float c0(float x, float y, float z) const { return z; }
+ __device__ float c1(float x, float y, float z) const { return x; }
+ __device__ float c2(float x, float y, float z) const { return y; }
+ __device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f1, f2, f0); }
+ __device__ float x(float f0, float f1, float f2) const { return f1; }
+ __device__ float y(float f0, float f1, float f2) const { return f2; }
+ __device__ float z(float f0, float f1, float f2) const { return f0; }
+};
+
+struct SCALE_CUBE {
+ float fOutputScale;
+ __device__ float scale(float a1, float a2) const { return sqrt(a1*a1+a2*a2+1.0f) * fOutputScale; }
+};
+
+struct SCALE_NONCUBE {
+ float fScale1;
+ float fScale2;
+ float fOutputScale;
+ __device__ float scale(float a1, float a2) const { return sqrt(a1*a1*fScale1+a2*a2*fScale2+1.0f) * fOutputScale; }
+};
+
+
+bool transferConstants(const SConeProjection* angles, unsigned int iProjAngles)
+{
+ // transfer angles to constant memory
+ float* tmp = new float[iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+ TRANSFER_TO_CONSTANT(SrcX);
+ TRANSFER_TO_CONSTANT(SrcY);
+ TRANSFER_TO_CONSTANT(SrcZ);
+ TRANSFER_TO_CONSTANT(DetSX);
+ TRANSFER_TO_CONSTANT(DetSY);
+ TRANSFER_TO_CONSTANT(DetSZ);
+ TRANSFER_TO_CONSTANT(DetUX);
+ TRANSFER_TO_CONSTANT(DetUY);
+ TRANSFER_TO_CONSTANT(DetUZ);
+ TRANSFER_TO_CONSTANT(DetVX);
+ TRANSFER_TO_CONSTANT(DetVY);
+ TRANSFER_TO_CONSTANT(DetVZ);
+
+#undef TRANSFER_TO_CONSTANT
+
+ delete[] tmp;
+
+ return true;
+}
+
+
+#include "rounding.h"
+
+ // threadIdx: x = ??? detector (u?)
+ // y = relative angle
+
+ // blockIdx: x = ??? detector (u+v?)
+ // y = angle block
+
+template<class COORD, class SCALE>
+__global__ void cone_FP_t(float* D_projData, unsigned int projPitch,
+ cudaTextureObject_t tex,
+ unsigned int startSlice,
+ unsigned int startAngle, unsigned int endAngle,
+ const SDimensions3D dims,
+ SCALE sc)
+{
+ COORD c;
+
+ int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y;
+ if (angle >= endAngle)
+ return;
+
+ const float fSrcX = gC_SrcX[angle];
+ const float fSrcY = gC_SrcY[angle];
+ const float fSrcZ = gC_SrcZ[angle];
+ const float fDetUX = gC_DetUX[angle];
+ const float fDetUY = gC_DetUY[angle];
+ const float fDetUZ = gC_DetUZ[angle];
+ const float fDetVX = gC_DetVX[angle];
+ const float fDetVY = gC_DetVY[angle];
+ const float fDetVZ = gC_DetVZ[angle];
+ const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX;
+ const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY;
+ const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ;
+
+ const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+ if (detectorU >= dims.iProjU)
+ return;
+ const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
+ int endDetectorV = startDetectorV + g_detBlockV;
+ if (endDetectorV > dims.iProjV)
+ endDetectorV = dims.iProjV;
+
+ int endSlice = startSlice + g_blockSlices;
+ if (endSlice > c.nSlices(dims))
+ endSlice = c.nSlices(dims);
+
+ for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV)
+ {
+ /* Trace ray from Src to (detectorU,detectorV) from */
+ /* X = startSlice to X = endSlice */
+
+ const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX;
+ const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY;
+ const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ;
+
+ /* (x) ( 1) ( 0) */
+ /* ray: (y) = (ay) * x + (by) */
+ /* (z) (az) (bz) */
+
+ const float a1 = (c.c1(fSrcX,fSrcY,fSrcZ) - c.c1(fDetX,fDetY,fDetZ)) / (c.c0(fSrcX,fSrcY,fSrcZ) - c.c0(fDetX,fDetY,fDetZ));
+ const float a2 = (c.c2(fSrcX,fSrcY,fSrcZ) - c.c2(fDetX,fDetY,fDetZ)) / (c.c0(fSrcX,fSrcY,fSrcZ) - c.c0(fDetX,fDetY,fDetZ));
+ const float b1 = c.c1(fSrcX,fSrcY,fSrcZ) - a1 * c.c0(fSrcX,fSrcY,fSrcZ);
+ const float b2 = c.c2(fSrcX,fSrcY,fSrcZ) - a2 * c.c0(fSrcX,fSrcY,fSrcZ);
+
+ const float fDistCorr = sc.scale(a1, a2);
+
+ float fVal = 0.0f;
+
+ float f0 = startSlice + 0.5f;
+ float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f;
+ float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f;
+
+ for (int s = startSlice; s < endSlice; ++s)
+ {
+
+ float f1f = round(f1) - 0.5f;
+ float f2f = round(f2) - 0.5f;
+
+ textype f1_ = texto(f1);
+ textype f2_ = texto(f2);
+ textype f1f_ = texto(f1f);
+ textype f2f_ = texto(f2f);
+
+ textype fVal0_0; textocheck(fVal0_0, "fp", c.tex(tex, f0, f1f, f2f));
+ textype fVal1_0; textocheck(fVal1_0, "fp", c.tex(tex, f0, f1f + 1.0f, f2f));
+ textype fVal0_1; textocheck(fVal0_1, "fp", c.tex(tex, f0, f1f, f2f + 1.0f));
+ textype fVal1_1; textocheck(fVal1_1, "fp", c.tex(tex, f0, f1f + 1.0f, f2f + 1.0f));
+
+ textype fVal0 = interpolate(fVal0_0, fVal0_1, (f2_ - f2f_));
+ textype fVal1 = interpolate(fVal1_0, fVal1_1, (f2_ - f2f_));
+ fVal += texfrom(interpolate(fVal0, fVal1, (f1_ - f1f_)));
+
+// fVal += c.tex(tex, f0, f1, f2);
+ f0 += 1.0f;
+ f1 += a1;
+ f2 += a2;
+ }
+
+ fVal *= fDistCorr;
+
+ D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal;
+ }
+}
+
+template<class COORD>
+__global__ void cone_FP_SS_t(float* D_projData, unsigned int projPitch,
+ cudaTextureObject_t tex,
+ unsigned int startSlice,
+ unsigned int startAngle, unsigned int endAngle,
+ const SDimensions3D dims, int iRaysPerDetDim,
+ SCALE_NONCUBE sc)
+{
+ COORD c;
+
+ int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y;
+ if (angle >= endAngle)
+ return;
+
+ const float fSrcX = gC_SrcX[angle];
+ const float fSrcY = gC_SrcY[angle];
+ const float fSrcZ = gC_SrcZ[angle];
+ const float fDetUX = gC_DetUX[angle];
+ const float fDetUY = gC_DetUY[angle];
+ const float fDetUZ = gC_DetUZ[angle];
+ const float fDetVX = gC_DetVX[angle];
+ const float fDetVY = gC_DetVY[angle];
+ const float fDetVZ = gC_DetVZ[angle];
+ const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX;
+ const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY;
+ const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ;
+
+ const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+ if (detectorU >= dims.iProjU)
+ return;
+ const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
+ int endDetectorV = startDetectorV + g_detBlockV;
+ if (endDetectorV > dims.iProjV)
+ endDetectorV = dims.iProjV;
+
+ int endSlice = startSlice + g_blockSlices;
+ if (endSlice > c.nSlices(dims))
+ endSlice = c.nSlices(dims);
+
+ const float fSubStep = 1.0f/iRaysPerDetDim;
+
+ for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV)
+ {
+ /* Trace ray from Src to (detectorU,detectorV) from */
+ /* X = startSlice to X = endSlice */
+
+ float fV = 0.0f;
+
+ float fdU = detectorU - 0.5f + 0.5f*fSubStep;
+ for (int iSubU = 0; iSubU < iRaysPerDetDim; ++iSubU, fdU+=fSubStep) {
+ float fdV = detectorV - 0.5f + 0.5f*fSubStep;
+ for (int iSubV = 0; iSubV < iRaysPerDetDim; ++iSubV, fdV+=fSubStep) {
+
+ const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX;
+ const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY;
+ const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ;
+
+ /* (x) ( 1) ( 0) */
+ /* ray: (y) = (ay) * x + (by) */
+ /* (z) (az) (bz) */
+
+ const float a1 = (c.c1(fSrcX,fSrcY,fSrcZ) - c.c1(fDetX,fDetY,fDetZ)) / (c.c0(fSrcX,fSrcY,fSrcZ) - c.c0(fDetX,fDetY,fDetZ));
+ const float a2 = (c.c2(fSrcX,fSrcY,fSrcZ) - c.c2(fDetX,fDetY,fDetZ)) / (c.c0(fSrcX,fSrcY,fSrcZ) - c.c0(fDetX,fDetY,fDetZ));
+ const float b1 = c.c1(fSrcX,fSrcY,fSrcZ) - a1 * c.c0(fSrcX,fSrcY,fSrcZ);
+ const float b2 = c.c2(fSrcX,fSrcY,fSrcZ) - a2 * c.c0(fSrcX,fSrcY,fSrcZ);
+
+ const float fDistCorr = sc.scale(a1, a2);
+
+ float fVal = 0.0f;
+
+ float f0 = startSlice + 0.5f;
+ float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f;
+ float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f;
+
+ for (int s = startSlice; s < endSlice; ++s)
+ {
+ fVal += c.tex(tex, f0, f1, f2);
+ f0 += 1.0f;
+ f1 += a1;
+ f2 += a2;
+ }
+
+ fVal *= fDistCorr;
+ fV += fVal;
+
+ }
+ }
+
+ D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (iRaysPerDetDim * iRaysPerDetDim);
+ }
+}
+
+
+bool ConeFP_Array_internal(cudaPitchedPtr D_projData,
+ cudaTextureObject_t D_texObj,
+ const SDimensions3D& dims,
+ unsigned int angleCount, const SConeProjection* angles,
+ const SProjectorParams3D& params)
+{
+ if (!transferConstants(angles, angleCount))
+ return false;
+
+ std::list<cudaStream_t> streams;
+ dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles
+
+ // Run over all angles, grouping them into groups of the same
+ // orientation (roughly horizontal vs. roughly vertical).
+ // Start a stream of grids for each such group.
+
+ unsigned int blockStart = 0;
+ unsigned int blockEnd = 0;
+ int blockDirection = 0;
+
+ bool cube = true;
+ if (abs(params.fVolScaleX / params.fVolScaleY - 1.0) > 0.00001)
+ cube = false;
+ if (abs(params.fVolScaleX / params.fVolScaleZ - 1.0) > 0.00001)
+ cube = false;
+
+ SCALE_CUBE scube;
+ scube.fOutputScale = params.fOutputScale * params.fVolScaleX;
+
+ SCALE_NONCUBE snoncubeX;
+ float fS1 = params.fVolScaleY / params.fVolScaleX;
+ snoncubeX.fScale1 = fS1 * fS1;
+ float fS2 = params.fVolScaleZ / params.fVolScaleX;
+ snoncubeX.fScale2 = fS2 * fS2;
+ snoncubeX.fOutputScale = params.fOutputScale * params.fVolScaleX;
+
+ SCALE_NONCUBE snoncubeY;
+ fS1 = params.fVolScaleX / params.fVolScaleY;
+ snoncubeY.fScale1 = fS1 * fS1;
+ fS2 = params.fVolScaleZ / params.fVolScaleY;
+ snoncubeY.fScale2 = fS2 * fS2;
+ snoncubeY.fOutputScale = params.fOutputScale * params.fVolScaleY;
+
+ SCALE_NONCUBE snoncubeZ;
+ fS1 = params.fVolScaleX / params.fVolScaleZ;
+ snoncubeZ.fScale1 = fS1 * fS1;
+ fS2 = params.fVolScaleY / params.fVolScaleZ;
+ snoncubeZ.fScale2 = fS2 * fS2;
+ snoncubeZ.fOutputScale = params.fOutputScale * params.fVolScaleZ;
+
+ // timeval t;
+ // tic(t);
+
+ for (unsigned int a = 0; a <= angleCount; ++a) {
+ int dir = -1;
+ if (a != angleCount) {
+ float dX = fabsf(angles[a].fSrcX - (angles[a].fDetSX + dims.iProjU*angles[a].fDetUX*0.5f + dims.iProjV*angles[a].fDetVX*0.5f));
+ float dY = fabsf(angles[a].fSrcY - (angles[a].fDetSY + dims.iProjU*angles[a].fDetUY*0.5f + dims.iProjV*angles[a].fDetVY*0.5f));
+ float dZ = fabsf(angles[a].fSrcZ - (angles[a].fDetSZ + dims.iProjU*angles[a].fDetUZ*0.5f + dims.iProjV*angles[a].fDetVZ*0.5f));
+
+ if (dX >= dY && dX >= dZ)
+ dir = 0;
+ else if (dY >= dX && dY >= dZ)
+ dir = 1;
+ else
+ dir = 2;
+ }
+
+ if (a == angleCount || dir != blockDirection) {
+ // block done
+
+ blockEnd = a;
+ if (blockStart != blockEnd) {
+
+ dim3 dimGrid(
+ ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
+(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
+
+ // TODO: consider limiting number of handle (chaotic) geoms
+ // with many alternating directions
+ cudaStream_t stream;
+ cudaStreamCreate(&stream);
+ streams.push_back(stream);
+
+ // printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y);
+
+ if (blockDirection == 0) {
+ for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices)
+ if (params.iRaysPerDetDim == 1)
+ if (cube)
+ cone_FP_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, scube);
+ else
+ cone_FP_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, snoncubeX);
+ else
+ cone_FP_SS_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeX);
+ } else if (blockDirection == 1) {
+ for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices)
+ if (params.iRaysPerDetDim == 1)
+ if (cube)
+ cone_FP_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, scube);
+ else
+ cone_FP_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, snoncubeY);
+ else
+ cone_FP_SS_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeY);
+ } else if (blockDirection == 2) {
+ for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices)
+ if (params.iRaysPerDetDim == 1)
+ if (cube)
+ cone_FP_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, scube);
+ else
+ cone_FP_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, snoncubeZ);
+ else
+ cone_FP_SS_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeZ);
+ }
+
+ }
+
+ blockDirection = dir;
+ blockStart = a;
+ }
+ }
+
+ bool ok = true;
+
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+ ok &= checkCuda(cudaStreamSynchronize(*iter), "cone_fp");
+ cudaStreamDestroy(*iter);
+ }
+
+ // printf("%f\n", toc(t));
+
+ return ok;
+}
+
+
+bool ConeFP(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SConeProjection* angles,
+ const SProjectorParams3D& params)
+{
+ // transfer volume to array
+ cudaArray* cuArray = allocateVolumeArray(dims);
+ transferVolumeToArray(D_volumeData, cuArray, dims);
+
+ cudaTextureObject_t D_texObj;
+ if (!createTextureObject3D(cuArray, D_texObj)) {
+ cudaFreeArray(cuArray);
+ return false;
+ }
+
+ bool ret;
+
+ for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) {
+ unsigned int iEndAngle = iAngle + g_MaxAngles;
+ if (iEndAngle >= dims.iProjAngles)
+ iEndAngle = dims.iProjAngles;
+
+ cudaPitchedPtr D_subprojData = D_projData;
+ D_subprojData.ptr = (char*)D_projData.ptr + iAngle * D_projData.pitch;
+
+ ret = ConeFP_Array_internal(D_subprojData, D_texObj,
+ dims, iEndAngle - iAngle, angles + iAngle,
+ params);
+ if (!ret)
+ break;
+ }
+
+ cudaFreeArray(cuArray);
+
+ return ret;
+}
+
+
+}
diff --git a/patches/astra-toolbox-approximate-projectors/par3d_bp.cu b/patches/astra-toolbox-approximate-projectors/par3d_bp.cu
new file mode 100644
index 0000000..7958ac9
--- /dev/null
+++ b/patches/astra-toolbox-approximate-projectors/par3d_bp.cu
@@ -0,0 +1,327 @@
+/*
+-----------------------------------------------------------------------
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+ 2014-2021, CWI, Amsterdam
+
+Contact: astra@astra-toolbox.com
+Website: http://www.astra-toolbox.com/
+
+This file is part of the ASTRA Toolbox.
+
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+*/
+
+#include "astra/cuda/3d/util3d.h"
+#include "astra/cuda/3d/dims3d.h"
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+
+namespace astraCUDA3d {
+
+static const unsigned int g_volBlockZ = 6;
+
+static const unsigned int g_anglesPerBlock = 32;
+static const unsigned int g_volBlockX = 16;
+static const unsigned int g_volBlockY = 32;
+
+static const unsigned g_MaxAngles = 1024;
+
+struct DevPar3DParams {
+ float4 fNumU;
+ float4 fNumV;
+};
+
+__constant__ DevPar3DParams gC_C[g_MaxAngles];
+__constant__ float gC_scale[g_MaxAngles];
+
+
+#include "rounding.h"
+
+template<unsigned int ZSIZE>
+__global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, cudaTextureObject_t tex, int startAngle, int angleOffset, const SDimensions3D dims, float fOutputScale)
+{
+ float* volData = (float*)D_volData;
+
+ int endAngle = startAngle + g_anglesPerBlock;
+ if (endAngle > dims.iProjAngles - angleOffset)
+ endAngle = dims.iProjAngles - angleOffset;
+
+ // threadIdx: x = rel x
+ // y = rel y
+
+ // blockIdx: x = x + y
+ // y = z
+
+
+ const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x;
+ const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y;
+
+ if (X >= dims.iVolX)
+ return;
+ if (Y >= dims.iVolY)
+ return;
+
+ const int startZ = blockIdx.y * g_volBlockZ;
+
+ float fX = X - 0.5f*dims.iVolX + 0.5f;
+ float fY = Y - 0.5f*dims.iVolY + 0.5f;
+ float fZ = startZ - 0.5f*dims.iVolZ + 0.5f;
+
+ float Z[ZSIZE];
+ for(int i=0; i < ZSIZE; i++)
+ Z[i] = 0.0f;
+
+ {
+ float fAngle = startAngle + angleOffset + 0.5f;
+
+ for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f)
+ {
+
+ float4 fCu = gC_C[angle].fNumU;
+ float4 fCv = gC_C[angle].fNumV;
+ float fS = gC_scale[angle];
+
+ float fU = fCu.w + fX * fCu.x + fY * fCu.y + fZ * fCu.z;
+ float fV = fCv.w + fX * fCv.x + fY * fCv.y + fZ * fCv.z;
+// printf("%f %f\n", fU, fV);
+
+ for (int idx = 0; idx < ZSIZE; ++idx) {
+ float fVal;
+ textype h5 = texto(0.5f);
+ textype fU_ = texto(fU);
+ textype fUf_ = texto(floor(fU));
+ float fUf = floor(fU);
+
+ if ((fU - fUf) < 0.5f) {
+ textype fVal1 = texto(tex3D<float>(tex, fUf - 0.5f, fAngle, fV));
+ textype fVal2 = texto(tex3D<float>(tex, fUf + 0.5f, fAngle, fV));
+ fVal = texfrom(fVal1 + (fU_ + h5 - fUf_) * (fVal2 - fVal1));
+ } else {
+ textype fVal1 = texto(tex3D<float>(tex, fUf + 0.5f, fAngle, fV));
+ textype fVal2 = texto(tex3D<float>(tex, fUf + 1.5f, fAngle, fV));
+ fVal = texfrom(fVal1 + (fU_ - h5 - fUf_) * (fVal2 - fVal1));
+ }
+
+// float fVal = tex3D<float>(tex, fU, fAngle, fV);
+ Z[idx] += fVal * fS;
+
+ fU += fCu.z;
+ fV += fCv.z;
+ }
+
+ }
+ }
+
+ int endZ = ZSIZE;
+ if (endZ > dims.iVolZ - startZ)
+ endZ = dims.iVolZ - startZ;
+
+ for(int i=0; i < endZ; i++)
+ volData[((startZ+i)*dims.iVolY+Y)*volPitch+X] += Z[i] * fOutputScale;
+}
+
+// supersampling version
+__global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, cudaTextureObject_t tex, int startAngle, int angleOffset, const SDimensions3D dims, int iRaysPerVoxelDim, float fOutputScale)
+{
+ float* volData = (float*)D_volData;
+
+ int endAngle = startAngle + g_anglesPerBlock;
+ if (endAngle > dims.iProjAngles - angleOffset)
+ endAngle = dims.iProjAngles - angleOffset;
+
+ // threadIdx: x = rel x
+ // y = rel y
+
+ // blockIdx: x = x + y
+ // y = z
+
+
+ // TO TRY: precompute part of detector intersection formulas in shared mem?
+ // TO TRY: inner loop over z, gather ray values in shared mem
+
+ const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x;
+ const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y;
+
+ if (X >= dims.iVolX)
+ return;
+ if (Y >= dims.iVolY)
+ return;
+
+ const int startZ = blockIdx.y * g_volBlockZ;
+ int endZ = startZ + g_volBlockZ;
+ if (endZ > dims.iVolZ)
+ endZ = dims.iVolZ;
+
+ float fX = X - 0.5f*dims.iVolX + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim;
+ float fY = Y - 0.5f*dims.iVolY + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim;
+ float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim;
+
+ const float fSubStep = 1.0f/iRaysPerVoxelDim;
+
+ fOutputScale /= (iRaysPerVoxelDim*iRaysPerVoxelDim*iRaysPerVoxelDim);
+
+
+ for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f)
+ {
+
+ float fVal = 0.0f;
+ float fAngle = startAngle + angleOffset + 0.5f;
+
+ for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f)
+ {
+ float4 fCu = gC_C[angle].fNumU;
+ float4 fCv = gC_C[angle].fNumV;
+ float fS = gC_scale[angle];
+
+ float fXs = fX;
+ for (int iSubX = 0; iSubX < iRaysPerVoxelDim; ++iSubX) {
+ float fYs = fY;
+ for (int iSubY = 0; iSubY < iRaysPerVoxelDim; ++iSubY) {
+ float fZs = fZ;
+ for (int iSubZ = 0; iSubZ < iRaysPerVoxelDim; ++iSubZ) {
+
+ const float fU = fCu.w + fXs * fCu.x + fYs * fCu.y + fZs * fCu.z;
+ const float fV = fCv.w + fXs * fCv.x + fYs * fCv.y + fZs * fCv.z;
+
+ fVal += tex3D<float>(tex, fU, fAngle, fV) * fS;
+ fZs += fSubStep;
+ }
+ fYs += fSubStep;
+ }
+ fXs += fSubStep;
+ }
+
+ }
+
+ volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal * fOutputScale;
+ }
+
+}
+
+bool transferConstants(const SPar3DProjection* angles, unsigned int iProjAngles, const SProjectorParams3D& params)
+{
+ DevPar3DParams *p = new DevPar3DParams[iProjAngles];
+ float *s = new float[iProjAngles];
+
+ for (unsigned int i = 0; i < iProjAngles; ++i) {
+ Vec3 u(angles[i].fDetUX, angles[i].fDetUY, angles[i].fDetUZ);
+ Vec3 v(angles[i].fDetVX, angles[i].fDetVY, angles[i].fDetVZ);
+ Vec3 r(angles[i].fRayX, angles[i].fRayY, angles[i].fRayZ);
+ Vec3 d(angles[i].fDetSX, angles[i].fDetSY, angles[i].fDetSZ);
+
+ double fDen = det3(r,u,v);
+ p[i].fNumU.x = -det3x(r,v) / fDen;
+ p[i].fNumU.y = -det3y(r,v) / fDen;
+ p[i].fNumU.z = -det3z(r,v) / fDen;
+ p[i].fNumU.w = -det3(r,d,v) / fDen;
+ p[i].fNumV.x = det3x(r,u) / fDen;
+ p[i].fNumV.y = det3y(r,u) / fDen;
+ p[i].fNumV.z = det3z(r,u) / fDen;
+ p[i].fNumV.w = det3(r,d,u) / fDen;
+
+ s[i] = 1.0 / scaled_cross3(u,v,Vec3(params.fVolScaleX,params.fVolScaleY,params.fVolScaleZ)).norm();
+ }
+
+ cudaMemcpyToSymbol(gC_C, p, iProjAngles*sizeof(DevPar3DParams), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(gC_scale, s, iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+
+ delete[] p;
+ delete[] s;
+
+ return true;
+}
+
+bool Par3DBP_Array(cudaPitchedPtr D_volumeData,
+ cudaArray *D_projArray,
+ const SDimensions3D& dims, const SPar3DProjection* angles,
+ const SProjectorParams3D& params)
+{
+ cudaTextureObject_t D_texObj;
+ if (!createTextureObject3D(D_projArray, D_texObj))
+ return false;
+
+ float fOutputScale = params.fOutputScale * params.fVolScaleX * params.fVolScaleY * params.fVolScaleZ;
+
+ bool ok = true;
+
+ for (unsigned int th = 0; th < dims.iProjAngles; th += g_MaxAngles) {
+ unsigned int angleCount = g_MaxAngles;
+ if (th + angleCount > dims.iProjAngles)
+ angleCount = dims.iProjAngles - th;
+
+ ok = transferConstants(angles, angleCount, params);
+ if (!ok)
+ break;
+
+ dim3 dimBlock(g_volBlockX, g_volBlockY);
+
+ dim3 dimGrid(((dims.iVolX+g_volBlockX-1)/g_volBlockX)*((dims.iVolY+g_volBlockY-1)/g_volBlockY), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ);
+
+ // timeval t;
+ // tic(t);
+
+ for (unsigned int i = 0; i < angleCount; i += g_anglesPerBlock) {
+ // printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr);
+ if (params.iRaysPerVoxelDim == 1) {
+ if (dims.iVolZ == 1) {
+ dev_par3D_BP<1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale);
+ } else {
+ dev_par3D_BP<g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale);
+ }
+ } else
+ dev_par3D_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, params.iRaysPerVoxelDim, fOutputScale);
+ }
+
+ // TODO: Consider not synchronizing here, if possible.
+ ok = checkCuda(cudaThreadSynchronize(), "cone_bp");
+ if (!ok)
+ break;
+
+ angles = angles + angleCount;
+ // printf("%f\n", toc(t));
+
+ }
+
+ cudaDestroyTextureObject(D_texObj);
+
+ return true;
+}
+
+bool Par3DBP(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SPar3DProjection* angles,
+ const SProjectorParams3D& params)
+{
+ // transfer projections to array
+
+ cudaArray* cuArray = allocateProjectionArray(dims);
+ transferProjectionsToArray(D_projData, cuArray, dims);
+
+ bool ret = Par3DBP_Array(D_volumeData, cuArray, dims, angles, params);
+
+ cudaFreeArray(cuArray);
+
+ return ret;
+}
+
+
+}
diff --git a/patches/astra-toolbox-approximate-projectors/par3d_fp.cu b/patches/astra-toolbox-approximate-projectors/par3d_fp.cu
new file mode 100644
index 0000000..075784b
--- /dev/null
+++ b/patches/astra-toolbox-approximate-projectors/par3d_fp.cu
@@ -0,0 +1,770 @@
+/*
+-----------------------------------------------------------------------
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+ 2014-2021, CWI, Amsterdam
+
+Contact: astra@astra-toolbox.com
+Website: http://www.astra-toolbox.com/
+
+This file is part of the ASTRA Toolbox.
+
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+*/
+
+#include "astra/cuda/3d/util3d.h"
+#include "astra/cuda/3d/dims3d.h"
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+
+namespace astraCUDA3d {
+
+static const unsigned int g_anglesPerBlock = 4;
+
+// thickness of the slices we're splitting the volume up into
+static const unsigned int g_blockSlices = 32;
+static const unsigned int g_detBlockU = 32;
+static const unsigned int g_detBlockV = 32;
+
+static const unsigned g_MaxAngles = 1024;
+__constant__ float gC_RayX[g_MaxAngles];
+__constant__ float gC_RayY[g_MaxAngles];
+__constant__ float gC_RayZ[g_MaxAngles];
+__constant__ float gC_DetSX[g_MaxAngles];
+__constant__ float gC_DetSY[g_MaxAngles];
+__constant__ float gC_DetSZ[g_MaxAngles];
+__constant__ float gC_DetUX[g_MaxAngles];
+__constant__ float gC_DetUY[g_MaxAngles];
+__constant__ float gC_DetUZ[g_MaxAngles];
+__constant__ float gC_DetVX[g_MaxAngles];
+__constant__ float gC_DetVY[g_MaxAngles];
+__constant__ float gC_DetVZ[g_MaxAngles];
+
+
+// x=0, y=1, z=2
+struct DIR_X {
+ __device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolX; }
+ __device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolY; }
+ __device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolZ; }
+ __device__ float c0(float x, float y, float z) const { return x; }
+ __device__ float c1(float x, float y, float z) const { return y; }
+ __device__ float c2(float x, float y, float z) const { return z; }
+ __device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f0, f1, f2); }
+ __device__ float x(float f0, float f1, float f2) const { return f0; }
+ __device__ float y(float f0, float f1, float f2) const { return f1; }
+ __device__ float z(float f0, float f1, float f2) const { return f2; }
+};
+
+// y=0, x=1, z=2
+struct DIR_Y {
+ __device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolY; }
+ __device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolX; }
+ __device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolZ; }
+ __device__ float c0(float x, float y, float z) const { return y; }
+ __device__ float c1(float x, float y, float z) const { return x; }
+ __device__ float c2(float x, float y, float z) const { return z; }
+ __device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f1, f0, f2); }
+ __device__ float x(float f0, float f1, float f2) const { return f1; }
+ __device__ float y(float f0, float f1, float f2) const { return f0; }
+ __device__ float z(float f0, float f1, float f2) const { return f2; }
+};
+
+// z=0, x=1, y=2
+struct DIR_Z {
+ __device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolZ; }
+ __device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolX; }
+ __device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolY; }
+ __device__ float c0(float x, float y, float z) const { return z; }
+ __device__ float c1(float x, float y, float z) const { return x; }
+ __device__ float c2(float x, float y, float z) const { return y; }
+ __device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f1, f2, f0); }
+ __device__ float x(float f0, float f1, float f2) const { return f1; }
+ __device__ float y(float f0, float f1, float f2) const { return f2; }
+ __device__ float z(float f0, float f1, float f2) const { return f0; }
+};
+
+struct SCALE_CUBE {
+ float fOutputScale;
+ __device__ float scale(float a1, float a2) const { return sqrt(a1*a1+a2*a2+1.0f) * fOutputScale; }
+};
+
+struct SCALE_NONCUBE {
+ float fScale1;
+ float fScale2;
+ float fOutputScale;
+ __device__ float scale(float a1, float a2) const { return sqrt(a1*a1*fScale1+a2*a2*fScale2+1.0f) * fOutputScale; }
+};
+
+bool transferConstants(const SPar3DProjection* angles, unsigned int iProjAngles)
+{
+ // transfer angles to constant memory
+ float* tmp = new float[iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+ TRANSFER_TO_CONSTANT(RayX);
+ TRANSFER_TO_CONSTANT(RayY);
+ TRANSFER_TO_CONSTANT(RayZ);
+ TRANSFER_TO_CONSTANT(DetSX);
+ TRANSFER_TO_CONSTANT(DetSY);
+ TRANSFER_TO_CONSTANT(DetSZ);
+ TRANSFER_TO_CONSTANT(DetUX);
+ TRANSFER_TO_CONSTANT(DetUY);
+ TRANSFER_TO_CONSTANT(DetUZ);
+ TRANSFER_TO_CONSTANT(DetVX);
+ TRANSFER_TO_CONSTANT(DetVY);
+ TRANSFER_TO_CONSTANT(DetVZ);
+
+#undef TRANSFER_TO_CONSTANT
+
+ delete[] tmp;
+
+ return true;
+}
+
+
+// threadIdx: x = u detector
+// y = relative angle
+// blockIdx: x = u/v detector
+// y = angle block
+
+#include "rounding.h"
+
+template<class COORD, class SCALE>
+__global__ void par3D_FP_t(float* D_projData, unsigned int projPitch,
+ cudaTextureObject_t tex,
+ unsigned int startSlice,
+ unsigned int startAngle, unsigned int endAngle,
+ const SDimensions3D dims,
+ SCALE sc)
+{
+ COORD c;
+
+ int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y;
+ if (angle >= endAngle)
+ return;
+
+ const float fRayX = gC_RayX[angle];
+ const float fRayY = gC_RayY[angle];
+ const float fRayZ = gC_RayZ[angle];
+ const float fDetUX = gC_DetUX[angle];
+ const float fDetUY = gC_DetUY[angle];
+ const float fDetUZ = gC_DetUZ[angle];
+ const float fDetVX = gC_DetVX[angle];
+ const float fDetVY = gC_DetVY[angle];
+ const float fDetVZ = gC_DetVZ[angle];
+ const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX;
+ const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY;
+ const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ;
+
+ const float a1 = c.c1(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ);
+ const float a2 = c.c2(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ);
+ const float fDistCorr = sc.scale(a1, a2);
+
+
+ const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+ if (detectorU >= dims.iProjU)
+ return;
+ const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
+ int endDetectorV = startDetectorV + g_detBlockV;
+ if (endDetectorV > dims.iProjV)
+ endDetectorV = dims.iProjV;
+
+ int endSlice = startSlice + g_blockSlices;
+ if (endSlice > c.nSlices(dims))
+ endSlice = c.nSlices(dims);
+
+ for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV)
+ {
+ /* Trace ray in direction Ray to (detectorU,detectorV) from */
+ /* X = startSlice to X = endSlice */
+
+ const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX;
+ const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY;
+ const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ;
+
+ /* (x) ( 1) ( 0) */
+ /* ray: (y) = (ay) * x + (by) */
+ /* (z) (az) (bz) */
+
+ const float b1 = c.c1(fDetX,fDetY,fDetZ) - a1 * c.c0(fDetX,fDetY,fDetZ);
+ const float b2 = c.c2(fDetX,fDetY,fDetZ) - a2 * c.c0(fDetX,fDetY,fDetZ);
+
+ float fVal = 0.0f;
+
+ float f0 = startSlice + 0.5f;
+ float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f;
+ float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f;
+ //printf("%f, %f (%f), %f (%f)\n", f0, f1, a1, f2, a2); // Only f1 non linear
+
+ for (int s = startSlice; s < endSlice; ++s)
+ {
+
+ textype h5 = texto(0.5f);
+ textype f1_ = texto(f1);
+ textype f1f_ = texto(floor(f1));
+ float f1f = floor(f1);
+
+ if ((f1 - f1f) < 0.5f) {
+ textype fVal1 = texto(c.tex(tex, f0, f1f - 0.5f, f2));
+ textype fVal2 = texto(c.tex(tex, f0, f1f + 0.5f, f2));
+ fVal += texfrom(fVal1 + (f1_ + h5 - f1f_) * (fVal2 - fVal1));
+// fVal += texfrom(__hfma(__hadd(h5,__hsub(f1_, f1f_)), __hsub(fVal2, fVal1), fVal1));
+ } else {
+ textype fVal1 = texto(c.tex(tex, f0, f1f + 0.5f, f2));
+ textype fVal2 = texto(c.tex(tex, f0, f1f + 1.5f, f2));
+ fVal += texfrom(fVal1 + (f1_ - h5 - f1f_) * (fVal2 - fVal1));
+ }
+
+// fVal += c.tex(tex, f0, f1, f2);
+ f0 += 1.0f;
+ f1 += a1;
+ f2 += a2;
+ }
+
+ fVal *= fDistCorr;
+
+ D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal;
+ }
+}
+
+// Supersampling version
+template<class COORD>
+__global__ void par3D_FP_SS_t(float* D_projData, unsigned int projPitch,
+ cudaTextureObject_t tex,
+ unsigned int startSlice,
+ unsigned int startAngle, unsigned int endAngle,
+ const SDimensions3D dims, int iRaysPerDetDim,
+ SCALE_NONCUBE sc)
+{
+ COORD c;
+
+ int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y;
+ if (angle >= endAngle)
+ return;
+
+ const float fRayX = gC_RayX[angle];
+ const float fRayY = gC_RayY[angle];
+ const float fRayZ = gC_RayZ[angle];
+ const float fDetUX = gC_DetUX[angle];
+ const float fDetUY = gC_DetUY[angle];
+ const float fDetUZ = gC_DetUZ[angle];
+ const float fDetVX = gC_DetVX[angle];
+ const float fDetVY = gC_DetVY[angle];
+ const float fDetVZ = gC_DetVZ[angle];
+ const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX;
+ const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY;
+ const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ;
+
+ const float a1 = c.c1(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ);
+ const float a2 = c.c2(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ);
+ const float fDistCorr = sc.scale(a1, a2);
+
+
+ const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+ if (detectorU >= dims.iProjU)
+ return;
+ const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
+ int endDetectorV = startDetectorV + g_detBlockV;
+ if (endDetectorV > dims.iProjV)
+ endDetectorV = dims.iProjV;
+
+ int endSlice = startSlice + g_blockSlices;
+ if (endSlice > c.nSlices(dims))
+ endSlice = c.nSlices(dims);
+
+ const float fSubStep = 1.0f/iRaysPerDetDim;
+
+ for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV)
+ {
+
+ float fV = 0.0f;
+
+ float fdU = detectorU - 0.5f + 0.5f*fSubStep;
+ for (int iSubU = 0; iSubU < iRaysPerDetDim; ++iSubU, fdU+=fSubStep) {
+ float fdV = detectorV - 0.5f + 0.5f*fSubStep;
+ for (int iSubV = 0; iSubV < iRaysPerDetDim; ++iSubV, fdV+=fSubStep) {
+
+ /* Trace ray in direction Ray to (detectorU,detectorV) from */
+ /* X = startSlice to X = endSlice */
+
+ const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX;
+ const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY;
+ const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ;
+
+ /* (x) ( 1) ( 0) */
+ /* ray: (y) = (ay) * x + (by) */
+ /* (z) (az) (bz) */
+
+ const float b1 = c.c1(fDetX,fDetY,fDetZ) - a1 * c.c0(fDetX,fDetY,fDetZ);
+ const float b2 = c.c2(fDetX,fDetY,fDetZ) - a2 * c.c0(fDetX,fDetY,fDetZ);
+
+
+ float fVal = 0.0f;
+
+ float f0 = startSlice + 0.5f;
+ float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f;
+ float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f;
+
+ for (int s = startSlice; s < endSlice; ++s)
+ {
+ fVal += c.tex(tex, f0, f1, f2);
+
+ f0 += 1.0f;
+ f1 += a1;
+ f2 += a2;
+ }
+
+ fV += fVal;
+
+ }
+ }
+
+ fV *= fDistCorr;
+ D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (iRaysPerDetDim * iRaysPerDetDim);
+ }
+}
+
+
+__device__ float dirWeights(float fX, float fN) {
+ if (fX <= -0.5f) // outside image on left
+ return 0.0f;
+ if (fX <= 0.5f) // half outside image on left
+ return (fX + 0.5f) * (fX + 0.5f);
+ if (fX <= fN - 0.5f) { // inside image
+ float t = fX + 0.5f - floorf(fX + 0.5f);
+ return t*t + (1-t)*(1-t);
+ }
+ if (fX <= fN + 0.5f) // half outside image on right
+ return (fN + 0.5f - fX) * (fN + 0.5f - fX);
+ return 0.0f; // outside image on right
+}
+
+template<class COORD>
+__global__ void par3D_FP_SumSqW_t(float* D_projData, unsigned int projPitch,
+ unsigned int startSlice,
+ unsigned int startAngle, unsigned int endAngle,
+ const SDimensions3D dims,
+ SCALE_NONCUBE sc)
+{
+ COORD c;
+
+ int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y;
+ if (angle >= endAngle)
+ return;
+
+ const float fRayX = gC_RayX[angle];
+ const float fRayY = gC_RayY[angle];
+ const float fRayZ = gC_RayZ[angle];
+ const float fDetUX = gC_DetUX[angle];
+ const float fDetUY = gC_DetUY[angle];
+ const float fDetUZ = gC_DetUZ[angle];
+ const float fDetVX = gC_DetVX[angle];
+ const float fDetVY = gC_DetVY[angle];
+ const float fDetVZ = gC_DetVZ[angle];
+ const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX;
+ const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY;
+ const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ;
+
+ const float a1 = c.c1(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ);
+ const float a2 = c.c2(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ);
+ const float fDistCorr = sc.scale(a1, a2);
+
+
+ const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+ if (detectorU >= dims.iProjU)
+ return;
+ const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
+ int endDetectorV = startDetectorV + g_detBlockV;
+ if (endDetectorV > dims.iProjV)
+ endDetectorV = dims.iProjV;
+
+ int endSlice = startSlice + g_blockSlices;
+ if (endSlice > c.nSlices(dims))
+ endSlice = c.nSlices(dims);
+
+ for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV)
+ {
+ /* Trace ray in direction Ray to (detectorU,detectorV) from */
+ /* X = startSlice to X = endSlice */
+
+ const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX;
+ const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY;
+ const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ;
+
+ /* (x) ( 1) ( 0) */
+ /* ray: (y) = (ay) * x + (by) */
+ /* (z) (az) (bz) */
+
+ const float b1 = c.c1(fDetX,fDetY,fDetZ) - a1 * c.c0(fDetX,fDetY,fDetZ);
+ const float b2 = c.c2(fDetX,fDetY,fDetZ) - a2 * c.c0(fDetX,fDetY,fDetZ);
+
+ float fVal = 0.0f;
+
+ float f0 = startSlice + 0.5f;
+ float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f;
+ float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f;
+
+ for (int s = startSlice; s < endSlice; ++s)
+ {
+ fVal += dirWeights(f1, c.nDim1(dims)) * dirWeights(f2, c.nDim2(dims));
+ f0 += 1.0f;
+ f1 += a1;
+ f2 += a2;
+ }
+
+ fVal *= fDistCorr * fDistCorr;
+ D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal;
+ }
+}
+
+// Supersampling version
+// TODO
+
+
+bool Par3DFP_Array_internal(cudaPitchedPtr D_projData,
+ cudaTextureObject_t D_texObj,
+ const SDimensions3D& dims,
+ unsigned int angleCount, const SPar3DProjection* angles,
+ const SProjectorParams3D& params)
+{
+ if (!transferConstants(angles, angleCount))
+ return false;
+
+ std::list<cudaStream_t> streams;
+ dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles
+
+ // Run over all angles, grouping them into groups of the same
+ // orientation (roughly horizontal vs. roughly vertical).
+ // Start a stream of grids for each such group.
+
+ unsigned int blockStart = 0;
+ unsigned int blockEnd = 0;
+ int blockDirection = 0;
+
+ bool cube = true;
+ if (abs(params.fVolScaleX / params.fVolScaleY - 1.0) > 0.00001)
+ cube = false;
+ if (abs(params.fVolScaleX / params.fVolScaleZ - 1.0) > 0.00001)
+ cube = false;
+
+ SCALE_CUBE scube;
+ scube.fOutputScale = params.fOutputScale * params.fVolScaleX;
+
+ SCALE_NONCUBE snoncubeX;
+ float fS1 = params.fVolScaleY / params.fVolScaleX;
+ snoncubeX.fScale1 = fS1 * fS1;
+ float fS2 = params.fVolScaleZ / params.fVolScaleX;
+ snoncubeX.fScale2 = fS2 * fS2;
+ snoncubeX.fOutputScale = params.fOutputScale * params.fVolScaleX;
+
+ SCALE_NONCUBE snoncubeY;
+ fS1 = params.fVolScaleX / params.fVolScaleY;
+ snoncubeY.fScale1 = fS1 * fS1;
+ fS2 = params.fVolScaleY / params.fVolScaleY;
+ snoncubeY.fScale2 = fS2 * fS2;
+ snoncubeY.fOutputScale = params.fOutputScale * params.fVolScaleY;
+
+ SCALE_NONCUBE snoncubeZ;
+ fS1 = params.fVolScaleX / params.fVolScaleZ;
+ snoncubeZ.fScale1 = fS1 * fS1;
+ fS2 = params.fVolScaleY / params.fVolScaleZ;
+ snoncubeZ.fScale2 = fS2 * fS2;
+ snoncubeZ.fOutputScale = params.fOutputScale * params.fVolScaleZ;
+
+ // timeval t;
+ // tic(t);
+
+ for (unsigned int a = 0; a <= angleCount; ++a) {
+ int dir = -1;
+ if (a != angleCount) {
+ float dX = fabsf(angles[a].fRayX);
+ float dY = fabsf(angles[a].fRayY);
+ float dZ = fabsf(angles[a].fRayZ);
+
+ if (dX >= dY && dX >= dZ)
+ dir = 0;
+ else if (dY >= dX && dY >= dZ)
+ dir = 1;
+ else
+ dir = 2;
+ }
+
+ if (a == angleCount || dir != blockDirection) {
+ // block done
+
+ blockEnd = a;
+ if (blockStart != blockEnd) {
+
+ dim3 dimGrid(
+ ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
+(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
+ // TODO: consider limiting number of handle (chaotic) geoms
+ // with many alternating directions
+ cudaStream_t stream;
+ cudaStreamCreate(&stream);
+ streams.push_back(stream);
+
+ // printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y);
+
+ if (blockDirection == 0) {
+ for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices)
+ if (params.iRaysPerDetDim == 1)
+ if (cube)
+ par3D_FP_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, scube);
+ else
+ par3D_FP_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj,i, blockStart, blockEnd, dims, snoncubeX);
+ else
+ par3D_FP_SS_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj,i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeX);
+ } else if (blockDirection == 1) {
+ for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices)
+ if (params.iRaysPerDetDim == 1)
+ if (cube)
+ par3D_FP_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj,i, blockStart, blockEnd, dims, scube);
+ else
+ par3D_FP_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj,i, blockStart, blockEnd, dims, snoncubeY);
+ else
+ par3D_FP_SS_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj,i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeY);
+ } else if (blockDirection == 2) {
+ for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices)
+ if (params.iRaysPerDetDim == 1)
+ if (cube)
+ par3D_FP_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj,i, blockStart, blockEnd, dims, scube);
+ else
+ par3D_FP_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj,i, blockStart, blockEnd, dims, snoncubeZ);
+ else
+ par3D_FP_SS_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj,i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeZ);
+ }
+
+ }
+
+ blockDirection = dir;
+ blockStart = a;
+ }
+ }
+
+ bool ok = true;
+
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+ ok &= checkCuda(cudaStreamSynchronize(*iter), "par3d_fp");
+ cudaStreamDestroy(*iter);
+ }
+
+ // printf("%f\n", toc(t));
+
+ return ok;
+}
+
+bool Par3DFP(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SPar3DProjection* angles,
+ const SProjectorParams3D& params)
+{
+
+ // transfer volume to array
+ cudaArray* cuArray = allocateVolumeArray(dims);
+ transferVolumeToArray(D_volumeData, cuArray, dims);
+
+ cudaTextureObject_t D_texObj;
+ if (!createTextureObject3D(cuArray, D_texObj)) {
+ cudaFreeArray(cuArray);
+ return false;
+ }
+
+ bool ret;
+
+ for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) {
+ unsigned int iEndAngle = iAngle + g_MaxAngles;
+ if (iEndAngle >= dims.iProjAngles)
+ iEndAngle = dims.iProjAngles;
+
+ cudaPitchedPtr D_subprojData = D_projData;
+ D_subprojData.ptr = (char*)D_projData.ptr + iAngle * D_projData.pitch;
+
+ ret = Par3DFP_Array_internal(D_subprojData, D_texObj,
+ dims, iEndAngle - iAngle, angles + iAngle,
+ params);
+ if (!ret)
+ break;
+ }
+
+ cudaFreeArray(cuArray);
+
+ cudaDestroyTextureObject(D_texObj);
+
+ return ret;
+}
+
+
+
+bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SPar3DProjection* angles,
+ const SProjectorParams3D& params)
+{
+ // transfer angles to constant memory
+ float* tmp = new float[dims.iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+ TRANSFER_TO_CONSTANT(RayX);
+ TRANSFER_TO_CONSTANT(RayY);
+ TRANSFER_TO_CONSTANT(RayZ);
+ TRANSFER_TO_CONSTANT(DetSX);
+ TRANSFER_TO_CONSTANT(DetSY);
+ TRANSFER_TO_CONSTANT(DetSZ);
+ TRANSFER_TO_CONSTANT(DetUX);
+ TRANSFER_TO_CONSTANT(DetUY);
+ TRANSFER_TO_CONSTANT(DetUZ);
+ TRANSFER_TO_CONSTANT(DetVX);
+ TRANSFER_TO_CONSTANT(DetVY);
+ TRANSFER_TO_CONSTANT(DetVZ);
+
+#undef TRANSFER_TO_CONSTANT
+
+ delete[] tmp;
+
+ std::list<cudaStream_t> streams;
+ dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles
+
+ // Run over all angles, grouping them into groups of the same
+ // orientation (roughly horizontal vs. roughly vertical).
+ // Start a stream of grids for each such group.
+
+ unsigned int blockStart = 0;
+ unsigned int blockEnd = 0;
+ int blockDirection = 0;
+
+ SCALE_NONCUBE snoncubeX;
+ float fS1 = params.fVolScaleY / params.fVolScaleX;
+ snoncubeX.fScale1 = fS1 * fS1;
+ float fS2 = params.fVolScaleZ / params.fVolScaleX;
+ snoncubeX.fScale2 = fS2 * fS2;
+ snoncubeX.fOutputScale = params.fOutputScale * params.fVolScaleX;
+
+ SCALE_NONCUBE snoncubeY;
+ fS1 = params.fVolScaleX / params.fVolScaleY;
+ snoncubeY.fScale1 = fS1 * fS1;
+ fS2 = params.fVolScaleY / params.fVolScaleY;
+ snoncubeY.fScale2 = fS2 * fS2;
+ snoncubeY.fOutputScale = params.fOutputScale * params.fVolScaleY;
+
+ SCALE_NONCUBE snoncubeZ;
+ fS1 = params.fVolScaleX / params.fVolScaleZ;
+ snoncubeZ.fScale1 = fS1 * fS1;
+ fS2 = params.fVolScaleY / params.fVolScaleZ;
+ snoncubeZ.fScale2 = fS2 * fS2;
+ snoncubeZ.fOutputScale = params.fOutputScale * params.fVolScaleZ;
+
+
+ // timeval t;
+ // tic(t);
+
+ for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
+ int dir;
+ if (a != dims.iProjAngles) {
+ float dX = fabsf(angles[a].fRayX);
+ float dY = fabsf(angles[a].fRayY);
+ float dZ = fabsf(angles[a].fRayZ);
+
+ if (dX >= dY && dX >= dZ)
+ dir = 0;
+ else if (dY >= dX && dY >= dZ)
+ dir = 1;
+ else
+ dir = 2;
+ }
+
+ if (a == dims.iProjAngles || dir != blockDirection) {
+ // block done
+
+ blockEnd = a;
+ if (blockStart != blockEnd) {
+
+ dim3 dimGrid(
+ ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
+(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
+ // TODO: check if we can't immediately
+ // destroy the stream after use
+ cudaStream_t stream;
+ cudaStreamCreate(&stream);
+ streams.push_back(stream);
+
+ // printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y);
+
+ if (blockDirection == 0) {
+ for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices)
+ if (params.iRaysPerDetDim == 1)
+ par3D_FP_SumSqW_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, snoncubeX);
+ else
+#if 0
+ par3D_FP_SS_SumSqW_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+#else
+ assert(false);
+#endif
+ } else if (blockDirection == 1) {
+ for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices)
+ if (params.iRaysPerDetDim == 1)
+ par3D_FP_SumSqW_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, snoncubeY);
+ else
+#if 0
+ par3D_FP_SS_SumSqW_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+#else
+ assert(false);
+#endif
+ } else if (blockDirection == 2) {
+ for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices)
+ if (params.iRaysPerDetDim == 1)
+ par3D_FP_SumSqW_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, snoncubeZ);
+ else
+#if 0
+ par3D_FP_SS_SumSqW_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+#else
+ assert(false);
+#endif
+ }
+
+ }
+
+ blockDirection = dir;
+ blockStart = a;
+ }
+ }
+
+ bool ok = true;
+
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+ ok &= checkCuda(cudaStreamSynchronize(*iter), "Par3DFP_SumSqW");
+ cudaStreamDestroy(*iter);
+ }
+
+ // printf("%f\n", toc(t));
+
+ return ok;
+}
+
+
+
+
+
+
+
+}
diff --git a/patches/astra-toolbox-approximate-projectors/rounding.h b/patches/astra-toolbox-approximate-projectors/rounding.h
new file mode 100644
index 0000000..c1cbffb
--- /dev/null
+++ b/patches/astra-toolbox-approximate-projectors/rounding.h
@@ -0,0 +1,50 @@
+#include <cuda_fp16.h>
+
+#define precision 8
+#define approximate_interpolation
+
+#ifdef approximate_interpolation
+# ifdef precision
+# if precision == 16
+# define texto(v) __float2half(v)
+# define texfrom(v) __half2float(v)
+# define textype half
+# define interpolate(v0, v1, pos) (v0 + pos*(v1-v0))
+# define textocheck(var,msg,val) var=texto(val);
+# else
+# define precision_mult ((1<<precision)-1)
+# define texto(v) ((unsigned char)(precision_mult*v))
+# define texfrom(v) (1.f * v / precision_mult)
+# define textype unsigned
+# define interpolate(v0, v1, pos) (v0 + ((unsigned)(256 * pos)) * (v1 - v0) / 256)
+# define textocheck(var, msg, val) \
+ if ((val<0)||(val>1)) { printf("Received out-of-range value (%f) in %s texture fetch\n", val, msg); } \
+ var=texto(val);
+# endif
+# else
+# define texto(v) (v)
+# define texfrom(v) (v)
+# define textype float
+# define interpolate(v0, v1, pos) (v0 + pos*(v1-v0))
+# define textocheck(var,msg,val) var=texto(val);
+# endif
+#else
+# ifdef precision
+# if precision == 16
+# define texto(v) __half2float(__float2half(v))
+# define textocheck(var,msg,val) var=texto(val);
+# else
+# define precision_mult ((1<<precision)-1)
+# define texto(v) (floor(precision_mult*v)/precision_mult)
+# define textocheck(var, msg, val) \
+ if ((val<0)||(val>1)) { printf("Received out-of-range value (%f) in %s texture fetch\n", val, msg); } \
+ var=texto(val);
+# endif
+# else
+# define texto(v) (v)
+# define textocheck(var,msg,val) var=texto(val);
+# endif
+# define texfrom(v) (v)
+# define textype float
+# define interpolate(v0, v1, pos) (v0 + pos*(v1-v0))
+#endif