summaryrefslogtreecommitdiffstats
path: root/cuda
diff options
context:
space:
mode:
Diffstat (limited to 'cuda')
-rw-r--r--cuda/3d/arith3d.cu90
-rw-r--r--cuda/3d/cone_bp.cu14
-rw-r--r--cuda/3d/cone_fp.cu18
-rw-r--r--cuda/3d/fdk.cu10
-rw-r--r--cuda/3d/par3d_bp.cu14
-rw-r--r--cuda/3d/par3d_fp.cu38
-rw-r--r--cuda/3d/util3d.cu113
7 files changed, 96 insertions, 201 deletions
diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu
index 9a19be0..7cb56f6 100644
--- a/cuda/3d/arith3d.cu
+++ b/cuda/3d/arith3d.cu
@@ -99,14 +99,14 @@ struct opClampMax {
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -116,14 +116,14 @@ __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, uns
}
}
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -134,14 +134,14 @@ __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned
}
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -151,14 +151,14 @@ __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, uns
}
}
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -168,14 +168,14 @@ __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned
}
}
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -185,14 +185,14 @@ __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, u
}
}
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
__global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
unsigned int x = threadIdx.x + 16*blockIdx.x;
if (x >= width) return;
unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
- unsigned int off = (y+padY)*pitch+x+padX;
+ unsigned int off = y*pitch+x;
for (unsigned int i = 0; i < repeat; ++i) {
if (y >= height)
break;
@@ -210,7 +210,7 @@ __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2,
-template<typename op, VolType t>
+template<typename op>
void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
@@ -218,12 +218,12 @@ void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsign
float *pfOut = (float*)out;
- devtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
+ devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
cudaTextForceKernelsCompletion();
}
-template<typename op, VolType t>
+template<typename op>
void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
@@ -231,12 +231,12 @@ void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int
float *pfOut = (float*)out;
- devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
+ devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
cudaTextForceKernelsCompletion();
}
-template<typename op, VolType t>
+template<typename op>
void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
@@ -245,12 +245,12 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, uns
float *pfOut = (float*)out;
const float *pfIn = (const float*)in;
- devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
+ devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
cudaTextForceKernelsCompletion();
}
-template<typename op, VolType t>
+template<typename op>
void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
@@ -259,12 +259,12 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned
float *pfOut = (float*)out;
const float *pfIn = (const float*)in;
- devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
+ devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
cudaTextForceKernelsCompletion();
}
-template<typename op, VolType t>
+template<typename op>
void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
@@ -274,12 +274,12 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2
const float *pfIn1 = (const float*)in1;
const float *pfIn2 = (const float*)in2;
- devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
+ devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
cudaTextForceKernelsCompletion();
}
-template<typename op, VolType t>
+template<typename op>
void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height)
{
dim3 blockSize(16,16);
@@ -289,7 +289,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2
const float *pfIn1 = (const float*)in1;
const float *pfIn2 = (const float*)in2;
- devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
+ devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
cudaTextForceKernelsCompletion();
}
@@ -319,7 +319,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
}
@@ -335,7 +335,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
}
@@ -352,7 +352,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
pfIn += step;
}
@@ -370,7 +370,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
pfIn += step;
}
@@ -389,7 +389,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
pfIn1 += step;
pfIn2 += step;
@@ -409,7 +409,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
for (unsigned int i = 0; i < dims.iVolZ; ++i) {
- devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
pfOut += step;
pfIn1 += step;
pfIn2 += step;
@@ -439,7 +439,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
}
@@ -455,7 +455,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
}
@@ -472,7 +472,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
pfIn += step;
}
@@ -490,7 +490,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam,
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
pfIn += step;
}
@@ -509,7 +509,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
pfIn1 += step;
pfIn2 += step;
@@ -529,7 +529,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
for (unsigned int i = 0; i < dims.iProjV; ++i) {
- devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
pfOut += step;
pfIn1 += step;
pfIn2 += step;
@@ -556,39 +556,33 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
#define INST_DFtoD(name) \
- template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name>(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
#define INST_DtoD(name) \
- template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name>(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
#define INST_DDtoD(name) \
- template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
#define INST_DDFtoD(name) \
- template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
#define INST_toD(name) \
- template void processVol<name, VOL>(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name>(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims);
#define INST_FtoD(name) \
- template void processVol<name, VOL>(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
- template void processVol<name, SINO>(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name>(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
template void processVol3D<name>(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); \
template void processSino3D<name>(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims);
diff --git a/cuda/3d/cone_bp.cu b/cuda/3d/cone_bp.cu
index 7f8e320..1b0154d 100644
--- a/cuda/3d/cone_bp.cu
+++ b/cuda/3d/cone_bp.cu
@@ -73,9 +73,9 @@ bool bindProjDataTexture(const cudaArray* array)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
- gT_coneProjTexture.addressMode[0] = cudaAddressModeClamp;
- gT_coneProjTexture.addressMode[1] = cudaAddressModeClamp;
- gT_coneProjTexture.addressMode[2] = cudaAddressModeClamp;
+ gT_coneProjTexture.addressMode[0] = cudaAddressModeBorder;
+ gT_coneProjTexture.addressMode[1] = cudaAddressModeBorder;
+ gT_coneProjTexture.addressMode[2] = cudaAddressModeBorder;
gT_coneProjTexture.filterMode = cudaFilterModeLinear;
gT_coneProjTexture.normalized = false;
@@ -148,8 +148,8 @@ __global__ void dev_cone_BP(void* D_volData, unsigned int volPitch, int startAng
const float fVNum = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz;
const float fDen = fCdc + fX * fCdx + fY * fCdy + fZ * fCdz;
- const float fU = fUNum / fDen + 1.0f;
- const float fV = fVNum / fDen + 1.0f;
+ const float fU = fUNum / fDen;
+ const float fV = fVNum / fDen;
fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV);
@@ -230,8 +230,8 @@ __global__ void dev_cone_BP_SS(void* D_volData, unsigned int volPitch, int start
const float fVNum = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz;
const float fDen = fCdc + fXs * fCdx + fYs * fCdy + fZs * fCdz;
- const float fU = fUNum / fDen + 1.0f;
- const float fV = fVNum / fDen + 1.0f;
+ const float fU = fUNum / fDen;
+ const float fV = fVNum / fDen;
fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV);
diff --git a/cuda/3d/cone_fp.cu b/cuda/3d/cone_fp.cu
index 40dca4f..fa308b9 100644
--- a/cuda/3d/cone_fp.cu
+++ b/cuda/3d/cone_fp.cu
@@ -72,9 +72,9 @@ bool bindVolumeDataTexture(const cudaArray* array)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
- gT_coneVolumeTexture.addressMode[0] = cudaAddressModeClamp;
- gT_coneVolumeTexture.addressMode[1] = cudaAddressModeClamp;
- gT_coneVolumeTexture.addressMode[2] = cudaAddressModeClamp;
+ gT_coneVolumeTexture.addressMode[0] = cudaAddressModeBorder;
+ gT_coneVolumeTexture.addressMode[1] = cudaAddressModeBorder;
+ gT_coneVolumeTexture.addressMode[2] = cudaAddressModeBorder;
gT_coneVolumeTexture.filterMode = cudaFilterModeLinear;
gT_coneVolumeTexture.normalized = false;
@@ -142,9 +142,9 @@ bool bindVolumeDataTexture(const cudaArray* array)
\
float fVal = 0.0f; \
\
- float f##c0 = startSlice + 1.5f; \
- float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f; \
- float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f; \
+ float f##c0 = startSlice + 0.5f; \
+ float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 0.5f; \
+ float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 0.5f; \
\
for (int s = startSlice; s < endSlice; ++s) \
{ \
@@ -218,9 +218,9 @@ bool bindVolumeDataTexture(const cudaArray* array)
\
float fVal = 0.0f; \
\
- float f##c0 = startSlice + 1.5f; \
- float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f; \
- float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f; \
+ float f##c0 = startSlice + 0.5f; \
+ float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 0.5f; \
+ float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 0.5f; \
\
for (int s = startSlice; s < endSlice; ++s) \
{ \
diff --git a/cuda/3d/fdk.cu b/cuda/3d/fdk.cu
index d439d9b..883187d 100644
--- a/cuda/3d/fdk.cu
+++ b/cuda/3d/fdk.cu
@@ -71,9 +71,9 @@ static bool bindProjDataTexture(const cudaArray* array)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
- gT_coneProjTexture.addressMode[0] = cudaAddressModeClamp;
- gT_coneProjTexture.addressMode[1] = cudaAddressModeClamp;
- gT_coneProjTexture.addressMode[2] = cudaAddressModeClamp;
+ gT_coneProjTexture.addressMode[0] = cudaAddressModeBorder;
+ gT_coneProjTexture.addressMode[1] = cudaAddressModeBorder;
+ gT_coneProjTexture.addressMode[2] = cudaAddressModeBorder;
gT_coneProjTexture.filterMode = cudaFilterModeLinear;
gT_coneProjTexture.normalized = false;
@@ -120,8 +120,8 @@ __global__ void devBP_FDK(void* D_volData, unsigned int volPitch, int startAngle
float fY = Y - 0.5f*dims.iVolY + 0.5f;
float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - fSrcZ;
- const float fU_base = 0.5f*dims.iProjU - 0.5f + 1.5f;
- const float fV_base = 0.5f*dims.iProjV - 0.5f + 1.5f + (fDetZ-fSrcZ);
+ const float fU_base = 0.5f*dims.iProjU - 0.5f + 0.5f;
+ const float fV_base = 0.5f*dims.iProjV - 0.5f + 0.5f + (fDetZ-fSrcZ);
// Note re. fZ/rV_base: the computations below are all relative to the
// optical axis, so we do the Z-adjustments beforehand.
diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu
index 872b1eb..9bf9a9f 100644
--- a/cuda/3d/par3d_bp.cu
+++ b/cuda/3d/par3d_bp.cu
@@ -69,9 +69,9 @@ static bool bindProjDataTexture(const cudaArray* array)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
- gT_par3DProjTexture.addressMode[0] = cudaAddressModeClamp;
- gT_par3DProjTexture.addressMode[1] = cudaAddressModeClamp;
- gT_par3DProjTexture.addressMode[2] = cudaAddressModeClamp;
+ gT_par3DProjTexture.addressMode[0] = cudaAddressModeBorder;
+ gT_par3DProjTexture.addressMode[1] = cudaAddressModeBorder;
+ gT_par3DProjTexture.addressMode[2] = cudaAddressModeBorder;
gT_par3DProjTexture.filterMode = cudaFilterModeLinear;
gT_par3DProjTexture.normalized = false;
@@ -139,8 +139,8 @@ __global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, int startAn
const float fUNum = fCuc + fX * fCux + fY * fCuy + fZ * fCuz;
const float fVNum = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz;
- const float fU = fUNum + 1.0f;
- const float fV = fVNum + 1.0f;
+ const float fU = fUNum;
+ const float fV = fVNum;
fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order
@@ -216,8 +216,8 @@ __global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, int star
const float fUNum = fCuc + fXs * fCux + fYs * fCuy + fZs * fCuz;
const float fVNum = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz;
- const float fU = fUNum + 1.0f;
- const float fV = fVNum + 1.0f;
+ const float fU = fUNum;
+ const float fV = fVNum;
fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order
fZs += fSubStep;
diff --git a/cuda/3d/par3d_fp.cu b/cuda/3d/par3d_fp.cu
index 6bf9037..e0f743c 100644
--- a/cuda/3d/par3d_fp.cu
+++ b/cuda/3d/par3d_fp.cu
@@ -72,9 +72,9 @@ static bool bindVolumeDataTexture(const cudaArray* array)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
- gT_par3DVolumeTexture.addressMode[0] = cudaAddressModeClamp;
- gT_par3DVolumeTexture.addressMode[1] = cudaAddressModeClamp;
- gT_par3DVolumeTexture.addressMode[2] = cudaAddressModeClamp;
+ gT_par3DVolumeTexture.addressMode[0] = cudaAddressModeBorder;
+ gT_par3DVolumeTexture.addressMode[1] = cudaAddressModeBorder;
+ gT_par3DVolumeTexture.addressMode[2] = cudaAddressModeBorder;
gT_par3DVolumeTexture.filterMode = cudaFilterModeLinear;
gT_par3DVolumeTexture.normalized = false;
@@ -144,9 +144,9 @@ static bool bindVolumeDataTexture(const cudaArray* array)
\
float fVal = 0.0f; \
\
- float f##c0 = startSlice + 1.5f; \
- float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\
- float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\
+ float f##c0 = startSlice + 0.5f; \
+ float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 0.5f;\
+ float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 0.5f;\
\
for (int s = startSlice; s < endSlice; ++s) \
{ \
@@ -226,9 +226,9 @@ static bool bindVolumeDataTexture(const cudaArray* array)
\
float fVal = 0.0f; \
\
- float f##c0 = startSlice + 1.5f; \
- float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\
- float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\
+ float f##c0 = startSlice + 0.5f; \
+ float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 0.5f;\
+ float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 0.5f;\
\
for (int s = startSlice; s < endSlice; ++s) \
{ \
@@ -281,16 +281,16 @@ PAR3D_FP_SS_BODY(Z,X,Y)
__device__ float dirWeights(float fX, float fN) {
- if (fX <= 0.5f) // outside image on left
+ if (fX <= -0.5f) // outside image on left
return 0.0f;
- if (fX <= 1.5f) // half outside image on left
- return (fX - 0.5f) * (fX - 0.5f);
- if (fX <= fN + 0.5f) { // inside image
- float t = fX - 0.5f - floorf(fX - 0.5f);
+ if (fX <= 0.5f) // half outside image on left
+ return (fX + 0.5f) * (fX + 0.5f);
+ if (fX <= fN - 0.5f) { // inside image
+ float t = fX + 0.5f - floorf(fX + 0.5f);
return t*t + (1-t)*(1-t);
}
- if (fX <= fN + 1.5f) // half outside image on right
- return (fN + 1.5f - fX) * (fN + 1.5f - fX);
+ if (fX <= fN + 0.5f) // half outside image on right
+ return (fN + 0.5f - fX) * (fN + 0.5f - fX);
return 0.0f; // outside image on right
}
@@ -346,9 +346,9 @@ __device__ float dirWeights(float fX, float fN) {
\
float fVal = 0.0f; \
\
- float f##c0 = startSlice + 1.5f; \
- float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\
- float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\
+ float f##c0 = startSlice + 0.5f; \
+ float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 0.5f;\
+ float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 0.5f;\
\
for (int s = startSlice; s < endSlice; ++s) \
{ \
diff --git a/cuda/3d/util3d.cu b/cuda/3d/util3d.cu
index 6dc79c7..cf40fdc 100644
--- a/cuda/3d/util3d.cu
+++ b/cuda/3d/util3d.cu
@@ -292,16 +292,14 @@ bool duplicateProjectionData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src,
// TODO: Consider using a single array of size max(proj,volume) (per dim)
// instead of allocating a new one each time
-// TODO: Figure out a faster way of zeroing the padding?
-
cudaArray* allocateVolumeArray(const SDimensions3D& dims)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaArray* cuArray;
cudaExtent extentA;
- extentA.width = dims.iVolX+2;
- extentA.height = dims.iVolY+2;
- extentA.depth = dims.iVolZ+2;
+ extentA.width = dims.iVolX;
+ extentA.height = dims.iVolY;
+ extentA.depth = dims.iVolZ;
cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA);
if (err != cudaSuccess) {
astraCUDA::reportCudaError(err);
@@ -309,8 +307,6 @@ cudaArray* allocateVolumeArray(const SDimensions3D& dims)
return 0;
}
- zeroVolumeArray(cuArray, dims);
-
return cuArray;
}
cudaArray* allocateProjectionArray(const SDimensions3D& dims)
@@ -318,9 +314,9 @@ cudaArray* allocateProjectionArray(const SDimensions3D& dims)
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaArray* cuArray;
cudaExtent extentA;
- extentA.width = dims.iProjU+2;
+ extentA.width = dims.iProjU;
extentA.height = dims.iProjAngles;
- extentA.depth = dims.iProjV+2;
+ extentA.depth = dims.iProjV;
cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA);
if (err != cudaSuccess) {
@@ -329,101 +325,8 @@ cudaArray* allocateProjectionArray(const SDimensions3D& dims)
return 0;
}
- zeroProjectionArray(cuArray, dims);
-
return cuArray;
}
-bool zeroVolumeArray(cudaArray* array, const SDimensions3D& dims)
-{
- cudaPitchedPtr zeroBuf;
- cudaExtent extentS;
- extentS.width = sizeof(float)*(dims.iVolX+2);
- extentS.height = dims.iVolY+2;
- extentS.depth = 1;
-
- cudaExtent extentA;
- extentA.width = dims.iVolX+2;
- extentA.height = dims.iVolY+2;
- extentA.depth = 1;
-
-
-
- cudaError err;
- err = cudaMalloc3D(&zeroBuf, extentS);
- ASTRA_CUDA_ASSERT(err);
- err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iVolX+2), dims.iVolY+2);
- ASTRA_CUDA_ASSERT(err);
-
- // zero array
- for (unsigned int i = 0; i < dims.iVolZ+2; ++i) {
- cudaMemcpy3DParms p;
- cudaPos zp = {0, 0, 0};
- cudaPos dp = {0, 0, i};
- p.srcArray = 0;
- p.srcPos = zp;
- p.srcPtr = zeroBuf;
- p.dstArray = array;
- p.dstPtr.ptr = 0;
- p.dstPtr.pitch = 0;
- p.dstPtr.xsize = 0;
- p.dstPtr.ysize = 0;
- p.dstPos = dp;
- p.extent = extentA;
- p.kind = cudaMemcpyDeviceToDevice;
-
- err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
- }
- cudaFree(zeroBuf.ptr);
-
- // TODO: check errors
-
- return true;
-}
-bool zeroProjectionArray(cudaArray* array, const SDimensions3D& dims)
-{
- cudaPitchedPtr zeroBuf;
- cudaExtent extentS;
- extentS.width = sizeof(float)*(dims.iProjU+2);
- extentS.height = dims.iProjAngles;
- extentS.depth = 1;
- cudaExtent extentA;
- extentA.width = dims.iProjU+2;
- extentA.height = dims.iProjAngles;
- extentA.depth = 1;
-
-
- cudaError err;
- err = cudaMalloc3D(&zeroBuf, extentS);
- ASTRA_CUDA_ASSERT(err);
- err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iProjU+2), dims.iProjAngles);
- ASTRA_CUDA_ASSERT(err);
-
- for (unsigned int i = 0; i < dims.iProjV+2; ++i) {
- cudaMemcpy3DParms p;
- cudaPos zp = {0, 0, 0};
- cudaPos dp = {0, 0, i};
- p.srcArray = 0;
- p.srcPos = zp;
- p.srcPtr = zeroBuf;
- p.dstArray = array;
- p.dstPtr.ptr = 0;
- p.dstPtr.pitch = 0;
- p.dstPtr.xsize = 0;
- p.dstPtr.ysize = 0;
- p.dstPos = dp;
- p.extent = extentA;
- p.kind = cudaMemcpyDeviceToDevice;
-
- err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
- }
- cudaFree(zeroBuf.ptr);
-
- // TODO: check errors
- return true;
-}
-
bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const SDimensions3D& dims)
{
@@ -434,7 +337,6 @@ bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const
cudaMemcpy3DParms p;
cudaPos zp = {0, 0, 0};
- cudaPos dp = {1, 1, 1};
p.srcArray = 0;
p.srcPos = zp;
p.srcPtr = D_volumeData;
@@ -443,7 +345,7 @@ bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const
p.dstPtr.pitch = 0;
p.dstPtr.xsize = 0;
p.dstPtr.ysize = 0;
- p.dstPos = dp;
+ p.dstPos = zp;
p.extent = extentA;
p.kind = cudaMemcpyDeviceToDevice;
@@ -462,7 +364,6 @@ bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, con
cudaMemcpy3DParms p;
cudaPos zp = {0, 0, 0};
- cudaPos dp = {1, 0, 1};
p.srcArray = 0;
p.srcPos = zp;
p.srcPtr = D_projData;
@@ -471,7 +372,7 @@ bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, con
p.dstPtr.pitch = 0;
p.dstPtr.xsize = 0;
p.dstPtr.ysize = 0;
- p.dstPos = dp;
+ p.dstPos = zp;
p.extent = extentA;
p.kind = cudaMemcpyDeviceToDevice;