diff options
author | Willem Jan Palenstijn <WillemJan.Palenstijn@uantwerpen.be> | 2014-04-16 11:12:55 +0000 |
---|---|---|
committer | wpalenst <WillemJan.Palenstijn@uantwerpen.be> | 2014-04-16 11:12:55 +0000 |
commit | 3a6769465bee7d56d0ddff36613b886446421e07 (patch) | |
tree | 624e85c5d6a4ab19c958a388e3436219693a6296 /cuda/2d/arith.cu | |
parent | 4dfb881ceb82b07630437e952dec62323977ab56 (diff) | |
download | astra-3a6769465bee7d56d0ddff36613b886446421e07.tar.gz astra-3a6769465bee7d56d0ddff36613b886446421e07.tar.bz2 astra-3a6769465bee7d56d0ddff36613b886446421e07.tar.xz astra-3a6769465bee7d56d0ddff36613b886446421e07.zip |
Remove padding in 2D cuda in favour of Border mode
Diffstat (limited to 'cuda/2d/arith.cu')
-rw-r--r-- | cuda/2d/arith.cu | 182 |
1 files changed, 84 insertions, 98 deletions
diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu index 1ee02ca..42c2c98 100644 --- a/cuda/2d/arith.cu +++ b/cuda/2d/arith.cu @@ -144,14 +144,14 @@ struct opMulMask { -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat> __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -161,14 +161,14 @@ __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, uns } } -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat> __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -178,14 +178,14 @@ __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned } } -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat> __global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -197,14 +197,14 @@ __global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fPa -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat> __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -214,14 +214,14 @@ __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, uns } } -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat> __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -231,14 +231,14 @@ __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned } } -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat> __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -248,14 +248,14 @@ __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, u } } -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat> __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -280,51 +280,51 @@ __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, -template<typename op, VolType t> +template<typename op> void processVolCopy(float* out, unsigned int width, unsigned int height) { float* D_out; unsigned int pitch; - allocateVolume(D_out, width+2, height+2, pitch); + allocateVolume(D_out, width, height, pitch); copyVolumeToDevice(out, width, width, height, D_out, pitch); - processVol<op, t>(D_out, pitch, width, height); + processVol<op>(D_out, pitch, width, height); copyVolumeFromDevice(out, width, width, height, D_out, pitch); cudaFree(D_out); } -template<typename op, VolType t> +template<typename op> void processVolCopy(float* out, float param, unsigned int width, unsigned int height) { float* D_out; unsigned int pitch; - allocateVolume(D_out, width+2, height+2, pitch); + allocateVolume(D_out, width, height, pitch); copyVolumeToDevice(out, width, width, height, D_out, pitch); - processVol<op, t>(D_out, param, pitch, width, height); + processVol<op>(D_out, param, pitch, width, height); copyVolumeFromDevice(out, width, width, height, D_out, pitch); cudaFree(D_out); } -template<typename op, VolType t> +template<typename op> void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height) { float* D_out1; float* D_out2; unsigned int pitch; - allocateVolume(D_out1, width+2, height+2, pitch); + allocateVolume(D_out1, width, height, pitch); copyVolumeToDevice(out1, width, width, height, D_out1, pitch); - allocateVolume(D_out2, width+2, height+2, pitch); + allocateVolume(D_out2, width, height, pitch); copyVolumeToDevice(out2, width, width, height, D_out2, pitch); - processVol<op, t>(D_out1, D_out2, param1, param2, pitch, width, height); + processVol<op>(D_out1, D_out2, param1, param2, pitch, width, height); copyVolumeFromDevice(out1, width, width, height, D_out1, pitch); copyVolumeFromDevice(out2, width, width, height, D_out2, pitch); @@ -334,19 +334,19 @@ void processVolCopy(float* out1, float* out2, float param1, float param2, unsign } -template<typename op, VolType t> +template<typename op> void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height) { float* D_out; float* D_in; unsigned int pitch; - allocateVolume(D_out, width+2, height+2, pitch); + allocateVolume(D_out, width, height, pitch); copyVolumeToDevice(out, width, width, height, D_out, pitch); - allocateVolume(D_in, width+2, height+2, pitch); + allocateVolume(D_in, width, height, pitch); copyVolumeToDevice(in, width, width, height, D_in, pitch); - processVol<op, t>(D_out, D_in, pitch, width, height); + processVol<op>(D_out, D_in, pitch, width, height); copyVolumeFromDevice(out, width, width, height, D_out, pitch); @@ -354,19 +354,19 @@ void processVolCopy(float* out, const float* in, unsigned int width, unsigned in cudaFree(D_in); } -template<typename op, VolType t> +template<typename op> void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height) { float* D_out; float* D_in; unsigned int pitch; - allocateVolume(D_out, width+2, height+2, pitch); + allocateVolume(D_out, width, height, pitch); copyVolumeToDevice(out, width, width, height, D_out, pitch); - allocateVolume(D_in, width+2, height+2, pitch); + allocateVolume(D_in, width, height, pitch); copyVolumeToDevice(in, width, width, height, D_in, pitch); - processVol<op, t>(D_out, D_in, param, pitch, width, height); + processVol<op>(D_out, D_in, param, pitch, width, height); copyVolumeFromDevice(out, width, width, height, D_out, pitch); @@ -374,7 +374,7 @@ void processVolCopy(float* out, const float* in, float param, unsigned int width cudaFree(D_in); } -template<typename op, VolType t> +template<typename op> void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height) { float* D_out; @@ -382,14 +382,14 @@ void processVolCopy(float* out, const float* in1, const float* in2, unsigned int float* D_in2; unsigned int pitch; - allocateVolume(D_out, width+2, height+2, pitch); + allocateVolume(D_out, width, height, pitch); copyVolumeToDevice(out, width, width, height, D_out, pitch); - allocateVolume(D_in1, width+2, height+2, pitch); + allocateVolume(D_in1, width, height, pitch); copyVolumeToDevice(in1, width, width, height, D_in1, pitch); - allocateVolume(D_in2, width+2, height+2, pitch); + allocateVolume(D_in2, width, height, pitch); copyVolumeToDevice(in2, width, width, height, D_in2, pitch); - processVol<op, t>(D_out, D_in1, D_in2, pitch, width, height); + processVol<op>(D_out, D_in1, D_in2, pitch, width, height); copyVolumeFromDevice(out, width, width, height, D_out, pitch); @@ -398,7 +398,7 @@ void processVolCopy(float* out, const float* in1, const float* in2, unsigned int cudaFree(D_in2); } -template<typename op, VolType t> +template<typename op> void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height) { float* D_out; @@ -406,14 +406,14 @@ void processVolCopy(float* out, const float* in1, const float* in2, float param, float* D_in2; unsigned int pitch; - allocateVolume(D_out, width+2, height+2, pitch); + allocateVolume(D_out, width, height, pitch); copyVolumeToDevice(out, width, width, height, D_out, pitch); - allocateVolume(D_in1, width+2, height+2, pitch); + allocateVolume(D_in1, width, height, pitch); copyVolumeToDevice(in1, width, width, height, D_in1, pitch); - allocateVolume(D_in2, width+2, height+2, pitch); + allocateVolume(D_in2, width, height, pitch); copyVolumeToDevice(in2, width, width, height, D_in2, pitch); - processVol<op, t>(D_out, D_in1, D_in2, param, pitch, width, height); + processVol<op>(D_out, D_in1, D_in2, param, pitch, width, height); copyVolumeFromDevice(out, width, width, height, D_out, pitch); @@ -430,80 +430,80 @@ void processVolCopy(float* out, const float* in1, const float* in2, float param, -template<typename op, VolType t> +template<typename op> void processVol(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+511)/512); - devtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height); + devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height); cudaTextForceKernelsCompletion(); } -template<typename op, VolType t> +template<typename op> void processVol(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); - devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height); + devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height); cudaTextForceKernelsCompletion(); } -template<typename op, VolType t> +template<typename op> void processVol(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); - devFFtoDD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height); + devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height); cudaTextForceKernelsCompletion(); } -template<typename op, VolType t> +template<typename op> void processVol(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); - devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height); + devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height); cudaTextForceKernelsCompletion(); } -template<typename op, VolType t> +template<typename op> void processVol(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); - devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height); + devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height); cudaTextForceKernelsCompletion(); } -template<typename op, VolType t> +template<typename op> void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); - devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height); + devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height); cudaTextForceKernelsCompletion(); } -template<typename op, VolType t> +template<typename op> void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); - devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height); + devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height); cudaTextForceKernelsCompletion(); } @@ -533,7 +533,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims) unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; } @@ -549,7 +549,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; } @@ -566,7 +566,7 @@ void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, flo unsigned int step = out1.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut1 += step; pfOut2 += step; } @@ -585,7 +585,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; pfIn += step; } @@ -603,7 +603,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; pfIn += step; } @@ -622,7 +622,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; pfIn1 += step; pfIn2 += step; @@ -642,7 +642,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; pfIn1 += step; pfIn2 += step; @@ -672,7 +672,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims) unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; } @@ -688,7 +688,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; } @@ -705,7 +705,7 @@ void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, fl unsigned int step = out1.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut1 += step; pfOut2 += step; } @@ -724,7 +724,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; pfIn += step; } @@ -742,7 +742,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; pfIn += step; } @@ -761,7 +761,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; pfIn1 += step; pfIn2 += step; @@ -781,7 +781,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; pfIn1 += step; pfIn2 += step; @@ -808,59 +808,45 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit #define INST_DFtoD(name) \ - template void processVolCopy<name, VOL>(float* out, const float* in, float param, unsigned int width, unsigned int height); \ - template void processVolCopy<name, SINO>(float* out, const float* in, float param, unsigned int width, unsigned int height); \ - template void processVol<name, VOL>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol<name, SINO>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy<name>(float* out, const float* in, float param, unsigned int width, unsigned int height); \ + template void processVol<name>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \ template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); #define INST_DtoD(name) \ - template void processVolCopy<name, VOL>(float* out, const float* in, unsigned int width, unsigned int height); \ - template void processVolCopy<name, SINO>(float* out, const float* in, unsigned int width, unsigned int height); \ - template void processVol<name, VOL>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol<name, SINO>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy<name>(float* out, const float* in, unsigned int width, unsigned int height); \ + template void processVol<name>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \ template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); #define INST_DDtoD(name) \ - template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \ - template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \ - template void processVol<name, VOL>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol<name, SINO>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy<name>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \ + template void processVol<name>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \ template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); #define INST_DDFtoD(name) \ - template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \ - template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \ - template void processVol<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy<name>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \ + template void processVol<name>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \ template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); #define INST_toD(name) \ - template void processVolCopy<name, VOL>(float* out, unsigned int width, unsigned int height); \ - template void processVolCopy<name, SINO>(float* out, unsigned int width, unsigned int height); \ - template void processVol<name, VOL>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol<name, SINO>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy<name>(float* out, unsigned int width, unsigned int height); \ + template void processVol<name>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \ template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); #define INST_FtoD(name) \ - template void processVolCopy<name, VOL>(float* out, float param, unsigned int width, unsigned int height); \ - template void processVolCopy<name, SINO>(float* out, float param, unsigned int width, unsigned int height); \ - template void processVol<name, VOL>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol<name, SINO>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy<name>(float* out, float param, unsigned int width, unsigned int height); \ + template void processVol<name>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims); \ template void processSino3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims); #define INST_FFtoDD(name) \ - template void processVolCopy<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \ - template void processVolCopy<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \ - template void processVol<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy<name>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \ + template void processVol<name>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); \ template void processSino3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); |