Remove padding in 2D cuda in favour of Border mode

author: Willem Jan Palenstijn <WillemJan.Palenstijn@uantwerpen.be> 2014-04-16 11:12:55 +0000
committer: wpalenst <WillemJan.Palenstijn@uantwerpen.be> 2014-04-16 11:12:55 +0000
commit: 3a6769465bee7d56d0ddff36613b886446421e07 (patch)
tree: 624e85c5d6a4ab19c958a388e3436219693a6296 /cuda/2d/arith.cu
parent: 4dfb881ceb82b07630437e952dec62323977ab56 (diff)
download: astra-3a6769465bee7d56d0ddff36613b886446421e07.tar.gz
astra-3a6769465bee7d56d0ddff36613b886446421e07.tar.bz2
astra-3a6769465bee7d56d0ddff36613b886446421e07.tar.xz
astra-3a6769465bee7d56d0ddff36613b886446421e07.zip
1 file changed, 84 insertions, 98 deletions
diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu
index 1ee02ca..42c2c98 100644
--- a/cuda/2d/arith.cu
+++ b/cuda/2d/arith.cu
@@ -144,14 +144,14 @@ struct opMulMask {
 
 
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -161,14 +161,14 @@ __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, uns
 	}
 }
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -178,14 +178,14 @@ __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned
 	}
 }
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -197,14 +197,14 @@ __global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fPa
 
 
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -214,14 +214,14 @@ __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, uns
 	}
 }
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -231,14 +231,14 @@ __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned
 	}
 }
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -248,14 +248,14 @@ __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, u
 	}
 }
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -280,51 +280,51 @@ __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2,
 
 
 
-template<typename op, VolType t>
+template<typename op>
 void processVolCopy(float* out, unsigned int width, unsigned int height)
 {
 	float* D_out;
 
 	unsigned int pitch;
-	allocateVolume(D_out, width+2, height+2, pitch);
+	allocateVolume(D_out, width, height, pitch);
 	copyVolumeToDevice(out, width, width, height, D_out, pitch);
 
-	processVol<op, t>(D_out, pitch, width, height);
+	processVol<op>(D_out, pitch, width, height);
 
 	copyVolumeFromDevice(out, width, width, height, D_out, pitch);
 
 	cudaFree(D_out);
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVolCopy(float* out, float param, unsigned int width, unsigned int height)
 {
 	float* D_out;
 
 	unsigned int pitch;
-	allocateVolume(D_out, width+2, height+2, pitch);
+	allocateVolume(D_out, width, height, pitch);
 	copyVolumeToDevice(out, width, width, height, D_out, pitch);
 
-	processVol<op, t>(D_out, param, pitch, width, height);
+	processVol<op>(D_out, param, pitch, width, height);
 
 	copyVolumeFromDevice(out, width, width, height, D_out, pitch);
 
 	cudaFree(D_out);
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height)
 {
 	float* D_out1;
 	float* D_out2;
 
 	unsigned int pitch;
-	allocateVolume(D_out1, width+2, height+2, pitch);
+	allocateVolume(D_out1, width, height, pitch);
 	copyVolumeToDevice(out1, width, width, height, D_out1, pitch);
-	allocateVolume(D_out2, width+2, height+2, pitch);
+	allocateVolume(D_out2, width, height, pitch);
 	copyVolumeToDevice(out2, width, width, height, D_out2, pitch);
 
-	processVol<op, t>(D_out1, D_out2, param1, param2, pitch, width, height);
+	processVol<op>(D_out1, D_out2, param1, param2, pitch, width, height);
 
 	copyVolumeFromDevice(out1, width, width, height, D_out1, pitch);
 	copyVolumeFromDevice(out2, width, width, height, D_out2, pitch);
@@ -334,19 +334,19 @@ void processVolCopy(float* out1, float* out2, float param1, float param2, unsign
 }
 
 
-template<typename op, VolType t>
+template<typename op>
 void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height)
 {
 	float* D_out;
 	float* D_in;
 
 	unsigned int pitch;
-	allocateVolume(D_out, width+2, height+2, pitch);
+	allocateVolume(D_out, width, height, pitch);
 	copyVolumeToDevice(out, width, width, height, D_out, pitch);
-	allocateVolume(D_in, width+2, height+2, pitch);
+	allocateVolume(D_in, width, height, pitch);
 	copyVolumeToDevice(in, width, width, height, D_in, pitch);
 
-	processVol<op, t>(D_out, D_in, pitch, width, height);
+	processVol<op>(D_out, D_in, pitch, width, height);
 
 	copyVolumeFromDevice(out, width, width, height, D_out, pitch);
 
@@ -354,19 +354,19 @@ void processVolCopy(float* out, const float* in, unsigned int width, unsigned in
 	cudaFree(D_in);
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height)
 {
 	float* D_out;
 	float* D_in;
 
 	unsigned int pitch;
-	allocateVolume(D_out, width+2, height+2, pitch);
+	allocateVolume(D_out, width, height, pitch);
 	copyVolumeToDevice(out, width, width, height, D_out, pitch);
-	allocateVolume(D_in, width+2, height+2, pitch);
+	allocateVolume(D_in, width, height, pitch);
 	copyVolumeToDevice(in, width, width, height, D_in, pitch);
 
-	processVol<op, t>(D_out, D_in, param, pitch, width, height);
+	processVol<op>(D_out, D_in, param, pitch, width, height);
 
 	copyVolumeFromDevice(out, width, width, height, D_out, pitch);
 
@@ -374,7 +374,7 @@ void processVolCopy(float* out, const float* in, float param, unsigned int width
 	cudaFree(D_in);
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height)
 {
 	float* D_out;
@@ -382,14 +382,14 @@ void processVolCopy(float* out, const float* in1, const float* in2, unsigned int
 	float* D_in2;
 
 	unsigned int pitch;
-	allocateVolume(D_out, width+2, height+2, pitch);
+	allocateVolume(D_out, width, height, pitch);
 	copyVolumeToDevice(out, width, width, height, D_out, pitch);
-	allocateVolume(D_in1, width+2, height+2, pitch);
+	allocateVolume(D_in1, width, height, pitch);
 	copyVolumeToDevice(in1, width, width, height, D_in1, pitch);
-	allocateVolume(D_in2, width+2, height+2, pitch);
+	allocateVolume(D_in2, width, height, pitch);
 	copyVolumeToDevice(in2, width, width, height, D_in2, pitch);
 
-	processVol<op, t>(D_out, D_in1, D_in2, pitch, width, height);
+	processVol<op>(D_out, D_in1, D_in2, pitch, width, height);
 
 	copyVolumeFromDevice(out, width, width, height, D_out, pitch);
 
@@ -398,7 +398,7 @@ void processVolCopy(float* out, const float* in1, const float* in2, unsigned int
 	cudaFree(D_in2);
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height)
 {
 	float* D_out;
@@ -406,14 +406,14 @@ void processVolCopy(float* out, const float* in1, const float* in2, float param,
 	float* D_in2;
 
 	unsigned int pitch;
-	allocateVolume(D_out, width+2, height+2, pitch);
+	allocateVolume(D_out, width, height, pitch);
 	copyVolumeToDevice(out, width, width, height, D_out, pitch);
-	allocateVolume(D_in1, width+2, height+2, pitch);
+	allocateVolume(D_in1, width, height, pitch);
 	copyVolumeToDevice(in1, width, width, height, D_in1, pitch);
-	allocateVolume(D_in2, width+2, height+2, pitch);
+	allocateVolume(D_in2, width, height, pitch);
 	copyVolumeToDevice(in2, width, width, height, D_in2, pitch);
 
-	processVol<op, t>(D_out, D_in1, D_in2, param, pitch, width, height);
+	processVol<op>(D_out, D_in1, D_in2, param, pitch, width, height);
 
 	copyVolumeFromDevice(out, width, width, height, D_out, pitch);
 
@@ -430,80 +430,80 @@ void processVolCopy(float* out, const float* in1, const float* in2, float param,
 
 
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
 	dim3 gridSize((width+15)/16, (height+511)/512);
 
-	devtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
+	devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
 	dim3 gridSize((width+15)/16, (height+15)/16);
 
-	devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
+	devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
 	dim3 gridSize((width+15)/16, (height+15)/16);
 
-	devFFtoDD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height);
+	devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
 
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
 	dim3 gridSize((width+15)/16, (height+15)/16);
 
-	devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
+	devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
 	dim3 gridSize((width+15)/16, (height+15)/16);
 
-	devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
+	devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
 	dim3 gridSize((width+15)/16, (height+15)/16);
 
-	devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
+	devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
 	dim3 gridSize((width+15)/16, (height+15)/16);
 
-	devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
+	devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
@@ -533,7 +533,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 	}
 
@@ -549,7 +549,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 	}
 
@@ -566,7 +566,7 @@ void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, flo
 	unsigned int step = out1.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut1 += step;
 		pfOut2 += step;
 	}
@@ -585,7 +585,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 		pfIn += step;
 	}
@@ -603,7 +603,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 		pfIn += step;
 	}
@@ -622,7 +622,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 		pfIn1 += step;
 		pfIn2 += step;
@@ -642,7 +642,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 		pfIn1 += step;
 		pfIn2 += step;
@@ -672,7 +672,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 	}
 
@@ -688,7 +688,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 	}
 
@@ -705,7 +705,7 @@ void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, fl
 	unsigned int step = out1.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut1 += step;
 		pfOut2 += step;
 	}
@@ -724,7 +724,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 		pfIn += step;
 	}
@@ -742,7 +742,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam,
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 		pfIn += step;
 	}
@@ -761,7 +761,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 		pfIn1 += step;
 		pfIn2 += step;
@@ -781,7 +781,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 		pfIn1 += step;
 		pfIn2 += step;
@@ -808,59 +808,45 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
 
 
 #define INST_DFtoD(name) \
-  template void processVolCopy<name, VOL>(float* out, const float* in, float param, unsigned int width, unsigned int height); \
-  template void processVolCopy<name, SINO>(float* out, const float* in, float param, unsigned int width, unsigned int height); \
-  template void processVol<name, VOL>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVolCopy<name>(float* out, const float* in, float param, unsigned int width, unsigned int height); \
+  template void processVol<name>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
 
 #define INST_DtoD(name) \
-  template void processVolCopy<name, VOL>(float* out, const float* in, unsigned int width, unsigned int height); \
-  template void processVolCopy<name, SINO>(float* out, const float* in, unsigned int width, unsigned int height); \
-  template void processVol<name, VOL>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVolCopy<name>(float* out, const float* in, unsigned int width, unsigned int height); \
+  template void processVol<name>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
 
 #define INST_DDtoD(name) \
-  template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \
-  template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \
-  template void processVol<name, VOL>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVolCopy<name>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \
+  template void processVol<name>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
 
 #define INST_DDFtoD(name) \
-  template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \
-  template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \
-  template void processVol<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVolCopy<name>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \
+  template void processVol<name>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
 
 
 #define INST_toD(name) \
-  template void processVolCopy<name, VOL>(float* out, unsigned int width, unsigned int height); \
-  template void processVolCopy<name, SINO>(float* out, unsigned int width, unsigned int height); \
-  template void processVol<name, VOL>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVolCopy<name>(float* out, unsigned int width, unsigned int height); \
+  template void processVol<name>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims);
 
 #define INST_FtoD(name) \
-  template void processVolCopy<name, VOL>(float* out, float param, unsigned int width, unsigned int height); \
-  template void processVolCopy<name, SINO>(float* out, float param, unsigned int width, unsigned int height); \
-  template void processVol<name, VOL>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVolCopy<name>(float* out, float param, unsigned int width, unsigned int height); \
+  template void processVol<name>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims);
 
 #define INST_FFtoDD(name) \
-  template void processVolCopy<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \
-  template void processVolCopy<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \
-  template void processVol<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVolCopy<name>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \
+  template void processVol<name>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims);
author	Willem Jan Palenstijn <WillemJan.Palenstijn@uantwerpen.be>	2014-04-16 11:12:55 +0000
committer	wpalenst <WillemJan.Palenstijn@uantwerpen.be>	2014-04-16 11:12:55 +0000
commit	3a6769465bee7d56d0ddff36613b886446421e07 (patch)
tree	624e85c5d6a4ab19c958a388e3436219693a6296 /cuda/2d/arith.cu
parent	4dfb881ceb82b07630437e952dec62323977ab56 (diff)
download	astra-3a6769465bee7d56d0ddff36613b886446421e07.tar.gz astra-3a6769465bee7d56d0ddff36613b886446421e07.tar.bz2 astra-3a6769465bee7d56d0ddff36613b886446421e07.tar.xz astra-3a6769465bee7d56d0ddff36613b886446421e07.zip