summaryrefslogtreecommitdiffstats
path: root/cuda
diff options
context:
space:
mode:
authorWillem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl>2021-11-16 14:07:55 +0100
committerWillem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl>2021-11-16 14:07:55 +0100
commitc6e203411abf3dad3e677aaa1186b927086f8ba7 (patch)
tree6c728e1125961fc04ba6f77bf9af637925825f5b /cuda
parent231dd3e5e28319aa16155efd9ec7fdc69834666b (diff)
parent39582115bc93b5435d25e56891815ae7cb1898fd (diff)
downloadastra-c6e203411abf3dad3e677aaa1186b927086f8ba7.tar.gz
astra-c6e203411abf3dad3e677aaa1186b927086f8ba7.tar.bz2
astra-c6e203411abf3dad3e677aaa1186b927086f8ba7.tar.xz
astra-c6e203411abf3dad3e677aaa1186b927086f8ba7.zip
Merge branch 'checkCuda'
This cleans up error logging and handling for cuda calls.
Diffstat (limited to 'cuda')
-rw-r--r--cuda/2d/arith.cu14
-rw-r--r--cuda/2d/fan_bp.cu15
-rw-r--r--cuda/2d/fan_fp.cu11
-rw-r--r--cuda/2d/par_bp.cu10
-rw-r--r--cuda/2d/par_fp.cu19
-rw-r--r--cuda/2d/sart.cu2
-rw-r--r--cuda/2d/util.cu69
-rw-r--r--cuda/3d/arith3d.cu36
-rw-r--r--cuda/3d/cone_bp.cu4
-rw-r--r--cuda/3d/cone_fp.cu17
-rw-r--r--cuda/3d/fdk.cu6
-rw-r--r--cuda/3d/par3d_bp.cu4
-rw-r--r--cuda/3d/par3d_fp.cu30
-rw-r--r--cuda/3d/util3d.cu103
14 files changed, 127 insertions, 213 deletions
diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu
index aa0edae..45622d0 100644
--- a/cuda/2d/arith.cu
+++ b/cuda/2d/arith.cu
@@ -451,7 +451,7 @@ void processData(float* pfOut, unsigned int pitch, unsigned int width, unsigned
devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -462,7 +462,7 @@ void processData(float* pfOut, float fParam, unsigned int pitch, unsigned int wi
devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -473,7 +473,7 @@ void processData(float* pfOut1, float* pfOut2, float fParam1, float fParam2, uns
devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
@@ -485,7 +485,7 @@ void processData(float* pfOut, const float* pfIn, unsigned int pitch, unsigned i
devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -496,7 +496,7 @@ void processData(float* pfOut, const float* pfIn, float fParam, unsigned int pit
devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -507,7 +507,7 @@ void processData(float* pfOut, const float* pfIn1, const float* pfIn2, float fPa
devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -518,7 +518,7 @@ void processData(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned
devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
diff --git a/cuda/2d/fan_bp.cu b/cuda/2d/fan_bp.cu
index fc42456..2068d03 100644
--- a/cuda/2d/fan_bp.cu
+++ b/cuda/2d/fan_bp.cu
@@ -322,13 +322,12 @@ bool FanBP_internal(float* D_volumeData, unsigned int volumePitch,
else
devFanBP<false><<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
}
- cudaThreadSynchronize();
- cudaTextForceKernelsCompletion();
+ ok = checkCuda(cudaStreamSynchronize(stream), "FanBP");
cudaStreamDestroy(stream);
- return true;
+ return ok;
}
bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch,
@@ -354,13 +353,12 @@ bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch,
for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
devFanBP<true><<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
}
- cudaThreadSynchronize();
- cudaTextForceKernelsCompletion();
+ ok = checkCuda(cudaStreamSynchronize(stream), "FanBP_FBPWeighted");
cudaStreamDestroy(stream);
- return true;
+ return ok;
}
// D_projData is a pointer to one padded sinogram line
@@ -382,11 +380,8 @@ bool FanBP_SART(float* D_volumeData, unsigned int volumePitch,
(dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
devFanBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, dims, fOutputScale);
- cudaThreadSynchronize();
- cudaTextForceKernelsCompletion();
-
- return true;
+ return checkCuda(cudaThreadSynchronize(), "FanBP_SART");
}
bool FanBP(float* D_volumeData, unsigned int volumePitch,
diff --git a/cuda/2d/fan_fp.cu b/cuda/2d/fan_fp.cu
index 9bbb401..342ca4c 100644
--- a/cuda/2d/fan_fp.cu
+++ b/cuda/2d/fan_fp.cu
@@ -268,16 +268,17 @@ bool FanFP_internal(float* D_volumeData, unsigned int volumePitch,
for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices)
FanFPvertical<<<dimGrid, dimBlock, 0, stream2>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);
- cudaStreamDestroy(stream1);
- cudaStreamDestroy(stream2);
+ bool ok = true;
- cudaThreadSynchronize();
+ ok &= checkCuda(cudaStreamSynchronize(stream1), "fan_fp hor");
+ cudaStreamDestroy(stream1);
- cudaTextForceKernelsCompletion();
+ ok &= checkCuda(cudaStreamSynchronize(stream2), "fan_fp ver");
+ cudaStreamDestroy(stream2);
cudaFreeArray(D_dataArray);
- return true;
+ return ok;
}
bool FanFP(float* D_volumeData, unsigned int volumePitch,
diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu
index b50b5a5..d7c3ab0 100644
--- a/cuda/2d/par_bp.cu
+++ b/cuda/2d/par_bp.cu
@@ -231,13 +231,12 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch,
else
devBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
}
- cudaThreadSynchronize();
- cudaTextForceKernelsCompletion();
+ bool ok = checkCuda(cudaStreamSynchronize(stream), "par_bp");
cudaStreamDestroy(stream);
- return true;
+ return ok;
}
bool BP(float* D_volumeData, unsigned int volumePitch,
@@ -284,11 +283,8 @@ bool BP_SART(float* D_volumeData, unsigned int volumePitch,
(dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
devBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, angle_offset, angle_scaled_sin, angle_scaled_cos, dims, fOutputScale);
- cudaThreadSynchronize();
-
- cudaTextForceKernelsCompletion();
- return true;
+ return checkCuda(cudaThreadSynchronize(), "BP_SART");
}
diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu
index 8c48280..e947428 100644
--- a/cuda/2d/par_fp.cu
+++ b/cuda/2d/par_fp.cu
@@ -305,8 +305,8 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,
dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock,
(dims.iProjDets+g_detBlockSize-1)/g_detBlockSize); // angle blocks, detector blocks
- // TODO: check if we can't immediately
- // destroy the stream after use
+ // TODO: consider limiting number of handle (chaotic) geoms
+ // with many alternating directions
cudaStream_t stream;
cudaStreamCreate(&stream);
streams.push_back(stream);
@@ -323,19 +323,16 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,
}
}
- for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
- cudaStreamDestroy(*iter);
-
- streams.clear();
-
- cudaThreadSynchronize();
+ bool ok = true;
- cudaTextForceKernelsCompletion();
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+ ok &= checkCuda(cudaStreamSynchronize(*iter), "par_fp");
+ cudaStreamDestroy(*iter);
+ }
cudaFreeArray(D_dataArray);
-
- return true;
+ return ok;
}
bool FP_simple(float* D_volumeData, unsigned int volumePitch,
diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu
index 29f5b43..89d58c2 100644
--- a/cuda/2d/sart.cu
+++ b/cuda/2d/sart.cu
@@ -54,7 +54,7 @@ void MUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int
devMUL_SART<<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), "MUL_SART");
}
diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu
index 2a47472..ac360f0 100644
--- a/cuda/2d/util.cu
+++ b/cuda/2d/util.cu
@@ -40,12 +40,8 @@ bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch,
{
size_t width = dims.iVolWidth;
size_t height = dims.iVolHeight;
- // TODO: memory order
- cudaError_t err;
- err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
- ASTRA_CUDA_ASSERT(err);
- assert(err == cudaSuccess);
- return true;
+
+ return checkCuda(cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice), "copyVolumeToDevice");
}
bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch,
@@ -54,10 +50,8 @@ bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch,
{
size_t width = dims.iVolWidth;
size_t height = dims.iVolHeight;
- // TODO: memory order
- cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
- ASTRA_CUDA_ASSERT(err);
- return true;
+
+ return checkCuda(cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost), "copyVolumeFromDevice");
}
@@ -67,10 +61,8 @@ bool copySinogramFromDevice(float* out_data, unsigned int out_pitch,
{
size_t width = dims.iProjDets;
size_t height = dims.iProjAngles;
- // TODO: memory order
- cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
- ASTRA_CUDA_ASSERT(err);
- return true;
+
+ return checkCuda(cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost), "copySinogramFromDevice");
}
bool copySinogramToDevice(const float* in_data, unsigned int in_pitch,
@@ -79,20 +71,15 @@ bool copySinogramToDevice(const float* in_data, unsigned int in_pitch,
{
size_t width = dims.iProjDets;
size_t height = dims.iProjAngles;
- // TODO: memory order
- cudaError_t err;
- err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
- ASTRA_CUDA_ASSERT(err);
- return true;
+
+ return checkCuda(cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice), "copySinogramToDevice");
}
bool allocateVolume(float*& ptr, unsigned int width, unsigned int height, unsigned int& pitch)
{
size_t p;
- cudaError_t ret = cudaMallocPitch((void**)&ptr, &p, sizeof(float)*width, height);
- if (ret != cudaSuccess) {
- reportCudaError(ret);
+ if (!checkCuda(cudaMallocPitch((void**)&ptr, &p, sizeof(float)*width, height), "allocateVolume")) {
ASTRA_ERROR("Failed to allocate %dx%d GPU buffer", width, height);
return false;
}
@@ -104,11 +91,9 @@ bool allocateVolume(float*& ptr, unsigned int width, unsigned int height, unsign
return true;
}
-void zeroVolume(float* data, unsigned int pitch, unsigned int width, unsigned int height)
+bool zeroVolume(float* data, unsigned int pitch, unsigned int width, unsigned int height)
{
- cudaError_t err;
- err = cudaMemset2D(data, sizeof(float)*pitch, 0, sizeof(float)*width, height);
- ASTRA_CUDA_ASSERT(err);
+ return checkCuda(cudaMemset2D(data, sizeof(float)*pitch, 0, sizeof(float)*width, height), "zeroVolume");
}
bool allocateVolumeData(float*& D_ptr, unsigned int& pitch, const SDimensions& dims)
@@ -121,14 +106,14 @@ bool allocateProjectionData(float*& D_ptr, unsigned int& pitch, const SDimension
return allocateVolume(D_ptr, dims.iProjDets, dims.iProjAngles, pitch);
}
-void zeroVolumeData(float* D_ptr, unsigned int pitch, const SDimensions& dims)
+bool zeroVolumeData(float* D_ptr, unsigned int pitch, const SDimensions& dims)
{
- zeroVolume(D_ptr, pitch, dims.iVolWidth, dims.iVolHeight);
+ return zeroVolume(D_ptr, pitch, dims.iVolWidth, dims.iVolHeight);
}
-void zeroProjectionData(float* D_ptr, unsigned int pitch, const SDimensions& dims)
+bool zeroProjectionData(float* D_ptr, unsigned int pitch, const SDimensions& dims)
{
- zeroVolume(D_ptr, pitch, dims.iProjDets, dims.iProjAngles);
+ return zeroVolume(D_ptr, pitch, dims.iProjDets, dims.iProjAngles);
}
void duplicateVolumeData(float* D_dst, float* D_src, unsigned int pitch, const SDimensions& dims)
@@ -231,7 +216,7 @@ float dotProduct2D(float* D_data, unsigned int pitch,
// Step 1: reduce 2D from image to a single vector, taking sum of squares
reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), "dotProduct2D reduce2D");
// Step 2: reduce 1D: add up elements in vector
if (bx * by > 512)
@@ -248,31 +233,21 @@ float dotProduct2D(float* D_data, unsigned int pitch,
float x;
cudaMemcpy(&x, D_res, 4, cudaMemcpyDeviceToHost);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), "dotProduct2D");
cudaFree(D_buf);
return x;
}
-
-bool cudaTextForceKernelsCompletion()
+bool checkCuda(cudaError_t err, const char *msg)
{
- cudaError_t returnedCudaError = cudaThreadSynchronize();
-
- if(returnedCudaError != cudaSuccess) {
- ASTRA_ERROR("Failed to force completion of cuda kernels: %d: %s.", returnedCudaError, cudaGetErrorString(returnedCudaError));
+ if (err != cudaSuccess) {
+ ASTRA_ERROR("%s: CUDA error %d: %s.", msg, err, cudaGetErrorString(err));
return false;
+ } else {
+ return true;
}
-
- return true;
}
-void reportCudaError(cudaError_t err)
-{
- if(err != cudaSuccess)
- ASTRA_ERROR("CUDA error %d: %s.", err, cudaGetErrorString(err));
-}
-
-
}
diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu
index fbaa50c..b495f22 100644
--- a/cuda/3d/arith3d.cu
+++ b/cuda/3d/arith3d.cu
@@ -225,7 +225,7 @@ void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsign
devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -238,7 +238,7 @@ void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int
devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -252,7 +252,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, uns
devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -266,7 +266,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned
devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -281,7 +281,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2
devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -296,7 +296,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2
devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
@@ -328,7 +328,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)
pfOut += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -344,7 +344,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
pfOut += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -362,7 +362,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio
pfIn += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -380,7 +380,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c
pfIn += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -400,7 +400,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
pfIn2 += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -420,7 +420,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
pfIn2 += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
@@ -448,7 +448,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)
pfOut += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -464,7 +464,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
pfOut += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -482,7 +482,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi
pfIn += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -500,7 +500,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam,
pfIn += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -520,7 +520,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
pfIn2 += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -540,7 +540,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
pfIn2 += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
diff --git a/cuda/3d/cone_bp.cu b/cuda/3d/cone_bp.cu
index 7c3fc8d..e265304 100644
--- a/cuda/3d/cone_bp.cu
+++ b/cuda/3d/cone_bp.cu
@@ -357,7 +357,9 @@ bool ConeBP_Array(cudaPitchedPtr D_volumeData,
dev_cone_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, params.iRaysPerVoxelDim, fOutputScale);
}
- cudaTextForceKernelsCompletion();
+ // TODO: Consider not synchronizing here, if possible.
+ if (!checkCuda(cudaThreadSynchronize(), "cone_bp"))
+ return false;
angles = angles + angleCount;
// printf("%f\n", toc(t));
diff --git a/cuda/3d/cone_fp.cu b/cuda/3d/cone_fp.cu
index 4937d24..fede53b 100644
--- a/cuda/3d/cone_fp.cu
+++ b/cuda/3d/cone_fp.cu
@@ -402,8 +402,9 @@ bool ConeFP_Array_internal(cudaPitchedPtr D_projData,
dim3 dimGrid(
((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
- // TODO: check if we can't immediately
- // destroy the stream after use
+
+ // TODO: consider limiting number of handle (chaotic) geoms
+ // with many alternating directions
cudaStream_t stream;
cudaStreamCreate(&stream);
streams.push_back(stream);
@@ -446,16 +447,16 @@ bool ConeFP_Array_internal(cudaPitchedPtr D_projData,
}
}
- for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
- cudaStreamDestroy(*iter);
-
- streams.clear();
+ bool ok = true;
- cudaTextForceKernelsCompletion();
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+ ok &= checkCuda(cudaStreamSynchronize(*iter), "cone_fp");
+ cudaStreamDestroy(*iter);
+ }
// printf("%f\n", toc(t));
- return true;
+ return ok;
}
diff --git a/cuda/3d/fdk.cu b/cuda/3d/fdk.cu
index 7b36c93..0b8d2ab 100644
--- a/cuda/3d/fdk.cu
+++ b/cuda/3d/fdk.cu
@@ -176,7 +176,8 @@ bool FDK_PreWeight(cudaPitchedPtr D_projData,
devFDK_preweight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fZShift, fDetUSize, fDetVSize, dims);
- cudaTextForceKernelsCompletion();
+ if (!checkCuda(cudaThreadSynchronize(), "FDK_PreWeight"))
+ return false;
if (bShortScan && dims.iProjAngles > 1) {
ASTRA_DEBUG("Doing Parker weighting");
@@ -225,9 +226,10 @@ bool FDK_PreWeight(cudaPitchedPtr D_projData,
devFDK_ParkerWeight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fDetUSize, fCentralFanAngle, dims);
+ if (!checkCuda(cudaThreadSynchronize(), "FDK_PreWeight ParkerWeight"))
+ return false;
}
- cudaTextForceKernelsCompletion();
return true;
}
diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu
index d356b9f..1dc75ce 100644
--- a/cuda/3d/par3d_bp.cu
+++ b/cuda/3d/par3d_bp.cu
@@ -291,7 +291,9 @@ bool Par3DBP_Array(cudaPitchedPtr D_volumeData,
dev_par3D_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, params.iRaysPerVoxelDim, fOutputScale);
}
- cudaTextForceKernelsCompletion();
+ // TODO: Consider not synchronizing here, if possible.
+ if (!checkCuda(cudaThreadSynchronize(), "cone_bp"))
+ return false;
angles = angles + angleCount;
// printf("%f\n", toc(t));
diff --git a/cuda/3d/par3d_fp.cu b/cuda/3d/par3d_fp.cu
index 1f58516..cf8336c 100644
--- a/cuda/3d/par3d_fp.cu
+++ b/cuda/3d/par3d_fp.cu
@@ -501,8 +501,8 @@ bool Par3DFP_Array_internal(cudaPitchedPtr D_projData,
dim3 dimGrid(
((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
- // TODO: check if we can't immediately
- // destroy the stream after use
+ // TODO: consider limiting number of handle (chaotic) geoms
+ // with many alternating directions
cudaStream_t stream;
cudaStreamCreate(&stream);
streams.push_back(stream);
@@ -545,17 +545,16 @@ bool Par3DFP_Array_internal(cudaPitchedPtr D_projData,
}
}
- for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
- cudaStreamDestroy(*iter);
-
- streams.clear();
-
- cudaTextForceKernelsCompletion();
+ bool ok = true;
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+ ok &= checkCuda(cudaStreamSynchronize(*iter), "par3d_fp");
+ cudaStreamDestroy(*iter);
+ }
// printf("%f\n", toc(t));
- return true;
+ return ok;
}
bool Par3DFP(cudaPitchedPtr D_volumeData,
@@ -726,17 +725,16 @@ bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData,
}
}
- for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
- cudaStreamDestroy(*iter);
-
- streams.clear();
-
- cudaTextForceKernelsCompletion();
+ bool ok = true;
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+ ok = ok &= checkCuda(cudaStreamSynchronize(*iter), "Par3DFP_SumSqW");
+ cudaStreamDestroy(*iter);
+ }
// printf("%f\n", toc(t));
- return true;
+ return ok;
}
diff --git a/cuda/3d/util3d.cu b/cuda/3d/util3d.cu
index 844b880..71b5668 100644
--- a/cuda/3d/util3d.cu
+++ b/cuda/3d/util3d.cu
@@ -46,12 +46,9 @@ cudaPitchedPtr allocateVolumeData(const SDimensions3D& dims)
cudaPitchedPtr volData;
- cudaError err = cudaMalloc3D(&volData, extentV);
- if (err != cudaSuccess) {
- astraCUDA::reportCudaError(err);
+ if (!checkCuda(cudaMalloc3D(&volData, extentV), "allocateVolumeData 3D")) {
ASTRA_ERROR("Failed to allocate %dx%dx%d GPU buffer", dims.iVolX, dims.iVolY, dims.iVolZ);
volData.ptr = 0;
- // TODO: return 0 somehow?
}
return volData;
@@ -65,12 +62,9 @@ cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims)
cudaPitchedPtr projData;
- cudaError err = cudaMalloc3D(&projData, extentP);
- if (err != cudaSuccess) {
- astraCUDA::reportCudaError(err);
+ if (!checkCuda(cudaMalloc3D(&projData, extentP), "allocateProjectionData 3D")) {
ASTRA_ERROR("Failed to allocate %dx%dx%d GPU buffer", dims.iProjU, dims.iProjAngles, dims.iProjV);
projData.ptr = 0;
- // TODO: return 0 somehow?
}
return projData;
@@ -78,11 +72,11 @@ cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims)
bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims)
{
char* t = (char*)D_data.ptr;
- cudaError err;
for (unsigned int z = 0; z < dims.iVolZ; ++z) {
- err = cudaMemset2D(t, D_data.pitch, 0, dims.iVolX*sizeof(float), dims.iVolY);
- ASTRA_CUDA_ASSERT(err);
+ if (!checkCuda(cudaMemset2D(t, D_data.pitch, 0, dims.iVolX*sizeof(float), dims.iVolY), "zeroVolumeData 3D")) {
+ return false;
+ }
t += D_data.pitch * dims.iVolY;
}
return true;
@@ -90,11 +84,11 @@ bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims)
bool zeroProjectionData(cudaPitchedPtr& D_data, const SDimensions3D& dims)
{
char* t = (char*)D_data.ptr;
- cudaError err;
for (unsigned int z = 0; z < dims.iProjV; ++z) {
- err = cudaMemset2D(t, D_data.pitch, 0, dims.iProjU*sizeof(float), dims.iProjAngles);
- ASTRA_CUDA_ASSERT(err);
+ if (!checkCuda(cudaMemset2D(t, D_data.pitch, 0, dims.iProjU*sizeof(float), dims.iProjAngles), "zeroProjectionData 3D")) {
+ return false;
+ }
t += D_data.pitch * dims.iProjAngles;
}
@@ -128,11 +122,7 @@ bool copyVolumeToDevice(const float* data, cudaPitchedPtr& D_data, const SDimens
p.extent = extentV;
p.kind = cudaMemcpyHostToDevice;
- cudaError err;
- err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
-
- return err == cudaSuccess;
+ return checkCuda(cudaMemcpy3D(&p), "copyVolumeToDevice 3D");
}
bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
@@ -163,11 +153,7 @@ bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SD
p.extent = extentV;
p.kind = cudaMemcpyHostToDevice;
- cudaError err;
- err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
-
- return err == cudaSuccess;
+ return checkCuda(cudaMemcpy3D(&p), "copyProjectionsToDevice 3D");
}
bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
@@ -198,12 +184,9 @@ bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDime
p.extent = extentV;
p.kind = cudaMemcpyDeviceToHost;
- cudaError err;
- err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
-
- return err == cudaSuccess;
+ return checkCuda(cudaMemcpy3D(&p), "copyVolumeFromDevice 3D");
}
+
bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
{
if (!pitch)
@@ -232,11 +215,7 @@ bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const
p.extent = extentV;
p.kind = cudaMemcpyDeviceToHost;
- cudaError err;
- err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
-
- return err == cudaSuccess;
+ return checkCuda(cudaMemcpy3D(&p), "copyProjectionsFromDevice 3D");
}
bool duplicateVolumeData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims)
@@ -258,12 +237,9 @@ bool duplicateVolumeData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, con
p.extent = extentV;
p.kind = cudaMemcpyDeviceToDevice;
- cudaError err;
- err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
-
- return err == cudaSuccess;
+ return checkCuda(cudaMemcpy3D(&p), "duplicateVolumeData 3D");
}
+
bool duplicateProjectionData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims)
{
cudaExtent extentV;
@@ -283,11 +259,7 @@ bool duplicateProjectionData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src,
p.extent = extentV;
p.kind = cudaMemcpyDeviceToDevice;
- cudaError err;
- err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
-
- return err == cudaSuccess;
+ return checkCuda(cudaMemcpy3D(&p), "duplicateProjectionData 3D");
}
@@ -303,9 +275,8 @@ cudaArray* allocateVolumeArray(const SDimensions3D& dims)
extentA.width = dims.iVolX;
extentA.height = dims.iVolY;
extentA.depth = dims.iVolZ;
- cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA);
- if (err != cudaSuccess) {
- astraCUDA::reportCudaError(err);
+
+ if (!checkCuda(cudaMalloc3DArray(&cuArray, &channelDesc, extentA), "allocateVolumeArray 3D")) {
ASTRA_ERROR("Failed to allocate %dx%dx%d GPU array", dims.iVolX, dims.iVolY, dims.iVolZ);
return 0;
}
@@ -320,10 +291,8 @@ cudaArray* allocateProjectionArray(const SDimensions3D& dims)
extentA.width = dims.iProjU;
extentA.height = dims.iProjAngles;
extentA.depth = dims.iProjV;
- cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA);
- if (err != cudaSuccess) {
- astraCUDA::reportCudaError(err);
+ if (!checkCuda(cudaMalloc3DArray(&cuArray, &channelDesc, extentA), "allocateProjectionArray 3D")) {
ASTRA_ERROR("Failed to allocate %dx%dx%d GPU array", dims.iProjU, dims.iProjAngles, dims.iProjV);
return 0;
}
@@ -352,12 +321,9 @@ bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const
p.extent = extentA;
p.kind = cudaMemcpyDeviceToDevice;
- cudaError err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
- // TODO: check errors
-
- return true;
+ return checkCuda(cudaMemcpy3D(&p), "transferVolumeToArray 3D");
}
+
bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, const SDimensions3D& dims)
{
cudaExtent extentA;
@@ -379,13 +345,9 @@ bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, con
p.extent = extentA;
p.kind = cudaMemcpyDeviceToDevice;
- cudaError err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
-
- // TODO: check errors
-
- return true;
+ return checkCuda(cudaMemcpy3D(&p), "transferProjectionsToArray 3D");
}
+
bool transferHostProjectionsToArray(const float *projData, cudaArray* array, const SDimensions3D& dims)
{
cudaExtent extentA;
@@ -413,12 +375,7 @@ bool transferHostProjectionsToArray(const float *projData, cudaArray* array, con
p.extent = extentA;
p.kind = cudaMemcpyHostToDevice;
- cudaError err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
-
- // TODO: check errors
-
- return true;
+ return checkCuda(cudaMemcpy3D(&p), "transferHostProjectionsToArray 3D");
}
@@ -430,18 +387,6 @@ float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y,
}
-bool cudaTextForceKernelsCompletion()
-{
- cudaError_t returnedCudaError = cudaThreadSynchronize();
-
- if(returnedCudaError != cudaSuccess) {
- ASTRA_ERROR("Failed to force completion of cuda kernels: %d: %s.", returnedCudaError, cudaGetErrorString(returnedCudaError));
- return false;
- }
-
- return true;
-}
-
int calcNextPowerOfTwo(int _iValue)
{
int iOutput = 1;