From d59619737b79ca3bd732fedaff6665e600ee1335 Mon Sep 17 00:00:00 2001 From: Willem Jan Palenstijn Date: Tue, 16 Nov 2021 11:28:05 +0100 Subject: Add new checkCuda function to simplify error handling --- cuda/2d/util.cu | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'cuda/2d') diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu index 2a47472..ce652cb 100644 --- a/cuda/2d/util.cu +++ b/cuda/2d/util.cu @@ -274,5 +274,14 @@ void reportCudaError(cudaError_t err) ASTRA_ERROR("CUDA error %d: %s.", err, cudaGetErrorString(err)); } +bool checkCuda(cudaError_t err, const char *msg) +{ + if (err != cudaSuccess) { + ASTRA_ERROR("%s: CUDA error %d: %s.", msg, err, cudaGetErrorString(err)); + return false; + } else { + return true; + } +} } -- cgit v1.2.3 From 1875e824a0358a7e7510b31f5e87708b304652bc Mon Sep 17 00:00:00 2001 From: Willem Jan Palenstijn Date: Tue, 16 Nov 2021 11:38:02 +0100 Subject: Remove reportCudaError function --- cuda/2d/util.cu | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'cuda/2d') diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu index ce652cb..1c7f6f0 100644 --- a/cuda/2d/util.cu +++ b/cuda/2d/util.cu @@ -90,9 +90,7 @@ bool copySinogramToDevice(const float* in_data, unsigned int in_pitch, bool allocateVolume(float*& ptr, unsigned int width, unsigned int height, unsigned int& pitch) { size_t p; - cudaError_t ret = cudaMallocPitch((void**)&ptr, &p, sizeof(float)*width, height); - if (ret != cudaSuccess) { - reportCudaError(ret); + if (!checkCuda(cudaMallocPitch((void**)&ptr, &p, sizeof(float)*width, height), "allocateVolume")) { ASTRA_ERROR("Failed to allocate %dx%d GPU buffer", width, height); return false; } @@ -268,12 +266,6 @@ bool cudaTextForceKernelsCompletion() return true; } -void reportCudaError(cudaError_t err) -{ - if(err != cudaSuccess) - ASTRA_ERROR("CUDA error %d: %s.", err, cudaGetErrorString(err)); -} - bool checkCuda(cudaError_t err, const char *msg) { if (err != cudaSuccess) { -- cgit v1.2.3 From b492e3d049e300132d2f22eee7922ff308342a84 Mon Sep 17 00:00:00 2001 From: Willem Jan Palenstijn Date: Tue, 16 Nov 2021 11:46:17 +0100 Subject: Remove ASTRA_CUDA_ASSERT --- cuda/2d/util.cu | 41 ++++++++++++++--------------------------- 1 file changed, 14 insertions(+), 27 deletions(-) (limited to 'cuda/2d') diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu index 1c7f6f0..a75e5ab 100644 --- a/cuda/2d/util.cu +++ b/cuda/2d/util.cu @@ -40,12 +40,8 @@ bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch, { size_t width = dims.iVolWidth; size_t height = dims.iVolHeight; - // TODO: memory order - cudaError_t err; - err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); - ASTRA_CUDA_ASSERT(err); - assert(err == cudaSuccess); - return true; + + return checkCuda(cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice), "copyVolumeToDevice"); } bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch, @@ -54,10 +50,8 @@ bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch, { size_t width = dims.iVolWidth; size_t height = dims.iVolHeight; - // TODO: memory order - cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); - ASTRA_CUDA_ASSERT(err); - return true; + + return checkCuda(cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost), "copyVolumeFromDevice"); } @@ -67,10 +61,8 @@ bool copySinogramFromDevice(float* out_data, unsigned int out_pitch, { size_t width = dims.iProjDets; size_t height = dims.iProjAngles; - // TODO: memory order - cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); - ASTRA_CUDA_ASSERT(err); - return true; + + return checkCuda(cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost), "copySinogramFromDevice"); } bool copySinogramToDevice(const float* in_data, unsigned int in_pitch, @@ -79,11 +71,8 @@ bool copySinogramToDevice(const float* in_data, unsigned int in_pitch, { size_t width = dims.iProjDets; size_t height = dims.iProjAngles; - // TODO: memory order - cudaError_t err; - err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); - ASTRA_CUDA_ASSERT(err); - return true; + + return checkCuda(cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice), "copySinogramToDevice"); } @@ -102,11 +91,9 @@ bool allocateVolume(float*& ptr, unsigned int width, unsigned int height, unsign return true; } -void zeroVolume(float* data, unsigned int pitch, unsigned int width, unsigned int height) +bool zeroVolume(float* data, unsigned int pitch, unsigned int width, unsigned int height) { - cudaError_t err; - err = cudaMemset2D(data, sizeof(float)*pitch, 0, sizeof(float)*width, height); - ASTRA_CUDA_ASSERT(err); + return checkCuda(cudaMemset2D(data, sizeof(float)*pitch, 0, sizeof(float)*width, height), "zeroVolume"); } bool allocateVolumeData(float*& D_ptr, unsigned int& pitch, const SDimensions& dims) @@ -119,14 +106,14 @@ bool allocateProjectionData(float*& D_ptr, unsigned int& pitch, const SDimension return allocateVolume(D_ptr, dims.iProjDets, dims.iProjAngles, pitch); } -void zeroVolumeData(float* D_ptr, unsigned int pitch, const SDimensions& dims) +bool zeroVolumeData(float* D_ptr, unsigned int pitch, const SDimensions& dims) { - zeroVolume(D_ptr, pitch, dims.iVolWidth, dims.iVolHeight); + return zeroVolume(D_ptr, pitch, dims.iVolWidth, dims.iVolHeight); } -void zeroProjectionData(float* D_ptr, unsigned int pitch, const SDimensions& dims) +bool zeroProjectionData(float* D_ptr, unsigned int pitch, const SDimensions& dims) { - zeroVolume(D_ptr, pitch, dims.iProjDets, dims.iProjAngles); + return zeroVolume(D_ptr, pitch, dims.iProjDets, dims.iProjAngles); } void duplicateVolumeData(float* D_dst, float* D_src, unsigned int pitch, const SDimensions& dims) -- cgit v1.2.3 From 39582115bc93b5435d25e56891815ae7cb1898fd Mon Sep 17 00:00:00 2001 From: Willem Jan Palenstijn Date: Tue, 16 Nov 2021 13:44:13 +0100 Subject: Remove cudaTextForceKernelsCompletion --- cuda/2d/arith.cu | 14 +++++++------- cuda/2d/fan_bp.cu | 15 +++++---------- cuda/2d/fan_fp.cu | 11 ++++++----- cuda/2d/par_bp.cu | 10 +++------- cuda/2d/par_fp.cu | 19 ++++++++----------- cuda/2d/sart.cu | 2 +- cuda/2d/util.cu | 17 ++--------------- 7 files changed, 32 insertions(+), 56 deletions(-) (limited to 'cuda/2d') diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu index aa0edae..45622d0 100644 --- a/cuda/2d/arith.cu +++ b/cuda/2d/arith.cu @@ -451,7 +451,7 @@ void processData(float* pfOut, unsigned int pitch, unsigned int width, unsigned devtoD<<>>(pfOut, pitch, width, height); - cudaTextForceKernelsCompletion(); + checkCuda(cudaThreadSynchronize(), __FUNCTION__); } template @@ -462,7 +462,7 @@ void processData(float* pfOut, float fParam, unsigned int pitch, unsigned int wi devFtoD<<>>(pfOut, fParam, pitch, width, height); - cudaTextForceKernelsCompletion(); + checkCuda(cudaThreadSynchronize(), __FUNCTION__); } template @@ -473,7 +473,7 @@ void processData(float* pfOut1, float* pfOut2, float fParam1, float fParam2, uns devFFtoDD<<>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height); - cudaTextForceKernelsCompletion(); + checkCuda(cudaThreadSynchronize(), __FUNCTION__); } @@ -485,7 +485,7 @@ void processData(float* pfOut, const float* pfIn, unsigned int pitch, unsigned i devDtoD<<>>(pfOut, pfIn, pitch, width, height); - cudaTextForceKernelsCompletion(); + checkCuda(cudaThreadSynchronize(), __FUNCTION__); } template @@ -496,7 +496,7 @@ void processData(float* pfOut, const float* pfIn, float fParam, unsigned int pit devDFtoD<<>>(pfOut, pfIn, fParam, pitch, width, height); - cudaTextForceKernelsCompletion(); + checkCuda(cudaThreadSynchronize(), __FUNCTION__); } template @@ -507,7 +507,7 @@ void processData(float* pfOut, const float* pfIn1, const float* pfIn2, float fPa devDDFtoD<<>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height); - cudaTextForceKernelsCompletion(); + checkCuda(cudaThreadSynchronize(), __FUNCTION__); } template @@ -518,7 +518,7 @@ void processData(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned devDDtoD<<>>(pfOut, pfIn1, pfIn2, pitch, width, height); - cudaTextForceKernelsCompletion(); + checkCuda(cudaThreadSynchronize(), __FUNCTION__); } diff --git a/cuda/2d/fan_bp.cu b/cuda/2d/fan_bp.cu index fc42456..2068d03 100644 --- a/cuda/2d/fan_bp.cu +++ b/cuda/2d/fan_bp.cu @@ -322,13 +322,12 @@ bool FanBP_internal(float* D_volumeData, unsigned int volumePitch, else devFanBP<<>>(D_volumeData, volumePitch, i, dims, fOutputScale); } - cudaThreadSynchronize(); - cudaTextForceKernelsCompletion(); + ok = checkCuda(cudaStreamSynchronize(stream), "FanBP"); cudaStreamDestroy(stream); - return true; + return ok; } bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch, @@ -354,13 +353,12 @@ bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch, for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) { devFanBP<<>>(D_volumeData, volumePitch, i, dims, fOutputScale); } - cudaThreadSynchronize(); - cudaTextForceKernelsCompletion(); + ok = checkCuda(cudaStreamSynchronize(stream), "FanBP_FBPWeighted"); cudaStreamDestroy(stream); - return true; + return ok; } // D_projData is a pointer to one padded sinogram line @@ -382,11 +380,8 @@ bool FanBP_SART(float* D_volumeData, unsigned int volumePitch, (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize); devFanBP_SART<<>>(D_volumeData, volumePitch, dims, fOutputScale); - cudaThreadSynchronize(); - cudaTextForceKernelsCompletion(); - - return true; + return checkCuda(cudaThreadSynchronize(), "FanBP_SART"); } bool FanBP(float* D_volumeData, unsigned int volumePitch, diff --git a/cuda/2d/fan_fp.cu b/cuda/2d/fan_fp.cu index 9bbb401..342ca4c 100644 --- a/cuda/2d/fan_fp.cu +++ b/cuda/2d/fan_fp.cu @@ -268,16 +268,17 @@ bool FanFP_internal(float* D_volumeData, unsigned int volumePitch, for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices) FanFPvertical<<>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale); - cudaStreamDestroy(stream1); - cudaStreamDestroy(stream2); + bool ok = true; - cudaThreadSynchronize(); + ok &= checkCuda(cudaStreamSynchronize(stream1), "fan_fp hor"); + cudaStreamDestroy(stream1); - cudaTextForceKernelsCompletion(); + ok &= checkCuda(cudaStreamSynchronize(stream2), "fan_fp ver"); + cudaStreamDestroy(stream2); cudaFreeArray(D_dataArray); - return true; + return ok; } bool FanFP(float* D_volumeData, unsigned int volumePitch, diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu index b50b5a5..d7c3ab0 100644 --- a/cuda/2d/par_bp.cu +++ b/cuda/2d/par_bp.cu @@ -231,13 +231,12 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch, else devBP<<>>(D_volumeData, volumePitch, i, dims, fOutputScale); } - cudaThreadSynchronize(); - cudaTextForceKernelsCompletion(); + bool ok = checkCuda(cudaStreamSynchronize(stream), "par_bp"); cudaStreamDestroy(stream); - return true; + return ok; } bool BP(float* D_volumeData, unsigned int volumePitch, @@ -284,11 +283,8 @@ bool BP_SART(float* D_volumeData, unsigned int volumePitch, (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize); devBP_SART<<>>(D_volumeData, volumePitch, angle_offset, angle_scaled_sin, angle_scaled_cos, dims, fOutputScale); - cudaThreadSynchronize(); - - cudaTextForceKernelsCompletion(); - return true; + return checkCuda(cudaThreadSynchronize(), "BP_SART"); } diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu index 8c48280..e947428 100644 --- a/cuda/2d/par_fp.cu +++ b/cuda/2d/par_fp.cu @@ -305,8 +305,8 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch, dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock, (dims.iProjDets+g_detBlockSize-1)/g_detBlockSize); // angle blocks, detector blocks - // TODO: check if we can't immediately - // destroy the stream after use + // TODO: consider limiting number of handle (chaotic) geoms + // with many alternating directions cudaStream_t stream; cudaStreamCreate(&stream); streams.push_back(stream); @@ -323,19 +323,16 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch, } } - for (std::list::iterator iter = streams.begin(); iter != streams.end(); ++iter) - cudaStreamDestroy(*iter); - - streams.clear(); - - cudaThreadSynchronize(); + bool ok = true; - cudaTextForceKernelsCompletion(); + for (std::list::iterator iter = streams.begin(); iter != streams.end(); ++iter) { + ok &= checkCuda(cudaStreamSynchronize(*iter), "par_fp"); + cudaStreamDestroy(*iter); + } cudaFreeArray(D_dataArray); - - return true; + return ok; } bool FP_simple(float* D_volumeData, unsigned int volumePitch, diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu index 29f5b43..89d58c2 100644 --- a/cuda/2d/sart.cu +++ b/cuda/2d/sart.cu @@ -54,7 +54,7 @@ void MUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int devMUL_SART<<>>(pfOut, pfIn, pitch, width); - cudaTextForceKernelsCompletion(); + checkCuda(cudaThreadSynchronize(), "MUL_SART"); } diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu index a75e5ab..ac360f0 100644 --- a/cuda/2d/util.cu +++ b/cuda/2d/util.cu @@ -216,7 +216,7 @@ float dotProduct2D(float* D_data, unsigned int pitch, // Step 1: reduce 2D from image to a single vector, taking sum of squares reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height); - cudaTextForceKernelsCompletion(); + checkCuda(cudaThreadSynchronize(), "dotProduct2D reduce2D"); // Step 2: reduce 1D: add up elements in vector if (bx * by > 512) @@ -233,26 +233,13 @@ float dotProduct2D(float* D_data, unsigned int pitch, float x; cudaMemcpy(&x, D_res, 4, cudaMemcpyDeviceToHost); - cudaTextForceKernelsCompletion(); + checkCuda(cudaThreadSynchronize(), "dotProduct2D"); cudaFree(D_buf); return x; } - -bool cudaTextForceKernelsCompletion() -{ - cudaError_t returnedCudaError = cudaThreadSynchronize(); - - if(returnedCudaError != cudaSuccess) { - ASTRA_ERROR("Failed to force completion of cuda kernels: %d: %s.", returnedCudaError, cudaGetErrorString(returnedCudaError)); - return false; - } - - return true; -} - bool checkCuda(cudaError_t err, const char *msg) { if (err != cudaSuccess) { -- cgit v1.2.3