diff options
author | Willem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl> | 2021-11-16 14:10:21 +0100 |
---|---|---|
committer | Willem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl> | 2021-11-16 14:10:21 +0100 |
commit | ea9703e63e9d3976e89bc1d81bdd1ec3e76b68b6 (patch) | |
tree | 13a82cdad2602b8fd8ce5e861c5133a4c791e6ac /cuda/2d | |
parent | c6e203411abf3dad3e677aaa1186b927086f8ba7 (diff) | |
parent | 063c97d04a757e3c288dcf156a63bf1e0ffd074e (diff) | |
download | astra-ea9703e63e9d3976e89bc1d81bdd1ec3e76b68b6.tar.gz astra-ea9703e63e9d3976e89bc1d81bdd1ec3e76b68b6.tar.bz2 astra-ea9703e63e9d3976e89bc1d81bdd1ec3e76b68b6.tar.xz astra-ea9703e63e9d3976e89bc1d81bdd1ec3e76b68b6.zip |
Merge branch 'cufft'
This makes FFT/FDK more robust and faster by better sync handling,
and cleans up error logging/handling.
Diffstat (limited to 'cuda/2d')
-rw-r--r-- | cuda/2d/fft.cu | 124 |
1 files changed, 58 insertions, 66 deletions
diff --git a/cuda/2d/fft.cu b/cuda/2d/fft.cu index 2cdb7c3..08acfd4 100644 --- a/cuda/2d/fft.cu +++ b/cuda/2d/fft.cu @@ -40,33 +40,18 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. using namespace astra; -// TODO: evaluate what we want to do in these situations: - -#define CHECK_ERROR(errorMessage) do { \ - cudaError_t err = cudaThreadSynchronize(); \ - if( cudaSuccess != err) { \ - ASTRA_ERROR("Cuda error %s : %s", \ - errorMessage,cudaGetErrorString( err)); \ - exit(EXIT_FAILURE); \ - } } while (0) - -#define SAFE_CALL( call) do { \ - cudaError err = call; \ - if( cudaSuccess != err) { \ - ASTRA_ERROR("Cuda error: %s ", \ - cudaGetErrorString( err)); \ - exit(EXIT_FAILURE); \ - } \ - err = cudaThreadSynchronize(); \ - if( cudaSuccess != err) { \ - ASTRA_ERROR("Cuda error: %s : ", \ - cudaGetErrorString( err)); \ - exit(EXIT_FAILURE); \ - } } while (0) - - namespace astraCUDA { +bool checkCufft(cufftResult err, const char *msg) +{ + if (err != CUFFT_SUCCESS) { + ASTRA_ERROR("%s: CUFFT error %d.", msg, err); + return false; + } else { + return true; + } +} + __global__ static void applyFilter_kernel(int _iProjectionCount, int _iFreqBinCount, cufftComplex * _pSinogram, @@ -115,7 +100,8 @@ static void rescaleInverseFourier(int _iProjectionCount, int _iDetectorCount, rescaleInverseFourier_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount, _iDetectorCount, _pfInFourierOutput); - CHECK_ERROR("rescaleInverseFourier_kernel failed"); + + checkCuda(cudaThreadSynchronize(), "rescaleInverseFourier"); } void applyFilter(int _iProjectionCount, int _iFreqBinCount, @@ -128,7 +114,8 @@ void applyFilter(int _iProjectionCount, int _iFreqBinCount, applyFilter_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount, _iFreqBinCount, _pSinogram, _pFilter); - CHECK_ERROR("applyFilter_kernel failed"); + + checkCuda(cudaThreadSynchronize(), "applyFilter"); } static bool invokeCudaFFT(int _iProjectionCount, int _iDetectorCount, @@ -136,24 +123,22 @@ static bool invokeCudaFFT(int _iProjectionCount, int _iDetectorCount, cufftComplex * _pDevTargetComplex) { cufftHandle plan; - cufftResult result; - result = cufftPlan1d(&plan, _iDetectorCount, CUFFT_R2C, _iProjectionCount); - if(result != CUFFT_SUCCESS) - { - ASTRA_ERROR("Failed to plan 1d r2c fft"); + if (!checkCufft(cufftPlan1d(&plan, _iDetectorCount, CUFFT_R2C, _iProjectionCount), "invokeCudaFFT plan")) { return false; } - result = cufftExecR2C(plan, (cufftReal *)_pfDevSource, _pDevTargetComplex); - cufftDestroy(plan); + if (!checkCufft(cufftExecR2C(plan, (cufftReal *)_pfDevSource, _pDevTargetComplex), "invokeCudaFFT exec")) { + cufftDestroy(plan); + return false; + } - if(result != CUFFT_SUCCESS) - { - ASTRA_ERROR("Failed to exec 1d r2c fft"); + if (!checkCuda(cudaDeviceSynchronize(), "invokeCudaFFT sync")) { + cufftDestroy(plan); return false; } + cufftDestroy(plan); return true; } @@ -162,26 +147,25 @@ static bool invokeCudaIFFT(int _iProjectionCount, int _iDetectorCount, float * _pfDevTarget) { cufftHandle plan; - cufftResult result; - result = cufftPlan1d(&plan, _iDetectorCount, CUFFT_C2R, _iProjectionCount); - if(result != CUFFT_SUCCESS) - { - ASTRA_ERROR("Failed to plan 1d c2r fft"); + if (!checkCufft(cufftPlan1d(&plan, _iDetectorCount, CUFFT_C2R, _iProjectionCount), "invokeCudaIFFT plan")) { return false; } - // todo: why do we have to get rid of the const qualifier? - result = cufftExecC2R(plan, (cufftComplex *)_pDevSourceComplex, - (cufftReal *)_pfDevTarget); - cufftDestroy(plan); - - if(result != CUFFT_SUCCESS) + // Getting rid of the const qualifier is due to cufft API issue? + if (!checkCufft(cufftExecC2R(plan, (cufftComplex *)_pDevSourceComplex, + (cufftReal *)_pfDevTarget), "invokeCudaIFFT exec")) { - ASTRA_ERROR("Failed to exec 1d c2r fft"); + cufftDestroy(plan); return false; } + if (!checkCuda(cudaDeviceSynchronize(), "invokeCudaIFFT sync")) { + cufftDestroy(plan); + return false; + } + + cufftDestroy(plan); return true; } @@ -189,14 +173,12 @@ bool allocateComplexOnDevice(int _iProjectionCount, int _iDetectorCount, cufftComplex ** _ppDevComplex) { size_t bufferSize = sizeof(cufftComplex) * _iProjectionCount * _iDetectorCount; - SAFE_CALL(cudaMalloc((void **)_ppDevComplex, bufferSize)); - return true; + return checkCuda(cudaMalloc((void **)_ppDevComplex, bufferSize), "fft allocateComplexOnDevice"); } bool freeComplexOnDevice(cufftComplex * _pDevComplex) { - SAFE_CALL(cudaFree(_pDevComplex)); - return true; + return checkCuda(cudaFree(_pDevComplex), "fft freeComplexOnDevice"); } bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount, @@ -204,9 +186,7 @@ bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount, cufftComplex * _pDevComplexTarget) { size_t memSize = sizeof(cufftComplex) * _iProjectionCount * _iDetectorCount; - SAFE_CALL(cudaMemcpy(_pDevComplexTarget, _pHostComplexSource, memSize, cudaMemcpyHostToDevice)); - - return true; + return checkCuda(cudaMemcpy(_pDevComplexTarget, _pHostComplexSource, memSize, cudaMemcpyHostToDevice), "fft uploadComplexArrayToDevice"); } bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource, @@ -217,25 +197,30 @@ bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource, float * pfDevRealFFTSource = NULL; size_t bufferMemSize = sizeof(float) * _iProjectionCount * _iFFTRealDetectorCount; - SAFE_CALL(cudaMalloc((void **)&pfDevRealFFTSource, bufferMemSize)); - SAFE_CALL(cudaMemset(pfDevRealFFTSource, 0, bufferMemSize)); + if (!checkCuda(cudaMalloc((void **)&pfDevRealFFTSource, bufferMemSize), "runCudaFFT malloc")) + return false; + if (!checkCuda(cudaMemset(pfDevRealFFTSource, 0, bufferMemSize), "runCudaFFT memset")) { + cudaFree(pfDevRealFFTSource); + return false; + } for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++) { const float * pfSourceLocation = _pfDevRealSource + iProjectionIndex * _iSourcePitch; float * pfTargetLocation = pfDevRealFFTSource + iProjectionIndex * _iFFTRealDetectorCount; - SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice)); + if (!checkCuda(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice), "runCudaFFT memcpy")) { + cudaFree(pfDevRealFFTSource); + return false; + } } bool bResult = invokeCudaFFT(_iProjectionCount, _iFFTRealDetectorCount, pfDevRealFFTSource, _pDevTargetComplex); if(!bResult) - { return false; - } - SAFE_CALL(cudaFree(pfDevRealFFTSource)); + cudaFree(pfDevRealFFTSource); return true; } @@ -248,7 +233,8 @@ bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex, float * pfDevRealFFTTarget = NULL; size_t bufferMemSize = sizeof(float) * _iProjectionCount * _iFFTRealDetectorCount; - SAFE_CALL(cudaMalloc((void **)&pfDevRealFFTTarget, bufferMemSize)); + if (!checkCuda(cudaMalloc((void **)&pfDevRealFFTTarget, bufferMemSize), "runCudaIFFT malloc")) + return false; bool bResult = invokeCudaIFFT(_iProjectionCount, _iFFTRealDetectorCount, _pDevSourceComplex, pfDevRealFFTTarget); @@ -260,17 +246,23 @@ bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex, rescaleInverseFourier(_iProjectionCount, _iFFTRealDetectorCount, pfDevRealFFTTarget); - SAFE_CALL(cudaMemset(_pfRealTarget, 0, sizeof(float) * _iProjectionCount * _iTargetPitch)); + if (!checkCuda(cudaMemset(_pfRealTarget, 0, sizeof(float) * _iProjectionCount * _iTargetPitch), "runCudaIFFT memset")) { + cudaFree(pfDevRealFFTTarget); + return false; + } for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++) { const float * pfSourceLocation = pfDevRealFFTTarget + iProjectionIndex * _iFFTRealDetectorCount; float* pfTargetLocation = _pfRealTarget + iProjectionIndex * _iTargetPitch; - SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice)); + if (!checkCuda(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice), "runCudaIFFT memcpy")) { + cudaFree(pfDevRealFFTTarget); + return false; + } } - SAFE_CALL(cudaFree(pfDevRealFFTTarget)); + cudaFree(pfDevRealFFTTarget); return true; } |