diff options
author | Suren A. Chilingaryan <csa@ipecompute4.ands.kit.edu> | 2022-09-06 19:12:09 +0200 |
---|---|---|
committer | Suren A. Chilingaryan <csa@ipecompute4.ands.kit.edu> | 2022-09-06 19:12:09 +0200 |
commit | efa4313aa57e4c3511eb1d5d88edc37e99f899fa (patch) | |
tree | b4061f583770608a19a313a9ef40cef71cc053d2 /patches/ccpi-regularisation-toolkit-fast-tnv | |
parent | 4616a04086c1f9248008add524a9cf74ffecca33 (diff) | |
download | ccpi-efa4313aa57e4c3511eb1d5d88edc37e99f899fa.tar.gz ccpi-efa4313aa57e4c3511eb1d5d88edc37e99f899fa.tar.bz2 ccpi-efa4313aa57e4c3511eb1d5d88edc37e99f899fa.tar.xz ccpi-efa4313aa57e4c3511eb1d5d88edc37e99f899fa.zip |
Add all CCPi patches (patches are not applied automatically, but just collected)
Diffstat (limited to 'patches/ccpi-regularisation-toolkit-fast-tnv')
4 files changed, 991 insertions, 0 deletions
diff --git a/patches/ccpi-regularisation-toolkit-fast-tnv/CMakeLists.txt b/patches/ccpi-regularisation-toolkit-fast-tnv/CMakeLists.txt new file mode 100644 index 0000000..1a3b37a --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-tnv/CMakeLists.txt @@ -0,0 +1,169 @@ +# Copyright 2018 Edoardo Pasca +#cmake_minimum_required (VERSION 3.0) + +project(RGL_core) +#https://stackoverflow.com/questions/13298504/using-cmake-with-setup-py + +# The version number. + +set (CIL_VERSION $ENV{CIL_VERSION} CACHE INTERNAL "Core Imaging Library version" FORCE) + +# conda orchestrated build +message("CIL_VERSION ${CIL_VERSION}") +#include (GenerateExportHeader) + +## Build the regularisers package as a library +message("Creating Regularisers as a shared library") + +find_package(GLIB2 REQUIRED) +find_package(OpenMP REQUIRED) +if (OPENMP_FOUND) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") +endif() + +message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer") +message("CMAKE_C_FLAGS ${CMAKE_C_FLAGS} -fno-omit-frame-pointer") +message("CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS}") +message("CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS}") +message("CMAKE_STATIC_LINKER_FLAGS ${CMAKE_STATIC_LINKER_FLAGS}") + +set(CMAKE_BUILD_TYPE "Release") + +if(WIN32) + set (FLAGS "/DWIN32 /EHsc /DCCPiCore_EXPORTS /openmp") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") + set (CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:MSVCRT.lib") + + set (EXTRA_LIBRARIES) + + message("library lib: ${LIBRARY_LIB}") + +elseif(UNIX) + set (FLAGS "-O2 -funsigned-char -Wall -Wl,--no-undefined -DCCPiReconstructionIterative_EXPORTS ") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") + set (CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") + + set (EXTRA_LIBRARIES + "gomp" + "m" + ) + +endif() +message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") + +## Build the regularisers package as a library +message("Adding regularisers as a shared library") + +#set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc) +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -axAVX2 -xAVX2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") + +#set(CMAKE_C_COMPILER clang) +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopenmp") + +#set(CMAKE_C_COMPILER gcc-9) +set(CMAKE_C_FLAGS "-march=native -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -fopenmp") +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer") + +#set(CMAKE_C_FLAGS "-acc -Minfo -ta=tesla:cc20 -openmp") +#set(CMAKE_C_FLAGS "-acc -Minfo -ta=multicore -openmp -fPIC") +add_library(cilreg SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/hw_sched.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/hw_thread.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_TV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/SB_TV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PD_TV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TGV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Diffusion_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Diffus4th_order_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/LLT_ROF_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ROF_TV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_dTV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TNV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Nonlocal_TV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PatchSelect_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/utils.c + ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/Diffusion_Inpaint_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/NonlocalMarching_Inpaint_core.c + ) +target_link_libraries(cilreg ${EXTRA_LIBRARIES} ${GLIB2_LIBRARIES} ${GTHREAD2_LIBRARIES} ) +include_directories(cilreg PUBLIC + ${GLIB2_INCLUDE_DIR} + ${LIBRARY_INC}/include + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ + ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/ ) + +## Install + +if (UNIX) +message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib") +install(TARGETS cilreg + LIBRARY DESTINATION lib + CONFIGURATIONS ${CMAKE_BUILD_TYPE} + ) +elseif(WIN32) +message ("I'd install into ${CMAKE_INSTALL_PREFIX} lib bin") + install(TARGETS cilreg + RUNTIME DESTINATION bin + ARCHIVE DESTINATION lib + CONFIGURATIONS ${CMAKE_BUILD_TYPE} + ) +endif() + + + +# GPU Regularisers +if (BUILD_CUDA) + find_package(CUDA) + if (CUDA_FOUND) + set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -shared -D_FORCE_INLINES -allow-unsupported-compiler") + message("CUDA FLAGS ${CUDA_NVCC_FLAGS}") + CUDA_ADD_LIBRARY(cilregcuda SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_ROF_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_FGP_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_PD_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_SB_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/LLT_ROF_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TGV_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/dTV_FGP_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/NonlDiff_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/Diffus_4thO_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/PatchSelect_GPU_core.cu + ) + if (UNIX) + message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib") + install(TARGETS cilregcuda + LIBRARY DESTINATION lib + CONFIGURATIONS ${CMAKE_BUILD_TYPE} + ) + elseif(WIN32) + message ("I'd install into ${CMAKE_INSTALL_PREFIX} lib bin") + install(TARGETS cilregcuda + RUNTIME DESTINATION bin + ARCHIVE DESTINATION lib + CONFIGURATIONS ${CMAKE_BUILD_TYPE} + ) + endif() + else() + message("CUDA NOT FOUND") + endif() +endif() + +if (${BUILD_MATLAB_WRAPPER}) + if (WIN32) + install(TARGETS cilreg DESTINATION ${MATLAB_DEST}) + if (CUDA_FOUND) + install(TARGETS cilregcuda DESTINATION ${MATLAB_DEST}) + endif() + endif() +endif() diff --git a/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.c b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.c new file mode 100755 index 0000000..cbce96a --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.c @@ -0,0 +1,668 @@ +/* + * This work is part of the Core Imaging Library developed by + * Visual Analytics and Imaging System Group of the Science Technology + * Facilities Council, STFC + * + * Copyright 2017 Daniil Kazantsev + * Copyright 2017 Srikanth Nagella, Edoardo Pasca + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <malloc.h> +#include "TNV_core.h" + +#define BLOCK 32 +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef _Float16 floatxx; // Large arrays, allways float16 if we go mixed-precision. +//typedef _Float16 floatyy; // Small arrays which we can do both ways. +typedef float floatyy; + +typedef struct { + int offY, stepY, copY; + floatxx *Input, *u, *qx, *qy, *gradx, *grady, *div; + floatyy *div0, *udiff0; + floatyy *gradxdiff, *gradydiff, *ubarx, *ubary, *udiff; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->qx); + free(ctx->qy); + free(ctx->gradx); + free(ctx->grady); + free(ctx->div); + + free(ctx->div0); + free(ctx->udiff0); + + free(ctx->gradxdiff); + free(ctx->gradydiff); + free(ctx->ubarx); + free(ctx->ubary); + free(ctx->udiff); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + +// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*(stepY+1)*padZ); + long DimRow = (long)(dimX * padZ); + long DimCell = (long)(padZ); + + // Auxiliar vectors + ctx->Input = memalign(64, Dim1Total * sizeof(floatxx)); + ctx->u = memalign(64, Dim1Total * sizeof(floatxx)); + ctx->qx = memalign(64, DimTotal * sizeof(floatxx)); + ctx->qy = memalign(64, DimTotal * sizeof(floatxx)); + ctx->gradx = memalign(64, DimTotal * sizeof(floatxx)); + ctx->grady = memalign(64, DimTotal * sizeof(floatxx)); + ctx->div = memalign(64, Dim1Total * sizeof(floatxx)); + + ctx->div0 = memalign(64, DimRow * sizeof(floatyy)); + ctx->udiff0 = memalign(64, DimRow * sizeof(floatyy)); + + ctx->gradxdiff = memalign(64, DimCell * sizeof(floatyy)); + ctx->gradydiff = memalign(64, DimCell * sizeof(floatyy)); + ctx->ubarx = memalign(64, DimCell * sizeof(floatyy)); + ctx->ubary = memalign(64, DimCell * sizeof(floatyy)); + ctx->udiff = memalign(64, DimCell * sizeof(floatyy)); + + if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) { + fprintf(stderr, "Error allocating memory\n"); + exit(-1); + } + + return 0; +} + +static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + +// printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY); + + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + memset(ctx->u, 0, Dim1Total * sizeof(floatxx)); + memset(ctx->qx, 0, DimTotal * sizeof(floatxx)); + memset(ctx->qy, 0, DimTotal * sizeof(floatxx)); + memset(ctx->gradx, 0, DimTotal * sizeof(floatxx)); + memset(ctx->grady, 0, DimTotal * sizeof(floatxx)); + memset(ctx->div, 0, Dim1Total * sizeof(floatxx)); + + for(k=0; k<dimZ; k++) { + for(j=0; j<copY; j++) { + for(i=0; i<dimX; i++) { + ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i]; + ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i]; + } + } + } + + return 0; +} + +static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + for(k=0; k<dimZ; k++) { + for(j=0; j<stepY; j++) { + for(i=0; i<dimX; i++) { + tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k]; + } + } + } + + return 0; +} + + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int stepY = ctx->stepY; + int copY = ctx->copY; + int padZ = tnv_ctx->padZ; + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + memset(ctx->u, 0, Dim1Total * sizeof(floatxx)); + memset(ctx->qx, 0, DimTotal * sizeof(floatxx)); + memset(ctx->qy, 0, DimTotal * sizeof(floatxx)); + memset(ctx->gradx, 0, DimTotal * sizeof(floatxx)); + memset(ctx->grady, 0, DimTotal * sizeof(floatxx)); + memset(ctx->div, 0, Dim1Total * sizeof(floatxx)); + + return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { + long i, j, k, l, m; + + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + floatxx *Input = ctx->Input; + floatxx *u = ctx->u; + floatxx *qx = ctx->qx; + floatxx *qy = ctx->qy; + floatxx *gradx = ctx->gradx; + floatxx *grady = ctx->grady; + floatxx *div = ctx->div; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0f; + float resdual1 = 0.0f; + float resdual2 = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + floatyy qxdiff; + floatyy qydiff; + floatyy divdiff; + floatyy *gradxdiff = ctx->gradxdiff; + floatyy *gradydiff = ctx->gradydiff; + floatyy *ubarx = ctx->ubarx; + floatyy *ubary = ctx->ubary; + floatyy *udiff = ctx->udiff; + + floatyy *udiff0 = ctx->udiff0; + floatyy *div0 = ctx->div0; + + + j = 0; { +# define TNV_LOOP_FIRST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_FIRST_J + } + + + + for(int j = 1; j < (copY - 1); j++) { + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + } + + for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) { + for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2 ++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { + i = i1 + i2; + + if (i == (dimX - 1)) break; + if (j == (copY - 1)) { j2 = BLOCK; break; } +# include "TNV_core_loop.h" + } + } + } // i + + } + + for(int j = 1; j < (copY - 1); j++) { + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } + } + + + + for (j = copY - 1; j < stepY; j++) { +# define TNV_LOOP_LAST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_LAST_J + } + + + + ctx->resprimal = resprimal; + ctx->resdual = resdual1 + resdual2; + ctx->product = product; + ctx->unorm = unorm; + ctx->qnorm = qnorm; + + return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { + int i, off, size, err; + + if (sched) return; + + tnv_ctx.dimX = dimX; + tnv_ctx.dimY = dimY; + tnv_ctx.dimZ = dimZ; + // Padding seems actually slower +// tnv_ctx.padZ = dimZ; +// tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0)); + tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); + + hw_sched_init(); + + int threads = hw_sched_get_cpu_count(); + if (threads > dimY) threads = dimY/2; + + int step = dimY / threads; + int extra = dimY % threads; + + tnv_ctx.threads = threads; + tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); + for (i = 0, off = 0; i < threads; i++, off += size) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; + size = step + ((i < extra)?1:0); + + ctx->offY = off; + ctx->stepY = size; + + if (i == (threads-1)) ctx->copY = ctx->stepY; + else ctx->copY = ctx->stepY + 1; + } + + sched = hw_sched_create(threads); + if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. + */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ + int err; + int iter; + int i,j,k,l,m; + + lambda = 1.0f/(2.0f*lambda); + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters + float tau = 0.5f; + float sigma = 0.5f; + float theta = 1.0f; + + // Backtracking parameters + float s = 1.0f; + float gamma = 0.75f; + float beta = 0.95f; + float alpha0 = 0.2f; + float alpha = alpha0; + float delta = 1.5f; + float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; + + int padZ = tnv_ctx.padZ; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + + // Apply Primal-Dual Hybrid Gradient scheme + float residual = fLarge; + int started = 0; + for(iter = 0; iter < maxIter; iter++) { + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float divtau = 1.0f / tau; + + tnv_ctx.sigma = sigma; + tnv_ctx.tau = tau; + tnv_ctx.theta = theta; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + + // border regions + for (j = 1; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + + m = (ctx0->stepY - 1) * dimX * padZ; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + floatyy divdiff = ctx->div0[l] - ctx->div[l]; + floatyy udiff = ctx->udiff0[l]; + + ctx->div[l] -= ctx0->qy[l + m]; + ctx0->div[m + l + dimX*padZ] = ctx->div[l]; + ctx0->u[m + l + dimX*padZ] = ctx->u[l]; + + divdiff += ctx0->qy[l + m]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + + { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + floatyy divdiff = ctx->div0[l] - ctx->div[l]; + floatyy udiff = ctx->udiff0[l]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + + for (j = 0; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + resprimal += ctx->resprimal; + resdual += ctx->resdual; + product += ctx->product; + unorm += ctx->unorm; + qnorm += ctx->qnorm; + } + + residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); + float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); + float dual_dot_delta = resdual * s * delta; + float dual_div_delta = (resdual * s) / delta; +// printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + + if(b > 1) { + + // Decrease step-sizes to fit balancing principle + tau = (beta * tau) / b; + sigma = (beta * sigma) / b; + alpha = alpha0; + + if (started) { + fprintf(stderr, "\n\n\nWARNING: Back-tracking is required in the middle of iterative optimization! We CAN'T do it in the fast version. The standard TNV recommended\n\n\n"); + } else { + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } + } + } else { + started = 1; + if(resprimal > dual_dot_delta) { + // Increase primal step-size and decrease dual step-size + tau = tau / (1.0f - alpha); + sigma = sigma * (1.0f - alpha); + alpha = alpha * eta; + } else if(resprimal < dual_div_delta) { + // Decrease primal step-size and increase dual step-size + tau = tau * (1.0f - alpha); + sigma = sigma / (1.0f - alpha); + alpha = alpha * eta; + } + } + + if (residual < tol) break; + } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); } + + + printf("Iterations stopped at %i with the residual %f \n", iter, residual); +// printf("Return: %f\n", *uT); + + return *uT; +} diff --git a/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.h b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.h new file mode 100644 index 0000000..aa050a4 --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.h @@ -0,0 +1,47 @@ +#include <math.h> +#include <stdlib.h> +#include <memory.h> +#include <stdio.h> +#include "omp.h" +#include "utils.h" +#include "CCPiDefines.h" + +#define fTiny 0.00000001f +#define fLarge 100000000.0f +#define INFNORM -1 + +#define MAX(i,j) ((i)<(j) ? (j):(i)) +#define MIN(i,j) ((i)<(j) ? (i):(j)) + +/* +This work is part of the Core Imaging Library developed by +Visual Analytics and Imaging System Group of the Science Technology +Facilities Council, STFC + +Copyright 2017 Daniil Kazantsev +Copyright 2017 Srikanth Nagella, Edoardo Pasca + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifdef __cplusplus +extern "C" { +#endif +CCPI_EXPORT float TNV_CPU_main(float *Input, float *u, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ); + +/*float PDHG(float *A, float *B, float tau, float sigma, float theta, float lambda, int p, int q, int r, float tol, int maxIter, int d_c, int d_w, int d_h);*/ +CCPI_EXPORT float proxG(float *u_upd, float *v, float *f, float taulambda, long dimX, long dimY, long dimZ); +CCPI_EXPORT float gradient(float *u_upd, float *gradx_upd, float *grady_upd, long dimX, long dimY, long dimZ); +CCPI_EXPORT float proxF(float *gx, float *gy, float *vx, float *vy, float sigma, int p, int q, int r, long dimX, long dimY, long dimZ); +CCPI_EXPORT float divergence(float *qx_upd, float *qy_upd, float *div_upd, long dimX, long dimY, long dimZ); +#ifdef __cplusplus +} +#endif
\ No newline at end of file diff --git a/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h new file mode 100644 index 0000000..86299cd --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h @@ -0,0 +1,107 @@ + { + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + l = (j * dimX + i) * padZ; + m = dimX * padZ; + + floatxx *__restrict u_next = u + l + padZ; + floatxx *__restrict u_current = u + l; + floatxx *__restrict u_next_row = u + l + m; + + + floatxx *__restrict qx_current = qx + l; + floatxx *__restrict qy_current = qy + l; + floatxx *__restrict qx_prev = qx + l - padZ; + floatxx *__restrict qy_prev = qy + l - m; + + +// __assume(padZ%16==0); + +#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { + floatyy u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads + udiff[k] = u[l + k] - u_upd; // cache 1w + u[l + k] = u_upd; // 1 write + +#ifdef TNV_LOOP_FIRST_J + udiff0[l + k] = udiff[k]; + div0[l + k] = div[l + k]; +#endif + +#ifdef TNV_LOOP_LAST_I + floatyy gradx_upd = 0; +#else + floatyy u_next_upd = (u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant; // 3 reads + floatyy gradx_upd = (u_next_upd - u_upd); // 2 reads +#endif + +#ifdef TNV_LOOP_LAST_J + floatyy grady_upd = 0; +#else + floatyy u_next_row_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; // 3 reads + floatyy grady_upd = (u_next_row_upd - u_upd); // 1 read +#endif + + gradxdiff[k] = gradx[l + k] - gradx_upd; // 1 read, cache 1w + gradydiff[k] = grady[l + k] - grady_upd; // 1 read, cache 1w + gradx[l + k] = gradx_upd; // 1 write + grady[l + k] = grady_upd; // 1 write + + ubarx[k] = gradx_upd - theta * gradxdiff[k]; // cache 1w + ubary[k] = grady_upd - theta * gradydiff[k]; // cache 1w + + floatyy vx = ubarx[k] + divsigma * qx[l + k]; // 1 read + floatyy vy = ubary[k] + divsigma * qy[l + k]; // 1 read + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < padZ; k++) { + floatyy vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r + floatyy vy = ubary[k] + divsigma * qy_current[k]; // cache 2r + floatyy gx_upd = vx * t[0] + vy * t[1]; + floatyy gy_upd = vx * t[1] + vy * t[2]; + + qxdiff = sigma * (ubarx[k] - gx_upd); + qydiff = sigma * (ubary[k] - gy_upd); + + qx_current[k] += qxdiff; // write 1 + qy_current[k] += qydiff; // write 1 + + unorm += (udiff[k] * udiff[k]); + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + floatyy div_upd = 0; + +#ifndef TNV_LOOP_FIRST_I + div_upd -= qx_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_FIRST_J + div_upd -= qy_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_LAST_I + div_upd += qx_current[k]; +#endif +#ifndef TNV_LOOP_LAST_J + div_upd += qy_current[k]; +#endif + + divdiff = div[l + k] - div_upd; // 1 read + div[l + k] = div_upd; // 1 write + +#ifndef TNV_LOOP_FIRST_J + resprimal += fabs(divtau * udiff[k] + divdiff); +#endif + + // We need to have two independent accumulators to allow gcc-autovectorization + resdual1 += fabs(divsigma * qxdiff + gradxdiff[k]); // cache 1r + resdual2 += fabs(divsigma * qydiff + gradydiff[k]); // cache 1r + product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + } + } +
\ No newline at end of file |