/*
-----------------------------------------------------------------------
Copyright 2012 iMinds-Vision Lab, University of Antwerp

Contact: astra@ua.ac.be
Website: http://astra.ua.ac.be


This file is part of the
All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").

The ASTRA Toolbox is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

The ASTRA Toolbox is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.

-----------------------------------------------------------------------
$Id$
*/

#include "util.h"
#include "darthelper.h"
#include <cassert>

namespace astraCUDA {

// CUDA kernel for the selection of a circular region of interest (ROI).
//
// Each thread handles one pixel (x, y) of a width x height image stored
// row by row with `pitch` floats per row. The pixel coordinates are derived
// as threadIdx + 16*blockIdx, so the kernel expects to be launched with
// 16x16 thread blocks.
//
// A pixel is set to 0.0f when its squared distance to the image centre
// ((width-1)/2, (height-1)/2) is larger than (radius/2)^2; all other pixels
// are left unchanged.
//
// Threads that fall outside the image (the grid is rounded up to a multiple
// of 16) return immediately, so nothing beyond the width x height area is
// ever written.
__global__ void devRoiSelect(float* in, float radius, unsigned int pitch, unsigned int width, unsigned int height)
{
	const unsigned int x = threadIdx.x + 16*blockIdx.x;
	const unsigned int y = threadIdx.y + 16*blockIdx.y;

	// The launch grid may overshoot the image: never touch memory outside it.
	if (x >= width || y >= height)
		return;

	// Offset of this pixel relative to the image centre.
	const float dx = (float)x - (width-1.0f)*0.5f;
	const float dy = (float)y - (height-1.0f)*0.5f;

	if (dx*dx + dy*dy > radius * radius * 0.25f)
	{
		// Index computed in integer arithmetic so that it stays exact for
		// large images (a float index loses precision above 2^24).
		in[y*pitch + x] = 0.0f;
	}
}

// Host wrapper applying the circular ROI selection to a host image in place.
//
// out    : host image of width x height floats; it is uploaded, processed by
//          devRoiSelect and downloaded back into the same buffer.
// radius : forwarded unchanged to devRoiSelect.
// width  : image width in pixels (also passed to the copy helpers as the
//          host row pitch).
// height : image height in pixels.
//
// NOTE(review): the results of allocateVolumeData / copyVolume* and of the
// kernel launch are not checked here; the function has no way to report
// failure to its caller.
void roiSelect(float* out, float radius, unsigned int width, unsigned int height)
{
	float* D_data;

	unsigned int pitch;
	// We abuse dims here: only the volume width and height are filled in.
	SDimensions dims;
	dims.iVolWidth = width;
	// Use the real image height, so that non-square images are allocated
	// and copied with the correct number of rows.
	dims.iVolHeight = height;

	allocateVolumeData(D_data, pitch, dims);
	copyVolumeToDevice(out, width, dims, D_data, pitch);

	// One thread per pixel, in 16x16 blocks; grid rounded up to cover the image.
	dim3 blockSize(16,16);
	dim3 gridSize((width+15)/16, (height+15)/16);
	devRoiSelect<<<gridSize, blockSize>>>(D_data, radius, pitch, width, height);

	copyVolumeFromDevice(out, width, dims, D_data, pitch);

	cudaFree(D_data);
}




// CUDA kernel for the masking of DART with a radius == 1.
//
// Each thread handles one pixel (x, y) = threadIdx + 16*blockIdx of a
// width x height image stored with `pitch` floats per row (the kernel
// expects 16x16 thread blocks).
//
// mask[pixel] is set to 1.0f when at least one neighbour of the pixel in
// `in` has a value different from the pixel itself:
//   conn == 8 : the eight surrounding pixels are inspected;
//   conn == 4 : only the pixels directly above, left, right and below.
// Any other value of conn leaves the mask untouched. Pixels that are not
// flagged are never written, and neither are the border pixels.
__global__ void devDartMask(float* mask, const float* in, unsigned int conn, unsigned int pitch, unsigned int width, unsigned int height)
{
	const unsigned int x = threadIdx.x + 16*blockIdx.x;
	const unsigned int y = threadIdx.y + 16*blockIdx.y;

	// Sacrifice the border pixels to simplify the implementation.
	// Written as "x + 1 < width" so the test cannot wrap around for width == 0.
	if (x == 0 || y == 0 || x + 1 >= width || y + 1 >= height)
		return;

	const unsigned int o2 = y*pitch+x; // On this row.
	const unsigned int o1 = o2 - pitch; // On previous row.
	const unsigned int o3 = o2 + pitch; // On next row.

	const float centre = in[o2];

	bool differs = false;
	if (conn == 8) // 8-connected
	{
		differs = in[o1 - 1] != centre || in[o1] != centre || in[o1 + 1] != centre ||
		          in[o2 - 1] != centre ||                     in[o2 + 1] != centre ||
		          in[o3 - 1] != centre || in[o3] != centre || in[o3 + 1] != centre;
	}
	else if (conn == 4) // 4-connected
	{
		// Up, left, right, down. The right neighbour is o2 + 1 (same row),
		// not the diagonal o3 + 1.
		differs = in[o1] != centre ||
		          in[o2 - 1] != centre || in[o2 + 1] != centre ||
		          in[o3] != centre;
	}

	if (differs)
		mask[o2] = 1.0f;
}


// CUDA kernel for the masking of DART with a radius > 1.
//
// Each thread handles one pixel (x, y) = threadIdx + 16*blockIdx of a
// width x height image stored with `pitch` floats per row (the kernel
// expects 16x16 thread blocks).
//
// mask[pixel] is set to 1.0f as soon as one pixel of the inspected
// neighbourhood in `in` differs from the centre pixel:
//   conn == 8 : the full (2*radius+1) x (2*radius+1) window;
//   conn == 4 : the horizontal and the vertical line of length 2*radius+1
//               through the pixel.
// Any other value of conn leaves the mask untouched. Pixels closer than
// `radius` to the image border are skipped and never written.
__global__ void devDartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height)
{
	const unsigned int x = threadIdx.x + 16*blockIdx.x;
	const unsigned int y = threadIdx.y + 16*blockIdx.y;

	// Nothing to do without a neighbourhood, or when the image is too small
	// to contain a single full window. Checking this first keeps the
	// unsigned subtractions below from wrapping around.
	if (radius == 0 || width <= radius || height <= radius)
		return;

	// Sacrifice the border pixels to simplify the implementation.
	if (x < radius || x >= width - radius || y < radius || y >= height - radius)
		return;

	const int r = radius;

	// o2: index of the current center pixel
	const unsigned int o2 = y*pitch+x;
	const float centre = in[o2];

	if (conn == 8) // 8-connected
	{
		for (int row = -r; row <= r; row++)
		{
			// o1: index of pixel (x, y+row)
			const unsigned int o1 = (y+row)*pitch+x;
			for (int col = -r; col <= r; col++)
			{
				if (in[o1 + col] != centre) {mask[o2] = 1.0f; return;}
			}
		}
	}
	else if (conn == 4) // 4-connected
	{
		// horizontal
		for (int col = -r; col <= r; col++)
		{
			if (in[o2 + col] != centre) {mask[o2] = 1.0f; return;}
		}

		// vertical
		for (int row = -r; row <= r; row++)
		{
			const unsigned int o1 = (y+row)*pitch+x;
			if (in[o1] != centre) {mask[o2] = 1.0f; return;}
		}
	}
}


// CUDA kernel for the masking of ADART with a radius == 1.
//
// One thread per pixel, with coordinates threadIdx + 16*blockIdx (16x16
// thread blocks expected). The direct neighbours of the pixel in `in` are
// visited row by row, left to right; every neighbour whose value differs
// from the centre pixel decrements `threshold`, and mask[pixel] is set to
// 1.0f once the counter reaches zero.
//   conn == 8 : all eight neighbours are visited;
//   conn == 4 : only the neighbours above, left, right and below.
// Other values of conn, and the border pixels, leave the mask untouched.
__global__ void devADartMask(float* mask, const float* in, unsigned int conn, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height)
{
	const unsigned int px = threadIdx.x + 16*blockIdx.x;
	const unsigned int py = threadIdx.y + 16*blockIdx.y;

	// Border pixels are skipped to keep the neighbour accesses simple.
	if (px == 0 || px >= width - 1 || py == 0 || py >= height - 1)
		return;

	// Only the two supported connectivities do any work.
	if (conn != 8 && conn != 4)
		return;

	const unsigned int centreIdx = py*pitch + px;
	const float centreVal = in[centreIdx];
	unsigned int remaining = threshold;

	for (int dy = -1; dy <= 1; ++dy)
	{
		for (int dx = -1; dx <= 1; ++dx)
		{
			// The centre pixel itself is not a neighbour.
			if (dx == 0 && dy == 0)
				continue;
			// 4-connectivity ignores the diagonal neighbours.
			if (conn == 4 && dx != 0 && dy != 0)
				continue;

			const unsigned int neighbourIdx = centreIdx + dy*pitch + dx;
			if (in[neighbourIdx] != centreVal)
			{
				--remaining;
				if (remaining == 0)
				{
					mask[centreIdx] = 1.0f;
					return;
				}
			}
		}
	}
}


// CUDA kernel for the masking of ADART with a radius > 1.
//
// One thread per pixel, with coordinates threadIdx + 16*blockIdx (16x16
// thread blocks expected). Every inspected pixel of `in` whose value
// differs from the centre pixel decrements `threshold`; mask[pixel] is set
// to 1.0f once the counter reaches zero.
//   conn == 8 : the full (2*radius+1) x (2*radius+1) window is inspected;
//   conn == 4 : the horizontal and the vertical line of length 2*radius+1
//               through the pixel are inspected.
// Other values of conn leave the mask untouched. Pixels closer than
// `radius` to the image border are skipped and never written.
__global__ void devADartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height)
{
	const unsigned int x = threadIdx.x + 16*blockIdx.x;
	const unsigned int y = threadIdx.y + 16*blockIdx.y;

	// Nothing to do without a neighbourhood, or when the image is too small
	// to contain a single full window. Checking this first keeps the
	// unsigned subtractions below from wrapping around.
	if (radius == 0 || width <= radius || height <= radius)
		return;

	// Sacrifice the border pixels to simplify the implementation.
	if (x < radius || x >= width - radius || y < radius || y >= height - radius)
		return;

	const int r = radius;

	// o2: index of the current center pixel
	const unsigned int o2 = y*pitch+x;
	const float centre = in[o2];

	if (conn == 8)
	{
		for (int row = -r; row <= r; row++)
		{
			// o1: index of pixel (x, y+row)
			const unsigned int o1 = (y+row)*pitch+x;
			for (int col = -r; col <= r; col++)
			{
				if (in[o1+col] != centre && --threshold == 0) {mask[o2] = 1.0f; return;}
			}
		}
	}
	else if (conn == 4)
	{
		// horizontal
		for (int col = -r; col <= r; col++)
		{
			if (in[o2+col] != centre && --threshold == 0) {mask[o2] = 1.0f; return;}
		}

		// vertical
		for (int row = -r; row <= r; row++)
		{
			const unsigned int o1 = (y+row)*pitch+x;
			if (in[o1] != centre && --threshold == 0) {mask[o2] = 1.0f; return;}
		}
	}
}


// Host wrapper computing the DART / ADART boundary mask of a segmentation.
//
// mask         : host output buffer of width x height floats.
// segmentation : host input image of width x height floats.
// conn         : connectivity forwarded to the kernels (they act on 4 and 8).
// radius       : neighbourhood radius forwarded to the kernels.
// threshold    : number of differing neighbours required to flag a pixel.
// width/height : image size in pixels (width is also passed to the copy
//                helpers as the host row pitch).
//
// The kernel is chosen from (threshold, radius):
//   threshold == 1, radius == 1 -> devDartMask
//   threshold  > 1, radius == 1 -> devADartMask
//   threshold == 1, radius  > 1 -> devDartMaskRadius
//   anything else               -> devADartMaskRadius
//
// The device mask is passed through zeroVolumeData before the kernel runs;
// the kernels only ever write 1.0f into flagged pixels.
//
// NOTE(review): the results of the allocation / copy helpers and of the
// kernel launch are not checked; the function cannot report failure.
void dartMask(float* mask, const float* segmentation, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int width, unsigned int height)
{
	float* D_segmentationData;
	float* D_maskData;

	unsigned int pitch;
	// We abuse dims here: only the volume width and height are filled in.
	SDimensions dims;
	dims.iVolWidth = width;
	// Use the real image height, so that non-square images are allocated
	// and copied with the correct number of rows.
	dims.iVolHeight = height;

	allocateVolumeData(D_segmentationData, pitch, dims);
	copyVolumeToDevice(segmentation, width, dims, D_segmentationData, pitch);

	allocateVolumeData(D_maskData, pitch, dims);
	zeroVolumeData(D_maskData, pitch, dims);

	// One thread per pixel, in 16x16 blocks; grid rounded up to cover the image.
	dim3 blockSize(16,16);
	dim3 gridSize((width+15)/16, (height+15)/16);

	if (threshold == 1 && radius == 1)
		devDartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, pitch, width, height);
	else if (threshold > 1 && radius == 1)
		devADartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, threshold, pitch, width, height);
	else if (threshold == 1 && radius > 1)
		devDartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, pitch, width, height);
	else 
		devADartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height);

	copyVolumeFromDevice(mask, width, dims, D_maskData, pitch);

	cudaFree(D_segmentationData);
	cudaFree(D_maskData);

}


// CUDA kernel for DART smoothing with a radius > 1.
//
// One thread per pixel, with coordinates threadIdx + 16*blockIdx (16x16
// thread blocks expected). For every pixel at least `radius` away from the
// image border:
//
//   out = (1 - b) * in + b * mean(neighbours)
//
// where the neighbours are all pixels of the (2*radius+1) x (2*radius+1)
// window except the centre, i.e. 4*radius*(radius+1) pixels. For radius 1
// this is 8 neighbours weighted b/8, matching devDartSmoothing.
// Pixels closer than `radius` to the border are not written.
__global__ void devDartSmoothingRadius(float* out, const float* in, float b, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height)
{
	const unsigned int x = threadIdx.x + 16*blockIdx.x;
	const unsigned int y = threadIdx.y + 16*blockIdx.y;

	// A zero radius has no neighbours (and would divide by zero below); an
	// image not larger than the radius has no full window. Checking this
	// first also keeps the unsigned subtractions below from wrapping around.
	if (radius == 0 || width <= radius || height <= radius)
		return;

	// Sacrifice the border pixels to simplify the implementation.
	if (x < radius || x >= width - radius || y < radius || y >= height - radius)
		return;

	const unsigned int o2 = y*pitch+x;
	const int r = radius;

	// Start at minus the centre value so that, after summing the complete
	// window (centre included), res holds the sum of the neighbours only.
	float res = -in[o2];

	// All 2r+1 rows of the window, the last one included.
	for (int row = -r; row <= r; row++)
	{
		const unsigned int o1 = (y+row)*pitch+x;
		for (int col = -r; col <= r; col++)
		{
			res += in[o1+col];
		}
	}

	// Average over the 4*r*(r+1) neighbours. The parentheses matter:
	// "b / 4*r*(r+1)" would evaluate as (b/4)*r*(r+1).
	res *= b / (float)(4*r*(r+1));
	res += (1.0f-b) * in[o2];

	out[o2] = res;
}


// CUDA kernel for DART smoothing with a radius == 1.
//
// One thread per pixel, with coordinates threadIdx + 16*blockIdx (16x16
// thread blocks expected). For every non-border pixel:
//
//   out = (1 - b) * in + b * 0.125 * (sum of the 8 surrounding pixels)
//
// Border pixels are not written.
__global__ void devDartSmoothing(float* out, const float* in, float b, unsigned int pitch, unsigned int width, unsigned int height)
{
	const unsigned int col = threadIdx.x + 16*blockIdx.x;
	const unsigned int row = threadIdx.y + 16*blockIdx.y;

	// Border pixels are skipped to keep the neighbour accesses simple.
	if (col == 0 || col >= width - 1 || row == 0 || row >= height - 1)
		return;

	const unsigned int here  = row*pitch + col; // centre pixel
	const unsigned int above = here - pitch;    // same column, previous row
	const unsigned int below = here + pitch;    // same column, next row

	// Accumulate the eight neighbours row by row, left to right.
	float neighbourSum = in[above - 1];
	neighbourSum += in[above];
	neighbourSum += in[above + 1];
	neighbourSum += in[here - 1];
	neighbourSum += in[here + 1];
	neighbourSum += in[below - 1];
	neighbourSum += in[below];
	neighbourSum += in[below + 1];

	out[here] = (1.0f-b) * in[here] + b * 0.125f * neighbourSum;
}


// Host wrapper applying the DART smoothing step to a host image.
//
// out          : host output buffer of width x height floats.
// in           : host input image of width x height floats.
// b            : smoothing weight forwarded to the kernels.
// radius       : neighbourhood radius; radius == 1 uses devDartSmoothing,
//                every other value uses devDartSmoothingRadius.
// width/height : image size in pixels (width is also passed to the copy
//                helpers as the host row pitch).
//
// The device output is passed through zeroVolumeData before the kernel
// runs; the kernels do not write the border pixels.
//
// NOTE(review): the results of the allocation / copy helpers and of the
// kernel launch are not checked; the function cannot report failure.
void dartSmoothing(float* out, const float* in, float b, unsigned int radius, unsigned int width, unsigned int height)
{
	float* D_inData;
	float* D_outData;

	unsigned int pitch;
	// We abuse dims here: only the volume width and height are filled in.
	SDimensions dims;
	dims.iVolWidth = width;
	// Use the real image height, so that non-square images are allocated
	// and copied with the correct number of rows.
	dims.iVolHeight = height;

	allocateVolumeData(D_inData, pitch, dims);
	copyVolumeToDevice(in, width, dims, D_inData, pitch);

	allocateVolumeData(D_outData, pitch, dims);
	zeroVolumeData(D_outData, pitch, dims);

	// One thread per pixel, in 16x16 blocks; grid rounded up to cover the image.
	dim3 blockSize(16,16);
	dim3 gridSize((width+15)/16, (height+15)/16);
	if (radius == 1)
		devDartSmoothing<<<gridSize, blockSize>>>(D_outData, D_inData, b, pitch, width, height);
	else
		devDartSmoothingRadius<<<gridSize, blockSize>>>(D_outData, D_inData, b, radius, pitch, width, height);

	copyVolumeFromDevice(out, width, dims, D_outData, pitch);

	cudaFree(D_outData);
	cudaFree(D_inData);

}



// Select the CUDA device to use.
//
// iGPUIndex : device index handed to cudaSetDevice; the value -1 skips the
//             call altogether.
// Returns true when no device switch was requested, when cudaSetDevice left
// no error behind, or when the only error is cudaErrorSetOnActiveProcess;
// false for any other error.
_AstraExport bool setGPUIndex(int iGPUIndex)
{
	// -1: no device switch is attempted.
	if (iGPUIndex == -1)
		return true;

	cudaSetDevice(iGPUIndex);
	// Fetch (and thereby clear) the error state left by the call above.
	const cudaError_t status = cudaGetLastError();

	if (status == cudaSuccess)
		return true;

	// Ignore errors caused by calling cudaSetDevice multiple times
	return status == cudaErrorSetOnActiveProcess;
}


}