/*
-----------------------------------------------------------------------
Copyright: 2010-2015, iMinds-Vision Lab, University of Antwerp
           2014-2015, CWI, Amsterdam

Contact: astra@uantwerpen.be
Website: http://sf.net/projects/astra-toolbox

This file is part of the ASTRA Toolbox.


The ASTRA Toolbox is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

The ASTRA Toolbox is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.

-----------------------------------------------------------------------
$Id$
*/

#include <cstdio>
#include <cassert>
#include <iostream>
#include <list>

#include "util.h"
#include "arith.h"

#ifdef STANDALONE
#include "testutil.h"
#endif


typedef texture<float, 2, cudaReadModeElementType> texture2D;

static texture2D gT_FanVolumeTexture;


namespace astraCUDA {

static const unsigned g_MaxAngles = 2560;
__constant__ float gC_SrcX[g_MaxAngles];
__constant__ float gC_SrcY[g_MaxAngles];
__constant__ float gC_DetSX[g_MaxAngles];
__constant__ float gC_DetSY[g_MaxAngles];
__constant__ float gC_DetUX[g_MaxAngles];
__constant__ float gC_DetUY[g_MaxAngles];


// optimization parameters
static const unsigned int g_anglesPerBlock = 16;
static const unsigned int g_detBlockSize = 32;
static const unsigned int g_blockSlices = 64;

static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned int pitch, unsigned int width, unsigned int height)
{
	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
	dataArray = 0;
	cudaMallocArray(&dataArray, &channelDesc, width, height);
	cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice);

	gT_FanVolumeTexture.addressMode[0] = cudaAddressModeBorder;
	gT_FanVolumeTexture.addressMode[1] = cudaAddressModeBorder;
	gT_FanVolumeTexture.filterMode = cudaFilterModeLinear;
	gT_FanVolumeTexture.normalized = false;

	// TODO: For very small sizes (roughly <=512x128) with few angles (<=180)
	// not using an array is more efficient.
	//cudaBindTexture2D(0, gT_FanVolumeTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch);
	cudaBindTextureToArray(gT_FanVolumeTexture, dataArray, channelDesc);

	// TODO: error value?

	return true;
}

// projection for angles that are roughly horizontal
// (detector roughly vertical)
__global__ void FanFPhorizontal(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
{
	float* projData = (float*)D_projData;
	const int relDet = threadIdx.x;
	const int relAngle = threadIdx.y;

	const int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
	if (angle >= endAngle)
		return;

	const int detector = blockIdx.y * g_detBlockSize + relDet;

	if (detector < 0 || detector >= dims.iProjDets)
		return;

	const float fSrcX = gC_SrcX[angle];
	const float fSrcY = gC_SrcY[angle];
	const float fDetSX = gC_DetSX[angle];
	const float fDetSY = gC_DetSY[angle];
	const float fDetUX = gC_DetUX[angle];
	const float fDetUY = gC_DetUY[angle];

	float fVal = 0.0f;

	const float fdx = fabsf(fDetSX + detector*fDetUX + 0.5f - fSrcX);
	const float fdy = fabsf(fDetSY + detector*fDetUY + 0.5f - fSrcY);

	if (fdy > fdx)
		return;


	for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
		const float fDet = detector + (0.5f + iSubT) / dims.iRaysPerDet;

		const float fDetX = fDetSX + fDet * fDetUX;
		const float fDetY = fDetSY + fDet * fDetUY;

		// ray: y = alpha * x + beta
		const float alpha = (fSrcY - fDetY) / (fSrcX - fDetX);
		const float beta = fSrcY - alpha * fSrcX;
	
		const float fDistCorr = sqrt(alpha*alpha+1.0f) * outputScale / dims.iRaysPerDet;

		// intersect ray with first slice

		float fY = -alpha * (startSlice - 0.5f*dims.iVolWidth + 0.5f) - beta + 0.5f*dims.iVolHeight - 0.5f + 0.5f;
		float fX = startSlice + 0.5f;

		int endSlice = startSlice + g_blockSlices;
		if (endSlice > dims.iVolWidth)
			endSlice = dims.iVolWidth;

		float fV = 0.0f;
		for (int slice = startSlice; slice < endSlice; ++slice)
		{
			fV += tex2D(gT_FanVolumeTexture, fX, fY);
			fY -= alpha;
			fX += 1.0f;
		}

		fVal += fV * fDistCorr;

	}

	projData[angle*projPitch+detector] += fVal;
}


// projection for angles that are roughly vertical
// (detector roughly horizontal)
__global__ void FanFPvertical(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
{
	const int relDet = threadIdx.x;
	const int relAngle = threadIdx.y;

	const int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;

	if (angle >= endAngle)
		return;

	const int detector = blockIdx.y * g_detBlockSize + relDet;

	if (detector < 0 || detector >= dims.iProjDets)
		return;

	float* projData = (float*)D_projData;

	const float fSrcX = gC_SrcX[angle];
	const float fSrcY = gC_SrcY[angle];
	const float fDetSX = gC_DetSX[angle];
	const float fDetSY = gC_DetSY[angle];
	const float fDetUX = gC_DetUX[angle];
	const float fDetUY = gC_DetUY[angle];

	float fVal = 0.0f;

	const float fdx = fabsf(fDetSX + detector*fDetUX + 0.5f - fSrcX);
	const float fdy = fabsf(fDetSY + detector*fDetUY + 0.5f - fSrcY);

	if (fdy <= fdx)
		return;


	for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
		const float fDet = detector + (0.5f + iSubT) / dims.iRaysPerDet /*- gC_angle_offset[angle]*/;

		const float fDetX = fDetSX + fDet * fDetUX;
		const float fDetY = fDetSY + fDet * fDetUY;

		// ray: x = alpha * y + beta
		const float alpha = (fSrcX - fDetX) / (fSrcY - fDetY);
		const float beta = fSrcX - alpha * fSrcY;
	
		const float fDistCorr = sqrt(alpha*alpha+1) * outputScale / dims.iRaysPerDet;

		// intersect ray with first slice

		float fX = -alpha * (startSlice - 0.5f*dims.iVolHeight + 0.5f) + beta + 0.5f*dims.iVolWidth - 0.5f + 0.5f;
		float fY = startSlice + 0.5f;

		int endSlice = startSlice + g_blockSlices;
		if (endSlice > dims.iVolHeight)
			endSlice = dims.iVolHeight;

		float fV = 0.0f;

		for (int slice = startSlice; slice < endSlice; ++slice)
		{
			fV += tex2D(gT_FanVolumeTexture, fX, fY);
			fX -= alpha;
			fY += 1.0f;
		}

		fVal += fV * fDistCorr;

	}

	projData[angle*projPitch+detector] += fVal;
}

bool FanFP_internal(float* D_volumeData, unsigned int volumePitch,
           float* D_projData, unsigned int projPitch,
           const SDimensions& dims, const SFanProjection* angles,
           float outputScale)
{
	assert(dims.iProjAngles <= g_MaxAngles);

	cudaArray* D_dataArray;
	bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight);

	// transfer angles to constant memory
	float* tmp = new float[dims.iProjAngles];

#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)

	TRANSFER_TO_CONSTANT(SrcX);
	TRANSFER_TO_CONSTANT(SrcY);
	TRANSFER_TO_CONSTANT(DetSX);
	TRANSFER_TO_CONSTANT(DetSY);
	TRANSFER_TO_CONSTANT(DetUX);
	TRANSFER_TO_CONSTANT(DetUY);

#undef TRANSFER_TO_CONSTANT

	delete[] tmp;

	dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // region size, angles
	const unsigned int g_blockSliceSize = g_detBlockSize;

	std::list<cudaStream_t> streams;


	unsigned int blockStart = 0;
	unsigned int blockEnd = dims.iProjAngles;

	dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock,
	             (dims.iProjDets+g_blockSliceSize-1)/g_blockSliceSize); // angle blocks, regions
	cudaStream_t stream1;
	cudaStreamCreate(&stream1);
	streams.push_back(stream1);
	for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices)
		FanFPhorizontal<<<dimGrid, dimBlock, 0, stream1>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);

	cudaStream_t stream2;
	cudaStreamCreate(&stream2);
	streams.push_back(stream2);
	for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices)
		FanFPvertical<<<dimGrid, dimBlock, 0, stream2>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);

	cudaStreamDestroy(stream1);
	cudaStreamDestroy(stream2);

	cudaThreadSynchronize();

	cudaTextForceKernelsCompletion();

	cudaFreeArray(D_dataArray);

	return true;
}

bool FanFP(float* D_volumeData, unsigned int volumePitch,
           float* D_projData, unsigned int projPitch,
           const SDimensions& dims, const SFanProjection* angles,
           float outputScale)
{
	for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) {
		SDimensions subdims = dims;
		unsigned int iEndAngle = iAngle + g_MaxAngles;
		if (iEndAngle >= dims.iProjAngles)
			iEndAngle = dims.iProjAngles;
		subdims.iProjAngles = iEndAngle - iAngle;

		bool ret;
		ret = FanFP_internal(D_volumeData, volumePitch,
		                         D_projData + iAngle * projPitch, projPitch,
		                         subdims, angles + iAngle,
		                         outputScale);
		if (!ret)
			return false;
	}
	return true;
}

}

#ifdef STANDALONE

using namespace astraCUDA;

int main()
{
	float* D_volumeData;
	float* D_projData;

	SDimensions dims;
	dims.iVolWidth = 128;
	dims.iVolHeight = 128;
	dims.iProjAngles = 180;
	dims.iProjDets = 256;
	dims.fDetScale = 1.0f;
	dims.iRaysPerDet = 1;
	unsigned int volumePitch, projPitch;

	SFanProjection projs[180];

	projs[0].fSrcX = 0.0f;
	projs[0].fSrcY = 1536.0f;
	projs[0].fDetSX = 128.0f;
	projs[0].fDetSY = -512.0f;
	projs[0].fDetUX = -1.0f;
	projs[0].fDetUY = 0.0f;

#define ROTATE0(name,i,alpha) do { projs[i].f##name##X = projs[0].f##name##X * cos(alpha) - projs[0].f##name##Y * sin(alpha); projs[i].f##name##Y = projs[0].f##name##X * sin(alpha) + projs[0].f##name##Y * cos(alpha); } while(0)

	for (int i = 1; i < 180; ++i) {
		ROTATE0(Src, i, i*2*M_PI/180);
		ROTATE0(DetS, i, i*2*M_PI/180);
		ROTATE0(DetU, i, i*2*M_PI/180);
	}

#undef ROTATE0

	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
	printf("pitch: %u\n", volumePitch);

	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
	printf("pitch: %u\n", projPitch);

	unsigned int y, x;
	float* img = loadImage("phantom128.png", y, x);

	float* sino = new float[dims.iProjAngles * dims.iProjDets];

	memset(sino, 0, dims.iProjAngles * dims.iProjDets * sizeof(float));

	copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
	copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);

	float* angle = new float[dims.iProjAngles];

	for (unsigned int i = 0; i < dims.iProjAngles; ++i)
		angle[i] = i*(M_PI/dims.iProjAngles);

	FanFP(D_volumeData, volumePitch, D_projData, projPitch, dims, projs, 1.0f);

	delete[] angle;

	copySinogramFromDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);

	float s = 0.0f;
	for (unsigned int y = 0; y < dims.iProjAngles; ++y)
		for (unsigned int x = 0; x < dims.iProjDets; ++x)
			s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x];
	printf("cpu norm: %f\n", s);

	//zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
	s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
	printf("gpu norm: %f\n", s);

	saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino);


	return 0;
}
#endif