/*================================================================================================
  MatVect.h
  Version 1: 12/1/2017

  Purpose: Set of routines defining matrix (Hessian) - vector multiplication

Copyright (c) Patrice Koehl.

>>> SOURCE LICENSE >>>

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

>>> END OF LICENSE >>>

================================================================================================== */

/*================================================================================================
 Includes
================================================================================================== */

#include <math.h>
#include <cstdlib>
#include <vector>
#include <pthread.h>

/*================================================================================================
 BLAS level-1 prototypes (Fortran entry points: trailing underscore in the symbol name and
 every argument passed by address)

	daxpy_ : Y   <- alpha*X + Y
	dcopy_ : Dst <- Src          (the source is the FIRST vector argument)
	dscal_ : Y   <- scale*Y
================================================================================================== */

extern "C" {
	void daxpy_(int *n, double *alpha, double *X, int *incx, double *Y, int *incy);
	void dcopy_(int *n, double *Src, int *incsrc, double *Dst, int *incdst);
	void dscal_(int *n, double *scale, double *Y, int *incy);
}

/*================================================================================================
 mat_vect_thread: pthread worker used by matVect; accumulates the contribution of one slice of
		  the list of pairs to the product of the Hessian with a single vector

	Input:
		data     : pointer to an int holding the index of this worker's slot in LBs

	Read from LBs[threadid]:
		N1, N2   : half-open range [N1, N2) of entries of ListPair handled here
		Npoint   : number of doubles cleared in Vect2
		ListPair : entries (i, j, g), read with std::get<0>, <1>, <2>
		Vect1    : input vector
		invArea  : per-index factor applied to the input values

	Written in LBs[threadid]:
		Vect2    : zeroed, then for every entry (i, j, g) of the range
		           Vect2[i] += g*(Vect1[i]*invArea[i] - Vect1[j]*invArea[j])
		           Vect2[j] -= the same amount

	Returns: always a null pointer
================================================================================================== */


void* mat_vect_thread(void* data)
{
	const int threadid = *((int *) data);
	auto &job = LBs[threadid];

	const int first  = job.N1;
	const int last   = job.N2;
	const int npoint = job.Npoint;

	// Start from a clean accumulator: matVect sums the Vect2 buffers of all the workers
	std::memset(job.Vect2, 0, npoint*sizeof(double));

	for (int k = first; k < last; ++k)
	{
		const int    row    = std::get<0>(job.ListPair[k]);
		const int    col    = std::get<1>(job.ListPair[k]);
		const double weight = std::get<2>(job.ListPair[k]);

		const double scaledRow = job.Vect1[row] * job.invArea[row];
		const double scaledCol = job.Vect1[col] * job.invArea[col];
		const double flux      = weight*(scaledRow - scaledCol);

		// Equal and opposite contributions on the two ends of the pair
		job.Vect2[row] += flux;
		job.Vect2[col] -= flux;
	}
	return nullptr;
}
/*================================================================================================
 matVect: computes the product of the Hessian with ONE vector, using the form in which the
	  Hessian is written as a sum of tensor products; the list of pairs is split between
	  nthreads pthreads

	Input:
		N        : length of X and Y
		X        : input vector (handed to every worker through LBs[i].Vect1)
		nthreads : number of workers; used as a divisor, so it must not be 0

	Output:
		Y        : for each k < N, LBs[0].invArea[k] times the sum over all the workers
		           of LBs[i].Vect2[k]

	Uses the file-level LBs, threads and threadids, which are declared elsewhere.
	NOTE(review): the workers clear LBs[i].Npoint entries of Vect2 while N entries are summed
	here; this presumably relies on N == Npoint -- confirm against the caller.
================================================================================================== */

void matVect(int N, double *X, double *Y, int nthreads)
{
	const int Npair = LBs[0].ListPair.size();
	const int chunk = Npair / nthreads;
	double one = 1.0;
	int stride = 1;

/*      ==========================================================================================
	Launch the workers: equal slices of the list of pairs, the last worker also takes the
	remainder of the integer division
        ========================================================================================== */

	for (int t = 0; t < nthreads; ++t)
	{
		const int begin = t*chunk;
		const int end   = (t == nthreads-1) ? Npair : begin + chunk;

		threadids[t] = t;
		LBs[t].N1    = begin;
		LBs[t].N2    = end;
		LBs[t].Vect1 = X;

		pthread_create(&threads[t], nullptr, mat_vect_thread, (void*) &threadids[t]);
	}

/*      ==========================================================================================
	Wait for each worker in turn and add its partial result to Y
        ========================================================================================== */

	memset(Y, 0, N*sizeof(double));
	for (int t = 0; t < nthreads; ++t)
	{
		pthread_join(threads[t], nullptr);
		daxpy_(&N, &one, LBs[t].Vect2, &stride, Y, &stride);
	}

/*      ==========================================================================================
	Final scaling of the result by invArea
        ========================================================================================== */

	for (int k = 0; k < N; ++k)
	{
		Y[k] *= LBs[0].invArea[k];
	}
}

/*================================================================================================
 mat_nvect_thread: pthread worker used by matMultiVect; computes the product of the Hessian with
		   a block of vectors, going through the WHOLE list of pairs

	Input:
		data     : pointer to an int holding the index of this worker's slot in LBs

	Read from LBs[threadid]:
		ListPair : entries (i, j, g), read with std::get<0>, <1>, <2>
		Nvect    : number of vectors in the block
		Npoint   : distance (in doubles) between two consecutive vectors of the block
		Vect1    : input block, vector v starts at Vect1[v*Npoint]
		invArea  : per-index factor applied to the input values

	Written in LBs[threadid]:
		Vect3    : output block (same layout as Vect1), zeroed and then, for each vector v
		           and each entry (i, j, g):
		           Vect3[v*Npoint+i] += g*(Vect1[v*Npoint+i]*invArea[i] - Vect1[v*Npoint+j]*invArea[j])
		           Vect3[v*Npoint+j] -= the same amount
		           No final scaling by invArea is applied here.

	Returns: always a null pointer
================================================================================================== */


void* mat_nvect_thread(void* data)
{
	const int threadid = *((int *) data);
	auto &job = LBs[threadid];

	const int Npair  = job.ListPair.size();
	const int Nvect  = job.Nvect;
	const int Npoint = job.Npoint;

	std::memset(job.Vect3, 0, Nvect*Npoint*sizeof(double));

	for (int k = 0; k < Npair; ++k)
	{
		const int    row    = std::get<0>(job.ListPair[k]);
		const int    col    = std::get<1>(job.ListPair[k]);
		const double weight = std::get<2>(job.ListPair[k]);

		// base is the start of the current vector inside the block
		for (int v = 0, base = 0; v < Nvect; ++v, base += Npoint)
		{
			const double scaledRow = job.Vect1[base+row] * job.invArea[row];
			const double scaledCol = job.Vect1[base+col] * job.invArea[col];
			const double flux      = weight*(scaledRow - scaledCol);

			job.Vect3[base+row] += flux;
			job.Vect3[base+col] -= flux;
		}
	}
	return nullptr;
}
/*================================================================================================
 matMultiVect: computes the product of the Hessian with multiple vectors, using the form in which
	       the Hessian is written as a sum of tensor products

	Input:
		N        : length of each vector
		Nvect    : number of vectors
		X        : input vectors stored one after the other, vector j starts at X[j*N]
		nthreads : number of pthreads; used as a divisor when Nvect >= nthreads, so it
		           must not be 0

	Output:
		Y        : result, same layout as X; every vector is scaled by LBs[0].invArea
		           exactly once

	Strategy:
		- Nvect < nthreads : each vector is processed by matVect (threads share the pairs)
		- otherwise        : each thread processes Nvect/nthreads whole vectors with
		                     mat_nvect_thread; the remaining Nvect % nthreads vectors are
		                     processed by matVect

	Uses the file-level LBs, threads and threadids, which are declared elsewhere.
================================================================================================== */

void matMultiVect(int N, int Nvect, double *X, double *Y, int nthreads)
{
	// Too few vectors to give one to each thread: matVect splits the pairs instead and
	// already applies the final invArea scaling
	if(Nvect < nthreads) {
		for(int i = 0; i < Nvect; i++)
		{
			matVect(N, &X[i*N], &Y[i*N], nthreads);
		}
		return;
	}

	int nval   = Nvect / nthreads;	// vectors per thread
	int nblock = nthreads*nval;	// vectors handled by the mat_nvect_thread workers

	for(int i = 0; i < nthreads; i++) 
	{
		int N1 = i*nval;	// first vector of this thread's block

		threadids[i]=i;
		LBs[i].Nvect = nval;
		LBs[i].Vect1  = &X[N*N1];
		LBs[i].Vect3  = &Y[N*N1];

		pthread_create(&threads[i], NULL, mat_nvect_thread, (void*) &threadids[i]);
	}
	
/*      ==========================================================================================
	Join all the threads (to make sure they are all finished)
        ========================================================================================== */

	for (int i=0; i < nthreads; i++)
	{
		pthread_join(threads[i], NULL);
	}

/*      ==========================================================================================
	Final scaling by invArea, ONLY for the vectors produced by mat_nvect_thread (which does
	not apply it)
        ========================================================================================== */

	for(int j = 0; j < nblock; j++)
	{
		for(int i = 0; i < N; i++) Y[i+j*N] *= LBs[0].invArea[i];
	}

/*      ==========================================================================================
	Compute last few (if any). matVect applies the invArea scaling itself, so these vectors
	must not go through the scaling loop above (they used to be scaled twice)
        ========================================================================================== */

	for(int i = nblock; i < Nvect; i++)
	{
		matVect(N, &X[i*N], &Y[i*N], nthreads);
	}

	return;
}

/*================================================================================================
 chebyshevFilterSlim: filters an input vector X by an m-degree Chebyshev polynomial which dampens
 		  on an interval [a,b]
================================================================================================== */

void chebishevFilterSlim(int N, int Nvect, double *X, int m, double a, double b, double *Y, 
double *Ynew, int nthreads)
{
	int inc = 1;
	double scale;
	int M = N*Nvect;

	double width  = (b-a)/2;
	double center = (b+a)/2;

	matMultiVect(N, Nvect, X, Y, nthreads);
	scale = -1.0;
	dscal_(&M, &scale, Y, &inc);
	daxpy_(&M , &center, X, &inc, Y, &inc);
	scale = 1.0/width;
	dscal_(&M, &scale, Y, &inc);

	for(int i = 1; i < m; i++)
	{
		matMultiVect(N, Nvect, Y, Ynew, nthreads);
		scale = -1.0;
		dscal_(&M, &scale, Ynew, &inc);
		daxpy_(&M , &center, Y, &inc, Ynew, &inc);
		scale = 2.0/width;
		dscal_(&M, &scale, Ynew, &inc);
		scale = -1.0;
		daxpy_(&M , &scale, X, &inc, Ynew, &inc);

		dcopy_(&M,Y,&inc,X,&inc);
		dcopy_(&M,Ynew,&inc,Y,&inc);
	}
}
/*================================================================================================
 chebShiftFilter: filters an input vector X by an m-degree Chebyshev polynomial of the shifted
		  operator (H - shift*I), which dampens on an interval [a,b]

	Input:
		N        : length of X, Y and Ynew
		X        : input vector; OVERWRITTEN when m > 1 (holds the previous term of the
		           recurrence on exit)
		m        : degree of the polynomial; the recurrence loop runs m-1 times
		a, b     : bounds of the interval; width = (b-a)/2 is used as a divisor
		shift    : shift subtracted from the operator
		Ynew     : work array
		nthreads : forwarded to matVect

	Output:
		Y        : filtered vector

	With H the operator applied by matVect, c = (a+b)/2 and e = (b-a)/2:
		Y_1     = (c*X - (H - shift*I)*X)/e
		Y_{k+1} = 2*(c*Y_k - (H - shift*I)*Y_k)/e - Y_{k-1}        (Y_0 = X)

	The shift is applied in the degree-1 step as well as in the recurrence; previously the
	first step used the unshifted operator, which was inconsistent with the recurrence
	whenever shift != 0.
================================================================================================== */

void chebShiftFilter(int N, double *X, int m, double a, double b, double shift, double *Y, double *Ynew, int nthreads)
{
	int inc = 1;
	double scale;

	double width  = (b-a)/2;
	double center = (b+a)/2;

	// Degree 1: Y = (center*X - (H - shift*I)*X)/width
	matVect(N, X, Y, nthreads);
	scale = -shift;
	daxpy_(&N , &scale, X, &inc, Y, &inc);	// same shifted operator as in the recurrence below
	scale = -1.0;
	dscal_(&N, &scale, Y, &inc);
	daxpy_(&N , &center, X, &inc, Y, &inc);
	scale = 1.0/width;
	dscal_(&N, &scale, Y, &inc);

	for(int i = 1; i < m; i++)
	{
		// Ynew = 2*(center*Y - (H - shift*I)*Y)/width - X
		matVect(N, Y, Ynew, nthreads);
		scale = -shift;
		daxpy_(&N , &scale, Y, &inc, Ynew, &inc);
		scale = -1.0;
		dscal_(&N, &scale, Ynew, &inc);
		daxpy_(&N , &center, Y, &inc, Ynew, &inc);
		scale = 2.0/width;
		dscal_(&N, &scale, Ynew, &inc);
		scale = -1.0;
		daxpy_(&N , &scale, X, &inc, Ynew, &inc);

		// Shift the window of the recurrence: X <- Y, Y <- Ynew
		dcopy_(&N,Y,&inc,X,&inc);
		dcopy_(&N,Ynew,&inc,Y,&inc);
	}
}

/*================================================================================================
 chebyshevFilter: filters an input vector X by an m-degree Chebyshev polynomial which dampens
 		  on an interval [a,b]
================================================================================================== */

void chebishevFilter(int N, int Nvect, double *X, int m, double a, double b, double a0, double *Y, double *Ynew, int nthreads)
{
	int inc = 1;
	double scale, alpha;
	int M = N*Nvect;

	double width = (b-a)/2;
	double center = (b+a)/2;
	double sigma = width/(a0-center);
	double sigma1 = sigma;
	double sigma_new;

	matMultiVect(N, Nvect, X, Y, nthreads);
	alpha = -center;
	daxpy_(&M , &alpha, X, &inc, Y, &inc);
	scale = sigma1/width;
	dscal_(&M, &scale, Y, &inc);

	for(int i = 1; i < m; i++)
	{
		sigma_new = 1.0/(2.0/sigma1 - sigma);

		matMultiVect(N, Nvect, Y, Ynew, nthreads);
		daxpy_(&M , &alpha, Y, &inc, Ynew, &inc);
		scale = 2.0*sigma_new/width;
		dscal_(&M, &scale, Ynew, &inc);
		scale = -sigma*sigma_new;
		daxpy_(&M , &scale, X, &inc, Ynew, &inc);

		dcopy_(&M,Y,&inc,X,&inc);
		dcopy_(&M,Ynew,&inc,Y,&inc);
		sigma = sigma_new;
	}
}
