/*================================================================================================
  BlockChebDav.h
  Version 1: 12/1/2017

  Purpose: Finds the N smallest eigenvalues / eigenvectors of a Hessian that is given
           as a sum of tensors
  Method: Block Chebyshev-Davidson algorithm

  Y. Zhou and Y. Saad, A Chebyshev-Davidson algorithm for large symmetric
  eigenproblems, SIAM J. Matrix Anal. Appl., 29, 954-971 (2007)

  Y. Zhou. A block Chebyshev-Davidson method with inner-outer restart for
  large eigenvalue problems. J. Comput. Phys, 229, 9188-9200 (2010)

Copyright (c) Patrice Koehl.

>>> SOURCE LICENSE >>>

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

>>> END OF LICENSE >>>

================================================================================================== */

#ifndef _BLOCKCHEBDAV_
#define _BLOCKCHEBDAV_

/*================================================================================================
 Includes
================================================================================================== */

#include <math.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>

#include "MatVect.h"
#include "mt.h"

/*================================================================================================
  Prototypes for BLAS and LAPACK
================================================================================================== */

extern "C" {

	// Fortran BLAS / LAPACK entry points (trailing-underscore name mangling).
	// Every argument, scalars included, is passed by address; matrices are
	// expected in column-major order.

	// BLAS level 1: Y := alpha*X + Y
	void daxpy_(int * n ,double *alpha , double * X, int *incx, double * Y,int *incy);
	// BLAS level 1: Euclidean norm of X
	double dnrm2_(int * n, double * X, int *incx);
	// BLAS level 1: X := alpha*X
	void dscal_(int * n, double * alpha, double * X, int *incx);
	// BLAS level 1: Y := X
	void dcopy_(int * n, double * X, int *incx, double * Y, int *incy);
	// BLAS level 1: dot product u.v
	double ddot_(int * n, double * u, int * incu, double * v, int *incv);

	// BLAS level 2: Y := alpha*op(A)*X + beta*Y, with op selected by trans ('N' or 'T')
	void dgemv_(char * trans, int * m, int * n, double * alpha, double *A,
		int *lda, double * X, int * incx, double *beta, double * Y, int * incy);

	// BLAS level 3: C := alpha*op(A)*op(B) + beta*C
	void dgemm_(char * transa, char * transb, int * m, int * n, int * k,
		double * alpha, double * A, int * lda,
		double * B, int * ldb, double * beta, double * C, int * ldc);

	// LAPACK: eigenvalues (W, ascending) and optionally eigenvectors (returned in A) of a
	// real symmetric matrix, divide-and-conquer variant. LWORK = LIWORK = -1 requests a
	// workspace-size query; INFO = 0 signals success.
	void dsyevd_(char * JOBZ, char * UPLO, int *N, double *A, int *LDA, double *W, 
	double *WORK, int *LWORK, int *IWORK, int *LIWORK, int *INFO);

}


/*================================================================================================
  Define a class for computing eigenvectors / eigenvalues of a symmetric sparse matrix
  stored as sum of outer products
================================================================================================== */

  class blockChebDav {

	// Block Chebyshev-Davidson eigensolver. The matrix itself is never passed in:
	// the implementation only touches it through the matrix-vector helpers it calls
	// (matVect / matMultiVect / chebishevFilter, declared outside this class).

	public:

		// driver for computing some eigenpairs of a symmetric sparse matrix stored 
		// as sum of vector outer products.
		//   N        : number of rows of the matrix
		//   NE       : number of eigenpairs requested
		//   eigVal   : output, converged eigenvalues
		//   eigVect  : output, corresponding eigenvectors (N values per vector)
		//   k_conv   : output, number of eigenpairs that converged
		//   nthreads : number of threads handed to the matrix-vector helpers
		void eigenpairs(int N, int NE, double *eigVal, double *eigVect, int *k_conv, 
		int nthreads);

	private:

		// check orthogonality of current vectors: returns the Frobenius norm of V^T V - I
		double checkOrtho(int N, int M, double *V, double *work);

		// GramSchmidt orthogonalization of column M of V against columns 0..M-1
		double gramSchmidt(int N, int M, double *V, double *coef, MersenneTwister *mt);

		// Gram Schmidt over a set of Nadd vectors starting at column M
		void blockGramSchmidt(int N, int M, int Nadd, double *V, double *coef, 
		MersenneTwister *mt);
	
		// ortho-normalizes the Madd columns of V starting at column M, one column at a
		// time (each column goes through gramSchmidt)
		void orthoNormalize(int N, int M, int Madd, double *V, double *coef, 
		MersenneTwister *mt);

		// prepare the Rayleigh quotient matrix for next iteration
		void updateH(int Nlow, int Nupper, double *D, double *H);

		// init parameters based on size of the problem, and allocate the work arrays
		void init(int N, int NE);

		// initialize a random vector
		void randomVector(int N, double *Vect, MersenneTwister *mt);

		// finds the largest eigenvalue of the matrix using the power method
		double powerMethod(int N, double *Temp1, double *Temp2, double tol, int nthreads, 
		int *mvp, MersenneTwister *mt);

		// Swap two arrays
		void swapArray(int N, double *Array1, double *Array2);

		// Swap eigenpairs if not in increasing order
		int swapPair(int Ncol, int Nrow, double *eigVal, double *eigVect);

	protected:

		// Algorithm parameters; all of them are assigned in init()
		int m;         // order of Chebyshev polynomials
		int dim_max;   // maximum subspace dimension
		int act_max;   // maximum active subspace dimension
		double tol;       //     convergence tolerance
		int iter_max;  // maximum number of iterations
		int Nblock;	   // number of vectors per block
		int lwork;  // size of the auxiliary array of doubles for dsyevd
		int liwork; // size of the auxiliary array of integers for dsyevd

		// Raw work arrays; allocated with new[] in init(), cleanup is done at the end
		// of eigenpairs(). They are not initialized before init() runs.
		double *eigVal; // local array that stores eigenvalues
		double *eigVect; // local array that stores eigenvectors
		double *X, *H, *D; // local work arrays of doubles
		int *iwork;	 // work array of integers, for dsyevd
		double *dwork;	 // work array of doubles, for dsyevd

  };

/*================================================================================================
 eigenpairs : find the NE smallest eigenvalues of a Hessian, using the block
	      Chebyshev-Davidson algorithm

	    Full details in:
  		Y. Zhou and Y. Saad, A Chebyshev-Davidson algorithm for large symmetric
  		eigenproblems, SIAM J. Matrix Anal. Appl., 29, 954-971 (2007)

================================================================================================== */

  void blockChebDav::eigenpairs(int N, int NE, double *Val, double *Vect, int *k_conv,
		int nthreads)
  {

/*================================================================================================
	Input:
		N         : number of rows in the matrix
		NE        : number of eigenpairs to be computed
		nthreads  : number of threads for parallel computation
	Output:
		Val       : the converged eigenvalues (at most NE of them are copied)
		Vect      : the corresponding eigenvectors, N values per eigenvector
		k_conv    : # of eigen pairs that have converged. It is always set on return,
		            also when the iteration limit is reached or the dense eigensolver
		            fails, so callers can detect partial convergence (k_conv < NE)
	Note:
		all work arrays are allocated by init() and released before returning
================================================================================================== */

/*================================================================================================
	Declare some variables
================================================================================================== */

	int inc = 1;
	int k_c, k_found;
	int k_sub, k_act, k_ri;
	int h_size, n_mid;
	int info;
	int i_count, no_swap, n_swap;

	double norm;
	double alpha, beta, scale;
	double upperb, lowerb, a0;

	char Trans   = 'T';
	char NoTrans = 'N';
	char U       = 'U';
	char V       = 'V';

	int mvp = 0;		// running count of matrix-vector products

/*================================================================================================
	Initialize
================================================================================================== */

	init(N, NE);
	int NNblock = N*Nblock;

	MersenneTwister *mt = new MersenneTwister();
	unsigned long seed = (unsigned long) time(NULL);
	mt-> init_genrand(seed);

/*================================================================================================
	Initialize procedure
================================================================================================== */

	// estimate of the largest eigenvalue: upper end of the interval to be damped
	upperb = powerMethod(N, X, dwork, tol, nthreads, &mvp, mt);

	randomVector(NNblock, X, mt);

	lowerb = upperb/4;
	a0     = 0;

	// size the active subspace is cut back to at an inner restart
	k_ri = std::max(act_max/2, act_max - 3*Nblock);

	k_sub = 0;	// total # of basis vectors (converged + active)
	k_act = 0;	// # of vectors in the active subspace
	k_c   = 0;	// # of converged eigenpairs

/*================================================================================================
	Iterate until we have enough eigenpairs...
================================================================================================== */

	beta = 0.0;
	alpha = 1.0;
	int M;

	for(int iter = 0; iter < iter_max; iter++)
	{

		if(iter % 10 == 0) {
			std::cout << "Iter #: " << iter << "# of eigenvals: " << k_c << std::endl;
		}

/*================================================================================================
		Apply polynomial filtering on current estimate of eigenvector
================================================================================================== */

		chebishevFilter(N, Nblock, X, m, lowerb, upperb, a0, &eigVect[N*k_sub], dwork, nthreads);

		mvp = mvp + m*Nblock;
		
/*================================================================================================
		Orthonormalize against vectors in the current sub-space
		(note that those vectors are expected to be orthonormal)
================================================================================================== */

		orthoNormalize(N, k_sub, Nblock, eigVect, dwork, mt);

/*================================================================================================
		Build new corresponding column in matrix H
		(note that we do not fill in the corresponding row, as dsyevd only reads the
		upper diagonal part of the matrix)
================================================================================================== */

		matMultiVect(N, Nblock, &eigVect[k_sub*N], dwork, nthreads);
		mvp = mvp + Nblock;

		k_act = k_act + Nblock;
		k_sub = k_sub + Nblock;
		h_size = k_act;
		dgemm_(&Trans, &NoTrans, &k_act, &Nblock, &N, &alpha, &eigVect[N*k_c], &N, 
		dwork, &N, &beta, &H[k_act*(k_act-Nblock)], &k_act);

		dsyevd_(&V, &U, &k_act, H, &k_act, D, dwork, &lwork, iwork, &liwork, &info);

		// H and D are not usable if the dense eigensolver failed: stop here and
		// report the eigenpairs that had converged so far
		if(info != 0) {
			std::cerr << "blockChebDav::eigenpairs: dsyevd failed (info = " << info
				  << ") at iteration " << iter << std::endl;
			break;
		}

/*================================================================================================
		Restart if the active subspace becomes too big (Inner restart)
================================================================================================== */

		if(k_act+Nblock >= act_max) {
			k_act = k_ri;
			k_sub = k_act + k_c;
		}

/*================================================================================================
		Perform Ritz-Raleigh refinement
		(the active basis is copied to dwork, then rotated by the eigenvectors of H)
================================================================================================== */

		M = N*h_size;
		dcopy_(&M, &eigVect[N*k_c], &inc, dwork, &inc);
		dgemm_(&NoTrans, &NoTrans, &N, &k_act, &h_size, &alpha, dwork,
		&N, H, &h_size, &beta, &eigVect[N*k_c], &N);

/*================================================================================================
		Check for convergence of a few current eigenvectors
		(stop at the first Ritz pair whose residual is too large)
================================================================================================== */

		k_found = 0;
		no_swap = 0;
//		i_count = std::max(Nblock, 3);
		i_count = Nblock;

		for (int ic = 0; ic < i_count ; ic++) {
			matVect(N, &eigVect[N*(k_c+ic)], dwork, nthreads);
			mvp = mvp + 1;
			scale = -D[ic];
			daxpy_(&N, &scale, &eigVect[N*(k_c+ic)], &inc, dwork, &inc);
			norm = ddot_(&N, dwork, &inc, dwork, &inc);
			norm = std::sqrt(norm);
			if(norm <= tol*D[h_size-1] ) {
				eigVal[k_c+k_found] = D[ic];
				n_swap = swapPair(k_c+k_found, N, eigVal, eigVect);
				k_found = k_found + 1;
				if(n_swap != 0) no_swap = 1;
			} else {
				break;
			}
		}
		k_c = k_c + k_found;

/*================================================================================================
		If we have enough eigenpairs, exit
================================================================================================== */

		if(k_c >= NE && no_swap == 0) {
			*k_conv = k_c;
			std::cout << "Iter #: " << iter << " # of eigenvals: " << k_c << std::endl;
			break;
		}

/*================================================================================================
		Shift k_act, if eigenvalues found
================================================================================================== */

		if(k_found > 0) {
			k_act = k_act - k_found;
		}

/*================================================================================================
		Restart if the whole subspace becomes too big (Outer restart)
================================================================================================== */

		if(k_sub >= dim_max) {
			k_sub = dim_max - 2*Nblock;
			k_act = k_sub - k_c;
		}

/*================================================================================================
		Update boundaries for polynomial filtering
		(new lower bound: median of the Ritz values that have not converged)
================================================================================================== */

		n_mid = (h_size-k_found)/2;
		if((h_size-k_found) % 2 == 0) {
			lowerb = (D[n_mid-1] + D[n_mid])/2;
		} else {
			lowerb = D[n_mid];
		}
		if(a0 > D[0]) a0 = std::abs(D[0]);

/*================================================================================================
		Prepare for next iteration
================================================================================================== */

		dcopy_(&NNblock, &eigVect[N*k_c], &inc, X, &inc);
		updateH(k_found, k_act + Nblock, D, H);

	}

/*================================================================================================
	Report the number of converged pairs in every case (previously *k_conv was left
	untouched when the loop ended without reaching NE converged pairs)
================================================================================================== */

	*k_conv = k_c;

/*================================================================================================
	Transfer eigenvalues / eigenvectors
================================================================================================== */

	for(int i = 0; i < std::min(*k_conv, NE); i++) {
		Val[i] = eigVal[i];
	}
	for(int i = 0; i < N*std::min(*k_conv, NE); i++) {
		Vect[i] = eigVect[i];
	}

/*================================================================================================
	Clean up all temporary arrays, and the random number generator
================================================================================================== */

	delete [] eigVal; delete [] eigVect;
	delete [] X; delete [] H; delete [] D;
	delete [] dwork; 
	delete [] iwork;
	delete mt;

	// do not leave dangling pointers behind in the object
	eigVal = nullptr; eigVect = nullptr;
	X = nullptr; H = nullptr; D = nullptr;
	dwork = nullptr;
	iwork = nullptr;

}

/*================================================================================================
  Check orthogonality
================================================================================================== */


double blockChebDav::checkOrtho(int N, int M, double *V, double *work)
{
	// Measures how far the M columns of V (N rows, stored column by column) are
	// from being orthonormal: returns the Frobenius norm of (V^T V - I).
	// work is overwritten and must hold at least M*M doubles.

	char opT = 'T';
	char opN = 'N';
	double one  = 1.0;
	double zero = 0.0;
	int stride = 1;

	// work = V^T * V, the M x M Gram matrix
	dgemm_(&opT, &opN, &M, &M, &N, &one, V, &N, V, &N, &zero, work, &M);

	// remove the identity: only the diagonal changes
	for(int col = 0; col < M; ++col) {
		work[col*M + col] -= 1.0;
	}

	// Frobenius norm = 2-norm of the matrix seen as a vector of M*M values
	int nElem = M*M;
	return std::sqrt(ddot_(&nElem, work, &stride, work, &stride));
}

/*================================================================================================
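 gramSchmidt: applies the Gram Schmidt ortho-normalization technique to ortho-normalize column M
                 of V (0-based) against the M preceding columns V(:,0:M-1).
                 On output, the M+1 columns V(:,0:M) should be orthonormal.

		It is important that the input columns V(:,0:M-1) are orthonormal

	Input:
		N: number of rows in matrix
		M: current # of columns that are orthonormal (also the index of the new column)
		V : the matrix, stored column by column
		coef: work array, at least M entries
		mt: random number generator, used to replace a (numerically) zero vector
	Output:
		V, the fully orthonormal matrix
	Returns:
		the norm of the new column after projection, relative to its normalized value
		before projection (1 if M = 0; 0 if the input column was numerically zero and
		M > 0, or if orthogonalization failed after 5 attempts)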

================================================================================================== */

  double blockChebDav::gramSchmidt(int N, int M, double *V, double *coef, MersenneTwister *mt)
  {

/*      ==========================================================================================
	Ortho-normalizes column M of V (N rows, columns stored one after the other) against
	columns 0..M-1, which are expected to be orthonormal already.

	Input:
		N    : number of rows in matrix
		M    : index of the column to process (= # of columns already orthonormal)
		V    : the matrix; column M is overwritten
		coef : work array, at least M entries
		mt   : random number generator, used when a vector is numerically zero
	Returns:
		1 if M = 0 (the column is only normalized);
		otherwise the norm left after removing the projection, relative to the unit-norm
		vector the accepted pass started from; this value is forced to 0 if the input
		column was numerically zero and had to be replaced by a random vector;
		0 if no pass was accepted after 5 attempts.
        ========================================================================================== */

	const double eps = 2.2e-14;	// a vector with norm below eps is treated as zero
	const double reorth = 0.5;	// a pass is accepted if more than this fraction of the norm survives
	const int max_pass = 5;		// maximum number of projection passes
	const double normY = 1.0;	// norm of the (normalized) vector entering each pass

	double normX, scal, new_norm;
	int inc=1;
	char Trans = 'T';
	char NoTrans = 'N';
	double alpha = 1;
	double alpham = -1;
	double beta0 = 0;
	double beta1 = 1;

	int current = M;
	double *Vnew = &V[current*N];

/*      ==========================================================================================
		Make sure vector is not 0; if it is, set to a random vector
		(fact = 0 then flags the result as "not derived from the input")
        ========================================================================================== */

	normX = std::sqrt(ddot_(&N, Vnew, &inc, Vnew, &inc));

	double fact = 1.0;
	if(normX <= eps) {
		fact = 0.0;
		randomVector(N, Vnew, mt);
		normX = std::sqrt(ddot_(&N, Vnew, &inc, Vnew, &inc));
	}
	scal = 1.0/normX;
	dscal_(&N, &scal, Vnew, &inc);

	if(current ==0) return normY;

/*      ==========================================================================================
		Allow for repeats (with random vectors) if orthogonalization fails twice
	        (which "should" not happen)
        ========================================================================================== */

	for(int repeat = 0; repeat < max_pass; repeat++) {

/*      ==========================================================================================
		Project new vector onto all preceeding vectors in matrix: coef = V(:,0:M-1)^T * v
        ========================================================================================== */

		dgemv_(&Trans, &N, &current, &alpha, V, &N, Vnew, &inc, &beta0, 
		coef, &inc);

/*      ==========================================================================================
		Remove projection: v = v - V(:,0:M-1) * coef
        ========================================================================================== */

		dgemv_(&NoTrans, &N, &current, &alpham, V, &N, coef, &inc, &beta1, 
		Vnew, &inc);

/*      ==========================================================================================
		Check new vector
        ========================================================================================== */

		new_norm = std::sqrt(ddot_(&N, Vnew, &inc, Vnew, &inc));

		if(new_norm > reorth*normY) {

/*      ==========================================================================================
			If pass renormalization test, normalize and we are done for this vector
        ========================================================================================== */

			scal = 1/new_norm;
			dscal_(&N, &scal, Vnew, &inc);
			return fact*new_norm/normY;
		}

/*      ==========================================================================================
		If not, repeat process: from the renormalized remainder if there is one,
		from a fresh random unit vector if the remainder is numerically zero
        ========================================================================================== */

		if(new_norm > eps*normY) {
			scal = 1/new_norm;
			dscal_(&N, &scal, Vnew, &inc);
		} else {
			randomVector(N, Vnew, mt);
			normX = std::sqrt(ddot_(&N, Vnew, &inc, Vnew, &inc));
			scal = 1.0/normX;
			dscal_(&N, &scal, Vnew, &inc);
		}
	}

	// no pass was accepted
	return 0;
  }

/*================================================================================================
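 blockGramSchmidt: applies the Gram Schmidt ortho-normalization technique to ortho-normalize the
                 Nadd columns V(:, M:M+Nadd-1) against V(:,0:M-1), and against each other.
                 On output, V(:,0:M+Nadd-1) should be orthonormal.

		It is important that the input matrix V(:, 0:M-1) is orthonormal

	Input:
		N: number of rows in matrix
		M: current # of columns that are orthonormal
		Nadd: number of new columns to ortho-normalize
		V : the matrix, stored column by column
		coef: work array, at least M*Nadd entries
		mt: random number generator, passed on to gramSchmidt
	Output:
		V, the fully orthonormal matrix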

================================================================================================== */

  void blockChebDav::blockGramSchmidt(int N, int M, int Nadd, double *V, double *coef, 
	MersenneTwister *mt)
  {

/*      ==========================================================================================
	Two rounds of: (1) remove from the Nadd new columns their projection on the M old
	columns, with two matrix products, (2) ortho-normalize the new columns among
	themselves, one at a time, with gramSchmidt.
        ========================================================================================== */

	char opT = 'T';
	char opN = 'N';
	double one      = 1;
	double minusOne = -1;
	double zero     = 0;

	double *Vnew = &V[N*M];		// first of the new columns

	// Vnew = Vnew - V(:,0:M-1) * ( V(:,0:M-1)^T * Vnew ); nothing to do if M = 0
	auto removeProjection = [&]() {
		if(M <= 0) return;
		dgemm_(&opT, &opN, &M, &Nadd, &N, &one, V,
		&N, Vnew, &N, &zero, coef, &M);
		dgemm_(&opN, &opN, &N, &Nadd, &M, &minusOne, V,
		&N, coef, &M, &one, Vnew, &N);
	};

/*      ==========================================================================================
	First round: stop as soon as one column loses more than half of its norm
        ========================================================================================== */

	removeProjection();

	for(int col = 0; col < Nadd; ++col)
	{
		if(gramSchmidt(N, col, Vnew, coef, mt) < 0.5) break;
	}

/*      ==========================================================================================
	Second round: all columns are processed; a column that still loses more than half
	of its norm is ortho-normalized again, this time against all the columns before it
        ========================================================================================== */

	removeProjection();

	for(int col = 0; col < Nadd; ++col)
	{
		if(gramSchmidt(N, col, Vnew, coef, mt) < 0.5) {
			gramSchmidt(N, M+col, V, coef, mt);
		}
	}
}


/*================================================================================================
 orthoNormalize: applies the DGKS ortho-normalization technique to ortho-normalize V(:, M:M+Madd)
                 against V(:,0:M).
                 On output, V(:,0:M+Madd) should be orthonormal.

		It is important that the input matrix V(:, 1:M) is orthonormal

	Input:
		N: number of rows in matrix
		M: current # of columns that are orthonormal
		Madd: number of columns not yet orthogonal, and scaled
		V : the matrix
		coef: work array of size (M+Madd)
	Output:
		V, the fully orthonormal matrix

================================================================================================== */

  void blockChebDav::orthoNormalize(int N, int M, int Madd, double *V, double *coef, MersenneTwister *mt)
{

	// Columns M, M+1, ..., M+Madd-1 of V are processed in that order; each one is
	// ortho-normalized by gramSchmidt against all the columns that precede it,
	// so the set of orthonormal columns grows by one at every step.
	const int last = M + Madd;
	for (int col = M; col < last; ++col)
	{
		gramSchmidt(N, col, V, coef, mt);
	}

  }

/*================================================================================================
  updateH: prepare the Rayleigh quotient matrix for next iteration
================================================================================================== */

  void blockChebDav::updateH(int Nlow, int Nupper, double *D, double *H)
  {
	// H is rebuilt as a Nupper x Nupper matrix (leading dimension Nupper): all zeros,
	// except for the first (Nupper - Nblock) diagonal entries, which receive
	// D[Nlow], D[Nlow+1], ... The last Nblock rows / columns are left at zero.
	const int ld = Nupper;
	memset(H, 0, ld*ld*sizeof(double));

	const int nDiag = ld - Nblock;
	for (int k = 0; k < nDiag; ++k) {
		H[k*ld + k] = D[Nlow + k];
	}
  }

/*================================================================================================
  Initialize parameters / local arrays based on size of the problem
================================================================================================== */

  void blockChebDav::init(int N, int NE)
  {

/*================================================================================================
	Sets the parameters of the algorithm and allocates all the work arrays.

	Input:
		N  : number of rows in the matrix
		NE : number of eigenpairs to be computed

	Array sizes are computed with size_t, so that products such as N*dim_max are not
	evaluated (and possibly overflowed) in int before the allocation.
	NOTE(review): the index arithmetic in eigenpairs() is still done with int.
================================================================================================== */

	//Parameters
	m = 60;				// order of Chebishev polynomials;
	tol = 1.e-8;			// tolerance for convergence
	act_max = std::min(400, N/2);  // max size of the "active space"
	dim_max  = std::max(2*NE, NE + act_max); // max size of eigenspace: known + active space
	iter_max = 10000;			// total # of iterations
	Nblock   = 16;				// block size, for efficiency

	const size_t nrow = static_cast<size_t>(N);
	const size_t nact = static_cast<size_t>(act_max);
	const size_t ndim = static_cast<size_t>(dim_max);

	//local arrays
	eigVal  = new double[ndim];
	eigVect = new double[nrow*ndim];

	X     = new double[nrow*static_cast<size_t>(Nblock)];
	H     = new double[nact*nact];
	D     = new double[nact];

	// Workspace query for dsyevd (lwork = liwork = -1): the optimal sizes come back in
	// sizeopt and isizeopt. The outputs are zeroed first so that a failed query does
	// not leave undefined sizes behind.
	double sizeopt = 0.0;
	int isizeopt = 0;
	int info = 0;
	char U       = 'U';
	char V       = 'V';

	lwork = -1;
	liwork = -1;
	dsyevd_(&V, &U, &dim_max, H, &dim_max, D, &sizeopt, &lwork, &isizeopt, &liwork, &info);
	lwork = static_cast<int>(sizeopt);
	liwork = isizeopt;
	iwork = new int[liwork];

	// dwork serves both as a N x act_max scratch matrix and as the dsyevd workspace:
	// take the larger of the two sizes
	size_t dsize = nrow*nact;
	if(lwork > 0 && static_cast<size_t>(lwork) > dsize) {
		dsize = static_cast<size_t>(lwork);
	}
	dwork = new double[dsize];

  }

/*================================================================================================
 RandomVector: Generate a random vector of size N, with each component in [0,1[
================================================================================================== */

void blockChebDav::randomVector(int N, double *Vect, MersenneTwister *mt)
{
	// Each of the N entries of Vect gets its own draw from the generator
	// (genrand_res53), in increasing index order.
	int k = 0;
	while(k < N)
	{
		Vect[k] = mt->genrand_res53();
		++k;
	}
}

/*================================================================================================
 PowerMethod: finds the largest eigenvalue of the matrix
================================================================================================== */

double blockChebDav::powerMethod(int N, double *Temp1, double *Temp2, double tol, int nthreads, 
	int *mvp, MersenneTwister *mt)
{

/*================================================================================================
	Power iterations x <- A x / |A x|, starting from a random unit vector. The eigenvalue
	estimate is the Rayleigh quotient x.(A x).

	Input:
		N        : size of the vectors
		Temp1    : work array of size N (current vector)
		Temp2    : work array of size N (matrix-vector product)
		tol      : iterations stop when two successive estimates differ by less than tol
		nthreads : number of threads for the matrix-vector product
		mt       : random number generator
	In/Out:
		mvp      : counter of matrix-vector products, incremented once per iteration
	Returns:
		the most recent eigenvalue estimate (at most 200 iterations are performed)

	NOTE(review): for a symmetric matrix a Rayleigh quotient cannot exceed the largest
	eigenvalue, so the returned value approaches it from below; eigenpairs() uses it as
	the upper bound of the filter - confirm that this is acceptable.
================================================================================================== */

	int iter_max = 200;
	int inc = 1;
	double eig0, eig1;
	double norm, fact;

/*================================================================================================
 	Initialize eigenvector: random, scaled to unit norm so that the first Rayleigh
	quotient is meaningful
================================================================================================== */

	randomVector(N, Temp1, mt);
	norm = dnrm2_(&N, Temp1, &inc);
	if(norm > 0.0) {
		fact = 1.0/norm;
		dscal_(&N, &fact, Temp1, &inc);
	}

/*================================================================================================
 	Power iterations
================================================================================================== */

	eig0 = 0.0;

	for(int iter =0; iter < iter_max; iter++)
	{
/*================================================================================================
 		Hessian - vector multiply
================================================================================================== */

		matVect(N, Temp1, Temp2, nthreads);
		*mvp = *mvp + 1;

/*================================================================================================
 		New estimate of the eigenvalue
================================================================================================== */

		norm = dnrm2_(&N, Temp2, &inc);
		eig1 = ddot_(&N, Temp1, &inc, Temp2, &inc);

/*================================================================================================
 		A x = 0: the vector cannot be normalized, stop with the current estimate
================================================================================================== */

		if(norm == 0.0) {
			eig0 = eig1;
			break;
		}

/*================================================================================================
 		Normalize new vector
================================================================================================== */

		fact = 1.0/norm;
		dscal_(&N, &fact, Temp2, &inc);

/*================================================================================================
 		Check for convergence; keep the latest estimate, not the previous one
================================================================================================== */

		if(std::abs(eig1-eig0) < tol) {
			eig0 = eig1;
			break;
		}

/*================================================================================================
 		Prepare for next iteration
================================================================================================== */

		dcopy_(&N, Temp2, &inc, Temp1, &inc);
		eig0 = eig1;

	}

	return eig0;

}

/*================================================================================================
 swapArray:	Swap two arrays
================================================================================================== */

void blockChebDav::swapArray(int N, double *Array1, double *Array2)
{
	// Exchanges the first N values of the two arrays, element by element and in
	// increasing index order.
	double *left  = Array1;
	double *right = Array2;
	for(int remaining = N; remaining > 0; --remaining, ++left, ++right)
	{
		const double held = *left;
		*left  = *right;
		*right = held;
	}
}

/*================================================================================================
 swapPair:	Swap eigenpairs if not in increasing order
================================================================================================== */

int blockChebDav::swapPair(int Ncol, int Nrow, double *eigVal, double *eigVect)
{
	// Insertion step that keeps the eigenpairs sorted by increasing eigenvalue.
	//
	//   Ncol    : index of the pair that was just added; pairs 0..Ncol-1 are
	//             expected to be sorted already
	//   Nrow    : number of values per eigenvector
	//   eigVal  : eigenvalues; entry Ncol is moved down to its sorted position
	//   eigVect : eigenvectors, Nrow values each, stored one after the other;
	//             they are swapped along with their eigenvalues
	//
	// Returns the number of swaps performed (0 when the new pair was already in
	// place). The counter used to stay at 0, so the function always reported "no
	// swap" and the caller could never tell that a pair had arrived out of order.

	if(Ncol <= 0) return 0;

	const double mu = eigVal[Ncol];
	int nswap = 0;

	for(int i = Ncol-1; i >=0; i--)
	{
		// everything at or below position i is not larger than mu: done
		if (mu >= eigVal[i] ) break;

		eigVal[i+1] = eigVal[i];
		eigVal[i] = mu;
		swapArray(Nrow, &eigVect[Nrow*(i+1)], &eigVect[Nrow*i]);
		nswap++;
	}
	return nswap;
}

#endif
