/*================================================================================================
  Filter.h
  Version 1: 08/10/2024

  Purpose: Implements different types of Chebyshev filters

Copyright (c) Patrice Koehl.

>>> SOURCE LICENSE >>>

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

>>> END OF LICENSE >>>

================================================================================================== */

#ifndef _FILTER_
#define _FILTER_

/*================================================================================================
 Includes
================================================================================================== */

#include <math.h>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>

/*================================================================================================
 define opA
================================================================================================== */

// Matrix-vector operator callback. Receives an input block x (leading dimension ldx), an
// output block y (leading dimension ldy), the block dimensions nrow x ncol, and an opaque
// parameter pointer. In this file it is always invoked as
//     op(N, Nvect, in, N, out, N, info->mvparams)
// NOTE(review): T is not declared at this point of the header; presumably it is made
// available by the including code before this typedef is seen -- confirm.
typedef void (*opA) (int nrow, int ncol, T *x, int ldx, T *y, int ldy, void *mvparam);

/*================================================================================================
  Define a class for Chebyshev filters
================================================================================================== */

// FILTER groups the Chebyshev filtering routines. chebFilter() is the public entry point
// and selects one of the private variants from info->filter_type; buildTS() / buildTS2()
// fill the per-eigenvalue weights (params->sigma2) used by the CA variants.
template <typename T>
  class FILTER {

	public:

		// Driver to all Chebyshev filters
		void chebFilter(opA op, eig_info<T> *info, int N, int Nvect, T *X, T *Y, T *space);

		// Build array for CA filter with scaling for type 1 filters
		void buildTS(eig_info<T> *info);

		// Build array for CA filter with scaling for type 2 filters
		void buildTS2(eig_info<T> *info);

	private:

		// Filter0: no filtering, directly A*x
		void Filter0(opA op, eig_info<T> *info, int N, int Nvect, T *X, T *Y);

		// Filter 1: a single Chebyshev polynomial (used by chebDav)
		// Note: X is overwritten when the polynomial degree is >= 2
		void Filter1(opA op, eig_info<T> *info, int N, int Nvect, T *X, T *Y, T *space);
	
		// Filter 1 CA: a single Chebyshev polynomial (used by chebDav); used
		//		for communication avoiding
		void caFilter1(opA op, eig_info<T> *info, int N, int Nvect, T *X, T *Y, T *space);
	
		// Filter 2: a sum of Chebyshev polynomials (used by chebLan)
		//		Filter2 evaluates the sum with Clenshaw's algorithm;
		//		Filter3 evaluates the same kind of sum with the forward three-term
		//		recurrence (it is not dispatched by chebFilter in this file)
		void Filter2(opA op, eig_info<T> *info, int N, int Nvect, T *X, T *Y, T *space);
		void Filter3(opA op, eig_info<T> *info, int N, int Nvect, T *X, T *Y, T *space);
	
		// Filter 2 CA: a sum of Chebyshev polynomials (used by chebLan); used
		//		for communication avoiding
		void caFilter2(opA op, eig_info<T> *info, int N, int Nvect, T *X, T *Y, T *space);

		// Computes (Tm(x)-Tm(y))/Tm(z), where Tm is the Chebyshev polynomial of the first kind of order m
		T chebXYZ(T x, T y, T z, int m);

		// Computes rho(x), where rho is a sum of Chebyshev polynomials of the first kind
		T computeRHO(eig_info<T> *info, T X);

  };

/*================================================================================================
 chebFilter

	Purpose:
	========
	Driver to all Chebyshev filters

	Arguments:
	==========

	op	(input) pointer
		On entry, pointer to the Matrix-Vector operator

	info    (input) pointer to the structure eig_info
	         On entry, points to the data structure to store the information
	         about the eigenvalue problem and the progress of the eigensolver

	N	(input) integer
		On entry, size of the matrix

	Nvect	(input) integer
		On entry, number of vectors on which to perform the filtering

	X	(input) array of floats or double of size (N * Nvect)
		On entry, the unfiltered vectors

	Y	(output) array of floats or double of size N * Nvect
		On exit, filtered vectors

	Space	(input) array of floats or double used as workspace:
		3 * N * Nvect for the sum-of-polynomials filters (Filter2, caFilter2),
		N * Nvect for the single-polynomial filters (Filter1, caFilter1)

================================================================================================== */

template <typename T>
void FILTER<T>::chebFilter(opA op, eig_info<T> *info, int N, int Nvect, T *X, T *Y, T *space)
{
	// Select the filter variant from the type code stored in info.
	// Any code outside 0..4 leaves Y untouched.
	const int kind = info->filter_type;

	switch(kind) {
		case 0:
			// plain operator application, no workspace needed
			Filter0(op, info, N, Nvect, X, Y);
			break;
		case 1:
			Filter1(op, info, N, Nvect, X, Y, space);
			break;
		case 2:
			caFilter1(op, info, N, Nvect, X, Y, space);
			break;
		case 3:
			Filter2(op, info, N, Nvect, X, Y, space);
			break;
		case 4:
			caFilter2(op, info, N, Nvect, X, Y, space);
			break;
		default:
			break;
	}
}

/*================================================================================================
 Filter0

	Purpose:
	========
	filter0 is "no filter", i.e. computes Y = AX

	Arguments:
	==========

	op	(input) pointer
		On entry, pointer to the Matrix-Vector operator

	info    (input) pointer to the structure eig_info
	         On entry, points to the data structure to store the information
	         about the eigenvalue problem and the progress of chebDav
	         On exit, points to the initialized data structure.

	N	(input) integer
		On entry, size of the matrix

	Nvect	(input) integer
		On entry, number of vectors on which to perform the filtering

	X	(input) array of floats or double of size (N * Nvect)
		On entry, the unfiltered vectors

	Y	(output) array of floats or double of size N * Nvect
		On exit, filtered vectors

	Note: Filter0 takes no workspace argument.

================================================================================================== */

template <typename T>
void FILTER<T>::Filter0(opA op, eig_info<T> *info, int N, int Nvect, T *X, T *Y)
{
	// Y = A*X, with the operator call bracketed by clock() readings
	const clock_t tic = clock();
	op(N, Nvect, X, N, Y, N, info->mvparams);
	const clock_t toc = clock();

	// Book-keeping: time spent in the operator and number of vectors it was applied to
	info->clk_op += toc - tic;
	info->mvp += Nvect;
}

/*================================================================================================
 Filter1          

	Purpose:
	========
	filters an input vector X by a single m-degree Chebyshev polynomial which dampens
 	on an interval [a,b]

	Arguments:
	==========

	op	(input) pointer
		On entry, pointer to the Matrix-Vector operator

	info    (input) pointer to the structure eig_info
	         On entry, points to the data structure to store the information
	         about the eigenvalue problem 

	N	(input) integer
		On entry, size of the matrix

	Nvect	(input) integer
		On entry, number of vectors on which to perform the filtering

	X	(input/output) array of floats or double of size (N * Nvect)
		On entry, the unfiltered vectors
		On exit, overwritten when mpoly >= 2 (used as storage for the previous term)

	Y	(output) array of floats or double of size N * Nvect
		On exit, filtered vectors

	Space	(input) array of floats or double of size N * Nvect
		workspace

================================================================================================== */

template <typename T>
void FILTER<T>::Filter1(opA op, eig_info<T> *info, int N, int Nvect, T *X, T *Y, T *Space)
{
	// Filter parameters: polynomial degree, interval [lo, hi] and scaling point
	const int degree = info->mpoly;
	const T lo    = info->a;
	const T hi    = info->b;
	const T pivot = info->a0;

	// Half-width and midpoint of the interval
	const T halfWidth = (hi-lo)/2;
	const T mid       = (hi+lo)/2;

	// Arguments handed by address to the BLAS-style wrappers
	int stride = 1;
	int total  = N*Nvect;
	T shift    = -mid;
	T coef;

	// Scaling ratios of the recurrence
	const T sigFirst = halfWidth/(pivot-mid);
	T sigCur = sigFirst;

	clock_t tic, toc;

	// First term: Y = (sigFirst/halfWidth) * (A*X - mid*X)
	tic = clock();
	op(N, Nvect, X, N, Y, N, info->mvparams);
	toc = clock();
	info->mvp += Nvect;
	info->clk_op += toc - tic;

	eig_daxpy_(&total, &shift, X, &stride, Y, &stride);
	coef = sigFirst/halfWidth;
	eig_dscal_(&total, &coef, Y, &stride);

	// Higher terms: X holds the previous term, Y the current one, Space the new one
	for(int deg = 2; deg <= degree; deg++) {

		const T sigNext = 1.0/(2.0/sigFirst - sigCur);

		// Space = A*Y - mid*Y
		tic = clock();
		op(N, Nvect, Y, N, Space, N, info->mvparams);
		toc = clock();
		info->mvp += Nvect;
		info->clk_op += toc - tic;
		eig_daxpy_(&total, &shift, Y, &stride, Space, &stride);

		// Space = (2*sigNext/halfWidth)*Space - sigCur*sigNext*X
		coef = 2.0*sigNext/halfWidth;
		eig_dscal_(&total, &coef, Space, &stride);
		coef = -sigCur*sigNext;
		eig_daxpy_(&total, &coef, X, &stride, Space, &stride);

		// Shift the three-term window (this overwrites the caller's X)
		eig_dcopy_(&total, Y, &stride, X, &stride);
		eig_dcopy_(&total, Space, &stride, Y, &stride);
		sigCur = sigNext;
	}
}

/*================================================================================================
 caFilter1

	Purpose:
	========
	filters an input vector X by a single m-degree Chebyshev polynomial which dampens
 	on an interval [a,b]

	In this version, we use a shift and implement a Communication Avoiding (CA) method

	Arguments:
	==========

	op	(input) pointer
		On entry, pointer to the Matrix-Vector operator

	info    (input) pointer to the structure eig_info
	         On entry, points to the data structure to store the information
	         about the eigenvalue problem and the progress of chebDav

	N	(input) integer
		On entry, size of the matrix

	Nvect	(input) integer
		On entry, number of vectors on which to perform the filtering

	X	(input) array of floats or double of size (N * Nvect)
		On entry, the unfiltered vectors

	Y	(output) array of floats or double of size N * Nvect
		On exit, filtered vectors

	Space	(input) array of floats or double of size 2*N * Nvect
		workspace

================================================================================================== */

template <typename T>
void FILTER<T>::caFilter1(opA op, eig_info<T> *info, int N, int Nvect, T *X, T *Y, T *Space)
{
	// Constants passed by address to the BLAS-style wrappers
	char opT = 'T';
	char opN = 'N';
	T one  = 1.0;
	T zero = 0.0;

	// Operator parameters: stored vectors Ud, their weights and a scratch buffer
	opparams<T> *params = (opparams<T> *) info->mvparams;
	T *weights = params->sigma2;
	T *UTX     = params->space;
	int n_eed  = params->need;

	// UTX = U^T X (must be done before Filter1, which may overwrite X)
	eig_dgemm_(&opT, &opN, &n_eed, &Nvect, &N, &one, params->Ud, &N, X, &N,
		&zero, UTX, &n_eed);

	// Y = filtered X, using the single-polynomial filter
	Filter1(op, info, N, Nvect, X, Y, Space);

	// Scale row i of UTX (stride n_eed between consecutive entries) by weights[i]
	int rowStride = n_eed;
	for(int row = 0; row < n_eed; row++) {
		T w = weights[row];
		eig_dscal_(&Nvect, &w, UTX + row, &rowStride);
	}

	// Y = Y + U * (Sigma2 * U^T X)
	eig_dgemm_(&opN, &opN, &N, &Nvect, &n_eed, &one, params->Ud, &N,
		UTX, &n_eed, &one, Y, &N);

 }

/*================================================================================================
 buildTS

	Purpose:
	========
	Builds the array Sigma for CA Chebyshev filter (with scaling)

	Arguments:
	==========

	info    (input) pointer to the structure eig_info
	         On entry, points to the data structure to store the information
	         about the eigenvalue problem
================================================================================================== */

 template <typename T>
 void FILTER<T>::buildTS(eig_info<T> *info)
 {
	// Operator parameters: eigenvalues, output weights and their count
	opparams<T> *params = (opparams<T> *) info->mvparams;
	T *eigs    = params->eigVal;
	T *weights = params->sigma2;
	const int count = params->need;

	// Filter parameters
	const int degree = info->mpoly;
	const T lower = info->a;
	const T upper = info->b;
	const T pivot = info->a0;

	// Shift value: anrm plus the first stored eigenvalue
	const T shift = info->anrm + params->eigVal[0];

	// Half-width and midpoint of [lower, upper]
	const T half = (upper-lower)/2;
	const T mid  = (upper+lower)/2;

	// Shift and pivot mapped through (t - mid)/half
	const T shiftMapped = (shift-mid)/half;
	const T pivotMapped = (pivot-mid)/half;

	// weights[i] = (Tm(shiftMapped) - Tm(mapped eigenvalue i)) / Tm(pivotMapped)
	for(int i = 0; i < count; i++) {
		const T eigMapped = (eigs[i]-mid)/half;
		weights[i] = chebXYZ(shiftMapped, eigMapped, pivotMapped, degree);
	}

 }

/*================================================================================================
 chebXYZ

	Purpose:
	========
	Computes (Tm(x)-Tm(y))/Tm(z), where Tm is the chebyshev polynomial of the first kind of order m

	Arguments:
	==========

	x,y,z	(input) floats or double
		on Entry, the three values x, y, and z 

	m	(input) integer
		on Entry, the order of the polynomial

================================================================================================== */

 template <typename T>
 T FILTER<T>::chebXYZ(T x, T y, T z, int m)
 {
	// Scaled three-term recurrences, all normalised by Tk(z):
	//	diff_k = (Tk(x) - Tk(y)) / Tk(z)
	//	yval_k = Tk(y) / Tk(z)
	//	ratio  = T(k-1)(z) / Tk(z)
	T diffPrev = 0;
	T diffCurr = (x-y)/z;
	T yPrev    = 1;
	T yCurr    = y/z;
	T ratio    = 1/z;

	for(int k = 2; k <= m; k++) {
		const T ratioNext = 1.0/(2*z-ratio);
		const T diffNext  = ratioNext*(2*x*diffCurr - diffPrev*ratio + 2*(x-y)*yCurr);
		const T yNext     = ratioNext*(2*y*yCurr - yPrev*ratio);

		yPrev    = yCurr;
		yCurr    = yNext;
		diffPrev = diffCurr;
		diffCurr = diffNext;
		ratio    = ratioNext;
	}

	// For m < 2 the loop does not run and the order-1 value (x-y)/z is returned
	return diffCurr;
  }

/*================================================================================================
 Filter3

	Purpose:
	========
	filters an input vector X by an m-degree Chebyshev polynomial expansion
	that mimics a step function with value 1 on an interval [a, b]

	Arguments:
	==========

	op	(input) pointer
		On entry, pointer to the Matrix-Vector operator

	info    (input) pointer to the structure eig_info
	         On entry, points to the data structure to store the information
	         about the eigenvalue problem and the progress of chebDav
	         On exit, points to the initialized data structure.

	N	(input) integer
		On entry, size of the matrix

	Nvect	(input) integer
		On entry, number of vectors on which to perform the filtering

	X	(input) array of floats or double of size (N * Nvect)
		On entry, the unfiltered vectors
		Note: X should not be modified!

	Y	(output) array of floats or double of size N * Nvect
		On exit, filtered vectors

	work 	(input) array of floats or double of size 3* N * Nvect
		workspace

================================================================================================== */

template <typename T>
void FILTER<T>::Filter3(opA op, eig_info<T> *info, int N, int Nvect, T *X, T *Y, T *work)
{
	// Evaluates Y (+)= sum_{k=0..m} mu[k] * Tk(B) X with the forward three-term
	// recurrence, where B = (A - center*I)/width. X is left untouched; work must
	// hold three vectors of N*Nvect entries.

	int inc = 1;
	T scale, alpha;
	int M = N*Nvect;

	int m    = info->mpoly;
	T   lmin = info->lmin;
	T   lmax = info->lmax;
	T *mu    = info->mu;
	int flag_zero = info->flag_zero;

	clock_t clk1, clk2;

	T width = (lmax-lmin)/2;
	T center = (lmax+lmin)/2;
	T t1 = 1.0/width;
	T t2 = 2.0*t1;

	T dmone = -1.0;

	// Reset the output unless the caller asked to accumulate into Y
	if(flag_zero==1) {
#if defined(GPU)
		cudaMemset(Y, 0, M*sizeof(T));
#else
		memset(Y, 0, M*sizeof(T));
#endif
	}

/*================================================================================================
	k = 0: T0(B) X = X
================================================================================================== */

	scale = mu[0];
	eig_daxpy_(&M , &scale, X, &inc, Y, &inc);

	// Degree-0 expansion: nothing else to add. Stopping here avoids reading mu[1]
	// and applying the operator for a term that is not part of the polynomial
	// (same special case as in Filter2).
	if(m == 0) return;

	T *Vk   = work;
	T *Vkp1 = Vk+M;
	T *Vkm1 = Vkp1+M;
	T *temp;

	// Work on a copy so that X is never modified
	eig_dcopy_(&M, X, &inc, Vk, &inc);

/*================================================================================================
	k = 1: T1(B) X = (A X - center X) / width
================================================================================================== */

	scale = mu[1];
	clk1 = clock();
	op(N, Nvect, Vk, N, Vkp1, N, info->mvparams);
	clk2 = clock();
	info->mvp += Nvect;
	info->clk_op += clk2 - clk1;

	alpha = -center;
	eig_daxpy_(&M , &alpha, Vk, &inc, Vkp1, &inc);
	eig_dscal_(&M, &t1, Vkp1, &inc);
	eig_daxpy_(&M , &scale, Vkp1, &inc, Y, &inc);

	// Rotate the window by swapping pointers instead of copying vectors
	temp = Vkm1; Vkm1 = Vk; Vk = Vkp1; Vkp1 = temp;

/*================================================================================================
	k > 1: T(k+1)(B) X = 2 B Tk(B) X - T(k-1)(B) X
================================================================================================== */

	for(int i = 2; i <= m; i++) {
		scale = mu[i];

		clk1 = clock();
		op(N, Nvect, Vk, N, Vkp1, N, info->mvparams);
		clk2 = clock();
		info->mvp += Nvect;
		info->clk_op += clk2 - clk1;

		alpha = -center;
		eig_daxpy_(&M , &alpha, Vk, &inc, Vkp1, &inc);
		eig_dscal_(&M, &t2, Vkp1, &inc);
		eig_daxpy_(&M , &dmone, Vkm1, &inc, Vkp1, &inc);
		eig_daxpy_(&M , &scale, Vkp1, &inc, Y, &inc);

		temp = Vkm1; Vkm1 = Vk; Vk = Vkp1; Vkp1 = temp;

	}
}

/*================================================================================================
 computeRHO

	Purpose:
	========
	filters a value X using an m-degree Chebyshev polynomial expansion
	that mimics a step function with value 1 on an interval [a, b]

	Arguments:
	==========

	info    (input) pointer to the structure eig_info
	         On entry, points to the data structure to store the information
	         about the eigenvalue problem and the progress of chebDav

	X	(input) float or double
		On entry, input value X

	Returns:
	========
	The filtered value, i.e. the sum over k = 0..m of mu[k]*Tk(X)

================================================================================================== */

template <typename T>
T FILTER<T>::computeRHO(eig_info<T> *info, T X)
{
	// Returns sum_{k=0..m} mu[k] * Tk(X), with m = info->mpoly, mu = info->mu and
	// Tk the Chebyshev polynomial of the first kind, built by the three-term recurrence.

	T Xk, Xkp1, Xkm1;

	int m    = info->mpoly;
	T *mu    = info->mu;

	T yval = 0;

/*================================================================================================
	k = 0: T0(X) = 1
================================================================================================== */

	Xk = 1;
	yval += mu[0] * Xk;

	// Degree-0 expansion: stop here so that mu[1] is never read and no T1 term
	// is added (same special case as in Filter2).
	if(m == 0) return yval;
	
/*================================================================================================
	k = 1: T1(X) = X
================================================================================================== */

	Xkp1 = X;
	yval += mu[1]*Xkp1;
	Xkm1 = Xk; Xk = Xkp1;

/*================================================================================================
	k > 1: T(k+1)(X) = 2 X Tk(X) - T(k-1)(X)
================================================================================================== */

	for(int i = 2; i <= m; i++) {
		Xkp1 = 2*X*Xk - Xkm1;
		yval += mu[i]*Xkp1;
		Xkm1 = Xk; Xk = Xkp1;
	}

	return yval;
}

/*================================================================================================
 caFilter2

	Purpose:
	========
	filters an input vector X by a sum of m-degree Chebyshev polynomial non zero 
 	on an interval [a,b]

	In this version, we use a shift and implement a Communication Avoiding (CA) method

	Arguments:
	==========

	op	(input) pointer
		On entry, pointer to the Matrix-Vector operator

	info    (input) pointer to the structure eig_info
	         On entry, points to the data structure to store the information
	         about the eigenvalue problem and the progress of chebDav

	N	(input) integer
		On entry, size of the matrix

	Nvect	(input) integer
		On entry, number of vectors on which to perform the filtering

	X	(input) array of floats or double of size (N * Nvect)
		On entry, the unfiltered vectors

	Y	(output) array of floats or double of size N * Nvect
		On exit, filtered vectors

	Space	(input) array of floats or double of size 3 * N * Nvect
		workspace (passed on to Filter2)

================================================================================================== */

template <typename T>
void FILTER<T>::caFilter2(opA op, eig_info<T> *info, int N, int Nvect, T *X, T *Y, T *Space)
{
	// Constants passed by address to the BLAS-style wrappers
	char opT = 'T';
	char opN = 'N';
	int unit = 1;
	T one  = 1.0;
	T zero = 0.0;

	// Operator parameters: stored vectors Ud, their weights and a scratch buffer
	opparams<T> *params = (opparams<T> *) info->mvparams;
	T *weights = params->sigma2;
#if defined(GPU)
	T *dsigma = params->d_sigma2;
#endif
	T *UTX    = params->space;
	int n_eed = params->need;

	// A block of vectors goes through dgemm, a single vector through dgemv
	const bool block = (Nvect > 1);

	// UTX = U^T X
	if(block) {
		eig_dgemm_(&opT, &opN, &n_eed, &Nvect, &N, &one, params->Ud, &N, X, &N,
			&zero, UTX, &n_eed);
	} else {
		eig_dgemv_(&opT, &N, &n_eed, &one, params->Ud, &N, X, &unit, &zero, UTX, &unit);
	}

	// Y = filtered X, using the sum-of-polynomials filter
	Filter2(op, info, N, Nvect, X, Y, Space);

	// UTX = Sigma2 * UTX (entry i of every column scaled by weight i)
	if(block) {
		int rowStride = n_eed;
		for(int row = 0; row < n_eed; row++) {
			T w = weights[row];
			eig_dscal_(&Nvect, &w, UTX + row, &rowStride);
		}
	} else {
#if defined(GPU)
		int NBLOCKS = (n_eed+THREADS_PER_BLOCK-1)/THREADS_PER_BLOCK;
		elementWise<T><<<NBLOCKS, THREADS_PER_BLOCK>>>(dsigma, UTX, n_eed);
#else
		for(int row = 0; row < n_eed; row++) {
			UTX[row] *= weights[row];
		}
#endif
	}

	// Y = Y + U * (Sigma2 * U^T X)
	if(block) {
		eig_dgemm_(&opN, &opN, &N, &Nvect, &n_eed, &one, params->Ud, &N,
			UTX, &n_eed, &one, Y, &N);
	} else {
		eig_dgemv_(&opN, &N, &n_eed, &one, params->Ud, &N, UTX, &unit, &one, Y, &unit);
	}

 }

/*================================================================================================
 buildTS2

	Purpose:
	========
	Builds the array Sigma for CA Chebyshev filter (with scaling)

	Arguments:
	==========

	info    (input) pointer to the structure eig_info
	         On entry, points to the data structure to store the information
	         about the eigenvalue problem
================================================================================================== */

 template <typename T>
 void FILTER<T>::buildTS2(eig_info<T> *info)
 {
	// Operator parameters: eigenvalues, output weights and their count
	opparams<T> *params = (opparams<T> *) info->mvparams;
	T *eigs    = params->eigVal;
	T *weights = params->sigma2;
	const int count = params->need;

	// Interval on which the expansion is defined
	const T lower = info->lmin;
	const T upper = info->lmax;

	// Shift value: anrm plus the first stored eigenvalue
	const T shift = info->anrm + params->eigVal[0];

	// Half-width and midpoint of [lower, upper]
	const T half = (upper-lower)/2;
	const T mid  = (upper+lower)/2;

	// rho evaluated at the mapped shift; also stored in info for later use
	const T rhoShift = computeRHO(info, (shift-mid)/half);
	info->sigmaC = rhoShift;

	// weights[i] = rho(mapped shift) - rho(mapped eigenvalue i)
	for(int i = 0; i < count; i++) {
		const T eigMapped = (eigs[i]-mid)/half;
		weights[i] = rhoShift - computeRHO(info, eigMapped);
	}

#if defined(GPU)
	// Copy the weights into the buffer params->d_sigma2
	T *d_Sigma2 = params->d_sigma2;
	cudaMemcpy(d_Sigma2, weights, count*sizeof(T), cudaMemcpyHostToDevice);
#endif

 }

/*================================================================================================
 Filter2

	Purpose:
	========
	filters an input vector X by an m-degree Chebyshev polynomial expansion
	that mimics a step function with value 1 on an interval [a, b]
	Uses Clenshaw's algorithm for efficient evaluation

	Arguments:
	==========

	op	(input) pointer
		On entry, pointer to the Matrix-Vector operator

	info    (input) pointer to the structure eig_info
		On entry, points to the data structure to store the information
		about the eigenvalue problem

	N	(input) integer
		On entry, size of the matrix

	Nvect	(input) integer
		On entry, number of vectors on which to perform the filtering

	X	(input) array of floats or double of size (N * Nvect)
		On entry, the unfiltered vectors
		Note: X should not be modified!

	Y	(output) array of floats or double of size N * Nvect
		On exit, filtered vectors

	work 	(input) array of floats or double of size 3* N * Nvect
		workspace

================================================================================================== */

template <typename T>
void FILTER<T>::Filter2(opA op, eig_info<T> *info, int N, int Nvect, T *X, T *Y, T *work)
{
	// Evaluates Y (+)= sum_{k=0..m} mu[k] * Tk(B) X with Clenshaw's backward
	// recurrence, where B = (A - center*I)/width. X is left untouched; work must
	// hold three vectors of N*Nvect entries.

	int inc = 1;
	T alpha;
	int M = N*Nvect;

	clock_t clk1, clk2;

	int m    = info->mpoly;
	T   lmin = info->lmin;
	T   lmax = info->lmax;
	T *mu    = info->mu;
	int flag_zero = info->flag_zero;

	T width = (lmax-lmin)/2;
	T center = (lmax+lmin)/2;

/*================================================================================================
	Workspace vectors: b_{k-1}, b_k, b_{k+1}
	(the pointers are rotated at each step, so their order inside work changes)
================================================================================================== */

	T *b_km1 = work;       
	T *b_k   = b_km1 + M;   
	T *b_kp1 = b_k + M;
	T *temp;

	T alpha2 = 2.0/width;
	T alpha1 = 1.0/width;
	T dmone  = -1.0;

/*================================================================================================
	Initialize output vector, unless it is cumulative
================================================================================================== */

	if(flag_zero==1) {
#if defined(GPU)
		cudaMemset(Y, 0, M*sizeof(T));
#else
		memset(Y, 0, M*sizeof(T));
#endif
	}

/*================================================================================================
	Special case: only T_0 term
================================================================================================== */

	if (m == 0) {
		alpha = mu[0];
		eig_daxpy_(&M, &alpha, X, &inc, Y, &inc);

		return;
	}

/*================================================================================================
	Initialize b_{m+1} = 0 and b_m = mu[m] X
================================================================================================== */

#if defined(GPU)
	cudaMemset(b_kp1, 0, M*sizeof(T));
#else
	memset(b_kp1, 0, M*sizeof(T));
#endif
	
	alpha = mu[m];
	eig_dcopy_(&M, X, &inc, b_k, &inc);
	eig_dscal_(&M, &alpha, b_k, &inc);

/*================================================================================================
	Clenshaw's algorithm: work backwards from k = m-1 to k = 1
================================================================================================== */

	for(int k = m-1; k >= 1; k--) {

		/*===============================================================================
		b_{k-1} = mu[k] * X + 2 * B * b_k - b_{k+1}
		================================================================================= */

		clk1 = clock();
		op(N, Nvect, b_k, N, b_km1, N, info->mvparams);
		clk2 = clock();
		info->mvp += Nvect;
		info->clk_op += (clk2-clk1);
		
		alpha = -center;
		eig_daxpy_(&M, &alpha, b_k, &inc, b_km1, &inc);
		eig_dscal_(&M, &alpha2, b_km1, &inc);
		
		alpha = mu[k];
		eig_daxpy_(&M, &alpha, X, &inc, b_km1, &inc);

		eig_daxpy_(&M, &dmone, b_kp1, &inc, b_km1, &inc);
		
		/*===============================================================================
		Shift: b_{k+1} = b_k, b_k = b_{k-1}
		Done by rotating the pointers (as in Filter3) rather than copying two
		full vectors; the old b_{k+1} buffer becomes the next scratch buffer
		================================================================================= */

		temp = b_kp1; b_kp1 = b_k; b_k = b_km1; b_km1 = temp;
	}

/*================================================================================================
	Final steps: k = 0
		Y = mu[0] * X + B * b_1 - b_2
	This operator call is timed and counted like all the others, so that
	info->mvp and info->clk_op account for every application of op
================================================================================================== */

	clk1 = clock();
	op(N, Nvect, b_k, N, b_km1, N, info->mvparams);
	clk2 = clock();
	info->mvp += Nvect;
	info->clk_op += (clk2-clk1);

	alpha = -center;
	eig_daxpy_(&M, &alpha, b_k, &inc, b_km1, &inc);
	
	alpha = mu[0];
	eig_daxpy_(&M, &alpha, X, &inc, Y, &inc); 
	eig_daxpy_(&M, &alpha1, b_km1, &inc, Y, &inc); 
	eig_daxpy_(&M, &dmone, b_kp1, &inc, Y, &inc);
}

#endif
