/* ====OT1_H ===================================================================================
 *
 * Author: Patrice Koehl (in collaboration with Henri Orland), November 2018
 * Department of Computer Science
 * University of California, Davis
 *
 * This file implements different methods needed to solve the optimal transport problem using
 * the minimization of a free energy
 *
 =============================================================================================== */

#ifndef _OT1_H_
#define _OT1_H_

  #include <cmath>
  #include <algorithm>
  #include <functional>
  #include "VectorOps.h"
  #include "ConjGradOT.h"

/* ===============================================================================================
   prototypes for BLAS and LAPACK functions 
   =============================================================================================== */

  extern "C" {

	// Fortran BLAS/LAPACK entry points. Every argument is passed by address,
	// following the Fortran calling convention used by these prototypes.

	// BLAS1: copy one vector into another (Y := X)
	void dcopy_(int * n, double * X, int *incx, double * Y, int *incy);

	// BLAS1: scale a vector (Y := scale*Y)
	void dscal_(int * n, double *scale, double *Y, int *incy);

	// BLAS1: dot product of two vectors
	double ddot_(int * n, double * u, int * incu, double * v, int *incv);

	// BLAS1: Euclidean norm of a vector
	double dnrm2_(int * n, double * X, int *incx);

	// BLAS2: perform Y := alpha*op( A )* X  + beta*Y
	void dgemv_(char * trans, int * m, int * n, double * alpha, double *A,
		int *lda, double * X, int * incx, double *beta, double * Y, int * incy);

	// BLAS3: perform C := alpha*op( A )* op(B)  + beta*C
	void dgemm_(char * transa, char * transb, int * m, int * n, int * k,
		double * alpha, double * A, int * lda,
		double * B, int * ldb, double * beta, double * C, int * ldc);

	// LAPACK: solve a real system of linear equations A * X = B, where A is a symmetric matrix 
	void dsysv_(char *uplo, int *n, int *nrhs, double *A, int *lda, int *ipiv, double *b,
		int *ldb, double *work, int *lwork, int *info);

	// LAPACK: diagonalize a symmetric matrix
	// NOTE(review): unlike every other prototype here this name has no trailing
	// underscore; confirm it matches the symbol exported by the LAPACK library
	// in use. It is not called anywhere in this file.
        void dsyevd(char * JOBZ, char * UPLO, int *N, double *A, int *LDA, double *W,
        double *WORK, int *LWORK, int *IWORK, int *LIWORK, int *INFO);

        // LAPACK: expert driver for A * X = B with A symmetric (also reports
        // rcond, ferr and berr); only referenced from commented-out code in this file
	void dsysvx_(char *fact, char *uplo, int *n, int *nrhs, double *A, int *lda, double *AF,
		int *ldaf, int *ipiv, double *b, int *ldb, double *x, int *ldx,
		double *rcond, double *ferr, double *berr,
		double *work, int *lwork, int *iwork, int *info);
  }

/* ===============================================================================================
   The OT1 class
   =============================================================================================== */

  // Solver for the optimal transport problem through minimization of a free energy.
  // All work arrays are raw pointers that are only valid after initOT1 has run
  // (ot1 calls it when its `init` argument is 0).
  // NOTE(review): nothing in this file releases these arrays and the class has no
  // destructor; calling initOT1 more than once on the same object leaks the earlier buffers.
  class OT1{

  public:

	// Solve for G and d at infinite beta (iterative over beta):
	// runs solveG for increasing beta values, returns the transport cost sum(G.*C)
	// of the last solve and stores the last free energy in *Fopt
	double ot1(int npoint1, double *m1, int npoint2, double *m2, double *C,
	double *G, double *lambda, double *mu, double beta_init, double beta_final,
	double *Fopt, int iprint, int init, int nthreads);

	// Solve for G at a given beta value (damped Newton iterations on lambda and mu);
	// *niter receives the number of accepted Newton steps
	void solveG(int npoint1, double *m1, int npoint2, double *m2, double *C, double *G, double *lambda, 
	double *mu, double beta, double tol, int *niter, int nthreads);

	// Allocate all internal work arrays for problems of size npoint1 x npoint2
	void initOT1(int npoint1, int npoint2);

  private:

	// compute Free Energy
	double computeF(int npoint1, double *m1, int npoint2, double *m2, double *C, double beta,
	double *lambda, double *mu);

	// Check marginals: row sums and column sums of the transport plan G
	void checkMarginals(int n1, double *m1, int n2,  double *m2, double *G, double *err_row,
		double *err_col);

	// Compute the transport plan G based on the auxiliary variables lambda and mu
	void computeG(int npoint1, double *m1, int npoint2, double *m2, double *C, double beta,
	double *lambda, double *mu, double *G);

	// Check current Jacobian system: fills the members row and col with the
	// marginal residuals and returns their combined norm in *err
	void computeRC(int npoint1, double *m1, int npoint2, double *m2, double *C, double beta,
	double *lambda, double *mu, double *err, int nthreads);

	// Solve Jacobian system for updates in Lambda and Mu, using direct solver
	// (results in the members drow and dcol)
	void computedX_direct(int n1, int n2, double beta, double *C, double *lambda, double *mu, 
	int nthreads);

	// Solve Jacobian system for updates in Lambda and Mu, using iterative solver (CG)
	// (results in the members drow and dcol)
	void computedX_iter(int n1, int n2, double beta, double *C, double *lambda, double *mu, 
	int nthreads);

	// internal variables (all allocated in initOT1)

	// Work1: npoint1*npoint2 scratch matrix; Work2: general scratch (scaled copy of
	// Work1, LAPACK workspace, CG work vectors); Jac: reduced system matrix given to dsysv_
	double *Work1, *Work2, *Jac;
	// Jac2: allocated but only referenced in commented-out code; X: solution vector of the CG solver
	double *Jac2, *X;
	// row/col: marginal residuals written by computeRC; drow/dcol: Newton updates for lambda/mu
	double *row, *col, *drow, *dcol;
	// right-hand side handed to the CG solver
	double *B;
	// vector of ones, length max(npoint1, npoint2)
	double *vone;
	// pivot indices for dsysv_
	int *IPIV;
	// allocated but only referenced in commented-out code
	int *iwork;

  };

/* ===============================================================================================
   Common variables
   =============================================================================================== */

/* ===============================================================================================
   checkMarginals

   Notes:
   =====
   Check the marginals of a coupling matrix

   Input:
   =====
	n1: 	number of points on space 1
	m1:	measure on space 1
	n2:	number of points on space 2
	m2:	measure on space 2
	G:	coupling matrix
  Output:
	err_row: error on row marginals
	err_col: error on col marginals
   =============================================================================================== */

  void OT1::checkMarginals(int n1, double *m1, int n2,  double *m2, double *G, double *err_row,
		double *err_col)
  {

	// G is read as an n1 x n2 column-major matrix (leading dimension n1).
	//   *err_row = || G * 1   - m1 ||_2   (row sums compared with m1)
	//   *err_col = || G^T * 1 - m2 ||_2   (column sums compared with m2)

	// Scratch vectors sized for the larger of the two dimensions so they can
	// serve both products
	int nmax = std::max(n1,n2);
	double *ones = new double[nmax];
	double *vect = new double[nmax];
	for(int i = 0; i <nmax; i++) { ones[i]=1;};

	double alpha, beta;
	alpha = 1.0; beta = 0.0;
	int inc = 1;
	char Trans   = 'T';
	char NoTrans = 'N';

	// Row marginals: vect = G * ones, then subtract m1
	dgemv_(&NoTrans, &n1, &n2, &alpha, G, &n1, 
			ones, &inc, &beta, vect, &inc);
	for(int i = 0; i <n1; i++) { vect[i] = vect[i] - m1[i];};
	double val = ddot_(&n1, vect, &inc, vect, &inc);
	*err_row = std::sqrt(val);

	// Column marginals: vect = G^T * ones, then subtract m2
	dgemv_(&Trans, &n1, &n2, &alpha, G, &n1, 
			ones, &inc, &beta, vect, &inc);
	for(int i = 0; i <n2; i++) { vect[i] = vect[i] - m2[i];};
	val = ddot_(&n2, vect, &inc, vect, &inc);
	*err_col = std::sqrt(val);

	// Release the scratch vectors; they were previously leaked on every call
	delete [] ones;
	delete [] vect;
  }
/* ===============================================================================================
   computeG

   Input:
   =====
	npoint1:	number of points for point set 1
	m1:		measure on points1
	npoint2:	number of points for point set 2
	m2:		measure on points2
	C:		cost matrix
	beta:		current beta
	lambda:		current values of Lagrange multipliers lambda
	mu:		current values of Lagrange multipliers mu
	
   Output:
   ======
	G:		coupling matrix
   =============================================================================================== */

  void OT1::computeG(int npoint1, double *m1, int npoint2, double *m2, double *C, double beta,
	double *lambda, double *mu, double *G)
  {

	// Fills the npoint1 x npoint2 column-major matrix G with
	//   G(i,j) = coupling(C(i,j) + lambda(i) + mu(j), beta)
	// Values closer than `snap` to 0 or to 1 are set to exactly 0 or 1.
	// m1 and m2 are not used by this routine.

	const double snap = 1.e-8;

	for(int jc = 0; jc < npoint2; jc++) {
		const int offset = jc*npoint1;
		for(int ir = 0; ir < npoint1; ir++) {
			double g = C[offset+ir] + lambda[ir] + mu[jc];
			g = coupling(g, beta);
			if(g < snap) {
				g = 0;
			} else if(g > 1.- snap) {
				g = 1;
			}
			G[offset+ir] = g;
		}
	}

  }
/* ===============================================================================================
   computeF

   Input:
   =====
	npoint1:	number of points for point set 1
	m1:		measure on points1
	npoint2:	number of points for point set 2
	m2:		measure on points2
	C:		cost matrix
	beta:		current beta
	lambda:		current values of Lagrange multipliers lambda
	mu:		current values of Lagrange multipliers mu
	
   Output:
   ======
	F:		free energy
   =============================================================================================== */

  double OT1::computeF(int npoint1, double *m1, int npoint2, double *m2, double *C, double beta,
	double *lambda, double *mu)
  {

	// Free energy
	//   F = -( sum_i lambda_i*m1_i + sum_j mu_j*m2_j + (1/beta) * sum_ij log phi(x_ij) )
	// with x_ij = beta*(C_ij + lambda_i + mu_j) and phi(x) = (1 - exp(-x))/x.
	// C is read column-major with leading dimension npoint1.

	const double tolv = 1.e-10;

	double s1 = 0;
	for(int i = 0; i < npoint1; i++) {
		s1 += lambda[i]*m1[i];
	}

	double s2 = 0;
	for(int j = 0; j < npoint2; j++) {
		s2 += mu[j]*m2[j];
	}

	double s3 = 0;
	for(int j = 0; j < npoint2; j++) {
		for(int i = 0; i < npoint1; i++) {
			const double x = beta*(C[i+j*npoint1] + lambda[i] + mu[j]);
			if(std::abs(x) < tolv) {
				// phi(x) = 1 - x/2 + O(x^2), hence log phi(x) ~ -x/2.
				// (The previous code used phi = 0.5 here, which is the limit of
				// the coupling, not of phi, and made F discontinuous at x = 0.)
				s3 -= 0.5*x;
			} else if(x < 0) {
				// log phi(x) = -x + log((1 - exp(x))/(-x)); written this way
				// exp() never overflows and the quotient never becomes 1/0
				// when exp(x) underflows for strongly negative x
				s3 += std::log((1.0 - std::exp(x))/(-x)) - x;
			} else {
				s3 += std::log((1.0 - std::exp(-x))/x);
			}
		}
	}

	return -(s1 + s2 + s3/beta);

  }



/* ===============================================================================================
   computeRC

   Input:
   =====
	npoint1:	number of points for point set 1
	m1:		measure on points1
	npoint2:	number of points for point set 2
	m2:		measure on points2
	C:		cost matrix
	beta:		current beta
	lambda:		current values of Lagrange multipliers lambda
	mu:		current values of Lagrange multipliers mu
	nthreads:	number of threads for parallel computation
	
   Output:
   ======
	row:		errors on row sums (class member)
	col:		errors on col sums (class member)
	err:		total error: 2-norm of row plus 2-norm of the first
			npoint2-1 entries of col
   =============================================================================================== */

  void OT1::computeRC(int npoint1, double *m1, int npoint2, double *m2, double *C, double beta,
	double *lambda, double *mu, double *err, int nthreads)
  {

	int n1n2 = npoint1*npoint2;
	char Trans   = 'T';
	char NoTrans = 'N';
	int inc = 1;
	int one = 1;
	double a, b;
	
	dcopy_(&n1n2, C, &inc, Work1, &inc);
	a = 1.0; b = 1;
	dgemm_(&NoTrans, &Trans, &npoint1, &npoint2, &one, &a, lambda, &npoint1, vone, &npoint2, &b, 
		Work1, &npoint1);
	dgemm_(&NoTrans, &Trans, &npoint1, &npoint2, &one, &a, vone, &npoint1, mu, &npoint2, &b, 
		Work1, &npoint1);

	vect_coupling(n1n2, Work1, beta, nthreads);

	dcopy_(&npoint1, m1, &inc, row, &inc);
	a = 1; b = -1;
	dgemv_(&NoTrans, &npoint1, &npoint2, &a, Work1, &npoint1, vone, &inc, &b, row, &inc);
	dcopy_(&npoint2, m2, &inc, col, &inc);
	dgemv_(&Trans, &npoint1, &npoint2, &a, Work1, &npoint1, vone, &inc, &b, col, &inc);

	double err_l = 0;
	double val = ddot_(&npoint1, row, &inc, row, &inc);
	err_l = std::sqrt(val);
//	for(int i = 0; i < npoint1; i++) err_l += std::abs(row[i]);

	double err_m = 0;
	int n2 = npoint2-1;
	val = ddot_(&n2, col, &inc, col, &inc);
	err_m = std::sqrt(val);
//	for(int j = 0; j < npoint2-1; j++) err_m += std::abs(col[j]);

	*err = err_l+err_m;

  }

/* ===============================================================================================
   computedX_direct

   Input:
   =====
	npoint1:	number of points for point set 1
	npoint2:	number of points for point set 2
	beta:		current beta
	C:		Cost matrix
	lambda:		current values of Lagrange multipliers lambda
	mu:		current values of Lagrange multipliers mu
	row:		errors on row marginals
	col:		errors on col marginals
	nthreads:	number of threads for parallel computation
	
   Output:
   ======
	drow:		updates on lambda
	dcol:		updates on mu

   Method:
   ======
	We write the Jacobian as:
        Jac = [A     B]
              [B^T   D]
	where A and D are diagonal matrices
	To solve:
	Jac dF = B
	we write it as:
	A x + B y = a				(1)
	B^T x + D y = b				(2)
	We multiply (2) by BD^{-1}
	A x + B y = a
	BD^{-1}B^T x + B y = BD^{-1}b           (2')
	and (1) - (2') gives:
	(A - BD^{-1}B^T) x = a - BD^{-1}b
	which we solve using dsysv, and then:
	y = D^{-1}(b - B^T x)

	Note that if dim(a) > dim(b), we reverse the system
	
   =============================================================================================== */

  void OT1::computedX_direct(int n1, int n2, double beta, double *C, double *lambda, double *mu, int nthreads)
  {

	// Newton update for (lambda, mu) obtained with a direct solve of the reduced
	// (Schur complement) system. Reads the member arrays row and col (set by
	// computeRC) and overwrites them; the update is returned in the member arrays
	// drow (n1 entries) and dcol (first n2 entries).

	int npoint1 = n1;
	int npoint2 = n2+1;	// full number of columns of C; only n2 entries of mu are solved for
	int n1n2 = npoint1*npoint2;
	char Trans   = 'T';
	char NoTrans = 'N';
	int inc = 1;
	int one = 1;
	double a, b;
	
	// Work1 = C + lambda*1^T + 1*mu^T (two rank-one updates on a copy of C)
	dcopy_(&n1n2, C, &inc, Work1, &inc);
	a = 1.0; b = 1;
	dgemm_(&NoTrans, &Trans, &npoint1, &npoint2, &one, &a, lambda, &npoint1, vone, &npoint2, &b, 
		Work1, &npoint1);
	dgemm_(&NoTrans, &Trans, &npoint1, &npoint2, &one, &a, vone, &npoint1, mu, &npoint2, &b, 
		Work1, &npoint1);

	// Work1 is transformed in place by vect_dcoupling and then plays the role of
	// the off-diagonal block B of the Jacobian
	// (presumably the derivative of the coupling -- confirm in VectorOps.h)
	vect_dcoupling(n1n2, Work1, beta, nthreads);

	// Diagonal blocks: drow = row sums of Work1 (A), dcol = column sums of Work1 (D)
	a = 1; b = 0;
	dgemv_(&NoTrans, &npoint1, &npoint2, &a, Work1, &npoint1, vone, &inc, &b, drow, &inc);
	dgemv_(&Trans, &npoint1, &npoint2, &a, Work1, &npoint1, vone, &inc, &b, dcol, &inc);

	// Note: the variable named U holds 'L', which is what dsysv_ receives as uplo.
	// NOTE(review): info is never inspected after the dsysv_ calls below.
	int nrhs = 1; char U  = 'L'; int info;
	int lwork;

	if(n1 <= n2+1) {

		// Eliminate the mu block and solve an n1 x n1 system for the lambda update

		// dcol <- D^{-1}
		for(int j = 0; j < n2; j++) dcol[j] = 1.0/dcol[j];

		// Work2 <- B D^{-1} (first n2 columns only)
		for(int j = 0; j < n2; j++) {
			for(int i = 0; i < n1; i++) {
				Work2[i+j*n1] = Work1[i+j*n1]*dcol[j];
			}
		}

		// Jac <- A - B D^{-1} B^T
		a = -1.0; b = 0;
		dgemm_(&NoTrans, &Trans, &n1, &n1, &n2, &a, Work2, &n1, Work1, &n1, &b, Jac, &n1);
		for(int i = 0; i < n1; i++) Jac[i+n1*i] += drow[i];

		// Right-hand side: row <- B D^{-1} col - row
		a = 1.0; b = -1.0;
		dgemv_(&NoTrans, &n1, &n2, &a, Work2, &n1, col, &inc, &b, row, &inc);

		// Solve in place (row becomes the lambda update); Work2 is no longer
		// needed and is reused as the LAPACK workspace
		lwork = 128*n1;
		dsysv_(&U, &n1, &nrhs, Jac, &n1, IPIV, row, &n1, Work2, &lwork, &info);

		// Back-substitution: col <- -B^T row - col, then scaled by D^{-1} below
		a = -1.0; b = -1.0;
		dgemv_(&Trans, &n1, &n2, &a, Work1, &n1, row, &inc, &b, col, &inc);

		dcopy_(&n1, row, &inc, drow, &inc);
		for(int j = 0; j < n2; j++) dcol[j] = col[j]*dcol[j];

	} else {

		// Eliminate the lambda block and solve an n2 x n2 system for the mu update

		// drow <- A^{-1}
		for(int j = 0; j < n1; j++) drow[j] = 1.0/drow[j];

		// Work2 <- A^{-1} B (first n2 columns only)
		for(int j = 0; j < n2; j++) {
			for(int i = 0; i < n1; i++) {
				Work2[i+j*n1] = Work1[i+j*n1]*drow[i];
			}
		}

		// Jac <- D - B^T A^{-1} B
		a = -1.0; b = 0;
		dgemm_(&Trans, &NoTrans, &n2, &n2, &n1, &a, Work1, &n1, Work2, &n1, &b, Jac, &n2);
		for(int i = 0; i < n2; i++) Jac[i+n2*i] += dcol[i];

		// Right-hand side: col <- B^T A^{-1} row - col
		a = 1.0; b = -1.0;
		dgemv_(&Trans, &n1, &n2, &a, Work2, &n1, row, &inc, &b, col, &inc);

		// Solve in place (col becomes the mu update); Work2 is reused as workspace
		lwork = 128*n2;
		dsysv_(&U, &n2, &nrhs, Jac, &n2, IPIV, col, &n2, Work2, &lwork, &info);

//		dsysvx_(&F, &U, &n2, &nrhs, Jac, &n2, Jac2, &n2, IPIV, col, &n2, X, &n2,
//		&Rcond, &Ferr, &Berr, Work2, &lwork, iwork, &info);

		// Back-substitution: row <- -B col - row, then scaled by A^{-1} below
		a = -1.0; b = -1.0;
		dgemv_(&NoTrans, &n1, &n2, &a, Work1, &n1, col, &inc, &b, row, &inc);
//		dgemv_(&NoTrans, &n1, &n2, &a, Work1, &n1, X, &inc, &b, row, &inc);

		dcopy_(&n2, col, &inc, dcol, &inc);
//		dcopy_(&n2, X, &inc, dcol, &inc);
		for(int i = 0; i < n1; i++) drow[i] = row[i]*drow[i];
	}

  }

/* ===============================================================================================
   computedX_iter

   Input:
   =====
	npoint1:	number of points for point set 1
	npoint2:	number of points for point set 2
	beta:		current beta
	C:		Cost matrix
	lambda:		current values of Lagrange multipliers lambda
	mu:		current values of Lagrange multipliers mu
	row:		errors on row marginals
	col:		errors on col marginals
	nthreads:	number of threads for parallel computation
	
   Output:
   ======
	drow:		updates on lambda
	dcol:		updates on mu

   Method:
   ======
	We write the Jacobian as:
        Jac = [A     B]
              [B^T   D]
	where A and D are diagonal matrices
	We solve:
	Jac dF = B
	using preconditioned conjugate gradient

   =============================================================================================== */

  void OT1::computedX_iter(int n1, int n2, double beta, double *C, double *lambda, double *mu, 
	int nthreads)
  {

	int npoint1 = n1;
	int npoint2 = n2+1;
	int n1n2 = npoint1*npoint2;
	int n1pn2 = n1+n2;
	char Trans   = 'T';
	char NoTrans = 'N';
	int inc = 1;
	int one = 1;
	double a, b;
	double tol = 1.e-2;
	
	dcopy_(&n1n2, C, &inc, Work1, &inc);
	a = 1.0; b = 1;
	dgemm_(&NoTrans, &Trans, &npoint1, &npoint2, &one, &a, lambda, &npoint1, vone, &npoint2, &b, 
		Work1, &npoint1);
	dgemm_(&NoTrans, &Trans, &npoint1, &npoint2, &one, &a, vone, &npoint1, mu, &npoint2, &b, 
		Work1, &npoint1);

	vect_dcoupling(n1n2, Work1, beta, nthreads);

	a = 1; b = 0;
	dgemv_(&NoTrans, &npoint1, &npoint2, &a, Work1, &npoint1, vone, &inc, &b, drow, &inc);
	dgemv_(&Trans, &npoint1, &npoint2, &a, Work1, &npoint1, vone, &inc, &b, dcol, &inc);

	for(int j = 0; j < n1; j++) B[j] = -row[j];
	for(int j = 0; j < n2; j++) B[j+n1] = -col[j];

//	double t = 1.e-10;
//	for(int j = 0; j < n1; j++) drow[j] += t;
//	for(int j = 0; j < n2; j++) dcol[j] += t;

	conjgrad.cgDriver(n1, n2, drow, dcol, Work1, B, X, &Work2[0], &Work2[n1pn2],
	&Work2[2*n1pn2], &Work2[3*n1pn2], &Work2[4*n1pn2], &Work2[5*n1pn2], tol);

	dcopy_(&n1, X, &inc, drow, &inc);
	dcopy_(&n2, &X[n1], &inc, dcol, &inc);
	dcol[n2]=0;

/*
	for(int i = 0; i < n1; i++) {
		std::cout << "drow[i] = " << drow[i] << std::endl;
	}
	std::cout << " " << std::endl;
	for(int i = 0; i < n2; i++) {
		std::cout << "dcol[i] = " << dcol[i] << std::endl;
	}
	std::cout << " " << std::endl;
	exit(1);
*/

  }

/* ===============================================================================================
   solveG

   Input:
   =====
	npoint1:	number of points for point set 1
	m1:		measure on points1
	npoint2:	number of points for point set 2
	m2:		measure on points2
	C:		cost matrix
	beta:		parameter beta
	tol:		tolerance criteria
	nthreads:	number of threads for parallel computation
	
   Output:
   ======
	G:		coupling matrix
   =============================================================================================== */

  void OT1::solveG(int npoint1, double *m1, int npoint2, double *m2, double *C, double *G, double *lambda, 
	double *mu, double beta, double tol, int *niter, int nthreads)
  {

	// Damped Newton iteration on (lambda, mu) at fixed beta.
	// Each iteration computes a Newton direction (drow, dcol), then halves the
	// step until the residual returned by computeRC decreases. The iteration
	// stops when the residual is <= tol, after maxNewton accepted steps, or as
	// soon as a line search fails to find a decreasing step.
	// On exit lambda and mu hold the last accepted values, G the matching
	// coupling matrix and *niter the number of accepted steps.
	// mu[npoint2-1] is never modified.

	const int maxNewton = 50;
	const int maxHalvings = 30;

	double *l_try = new double[npoint1];
	double *m_try = new double[npoint2];
	std::fill(l_try, l_try+npoint1, 0.0);
	std::fill(m_try, m_try+npoint2, 0.0);
	// The last mu entry is held fixed; trial points must carry the same value
	// as mu so that the residual is evaluated where the step would be committed
	if(npoint2 > 0) m_try[npoint2-1] = mu[npoint2-1];

	int n1 = npoint1;
	int n2 = npoint2-1;

	double err;

	computeRC(npoint1, m1, npoint2, m2, C, beta, lambda, mu, &err, nthreads);

	int iter = 0;
	while (err > tol)
	{
		const double err_old = err;
  		computedX_direct(n1, n2, beta, C, lambda, mu, nthreads);
//  		computedX_iter(n1, n2, beta, C, lambda, mu, nthreads);

		// Backtracking line search. nstep counts the rejected trials, so it
		// equals maxHalvings only when every trial failed. (It was previously
		// never incremented, so a failed search still committed its last trial.)
		double step = 1.0;
		int nstep = 0;
		for(; nstep < maxHalvings; nstep++) {
			for(int i = 0; i < npoint1; i++) l_try[i] = lambda[i] + step*drow[i];
			for(int j = 0; j < n2; j++) m_try[j] = mu[j] + step*dcol[j];
			computeRC(npoint1, m1, npoint2, m2, C, beta, l_try, m_try, &err, nthreads);
			if(err < err_old) break;
			step = step/2;
		}
		if(iter==maxNewton || nstep==maxHalvings) break;

		// Accept the trial point
		std::copy(l_try, l_try+npoint1, lambda);
		for(int j = 0; j < n2; j++) mu[j] = m_try[j];

		iter++;
	}

	*niter = iter;

	computeG(npoint1, m1, npoint2, m2, C, beta, lambda, mu, G);

	delete [] l_try; delete [] m_try;
  }
		
/* ===============================================================================================
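   ot1

   Notes:
   =====
   Solves the optimal transport problem by calling solveG for increasing values
   of beta; lambda and mu obtained at one beta are the starting point for the next

   Input:
   =====
	npoint1:	number of points for point set 1
	m1:		measure on points1
	npoint2:	number of points for point set 2
	m2:		measure on points2
	C:		cost matrix
	beta1:		starting beta
	betaf:		upper limit on beta (the loop stops once beta >= betaf)
	iprint:		if 1, print one summary line per beta value
	init:		if 0, work arrays are allocated by calling initOT1
	nthreads:	number of threads for parallel computation
	
   Output:
   ======
	dist:		Optimal transport distance (return value)
	G:		coupling matrix
	lambda:		Lagrange multipliers lambda at the last beta
	mu:		Lagrange multipliers mu at the last beta
	Fopt:		free energy at the last beta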
   =============================================================================================== */

   double OT1::ot1(int npoint1, double *m1, int npoint2, double *m2, double *C,
	double *G, double *lambda, double *mu, double beta1, double betaf, double *Fopt, 
	int iprint, int init, int nthreads)
  {

	// Solves the transport problem for beta = beta1, beta1*10^(1/4), ... while
	// beta < betaf, restarting each solve from the previous lambda and mu.
	// Returns sum(G.*C) for the last beta and stores the matching free energy
	// in *Fopt. If beta1 >= betaf no solve is performed: G is left untouched,
	// the return value is 1.0 and *Fopt is 0.

	const char *rule = "=====================================================================================================";

	int n1n2 = npoint1*npoint2;
	int inc = 1;

	// Dimension and Initialize all arrays

	if(init==0) {
		initOT1(npoint1, npoint2);
	}

	// Initialize auxiliary variables lambda and mu to zero.
	// (A random initialisation used to precede this and was overwritten at
	// once; it has been removed, so the global rand() state is no longer consumed.)
	std::fill(lambda, lambda+npoint1, 0.0);
	std::fill(mu, mu+npoint2, 0.0);

	double dist = 1.0;
	double F = 0;
	double err_row, err_col;
	int niter;

	if(iprint == 1) {
		std::cout << " " << std::endl;
		std::cout << "        " << rule << std::endl;
		std::cout << "        " << "       Beta           Iter              U                  Ent             Err_row         Err_col   " << std::endl;
		std::cout << "        " << rule << std::endl;
	}

	// Fixed tolerance for every solve (a beta-dependent value used to be
	// computed and then immediately replaced by this constant)
	const double tol = 1.e-5;

	// beta is multiplied by 10^(1/4) at each step
	const double coef = std::sqrt(std::sqrt(10.0));

	double beta_val = beta1;
	while(beta_val < betaf)
	{

		solveG(npoint1, m1, npoint2, m2, C, G, lambda, mu, beta_val, tol, &niter, nthreads);

		checkMarginals(npoint1, m1, npoint2,  m2, G, &err_row, &err_col);
		dist = ddot_(&n1n2, G, &inc, C, &inc); 
		F = computeF(npoint1, m1, npoint2, m2, C, beta_val, lambda, mu);

		if(iprint==1) {
			// The entropy of G is only reported, so it is only computed when printing
			double ent = 0;
			for(int i = 0; i < n1n2; i++) {
				if(G[i]>0.0) ent -= G[i]*std::log(G[i]);
			}

			std::cout << "        " << "   " << std::setw(10)<< beta_val << "    ";
			std::cout << std::setw(10) << niter << "        " << std::setw(10) << dist << "        ";
			std::cout << std::setw(10) << ent <<  "        ";
			std::cout << std::setw(10) << err_row <<  "        " << err_col << std::endl;
		}

		beta_val = beta_val*coef;
	}


	if(iprint==1) {
		std::cout << "        " << rule << std::endl;
		std::cout << " " << std::endl;
	}

	*Fopt = F;
	return dist;

  }

/* ===============================================================================================
   Allocate and initialize the work arrays used by OT1
   =============================================================================================== */

  void OT1::initOT1(int npoint1, int npoint2)
  {

	// Allocates every work array of the class for an npoint1 x npoint2 problem
	// and fills vone with ones. Nothing is released here: calling this twice on
	// the same object leaks the arrays of the first call.

	int nmax = std::max(npoint1, npoint2);

	// Work2 doubles as LAPACK workspace (lwork = 128*n in computedX_direct) and
	// as storage for the CG work vectors, hence at least 128 entries per row
	int n = std::max(nmax, 128);

	Work1 = new double[npoint1*npoint2];
	Work2 = new double[nmax*n];
	Jac   = new double[nmax*nmax];
	Jac2  = new double[nmax*nmax];
	vone  = new double[nmax];
	iwork = new int[nmax];
	IPIV  = new int[nmax];
	row = new double[npoint1];
	col = new double[npoint2];
	drow = new double[npoint1];
	dcol = new double[npoint2];

	// B and X hold npoint1+npoint2-1 entries in computedX_iter; 2*nmax covers that.
	// (X used to be allocated twice, which leaked the first buffer.)
	B = new double[2*nmax];
	X = new double[2*nmax];

	for(int i = 0; i < nmax; i++) vone[i] = 1.0;

  }


#endif
