/* ===============================================================================================
 * Device: add a single number to diagonal of matrix
   =============================================================================================== */

template <typename T>
  __global__ void kernel_add_diag(int shift, T val, T *H, int dsize)
  {
	// Arguments:
	//   shift  (input) stride between consecutive updated entries is shift+1
	//          (presumably the leading dimension of a column-major H -- confirm with caller)
	//   val    (input) value added to each selected entry
	//   H      (input/output) matrix; entry idx + shift*idx is incremented by val
	//   dsize  (input) number of entries to update
	// Expects a 1D launch with at least dsize threads in total.
	const int idx = threadIdx.x+blockDim.x*blockIdx.x;
	if (idx < dsize) {
		// Offset computed in 64 bits: shift*idx exceeds INT_MAX as soon as the
		// matrix holds more than 2^31 elements.
		const long long pos = idx + static_cast<long long>(shift) * idx;
		H[pos] += val;
	}
  }

/* ===============================================================================================
  copyDiag

	Purpose:
	========
		copy vector to diagonal of matrix

	Arguments:
	==========

	N	(input) integer
		on Entry, leading dimension of matrix H

	V	(input) array of floats or double
		on Entry, vector to be copied along diagonal of H

	H	(input/output) matrix of floats or double
		on Entry, matrix
		on Exit, matrix whose diagonal has been modified by v

	dsize	(input) integer
		on Entry, number of elements of V to be copied

	
   =============================================================================================== */

 template <typename T>
  __global__ void copyDiag(int N, T *V, T *H, int dsize)
  {
        // One thread per diagonal entry; expects a 1D launch with at least
        // dsize threads in total. Threads past dsize do nothing.
        const int idx = threadIdx.x+blockDim.x*blockIdx.x;
        if (idx < dsize) {
                // Offset of H(idx,idx) computed in 64 bits: N*idx exceeds INT_MAX
                // as soon as the matrix holds more than 2^31 elements.
                const long long pos = idx + static_cast<long long>(N) * idx;
                H[pos] = V[idx];
        }
  }

/*================================================================================================
 elementWise

	Purpose:
	========
	Computes elementwise product of two vectors d_x, d_y, with output in d_y

	Arguments:
	==========

	d_x	(input) array of float or double
		on Entry, the first vector x

	d_y	(input/output) array of float or double
		on Entry, the second vector y
		on Exit, the  element wise product x .* y

	nval	(input) integer
		on Entry, the number of elements of x and y

================================================================================================== */

 template <typename T>
 __global__ void elementWise(T *d_x, T *d_y, int nval)
 {
	// Global thread index; each thread handles at most one entry.
	const int gid = blockIdx.x*blockDim.x + threadIdx.x;

	// Threads beyond the end of the vectors have nothing to do.
	if (gid >= nval) return;

	const T lhs = d_y[gid];
	const T rhs = d_x[gid];

	// y <- y .* x, stored back into y
	d_y[gid] = lhs*rhs;
  }

/*================================================================================================
 scale

	Purpose:
	========
	scale and add a constant to a vector: x = s*x + C

	Arguments:
	==========

	s	(input) float or double
		on Entry, the scale

	C	(input) float or double
		on Entry, the constant

	x	(input/output) array of float or double
		on Entry, the vector x
		on Exit, the  vector s*x + C

	nval	(input) integer
		on Entry, the number of elements of x

================================================================================================== */

 template <typename T>
 __global__ void scale(T s, T C, T *x, int nval)
 {
	// Global thread index; each thread handles at most one entry.
	const int gid = blockIdx.x*blockDim.x + threadIdx.x;

	// Threads beyond the end of the vector have nothing to do.
	if (gid >= nval) return;

	// x <- s*x + C
	const T old = x[gid];
	x[gid] = s*old + C;
  }

/*================================================================================================
 zeroLowerTriangle

	Purpose:
	========
	Set to 0 the strictly lower triangular part of a matrix (the diagonal is left untouched)

	Arguments:
	==========

	A	(input/output) array of float or double
		on Entry, the matrix (stored column wise)
		on Exit, the entries strictly below the diagonal are 0

	N	(input) integer
		size of the matrix (considered square)

================================================================================================== */

template <typename T>
__global__ void zeroLowerTriangle(T *A, int N) {

	// 2D launch: the y dimension indexes rows, the x dimension indexes columns.
	// The grid must cover at least N x N threads.
	const int row = blockIdx.y * blockDim.y + threadIdx.y;
	const int col = blockIdx.x * blockDim.x + threadIdx.x;

	// Only entries strictly below the diagonal are cleared.
	if (row < N && col < N && row > col) {
		// Column-major offset computed in 64 bits: col*N exceeds INT_MAX as soon
		// as the matrix holds more than 2^31 elements.
		const long long pos = static_cast<long long>(col) * N + row;
		A[pos] = T(0);
	}
}

/*================================================================================================
 extractSignDiag

	Purpose:
	========
	Extract the signs of the diagonal element of a matrix

	Arguments:
	==========

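	d_sdiag	(output) array of float or double
		on Exit, d_sdiag[i] holds the sign (-1, 0 or +1) of the i-th diagonal element of d_A

	d_A	(input) array of float or double
		on Entry, the matrix (stored column wise, leading dimension N)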

	N	(input) integer
		size of the matrix (considered square)

================================================================================================== */

template <typename T>
__global__ void extractSignDiag(T *d_sdiag, T *d_A, int N) {

	// One thread per diagonal entry; expects a 1D launch with at least N threads.
	const int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < N) {
		// Offset of A(i,i) computed in 64 bits: i*(N+1) exceeds INT_MAX as soon
		// as the matrix holds more than 2^31 elements.
		const long long pos = static_cast<long long>(i) * (static_cast<long long>(N) + 1);
		const T diag = d_A[pos];

		// Branch-free sign: +1 if positive, -1 if negative, 0 otherwise.
		d_sdiag[i] = (T(0) < diag) - (diag < T(0));
	}
}

/*================================================================================================
 processT0

	Purpose:
	========
	Fill s+1 consecutive diagonal entries of d_TM, starting at (offset, offset), together with
	the entries directly next to them (one index after, and N indices after, each diagonal
	entry), from ratios of the entries of d_A and d_B.
	The work is sequential: only thread 0 of block 0 does anything.

	Arguments:
	==========

	d_TM	(input/output) array of float or double, indexed with stride N
		on Exit, entries k*N+k, k*N+k+1 and (k+1)*N+k are set for k = offset..offset+s

	N	(input) integer
		stride used to index d_TM

	d_A	(input) array of float or double
		entries 0..s+1 are read

	d_B	(input) array of float or double
		entries 0..s are read

	s	(input) integer
		number of steps after the first one

	offset	(input) integer
		index of the first diagonal entry written

================================================================================================== */
template <typename T>
__global__ void processT0(T *d_TM, int N, T *d_A, T *d_B, int s, int offset) {

	// Every thread except the first one of the first block exits immediately.
	if (threadIdx.x != 0 || blockIdx.x != 0) return;

	// First step: the diagonal entry is the plain ratio B[0]/A[0].
	T prevRatio = d_B[0] / d_A[0];
	const T firstOff = d_A[1] / d_A[0];

	const int first = offset*N + offset;
	d_TM[first]     = prevRatio;
	d_TM[first + 1] = firstOff;
	d_TM[first + N] = firstOff;

	// Following steps: the diagonal entry is the difference of two consecutive ratios.
	for (int step = 1; step <= s; ++step) {
		const int k = offset + step;
		const int diag = k*N + k;

		const T ratio   = d_B[step]/d_A[step];
		const T offDiag = d_A[step+1]/d_A[step];

		d_TM[diag]     = ratio - prevRatio;
		d_TM[diag + 1] = offDiag;
		d_TM[diag + N] = offDiag;

		prevRatio = ratio;
	}
}

/*================================================================================================
 processTk

	Purpose:
	========
	Fill consecutive diagonal entries of d_TM, starting at (offset, offset), together with the
	entries directly next to them (one index after, and N indices after, each diagonal entry).
	The first diagonal entry is d_C[0]; the following ones are built from ratios of the entries
	of d_A and d_B.
	The work is sequential: only thread 0 of block 0 does anything.

	Arguments:
	==========

	d_TM	(input/output) array of float or double, indexed with stride N
		on Exit, entries k*N+k, k*N+k+1 and (k+1)*N+k are set for k = offset, and also
		for k = offset+1..offset+s-1 when s > 1

	N	(input) integer
		stride used to index d_TM

	d_A	(input) array of float or double

	d_B	(input) array of float or double

	d_C	(input) array of float or double
		only d_C[0] is read

	s	(input) integer
		number of diagonal entries to fill (at least one is always written)

	offset	(input) integer
		index of the first diagonal entry written

================================================================================================== */
template <typename T>
__global__ void processTk(T *d_TM, int N, T *d_A, T *d_B, T *d_C, int s, int offset) {

	// Every thread except the first one of the first block exits immediately.
	if (threadIdx.x != 0 || blockIdx.x != 0) return;

	// First step: diagonal taken from d_C, neighbours from d_A.
	const int first = offset*N + offset;
	d_TM[first]     = d_C[0];
	d_TM[first + 1] = d_A[0];
	d_TM[first + N] = d_A[0];

	// Nothing more to write when there is a single step.
	if (s <= 1) return;

	// Second step: diagonal is the first ratio minus d_C[0].
	T prevRatio = d_B[0]/d_A[0];
	const T secondOff = d_A[1]/d_A[0];

	const int second = (offset+1)*N + offset + 1;
	d_TM[second]     = prevRatio - d_C[0];
	d_TM[second + 1] = secondOff;
	d_TM[second + N] = secondOff;

	// Remaining steps: src indexes d_A/d_B and lags one behind the position in d_TM.
	for (int src = 1; src < s - 1; ++src) {
		const int k = offset + src + 1;
		const int diag = k*N + k;

		const T ratio   = d_B[src]/d_A[src];
		const T offDiag = d_A[src+1]/d_A[src];

		d_TM[diag]     = ratio - prevRatio;
		d_TM[diag + 1] = offDiag;
		d_TM[diag + N] = offDiag;

		prevRatio = ratio;
	}
}

/*================================================================================================
 inplace_dgemm_kernel

	Purpose:
	========
	In-place product A := A * B, where A is N x M and B is M x M, both stored column wise.
	Each thread block handles the row of A given by blockIdx.x: the row and a packed copy
	of B are staged in shared memory, then each thread computes some of the M outputs.

	Launch: 1D grid with at least N blocks; any 1D block size.
	Precondition (not checked): M <= MAX_M, otherwise the shared buffers overflow.

	Arguments:
	==========

	N	(input) integer
		number of rows of A

	M	(input) integer
		number of columns of A, and order of B

	A	(input/output) array of float or double
		on Entry, the matrix A
		on Exit, the product A * B

	lda	(input) integer
		leading dimension of A

	B	(input) array of float or double
		the M x M matrix B

	ldb	(input) integer
		leading dimension of B

================================================================================================== */
template <typename T, int MAX_M>
__global__ void inplace_dgemm_kernel(int N, int M, T* A, int lda, T* B, int ldb) 
{
	__shared__ T B_shared[MAX_M * MAX_M];  // packed copy of B: column-major, leading dimension M
	__shared__ T row[MAX_M];			   // temp row: A[row_id, :]

	const int row_id = blockIdx.x;
	const int tid = threadIdx.x;

	// blockIdx.x is the same for every thread of the block, so either the whole
	// block leaves here or none of it does: the barrier below stays safe.
	if (row_id >= N) return;

	// Load B into shared memory (collaboratively by threads).
	// The source is read with stride ldb and stored packed with stride M, so the
	// copy is correct when ldb > M and never indexes past M*M shared entries.
	for (int idx = tid; idx < M * M; idx += blockDim.x) {
		const int k = idx % M;
		const int j = idx / M;
		B_shared[idx] = B[k + static_cast<long long>(j) * ldb];
	}

	// Load current row A[row_id, :] into shared row[].
	// 64-bit offsets: j*lda exceeds INT_MAX for tall matrices (lda * M > 2^31).
	for (int j = tid; j < M; j += blockDim.x) {
		row[j] = A[row_id + static_cast<long long>(j) * lda];  // A(i,j)
	}

	// Both shared buffers must be complete before any thread reads them, and the
	// whole row must be saved before A is overwritten below.
	__syncthreads();

	// Compute A[i,j] = dot(row, B[:,j])
	for (int j = tid; j < M; j += blockDim.x) {
		T sum = T(0);
		for (int k = 0; k < M; ++k) {
			sum += row[k] * B_shared[k + j * M];
		}
		A[row_id + static_cast<long long>(j) * lda] = sum;
	}
}

/*================================================================================================
 inplace_dgemm_largeM_kernel

	Purpose:
	========
	In-place product A := A * B, where A is N x M and B is M x M, both stored column wise.
	Each thread block handles the row of A given by blockIdx.x: the row is staged in dynamic
	shared memory while B is read directly from global memory.

	Launch: 1D grid with at least N blocks; any 1D block size; at least M * sizeof(T) bytes
	of dynamic shared memory.

	Arguments:
	==========

	N	(input) integer
		number of rows of A

	M	(input) integer
		number of columns of A, and order of B

	A	(input/output) array of float or double
		on Entry, the matrix A
		on Exit, the product A * B

	lda	(input) integer
		leading dimension of A

	B	(input) array of float or double
		the M x M matrix B

	ldb	(input) integer
		leading dimension of B

================================================================================================== */
template <typename T>
__global__ void inplace_dgemm_largeM_kernel(int N, int M, T* A, int lda, T* B, int ldb) 
{
	// Dynamic shared memory is declared as raw bytes and cast to T. Declaring it as
	// "extern __shared__ T row[]" gives the same symbol a different type in each
	// template instantiation, which does not compile once both float and double are used.
	extern __shared__ __align__(16) unsigned char inplace_dgemm_largeM_smem[];
	T* row = reinterpret_cast<T*>(inplace_dgemm_largeM_smem);  // only stores one row

	const int row_id = blockIdx.x;
	const int tid = threadIdx.x;

	// blockIdx.x is the same for every thread of the block, so either the whole
	// block leaves here or none of it does: the barrier below stays safe.
	if (row_id >= N) return;

	// Load A[row_id, :] into row[].
	// 64-bit offsets: j*lda exceeds INT_MAX for tall matrices (lda * M > 2^31).
	for (int j = tid; j < M; j += blockDim.x)
		row[j] = A[row_id + static_cast<long long>(j) * lda];

	// The whole row must be saved before any entry of it is overwritten in A.
	__syncthreads();

	// Compute A[row_id, j] = dot(row, B[:,j])
	for (int j = tid; j < M; j += blockDim.x) {
		const T* bcol = B + static_cast<long long>(j) * ldb;  // column j of B (column-major)
		T sum = T(0);
		for (int k = 0; k < M; ++k)
			sum += row[k] * bcol[k];
		A[row_id + static_cast<long long>(j) * lda] = sum;
	}
}

