#include <cstdio>
#include <cuda_runtime.h>
// #include <cublas_v2.h>
#include <cublas.h>

#include "multb.h"
#include "profile.h"

// Threads per block used when launching the row-scaling kernels in this file.
// The kernels compute their row index as blockIdx.x * K + threadIdx.x, so they
// must be launched with exactly K threads per block.
#define K 64

cuda_exception::cuda_exception(const char *file, int line, int code)
{
  // Render "CUDA error #<code> at <file>:<line>" into the exception's message
  // buffer. snprintf is given the buffer size, so an over-long file name is
  // truncated rather than written past the end of the buffer.
  const size_t capacity = sizeof(message);
  snprintf(message, capacity, "CUDA error #%i at %s:%i\n", code, file, line);
}

const char* cuda_exception::what() const throw()
{
  // Hand back the text that the constructor formatted into `message`.
  return message;
}

// void cublas_dgemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
void cublas_dgemm(char transa, char transb,
  int m, int n, int k, double alpha, double *a, int lda, double *b, int ldb, double beta, double *c, int ldc) {
  // Profiled wrapper around the legacy (handle-free) cuBLAS DGEMM,
  //   C = alpha * op(A) * op(B) + beta * C.
  // Every argument is forwarded unchanged to cublasDgemm; a, b and c are
  // presumably device pointers, as that API expects -- confirm at call sites.
  // Errors are reported through CUDACHECK (defined outside this file).

  // Drain previously queued GPU work before PROFILE_BEGIN so it is not
  // charged to this call (assumes PROFILE_BEGIN/PROFILE_END measure the
  // enclosed region -- confirm in profile.h).
  cudaThreadSynchronize();
  PROFILE_BEGIN();
  // cuBLAS v2 equivalent, kept for reference:
  // CUDACHECK(cublasDgemm(handle, transa, transb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc));
  cublasDgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
  CUDACHECK(cublasGetError());
  // The GEMM executes asynchronously: wait for it, and check the result so a
  // failure during execution is reported here instead of being dropped.
  CUDACHECK(cudaThreadSynchronize());
  // 2 * m * n * k operations (one multiply and one add per inner-product term).
  PROFILE_END(profile_cudgemm, 2.0 * k * m * n);
}

__global__ void scalerow_kernel(int n, const double *h, const double *A, double *B)
{
  // B = diag(h) * A for n x n matrices addressed as M[i + j * n]:
  //   B[i + j * n] = A[i + j * n] * h[i]   for j = 0 .. n-1
  //
  // Launch layout: 1-D grid with K threads per block, one thread per row.
  const int row = blockIdx.x * K + threadIdx.x;
  if (row >= n)
    return; // surplus threads of the last block have no row to process

  const double scale = h[row];

  // Walk along the row: consecutive elements of one row are n entries apart.
  const double *src = A + row;
  double *dst = B + row;
  for (int col = 0; col < n; ++col, src += n, dst += n)
    *dst = *src * scale;
}

void scalerow_gpu(int n, const double *h, const double *A, double *B)
{
  // Host launcher for scalerow_kernel: B = diag(h) * A for n x n matrices.
  // h, A and B are passed straight to the kernel, so they must be
  // device-accessible pointers. Errors are reported through CUDACHECK
  // (defined outside this file).

  // Nothing to scale. Without this guard n == 0 would request a grid of zero
  // blocks, which is an invalid launch configuration and would be reported as
  // an error for a perfectly valid empty matrix.
  if (n <= 0)
    return;

  // One thread per row, K threads per block, rounded up to whole blocks.
  const int nk = n / K + (n % K != 0 ? 1 : 0);
  dim3 grid(nk), block(K);

  // Drain previously queued GPU work before PROFILE_BEGIN so it is not
  // charged to this call (assumes PROFILE_BEGIN/PROFILE_END measure the
  // enclosed region -- confirm in profile.h).
  cudaThreadSynchronize();
  PROFILE_BEGIN();
  scalerow_kernel <<< grid , block >>> (n, h, A, B);
  CUDACHECK(cudaGetLastError()); // launch-time errors (bad configuration, ...)
  // Errors raised while the kernel runs only surface at the next
  // synchronization point, so the result of the sync must be checked too.
  CUDACHECK(cudaThreadSynchronize());
  // One multiplication per matrix element; computed in double because
  // n * n overflows int once n exceeds 46340.
  PROFILE_END(profile_scalerowg, (double) n * n);
}

// Texture reference bound to the scaling vector h by scalerowcol_gpu.
// Texels are int2 so that a double could be fetched as two 32-bit halves and
// reassembled with __hiloint2double. The fetch in scalerowcol_kernel is
// currently commented out, so nothing in this file reads through it.
texture<int2> tex_h;

__global__ void scalerowcol_kernel(int n, const double *h, double *A, int offset)
{
  // In place: A = diag(h) * A * diag(h)^-1, i.e.
  //   A[i + j * n] = h[i] * A[i + j * n] / h[j]
  // for an n x n matrix addressed as A[i + j * n].
  //
  // Launch layout: 1-D grid with K threads per block, one thread per row.
  // `offset` is the element offset for reading h through tex_h. It is unused
  // while the texture path below is disabled, but stays in the signature so
  // existing launches keep working.
  int i = blockIdx.x * K + threadIdx.x; // row index for this thread
  if (i < n) {
    const double f = h[i];
    // Element index kept in size_t: i + j * n evaluated in int overflows once
    // n exceeds 46340.
    size_t idx = i;
    for (int j = 0; j < n; j++, idx += n) {
      A[idx] = f * A[idx] / h[j];
      // Disabled alternative that reads h[j] through the texture
      // (it did not work on the GTX480):
      //   int2 v = tex1Dfetch(tex_h, j + offset);
      //   A[idx] = f * A[idx] / __hiloint2double(v.y, v.x);
    }
  }
}

void scalerowcol_gpu(int n, const double *h, double *A)
{
  // Host launcher for scalerowcol_kernel: A = diag(h) * A * diag(h)^-1, in
  // place, for an n x n matrix. h and A are passed straight to the kernel, so
  // they must be device-accessible pointers. Errors are reported through
  // CUDACHECK (defined outside this file).

  // Nothing to scale. Without this guard n == 0 would request a grid of zero
  // blocks, which is an invalid launch configuration and would be reported as
  // an error for a perfectly valid empty matrix. The texture is not (re)bound
  // in this case.
  if (n <= 0)
    return;

  // Bind h to tex_h with unnormalized coordinates, clamped addressing and
  // point sampling.
  // NOTE(review): the kernel's texture fetch is currently commented out, so
  // this binding and the offset passed below are not used by it; the binding
  // is kept in case other code relies on tex_h being bound -- confirm before
  // removing. The texture is never unbound here.
  tex_h.normalized     = false;
  tex_h.addressMode[0] = cudaAddressModeClamp;
  tex_h.filterMode     = cudaFilterModePoint;
  size_t offset;
  CUDACHECK(cudaBindTexture(&offset, tex_h, h));

  // One thread per row, K threads per block, rounded up to whole blocks.
  const int nk = n / K + (n % K != 0 ? 1 : 0);
  dim3 grid(nk), block(K);

  // Drain previously queued GPU work before PROFILE_BEGIN so it is not
  // charged to this call (assumes PROFILE_BEGIN/PROFILE_END measure the
  // enclosed region -- confirm in profile.h).
  cudaThreadSynchronize();
  PROFILE_BEGIN();
  // cudaBindTexture reports the offset in bytes; the kernel takes it in
  // units of double.
  scalerowcol_kernel <<< grid , block >>> (n, h, A, static_cast<int>(offset / sizeof(double)));
  CUDACHECK(cudaGetLastError()); // launch-time errors (bad configuration, ...)
  // Errors raised while the kernel runs only surface at the next
  // synchronization point, so the result of the sync must be checked too.
  CUDACHECK(cudaThreadSynchronize());
  // One multiply and one divide per element; computed in double because
  // n * n * 2 overflows int once n exceeds 32767.
  PROFILE_END(profile_scalerowcol, 2.0 * n * n);
}

