#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#include <sys/stat.h>
#include <fcntl.h>

#include <scash.h>
#include <scashcrt.h>
#include "ompsm.h"


#ifdef __HAS_ATOMIC_H
#ifdef __PGI
/*
 * Use a replacement for <asm/atomic.h>, because the PGI compiler
 * cannot compile the kernel header file "/usr/include/asm/atomic.h".
 */
#include <atomic_for_pgi.h>
#else /* __PGI */
#define  __SMP__
#define  CONFIG_SMP
#define _LINUX_CONFIG_H
#define extern  static
#include  <asm/atomic.h>
#undef  extern
#endif /* __PGI */

#else

# ifdef OMNI_CPU_I386

/*
 * Fallback definition of atomic_t, used on i386 when __HAS_ATOMIC_H is
 * not defined.
 */
typedef struct {
  volatile int counter;
} atomic_t;

/* Plain store to / load from the counter (no lock prefix involved). */
#define atomic_set(v,i)		(((v)->counter) = (i))
#define atomic_read(v)		((v)->counter)

/* Bodies are emitted as assembly inside __func_atomic() below. */
void atomic_inc(atomic_t *);
void atomic_dec(atomic_t *);


/*
 * Container function whose body is a sequence of asm statements that
 * define the labels atomic_inc and atomic_dec.  Each routine sets up a
 * frame, loads its first stack argument (8(%ebp)) into %eax and applies
 * a lock-prefixed incl/decl to the word at that address.
 * NOTE(review): __func_atomic itself is presumably never called; it only
 * hosts the assembly text -- confirm.
 */
void
__func_atomic ()
{
  /* atomic_inc (atomic_t *); */
  asm(" 	.align 4				");
#ifndef __INTEL_COMPILER
  asm("		.type	 atomic_inc,@function		");
#endif
  asm("atomic_inc:					");
  asm("		pushl	%ebp				");
  asm("		movl	%esp, %ebp			");
  asm("		movl	8(%ebp), %eax			");
  asm("		lock ; incl (%eax)			");
  asm("		popl	%ebp				");
  asm("		ret					");
  asm(".Latomic_inc_end:				");
  asm("		.size	 atomic_inc,.Latomic_inc_end-atomic_inc	");

  /* atomic_dec (atomic_t *); */
  asm("		.align 4				");
#ifndef __INTEL_COMPILER
  asm("		.type	 atomic_dec,@function		");
#endif
  asm("atomic_dec:					");
  asm("		pushl	%ebp				");
  asm("		movl	%esp, %ebp			");
  asm("		movl	8(%ebp), %eax			");
  asm("		lock ; decl (%eax)			");
  asm("		popl	%ebp				");
  asm("		ret					");
  asm(".Latomic_dec_end:				");
  asm("		.size	 atomic_dec,.Latomic_dec_end-atomic_dec	");
}

#else /* OMNI_CPU_I386 */

#error "#########################################################"
#error "This processor's atomic operation is not defined, here"
#error "#########################################################"

#endif /* OMNI_CPU_I386 */
#endif /* __HAS_ATOMIC_H */


/* Default size of the shared argument stack (one SCASH page). */
#define SCASH_MAX_ARGS_SIZE	SCASH_PAGE_SIZE  /* 4K */
/* Default size of the shared temporary buffer (eight SCASH pages). */
#define SCASH_BUFF_SIZE		(SCASH_PAGE_SIZE * 8)

#define CACHE_LINE_SIZE		32


/* Acquire / release the SCASH lock numbered LOCK1. */
#define SCASH_LOCK0		scash_lock(LOCK1)
#define SCASH_UNLOCK0		scash_unlock(LOCK1)
/* True when addr lies in [_ompsm_scash_memory_top, _ompsm_scash_memory_tail). */
#define IS_GLOBAL_MEMORY(addr)	(_ompsm_scash_memory_top <= (char *)(addr) && (char *)(addr) < _ompsm_scash_memory_tail)


/*
 * SCASH interface.
 *
 * Page arithmetic helpers.  All three cast their argument to int before
 * masking, so they only preserve values that fit in an int.
 *   SCASH_PAGE_OFFSET(X)  : offset of X inside its page
 *   SCASH_PAGE_MASK(X)    : X rounded down to a page boundary
 *   SCASH_PAGE_ROUNDUP(X) : X rounded up to a page boundary
 */
#define SCASH_PAGE_OFFSET(X)	((int)(X) & ((SCASH_PAGE_SIZE) - 1))
#define SCASH_PAGE_MASK(X)	((int)(X) & ~((SCASH_PAGE_SIZE)-1))
#define SCASH_PAGE_ROUNDUP(X)	(SCASH_PAGE_MASK(((int)(X) + SCASH_PAGE_SIZE - 1)))


/*
 * MBAR(): memory barrier.  On i386 it expands to an empty block (the
 * cpuid instruction is commented out); on SPARC it issues "stbar".
 * Any other processor is rejected at compile time.
 */
# ifdef OMNI_CPU_I386
# define MBAR() { /* asm("cpuid"); */ }

#else 
# ifdef OMNI_CPU_SPARC
# define MBAR() asm("stbar"); 
#else

#error "################################################################"
#error "This processor's memory barrier operation is not defined, here"
#error "################################################################"

#endif /* OMNI_CPU_SPARC */
#endif /* OMNI_CPU_I386 */

/* WAIT(cond): polls SCASH once; the cond argument is not evaluated. */
#define WAIT(cond){ scash_poll(); }


int 		_ompc_scash_dist_flag = TRUE;
/* Set to TRUE by _ompsm_scash_terminate_handler; tested in _ompsm_scash_finalize. */
static int	_ompsm_terminate_flag = FALSE;

/* private data of this layer */
/* Written by the loop_in/loop_out/count_lock0 routines below. */
static volatile int _scash_in_flag = 0;

/* only in master */
/* Shared argument stack: size, current top and end (see _ompsm_scash_alloca). */
int	_scash_args_size = SCASH_MAX_ARGS_SIZE;
caddr_t _scash_args_sp;
caddr_t _scash_args_endp;

/* Shared staging buffer used by _ompsm_scash_bcast_thdprv. */
int	_scash_tmp_buff_size = SCASH_BUFF_SIZE;
char	*_scash_tmp_buff;


/* 
 * shared data
 *
 * Allocated with scash_alloc() in _ompsm_scash_init and updated under the
 * flush lock by the loop_in/loop_out/count_lock0 routines.
 */
struct ompsm_scash_data {
  /* IN/OUT barrier */
  volatile int in_count;	/* incremented by _ompsm_scash_loop_in_lock0 */
  volatile int out_count;	/* incremented by loop_out / count_lock0 */
};
static struct ompsm_scash_data *_ompsm_scash_data;


/* Set to LOCK_TAIL + scash_pe() in _ompsm_scash_init. */
static int _scash_lock_sync;


/*static int _scash_lock_count = 2;*/
/* Bounds of the SCASH global memory region, set by the master in _ompsm_scash_init. */
char * _ompsm_scash_memory_top;
char * _ompsm_scash_data_top;
char * _ompsm_scash_memory_tail;


/* Forward declarations for routines defined later in this file. */
void _ompsm_scash_slave_wrapper(void);

void _ompsm_scash_lock(volatile _ompc_lock_t *lp);
void _ompsm_scash_unlock(volatile _ompc_lock_t *lp);

void _ompsm_scash_data_dist_rec (struct data_map_entry *, char *, int);
void _ompsm_scash_cyclic_dist (char *, char *, int, int, int, int, int);


/*
 * Re-entrant spin lock used as the "flush lock" (see _ompsm_spin_lock).
 */
struct ompsm_nlock {
  volatile int	lock;	/* 0 = free, non-zero = held */
  volatile int	id;	/* _ompc_node_id of the holder, -1 when free */
  volatile int	count;	/* nesting depth of the holder */
};

void _ompsm_scash_flush_lock_init (void);
void _ompsm_spin_init_lock (struct ompsm_nlock *lp);
void _ompsm_spin_lock (struct ompsm_nlock *lp);
void _ompsm_spin_unlock (struct ompsm_nlock *lp);
int  _ompsm_spin_islock (struct ompsm_nlock *lp);

/*
 * Initialize the SCASH layer.
 *
 * Computes the global memory size, calls scash_initialize(), records the
 * node id / node count and sets up the flush lock.  On PE 0 (master) the
 * shared buffers and bookkeeping structures are allocated and their
 * addresses distributed; every other PE enters
 * _ompsm_scash_slave_wrapper(), which does not return.
 */
void 
_ompsm_scash_init (int argc, char **argv)
{
  int	memsize, sz;

  /* initialize SCASH */
  /* total = page-rounded (tmp buffer + __G__) + shared data + global data
   *         + argument stack + free shared memory */
  sz = _scash_tmp_buff_size + sizeof(*__G__);
  memsize = (SCASH_PAGE_ROUNDUP(sz) + 
	     sizeof(struct ompsm_scash_data) +
	     _ompsm_gdata_size +
	     _scash_args_size +
	     _ompsm_free_shmem_size);

  /* NOTE(review): the message says "_ompc_scash_init" although this
   * function is _ompsm_scash_init. */
  if(_ompc_debug_flag)
    fprintf (stderr, "_ompc_scash_init shared memory size=%d(0x%x)\n",
	     memsize, memsize);

  scash_initialize (argc, argv, memsize);

  /* one OpenMP thread per SCASH PE */
  _ompc_node_id     = scash_pe ();
  _ompc_n_node      = scash_ps ();
  _ompc_num_threads = _ompc_n_node;
  _ompc_max_threads = _ompc_n_node;

  if (MAX_PROC < _ompc_max_threads) {
    _ompsm_scash_fatal ("# of PE is too large.");
  }

  _scash_lock_sync  = LOCK_TAIL + scash_pe ();

  _ompsm_scash_flush_lock_init ();

  /* initialize for MASTER node */
  if (scash_pe () == 0) {
    /* initialize shared data */
    _scash_tmp_buff = (char *) scash_alloc (_scash_tmp_buff_size);
    bzero (_scash_tmp_buff, _scash_tmp_buff_size);
    scash_distribute_caddr (&_scash_tmp_buff);

    __G__ = (struct _global_ *) scash_alloc (sizeof(*__G__));
    bzero (__G__,sizeof(*__G__));
    scash_distribute_caddr ((char **)&__G__);

    scash_page_align ();

    _ompsm_scash_data = 
      (struct ompsm_scash_data *)scash_alloc(sizeof(struct ompsm_scash_data));
    bzero (_ompsm_scash_data, sizeof(struct ompsm_scash_data));
    scash_distribute_caddr((char **)&_ompsm_scash_data);

    /* shared argument stack used by _ompsm_scash_alloca/_freea */
    _scash_args_sp = (caddr_t)scash_alloc(_scash_args_size);
    _scash_args_endp = _scash_args_sp + _scash_args_size;

    __G__->heap_top = _scash_args_sp;
    __G__->heap_tail = scash_alloc(0);

    /* first lock number handed out by _ompsm_scash_init_lock */
    __G__->_ompsm_scash_locknum = LOCK_TAIL + scash_ps ();

    /* NOTE(review): _ompsm_scash_memory_top is assigned here but, unlike
     * _ompsm_scash_data_top and _ompsm_scash_memory_tail, is not passed to
     * scash_distribute_caddr -- confirm whether IS_GLOBAL_MEMORY is meant
     * to work on non-master PEs. */
    _ompsm_scash_memory_top  = (char *)_scash_tmp_buff;
    _ompsm_scash_data_top    = (char *)_ompsm_scash_data;
    _ompsm_scash_memory_tail = _ompsm_scash_memory_top + memsize;
    scash_distribute_caddr (&_ompsm_scash_data_top);
    scash_distribute_caddr (&_ompsm_scash_memory_tail);
    /*scash_align();*/

    if(_ompc_debug_flag) {
      fprintf(stderr,"_ompsm_scash_init end (master)...\n");
    }

  /* initialize for SLAVE node */
  } else {
    /* exec thread invocation routine, not returned */
    _ompsm_scash_slave_wrapper ();
  }
}


/*
 * Release the slaves waiting in _ompsm_scash_slave_wrapper by entering
 * the BARRIER_SLAVE_START barrier.
 */
void
_ompsm_scash_start_slaves ()
{
  /* send trigger to _ompsm_slave_wrapper */
  scash_barrier (BARRIER_SLAVE_START);
}


/*
 * Print msg, prefixed with this node's id, to stderr and terminate the
 * process with exit status 1.
 */
void
_ompsm_scash_fatal (char *msg)
{
  fprintf(stderr,"fatal(id=%d):_ompsm_scash_fatal: %s\n", _ompc_node_id,msg);
  exit (1);
}


/*
 * Entry point for non-master PEs: wait at BARRIER_SLAVE_START, then run
 * _ompsm_slave_main().  If that ever returns it is reported as fatal.
 */
void
_ompsm_scash_slave_wrapper ()
{
  /* waiting trigger from _ompsm_scash_start_slaves */
  scash_barrier (BARRIER_SLAVE_START);

  _ompsm_slave_main();	/* call slave main */
  _ompsm_scash_fatal("??? slave_main return ???");
}


/*
 * Finalization hook.  When the terminate handler has already run
 * (_ompsm_terminate_flag is TRUE) nothing is done; otherwise stdout is
 * flushed and scash_exit(1) is called.
 */
void
_ompsm_scash_finalize ()
{
  if (_ompsm_terminate_flag != TRUE) {
    /* not a normal termination */
    fflush (stdout);
    scash_exit (1);
  }
}


/* 
 * master and slave
 */
/* Master side: enter the BARRIER_THREAD_BEGIN barrier. */
void
_ompsm_scash_master_begin()
{
  scash_barrier (BARRIER_THREAD_BEGIN);
}


/* Master side: enter the BARRIER_THREAD_END barrier. */
void
_ompsm_scash_master_end ()
{
  scash_barrier (BARRIER_THREAD_END);
}


/* Slave side: enter the BARRIER_THREAD_BEGIN barrier. */
void
_ompsm_scash_slave_begin ()
{
  scash_barrier (BARRIER_THREAD_BEGIN);
}


/* Slave side: enter the BARRIER_THREAD_END barrier. */
void 
_ompsm_scash_slave_end ()
{
  scash_barrier (BARRIER_THREAD_END);
}

/* 
 * memory allocation
 */
/*
 * Allocate SIZE bytes of global memory.
 *
 *   size : number of bytes requested
 *   mode : DEST_BLOCK  - page-aligned, home nodes set by the default
 *                        block distribution
 *          DEST_DIRECT - page-aligned, all pages homed on node ARGS
 *          DEST_NONE   - plain scash_alloc(), no home assignment
 *   args : home node number, used only for DEST_DIRECT
 *
 * Returns the allocated address, or NULL when the new heap tail would
 * pass _ompsm_scash_memory_tail.  With a single node the request is
 * served by malloc() instead.  An unknown mode is fatal.
 *
 * The whole operation runs under SCASH_LOCK0; the lock is released on
 * every return path (the failure path used to return with it held).
 */
caddr_t 
_ompsm_scash_alloc (int size, int mode, int args)
{
  caddr_t	p = 0;
  int		pages;


  if (_ompc_n_node == 1) {
    return malloc (size);
  }

  SCASH_LOCK0;

  /* allocate global memory */
  switch (mode) {
  case DEST_BLOCK:
  case DEST_DIRECT:
    p = scash_alloc_page_align (size);
    break;
  case DEST_NONE:
    p = scash_alloc (size);
    break;
  default:
    _ompsm_scash_fatal ("unknown destribute mode");
    break;
  }

  /* publish the new heap tail to the other nodes */
  _ompsm_scash_flush_lock ();
  scash_refresh ((caddr_t)&__G__->heap_tail, sizeof(__G__->heap_tail));
  __G__->heap_tail = p+size;
  scash_omp_flush ((caddr_t)&__G__->heap_tail, sizeof(__G__->heap_tail));
  _ompsm_scash_flush_unlock ();

  if (__G__->heap_tail <= _ompsm_scash_memory_tail) {
    /* success : set home node */
    switch (mode) {
    case DEST_BLOCK:
      _ompsm_scash_default_block_dist (p, size);
      break;
    case DEST_DIRECT:
      pages = (size + SCASH_PAGE_SIZE - 1) / SCASH_PAGE_SIZE;
      scash_home_fo (args, p, pages);
      break;
    case DEST_NONE:
      break;
    default:
      _ompsm_scash_fatal ("unknown destribute mode");
      break;
    }
  } else {
    /* failed : release the allocation lock before reporting the error */
    SCASH_UNLOCK0;
    if (_ompc_debug_flag)
      fprintf (stderr, "_ompsm_scash_alloc : can not allocate global memory.\n");
    return NULL;
  }

  SCASH_UNLOCK0;

  return p;
}


/*
 * Freeing global memory is not implemented: the function only reports
 * the leaked address on stderr (as a 64-bit value when ADDR_IS_64 is
 * defined) and releases nothing.
 */
void 
_ompsm_scash_free(void *addr)
{
#ifdef ADDR_IS_64
   fprintf (stderr, "_ompsm_scash_free is not implement. memory leak (0x%qx)\n",
	    (_omAddrInt_t)addr);
#else
   fprintf (stderr, "_ompsm_scash_free is not implement. memory leak (0x%x)\n",
	    (_omAddrInt_t)addr);
#endif /* ADDR_IS_64 */
}

/* Round an address up to the next multiple of 8 bytes. */
caddr_t _ompsm_scash_align_addr(caddr_t addr)
{
    _omAddrInt_t raw = (_omAddrInt_t)addr;

    raw = (raw + 7) & ~7;
    return (caddr_t )raw;
}

/* Round a byte count up to the next multiple of 8. */
int _ompsm_scash_align_size(int size)
{
    const int low_bits = 7;

    return (size + low_bits) & ~low_bits;
}

/*
 * Push SIZE bytes (rounded up to a multiple of 8) on the shared argument
 * stack and return the start of the new region.  The stack grows toward
 * higher addresses; running past _scash_args_endp is fatal.
 */
caddr_t _ompsm_scash_alloca(int size)
{
  caddr_t base = _scash_args_sp;

  _scash_args_sp += (size + 7) & ~7;
  if(_scash_args_endp < _scash_args_sp){
    _ompsm_scash_fatal("shared arg stack overflow");
  }

  return base;
}

/*
 * Pop SIZE bytes (rounded up to a multiple of 8) from the shared argument
 * stack.  After the pop the stack top must equal P, otherwise the call is
 * treated as a fatal error.
 */
void _ompsm_scash_freea(caddr_t p,int size)
{
  _scash_args_sp -= (size + 7) & ~7;
  if(p != _scash_args_sp){
    _ompsm_scash_fatal("shmem_freea: bad address");
  }
}


/*
 * Barrier
 */
/* Runtime barrier: enter the BARRIER_RUNTIME SCASH barrier. */
void
_ompsm_scash_barrier()
{
  scash_barrier (BARRIER_RUNTIME);
}


/*
 * Flush global memory under the flush lock.
 *
 *   addr : start of the region to flush, or NULL to flush everything
 *          from _ompsm_scash_data_top up to the current heap tail
 *   sz   : region size in bytes (ignored when addr is NULL)
 */
void
_ompsm_scash_flush (void *addr, int sz)
{
  _ompsm_scash_flush_lock ();

  scash_poll ();

  if (addr != NULL) {
    scash_omp_flush(addr, sz);
  } else {
    /* all global memory update */
    scash_omp_flush(_ompsm_scash_data_top,
		    ((int)__G__->heap_tail - (int)_ompsm_scash_data_top));
  }

  _ompsm_scash_flush_unlock ();
}


/* Acquire the SCASH lock LOCK1 (see SCASH_LOCK0). */
void _ompsm_scash_lock0()
{
  SCASH_LOCK0;
}

/* Release the SCASH lock LOCK1 (see SCASH_UNLOCK0). */
void _ompsm_scash_unlock0()
{
  SCASH_UNLOCK0;
}

/*
 * Take OMPSM_LOCK0, then fetch-and-increment the shared in_count under
 * the flush lock.  Returns the value in_count had before the increment.
 * OMPSM_LOCK0 is not released in this function.
 */
int _ompsm_scash_loop_in_lock0()
{
    struct ompsm_scash_data *dp = _ompsm_scash_data;
    int r;
    
    /* WAIT() only polls once; its argument is not evaluated */
    WAIT(_scash_in_flag);
    OMPSM_LOCK0();
    _ompsm_scash_flush_lock ();
    /* refresh, update, then flush the shared counter */
    scash_refresh ((char *)&(dp->in_count),sizeof(int));
    r = dp->in_count++;
    _scash_in_flag = 1;
    scash_omp_flush ((char *)&(dp->in_count),sizeof(int));
    _ompsm_scash_flush_unlock ();

    return r;
}

/*
 * Increment the shared out_count under OMPSM_LOCK0 and the flush lock.
 * When out_count reaches _ompc_n_node both counters are reset to zero,
 * _scash_in_flag is cleared and distributed with scash_distribute_int().
 */
void _ompsm_scash_loop_out()
{
    struct ompsm_scash_data *dp = _ompsm_scash_data;
    
    OMPSM_LOCK0();
    _ompsm_scash_flush_lock ();

    scash_refresh ((char *)&(dp->out_count),sizeof(int));
    dp->out_count++;
    if(dp->out_count == _ompc_n_node){   /* all thread exit */
	dp->out_count = 0;
	dp->in_count = 0;
	_scash_in_flag = 0;
	scash_distribute_int((int *)&_scash_in_flag);
	scash_omp_flush ((char *)&(dp->in_count),sizeof(int));
    }
    scash_omp_flush ((char *)&(dp->out_count),sizeof(int));
    
    _ompsm_scash_flush_unlock ();
    OMPSM_UNLOCK0();
}

/*
 * Take OMPSM_LOCK0, then fetch-and-increment the shared out_count under
 * the flush lock.  Returns the value out_count had before the increment.
 * When the incremented count equals _ompc_n_node, out_count is reset and
 * _scash_in_flag is cleared and distributed; otherwise _scash_in_flag is
 * set to 1.  OMPSM_LOCK0 is not released in this function.
 */
int _ompsm_scash_count_lock0()
{
    struct ompsm_scash_data *dp = _ompsm_scash_data;
    int r;

    /* WAIT() only polls once; its argument is not evaluated */
    WAIT(_scash_in_flag);
    OMPSM_LOCK0();
    _ompsm_scash_flush_lock ();

    scash_refresh ((char *)&(dp->out_count),sizeof(int));
    r = dp->out_count++;
    if(dp->out_count == _ompc_n_node){
      /* if all threads comes, clear flags */
      _scash_in_flag = 0;
      scash_distribute_int((int *)&_scash_in_flag);
      dp->out_count = 0;
    } else _scash_in_flag = 1;
    scash_omp_flush ((char *)&(dp->out_count),sizeof(int));

    _ompsm_scash_flush_unlock ();

    return r;
}


/*
 * SCASH lock function
 */
#if 1 /* SCASH_SUPPORT_TESTLOCK */
/*
 * Initialize the lock referenced by LP.
 *
 * Under LOCK_LOCKINIT and the flush lock, either reuses the head of the
 * shared free list (__G__->free_locks) or allocates a new lock object in
 * global memory and gives it the next SCASH lock number.  In both cases
 * the owner field is set to -1.  Allocation failure is fatal.
 */
void 
_ompsm_scash_init_lock(volatile _ompc_lock_t *lp)
{
  scash_lock (LOCK_LOCKINIT);
  _ompsm_scash_flush_lock ();

  scash_refresh ((caddr_t)&__G__->free_locks, sizeof(__G__->free_locks));
  if (__G__->free_locks != NULL) {
    /* reuse a previously destroyed lock */
    scash_refresh ((caddr_t)__G__->free_locks, sizeof(*(__G__->free_locks)));
    *lp = __G__->free_locks;
    __G__->free_locks = __G__->free_locks->next;
    scash_omp_flush ((caddr_t)&__G__->free_locks, sizeof(__G__->free_locks));
    (*lp)->owner = -1;
    scash_omp_flush ((caddr_t)&(*lp)->owner, sizeof((*lp)->owner));
  } else {
    /* no free lock: allocate a new one and assign a fresh lock number */
    scash_refresh ((caddr_t)&__G__->_ompsm_scash_locknum, sizeof(__G__->_ompsm_scash_locknum));
    *lp = (_ompc_lock_t) _ompsm_scash_alloc(sizeof(**lp), DEST_NONE, 0);
    if (*lp == NULL) {
      _ompsm_scash_fatal ("_ompsm_scash_init_lock : can not allocate global memory");
    }
    (*lp)->locknum1 = __G__->_ompsm_scash_locknum ++;
    (*lp)->owner    = -1;
    scash_omp_flush ((caddr_t)*lp, sizeof(**lp));
    scash_omp_flush ((caddr_t)&__G__->_ompsm_scash_locknum, sizeof(__G__->_ompsm_scash_locknum));
  }

  _ompsm_scash_flush_unlock ();
  scash_unlock (LOCK_LOCKINIT);
}


/*
 * Destroy the lock referenced by LP by pushing its lock object onto the
 * head of the shared free list (__G__->free_locks), under LOCK_LOCKINIT
 * and the flush lock.  The object's memory is not released.
 */
void 
_ompsm_scash_destroy_lock(volatile _ompc_lock_t *lp)
{
  scash_lock (LOCK_LOCKINIT);
  _ompsm_scash_flush_lock ();

  scash_refresh ((caddr_t)&__G__->free_locks, sizeof(__G__->free_locks));
  (*lp)->next = __G__->free_locks;
  __G__->free_locks = *lp;
  scash_omp_flush ((caddr_t)&__G__->free_locks, sizeof(__G__->free_locks));
  scash_omp_flush ((caddr_t)*lp, sizeof(**lp));

  _ompsm_scash_flush_unlock ();
  scash_unlock (LOCK_LOCKINIT);
}


/*
 * Acquire the lock referenced by LP via scash_lock() on its lock number.
 * If the local copy of locknum1 reads 0 it is refreshed first.
 */
void 
_ompsm_scash_lock(volatile _ompc_lock_t *lp)
{
  if ((*lp)->locknum1 == 0) {
    /* local copy looks unset: fetch the current value */
    _ompsm_scash_flush_lock ();
    scash_refresh ((char *)&((*lp)->locknum1),sizeof(int));
    _ompsm_scash_flush_unlock ();
  }
  scash_lock ((*lp)->locknum1);
}


/* Release the lock referenced by LP via scash_unlock() on its lock number. */
void 
_ompsm_scash_unlock(volatile _ompc_lock_t *lp)
{
  scash_unlock ((*lp)->locknum1);
}


/*
 * Try to acquire the lock referenced by LP without blocking.
 * Returns 1 when the lock was obtained and 0 when scash_try_lock()
 * reports SCASH_LOCK_BUSY.  A locknum1 that reads 0 locally is
 * refreshed before the attempt.
 */
int 
_ompsm_scash_test_lock(volatile _ompc_lock_t *lp)
{
  if ((*lp)->locknum1 == 0) {
    _ompsm_scash_flush_lock ();
    scash_refresh ((char *)&((*lp)->locknum1),sizeof(int));
    _ompsm_scash_flush_unlock ();
  }

  return (scash_try_lock ((*lp)->locknum1) != SCASH_LOCK_BUSY);
}


/*
 * Acquire the nestable lock referenced by LP.
 *
 * If the locally visible owner is this node, only the nesting count is
 * incremented.  Otherwise the underlying SCASH lock is taken and the lock
 * object is refreshed, marked as owned by this node with count 1, and
 * flushed.
 */
void 
_ompsm_scash_lock_nlock (volatile _ompc_nest_lock_t *lp)
{
  _ompc_lock_t	l = *lp;


  if (l->owner == _ompc_node_id) {    /* already lock by this thread */
    /* NOTE(review): count is not flushed here, unlike the nested case in
     * _ompsm_scash_testlock_nlock -- confirm this is intended. */
    _ompsm_scash_flush_lock ();
    l->count += 1;		      /* must be have count in SCASH Memory area */
    _ompsm_scash_flush_unlock ();

  } else {
    if ((*lp)->locknum1 == 0) {
      /* local copy looks unset: fetch the current lock number */
      _ompsm_scash_flush_lock ();
      scash_refresh ((char *)&((*lp)->locknum1),sizeof(int));
      _ompsm_scash_flush_unlock ();
    }
    scash_lock (l->locknum1);
    _ompsm_scash_flush_lock ();
    scash_refresh ((char *)l, sizeof(*l));
    l->owner = _ompc_node_id;
    l->count = 1;
    scash_omp_flush ((caddr_t)l, sizeof(*l));
    _ompsm_scash_flush_unlock ();
  }
}


/*
 * Release one nesting level of the nestable lock referenced by LP.
 * The count is decremented under the flush lock; when it reaches zero
 * the owner is reset to -1, the object flushed, and the underlying SCASH
 * lock released.  No check is made that the caller is the owner.
 */
void 
_ompsm_scash_unlock_nlock (volatile _ompc_nest_lock_t *lp)
{
  _ompc_lock_t	l = *lp;
 

  _ompsm_scash_flush_lock ();
  l->count -= 1;
  if (l->count == 0) {
    l->owner = -1;
    scash_omp_flush ((caddr_t)l, sizeof(*l));
  } else {
    scash_omp_flush ((caddr_t)&l->count, sizeof(l->count)); /* flush to home. if not flush it, conflict */
  }
  _ompsm_scash_flush_unlock ();

  /* last level released: give up the SCASH lock itself */
  if (l->count == 0) {
    scash_unlock(l->locknum1);
  }
}


/*
 * Try to acquire the nestable lock referenced by LP without blocking.
 *
 * Returns the new nesting count when this node already owns the lock,
 * 1 when the lock was free and scash_try_lock() succeeded, and 0 when
 * the lock is owned by another node or scash_try_lock() reports busy.
 */
int
_ompsm_scash_testlock_nlock (volatile _ompc_nest_lock_t *lp)
{
  _ompc_lock_t	l = *lp;


  _ompsm_scash_flush_lock ();
  scash_refresh ((caddr_t)l, sizeof(*l));
  if (l->owner == _ompc_node_id) {    /* already lock by this thread */
    l->count += 1;
    scash_omp_flush ((caddr_t)&l->count, sizeof(l->count)); /* flush to home. if not flush it, conflict */
    _ompsm_scash_flush_unlock ();
    return l->count;

  } else if (l->owner < 0) {	      /* unlocked, now */
    /* the flush lock is dropped before the try-lock and re-taken after */
    _ompsm_scash_flush_unlock ();
    if (scash_try_lock (l->locknum1) != SCASH_LOCK_BUSY) {
      _ompsm_scash_flush_lock ();
      scash_refresh ((caddr_t)l, sizeof(*l));
      l->owner = _ompc_node_id;
      l->count = 1;
      scash_omp_flush ((caddr_t)l, sizeof(*l));
      _ompsm_scash_flush_unlock ();
      return 1;

    } else {
      return 0;
    }

  } else {
    /* owned by another node */
    _ompsm_scash_flush_unlock ();
  }

  return 0;
}


#else /* SCASH_SUPPORT_TESTLOCK */
/*
 *  obsolete code
 */
/*
 * (Obsolete two-lock variant, compiled only when the #if above is 0.)
 *
 * Initialize the lock referenced by LP: reuse the head of the shared
 * free list if there is one, otherwise allocate a new lock object and
 * assign it two consecutive SCASH lock numbers (locknum1, locknum2).
 * The owner field is set to -1.  Allocation failure is fatal.
 *
 * The free-list head is fetched with scash_refresh(), the call used for
 * this purpose everywhere else in this file (this variant previously
 * spelled it "scash_reflush").
 */
void 
_ompsm_scash_init_lock(volatile _ompc_lock_t *lp)
{
  scash_lock (LOCK_LOCKINIT);
  _ompsm_scash_flush_lock ();

  scash_refresh ((caddr_t)&__G__->free_locks, sizeof(__G__->free_locks));
  if (__G__->free_locks != NULL) {
    /* reuse a previously destroyed lock */
    *lp = __G__->free_locks;
    __G__->free_locks = __G__->free_locks->next;
    scash_omp_flush ((caddr_t)&__G__->free_locks, sizeof(__G__->free_locks));
    (*lp)->owner = -1;
    scash_omp_flush ((caddr_t)&(*lp)->owner, sizeof((*lp)->owner));
  } else {
    /* allocate a new lock object with two fresh lock numbers */
    scash_refresh ((caddr_t)&__G__->_ompsm_scash_locknum, sizeof(__G__->_ompsm_scash_locknum));
    *lp = (_ompc_lock_t) _ompsm_scash_alloc(sizeof(**lp), DEST_NONE, 0);
    if (*lp == NULL) {
      _ompsm_scash_fatal ("_ompsm_scash_init_lock : can not allocate global memory");
    }
    (*lp)->locknum1 = __G__->_ompsm_scash_locknum ++;
    (*lp)->locknum2 = __G__->_ompsm_scash_locknum ++;
    (*lp)->owner    = -1;
    scash_omp_flush ((caddr_t)*lp, sizeof(**lp));
    scash_omp_flush ((caddr_t)&__G__->_ompsm_scash_locknum, sizeof(__G__->_ompsm_scash_locknum));
  }

  _ompsm_scash_flush_unlock ();
  scash_unlock (LOCK_LOCKINIT);
}


/*
 * (Obsolete variant.)  Push the lock object referenced by LP onto the
 * head of the shared free list under LOCK_LOCKINIT and the flush lock.
 */
void 
_ompsm_scash_destroy_lock(volatile _ompc_lock_t *lp)
{
  scash_lock (LOCK_LOCKINIT);
  _ompsm_scash_flush_lock ();

  scash_refresh ((caddr_t)&__G__->free_locks, sizeof(__G__->free_locks));
  (*lp)->next = __G__->free_locks;
  __G__->free_locks = *lp;
  scash_omp_flush ((caddr_t)&__G__->free_locks, sizeof(__G__->free_locks));

  _ompsm_scash_flush_unlock ();
  scash_unlock (LOCK_LOCKINIT);
}


/*
 * (Obsolete two-lock variant.)  Acquire the lock referenced by LP.
 *
 * Repeatedly takes locknum1 and locknum2 and inspects the owner field.
 * When the owner is -1 it is set to this node, locknum2 is released and
 * the function returns still holding locknum1.  Otherwise both SCASH
 * locks are released and the attempt is repeated.
 */
void 
_ompsm_scash_lock(volatile _ompc_lock_t *lp)
{
  _ompc_lock_t	l = *lp;


  if ((*lp)->locknum1 == 0) {
    /* local copy looks unset: fetch the lock object */
    _ompsm_scash_flush_lock ();
    scash_refresh ((caddr_t)*lp, sizeof(**lp));
    _ompsm_scash_flush_unlock ();
  }
  for (;;) {
    scash_lock (l->locknum1);
    scash_lock (l->locknum2);
    _ompsm_scash_flush_lock ();
    scash_refresh ((caddr_t)&l->owner, sizeof(l->owner));
    if (l->owner == -1) {
      l->owner = _ompc_node_id;
      scash_omp_flush ((caddr_t)&l->owner, sizeof(l->owner));
      _ompsm_scash_flush_unlock ();
      scash_unlock (l->locknum2);
      break;
    }
    _ompsm_scash_flush_unlock ();
    scash_unlock (l->locknum2);
    scash_unlock (l->locknum1);
  }
}


/*
 * (Obsolete two-lock variant.)  Release the lock referenced by LP:
 * reset the owner to -1, flush it, then release locknum1.
 */
void 
_ompsm_scash_unlock(volatile _ompc_lock_t *lp)
{
  _ompc_lock_t	l = *lp;


  _ompsm_scash_flush_lock ();
  l->owner = -1;
  scash_omp_flush ((caddr_t)&l->owner, sizeof(l->owner));
  _ompsm_scash_flush_unlock ();
  scash_unlock (l->locknum1);
}


/*
 * (Obsolete two-lock variant.)  Test-and-acquire the lock referenced by
 * LP.  Under locknum2 the owner field is inspected: if it is -1 the lock
 * is claimed for this node, locknum1 is then taken with scash_lock() and
 * 1 is returned; otherwise 0 is returned.
 */
int 
_ompsm_scash_test_lock(volatile _ompc_lock_t *lp)
{
  _ompc_lock_t	l = *lp;


  if ((*lp)->locknum1 == 0) {
    /* local copy looks unset: fetch the lock object */
    _ompsm_scash_flush_lock ();
    scash_refresh ((caddr_t)*lp, sizeof(**lp));
    _ompsm_scash_flush_unlock ();
  }
  scash_lock (l->locknum2);
  _ompsm_scash_flush_lock ();
  scash_refresh ((caddr_t)&l->owner, sizeof(l->owner));
  if (l->owner == -1) {
    l->owner = _ompc_node_id;
    scash_omp_flush ((caddr_t)&l->owner, sizeof(l->owner));
    _ompsm_scash_flush_unlock ();
    scash_unlock (l->locknum2);
    scash_lock (l->locknum1);
    return 1;

  } else {
    _ompsm_scash_flush_unlock ();
    scash_unlock (l->locknum2);
    return 0;
  }
}


/*
 * (Obsolete two-lock variant.)  Acquire the nestable lock referenced by
 * LP.
 *
 * First, under locknum2, checks whether this node already owns the lock;
 * if so the count is incremented and the function returns.  Otherwise it
 * loops taking locknum1 and locknum2 until the owner field reads -1,
 * then claims the lock with count 1 and returns holding locknum1.
 */
void 
_ompsm_scash_lock_nlock (volatile _ompc_nest_lock_t *lp)
{
  _ompc_lock_t	l = *lp;


  if ((*lp)->locknum1 == 0) {
    /* local copy looks unset: fetch the lock object */
    _ompsm_scash_flush_lock ();
    scash_refresh ((caddr_t)*lp, sizeof(**lp));
    _ompsm_scash_flush_unlock ();
  }
  scash_lock (l->locknum2);
  _ompsm_scash_flush_lock ();
  scash_refresh ((caddr_t)&l->owner, sizeof(l->owner));
  if (l->owner == _ompc_node_id) {    /* already lock by this thread */
    l->count += 1;		      /* must be have count in SCASH Memory area */
    _ompsm_scash_flush_unlock ();
    scash_unlock (l->locknum2);
    return;
  }
  _ompsm_scash_flush_unlock ();
  scash_unlock (l->locknum2);

  for (;;) {
    scash_lock (l->locknum1);
    scash_lock (l->locknum2);
    _ompsm_scash_flush_lock ();
    scash_refresh ((caddr_t)&l->owner, sizeof(l->owner));
    if (l->owner == -1) {
      l->owner = _ompc_node_id;
      l->count = 1;
      scash_omp_flush ((caddr_t)&l->count, sizeof(l->count)); /* flush to home. if not flush it, conflict */
      scash_omp_flush ((caddr_t)&l->owner, sizeof(l->owner));
      _ompsm_scash_flush_unlock ();
      scash_unlock (l->locknum2);
      break;
    } else {
      _ompsm_scash_flush_unlock ();
      /* seeing ourselves as owner at this point is reported as a deadlock */
      if (l->owner == _ompc_node_id) {
	_ompc_fatal ("nestable lock is dead lock.\n");
      }
    }
    scash_unlock (l->locknum2);
    scash_unlock (l->locknum1);
  }
}

/*
 * (Obsolete two-lock variant.)  Release one nesting level of the
 * nestable lock referenced by LP.  Under locknum2 the count is
 * decremented and flushed; when it reaches zero the owner is reset to -1
 * and locknum1 is released as well.
 */
void 
_ompsm_scash_unlock_nlock (volatile _ompc_nest_lock_t *lp)
{
  _ompc_lock_t	l = *lp;
 

  if ((*lp)->locknum1 == 0) {
    /* local copy looks unset: fetch the lock object */
    _ompsm_scash_flush_lock ();
    scash_refresh ((caddr_t)*lp, sizeof(**lp));
    _ompsm_scash_flush_unlock ();
  }
  scash_lock(l->locknum2);

  _ompsm_scash_flush_lock ();
  l->count -= 1;
  scash_omp_flush ((caddr_t)&l->count, sizeof(l->count)); /* flush to home. if not flush it, conflict */
  if (l->count == 0) {
    l->owner = -1;
    scash_omp_flush ((caddr_t)&l->owner, sizeof(l->owner));
  }
  _ompsm_scash_flush_unlock ();

  if (l->count == 0) {
    scash_unlock(l->locknum2);
    scash_unlock(l->locknum1);
  } else {
    scash_unlock(l->locknum2);
  }
}

/*
 * (Obsolete two-lock variant, compiled only when the #if above is 0.)
 *
 * Test-and-acquire the nestable lock referenced by LP.  Under locknum2:
 *   - owner is this node : increment the count and return it;
 *   - owner is -1        : claim the lock with count 1, take locknum1
 *                          and return 1;
 *   - otherwise          : return 0.
 *
 * The initial fetch of the lock object uses scash_refresh(), matching
 * the identical fetch in the sibling obsolete routines (it was
 * previously spelled "scash_reflush" here).
 */
int
_ompsm_scash_testlock_nlock (volatile _ompc_nest_lock_t *lp)
{
  int		ret;
  _ompc_lock_t	l = *lp;


  if ((*lp)->locknum1 == 0) {
    /* local copy looks unset: fetch the lock object */
    _ompsm_scash_flush_lock ();
    scash_refresh ((caddr_t)*lp, sizeof(**lp));
    _ompsm_scash_flush_unlock ();
  }
  scash_lock (l->locknum2);
  _ompsm_scash_flush_lock ();
  scash_refresh ((caddr_t)&l->owner, sizeof(l->owner));
  if (l->owner == _ompc_node_id) {    /* already lock by this thread */
    l->count += 1;
    scash_omp_flush ((caddr_t)&l->count, sizeof(l->count)); /* flush to home. if not flush it, conflict */
    ret = l->count;
    _ompsm_scash_flush_unlock ();
    scash_unlock (l->locknum2);

  } else if (l->owner == -1) { /* unlocked, now */
    l->owner = _ompc_node_id;
    l->count = 1;
    scash_omp_flush ((caddr_t)&l->count, sizeof(l->count)); /* flush to home. if not flush it, conflict */
    scash_omp_flush ((caddr_t)&l->owner, sizeof(l->owner));
    _ompsm_scash_flush_unlock ();
    scash_unlock (l->locknum2);
    scash_lock (l->locknum1);
    ret = 1;

  } else {			      /* lock failed */
    _ompsm_scash_flush_unlock ();
    scash_unlock (l->locknum2);
    ret = 0;
  }

  return ret;
}
#endif /* SCASH_SUPPORT_TESTLOCK */


/*
 * Broadcast SIZE bytes of thread-private data from the master's SRC to
 * every other thread's DST.
 *
 * Outside a parallel region, or inside a nested one, this is just a local
 * copy (skipped when src and dst are the same).  Otherwise the data is
 * moved through the shared buffer _scash_tmp_buff in chunks of at most
 * _scash_tmp_buff_size bytes, with a BARRIER_THDPRV barrier between the
 * master's write and the others' read and another one after each chunk.
 */
void
_ompsm_scash_bcast_thdprv(void *dst, void *src, int size)
{
  char *from = src;
  char *to   = dst;
  int  chunk;

  if((NOT_IN_PARALLEL) || (IN_NESTED_PARALLEL)) {
    if (src != dst) {
      bcopy (src,dst,size);
    }
    return;
  }

  for (; 0 < size; from += chunk, to += chunk, size -= chunk) {
    chunk = (_scash_tmp_buff_size <= size) ? (_scash_tmp_buff_size) : (size);

    if (IS_MASTER_THREAD) {
      /* master fills the staging buffer, then lets the others in */
      bcopy (from, _scash_tmp_buff, chunk);
      scash_barrier (BARRIER_THDPRV);
    } else {
      /* others wait for the buffer to be filled, then copy it out */
      scash_barrier (BARRIER_THDPRV);
      bcopy (_scash_tmp_buff, to, chunk);
    }

    /* nobody may overwrite the buffer until everyone has read it */
    scash_barrier (BARRIER_THDPRV);
  }
}


/* distribution */
/*
 * Apply the distribution described by MP to the data it maps.
 * Nothing is done with a single node.  MAP_NONE uses the default block
 * distribution over mp->size bytes; MAP_BLOCK and MAP_CYCLIC go through
 * _ompsm_scash_data_dist_rec starting at dimension mp->n_dim.
 */
void 
_ompsm_scash_data_dist(struct data_map_entry *mp)
{
  if (_ompc_n_node == 1) {
    return ;
  }

  switch(mp->dist_mode) {
  case MAP_BLOCK:
  case MAP_CYCLIC:
    _ompsm_scash_data_dist_rec (mp, (char *)(*(mp->p)) + mp->offset,
				mp->n_dim);
    break;

  case MAP_NONE:
    _ompsm_scash_default_block_dist (((char *)(*(mp->p))) + mp->offset,
				     mp->size);
    break;
  }
}


/*
 * Recursive worker for _ompsm_scash_data_dist.
 *
 *   mp  : distribution descriptor
 *   p   : start address of the slice being processed
 *   dim : dimension currently being processed
 *
 * The stride of DIM is the product of dim_size[0..dim-1].  When DIM is
 * the distributed dimension the slice is handed to
 * _ompsm_scash_cyclic_dist (with a per-node block size for MAP_BLOCK, or
 * dist_blk_size for MAP_CYCLIC); otherwise the function recurses into
 * dimension DIM-1 once for every index of DIM.
 */
void 
_ompsm_scash_data_dist_rec (struct data_map_entry *mp, char *p, int dim)
{
  int	extent = mp->dim_size[dim];
  int	stride = 1;
  int	k, blk;
  char	*label;


  for (k = 0; k < dim; k++) {
    stride *= mp->dim_size[k];
  }

  if (dim != mp->dist_dim) {
    /* not the distributed dimension: walk it and recurse */
    for (k = 0; k < extent; k++, p += stride) {
      _ompsm_scash_data_dist_rec (mp, p, dim - 1);
    }
    return;
  }

  switch (mp->dist_mode) {
  case MAP_BLOCK:
  case MAP_CYCLIC:
    if (mp->dist_mode == MAP_BLOCK) {
      /* one block per node, rounded up */
      blk   = ((mp->dist_blk_size + _ompc_n_node - 1) / _ompc_n_node);
      label = "block";
    } else {
      blk   = mp->dist_blk_size;
      label = "cyclic";
    }
    if (mp->dist_scale == 0) {
      _ompsm_scash_cyclic_dist (label, p, stride * extent,
				blk, stride, 1, 0);
    } else {
      _ompsm_scash_cyclic_dist ("expr", p, stride * extent,
				blk, stride,
				mp->dist_scale, mp->dist_offset);
    }
    break;
  case MAP_NONE:
    _ompc_fatal ("bat distribute mode");
    break;
  }
}


/*
 * Assign home nodes to the pages of [pp, pp+size) in a cyclic pattern.
 *
 *   name   : label used only in the debug message
 *   pp     : start address of the region
 *   size   : region size in bytes
 *   block  : block length in elements
 *   elmsz  : element size in bytes
 *   scale  : multiplier applied to the block length
 *   offset : element offset of the first chunk boundary relative to pp
 *
 * Chunks of block * elmsz * scale bytes are handed to nodes 0, 1, ...
 * round-robin; page boundaries are rounded to the nearest page.  Regions
 * not larger than one SCASH page are left untouched.
 *
 * The debug message is now terminated by exactly one newline (the
 * optional scale/offset part used to add a second one).
 *
 * NOTE(review): a zero or negative block * elmsz * scale would make the
 * modulo and the loop below misbehave; callers are presumably expected
 * to pass positive values -- confirm.
 */
void 
_ompsm_scash_cyclic_dist (char *name, char *pp, int size,
			  int block, int elmsz,
			  int scale, int offset)
{
  int page_st, page_ed, pages, chunksz, ptr, p;

  int	strp = (int)pp;
  int	endp = (int)pp + size;
  int	node = 0;


  if (_ompc_debug_flag) {
    printf ("map_dist / %s : addr=0x%x, size=0x%x(%d), block=0x%x(%d), element=0x%x(%d)",
	    name, (int)pp, size, size,
	    block * elmsz,  block * elmsz, elmsz, elmsz);
    if (scale != 1  ||  offset != 0) {
      printf (", scale = %d, offset = %d", scale, offset);
    }
    printf ("\n");
  }

  if(size > SCASH_PAGE_SIZE) {
    chunksz = block * elmsz * scale;
    /* move ptr back to the chunk boundary at or before the region start */
    if (0 <= offset) {
      ptr = strp - (offset % (block * scale * _ompc_n_node)) * elmsz;
    } else {
      ptr = strp - (block * scale * _ompc_n_node -
		    ((-offset) % (block * scale * _ompc_n_node))) * elmsz;
    }

    while (ptr < endp) {
      if (strp < (ptr + chunksz)) {
	/* clip the chunk to the region and round both ends to a page */
	p = (ptr < strp) ? (strp) : (ptr);
	page_st = (p + SCASH_PAGE_SIZE / 2) / SCASH_PAGE_SIZE;
	if ((ptr + chunksz) < endp) {
	  page_ed = (ptr + chunksz + SCASH_PAGE_SIZE / 2) / SCASH_PAGE_SIZE;
	} else {
	  page_ed = (endp + SCASH_PAGE_SIZE / 2) / SCASH_PAGE_SIZE;
	}
	pages = page_ed - page_st;
	if (pages != 0) {
	  scash_home_fo (node, (char *)(page_st * SCASH_PAGE_SIZE), pages);
	}
      }

      ptr  += chunksz;
      node  = (node + 1) % _ompc_n_node;
    }
  }
}


/*
 * Default block distribution: split [pp, pp+size) into _ompc_n_node
 * equal shares of size/_ompc_n_node bytes and make node i the home of
 * the pages of share i (boundaries rounded to the nearest page).
 * Regions not larger than one SCASH page are left untouched.
 */
void 
_ompsm_scash_default_block_dist (char *pp, int size)
{
    int share, node, first_page, last_page, npages;

    if (_ompc_debug_flag) {
	printf ("map_dist / block : addr=0x%x, size=0x%x\n", (int)pp, size);
    }

    if(!(SCASH_PAGE_SIZE < size)){
	return;
    }

    share = size/_ompc_n_node;
    for(node = 0; node < _ompc_n_node; node++, pp += share){
	first_page = ((int)pp + SCASH_PAGE_SIZE / 2) / SCASH_PAGE_SIZE;
	last_page  = ((int)pp + share + SCASH_PAGE_SIZE / 2) / SCASH_PAGE_SIZE;
	npages     = last_page - first_page;
	if (npages != 0) {
	    scash_home_fo (node, (char *)(first_page * SCASH_PAGE_SIZE), npages);
	}
    }
}


/*
 * Block distribution that keeps elements whole.
 *
 *   pp   : start address of the region
 *   size : region size in bytes
 *   es   : element size in bytes
 *
 * The region is treated as ceil(size / es) elements, split over the
 * nodes; node i becomes the home of the pages covering its share
 * (boundaries rounded to the nearest page).  How the remainder elements
 * are spread depends on DIST_BLOCK3 / DIST_BLOCK2.  Regions not larger
 * than one SCASH page are left untouched.
 *
 * Under DIST_BLOCK3 the spreading step is _ompc_n_node / rem; when there
 * is no remainder (rem == 0) a step of 1 is used so the division cannot
 * fault -- no node gets an extra element in that case, as before.
 */
void 
_ompsm_scash_default_block_dist2 (char *pp, int size, int es)
{
  int page_st, page_ed, pages, tail, i, blks, nblk, rem, sz;

  if (_ompc_debug_flag) {
    printf ("map_dist / block : addr=0x%x, size=0x%x(%d) element-size=0x%x(%d)\n",
	    (int)pp, size, size, es, es);
  }

  tail = (int)pp + size;
  if(size > SCASH_PAGE_SIZE){
    blks = (size / es) + ((size%es)?(1):0);
    nblk = blks / _ompc_n_node;
    rem  = blks % _ompc_n_node;

    for(i = 0; i < _ompc_n_node; i++){
#ifdef DIST_BLOCK3
      /* rem == 0: (i < rem*st) is never true, so any non-zero step works */
      int st = (rem != 0) ? (_ompc_n_node / rem) : 1;
      sz = (nblk + (((i%st == 0)&&(i<rem*st))?(1):(0))) * es;
#else
#ifdef DIST_BLOCK2
      sz = (nblk + ((i<rem)?(1):(0))) * es;
#else
      sz = (nblk + ((rem)?(1):(0))) * es;
#endif
#endif
      /* clip the last share to the end of the region */
      if (tail < (int)pp + sz) {
	sz = tail - (int)pp;
      }
      if (sz <= 0) {
	break;
      }
      page_st = ((int)pp + SCASH_PAGE_SIZE / 2) / SCASH_PAGE_SIZE;
      page_ed = ((int)pp + sz + SCASH_PAGE_SIZE / 2) / SCASH_PAGE_SIZE;
      pages   = page_ed - page_st;
      if (pages != 0) {
	scash_home_fo (i, (char *)(page_st * SCASH_PAGE_SIZE), pages);
      }
      pp += sz;
    }
  }
}


/*
 * Return non-zero when P lies inside
 * [_ompsm_scash_memory_top, _ompsm_scash_memory_tail).
 */
int
_ompsm_scash_is_shared (char *p)
{
  return (IS_GLOBAL_MEMORY(p));
}



/* --------------------------------------------------- */
/* Per-process record kept in the memory allocated by _ompsm_scash_flush_lock_init. */
struct ompsm_proc_stat {
  pid_t		pid;	/* process id, target of the SIGUSR2 sent by _ompsm_scash_flush_lock */
  atomic_t	count;	/* incremented by _ompsm_scash_lib_in, decremented by _ompsm_scash_lib_out */
};

static struct ompsm_proc_stat	*_ompsm_pstat;		/* array of _ompsm_proc_num entries */
static struct ompsm_nlock 	*_ompsm_flush_lock;	/* the flush lock */
static int			_ompsm_proc_id;		/* scash_proc() */
static int			_ompsm_proc_num;	/* scash_smp() */


# ifdef OMNI_CPU_I386
/*
 * _xchg_1(p): stores 1 into *p with xchgl and returns the previous value
 * (left in %eax).  The routine is emitted as assembly inside _dummy().
 */
extern int _xchg_1 (volatile int *);


/*
 * Container function for the assembly text of _xchg_1.
 * NOTE(review): _dummy itself is presumably never called -- confirm.
 * Under OMNI_OS_CYGWIN32 both the __xchg_1 and _xchg_1 labels are
 * emitted, since the second label sits outside the conditional.
 */
void
_dummy ()
{
  asm ("	.align 4			");
#  ifdef OMNI_OS_CYGWIN32
  asm (".def    __xchg_1                        ");
  asm ("        .scl    2                       ");
  asm ("        .type   32                      ");
  asm (".endef                                  ");
  asm (".globl __xchg_1				");
  asm ("__xchg_1:				");
#  else
#ifndef __INTEL_COMPILER
  asm ("	.type	 _xchg_1,@function	");
#endif
  asm (".globl _xchg_1				");
#  endif /* OMNI_OS_CYGWIN32 */
  asm ("_xchg_1:				");
  asm ("	pushl %ebp			");
  asm ("	movl %esp,%ebp			");
  asm ("	movl 8(%ebp),%edx		");
  asm ("	movl $1,%eax			");
  asm ("	xchgl 0(%edx),%eax		");
  asm ("	leave				");
  asm ("	ret				");
}


/* Put the spin lock LP into the free state: unlocked, no owner, depth 0. */
void
_ompsm_spin_init_lock (struct ompsm_nlock *lp)
{
  lp->lock  = 0;
  lp->id    = -1;
  lp->count = 0;
}


/*
 * Acquire the re-entrant spin lock LP.
 *
 * If this node already holds the lock only the nesting depth is bumped.
 * Otherwise the caller spins, polling SCASH while the lock word is
 * non-zero, and claims it with _xchg_1(); the loop restarts whenever the
 * exchange shows somebody else got there first.
 */
void
_ompsm_spin_lock (struct ompsm_nlock *lp)
{
  if (lp->id == _ompc_node_id) {
    lp->count ++;
    return;
  }

  for (;;) {
    /* wait until the lock looks free */
    while (lp->lock != 0) {
      scash_poll ();
    }
    /* try to grab it; a zero result means it was free and is now ours */
    if (_xchg_1(&lp->lock) == 0) {
      break;
    }
  }

  lp->id = _ompc_node_id;
  lp->count = 1;
}


/*
 * Release one nesting level of the spin lock LP.  When the depth drops
 * to zero the owner is cleared and the lock word is reset to 0.
 */
void
_ompsm_spin_unlock (struct ompsm_nlock *lp)
{
  lp->count --;
  if (lp->count != 0) {
    /* still held at an outer nesting level */
    return;
  }

  lp->id = -1;
  lp->lock = 0;
}


/* Return the current value of LP's lock word (non-zero while held). */
int
_ompsm_spin_islock (struct ompsm_nlock *lp)
{
  return lp->lock;
}

# else

#error "#########################################################"
#error "This processor's spin lock operation is not defined, here"
#error "#########################################################"

# endif /* OMNI_CPU_I386 */


/* Store ADR into *DEST. */
void
_ompsm_set_caddr_t (caddr_t *dest, caddr_t adr)
{
  *dest = adr;
}


/*
 * Signal handler installed for SIGUSR2 by _ompsm_scash_flush_lock_init.
 * It calls _ompsm_scash_lib_in() followed by _ompsm_scash_lib_out();
 * neither argument is used.
 */
void
_ompsm_scash_handler (int sig, struct sigcontext sc)
{
  _ompsm_scash_lib_in ();
  _ompsm_scash_lib_out ();
}


/*
 * Set up the flush lock and the per-process status table.
 *
 * - Records scash_proc()/scash_smp() as this process's index and the
 *   number of processes.
 * - Allocates one struct ompsm_nlock followed by _ompsm_proc_num
 *   struct ompsm_proc_stat entries (malloc() with a single node,
 *   scash_alloc_omni_shmem() otherwise); failure is fatal.
 * - Fills in this process's entry; process 0 initializes the lock.
 * - With more than one node, adds SIGUSR2 to the signal mask of the
 *   already-installed SIGSEGV action.
 * - Installs _ompsm_scash_handler for SIGUSR2.
 * - Ends with a BARRIER_SYSTEM barrier.
 *
 * The existing SIGSEGV action is read with a query-only sigaction()
 * call (act == NULL).  It used to be fetched by temporarily installing
 * SIG_DFL, which left a window with no SIGSEGV handler in place.
 */
void
_ompsm_scash_flush_lock_init ()
{
  struct sigaction	sa, osa;
  int			size;
  char			*buf;


  /*
   * initialize parameter
   */
  _ompsm_proc_id    = scash_proc ();
  _ompsm_proc_num   = scash_smp ();


  /*
   * allocate shared memory for inbox communication
   */
  size = (sizeof (struct ompsm_nlock) + 
	  sizeof (struct ompsm_proc_stat) * _ompsm_proc_num);
  if (_ompc_n_node == 1) {
    buf = malloc (size);
  } else {
    buf = scash_alloc_omni_shmem (size);
  }
  if (buf == NULL) {
    _ompc_fatal ("can not allocate shared memory for inbox communication.\n");
  }

  /* layout: [ lock | pstat[0] | pstat[1] | ... ] */
  _ompsm_flush_lock = (struct ompsm_nlock *) buf;
  _ompsm_pstat      = (struct ompsm_proc_stat *)(buf + sizeof(*_ompsm_flush_lock));


  /*
   * initialize allocate memory
   */
  _ompsm_pstat[_ompsm_proc_id].pid   = getpid ();
  atomic_set (&_ompsm_pstat[_ompsm_proc_id].count, 0);

  if (_ompsm_proc_id == 0) {
    _ompsm_spin_init_lock (_ompsm_flush_lock);
  }

  /*
   * set signal
   */
  if (_ompc_n_node == 1) {
    /* SCASH is not need SIGSEGV at 1 node */
  } else {
    /* read the current SIGSEGV action without replacing it ... */
    if (sigaction(SIGSEGV, NULL, &osa) < 0) {
      _ompc_fatal ("sigaction failed");
    }
    /* ... and re-install it with SIGUSR2 blocked while it runs */
    sigaddset(&osa.sa_mask, SIGUSR2);
    if (sigaction(SIGSEGV, &osa, NULL) < 0) {
      _ompc_fatal ("sigaction failed");
    }
  }

  sigemptyset (&sa.sa_mask);
  sigaddset(&sa.sa_mask, SIGUSR2);
  sa.sa_flags = SA_STACK;
  sa.sa_restorer = 0;
  sa.sa_handler = (void (*)(int))_ompsm_scash_handler;
  if (sigaction(SIGUSR2, &sa, NULL) < 0) {
    perror("sigaction");
  }

  scash_barrier (BARRIER_SYSTEM);
}


/* Atomically increment this process's entry count in _ompsm_pstat. */
void
_ompsm_scash_lib_in ()
{
  atomic_inc (&_ompsm_pstat[_ompsm_proc_id].count);
}


/*
 * Atomically decrement this process's entry count.  If the count is
 * then zero while the flush lock is held, the count is incremented
 * again, the function spins (polling SCASH) until the lock is released,
 * and the decrement is retried; otherwise the function returns.
 */
void
_ompsm_scash_lib_out ()
{
  for (;;) {
    atomic_dec (&_ompsm_pstat[_ompsm_proc_id].count);
    if (atomic_read (&_ompsm_pstat[_ompsm_proc_id].count) == 0  &&
	_ompsm_spin_islock (_ompsm_flush_lock)) {
      /* a flush is in progress: undo the decrement and wait it out */
      atomic_inc (&_ompsm_pstat[_ompsm_proc_id].count);
      while (_ompsm_spin_islock (_ompsm_flush_lock)) { /* spin wait */
	scash_poll ();
      }
    } else {
      break;
    }
  }
}


/*
 * Acquire the flush lock.  Does nothing unless IN_PARALLEL is true.
 *
 * After taking the spin lock, SIGUSR2 is sent to every process whose
 * entry count reads zero, and the caller then waits (polling SCASH)
 * until every process's count is non-zero.
 */
void
_ompsm_scash_flush_lock ()
{
  int	proc;


  if (IN_PARALLEL) {
    _ompsm_spin_lock (_ompsm_flush_lock);
    /* signal every process whose count is currently zero */
    for (proc = 0; proc < _ompsm_proc_num; proc ++) {
      if (atomic_read(&_ompsm_pstat[proc].count) == 0) {
	kill (_ompsm_pstat[proc].pid, SIGUSR2);
      }
    }
    /* wait until all counts are non-zero */
    for (proc = 0; proc < _ompsm_proc_num; proc ++) {
      while (atomic_read(&_ompsm_pstat[proc].count) == 0) { /* spin wait */
	scash_poll ();
      }
    }
  }
}


/* Release the flush lock.  Does nothing unless IN_PARALLEL is true. */
void
_ompsm_scash_flush_unlock ()
{
  if (IN_PARALLEL) {
    _ompsm_spin_unlock (_ompsm_flush_lock);
  }
}


/*
 * Body run through _ompc_do_parallel by _ompsm_scash_terminate.
 * args[0] carries the exit status as a pointer-sized value.  The
 * function flushes stdout, waits at BARRIER_SYSTEM, marks the
 * termination as normal and calls exit(status).
 */
void
_ompsm_scash_terminate_handler (void *args[])
{
  int	status = (int) args[0];

  fflush (stdout);
  scash_barrier (BARRIER_SYSTEM);

  /* tells _ompsm_scash_finalize that this is a normal termination */
  _ompsm_terminate_flag = TRUE;
  exit (status);
}


/*
 * Terminate the program with exit status STATUS by running
 * _ompsm_scash_terminate_handler through _ompc_do_parallel, passing the
 * status as the single argument.
 */
void
_ompsm_scash_terminate (int status)
{
  void * args[1];

  /* the status is passed as a pointer-sized value */
  args[0] = (void *)status;

  _ompc_do_parallel (_ompsm_scash_terminate_handler, args, 1);

  /* not reached here */
}

