#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#include <sys/stat.h>
#include <fcntl.h>
#include "ompsm.h"

/*
 * Unix shmem interface.
 */
/* #define USE_DEV_ZERO 1 */

/* shared memory mapping definitions */
/* Null address hint; not referenced anywhere else in this file. */
#define SHEMEM_START_ADDR	((caddr_t)0)

/* Size in bytes used to pad each per-process flag slot (see the
 * barrier_flags / in_flags arrays in struct ompsm_shmem_data). */
#define CACHE_LINE_SIZE	32

/*
 * MBAR(): memory-barrier hook used after writes to shared flags.
 *   - OMNI_CPU_I386:  expands to an empty block (the cpuid instruction is
 *                     commented out).
 *   - OMNI_CPU_SPARC: expands to the "stbar" instruction; note the expansion
 *                     already carries its own trailing semicolon.
 *   - otherwise:      expands to nothing.
 */
# ifdef OMNI_CPU_I386
# define MBAR() { /* asm("cpuid"); */ }
#else 
# ifdef OMNI_CPU_SPARC
# define MBAR() asm("stbar"); 
#else
# define MBAR() 
#endif
#endif


/* Size in bytes of the shared staging buffer used by
 * _ompsm_shmem_bcast_thdprv(). */
#define OMPSM_CPYBUF_SIZE	4096
/* Number of spin iterations WAIT() performs between two calls to
 * _ompsm_shmem_proc_yield(). */
#define MAX_COUNT		10000


static void _ompsm_shmem_proc_yield();

/*
 * WAIT(cond): spin while `cond` is true.
 *
 * `cond` is re-evaluated on every iteration (it is expected to read a
 * volatile shared location).  After more than MAX_COUNT consecutive
 * iterations the process calls _ompsm_shmem_proc_yield() and the counter
 * restarts.  If `cond` is already false nothing is executed.
 *
 * The body is wrapped in do { ... } while(0) so that "WAIT(x);" is a single
 * statement and stays correct in un-braced if/else and loop bodies.  The
 * counter has a macro-private name so it cannot capture an identifier used
 * inside `cond`.
 */
#define WAIT(cond) \
do { if(cond){ volatile int _ompsm_wait_cnt = 0; while(cond){ \
if(_ompsm_wait_cnt++>MAX_COUNT){ _ompsm_shmem_proc_yield(); \
_ompsm_wait_cnt = 0; }}}} while(0)

/* 
 * shared data for unix_shmem.
 *
 * One instance is placed at the very start of the mmap'ed region by
 * _ompsm_shmem_init() and zero-filled there; every process reaches it
 * through the file-scope pointer _ompsm_shmem_data.
 */
struct ompsm_shmem_data {
    volatile _ompc_lock_t lock;		/* spin lock on this structure */
    volatile _ompc_lock_t mlock;	/* spin lock for memory allocator */

    /* Bounds of the currently free part of the region:
     * _ompsm_shmem_alloc() advances mem_freep upward,
     * _ompsm_shmem_alloca() moves mem_endp downward and
     * _ompsm_shmem_freea() moves it back up. */
    volatile caddr_t mem_freep;
    volatile caddr_t mem_endp;

    /* Initial values of mem_freep / mem_endp recorded at init time; used by
     * _ompsm_shmem_is_shared() as the [start, end) range test. */
    volatile caddr_t start,end;
    
    /* thread management */
    /* Set to 1 by _ompsm_shmem_master_begin(), polled by
     * _ompsm_shmem_slave_begin(), cleared by the master in
     * _ompsm_shmem_par_barrier(). */
    volatile int thd_start_flag;

    /* structure for barrier between thread */
    /* Sense value published by node 0 once all other nodes have arrived. */
    volatile int barrier_sense;
    /* One arrival flag per process, padded to CACHE_LINE_SIZE bytes. */
    volatile struct {
	int _v;
	char _padding[CACHE_LINE_SIZE-sizeof(int)];
    } barrier_flags[MAX_PROC];

    /* IN/OUT barrier */
    /* Per-process "already entered" flag, padded to CACHE_LINE_SIZE bytes;
     * a process spins while its own flag is non-zero before entering again. */
    volatile struct {
	int _v;
	char _padding[CACHE_LINE_SIZE-sizeof(int)];
    } in_flags[MAX_PROC];
    /* Entry / exit counters, updated while holding `lock`. */
    volatile int in_count;
    volatile int out_count;

    /* copy buffer for thread private */
    /* Staging area used by _ompsm_shmem_bcast_thdprv(). */
    char cpybuf[OMPSM_CPYBUF_SIZE];
};

/* Pointer to the control block at the base of the mapped region; assigned in
 * _ompsm_shmem_init(). */
static struct ompsm_shmem_data *_ompsm_shmem_data;

/* process id */
/* Child pids indexed by node id, filled in by _ompsm_shmem_fork_process();
 * an entry is reset to 0 when the SIGCHLD handler reaps that child or
 * _ompsm_shmem_finalize() finds it gone.  Entry 0 (the master) is always 0. */
static int _ompsm_pids[MAX_PROC];

/* mapped file */
/* Descriptor of the object backing the shared mapping. */
static int _ompsm_shared_map_fd;
/* static caddr_t _ompsm_shmem_base; */
#ifndef USE_DEV_ZERO
/* Path of the backing file under /tmp; removed in _ompsm_shmem_finalize(). */
static char _ompsm_mmap_file[100];
#endif

/* Forward declarations of file-local helpers (old-style, unprototyped
 * parameter lists for several of them). */
static caddr_t _ompsm_shmem_mmap(int memsize);
static void _ompsm_shmem_fork_process();
static void _ompsm_shmem_SIGCHLD_handler();
static void _ompsm_shmem_fatal(char *msg);

/* Spin lock primitives; defined per CPU further down in this file. */
void _ompsm_shmem_lock(volatile _ompc_lock_t *lp);
void _ompsm_shmem_unlock(volatile _ompc_lock_t *lp);
static void	_ompsm_shmem_par_barrier ();

void _ompsm_shmem_init(int argc, char **argv)
{
    int memsize;
    caddr_t base;

    memsize = sizeof(struct ompsm_shmem_data)+
	sizeof(*__G__)+_ompsm_gdata_size+_ompsm_free_shmem_size;
    base = _ompsm_shmem_mmap(memsize);

    if(_ompc_debug_flag)
	fprintf(stderr, "shared memory = 0x%lx, size=0x%x\n",
		(unsigned long int)base, memsize);

    /* initialize shared data */
    _ompsm_shmem_data = (struct ompsm_shmem_data *)base;
    bzero(_ompsm_shmem_data,sizeof(struct ompsm_shmem_data));
    _ompsm_shmem_data->mem_freep =
	base + sizeof(struct ompsm_shmem_data);
    _ompsm_shmem_data->mem_endp = base + memsize;
    
    _ompsm_shmem_data->start = _ompsm_shmem_data->mem_freep;
    _ompsm_shmem_data->end = _ompsm_shmem_data->mem_endp;

    __G__ = (struct _global_ *)_ompsm_shmem_alloc(sizeof(*__G__));
    bzero(__G__,sizeof(*__G__));

    /* fork process */
    _ompc_node_id = 0; 		/* for master */
    _ompc_n_node = _ompc_max_threads;
    _ompc_num_threads = _ompc_n_node;
    _ompsm_shmem_fork_process(_ompc_max_threads);
}

/*
 * Report an unrecoverable error on stderr, prefixed with the function name,
 * and terminate the calling process with exit status 1.  Does not return.
 */
void _ompsm_shmem_fatal(char *msg)
{
    FILE *err = stderr;

    fprintf(err, "_ompsm_shmem_fatal: %s\n", msg);
    exit(1);
}

typedef void (*sighandler_t)(int);

/* 
 * process management
 */

/*
 * SIGCHLD handler: reap every child that has already terminated (without
 * blocking) and clear its slot in _ompsm_pids[] so that
 * _ompsm_shmem_finalize() no longer treats it as alive.
 *
 * signo is the delivered signal number; it is not used.  The handler now
 * has the (int) signature that signal() actually invokes it with.
 *
 * errno is saved on entry and restored on exit: waitpid() and signal() may
 * overwrite it, and the interrupted code must not observe that change.
 */
void _ompsm_shmem_SIGCHLD_handler(int signo)
{
    int saved_errno = errno;
    int prog_stat,i;
    pid_t pid;

    (void)signo;

    while((pid = waitpid((pid_t)(-1), &prog_stat, WNOHANG)) > 0){
	for(i = 0; i < _ompc_n_node; i++){
	    if(_ompsm_pids[i] == pid) _ompsm_pids[i] = 0;
	}
    }
    /* Re-enable signals if necessary */
    signal(SIGCHLD, (sighandler_t)_ompsm_shmem_SIGCHLD_handler);

    errno = saved_errno;
}

/*
 * Handler installed for SIGSEGV, SIGINT and SIGABRT by
 * _ompsm_shmem_fork_process(): run the shared-memory teardown, then leave
 * with exit status 1.
 */
void _ompsm_shmem_abort_handler()
{
    /* tear down first ... */
    _ompsm_shmem_finalize();

    /* ... then terminate this process */
    exit(1);
}

/*
 * Install the signal handlers and fork n_procs-1 worker processes.
 *
 * The caller stays node 0.  Each child records its node id and enters
 * _ompsm_slave_main(), which is not expected to return.  The parent stores
 * every child's pid in _ompsm_pids[node].
 */
void _ompsm_shmem_fork_process(int n_procs)
{
    int node;

    /* set handler */
    signal(SIGCHLD, (sighandler_t) _ompsm_shmem_SIGCHLD_handler);
    signal(SIGSEGV, (sighandler_t) _ompsm_shmem_abort_handler);
    signal(SIGINT, (sighandler_t) _ompsm_shmem_abort_handler);
    signal(SIGABRT, (sighandler_t) _ompsm_shmem_abort_handler);

    /* the master has no pid entry of its own */
    _ompsm_pids[0] = 0;

    for (node = 1; node < n_procs; node++) {
        int child = fork();

        switch (child) {
        case -1:                /* fork failed */
            perror("fork:");
            _ompsm_shmem_fatal("fork failed\n");
            break;
        case 0:                 /* child side */
            _ompc_node_id = node;
            _ompsm_slave_main();
            _ompsm_shmem_fatal("??? slave_main return ???");
            break;
        default:                /* parent side */
            _ompsm_pids[node] = child;
            break;
        }
    }
}

/*
 * Tear down the process group.
 *
 * Called on a worker (node id != 0): send SIGABRT to the parent and return.
 * Called on the master: SIGKILL every child still recorded in _ompsm_pids[],
 * spin until each entry is 0 (cleared either here, when kill(pid,0) fails,
 * or by the SIGCHLD handler), then remove the backing file unless the
 * /dev/zero variant is compiled in.
 */
void _ompsm_shmem_finalize()
{
    int node;

    if (_ompc_node_id != 0) {
        /* worker: let the master do the cleanup */
        kill(getppid(), SIGABRT);
        return;
    }

    /* master: kill all children that are still recorded */
    for (node = 1; node < _ompc_n_node; node++) {
        if (_ompsm_pids[node] == 0)
            continue;
        kill(_ompsm_pids[node], SIGKILL);
    }

    /* wait until none of them is recorded any more */
    for (node = 1; node < _ompc_n_node; node++) {
        while (_ompsm_pids[node] != 0) {
            if (kill(_ompsm_pids[node], 0) < 0) {
                /* no longer exist */
                _ompsm_pids[node] = 0;
            }
        }
    }

#ifndef USE_DEV_ZERO
    unlink(_ompsm_mmap_file);
#endif
}

/*
 * Create the object that backs the shared region and map memsize bytes of
 * it read/write with MAP_SHARED.
 *
 * With USE_DEV_ZERO the backing object is /dev/zero; otherwise it is the
 * file /tmp/_ompsm_shmem.<pid>, whose name is kept in _ompsm_mmap_file.
 * The descriptor is stored in _ompsm_shared_map_fd and left open.
 *
 * Returns the base address of the mapping.  Any failure is reported through
 * _ompsm_shmem_fatal(), which terminates the process.
 *
 * NOTE(review): the /tmp file name is predictable and the file is opened
 * without O_EXCL; consider hardening this if /tmp is shared with untrusted
 * users.
 */
static caddr_t _ompsm_shmem_mmap(int memsize)
{
    caddr_t start_shared_area;

#ifdef USE_DEV_ZERO
    _ompsm_shared_map_fd = open("/dev/zero", O_RDWR);
    if (_ompsm_shared_map_fd < 0) {
	perror("Open of /dev/zero failed");
	_ompsm_shmem_fatal("OOPS: Could not open anonymous mmap area - check \
protections on /dev/zero\n");
    }
#else 

    /* %d now receives an int, matching the conversion specifier */
    sprintf(_ompsm_mmap_file,"/tmp/_ompsm_shmem.%d",(int)getpid());

    _ompsm_shared_map_fd = open(_ompsm_mmap_file, O_RDWR|O_CREAT,0666);
    if(_ompsm_shared_map_fd < 0){
	perror("Open of shmem mmap file");
	_ompsm_shmem_fatal("canot open shmem mmap file");
    }
#endif

    /*
     * Extend the backing object past memsize by writing a few bytes at that
     * offset.  For the file-backed variant a failure here (e.g. a full
     * disk) would leave the file shorter than the mapping and fault on
     * first touch, so it is treated as fatal and the file is removed.
     */
    if(lseek(_ompsm_shared_map_fd,(off_t)memsize,SEEK_SET) == (off_t)-1 ||
       write(_ompsm_shared_map_fd,&memsize,sizeof(memsize))
	   != (ssize_t)sizeof(memsize)){
#ifndef USE_DEV_ZERO
	perror("extending shmem mmap file");
	unlink(_ompsm_mmap_file);
	_ompsm_shmem_fatal("cannot extend shmem mmap file");
#endif
    }
    lseek(_ompsm_shared_map_fd,(off_t)0,SEEK_SET);

    start_shared_area =
      (caddr_t) mmap((caddr_t)0, (size_t)memsize,
		     PROT_READ|PROT_WRITE,MAP_SHARED,
		    _ompsm_shared_map_fd, (off_t) 0);
    /* mmap reports failure with MAP_FAILED */
    if (start_shared_area == (caddr_t)MAP_FAILED){
	perror("mmap failed");
#ifndef USE_DEV_ZERO
	unlink(_ompsm_mmap_file);	/* do not leave the file behind */
#endif
	_ompsm_shmem_fatal("OOPS: mmap failed: cannot map shared memory");
    }
    return start_shared_area;
}

/*
 * Called from WAIT() after a long run of spin iterations.
 *
 * Issues a select() with no descriptors and a zero timeout.  In debug mode,
 * a non-master process additionally probes its parent with kill(ppid, 0)
 * and aborts through _ompsm_shmem_fatal() when the probe fails.
 */
static void _ompsm_shmem_proc_yield()
{
    struct timeval zero_timeout;

    zero_timeout.tv_usec = 0;
    zero_timeout.tv_sec = 0;
    select(0, NULL, NULL, NULL, &zero_timeout);

    /* the parent check applies only to debug-mode workers */
    if (!(_ompc_debug_flag && !IS_MASTER_NODE))
        return;

    /* in debug mode, check parent status. */
    if (kill(getppid(), 0) < 0)
        _ompsm_shmem_fatal("parent may be dead. exit.\n");
}

/* 
 * master and slave
 */
/*
 * Master side, start of a parallel section: raise thd_start_flag in the
 * shared control block and issue MBAR().
 */
void
_ompsm_shmem_master_begin ()
{
    struct ompsm_shmem_data *shm = _ompsm_shmem_data;

    shm->thd_start_flag = 1;
    MBAR();
}


/*
 * Master side, end of a parallel section: join the parallel-region barrier.
 */
void
_ompsm_shmem_master_end ()
{
    _ompsm_shmem_par_barrier();
}


/*
 * Worker side, start of a parallel section: spin (yielding periodically via
 * WAIT) for as long as thd_start_flag is still 0.
 */
void
_ompsm_shmem_slave_begin ()
{
    struct ompsm_shmem_data *shm = _ompsm_shmem_data;

    WAIT(shm->thd_start_flag == 0);
}


/*
 * Worker side, end of a parallel section: join the parallel-region barrier.
 */
void
_ompsm_shmem_slave_end ()
{
    _ompsm_shmem_par_barrier();
}


/*
 * Barrier that closes a parallel section.
 *
 * The target sense is the current barrier_sense with its low bit flipped.
 * Every node other than 0 stores the target sense in its own barrier_flags
 * slot and then spins until barrier_sense equals it.  Node 0 waits for all
 * other slots to reach the target sense, clears thd_start_flag, and only
 * then publishes the new barrier_sense.
 */
static void
_ompsm_shmem_par_barrier ()
{
    struct ompsm_shmem_data *shm = _ompsm_shmem_data;
    int next_sense = shm->barrier_sense ^ 1;
    int nnodes = _ompc_n_node;
    int self = _ompc_node_id;
    int peer;

    if (self != 0) {
        /* worker: announce arrival, then wait for the master's release */
        shm->barrier_flags[self]._v = next_sense;
        MBAR();
        WAIT(shm->barrier_sense != next_sense);
        return;
    }

    /* master: collect every worker ... */
    for (peer = 1; peer < nnodes; peer++) {
        WAIT(shm->barrier_flags[peer]._v != next_sense);
    }

    /* ... close the parallel section before releasing them */
    shm->thd_start_flag = 0;
    MBAR();
    shm->barrier_sense = next_sense;
    MBAR();
}

/* 
 * memory allocation
 */
caddr_t _ompsm_shmem_alloc(int size)
{
    caddr_t p;
    struct ompsm_shmem_data *dp = _ompsm_shmem_data;
    _ompsm_shmem_lock(&(dp->mlock));
    p = _ompsm_shmem_align_addr((caddr_t )(dp->mem_freep));
    dp->mem_freep = p + size;
    if(dp->mem_freep >= dp->mem_endp){
	_ompsm_shmem_fatal("shared memory run out!");
    }
    _ompsm_shmem_unlock(&(dp->mlock));
    return p;
}

/*
 * Round an address up to the next multiple of 8 (addresses that are already
 * 8-byte aligned are returned unchanged).
 */
caddr_t _ompsm_shmem_align_addr(caddr_t addr)
{
    _omAddrInt_t bumped = ((_omAddrInt_t)addr) + 7;

    return (caddr_t)(bumped & ~7);
}

/*
 * Round a byte count up to the next multiple of 8; multiples of 8 are
 * returned unchanged.
 */
int _ompsm_shmem_align_size(int size)
{
    int bumped = size + 7;

    /* dropping the low three bits is the same as masking with ~7 */
    return bumped - (bumped & 7);
}

caddr_t _ompsm_shmem_alloca(int size)
{
    caddr_t p;
    struct ompsm_shmem_data *dp = _ompsm_shmem_data;

    size = (size+7)&~7;
    p = (caddr_t )dp->mem_endp - size;
    if(dp->mem_freep >= p){
	_ompsm_shmem_fatal("shared memory run out!");
    }
    dp->mem_endp = p;
    return p;
}

/*
 * Release the most recent _ompsm_shmem_alloca() block.
 *
 * `p` must be the current mem_endp (i.e. the block on top of the stack),
 * otherwise the process is terminated through _ompsm_shmem_fatal().  `size`
 * is rounded up to a multiple of 8, the same way the allocation rounded it,
 * and mem_endp is raised by that amount.
 */
void _ompsm_shmem_freea(caddr_t p,int size)
{
    struct ompsm_shmem_data *shm = _ompsm_shmem_data;
    int rounded;

    if (shm->mem_endp != p) {
        _ompsm_shmem_fatal("shmem_freea: bad address");
    }

    rounded = (size + 7) & ~7;
    shm->mem_endp += rounded;
}

int _ompsm_shmem_is_shared(char *p)
{
    return (p >= _ompsm_shmem_data->start && p < _ompsm_shmem_data->end);
}

/*
 * Barrier
 */
/*
 * General barrier across all nodes.
 *
 * The target sense is the current barrier_sense with its low bit flipped.
 * Node 0 waits until every other node's barrier_flags slot holds the target
 * sense and then publishes it in barrier_sense; every other node stores the
 * target sense in its own slot and spins until barrier_sense matches.
 */
void _ompsm_shmem_barrier()
{
    struct ompsm_shmem_data *shm = _ompsm_shmem_data;
    int target, total, me;

    target = shm->barrier_sense ^ 1;
    total = _ompc_n_node;
    me = _ompc_node_id;

    if (me != 0) {
        /* worker: check in, then wait to be released */
        shm->barrier_flags[me]._v = target;
        MBAR();
        WAIT(shm->barrier_sense != target);
    } else {
        /* master: wait for all workers, then release them */
        int other = 1;

        while (other < total) {
            WAIT(shm->barrier_flags[other]._v != target);
            other++;
        }
        shm->barrier_sense = target;
        MBAR();
    }
}

/*
 * Enter the IN/OUT construct.
 *
 * Spins while this node's in_flags slot is still non-zero, then takes the
 * structure lock, takes the next value of in_count (post-increment) and
 * marks this node's slot as entered.
 *
 * Returns the value in_count had before the increment.  The structure lock
 * is still held on return.
 */
int _ompsm_shmem_loop_in_lock0()
{
    struct ompsm_shmem_data *shm = _ompsm_shmem_data;
    int self = _ompc_node_id;
    int ticket;

    WAIT((volatile int)shm->in_flags[self]._v);

    _ompsm_shmem_lock(&shm->lock);
    ticket = shm->in_count;
    shm->in_count = ticket + 1;
    shm->in_flags[self]._v = 1;

    return ticket;
}

/*
 * Leave the IN/OUT construct.
 *
 * Under the structure lock, counts this node as exited.  When the exit
 * count reaches the number of nodes, both counters and every node's
 * in_flags slot are reset (followed by MBAR()) so the construct can be
 * entered again.
 */
void _ompsm_shmem_loop_out()
{
    struct ompsm_shmem_data *shm = _ompsm_shmem_data;
    int node;

    _ompsm_shmem_lock(&shm->lock);

    shm->out_count++;
    if (shm->out_count == _ompc_n_node) {
        /* all thread exit */
        shm->out_count = 0;
        shm->in_count = 0;
        node = 0;
        while (node < _ompc_n_node) {
            shm->in_flags[node]._v = 0;
            node++;
        }
        MBAR();
    }

    _ompsm_shmem_unlock(&shm->lock);
}

/*
 * Arrival counter guarded by the structure lock.
 *
 * Spins while this node's in_flags slot is non-zero, takes the structure
 * lock and post-increments out_count.  If that makes out_count equal to the
 * number of nodes, every in_flags slot and out_count are cleared; otherwise
 * this node's slot is set to 1.
 *
 * Returns the value out_count had before the increment.  The structure lock
 * is still held on return.
 */
int _ompsm_shmem_count_lock0()
{
    struct ompsm_shmem_data *shm = _ompsm_shmem_data;
    int self = _ompc_node_id;
    int arrival;
    int k;

    WAIT((volatile int)shm->in_flags[self]._v);
    _ompsm_shmem_lock(&shm->lock);

    arrival = shm->out_count++;
    if (shm->out_count != _ompc_n_node) {
        shm->in_flags[self]._v = 1;
        return arrival;
    }

    /* if all threads comes, clear flags */
    for (k = 0; k < _ompc_n_node; k++) {
        shm->in_flags[k]._v = 0;
    }
    shm->out_count = 0;
    return arrival;
}

void _ompsm_shmem_lock0()
{
    struct ompsm_shmem_data *dp = _ompsm_shmem_data;
    _ompsm_shmem_lock(&(dp->lock));
}

void _ompsm_shmem_unlock0()
{
    struct ompsm_shmem_data *dp = _ompsm_shmem_data;
    _ompsm_shmem_unlock(&(dp->lock));
}

/* Flush entry point: expands to whatever MBAR() is on this CPU. */
void _ompsm_shmem_flush()
{
    MBAR();
}

/*
 * spin lock function
 */
/* Put a spin lock into the unlocked state (value 0). */
void _ompsm_shmem_init_lock(_ompc_lock_t *lp)
{
    lp[0] = 0;
}

/* Spin locks own no resources, so destroying one is a no-op. */
void _ompsm_shmem_destroy_lock(volatile _ompc_lock_t *lp)
{
    /* do nothing */
    (void)lp;
}

# ifdef OMNI_CPU_SPARC
/*
 * SPARC spin lock, built on the ldstub instruction.  The three primitives
 * below are not C functions: their bodies are emitted as global assembler
 * labels from inside __dummy() further down.
 */
void LockWithLdstUB(volatile int *);
void UnlockWithLdstUB(volatile int *);
int  TestLockWithLdstUB(volatile int *);

/* Acquire the lock; does not return until it has been obtained. */
void _ompsm_shmem_lock(volatile _ompc_lock_t *lp)
{
    LockWithLdstUB(lp);
}

/* Release the lock. */
void _ompsm_shmem_unlock(volatile _ompc_lock_t *lp)
{
    UnlockWithLdstUB(lp);
}

/*
 * Single acquisition attempt.  Returns the logical negation of what
 * TestLockWithLdstUB() hands back, i.e. non-zero when the value fetched by
 * ldstub was zero.
 */
int _ompsm_shmem_test_lock(volatile _ompc_lock_t *lp)
{
    return !TestLockWithLdstUB(lp);
}

/*
 * Container for the hand-written assembly.  This function is only a vehicle
 * for getting the asm statements into the object file; the code is entered
 * through the global labels it defines.
 */
void
__dummy()
{
    asm(".align  8");
    asm(".skip   16");
    /* LockWithLdstUB: ldstub the lock byte; if the fetched value is zero
     * branch to "out" and return.  Otherwise spin at "loop" with plain
     * ldub loads until the byte reads zero, then go back to "retry". */
    asm(".type   LockWithLdstUB,#function");
    asm(".global LockWithLdstUB");
    asm("LockWithLdstUB:");
    asm("retry:  ldstub  [%o0],%o1       ! atomic load store");
    asm("tst     %o1");
    asm("be      out");
    asm("nop");
    asm("loop:   ldub    [%o0],%o1       ! load and test");
    asm("tst     %o1");
    asm("bne     loop");
    asm("nop");
    asm("ba,a    retry");
    asm("out:    nop");
    asm("jmp     %o7+8   ! return");
    asm("nop");

    /* TestLockWithLdstUB: one ldstub whose result is left in %o0 as the
     * return value. */
    asm(".type   TestLockWithLdstUB,#function");
    asm(".global TestLockWithLdstUB");
    asm("TestLockWithLdstUB:");
    asm("ldstub  [%o0],%o0       ! atomic load store");
    asm("jmp     %o7+8   ! return");
    asm("nop");

    /* UnlockWithLdstUB: stbar, then store a zero byte to the lock. */
    asm(".type   UnlockWithLdstUB,#function");
    asm(".global UnlockWithLdstUB");
    asm("UnlockWithLdstUB:");
    asm("stbar");
    asm("stb     %g0,[%o0]       ! clear lock");
    asm("jmp     %o7+8           ! return");
    asm("nop");
}
# endif /* OMNI_CPU_SPARC */

# ifdef OMNI_CPU_I386
/*
 * i386 spin lock, built on xchgl.
 *
 * _xchg_1(p) stores 1 into *p with xchgl and returns the value that was
 * there before.  Its body is emitted as a global assembler label from
 * inside _dummy() below.
 */
int _xchg_1 (volatile int *p);

/*
 * Container for the hand-written assembly of _xchg_1: set up a frame, load
 * the pointer argument into %edx, load 1 into %eax, exchange %eax with the
 * word at (%edx), and return with the previous value in %eax.
 */
void _dummy ()
{
  asm ("	.align 4			");
  asm ("	.type	 _xchg_1,@function	");
  asm (".globl _xchg_1				");
  asm ("_xchg_1:				");
  asm ("	pushl %ebp			");
  asm ("	movl %esp,%ebp			");
  asm ("	movl 8(%ebp),%edx		");
  asm ("	movl $1,%eax			");
  asm ("	xchgl 0(%edx),%eax		");
  asm ("	leave				");
  asm ("	ret				");
}

/*
 * Acquire the lock: spin with plain reads while the lock word is non-zero,
 * then try to take it with _xchg_1(); if the exchange shows it was already
 * taken, go back to spinning.
 */
void _ompsm_shmem_lock(volatile _ompc_lock_t *lp)
{
 again:
    while(*lp != 0) /* spin wait */;
    if(_xchg_1(lp) != 0) goto again;
}

/* Release the lock by storing 0 into the lock word. */
void _ompsm_shmem_unlock(volatile _ompc_lock_t *lp)
{
    *lp = 0;
}

/*
 * Single acquisition attempt.  Returns 1 when the lock was obtained (the
 * previous value was 0), 0 when it was already held.
 */
int _ompsm_shmem_test_lock(volatile _ompc_lock_t *lp)
{
    if(_xchg_1(lp) != 0) return 0;
    else return 1;
}

# endif /* OMNI_CPU_I386 */

# ifdef OMNI_CPU_MIPS
/* call SGI library */

/* Acquire the lock: repeat the test-and-set until the previous value it
 * returns is 0. */
void _ompsm_shmem_lock(volatile _ompc_lock_t *lp)
{
  while (__lock_test_and_set(lp, 1) != 0);
}

/* Release the lock. */
void _ompsm_shmem_unlock(volatile _ompc_lock_t *lp)
{
  __lock_release(lp);
}

/*
 * Single acquisition attempt.  Returns 1 when the lock was obtained and 0
 * when it was already held, matching the SPARC and I386 variants of this
 * function.
 *
 * _ompsm_shmem_lock() above treats a zero result of __lock_test_and_set()
 * as "acquired", so the raw result must be inverted here; returning it
 * unchanged reported success exactly when the attempt had failed.
 */
int _ompsm_shmem_test_lock(volatile _ompc_lock_t *lp)
{
  return (__lock_test_and_set(lp, 1) == 0);
}

# endif /* OMNI_CPU_MIPS */


/*
 * Broadcast `size` bytes of thread-private data.
 *
 * Outside a parallel region, or inside a nested one, this is a plain local
 * copy from src to dst (skipped when both are the same address).
 *
 * Otherwise the data travels through the shared staging buffer in pieces of
 * at most sizeof(cpybuf) bytes.  For each piece the master thread copies
 * from src into the buffer and then hits a barrier; every other thread hits
 * that barrier first and then copies from the buffer into dst.  A second
 * barrier closes each round before the buffer is reused.
 */
void
_ompsm_shmem_bcast_thdprv(void *dst, void *src, int size)
{
    struct ompsm_shmem_data *shm = _ompsm_shmem_data;
    int chunk_max = sizeof(shm->cpybuf);
    char *from = src;
    char *to = dst;
    int chunk;

    if ((NOT_IN_PARALLEL) || (IN_NESTED_PARALLEL)) {
        if (src != dst) {
            bcopy(src, dst, size);
        }
        return;
    }

    for (; 0 < size; from += chunk, to += chunk, size -= chunk) {
        chunk = (size < chunk_max) ? size : chunk_max;

        if (IS_MASTER_THREAD) {
            /* producer: fill the buffer, then let the others read it */
            bcopy(from, shm->cpybuf, chunk);
            _ompsm_shmem_barrier();
        } else {
            /* consumer: wait for the buffer to be filled, then read it */
            _ompsm_shmem_barrier();
            bcopy(shm->cpybuf, to, chunk);
        }

        /* everyone is done with this piece */
        _ompsm_shmem_barrier();
    }
}


/*
 * dummy 
 */
/* Nest locks are not supported by this backend: always fatal. */
void _ompsm_shmem_init_nlock (volatile _ompc_nest_lock_t *lp)
{
    _ompsm_shmem_fatal("nest lock is not implement");
}

/* Nest locks are not supported by this backend: always fatal. */
void _ompsm_shmem_destroy_nlock (volatile _ompc_nest_lock_t *lp)
{
    _ompsm_shmem_fatal("nest lock is not implement");
}

/* Nest locks are not supported by this backend: always fatal. */
void _ompsm_shmem_lock_nlock (volatile _ompc_nest_lock_t *lp)
{
    _ompsm_shmem_fatal("nest lock is not implement");
}

/* Nest locks are not supported by this backend: always fatal. */
void _ompsm_shmem_unlock_nlock (volatile _ompc_nest_lock_t *lp)
{
    _ompsm_shmem_fatal("nest lock is not implement");
}

/*
 * Nest locks are not supported by this backend: always fatal.  The return
 * statement only satisfies the function's int return type.
 */
int _ompsm_shmem_testlock_nlock (volatile _ompc_nest_lock_t *lp)
{
    _ompsm_shmem_fatal("nest lock is not implement");

    return 0;
}

