    /*        Fast GEMM routine for Alpha 21164/21264      */
    /*         on  Linux, Digital UNIX                     */
    /*        by Kazushige Goto <goto@statabo.rim.or.jp>   */

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <sys/mman.h>
#include <pthread.h>
#include "common.h"
#include "version.h"

/* Size in bytes of the scratch area that each GEMM call maps with mmap(),
   and the bit mask used to round the start of that area up to an aligned
   address (the code computes (addr + ALIGN_SIZE) & ~ALIGN_SIZE). */
#ifdef EV6
#define BUFFER_SIZE (16<<20)    /* 16MB */
#define ALIGN_SIZE 0x7fffff     /* mask for 8MB alignment */
#else
#define BUFFER_SIZE (1<<20)     /* 1MB */
#define ALIGN_SIZE  0x3ffff     /* mask for 256kB alignment */
#endif

/* Library version string (VERSION is presumably supplied by version.h).
   Nothing in the visible code reads this variable. */
static char *version = VERSION;

#ifdef SMP
/* Argument bundle that GEMM_ fills in once and every GEMM_THREAD call reads. */
typedef struct {
  int    trans;   /* kernel selector; GEMM_ stores (notb<<1)|nota here */
  int    m;       /* problem sizes as received by GEMM_ */
  int    n;
  int    k;
  FLOAT  alpha;   /* scalar forwarded to the kernel */
  FLOAT *a;       /* matrix A and its leading dimension */
  int    lda;
  FLOAT *b;       /* matrix B and its leading dimension */
  int    ldb;
  FLOAT *c;       /* matrix C and its leading dimension */
  int    ldc;
} GEMM_PARAM_T;

/* State shared between GEMM_ (which writes it before starting the workers)
   and GEMM_THREAD (which only reads it).  Both are file-scope statics, so
   they describe a single GEMM_ call at a time.
   NOTE(review): two GEMM_ calls running concurrently would overwrite each
   other's values here -- confirm callers never do that. */
static GEMM_PARAM_T param;   /* operands of the call in progress */
static int          offset;  /* number of columns of C given to each worker */

/*
 * GEMM_THREAD -- SMP worker: runs the selected kernel on one slice of
 * columns of C.
 *
 * arg is not a real pointer; it carries two values packed into an integer:
 *   bit 0      : 1 when the function runs as a thread started with
 *                pthread_create() (it then finishes with pthread_exit()),
 *                0 when it is called directly.
 *   bits 1...  : index of the first column of C handled by this call.
 *
 * The operands come from the file-scope `param`; the slice is at most
 * `offset` columns wide and is clipped at param.n.
 *
 * Each call maps its own scratch buffer and unmaps it before finishing.
 *
 * Returns 0 on success and 1 when the scratch buffer cannot be mapped
 * (the failure is reported through xerbla_ first).  When bit 0 of arg is
 * set and the kernel has run, the function does not return; it ends the
 * thread with pthread_exit(NULL).
 */
int GEMM_THREAD(void *arg){
  long current = (long)arg;
  int jn, info, pthread_flag;
  FLOAT *b, *buffer, *a_buffer;
  /* Kernel table indexed by param.trans: bit 0 selects the first letter of
     the kernel name, bit 1 the second. */
  int (*gemm[])(int, int, int, FLOAT, FLOAT *, int, FLOAT* ,
		 int, FLOAT *, int, FLOAT *)
    ={GEMM_NN, GEMM_TN, GEMM_NT, GEMM_TT};

  buffer = (FLOAT *)mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

  /* mmap() signals failure with MAP_FAILED, not with a null pointer. */
  if ((void *)buffer == MAP_FAILED){
    info = INFO_NUM;
#ifdef DGEMM
    xerbla_("DGEMM ", &info, 6L);
#else
    xerbla_("SGEMM ", &info, 6L);
#endif
    return 1;
  }

  /* Round the start of the mapping up to the next (ALIGN_SIZE + 1)-byte
     boundary.  NOTE(review): the bytes skipped here are no longer available
     to the kernel; how much scratch space the kernels need is not visible
     in this file. */
  a_buffer = (FLOAT *)(((long)buffer + ALIGN_SIZE) & ~ALIGN_SIZE);

  /* Unpack the argument: low bit = "started by pthread_create",
     remaining bits = first column of this slice. */
  pthread_flag = (current & 1);
  current >>= 1;

  /* Width of this slice; the last slice may be narrower than `offset`. */
  jn = MIN(offset, param.n - current);

  /* Locate the part of B that belongs to this slice: skip `current` whole
     columns (ldb elements each) when bit 1 of trans is clear, otherwise
     skip `current` elements. */
  if (((param.trans)>>1)==0){
    b = param.b + current*param.ldb;
  }else{
    b = param.b + current;
  }

  (gemm[param.trans])(param.m, jn, param.k,
			param.alpha, param.a, param.lda,
			b, param.ldb,
			param.c + current*param.ldc, param.ldc, a_buffer);

  munmap((void *)buffer, BUFFER_SIZE);

  if (pthread_flag) pthread_exit(NULL);
  return 0;
}
#endif

/*
 * GEMM_ -- front end intended to be call-compatible with gemm.f.
 *
 *     C := alpha * A x B + beta * C
 *
 * where A and B are each used as stored or transposed, as selected by
 * TRANSA and TRANSB.  Every argument is passed by address.
 *
 *   TRANSA, TRANSB  one character each, case-insensitive.  'N' and 'R'
 *                   select the kernel variant named "N"; 'C' and 'T'
 *                   select the variant named "T"; anything else is an
 *                   error.
 *   M, N, K         problem sizes; must not be negative.
 *   ALPHA, BETA     scalars.
 *   a, ldA          matrix A and its leading dimension.
 *   b, ldB          matrix B and its leading dimension.
 *   c, ldC          matrix C and its leading dimension.
 *
 * Steps:
 *   1. Check the arguments.  On the first invalid one, xerbla_ is called
 *      with that argument's position and the routine returns.
 *   2. Return at once when M or N is zero.
 *   3. When BETA is not ONE, hand C to GEMM_BETA together with beta.
 *   4. Return when ALPHA is ZERO or K is zero.
 *   5. Run the kernel chosen by the two transpose flags -- directly in the
 *      non-SMP build, or split by columns of C across threads in the SMP
 *      build.
 *
 * Always returns 0; errors are reported only through xerbla_.
 */
int GEMM_(char *TRANSA, char *TRANSB,
	    int *M, int *N, int *K,
	    FLOAT *ALPHA,
	    FLOAT *a, int *ldA,
	    FLOAT *b, int *ldB,
	    FLOAT *BETA,
	    FLOAT *c, int *ldC){

  int nota, notb, nrowa, nrowb;
  int lda, ldb, ldc, m, n, k, info;
  FLOAT alpha, beta;
  char transA, transB;

#ifndef SMP
  FLOAT *buffer, *a_buffer;
  /* Kernel table indexed by (notb<<1)|nota. */
  int (*gemm[])(int, int, int, FLOAT, FLOAT *, int, FLOAT* ,
		 int, FLOAT *, int, FLOAT *)
    ={GEMM_NN, GEMM_TN, GEMM_NT, GEMM_TT};
#else
  int   i, j;
  pthread_t threads[CPU_NUM];
  int   started[CPU_NUM];   /* started[i] != 0 when threads[i] is joinable */
  int   num_of_cpu;
#endif

  lda = *ldA;
  ldb = *ldB;
  ldc = *ldC;
  m   = *M;
  n   = *N;
  k   = *K;
  alpha = *ALPHA;
  beta  = *BETA;

  /* toupper() is only defined for values representable as unsigned char
     (or EOF), so convert before calling it. */
  transA = (char)toupper((unsigned char)*TRANSA);
  transB = (char)toupper((unsigned char)*TRANSB);

  /* Despite their names, nota/notb are nonzero when the flag is something
     other than 'N' or 'R', i.e. when the "T" kernel variant is wanted. */
  nota = ((transA != 'N') && (transA != 'R'));
  notb = ((transB != 'N') && (transB != 'R'));

  /* Number of rows of A and B as stored. */
  if (nota) nrowa = k; else nrowa = m;
  if (notb) nrowb = n; else nrowb = k;

  /* Test the input parameters; info is the position of the first bad one. */
  info = 0;
  if (nota && (transA != 'C') && (transA != 'T')) info = 1;
  else if (notb && (transB != 'C') && (transB != 'T')) info = 2;
  else if (m < 0) info = 3;
  else if (n < 0) info = 4;
  else if (k < 0) info = 5;
  else if (lda < nrowa) info = 8;
  else if (ldb < nrowb) info = 10;
  else if (ldc < m) info = 13;

  if (info){
#ifdef DGEMM
    xerbla_("DGEMM ", &info, 6L);
#else
    xerbla_("SGEMM ", &info, 6L);
#endif
    return 0;
  }

  /* Nothing to do for an empty C. */
  if ((m==0) || (n==0)) return 0;

  if (beta != ONE) GEMM_BETA(m, n, c, ldc, beta);

  /* The product term contributes nothing; C is already final. */
  if ((alpha==ZERO) || (k==0)) return 0;

#ifndef SMP
  buffer = (FLOAT *)mmap(0, BUFFER_SIZE, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

  /* mmap() signals failure with MAP_FAILED, not with a null pointer. */
  if ((void *)buffer == MAP_FAILED){
    info = INFO_NUM;
#ifdef DGEMM
    xerbla_("DGEMM ", &info, 6L);
#else
    xerbla_("SGEMM ", &info, 6L);
#endif
    return 0;
  }

  /* Round the start of the mapping up to the next (ALIGN_SIZE + 1)-byte
     boundary before handing it to the kernel. */
  a_buffer = (FLOAT *)(((long)buffer + ALIGN_SIZE) & ~ALIGN_SIZE);

  (gemm[((notb<<1)|nota)])(m, n, k, alpha, a, lda, b, ldb, c, ldc, a_buffer);

  munmap((void *)buffer, BUFFER_SIZE);

#else  /* SMP */

  /* Publish the operands for the workers. */
  param.trans = ((notb<<1) | nota);
  param.m     = m;
  param.n     = n;
  param.k     = k;
  param.alpha = alpha;
  param.a     = a;
  param.lda   = lda;
  param.b     = b;
  param.ldb   = ldb;
  param.c     = c;
  param.ldc   = ldc;

  /* Columns per worker: n divided by CPU_NUM rounded up, but never fewer
     than 8.  num_of_cpu is the number of slices that results, which is at
     most CPU_NUM. */
  offset = (n-1)/(CPU_NUM)+1;
  if (offset < 8) offset = 8;

  num_of_cpu = (n-1)/offset + 1;

  /* Start a thread for every slice except the last.  The argument packs the
     first column of the slice (shifted left by one, done in long so the
     shift cannot overflow int) with a low bit of 1 meaning "running in a
     created thread". */
  for(j=0, i=0; i < num_of_cpu - 1; j+= offset, i++){
    started[i] = (pthread_create(&threads[i], NULL,
				 (void *(*)(void *))GEMM_THREAD,
				 (void *)((((long)j)<<1)|1)) == 0);

    /* If the thread could not be created, compute its slice right here so
       that no part of C is left out. */
    if (!started[i]) GEMM_THREAD((void *)(((long)j)<<1));
  }

  /* The calling thread handles the last slice itself (low bit 0). */
  GEMM_THREAD((void *)(((long)j)<<1));

  /* Wait only for threads that were actually started. */
  for(i=0; i < num_of_cpu - 1; i++){
    if (started[i]) pthread_join(threads[i], NULL);
  }
#endif

  return 0;
}
