/*******************************************************
                        PFTOOLS
 *******************************************************
  Oct 3, 2011 numa_threads.h
 *******************************************************
 (C) 2011 Swiss Institute of Bioinformatics
     Thierry Schuepbach (thierry.schuepbach@isb-sib.ch)
 *******************************************************/
#ifndef NUMA_THREADS_H_
#define NUMA_THREADS_H_
#define _FILE_OFFSET_BITS 64
#include <numa.h>

#ifdef __USE_MMAP__
#include <numaif.h>
#include <sys/mman.h>
#endif

// #define volatile 

#ifdef NUMA_DEBUG
/*
 * Debug helper: write the mask returned by numa_get_run_node_mask() into
 * `mask` as a string of '0'/'1' characters, one per bit, NUL-terminated.
 *
 * mask: output buffer supplied by the caller. It must hold at least
 *       (number of bits in the mask + 1) bytes; the size cannot be checked
 *       here because the signature carries no length.
 */
void numa_run_mask(char* mask)
{
  struct bitmask * const bit = numa_get_run_node_mask();
  if (bit == NULL) {
    mask[0] = '\0';
    return;
  }
  for (unsigned long i = 0; i < bit->size; ++i) {
    mask[i] = numa_bitmask_isbitset(bit, i) ? '1' : '0';
  }
  mask[bit->size] = '\0';
  /* The mask is allocated by libnuma for the caller; release it to avoid a leak. */
  numa_bitmask_free(bit);
}

#endif

/*
 * Argument block handed to one SIMD(numa_node) thread.
 */
struct numa_NodeData {
  const struct Profile * prf;      /* profile; forwarded unchanged to every worker thread */
  const FASTAStructure * FASTA;    /* sequence index (DataPtr[] offsets, MaxSequenceSize) */
  char * SequenceFileName;         /* database path: opened by the node (mmap build) or by each worker */
//   char * Sequences;
  __32bitData * Array;             /* per-sequence slots: heuristic scores are written here, then the
                                      slots are compacted in place into ToDoID sequence indices */
  FILE ** TempOutputFile;          /* not referenced by SIMD(numa_node) */
  volatile size_t *shares;         /* range boundaries; entries shareStart .. shareStart+nthreads delimit
                                      this node's workers. The node also stores its pass count in
                                      shares[NodeId] after the heuristic and filter phases. */
  volatile size_t shareStart;      /* index into shares[] of this node's first boundary */
//   volatile size_t maxShare;
  size_t NodeId;                   /* bit set in the binding mask; also used as an index into shares[] */
//   size_t nNodes;
  volatile size_t nthreads;        /* number of worker threads the node creates */
};

/*
 * Argument block handed to one SIMD(numa_thread) worker. It is filled in by
 * SIMD(numa_node), which updates start/stop (and the mapping fields in the
 * mmap build) before waking the workers for the filter and alignment phases.
 */
struct numa_ThreadData {
  const struct Profile * prf;          /* profile */
  const FASTAStructure * FASTA;        /* sequence index */
#ifdef __USE_MMAP__
  volatile char * SequenceFileMap;     /* base address of the node's current file mapping */
  volatile off_t InitialArrayOffset;   /* page-aligned file offset at which that mapping starts */
#ifdef MMAP_DEBUG
  volatile size_t * maplength;         /* points at the node's mapping length (debug only) */
#endif
#else
  char * SequenceFileName;             /* database path; each worker fopen()s it itself */
#endif
  char * Sequences;                    /* per-thread sequence buffer inside the node's local memory */
  __32bitData * Array;                 /* node-wide score / ToDoID array shared by the node's workers */
  TransposeMatrix TransposeMatch;      /* node-wide transposed match matrix (.f or .i by SIMD_VER) */
  int * FilterWork;                    /* per-thread work array for the heuristic/filter kernels */
  union lScores * iop;                 /* per-thread alignment work array, passed to xalip_ptr/xalit_ptr */
  union Positions * iom;               /* per-thread alignment work array, passed to xalip_ptr */
  union Positions * ioi;               /* per-thread alignment work array, passed to xalip_ptr */
  struct Alignment * alignment;        /* per-thread alignment results (node reserves NALI entries) */
  _Bool * lock;                        /* per-thread lock array, cleared before each alignment */
  FILE ** TempOutputFile;              /* not set by SIMD(numa_node) nor read by SIMD(numa_thread) */
  volatile int ** FilterScores;        /* address of the node's filter-score array pointer */
  size_t NodeFirstSeq;                 /* global index of the node's first sequence (heuristic phase) */
  volatile size_t start;               /* first index (inclusive) of this worker's range for the phase */
  volatile size_t stop;                /* end index (exclusive) of this worker's range for the phase */
  unsigned int counter;                /* not set by SIMD(numa_node) nor read by SIMD(numa_thread) */
  size_t threadId;                     /* worker index within its node */
#ifdef NUMA_DEBUG
  size_t NodeId;                       /* owning node, used in debug messages only */
#endif
  pthread_mutex_t *ThreadMutex;        /* protects *thread_counter (worker -> node notification) */
  pthread_cond_t  *ThreadCond;         /* signalled by the worker that brings *thread_counter to 0 */
  volatile int * thread_counter;       /* workers still busy in the current phase */
  pthread_mutex_t *NodeMutex;          /* protects *NodeDone (node -> worker wake-up) */
  pthread_cond_t  *NodeCond;           /* broadcast by the node to start the next phase */
  volatile _Bool *NodeDone;            /* node flag tested by the worker before waiting on NodeCond */
};


/* Mutex for Node to trigger Master:
 * each node decrements NodeCounter under NodeMutex at the end of the
 * heuristic and filter phases and signals NodeCond once it drops to 0 or
 * below.
 * NOTE(review): these are definitions (not `extern` declarations) living in
 * a header; this is only safe if exactly one translation unit includes it. */
pthread_mutex_t NodeMutex;
pthread_cond_t  NodeCond;
volatile int NodeCounter;

/* Mutex for Master to trigger Nodes:
 * nodes test MasterDone under MasterMutex and wait on MasterCond when it is
 * not set, before entering the filter and alignment phases. */
pthread_mutex_t MasterMutex;
pthread_cond_t  MasterCond;
volatile _Bool MasterDone;

#endif


static void * SIMD(numa_thread)(void * _Data) 
{ 
/*#ifdef NUMA_DEBUG
  char mask[1024];
  numa_run_mask(mask);
  fprintf(stderr,"Thread %lu from node %lu started with cpu mask %s\n",
	  ((struct numa_ThreadData*) _Data)->threadId,
	  ((struct numa_ThreadData*) _Data)->NodeId,
	  mask);
#endif*/
  ////////////////////////////////////////////////////////////////////////////////////////
  // 1. Parse _Data according to numa_threadData
  const struct Profile * const restrict prf   = ((struct numa_ThreadData*) _Data)->prf;
  const FASTAStructure * const restrict FASTA = ((struct numa_ThreadData*) _Data)->FASTA;
  Sequence SeqData                            = { {Memory: ((struct numa_ThreadData*) _Data)->Sequences }, { NULL, 0}} ;
  pthread_mutex_t * ThreadMutex               = ((struct numa_ThreadData*) _Data)->ThreadMutex;
  pthread_cond_t * ThreadCond                 = ((struct numa_ThreadData*) _Data)->ThreadCond;
  pthread_mutex_t * NodeMutex                 = ((struct numa_ThreadData*) _Data)->NodeMutex;
  pthread_cond_t * NodeCond                   = ((struct numa_ThreadData*) _Data)->NodeCond;
  
  ////////////////////////////////////////////////////////////////////////////////////////
  // 2. Open database file
  // TODO: Add error check here
#ifndef __USE_MMAP__
  FILE * const inSequence = fopen(((struct numa_ThreadData*) _Data)->SequenceFileName, "r");
#else
  const char * const restrict SequenceFileMap = ((struct numa_ThreadData*) _Data)->SequenceFileMap;
  const size_t InitialArrayOffset             = ((struct numa_ThreadData*) _Data)->InitialArrayOffset;
#endif
  ////////////////////////////////////////////////////////////////////////////////////////
  // 3. Heuristic phase
  {
#if SIMD_VER  == 2
    const float * const restrict TransposeMatch = ((struct numa_ThreadData*) _Data)->TransposeMatch.f;
    float * const restrict Scores               = &( ((struct numa_ThreadData*) _Data)->Array[0].FloatScores );
#elif SIMD_VER == 41
    const int * const restrict TransposeMatch   = ((struct numa_ThreadData*) _Data)->TransposeMatch.i;
    unsigned int * const restrict Scores        = &( ((struct numa_ThreadData*) _Data)->Array[0].UnsignedScores );
#endif
#ifndef STACK
 #if SIMD_VER  == 2
    float * const Work                          = (float*) ((struct numa_ThreadData*) _Data)->FilterWork;
 #elif SIMD_VER == 41
    int * const Work                            = (int*) ((struct numa_ThreadData*) _Data)->FilterWork;   
 #endif
#endif
    const register size_t NodeFirstSeq          = ((struct numa_ThreadData*) _Data)->NodeFirstSeq;
    const register size_t Start                 = ((struct numa_ThreadData*) _Data)->start;
    const register size_t Stop                  = ((struct numa_ThreadData*) _Data)->stop;
#ifdef NUMA_DEBUG
    fprintf(stderr, "\t\tThread %lu of Node %lu heuristic sequences from %lu to %lu\n",
	    ((struct numa_ThreadData*) _Data)->threadId,
	    ((struct numa_ThreadData*) _Data)->NodeId,
	    NodeFirstSeq+Start, NodeFirstSeq+Stop);
#endif       
    /* LOOPS ON SEQUENCES */
    for (size_t i=Start; i<Stop; ++i) {
#ifdef __USE_MMAP__
      PFSequence * PFSeq = MMAPReadSequenceIndex(&SeqData, NodeFirstSeq+i, SequenceFileMap, FASTA->DataPtr, InitialArrayOffset
#ifdef MMAP_DEBUG
	, ((struct numa_ThreadData*) _Data)->threadId, ((struct numa_ThreadData*) _Data)->NodeId, *(((struct numa_ThreadData*) _Data)->maplength)
#endif
      );
//       fprintf(stderr, "Seq %lu\n%s\n",NodeFirstSeq+i, PFSeq->ProfileIndex);
#else
      PFSequence * PFSeq = ReadSequenceIndex(&SeqData, NodeFirstSeq+i, inSequence, FASTA->DataPtr);
#endif
      /* Translate first sequence */
      PFSeq = TranslateSequenceToIndex(PFSeq, prf->Alphabet_Mapping);
#ifdef STACK
      Scores[i] = SIMD(TransposeHeuristic)(TransposeMatch, prf->Alphabet_Length, prf->Length, PFSeq);
#else
      Scores[i] = SIMD(TransposeHeuristicGivenMemory)(TransposeMatch, Work,
						     prf->Alphabet_Length, prf->Length, PFSeq);                            
#endif
    }

    /* Notify node of finished stage */
    pthread_mutex_lock(ThreadMutex);
#ifdef NUMA_DEBUG
    fprintf(stderr, "\t\tThread %lu of Node %lu heuristic finished: counter at %i\n",
	    ((struct numa_ThreadData*) _Data)->threadId,
	    ((struct numa_ThreadData*) _Data)->NodeId,
	    *(((struct numa_ThreadData*) _Data)->thread_counter));
#endif    
    *(((struct numa_ThreadData*) _Data)->thread_counter) -= 1;
    if (*(((struct numa_ThreadData*) _Data)->thread_counter) == 0) {
      pthread_cond_signal(ThreadCond);
#ifdef NUMA_DEBUG
      fprintf(stderr, "\t\tThread %lu of Node %lu notify Node.\n", ((struct numa_ThreadData*) _Data)->threadId,
	    ((struct numa_ThreadData*) _Data)->NodeId);
#endif
    }
    pthread_mutex_unlock(ThreadMutex);
  }
  
  ////////////////////////////////////////////////////////////////////////////////////////
  // 3. Filter phase
  /* Wait for signal from master node */
  pthread_mutex_lock(NodeMutex);
  if (! *(((struct numa_ThreadData*) _Data)->NodeDone)) {
#ifdef NUMA_DEBUG
      fprintf(stderr, "\t\tThread %lu of Node %lu waiting on Node.\n", ((struct numa_ThreadData*) _Data)->threadId,
	    ((struct numa_ThreadData*) _Data)->NodeId);
#endif  
    pthread_cond_wait(NodeCond, NodeMutex);
  }
#ifdef NUMA_DEBUG 
  else {
    fprintf(stderr, "\t\tThread %lu of Node %lu DID NOT WAIT !!!\n",
	    ((struct numa_ThreadData*) _Data)->threadId,
	    ((struct numa_ThreadData*) _Data)->NodeId);
  }
#endif
  pthread_mutex_unlock(NodeMutex);
  
  {
    const unsigned int * const restrict SeqID   = &( ((struct numa_ThreadData*) _Data)->Array->ToDoID );
    int * const restrict Scores                 = *( ((struct numa_ThreadData*) _Data)->FilterScores );
    int * restrict Work                         = ((struct numa_ThreadData*) _Data)->FilterWork;
    const register size_t Start                 = ((struct numa_ThreadData*) _Data)->start;
    const register size_t Stop                  = ((struct numa_ThreadData*) _Data)->stop;
#ifdef __USE_MMAP__
    const char * const restrict SequenceFileMap2= ((struct numa_ThreadData*) _Data)->SequenceFileMap;
    const size_t InitialArrayOffset2            = ((struct numa_ThreadData*) _Data)->InitialArrayOffset;
#endif
#ifdef NUMA_DEBUG
      fprintf(stderr, "\t\tThread %lu of Node %lu filtering %lu to %lu.\n",
	      ((struct numa_ThreadData*) _Data)->threadId, ((struct numa_ThreadData*) _Data)->NodeId,
	      Start, Stop );
#endif    
    /* LOOPS ON SEQUENCES */
    for (size_t i=Start; i<Stop; ++i) {
#ifdef __USE_MMAP__
      PFSequence * PFSeq = MMAPReadSequenceIndex(&SeqData, (size_t) SeqID[i], SequenceFileMap2, FASTA->DataPtr, InitialArrayOffset2
#ifdef MMAP_DEBUG
	, ((struct numa_ThreadData*) _Data)->threadId, ((struct numa_ThreadData*) _Data)->NodeId, *(((struct numa_ThreadData*) _Data)->maplength)
#endif
      );
#else
      PFSequence * PFSeq = ReadSequenceIndex(&SeqData, (size_t) SeqID[i], inSequence, FASTA->DataPtr);
#endif

      /* Translate first sequence */
      PFSeq = TranslateSequenceToIndex(PFSeq, prf->Alphabet_Mapping);
      
      Scores[i] = xali1_ptr(prf, PFSeq->ProfileIndex, Work, 0,
			    PFSeq->Length, prf->CutOffData.ICUT[0], false);     
    }
  }
  
  /* Notify node of finished stage */
  pthread_mutex_lock(ThreadMutex);
#ifdef NUMA_DEBUG
  fprintf(stderr, "\t\tThread %lu of Node %lu filter finished: counter at %i\n",
	  ((struct numa_ThreadData*) _Data)->threadId,
	  ((struct numa_ThreadData*) _Data)->NodeId,
	  *(((struct numa_ThreadData*) _Data)->thread_counter));
#endif    
  *(((struct numa_ThreadData*) _Data)->thread_counter) -= 1;
  if (*(((struct numa_ThreadData*) _Data)->thread_counter) == 0) {
    pthread_cond_signal(ThreadCond);
#ifdef NUMA_DEBUG
    fprintf(stderr, "\t\tThread %lu of Node %lu notify Node.\n", ((struct numa_ThreadData*) _Data)->threadId,
	  ((struct numa_ThreadData*) _Data)->NodeId);
#endif
  }
  pthread_mutex_unlock(ThreadMutex);

  ////////////////////////////////////////////////////////////////////////////////////////
  // 4. Alignment phase
  /* Wait for signal from master node */

  pthread_mutex_lock(NodeMutex);
  if (! *((struct numa_ThreadData*) _Data)->NodeDone) {
    pthread_cond_wait(NodeCond, NodeMutex);
#ifdef NUMA_DEBUG
    fprintf(stderr, "\t\tThread %lu of Node %lu waiting for node.\n",
	    ((struct numa_ThreadData*) _Data)->threadId,
	    ((struct numa_ThreadData*) _Data)->NodeId);
#endif  
  } 
#ifdef NUMA_DEBUG 
  else {
    fprintf(stderr, "\t\tThread %lu of Node %lu DID NOT WAIT !!!\n",
	    ((struct numa_ThreadData*) _Data)->threadId,
	    ((struct numa_ThreadData*) _Data)->NodeId);
  }
#endif
  pthread_mutex_unlock(NodeMutex);
  
  {
    const unsigned int * const restrict SeqID   = &( ((struct numa_ThreadData*) _Data)->Array->ToDoID );
    union lScores * const restrict iop          = ((struct numa_ThreadData*) _Data)->iop;
    union Positions * const restrict iom        = ((struct numa_ThreadData*) _Data)->iom;
    union Positions * const restrict ioi        = ((struct numa_ThreadData*) _Data)->ioi;
    struct Alignment * const restrict alignment = ((struct numa_ThreadData*) _Data)->alignment;
    _Bool * const restrict Lock                 = ((struct numa_ThreadData*) _Data)->lock;
    const register size_t Start                 = ((struct numa_ThreadData*) _Data)->start;
    const register size_t Stop                  = ((struct numa_ThreadData*) _Data)->stop;
    // WARNING: reusing some moemory to old sequences result
    char * const Sequences                      = (char*) &(((struct numa_ThreadData*) _Data)->TransposeMatch);

#ifdef NUMA_DEBUG
      fprintf(stderr, "\t\tThread %lu of Node %lu aligning %lu to %lu.\n",
	      ((struct numa_ThreadData*) _Data)->threadId, ((struct numa_ThreadData*) _Data)->NodeId,
	      Start, Stop );
#endif        

    /* LOOPS ON SEQUENCES */
    for (size_t i=Start; i<Stop; ++i) {
#ifdef __USE_MMAP__
      PFSequence * PFSeq = MMAPReadSequenceIndex(&SeqData, SeqID[i], SequenceFileMap, FASTA->DataPtr, InitialArrayOffset
#ifdef MMAP_DEBUG
	, ((struct numa_ThreadData*) _Data)->threadId, ((struct numa_ThreadData*) _Data)->NodeId, *(((struct numa_ThreadData*) _Data)->maplength)
#endif	
	
      );
#else
      PFSequence * PFSeq = ReadSequenceIndex(&SeqData, SeqID[i], inSequence, FASTA->DataPtr);
#endif
    
      /* Translate first sequence */
      PFSeq = TranslateSequenceToIndex(PFSeq, prf->Alphabet_Mapping);

      /* Clear Lock */
      memset(Lock, 0, FASTA->MaxSequenceSize*sizeof(_Bool));
      
      // It seems we must have sequence starting from 1 here
      const int nali = xalip_ptr(prf, PFSeq->ProfileIndex, iop, iom, ioi, 1, PFSeq->Length, alignment,
			    Lock, prf->DisjointData.NDIP[0], prf->DisjointData.NDIP[1], false, 
			    prf->CutOffData.ICUT[0], NALI); 
			    
      if (nali <= 0) {
	fputs("Internal error xalip reported no possible alignment!\n",stderr);
	exit(1);          
      }
      int IPM[2];
      // Alignement is not filled from start !!!
      for ( int j=1; j<=nali; j++) {  
	/* Remove lock for aligned sequence generation */
	memset(Lock, 0, FASTA->MaxSequenceSize*sizeof(_Bool));
	memset(Sequences, 0, 15*(1+prf->Length)*sizeof(char));
	
	if (xalit_ptr(prf, prf->DisjointData.NDIP[0], prf->DisjointData.NDIP[1], 1, PFSeq->Length, &(PFSeq->ProfileIndex[0]),
		  Sequences, iop, &alignment[j], Lock, IPM) < 0 ) {
	  fputs("Internal error within xalit!\n", stderr);
	  exit(1);
	}
	char * cptr = SeqData.Data.Header;
	while ( *cptr != ' ') ++cptr;
	*cptr = '\0';
	fprintf(stdout, "%s  %i %i\n%s\n", SeqData.Data.Header, alignment[j].JALS, j, Sequences);           
      }
    }
  }
  
  ////////////////////////////////////////////////////////////////////////////////////////
  // 5. Close
#ifndef __USE_MMAP__
  fclose(inSequence);
#endif
  pthread_exit(0);
}

static void * SIMD(numa_node)(void * _Data) 
{
#ifdef NUMA_DEBUG
  fprintf(stderr,"\tNode %lu started with %lu child threads.\n",
	  ((struct numa_NodeData*) _Data)->NodeId,
	  ((struct numa_NodeData*) _Data)->nthreads);
#endif
  size_t i;
  volatile _Bool IamDone = false;
  int * FilterScores;
  pthread_mutex_t WaitForThreadMutex, WakeUpThreadMutex;
  pthread_cond_t  WaitForThreadCond, WakeUpThreadCond;
  
  ////////////////////////////////////////////////////////////////////////////////////////
  // 0. Bound thread to node
  struct bitmask *Mask = numa_allocate_cpumask();
  Mask = numa_bitmask_clearall(Mask);
  Mask = numa_bitmask_setbit(Mask, (int) ( ((struct numa_NodeData*) _Data)->NodeId));
  numa_bind(Mask);
#ifndef __USE_MMAP__
  numa_set_membind(Mask);
  numa_free_cpumask(Mask);
#else
  ////////////////////////////////////////////////////////////////////////////////////////
  // 0.1 Map sequence file to bound memory
  const FASTAStructure * const restrict FASTA = ((struct numa_NodeData*) _Data)->FASTA; 
  int thread_counter = (int) ((struct numa_NodeData*) _Data)->nthreads;
  const int fd = open(((struct numa_NodeData*) _Data)->SequenceFileName,
		       O_RDONLY );
  
  const size_t PageSize = sysconf(_SC_PAGE_SIZE);
  size_t StartIndex = ((struct numa_NodeData*) _Data)->shares[((struct numa_NodeData*) _Data)->shareStart];
  off_t Offset = FASTA->DataPtr[StartIndex].Offset;
  
  off_t InitialArrayOffset = Offset & ~(PageSize -1);
  size_t StopIndex = ((struct numa_NodeData*) _Data)->shares[((struct numa_NodeData*) _Data)->shareStart + thread_counter];
  size_t length = (size_t) (FASTA->DataPtr[StopIndex].Offset - InitialArrayOffset);
#ifdef NUMA_DEBUG
  fprintf(stderr, "\tNode %lu map file index: %lu-%lu offset: %lu to %lu that is %lu with length %lu\n",
	  ((struct numa_NodeData*) _Data)->NodeId,
	  StartIndex, StopIndex,
	  Offset, FASTA->DataPtr[StopIndex].Offset,
	  InitialArrayOffset, length);
#endif
  char * SequenceFileMap = (char *) mmap(NULL, length, PROT_READ, MAP_PRIVATE, fd, InitialArrayOffset);
  if (SequenceFileMap == NULL) {
    fputs("Unable to map sequence file to memory\n", stderr);
    exit(1);
  }
  if (mbind(SequenceFileMap, length, MPOL_BIND, Mask->maskp, Mask->size, MPOL_MF_STRICT) != 0) {
      munmap(SequenceFileMap,length);
      perror("mbind");
      exit(1);
  }
#endif
  
  
  ////////////////////////////////////////////////////////////////////////////////////////
  // 1. Initialize thread sync tools
  pthread_mutex_init(&WaitForThreadMutex, NULL);
  pthread_cond_init(&WaitForThreadCond, NULL);
  pthread_mutex_init(&WakeUpThreadMutex, NULL);
  pthread_cond_init(&WakeUpThreadCond, NULL);
  
  ////////////////////////////////////////////////////////////////////////////////////////
  // 2. Parse _Data according to numa_NodeData
  const struct Profile * const restrict prf   = ((struct numa_NodeData*) _Data)->prf;
#ifndef __USE_MMAP__
  const FASTAStructure * const restrict FASTA = ((struct numa_NodeData*) _Data)->FASTA;
  int thread_counter = (int) ((struct numa_NodeData*) _Data)->nthreads;
#endif
  
  ////////////////////////////////////////////////////////////////////////////////////////
  // 3. Allocates memory bound to node for all childs
  //    1        x ID / Score        | Maxshare given by master
  //    1        x TransposeMatrix   | (Profile_Length+1 + 63) & ~63 * Alphabet_Length
  //    nThreads x Sequences         | FASTA->MaxSequenceSize
  //    nThreads x Filter work array | (1+prf->Length)*4+63 & ~63 
  //    nthreads x Alignment needs
  //		union lScores * const restrict iop   = _mm_malloc((1+prf->Length)*sizeof(union lScores), 16);
  //		union Positions * const restrict iom = _mm_malloc((1+prf->Length)*sizeof(union Positions), 16);
  //		union Positions * const restrict ioi = _mm_malloc((1+prf->Length)*sizeof(union Positions), 16);
  //		struct Alignment * const restrict alignment = _mm_malloc(NALI*sizeof(struct Alignment),16);
  //		_Bool * const restrict Lock = _mm_malloc(FASTA->MaxSequenceSize*sizeof(_Bool), 16);
  
  // WARNING: it is assumed that numa allocates on page size boundaries, hence aligned for SSE
  const size_t nthreads                        = ((struct numa_NodeData*) _Data)->nthreads;
  const size_t Profile_Length                  = 1+prf->Length;
  const size_t TransposeMatrixLeadingDimension = (Profile_Length*sizeof(float)+63) & ~63;
  const size_t MaxSequenceSize                 = (FASTA->MaxSequenceSize*sizeof(char) + 63) & ~63;
  const size_t FilterWork                      = (4*Profile_Length*sizeof(float) + 63) & ~63;
  const size_t ScoreSize                       = (Profile_Length*sizeof(union lScores) + 16) & ~16;
  const size_t PositionSize                    = (Profile_Length*sizeof(union Positions) + 16) & ~16;
  const size_t AlignmentSize                   = (NALI*sizeof(struct Alignment)+16) & ~16;
  const size_t LockSize                        = (FASTA->MaxSequenceSize*sizeof(_Bool)+16) & ~16;
  // TODO: Avoid false share dependencies by padding to cache line the sequence data
  char * NodeMemory = numa_alloc_local( TransposeMatrixLeadingDimension*prf->Alphabet_Length
				      + nthreads*( MaxSequenceSize
					         + FilterWork
						 + ScoreSize
						 + 2*PositionSize
						 + AlignmentSize
						 + LockSize) );
  if (NodeMemory == NULL) {
      fprintf(stderr,"Node %lu cannot allocate sufficient local memory.\n",((struct numa_NodeData*) _Data)-> NodeId);
      exit(1);
  } 
  
  ////////////////////////////////////////////////////////////////////////////////////////
  // 4. Transpose Match matrix locally
#if SIMD_VER == 2
  float * const restrict TransposeMatchMatrix = (float*) ( NodeMemory );
  TransposeAndConvertToFloatMatchMatrixGivenMemory( TransposeMatchMatrix,
						    &(prf->Scores.Match),
						    prf->Alphabet_Length,
						    prf->Length,
						    TransposeMatrixLeadingDimension/sizeof(float)
						  );
#elif SIMD_VER == 41
  int * const restrict TransposeMatchMatrix = (int *) ( NodeMemory );
  TransposeAndConvertMatchMatrixGivenMemory( TransposeMatchMatrix,
					    &(prf->Scores.Match),
					    prf->Alphabet_Length,
					    prf->Length,
					    TransposeMatrixLeadingDimension/sizeof(float)
					    );
#endif
  ////////////////////////////////////////////////////////////////////////////////////////
  // 5. Run child thread for heuristic inheriting bound node
  struct numa_ThreadData *threads_arg = alloca(nthreads*sizeof(struct numa_ThreadData));
  pthread_t *threads = (pthread_t*) alloca(nthreads*sizeof(pthread_t));
  size_t *shares = (size_t*) alloca((1+nthreads)*sizeof(pthread_t));
  const size_t lShareStart = ((struct numa_NodeData*) _Data)->shareStart;
  size_t NodeFirstSeq;
  {
    {
      const size_t * const restrict SharePtr = &( ((struct numa_NodeData*) _Data)->shares[lShareStart] );
      NodeFirstSeq = SharePtr[0];
      i=0;
      do {
	shares[i] = SharePtr[i] - NodeFirstSeq; 
      } while (++i <= nthreads);
    }
    i=0;
    do {
      threads_arg[i].prf                       = prf;
      threads_arg[i].FASTA                     = FASTA;
      threads_arg[i].Array                     = ((struct numa_NodeData*) _Data)->Array;
#if SIMD_VER == 2
      threads_arg[i].TransposeMatch.f          = TransposeMatchMatrix;
#elif SIMD_VER == 41
      threads_arg[i].TransposeMatch.i          = TransposeMatchMatrix;
#endif
      threads_arg[i].Sequences                 = (char*)(NodeMemory + TransposeMatrixLeadingDimension*prf->Alphabet_Length + i*MaxSequenceSize);
      register char * tmp                      = (char*) (NodeMemory + TransposeMatrixLeadingDimension*prf->Alphabet_Length + nthreads*MaxSequenceSize);
      threads_arg[i].FilterWork                = (int*) ( tmp + i*FilterWork);
      register char * tmp2                     = tmp + nthreads*FilterWork;
      threads_arg[i].iop                       = (union lScores *) (tmp2 + i*ScoreSize);
      register char * tmp3                     = tmp2 + nthreads*ScoreSize;
      threads_arg[i].iom                       = (union Positions *) (tmp3 + i*PositionSize);
      register char * tmp4                     = tmp3 + nthreads*PositionSize;
      threads_arg[i].ioi                       = (union Positions *) (tmp4 + i*PositionSize);
      register char * tmp5                     = tmp4 + nthreads*PositionSize;
      threads_arg[i].alignment                 = (struct Alignment *) (tmp5 + i*AlignmentSize);
      register char * tmp6                     = tmp5 + nthreads*AlignmentSize;
      threads_arg[i].lock                      = (_Bool*) (tmp6 + i*LockSize);
#ifdef __USE_MMAP__
      threads_arg[i].SequenceFileMap           = SequenceFileMap;
      threads_arg[i].InitialArrayOffset        = InitialArrayOffset;
#else
      threads_arg[i].SequenceFileName          = ((struct numa_NodeData*) _Data)->SequenceFileName;
#endif
      threads_arg[i].FilterScores              = &FilterScores;
      threads_arg[i].NodeFirstSeq              = NodeFirstSeq;
      threads_arg[i].start                     = shares[i];
      threads_arg[i].stop                      = shares[i+1];
      threads_arg[i].threadId                  = i;
      threads_arg[i].ThreadMutex               = &WaitForThreadMutex;
      threads_arg[i].ThreadCond                = &WaitForThreadCond;
      threads_arg[i].NodeMutex                 = &WakeUpThreadMutex;
      threads_arg[i].NodeCond                  = &WakeUpThreadCond;
      threads_arg[i].thread_counter            = &thread_counter;
      threads_arg[i].NodeDone                  = &IamDone;
#ifdef NUMA_DEBUG
      threads_arg[i].NodeId                    = ((struct numa_NodeData*) _Data)->NodeId;
#endif
#ifdef MMAP_DEBUG
      threads_arg[i].maplength                 = &length;
#endif
      if (pthread_create (&threads[i],  NULL, SIMD(numa_thread),  (void*) &threads_arg[i]) != 0) {
	fprintf(stderr,"\tNode %lu failed to create thread %lu.\n", ((struct numa_NodeData*) _Data)->NodeId, i);
	exit(1);
      } 
    } while (++i < nthreads);
  }
  size_t MyMaxShare = shares[nthreads];
#if SIMD_VER == 2
  const float HeuristicCutOff = prf->HeuristicCutOff;
#elif SIMD_VER == 41
  const unsigned int HeuristicCutOff = (unsigned int) prf->HeuristicCutOff;
#endif
  
  pthread_mutex_lock(&WaitForThreadMutex);
  if ( thread_counter > 0 ) {
#ifdef NUMA_DEBUG
  fprintf(stderr, "\tNode %lu waiting for child threads to finish heuristic.\n",
	  ((struct numa_NodeData*) _Data)->NodeId);
#endif
    pthread_cond_wait(&WaitForThreadCond, &WaitForThreadMutex);
  }
  pthread_mutex_unlock(&WaitForThreadMutex);
  
  ////////////////////////////////////////////////////////////////////////////////////////
  // 6. Gather and wait for master to trigger next phase
  size_t HeuristicCounter = 0;
  i=0;
  __32bitData * restrict YesNoID = ((struct numa_NodeData*) _Data)->Array;
  do {
#if SIMD_VER == 2
   if (YesNoID[i].FloatScores >= HeuristicCutOff) {
#elif SIMD_VER == 41
	if (YesNoID[i].UnsignedScores >= HeuristicCutOff) {
#endif
      YesNoID[HeuristicCounter].ToDoID = (unsigned int) (NodeFirstSeq + i);
      ++HeuristicCounter;
   }
  } while (++i < MyMaxShare);
  
  ((struct numa_NodeData*) _Data)->shares[((struct numa_NodeData*) _Data)->NodeId] = HeuristicCounter;
  fprintf(stderr, "\tNode %lu counted %lu/%lu heuristic passed sequences\n", 
	  ((struct numa_NodeData*) _Data)->NodeId, HeuristicCounter, MyMaxShare);
  
  /* Notify Master if phase is finished */
  pthread_mutex_lock(&NodeMutex);
  if (--NodeCounter <= 0 ) {
#ifdef NUMA_DEBUG    
    fprintf(stderr, "\tNode %lu signal master for heuristic finished.\n",
	  ((struct numa_NodeData*) _Data)->NodeId);
#endif
    pthread_cond_signal(&NodeCond);
  }
  pthread_mutex_unlock(&NodeMutex);
        
  ////////////////////////////////////////////////////////////////////////////////////////
  // 7. Run child thread for filter inheriting bound node
  // TODO: check that if there is not enough to work on it will be ok

  /* Wait for master to start phase */
  pthread_mutex_lock(&MasterMutex);
  if (!MasterDone) {
#ifdef NUMA_DEBUG
    fprintf(stderr, "\tNode %lu waiting master to enter filter phase.\n",
	  ((struct numa_NodeData*) _Data)->NodeId);
#endif
    pthread_cond_wait(&MasterCond, &MasterMutex);
  }
  pthread_mutex_unlock(&MasterMutex);
  
  ////////////////////////////////////////////////////////////////////////////////////////
  // 7.1 Map sequence file to bound memory
  unsigned int * const restrict SeqID = &(((struct numa_NodeData*) _Data)->Array[0].ToDoID);
  StartIndex = ((struct numa_NodeData*) _Data)->shares[lShareStart];
  StopIndex = ((struct numa_NodeData*) _Data)->shares[lShareStart + nthreads];
  MyMaxShare = StopIndex - StartIndex;
  size_t FilterScoresSize = MyMaxShare;
  /* Allocate memory for the filter scores */
  FilterScores = numa_alloc_local(MyMaxShare*sizeof(int));
  if (FilterScores == NULL) {
    fputs("Unable to allocate local node memory for filter scores\n", stderr);
    exit(1);
  }
  
  Offset = FASTA->DataPtr[SeqID[0]].Offset;
  InitialArrayOffset = Offset & ~(PageSize -1);
  length = (size_t) (FASTA->DataPtr[SeqID[MyMaxShare-1]+1].Offset - InitialArrayOffset);
#ifdef NUMA_DEBUG
  fprintf(stderr, "\tNode %lu share part of %lu SeqID: %lu(%u)-%lu(%u) file offset: %lu to %lu that is %lu with length %lu\n",
	  ((struct numa_NodeData*) _Data)->NodeId, MyMaxShare,
	  StartIndex, SeqID[0], StopIndex, SeqID[MyMaxShare-1],
	  Offset, FASTA->DataPtr[SeqID[MyMaxShare-1]+1].Offset,
	  InitialArrayOffset, length);
#endif
  SequenceFileMap = (char *) mmap(NULL, length, PROT_READ, MAP_PRIVATE, fd, InitialArrayOffset);
  if (SequenceFileMap == NULL) {
    fputs("Unable to map sequence file to memory\n", stderr);
    exit(1);
  }
  if (mbind(SequenceFileMap, length, MPOL_BIND, Mask->maskp, Mask->size, MPOL_MF_STRICT) != 0) {
      munmap(SequenceFileMap,length);
#ifdef NUMA_DEBUG
      fprintf(stderr, "\tNode %lu cannot bind memory\n", ((struct numa_NodeData*) _Data)->NodeId);
#endif
      perror("mbind");
      exit(1);
  }
  
  NodeFirstSeq = ((struct numa_NodeData*) _Data)->shares[lShareStart];
  i=0;
  do {
    shares[i] = ((struct numa_NodeData*) _Data)->shares[lShareStart + i] - NodeFirstSeq; 
  } while (++i <= nthreads);

  i=0;
  do {
    threads_arg[i].start = shares[i];
    threads_arg[i].stop  = shares[i+1];
#ifdef __USE_MMAP__
    threads_arg[i].SequenceFileMap    = SequenceFileMap;
    threads_arg[i].InitialArrayOffset = InitialArrayOffset;
#endif
  } while (++i < nthreads);
  
  /* Wake up child threads */
  pthread_mutex_lock(&WakeUpThreadMutex);
  thread_counter = (int) nthreads;
  IamDone = false;
  pthread_cond_broadcast(&WakeUpThreadCond);
  pthread_mutex_unlock(&WakeUpThreadMutex);

  ////////////////////////////////////////////////////////////////////////////////////////
  // 8. Gather / transfer new data and share
  pthread_mutex_lock(&WaitForThreadMutex);
  if ( thread_counter > 0 ) {
#ifdef NUMA_DEBUG
    fprintf(stderr, "\tNode %lu waiting for child threads to finish filter.\n",
	  ((struct numa_NodeData*) _Data)->NodeId);
#endif
    pthread_cond_wait(&WaitForThreadCond, &WaitForThreadMutex);
  }
  pthread_mutex_unlock(&WaitForThreadMutex);

  size_t FilterCounter = 0;
  const int FilterCutOff = prf->CutOffData.ICUT[0];
  i=0; 
  do {
    if (FilterScores[i] >= FilterCutOff) {
      SeqID[FilterCounter] = SeqID[i];
      ++FilterCounter;
    }
  } while (++i < MyMaxShare);
  ((struct numa_NodeData*) _Data)->shares[((struct numa_NodeData*) _Data)->NodeId] = FilterCounter;
  
  fprintf(stderr, "\tNode %lu counted %lu/%lu filter passed sequences\n", 
	  ((struct numa_NodeData*) _Data)->NodeId, FilterCounter, MyMaxShare);
  
  /* Notify Master if phase is finished */
  pthread_mutex_lock(&NodeMutex);
  if (--NodeCounter <= 0 ) { 
#ifdef NUMA_DEBUG    
    fprintf(stderr, "\tNode %lu signal master for filter finished.\n",
	  ((struct numa_NodeData*) _Data)->NodeId);
#endif
    pthread_cond_signal(&NodeCond);
  }
  pthread_mutex_unlock(&NodeMutex);
  
#ifdef __USE_MMAP__
  /* set back default memory policy and unmap file*/
  { 
    const int res = mbind(SequenceFileMap, length, MPOL_DEFAULT, 0, 0, MPOL_MF_STRICT);
    munmap(SequenceFileMap,length);
    if (res != 0) {
      perror("mbind");
      exit(1);
    }
  }
#endif      
  ////////////////////////////////////////////////////////////////////////////////////////
  // 9. Run child thread for alignment inheriting bound node
  
  /* Wait for master to start phase */
  pthread_mutex_lock(&MasterMutex);
  if (!MasterDone) { 
#ifdef NUMA_DEBUG
    fprintf(stderr, "\tNode %lu waiting master to enter alignment phase.\n",
	  ((struct numa_NodeData*) _Data)->NodeId);
#endif 
    pthread_cond_wait(&MasterCond, &MasterMutex);
  }
  pthread_mutex_unlock(&MasterMutex);
  
  ////////////////////////////////////////////////////////////////////////////////////////
  // 9.1 Map sequence file to bound memory
  StartIndex = ((struct numa_NodeData*) _Data)->shares[lShareStart];
  StopIndex = ((struct numa_NodeData*) _Data)->shares[lShareStart + nthreads];
  MyMaxShare = StopIndex - StartIndex;
  
  Offset = FASTA->DataPtr[SeqID[0]].Offset;
  InitialArrayOffset = Offset & ~(PageSize -1);
  length = (size_t) (FASTA->DataPtr[SeqID[MyMaxShare-1]+1].Offset - InitialArrayOffset);
#ifdef NUMA_DEBUG
  fprintf(stderr, "\tNode %lu share part of %lu SeqID: %lu(%u)-%lu(%u) file offset: %lu to %lu that is %lu with length %lu\n",
	  ((struct numa_NodeData*) _Data)->NodeId, MyMaxShare,
	  StartIndex, SeqID[0], StopIndex, SeqID[MyMaxShare-1],
	  Offset, FASTA->DataPtr[SeqID[MyMaxShare-1]+1].Offset,
	  InitialArrayOffset, length);
#endif
  SequenceFileMap = (char *) mmap(NULL, length, PROT_READ, MAP_PRIVATE, fd, InitialArrayOffset);
  if (SequenceFileMap == NULL) {
    fputs("Unable to map sequence file to memory\n", stderr);
    exit(1);
  }
  if (mbind(SequenceFileMap, length, MPOL_BIND, Mask->maskp, Mask->size, MPOL_MF_STRICT) != 0) {
      munmap(SequenceFileMap,length);
#ifdef NUMA_DEBUG
      fprintf(stderr, "\tNode %lu cannot bind memory\n", ((struct numa_NodeData*) _Data)->NodeId);
#endif
      perror("mbind");
      exit(1);
  }
  
  NodeFirstSeq = ((struct numa_NodeData*) _Data)->shares[lShareStart];
  i=0;
  do {
    shares[i] = ((struct numa_NodeData*) _Data)->shares[lShareStart + i] - NodeFirstSeq; 
  } while (++i <= nthreads);

  i=0;
  do {
    threads_arg[i].start = shares[i];
    threads_arg[i].stop  = shares[i+1];
#ifdef __USE_MMAP__
    threads_arg[i].SequenceFileMap    = SequenceFileMap;
    threads_arg[i].InitialArrayOffset = InitialArrayOffset;
#endif
  } while (++i < nthreads);
  
  /* Wake up child threads */
#ifdef NUMA_DEBUG
  fprintf(stderr, "\tNode %lu triggers alignment.\n",
	  ((struct numa_NodeData*) _Data)->NodeId);
#endif
  
  pthread_mutex_lock(&WakeUpThreadMutex);
  thread_counter = nthreads;
  IamDone = true;
  pthread_cond_broadcast(&WakeUpThreadCond);
  pthread_mutex_unlock(&WakeUpThreadMutex);
 

  ////////////////////////////////////////////////////////////////////////////////////////
  // 10. Join child threads
  i=0;
  do {
    pthread_join(threads[i], NULL);
  } while (++i < nthreads);
  
#ifdef __USE_MMAP__
  ////////////////////////////////////////////////////////////////////////////////////////
  // 10.1 unmap file from memory
  /* set back default memory policy and unmap file*/
  { 
    const int res = mbind(SequenceFileMap, length, MPOL_DEFAULT, 0, 0, MPOL_MF_STRICT);
    munmap(SequenceFileMap,length);
    if (res != 0) {
      perror("mbind");
      exit(1);
    }
  }
#endif
  
  ////////////////////////////////////////////////////////////////////////////////////////
  // 11. Free Memory and close
#ifdef __USE_MMAP__
  numa_free_cpumask(Mask);
#endif
  numa_free(FilterScores, FilterScoresSize);
  numa_free(NodeMemory, TransposeMatrixLeadingDimension*prf->Alphabet_Length
				      + nthreads*( MaxSequenceSize
					         + FilterWork
						 + ScoreSize
						 + 2*PositionSize
						 + AlignmentSize
						 + LockSize));
  pthread_mutex_destroy(&WakeUpThreadMutex);
  pthread_cond_destroy(&WakeUpThreadCond);
  pthread_mutex_destroy(&WaitForThreadMutex);
  pthread_cond_destroy(&WaitForThreadCond);
  pthread_exit(0);
}

