#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <mm_malloc.h>
#include <sys/time.h>
#include <pthread.h>
#include <alloca.h>
#include <inttypes.h>
#ifdef __NUMA__
#include <numa.h>
#endif
#include "profile.h"
#include "sequence.h"
#define NALI 1000

/*
 * Filter kernel selected at start-up: main() points this at either
 * xali1_sse41 or xali1_sse2 depending on the SSE environment variable.
 * It is called by thread_xali1().
 */
static int (*xali1_ptr)(const struct Profile * const restrict, const unsigned char * const restrict,
			int * const, const size_t, const size_t, const int, const _Bool);

/*
 * Heuristic worker entry points selected at start-up by main().
 * thread_heuristic_ptr is the routine handed to pthread_create() for the
 * heuristic stage. thread_heuristic_cutoff_ptr is only assigned on the
 * SSE 4.1 path and is not called anywhere in the code visible here; being a
 * file-scope static it stays NULL when the SSE2 path is chosen.
 */
static void* (*thread_heuristic_ptr)(void*);
static void* (*thread_heuristic_cutoff_ptr)(void*);

#ifdef SMP
/*
 * Transposed match-score matrix, viewed either as integers (read by the
 * SSE 4.1 heuristic workers through .i) or as floats (read by the SSE2
 * heuristic worker through .f).
 */
typedef union TransposeMatrix {const int * i; const float * f;} TransposeMatrix;

/*
 * Per-thread argument block passed to every worker through pthread_create().
 * main() fills one entry per thread; workers only read it.
 */
struct ThreadData {
  /* Profile shared by all workers. */
  struct Profile * prf;
  /* Index of the sequence database (sequence count, offsets, maximum entry size). */
  FASTAStructure * FASTA;
  /* Path of the sequence file; each worker opens its own FILE handle on it. */
  char * SequenceFileName;
  /* Output buffer of thread_xaliPT(): item i uses the 3*(prf->Length+1) bytes
     starting at offset i*(prf->Length+1)*3. */
  char * Sequences;
  /* One shared array seen through three types, depending on the stage:
       ToDoID         - sequence indices read by thread_xaliPT();
       SignedScores   - thread_xali1() reads a sequence index from slot i and
                        overwrites that slot with the signed filter score;
       UnsignedScores - heuristic workers store one score per sequence index. */
  union {
   unsigned int * ToDoID;
   int          * SignedScores;
   unsigned int * UnsignedScores;
   };
  /* Transposed match matrix used by the heuristic workers. */
  TransposeMatrix TransposeMatch;
  /* Not referenced by any code visible in this file. */
  FILE ** TempOutputFile;
  /* Half-open range [start, stop) of array slots this worker processes. */
  size_t start;
  size_t stop;
  /* Not referenced by any worker visible in this file. */
  size_t counter;
#ifdef __NUMA__
  /* NUMA placement fields; not referenced by any code visible in this file. */
  size_t CoreId;
  size_t NodeId;
#endif
};

static void *thread_heuristic_cutoff_sse41(void * _Data) 
{
  Sequence SeqData;
  const struct Profile * const restrict prf   = ((struct ThreadData*) _Data)->prf;
  const FASTAStructure * const restrict FASTA = ((struct ThreadData*) _Data)->FASTA;
  const int * const restrict TransposeMatch   = ((struct ThreadData*) _Data)->TransposeMatch.i;
  unsigned int * const restrict Scores        = ((struct ThreadData*) _Data)->UnsignedScores;
  PFSequence * PFSeq;

  /* Allocate memory to hold sequence */
  SeqData.Memory = malloc(FASTA->MaxSequenceSize*sizeof(unsigned char));
  if (SeqData.Memory == NULL) {
    fputs("Thread Cannot allocate memory for sequence.\n", stderr);
    return (void*) 1;
  }
  /* Allocate work aligned memory for xali1 */
  int * Work = _mm_malloc((1+prf->Length)*4*sizeof(int)+63,64);

  /* Open sequence file*/
  FILE* inSequence = fopen(((struct ThreadData*) _Data)->SequenceFileName, "r");

  size_t Start              = ((struct ThreadData*) _Data)->start;
  size_t Stop               = ((struct ThreadData*) _Data)->stop;
  const unsigned int CutOff = prf->HeuristicCutOff;

  /* LOOPS ON SEQUENCES */
   for (size_t i=Start; i<Stop; ++i) {
    PFSeq = ReadSequenceIndex(&SeqData, i, inSequence, FASTA->DataPtr);

    /* Translate first sequence */
    PFSeq = TranslateCharToIndex(PFSeq, prf->Alphabet_Mapping);

    Scores[i] = heuristic(prf, PFSeq, CutOff);                              
  }

  /* close sequence file */
  fclose(inSequence);

  /* Free Memory */
  free(SeqData.Memory);
  _mm_free(Work);
  
  return 0;
};

static void *thread_heuristic_sse41(void * _Data) 
{
  Sequence SeqData;
  const struct Profile * const restrict prf   = ((struct ThreadData*) _Data)->prf;
  const FASTAStructure * const restrict FASTA = ((struct ThreadData*) _Data)->FASTA;
  const int * const restrict TransposeMatch   = ((struct ThreadData*) _Data)->TransposeMatch.i;
  unsigned int * const restrict Scores        = ((struct ThreadData*) _Data)->UnsignedScores;
  PFSequence * PFSeq;

  /* Allocate memory to hold sequence */
  SeqData.Memory = malloc(FASTA->MaxSequenceSize*sizeof(unsigned char));
  if (SeqData.Memory == NULL) {
    fputs("Thread Cannot allocate memory for sequence.\n", stderr);
    return (void*) 1;
  }
  /* Allocate work aligned memory for xali1 */
  int * Work = _mm_malloc((1+prf->Length)*4*sizeof(int)+63,64);
  if (Work == NULL) return (void*) 1;

  /* Open sequence file*/
  FILE* inSequence = fopen(((struct ThreadData*) _Data)->SequenceFileName, "r");

  size_t Start              = ((struct ThreadData*) _Data)->start;
  size_t Stop               = ((struct ThreadData*) _Data)->stop;
  //const unsigned int CutOff = prf->HeuristicCutOff;

   //fprintf(stderr,"Thread %lu - %lu\n", Start, Stop);
  /* LOOPS ON SEQUENCES */
   for (size_t i=Start; i<Stop; ++i) {
    PFSeq = ReadSequenceIndex(&SeqData, i, inSequence, FASTA->DataPtr);

    /* Translate first sequence */
    PFSeq = TranslateCharToIndex(PFSeq, prf->Alphabet_Mapping);

    Scores[i] = TransposeHeuristic_sse41(TransposeMatch, prf->Alphabet_Length, prf->Length, PFSeq);                            
  }

  /* close sequence file */
  fclose(inSequence);

  /* Free Memory */
  free(SeqData.Memory);
  _mm_free(Work);

  return 0;
};

static void *thread_heuristic_sse2(void * _Data) 
{
  Sequence SeqData;
  const struct Profile * const restrict prf   = ((struct ThreadData*) _Data)->prf;
  const FASTAStructure * const restrict FASTA = ((struct ThreadData*) _Data)->FASTA;
  const float * const restrict TransposeMatch = ((struct ThreadData*) _Data)->TransposeMatch.f;
  unsigned int * const restrict Scores        = ((struct ThreadData*) _Data)->UnsignedScores;
  PFSequence * PFSeq;

  /* Allocate memory to hold sequence */
  SeqData.Memory = malloc(FASTA->MaxSequenceSize*sizeof(unsigned char));
  if (SeqData.Memory == NULL) {
    fputs("Thread Cannot allocate memory for sequence.\n", stderr);
    return (void*) 1;
  }
  /* Allocate work aligned memory for xali1 */
  int * Work = _mm_malloc((1+prf->Length)*4*sizeof(int)+63,64);
  if (Work == NULL) return (void*) 1;

  /* Open sequence file*/
  FILE* inSequence = fopen(((struct ThreadData*) _Data)->SequenceFileName, "r");

  size_t Start              = ((struct ThreadData*) _Data)->start;
  size_t Stop               = ((struct ThreadData*) _Data)->stop;
  //const unsigned int CutOff = prf->HeuristicCutOff;

  //fprintf(stderr,"Thread %lu - %lu\n", Start, Stop);
  /* LOOPS ON SEQUENCES */
   for (size_t i=Start; i<Stop; ++i) {
    PFSeq = ReadSequenceIndex(&SeqData, i, inSequence, FASTA->DataPtr);

    /* Translate first sequence */
    PFSeq = TranslateCharToIndex(PFSeq, prf->Alphabet_Mapping);

    Scores[i] = TransposeHeuristic_sse2(TransposeMatch, prf->Alphabet_Length, prf->Length, PFSeq);                            
  }

  /* close sequence file */
  fclose(inSequence);

  /* Free Memory */
  free(SeqData.Memory);
  _mm_free(Work);

  return 0;
};

static void *thread_xali1( void * _Data)
{
  Sequence SeqData;
  const struct Profile * const restrict prf   = ((struct ThreadData*) _Data)->prf;
  const FASTAStructure * const restrict FASTA = ((struct ThreadData*) _Data)->FASTA;
  int * const restrict Scores                 = ((struct ThreadData*) _Data)->SignedScores;
  PFSequence * PFSeq;
  
  const int CutOff = prf->CutOffData.ICUT[0];

  /* Allocate memory to hold sequence */
  SeqData.Memory = malloc(FASTA->MaxSequenceSize*sizeof(unsigned char));
  if (SeqData.Memory == NULL) {
    fputs("Thread Cannot allocate memory for sequence.\n", stderr);
    return (void*) 1;
  }
  /* Allocate work aligned memory for xali1 */
  int * Work = _mm_malloc((1+prf->Length)*4*sizeof(int)+63,64);
  if (Work == NULL) return (void*) 1;
  
  /* Open sequence file*/
  FILE* inSequence = fopen(((struct ThreadData*) _Data)->SequenceFileName, "r");

  size_t Start   = ((struct ThreadData*) _Data)->start;
  size_t Stop   = ((struct ThreadData*) _Data)->stop;

  /* LOOPS ON SEQUENCES */
   for (size_t i=Start; i<Stop; ++i) {
    PFSeq = ReadSequenceIndex(&SeqData, Scores[i], inSequence, FASTA->DataPtr);

    /* Translate first sequence */
    PFSeq = TranslateCharToIndex(PFSeq, prf->Alphabet_Mapping);

    Scores[i] = xali1_ptr(prf, PFSeq->ProfileIndex, Work, 0, PFSeq->Length, CutOff, false);
  }

  /* close sequence file */
  fclose(inSequence);

  /* Free Memory */
  free(SeqData.Memory);
  _mm_free(Work);
  
  return 0;
}

static void *thread_xaliPT( void * _Data)
{
  Sequence SeqData;
  const struct Profile * const restrict prf   = ((struct ThreadData*) _Data)->prf;
  const FASTAStructure * const restrict FASTA = ((struct ThreadData*) _Data)->FASTA; 
  const unsigned int * const restrict SeqID   = ((struct ThreadData*) _Data)->ToDoID;
  char * restrict Sequences                   = ((struct ThreadData*) _Data)->Sequences;
  PFSequence * PFSeq;

  /* Allocate memory to hold sequence */
  SeqData.Memory = malloc(FASTA->MaxSequenceSize*sizeof(unsigned char));
  if (SeqData.Memory == NULL) {
    fputs("Thread cannot allocate memory for sequence.\n", stderr);
    return (void*) 1;
  }
  /* Allocate work aligned memory for xali1 */

  union lScores * const restrict iop   = _mm_malloc((1+prf->Length)*sizeof(union lScores), 16);
  union Positions * const restrict iom = _mm_malloc((1+prf->Length)*sizeof(union Positions), 16);
  union Positions * const restrict ioi = _mm_malloc((1+prf->Length)*sizeof(union Positions), 16);
  struct Alignment * const restrict alignment = _mm_malloc(NALI*sizeof(struct Alignment),16);
  _Bool * const restrict Lock = _mm_malloc(FASTA->MaxSequenceSize*sizeof(_Bool), 16);
  if ( iop == NULL || iom == NULL || ioi == NULL || alignment == NULL || Lock == NULL) return (void*) 1;

  /* Open sequence file */
  FILE* inSequence = fopen(((struct ThreadData*) _Data)->SequenceFileName, "r");

  /* Open temporary output file */
  //FILE* outSequence = 

  size_t Start   = ((struct ThreadData*) _Data)->start;
  size_t Stop   = ((struct ThreadData*) _Data)->stop;
  //fprintf(stderr,"Thread %lu - %lu\n", Start, Stop);
  /* LOOPS ON SEQUENCES */
   for (size_t i=Start; i<Stop; ++i) {
    PFSeq = ReadSequenceIndex(&SeqData, (size_t) SeqID[i], inSequence, FASTA->DataPtr);
  
    /* Translate first sequence */
    PFSeq = TranslateCharToIndex(PFSeq, prf->Alphabet_Mapping);

    /* Clear Lock */
    memset(Lock, 0, FASTA->MaxSequenceSize*sizeof(_Bool));
    
    // It seems we must have sequence starting from 1 here
    const int nali = xalip_sse41(prf, PFSeq->ProfileIndex, iop, iom, ioi, 1, PFSeq->Length, alignment,
                           Lock, prf->DisjointData.NDIP[0], prf->DisjointData.NDIP[1], false, 
                           prf->CutOffData.ICUT[0], NALI); 
                           
    if (nali < 0) exit(1);          
    
    int IPM[2];
    // Alignement is not filled from start !!!
    for ( int j=1; j<=nali; j++) {
    
      /* Remove lock for aligned sequence generation */
      memset(Lock, 0, FASTA->MaxSequenceSize*sizeof(_Bool));
      memset(&Sequences[i*(prf->Length+1)*3], 0, 3*(1+prf->Length)*sizeof(char));
      
       if (xalit_sse41(prf, prf->DisjointData.NDIP[0], prf->DisjointData.NDIP[1], 1, PFSeq->Length, &(PFSeq->ProfileIndex[0]),
                 &Sequences[i*(prf->Length+1)*3], iop, &alignment[j], Lock, IPM) < 0 ) exit(1);
       fprintf(stdout, "%s  %i %i\n%s\n", SeqData.Header, alignment[j].JALS, j, &Sequences[i*(prf->Length+1)*3 + 1]);           
    }
  }

  /* close sequence file */
  fclose(inSequence);

  /* Free Memory */
  free(SeqData.Memory);
  _mm_free(iop);
  _mm_free(iom);
  _mm_free(ioi);
  _mm_free(alignment);
  _mm_free(Lock); 

  return 0;
}


#endif

/* Wall-clock seconds elapsed between two gettimeofday() samples. */
static double elapsed_seconds(const struct timeval * const t0, const struct timeval * const t1)
{
  return (double) (t1->tv_sec - t0->tv_sec) + (double) (t1->tv_usec - t0->tv_usec) * 0.000001;
}

#ifdef SMP
/*
 * Split the items [0, count) into nCPUs contiguous ranges; range k is
 * [shares[k], shares[k+1]). Boundaries are clamped to count so that no range
 * ever reaches past the last valid item (trailing ranges may be empty).
 */
static void split_evenly(size_t * const shares, const size_t nCPUs, const size_t count)
{
  const size_t chunk = 1 + count / nCPUs;
  shares[0] = 0;
  for (size_t k=1; k<nCPUs; ++k) {
    const size_t bound = k*chunk;
    shares[k] = bound < count ? bound : count;
  }
  shares[nCPUs] = count;
}

/*
 * Run worker on nCPUs threads, thread k being given the range
 * [shares[k], shares[k+1]). Every thread that was started is joined before
 * returning. Returns 0 when all threads were created and all returned NULL,
 * 1 otherwise.
 */
static int run_threads(void* (*worker)(void*), pthread_t * const threads,
                       struct ThreadData * const args, const size_t * const shares,
                       const size_t nCPUs)
{
  size_t started = 0;
  int failed = 0;

  for (; started<nCPUs; ++started) {
    args[started].start = shares[started];
    args[started].stop  = shares[started+1];
    if (pthread_create(&threads[started], NULL, worker, (void*) &args[started]) != 0) {
      fputs("Unable to create worker thread.\n", stderr);
      failed = 1;
      break;
    }
  }

  for (size_t k=0; k<started; ++k) {
    void * status = NULL;
    if (pthread_join(threads[k], &status) != 0 || status != NULL) failed = 1;
  }
  return failed;
}
#endif

/*
 * Program entry point.
 *
 * Arguments: argv[1] profile file, argv[2] FASTA sequence file,
 *            argv[3] heuristic cutoff, argv[4] (optional, SMP build only)
 *            number of threads.
 * The environment variable SSE set to "SSE2" selects the SSE2 kernels,
 * anything else selects the SSE 4.1 kernels.
 *
 * Without SMP: every sequence is scored with the xali1 filter, then the ones
 * reaching the profile cutoff are aligned and printed.
 * With SMP: three threaded stages are run in turn - heuristic, filter
 * (thread_xali1) and alignment (thread_xaliPT) - each stage only receiving the
 * sequences that passed the previous one.
 *
 * Returns 0 on success, 1 on any error.
 */
int main (int argc, char *argv[])
{
  struct Profile prf;
  FASTAStructure FASTA;
  struct timeval _t0, _t1;
  int res;

  if (argc < 4) { fputs("provide profile and FASTA file and heuristic cutoff\n", stderr); return 1;}
#ifdef __NUMA__
  if (numa_available() < 0) {
    fputs("NUMA architecture NOT available on this system.\n", stderr);
  } else {
    fputs("NUMA architecture available.\n", stderr);
  }
#endif

  /* Kernel selection. The thread workers only exist in SMP builds, hence the
     guards around their assignment. */
#ifdef SMP
  /* Non-zero when the heuristic works on the float transposed matrix (SSE2). */
  int UseFloatMatch = 0;
#endif
  const char * const SSE = getenv("SSE");
  if (SSE != NULL && strncmp(SSE,"SSE2",5) == 0) {
    xali1_ptr = xali1_sse2;
#ifdef SMP
    /* thread_heuristic_cutoff_ptr has no SSE2 implementation: left unset. */
    thread_heuristic_ptr = thread_heuristic_sse2;
    UseFloatMatch = 1;
#endif
    fputs("SSE 2 will be used\n" ,stderr);
  } else {
    fputs("SSE 4.1 will be used\n" ,stderr);
    xali1_ptr = xali1_sse41;
#ifdef SMP
    thread_heuristic_cutoff_ptr = thread_heuristic_cutoff_sse41;
    thread_heuristic_ptr = thread_heuristic_sse41;
#endif
  }

  /* Read the profile and output some infos */
  gettimeofday(&_t0,0);
  res = ReadProfile(argv[1], &prf);
  gettimeofday(&_t1,0);
  fprintf(stderr, "Profile reading took %lf seconds.\n", elapsed_seconds(&_t0, &_t1));

  if (res != 0) {
    fputs("Error found.\n", stderr);
    return 1;
  }
  printf("Profile %s has length %lu and alphabet size of %lu\nCutoff value is set to %i\n",
         argv[1], prf.Length, prf.Alphabet_Length, prf.CutOffData.ICUT[0]);

  puts("Alphabet Mapping");
  for (size_t i=0; i<ALPHABET_SIZE; ++i) {
    printf("Map %c=%2u\t", (char) ((unsigned char) 'A' + (unsigned char) i), (unsigned int) prf.Alphabet_Mapping[i]);
    if ((i+1) % 8 == 0 ) puts("");
  }
  puts("\n");

  /* Read the FASTA file */
  gettimeofday(&_t0,0);
  res = AnalyzeFASTAStructure(argv[2], &FASTA);
  gettimeofday(&_t1,0);
  fprintf(stderr, "Sequence file indexing took %lf seconds.\n", elapsed_seconds(&_t0, &_t1));
  if (res != 0) {
    fputs("Error found.\n", stderr);
    return 1;
  }

  /* The sequence file is argv[2] (argv[1] is the profile) */
  printf("FASTA file %s analyzed\n\tFound %lu sequences within %lu bytes\n\tBiggest sequence entry is %lu bytes\n",
          argv[2], FASTA.SequenceCount, FASTA.FileSize, FASTA.MaxSequenceSize);

  const size_t nSeq = (size_t) FASTA.SequenceCount;

#ifndef SMP
  Sequence SeqData;
  PFSequence * PFSeq;
  const int FilterCutOff = prf.CutOffData.ICUT[0];

  /* Allocate memory to hold sequence */
  SeqData.Memory = malloc(FASTA.MaxSequenceSize*sizeof(unsigned char));
  if (SeqData.Memory == NULL) {
    fputs("Cannot allocate menmory for sequence.\n", stderr);
    return 1;
  }
  /* Allocate work aligned memory for xali1 */
  int * Work = _mm_malloc((1+prf.Length)*4*sizeof(int)+63,64);

  /* Allocate true/false memory for xalip (never zero-sized) */
  _Bool * YesNo = _mm_malloc((nSeq > 0 ? nSeq : 1)*sizeof(_Bool), 64);

  if (Work == NULL || YesNo == NULL) {
    fputs("Cannot allocate memory for filter.\n", stderr);
    return 1;
  }

  /* Open sequence file*/
  FILE* inSequence = fopen(argv[2], "r");
  if (inSequence == NULL) {
    fprintf(stderr, "Cannot open sequence file %s.\n", argv[2]);
    return 1;
  }

  size_t counter = 0;
  gettimeofday(&_t0,0);
  /* LOOPS ON SEQUENCES */
  for (size_t i=0; i<nSeq; ++i) {
    if ( (PFSeq = ReadSequenceIndex(&SeqData, i, inSequence, FASTA.DataPtr) ) == NULL ) {
      fprintf(stderr, "Error reading sequence %zu from database.\n", i);
      exit(1);
    }

    /* Translate first sequence */
    PFSeq = TranslateCharToIndex(PFSeq, prf.Alphabet_Mapping);

    /* Go through xali1_ptr so that the SSE selection made above is honoured */
    const int Score = xali1_ptr(&prf, PFSeq->ProfileIndex, Work, 0, PFSeq->Length, FilterCutOff, false);
    const _Bool Passed = Score >= FilterCutOff;
    if (Passed) ++counter;
    YesNo[i] = Passed;
  }
  gettimeofday(&_t1,0);
  fprintf(stderr,"XALI1 took %lf [s].\n%zu sequences satisfy alignment test.\n",
          elapsed_seconds(&_t0, &_t1), counter);

  /* Allocate memory for xalip */
  union lScores * const restrict iop   = _mm_malloc((1+prf.Length)*sizeof(union lScores), 16);
  union Positions * const restrict iom = _mm_malloc((1+prf.Length)*sizeof(union Positions), 16);
  union Positions * const restrict ioi = _mm_malloc((1+prf.Length)*sizeof(union Positions), 16);
  /* Entries are read at indices 1..nali below; one extra element keeps index
     NALI inside the array should xalip_sse41() ever report NALI alignments
     (NOTE(review): its upper bound is not visible here - confirm). */
  struct Alignment * const restrict alignment = _mm_malloc((NALI+1)*sizeof(struct Alignment),16);
  _Bool * const restrict Lock = _mm_malloc(FASTA.MaxSequenceSize*sizeof(_Bool), 16);
  char * const restrict CALI = _mm_malloc(4*(1+prf.Length)*sizeof(char),16);
  if (iop == NULL || iom == NULL || ioi == NULL || alignment == NULL || Lock == NULL || CALI == NULL) {
    fputs("Cannot allocate memory for alignment.\n", stderr);
    return 1;
  }

  /* LOOPS ON SEQUENCES */
  gettimeofday(&_t0,0);
  for (size_t i=0; i<nSeq; ++i) {
    if (!YesNo[i]) continue;

    if ( (PFSeq = ReadSequenceIndex(&SeqData, i, inSequence, FASTA.DataPtr)) == NULL) {
      fprintf(stderr, "Error reading sequence %zu from database.\n" , i);
      exit(1);
    }

    /* Translate first sequence */
    PFSeq = TranslateCharToIndex(PFSeq, prf.Alphabet_Mapping);

    /* Clear Lock */
    memset(Lock, 0, FASTA.MaxSequenceSize*sizeof(_Bool));

    // It seems we must have sequence starting from 1 here
    const int nali = xalip_sse41(&prf, &(PFSeq->ProfileIndex[0]), iop, iom, ioi, 1, PFSeq->Length, alignment,
                           Lock, prf.DisjointData.NDIP[0], prf.DisjointData.NDIP[1], false,
                           FilterCutOff, NALI);

    if (nali < 0) exit(1);

    int IPM[2];
    // Alignement is not filled from start !!!
    for ( int j=1; j<=nali; j++) {

      /* Remove lock for aligned sequence generation */
      memset(Lock, 0, FASTA.MaxSequenceSize*sizeof(_Bool));
      memset(CALI, 0, 4*(1+prf.Length)*sizeof(char));

      if (xalit_sse41(&prf, prf.DisjointData.NDIP[0], prf.DisjointData.NDIP[1], 1, PFSeq->Length, &(PFSeq->ProfileIndex[0]),
                 CALI, iop, &alignment[j], Lock, IPM) < 0 ) exit(1);

      fprintf(stdout,"Sequence %zu %i: %s\n", i,j, &CALI[1]);
    }
  }
  gettimeofday(&_t1,0);
  fprintf(stderr,"XALIP took %lf [s].\n", elapsed_seconds(&_t0, &_t1));

  /* close sequence file */
  fclose(inSequence);

  /* Free Memory */
  free(SeqData.Memory);
  _mm_free(Work);
  _mm_free(YesNo);
  _mm_free(iop);
  _mm_free(iom);
  _mm_free(ioi);
  _mm_free(alignment);
  _mm_free(Lock);
  _mm_free(CALI);
  FreeProfile(&prf);
#else
  /* Upper bound on worker threads: keeps the alloca() blocks below small. */
  enum { MAX_THREADS = 4096 };

  /* Retrieve number of cores */
  size_t nCPUs;
  if (argc >= 5) {
    char * end = NULL;
    const long requested = strtol(argv[4], &end, 10);
    if (end == argv[4] || *end != '\0' || requested < 1 || requested > MAX_THREADS) {
      fprintf(stderr, "Invalid number of threads '%s' (expected 1 to %d).\n", argv[4], (int) MAX_THREADS);
      return 1;
    }
    nCPUs = (size_t) requested;
  } else {
    /* sysconf() returns -1 on error: fall back to a single thread */
    const long detected = sysconf(_SC_NPROCESSORS_CONF);
    nCPUs = detected > 0 ? (size_t) detected : 1;
    if (nCPUs > MAX_THREADS) nCPUs = MAX_THREADS;
  }

  ////////////////////////////////////////////////////////////////////////////////////////////////
  // HEURISTIC
  ////////////////////////////////////////////////////////////////////////////////////////////////

  /* Get Heuristic cutoff from command line; it is stored unsigned, so a
     negative value would silently become a huge threshold. */
  {
    const int RequestedCutOff = atoi(argv[3]);
    if (RequestedCutOff < 0) {
      fprintf(stderr, "Invalid heuristic cutoff '%s'.\n", argv[3]);
      return 1;
    }
    prf.HeuristicCutOff = (unsigned int) RequestedCutOff;
  }
  printf("Heuristic cutoff set to %u\n", prf.HeuristicCutOff);

  /* Compute Match Score Matrix transpose */
  gettimeofday(&_t0,0);
  TransposeMatrix TIMatch;
  int TransposeFailed;
  if (UseFloatMatch) {
    TIMatch.f = TransposeAndConvertToFloatMatchMatrix(&(prf.Scores.Match), prf.Alphabet_Length, prf.Length);
    TransposeFailed = (TIMatch.f == NULL);
  } else {
    TIMatch.i = TransposeAndConvertMatchMatrix(&(prf.Scores.Match), prf.Alphabet_Length, prf.Length);
    TransposeFailed = (TIMatch.i == NULL);
  }
  gettimeofday(&_t1,0);
  if (TransposeFailed) {
    fputs("Unable to build the transposed match matrix.\n", stderr);
    return 1;
  }
  double t = elapsed_seconds(&_t0, &_t1);
  fprintf(stderr,"Transposing Match matrix took %lf seconds.\n", t);

  /* Share according to file size */
  size_t * shares = alloca((nCPUs+1)*sizeof(size_t));
  {
    const size_t FileShare = 1 + (size_t) FASTA.FileSize / nCPUs;
    const s_Data * DataPtr = FASTA.DataPtr;
    size_t counter = 0;
    shares[0] = 0;
    for (size_t i=1; i<nCPUs; ++i) {
      const size_t tmp = i*FileShare;
      /* Bounded by the sequence count so the index is never read past its end */
      while ( counter < nSeq && (size_t) DataPtr->Offset < tmp) { ++DataPtr; ++counter; }
      shares[i] = counter;
    }
    shares[nCPUs] = nSeq;
  }

  /* Allocate memory for sequence YesNo or ID to be done (never zero-sized) */
  unsigned int * const YesNoID = _mm_malloc( (nSeq > 0 ? nSeq : 1)*sizeof(unsigned int), 16);
  if (YesNoID == NULL) {
    fputs("Unable to allocate memory for sequence scores.\n", stderr);
    return 1;
  }

  /* Allocate stack memory for posix thread structures */
  struct ThreadData * const threads_arg = alloca(nCPUs*sizeof(struct ThreadData));
  pthread_t * const threads = (pthread_t*) alloca(nCPUs*sizeof(pthread_t));

  for (size_t i=0; i<nCPUs; ++i) {
    threads_arg[i].prf                       = &prf;
    threads_arg[i].FASTA                     = &FASTA;
    threads_arg[i].UnsignedScores            = YesNoID;
    threads_arg[i].TransposeMatch            = TIMatch;
    threads_arg[i].SequenceFileName          = argv[2];
    /* Fields not used by this stage are given a defined value */
    threads_arg[i].Sequences                 = NULL;
    threads_arg[i].TempOutputFile            = NULL;
    threads_arg[i].counter                   = 0;
#ifdef __NUMA__
    threads_arg[i].CoreId                    = 0;
    threads_arg[i].NodeId                    = 0;
#endif
  }

  /* Dispatch to threads */
  gettimeofday(&_t0,0);
  if (run_threads(thread_heuristic_ptr, threads, threads_arg, shares, nCPUs) != 0) {
    fputs("Heuristic stage failed.\n", stderr);
    return 1;
  }
  gettimeofday(&_t1,0);
  t = elapsed_seconds(&_t0, &_t1);

  /* Gather the ones that passed the heuristic: the front of YesNoID is
     overwritten with their sequence indices. */
  const unsigned int lHeuristicCutOff = prf.HeuristicCutOff;
  fprintf(stderr, "Heurstic cutoff set to %u\n", lHeuristicCutOff);

  size_t HeuristicCounter = 0;
  for (size_t iseq=0; iseq<nSeq; ++iseq) {
    if (YesNoID[iseq] >= lHeuristicCutOff) {
      YesNoID[HeuristicCounter] = (unsigned int) iseq;
      ++HeuristicCounter;
    }
  }
  fprintf(stderr,"Overall there are %zu sequences passing heuritic. These took %lf seconds to treat on %zu cores.\n",
          HeuristicCounter, t, nCPUs);

  ////////////////////////////////////////////////////////////////////////////////////////////////
  // FILTER
  ////////////////////////////////////////////////////////////////////////////////////////////////

  /* thread_xali1() replaces each sequence index by its score, so keep a copy
     of the indices to be able to name the survivors afterwards. */
  unsigned int * const CandidateID = malloc((HeuristicCounter > 0 ? HeuristicCounter : 1)*sizeof *CandidateID);
  if (CandidateID == NULL) {
    fputs("Unable to allocate memory for candidate sequence indices.\n", stderr);
    return 1;
  }
  memcpy(CandidateID, YesNoID, HeuristicCounter*sizeof *CandidateID);

  /* Compute the new share for each thread */
  split_evenly(shares, nCPUs, HeuristicCounter);

  /* Dispatch to threads */
  gettimeofday(&_t0,0);
  if (run_threads(thread_xali1, threads, threads_arg, shares, nCPUs) != 0) {
    fputs("Filter stage failed.\n", stderr);
    return 1;
  }
  gettimeofday(&_t1,0);
  t = elapsed_seconds(&_t0, &_t1);

  /* Gather the ones that passed xali1 */
  const int lFilterCutoff = (int) prf.CutOffData.ICUT[0];
  fprintf(stderr, "Filter cutoff set to %i\n", lFilterCutoff);

  size_t FilterCounter = 0;
  for (size_t iseq=0; iseq<HeuristicCounter; ++iseq) {
    /* Slots now hold signed scores. A score equal to the cutoff passes, as in
       the non-threaded code path. FilterCounter <= iseq, so the slot is read
       before it can be overwritten. */
    if ( *((int*) &YesNoID[iseq]) >= lFilterCutoff ) {
      YesNoID[FilterCounter] = CandidateID[iseq];
      ++FilterCounter;
    }
  }
  free(CandidateID);

  fprintf(stderr,"Overall there are %zu sequences passing filter. These took %lf seconds to treat on %zu cores.\n",
          FilterCounter, t, nCPUs);

  ////////////////////////////////////////////////////////////////////////////////////////////////
  // ALIGNMENT
  ////////////////////////////////////////////////////////////////////////////////////////////////

  /* Allocate memory for the alignment; nothing is needed (and no worker
     touches the buffer) when no sequence passed the filter. */
  char * AlignedSequences = NULL;
  if (FilterCounter > 0) {
    AlignedSequences = malloc(FilterCounter*(prf.Length+1)*3*sizeof(char));
    if (AlignedSequences == NULL) {
      fputs("Unable to allocate memory for resulting aligned sequences.\n", stderr);
      return 1;
    }
  }

  /* Compute the new share for each thread */
  split_evenly(shares, nCPUs, FilterCounter);

  /* Dispatch to threads */
  for (size_t i=0; i<nCPUs; ++i) threads_arg[i].Sequences = AlignedSequences;

  gettimeofday(&_t0,0);
  if (run_threads(thread_xaliPT, threads, threads_arg, shares, nCPUs) != 0) {
    fputs("Alignment stage failed.\n", stderr);
    return 1;
  }
  gettimeofday(&_t1,0);
  t = elapsed_seconds(&_t0, &_t1);
  fprintf(stderr,"Overall there are %zu sequences. These took %lf seconds to align on %zu cores.\n", FilterCounter, t, nCPUs);

  /* Free Memory.
     NOTE(review): the transposed match matrix is not released here because
     the allocator used by TransposeAndConvert*MatchMatrix is not visible. */
  free(AlignedSequences);
  _mm_free(YesNoID);
  FreeProfile(&prf);

#endif
  return 0;
}
