/*
 *             Automatically Tuned Linear Algebra Software v3.2
 *                      (C) Copyright 1999 Camm Maguire                      
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the University, the ATLAS group, or the names of its 
 *      contributers may not be used to endorse or promote products derived
 *      from this software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 *
 */

#include <stdlib.h>
#include <sys/time.h>
#include <stdio.h>

#include "camm_util.h"

/*
 * Prefetch distances in bytes.  The stringized forms are pasted into the
 * assembly text as immediates of "addl $...,%ebx":
 *   PREFA  (PREFN)  - used by lb, i.e. once per chunk in the bla* macros;
 *   PREFA2 (PREFN2) - used once in the register setup of the kernel.
 * str() is not defined in this file (presumably a stringizing macro from
 * camm_util.h).
 */
#define PREFN 64
#define PREFN2 32
#define PREFA str(PREFN)
#define PREFA2 str(PREFN2)

/*
 * Assembly building blocks.  Each macro expands to string literals that are
 * concatenated into the single asm statement of the kernel below; "%%"
 * is the extended-asm escape for a literal '%'.
 *
 * la        : %eax = %esi, then %esi += 16 (one 16-byte chunk is consumed).
 * lb        : %ebx = %eax + PREFN (prefetch pointer ahead of %eax).
 * prefetch  : prefetchnta at (%ebx).
 * prefetcha : prefetchnta at (%eax).
 */
#define la           "movl %%esi,%%eax\n\taddl $16,%%esi\n\t"
#define lb           "movl %%eax,%%ebx\n\taddl $" PREFA ",%%ebx\n\t"
#define prefetch     "prefetchnta (%%ebx)\n\t"
#define prefetcha    "prefetchnta (%%eax)\n\t"
/*
 * loadc: unaligned load of four floats (x0,x1,x2,x3) from (%eax), then
 *   xmm7 = (x0,x0,x2,x2)   (shufps $160)
 *   xmm6 = (x1,x1,x3,x3)   (shufps $245)
 * For two complex values stored as (re,im) pairs this puts the real parts
 * in xmm7 and the imaginary parts in xmm6.
 */
#define loadc        "movups (%%eax),%%xmm7\n\t"\
                     "movaps %%xmm7,%%xmm6\n\t"\
                     "shufps $160,%%xmm7,%%xmm7\n\t"\
                     "shufps $245,%%xmm6,%%xmm6\n\t"
/* dump(a_): unaligned store of xmm<a_> to (%ecx); not referenced in this file. */
#define dump(a_)     "movups %%xmm" #a_ ",(%%ecx)\n\t"
/* shuf: swap the two floats inside each adjacent pair of xmm4. */
#define shuf         "shufps $177,%%xmm4,%%xmm4\n\t"
/*
 * sign: multiply one of the two c registers by xmm5, which the kernel loads
 * with (1,-1,1,-1).  The conjugated variant negates every second element of
 * the real-part register (xmm7); otherwise every second element of the
 * imaginary-part register (xmm6) is negated.
 */
#ifdef Conj_
#define sign         "mulps %%xmm5,%%xmm7\n\t"
#else
#define sign         "mulps %%xmm5,%%xmm6\n\t"
#endif
/*
 * movc(a_)    : xmm4 = xmm<a_>.
 * loadb       : xmm3 = 16 bytes at (%eax), unaligned.
 * writeb      : store xmm3 to (%eax), unaligned.
 * mul(a_)     : xmm4 *= xmm<a_>.
 * add         : xmm3 += xmm4.
 * incx(a_,b_) : %e<b_>x += %e<a_>x.
 */
#define movc(a_)     "movaps %%xmm" #a_ ",%%xmm4\n\t"
#define loadb        "movups (%%eax),%%xmm3\n\t"
#define writeb       "movups %%xmm3,(%%eax)\n\t"
#define mul(a_)      "mulps  %%xmm" #a_ ",%%xmm4\n\t"
#define add          "addps %%xmm4,%%xmm3\n\t"
#define incx(a_,b_)  "addl %%e" #a_ "x,%%e" #b_ "x\n\t"

/*
 * pref(a_)  : %ebx += %e<a_>x, then prefetch at (%ebx).
 * prefa(a_) : %eax += %e<a_>x, then prefetch at (%eax).
 */
#define pref(a_)         incx(a_,b) prefetch 
#define prefa(a_)        incx(a_,a) prefetcha 

/*
 * dp(a_,b_): one read-modify-write of a 16-byte destination chunk.
 *   %eax += %e<a_>x; xmm3 = [%eax];
 *   xmm3 += xmm7 * xmm<b_>;
 *   xmm3 += pairswap(xmm6 * xmm<b_>);
 *   [%eax] = xmm3.
 * With xmm7/xmm6 prepared by loadc + sign and xmm<b_> holding a complex
 * scalar duplicated as (re,im,re,im), this adds the complex products of the
 * scalar with two c values to two destination values.
 *
 * dpp(a_,b_,c_): same as dp(a_,b_), with pref(c_) issued right after the
 * load of the destination.
 */
#define dp(a_,b_)        incx(a_,a) loadb movc(7) mul(b_) add \
                         movc(6) mul(b_) shuf add writeb 
#define dpp(a_,b_,c_)    incx(a_,a) loadb pref(c_) movc(7) mul(b_) add \
                         movc(6) mul(b_) shuf add writeb 

/*
 * Per-chunk code sequences.  The numeric suffix is the number of destination
 * vectors updated (selected through NDP); each sequence consumes one 16-byte
 * chunk of the source (la advances %esi by 16).
 *
 * bla<N>: "first" flavour - recomputes the prefetch pointer with lb, loads
 *         and sign-adjusts the chunk, then updates destination 0 with xmm0
 *         (stepping %eax by %edx) and destinations 1 and 2 with xmm1/xmm2
 *         (stepping %eax by %ecx).
 */
#define bla1          la lb prefetch loadc sign dp(d,0) 
#define bla2          bla1 dpp(c,1,d) 
#define bla3          bla2 dp(c,2) 

/*
 * blb<N>: "second" flavour - same arithmetic as bla<N>, but %ebx is not
 *         reset with lb; it is only advanced by the pref()/dpp() steps.
 */
#define blb1          la pref(d) loadc sign dp(d,0) 
#define blb2          la loadc sign dpp(d,0,c) dp(c,1) 
#define blb3          la pref(c) loadc sign dp(d,0) dpp(c,1,c) dp(c,2) 

/*
 * DP1 / DP2 / DP4: code for one, two and four 16-byte chunks respectively.
 * Mjoin and NDP are not defined in this file; Mjoin(bla,NDP) must name one
 * of bla1..bla3 above (likewise for blb).
 */
#undef DP1
#define DP1 Mjoin(bla,NDP) 

#undef DP2
#define DP2 DP1 Mjoin(blb,NDP) 

#undef DP4
#define DP4 DP2 DP2 

/*
 * Prefetches issued once before the main loop.
 *
 * ipref(a_): advance both %eax and %ebx by %e<a_>x and prefetch at each.
 * ir<N>    : prefetch at the initial %eax/%ebx, then step by %edx once and
 *            by %ecx (N-1) times, prefetching at every position.
 * init_regs: the ir<N> variant selected by NDP.
 * Note that these sequences modify %eax and %ebx; every chunk macro reloads
 * %eax from %esi (la) before using it.
 */
#define ipref(a_)  prefa(a_) pref(a_) 

#define ir1      prefetcha prefetch ipref(d) 
#define ir2      ir1 ipref(c) 
#define ir3      ir2 ipref(c) 

#define init_regs  Mjoin(ir,NDP)

/* inca: %ecx += %edx (step to the next scalar to preload). */
#define inca "addl %%edx,%%ecx\n\t"

/*
 * preload_reg(a_): read two consecutive floats (f0,f1) at (%ecx) and leave
 * xmm<a_> = (f0,f1,f0,f1).  xmm7 is used as scratch.
 */
#define preload_reg(a_) "movss (%%ecx),%%xmm" #a_ "\n\t"\
                        "movss 4(%%ecx),%%xmm7\n\t"\
                        "unpcklps %%xmm7,%%xmm" #a_ "\n\t"\
                        "movlhps %%xmm" #a_ ",%%xmm" #a_ "\n\t"

/*
 * pr<N>: preload xmm0 .. xmm<N-1>, advancing %ecx by %edx between loads.
 * preload_regs: the pr<N> variant selected by NDP.
 */
#define pr1 preload_reg(0) 
#define pr2 pr1 inca preload_reg(1) 
#define pr3 pr2 inca preload_reg(2) 
     
#define preload_regs Mjoin(pr,NDP) 

     
/*
 * Mjoin(g,EXT): add a complex-scaled copy of c to NDP vectors inside b.
 *
 * For k = 0 .. NDP-1 and j = 0 .. len-1:
 *     b[k*ldb + j] += a[k*ainc] * c[j]           (Conj_ not defined)
 *     b[k*ldb + j] += conj(a[k*ainc]) * c[j]     (Conj_ defined)
 *
 * Parameters
 *   a    : the NDP complex scalars, ainc elements apart.
 *   ainc : element stride between successive scalars in a.
 *   b    : destination; vector k starts at b + k*ldb.
 *   ldb  : element stride between successive destination vectors.
 *   c    : source vector of len contiguous complex values.
 *   len  : number of complex values in c.
 *
 * The SSE inline assembly handles the elements two at a time (16 bytes per
 * chunk); a single left-over element, if any, is finished in plain C.
 * The assembly keeps pointers in 32-bit registers, so this is i386-only.
 *
 * Mjoin, EXT, NDP, ASM, NO_INLINE, lab, test, je, sub, jmp and align are not
 * defined in this file (they are expected from camm_util.h or the including
 * translation unit).
 */
static void
Mjoin(g,EXT)(const Complex *a,int ainc,Complex *b,int ldb,const Complex *c,int len) {

  const Complex *ce=c+len;
  int i,c2b,b2b,a2a;
  /* Sign mask (1,-1,1,-1), loaded into xmm5 and applied by the sign macro. */
  const Complex w[2]={{1.0,-1.0},{1.0,-1.0}};
  NO_INLINE;

  /* Byte distances used as register increments inside the assembly. */
  c2b=(b-c)*sizeof(*c);   /* from an element of c to the matching one in b */
  b2b=ldb*sizeof(*b);     /* from one destination vector to the next       */
  a2a=ainc*sizeof(*a);    /* from one scalar in a to the next              */
  
  ASM (
       
       /* Phase 1: %ecx = a, %edx = a2a, xmm5 = sign mask; then load the
          scalars into xmm0..xmm(NDP-1) as (re,im,re,im). */
       "movl %4,%%ecx\n\t"
       "movl %5,%%edx\n\t"
       "movups %6,%%xmm5\n\t"
       
       preload_regs
       
       /* Phase 2: %esi = %eax = c, %ebx = c + PREFN2 (prefetch pointer),
          %edi = len, %edx = c2b, %ecx = b2b. */
       "movl %0,%%esi\n\t"
       "movl %%esi,%%eax\n\t"
       "movl %%eax,%%ebx\n\t"
       "addl $" PREFA2 ",%%ebx\n\t"
       "movl %1,%%edi\n\t"
       "movl %2,%%edx\n\t"
       "movl %3,%%ecx\n\t"
       
       init_regs
       
       /* Main loop: while len has any bit >= 8 set, process 8 elements
          (four 16-byte chunks) per pass. */
       lab(loop)
       
       test(-8,di)
       je(8)
       sub(8,di)
       align
       
       DP4
       
       jmp(loop)
       align
       
       lab(8)
       
       /* Remainder: 4 elements if bit 2 of len is set ... */
       test(4,di)
       je(4)
       
       DP2
       
       lab(4)
       
       /* ... and 2 elements if bit 1 of len is set. */
       test(2,di)
       je(1)
       
       DP1
       
       lab(1)
       
       /* Publish the advanced source pointer back into c. */
       "movl %%esi,%0\n\t"
       
       /*
        * The assembly stores into b and reads a, c and all 16 bytes of w
        * through registers, none of which is expressed by the operands
        * (the "m" operands cover only the pointer variables and w[0]).
        * The "memory" clobber tells the compiler that memory other than
        * the listed operands is read and written here.
        * NOTE(review): xmm0-xmm7 are also overwritten but are not listed as
        * clobbers; confirm this is acceptable for the build flags in use.
        */
       :"+m" (c):"m" (len),"m" (c2b),"m" (b2b),"m" (a),"m" (a2a),"m" (*w)
       :"si","di","ax","bx","cx","dx","memory");
  
  /* At most one element is left when len is odd; c now points at it. */
  if (c<ce) {
    Complex *tb;
    const Complex *ta;
    
    /* tb is the element of b matching *c; byte arithmetic is done on a
       char pointer so that no void-pointer arithmetic extension is needed. */
    for (ta=a,tb=(Complex *)((char *)c+c2b),i=0;i<NDP;i++,ta+=ainc,tb+=ldb) {
#ifdef Conj_	
      /* tb += conj(ta) * c */
      tb->r+= ta->r * c->r + ta->i * c->i;
      tb->i+= + ta->r * c->i - ta->i * c->r;
#else
      /* tb += ta * c */
      tb->r+= ta->r * c->r - ta->i * c->i;
      tb->i+= ta->r * c->i + ta->i * c->r;
#endif
    }
  }
  
}
