


/*  The meaning of the defined macros is as follows:
 *  VECLEN:         The length of a single-precision vector register
 *  NREGS:          Number of vector registers available
 *  prefetch:       Standard prefetch 
 *  prefetchw:      Prefetch used for data to be overwritten soon.
 *  vec_add:        Add two single-precision vectors. 
 *  vec_mul:        Multiply two single-precision vectors.
 *  vec_mov:        Moves data around
 *  vec_load_one:   Load one element in a vector and zero all other entries!
 *  vec_load_scal:  Load one element replicated in all positions in the vector.
 *  vec_load_apart: Load elements from different memory positions into a register. 
 *  vec_sum:        Sums a register.
 *  vec_store_one:  Stores lowest element in vector to memory, no zero-extend!
 * Meaning of suffixes is as follows:
 *    mr means memory to register
 *    rr means register to register 
 *    rm means register to memory
 *    a means that instruction needs aligned data
 *    1 means that the instruction only operates on the lowest element of the
 *      vector.
 * 
 * The _1 instructions work under one important assumption: That you never mix them
 * with regular instructions, e.g. loading into a register with a normal mov, and
 * then using add_rr_1 will not work under 3dnow! since it is in reality a normal add.
 * However, if using a mov_1 first, the upper part of the register will be zeroed,
 * and it will therefore work. The _1 system is more robust under SSE, but other
 * architectures might be implemented the same way as 3dnow!
 *
 */




/*
 * gen_vec_rr: emit a single two-register instruction.
 *   op  - instruction mnemonic
 *   src - first register operand, written without the % prefix
 *   dst - second register operand, written without the % prefix
 * Both register names are stringified straight into the assembler
 * template; nothing is declared to the compiler as an input, output or
 * clobber.
 */
#define gen_vec_rr(op, src, dst)                               \
        __asm__ __volatile__ (#op " %%" #src ", %%" #dst       \
                              : /* no outputs */               \
                              : /* no inputs */)


/*
 * gen_prefetch: emit a prefetch-style instruction on a memory location.
 *   op  - prefetch mnemonic
 *   mem - pointer (or array) whose first element, (mem)[0], is handed to
 *         the instruction as its single "m" input operand
 * The instruction has no outputs and declares no clobbers.
 */
#define gen_prefetch(op,mem) \
        __asm__ __volatile__ (#op " %0" \
                              : /* nothing */ \
                              : "m" (((mem)[0])))

/* Identity macro: w(x) expands to x, unchanged and unparenthesized. */
#define w(x) x

/* Emit one "nop" instruction. */
#define nop() __asm__ __volatile__ ("nop")


#ifdef SSE

/* Peculiarities of SSE: Alignment is good, but not mandatory. It is possible to
 * load/store from misaligned addresses using movups at a cost of some cycles. Memory
 * operands used directly by mul/add must always be aligned. Alignment is 16 bytes.
 * No muladd.
 */



/*
 * gen_vec_mr (SSE): emit "op <memory>, %<reg>".
 *   op  - instruction mnemonic
 *   src - pointer/array of floats; src[0] is the operand the template
 *         references as %0
 *   dst - register name, written without the % prefix
 * src[1]..src[3] are listed as further "m" inputs that the template never
 * references - presumably so the compiler treats all four floats as read.
 * The modified register is not declared to the compiler.
 */
#define gen_vec_mr(op, src, dst)                               \
        __asm__ __volatile__ (#op " %0, %%" #dst               \
                              : /* no outputs */               \
                              : "m" ((src)[0]),                \
                                "m" ((src)[1]),                \
                                "m" ((src)[2]),                \
                                "m" ((src)[3]))


/*
 * gen_vec_rm (SSE): emit "op %<reg>, <memory>".
 *   op  - instruction mnemonic
 *   src - register name, written without the % prefix
 *   dst - pointer/array of floats; dst[0] is the operand the template
 *         references as %0
 * dst[1]..dst[3] are listed as further "=m" outputs that the template
 * never references - presumably so the compiler treats all four floats
 * as written.
 */
#define gen_vec_rm(op, src, dst)                               \
        __asm__ __volatile__ (#op " %%" #src ", %0"            \
                              : "=m" ((dst)[0]),               \
                                "=m" ((dst)[1]),               \
                                "=m" ((dst)[2]),               \
                                "=m" ((dst)[3])                \
                              : /* no inputs */)




/* SSE geometry: floats per vector register, and number of registers. */
#define VECLEN  4
#define NREGS   8

/* Generic register names mapped onto xmm0..xmm7. */
#define reg0    xmm0
#define reg1    xmm1
#define reg2    xmm2
#define reg3    xmm3
#define reg4    xmm4
#define reg5    xmm5
#define reg6    xmm6
#define reg7    xmm7


/* Prefetch hints. */
#define prefetch(mem)             gen_prefetch(prefetchnta, mem)
#define prefetchw(mem)            gen_prefetch(prefetchw, mem)

/* Full-vector moves: movups for the plain forms, movaps for the _a
 * (aligned) forms and for register-to-register copies. */
#define vec_mov_mr(mem, reg)      gen_vec_mr(movups, mem, reg)
#define vec_mov_rm(reg, mem)      gen_vec_rm(movups, reg, mem)
#define vec_mov_mr_a(mem, reg)    gen_vec_mr(movaps, mem, reg)
#define vec_mov_rm_a(reg, mem)    gen_vec_rm(movaps, reg, mem)
#define vec_mov_rr(src, dst)      gen_vec_rr(movaps, src, dst)

/* Full-vector arithmetic with a memory operand (aligned forms only). */
#define vec_add_mr_a(mem, reg)    gen_vec_mr(addps, mem, reg)
#define vec_mul_mr_a(mem, reg)    gen_vec_mr(mulps, mem, reg)

/* Full-vector arithmetic between two registers. */
#define vec_add_rr(src, dst)      gen_vec_rr(addps, src, dst)
#define vec_mul_rr(src, dst)      gen_vec_rr(mulps, src, dst)

/* Lowest-element (_1) moves and arithmetic, using the scalar ss forms. */
#define vec_mov_mr_1(mem, reg)    gen_vec_mr(movss, mem, reg)
#define vec_mov_rm_1(reg, mem)    gen_vec_rm(movss, reg, mem)
#define vec_mov_rr_1(src, dst)    gen_vec_rr(movss, src, dst)

#define vec_add_mr_1(mem, reg)    gen_vec_mr(addss, mem, reg)
#define vec_add_rr_1(src, dst)    gen_vec_rr(addss, src, dst)

#define vec_mul_mr_1(mem, reg)    gen_vec_mr(mulss, mem, reg)
#define vec_mul_rr_1(src, dst)    gen_vec_rr(mulss, src, dst)

/* Entering/leaving vector code needs no instruction here: both expand
 * to nothing. */
#define vec_enter()               /* vec_enter */
#define vec_exit()                /* vec_exit */

/*
 * vec_sum(reg): add up the four floats held in `reg`; the total is left
 * in the lowest element of `reg`, the other elements hold intermediate
 * values afterwards.
 *
 * xmm7 is overwritten as scratch space, so make sure register 7 is not
 * in use when this macro is invoked.
 *
 * vec_sum forwards to vec_sum_wrap so that a macro passed as `reg` is
 * expanded before its name is stringified into the template.
 */
#define vec_sum(reg) vec_sum_wrap(reg)
#define vec_sum_wrap(xr)                                                   \
        __asm__ __volatile__ (                                             \
            /* xmm7[0..1] = xr[2..3] */                                    \
            "movhlps %%" #xr ", %%xmm7\n"                                  \
            /* xmm7[0] = xr[0]+xr[2], xmm7[1] = xr[1]+xr[3] */             \
            "addps %%" #xr ", %%xmm7\n"                                    \
            /* keep the two partial sums in xr */                          \
            "movaps %%xmm7, %%" #xr "\n"                                   \
            /* bring the second partial sum down to xmm7[0] */             \
            "shufps $1, %%" #xr ", %%xmm7\n"                               \
            /* xr[0] = first partial sum + second partial sum */           \
            "addss %%xmm7, %%" #xr "\n"                                    \
            : /* no outputs */                                             \
            : /* no inputs */)

#endif



#ifdef THREEDNOW

/* Peculiarities of 3DNOW. Alignment is not an issue,
 * all alignments are legal, however I am 
 * not sure if alignment gives a speed increase.
 * The vec_acc instruction can be used to sum two registers at once more efficiently
 * than a series of vec_sum and vec_store_one
 * No muladd.
 */


/*
 * gen_vec_mr (3DNow!): emit "op <memory>, %<reg>".
 *   op  - instruction mnemonic
 *   src - pointer/array of floats; src[0] is the operand the template
 *         references as %0
 *   dst - register name, written without the % prefix
 * src[1] is listed as a second "m" input that the template never
 * references - presumably so the compiler treats both floats as read.
 * The modified register is not declared to the compiler.
 */
#define gen_vec_mr(op, src, dst)                               \
        __asm__ __volatile__ (#op " %0, %%" #dst               \
                              : /* no outputs */               \
                              : "m" ((src)[0]),                \
                                "m" ((src)[1]))

/*
 * gen_vec_rm (3DNow!): emit "op %<reg>, <memory>".
 *   op  - instruction mnemonic
 *   src - register name, written without the % prefix
 *   dst - pointer/array of floats; dst[0] is the operand the template
 *         references as %0
 * dst[1] is listed as a second "=m" output that the template never
 * references - presumably so the compiler treats both floats as written.
 */
#define gen_vec_rm(op, src, dst)                               \
        __asm__ __volatile__ (#op " %%" #src ", %0"            \
                              : "=m" ((dst)[0]),               \
                                "=m" ((dst)[1])                \
                              : /* no inputs */)




/* 3DNow! geometry: floats per vector register, and number of registers. */
#define VECLEN  2
#define NREGS   8

/* Generic register names mapped onto mm0..mm7. */
#define reg0    mm0
#define reg1    mm1
#define reg2    mm2
#define reg3    mm3
#define reg4    mm4
#define reg5    mm5
#define reg6    mm6
#define reg7    mm7

/* Prefetch hints. */
#define prefetch(mem)             gen_prefetch(prefetch, mem)
#define prefetchw(mem)            gen_prefetch(prefetchw, mem)

/* Full-vector arithmetic and 64-bit moves, memory operand forms. */
#define vec_add_mr(mem, reg)      gen_vec_mr(pfadd, mem, reg)
#define vec_mul_mr(mem, reg)      gen_vec_mr(pfmul, mem, reg)
#define vec_mov_mr(mem, reg)      gen_vec_mr(movq, mem, reg)
#define vec_mov_rm(reg, mem)      gen_vec_rm(movq, reg, mem)

/* Full-vector arithmetic and moves between two registers. */
#define vec_add_rr(src, dst)      gen_vec_rr(pfadd, src, dst)
#define vec_mul_rr(src, dst)      gen_vec_rr(pfmul, src, dst)
#define vec_acc_rr(src, dst)      gen_vec_rr(pfacc, src, dst)
#define vec_mov_rr(src, dst)      gen_vec_rr(movq, src, dst)

/* Sum a register: pfacc of the register with itself. */
#define vec_sum(reg)              gen_vec_rr(pfacc, reg, reg)

/* Lowest-element (_1) moves use movd.
 * NOTE(review): the register-to-register form passes two mm registers to
 * movd - confirm the assembler accepts that operand combination. */
#define vec_mov_mr_1(mem, reg)    gen_vec_mr(movd, mem, reg)
#define vec_mov_rm_1(reg, mem)    gen_vec_rm(movd, reg, mem)
#define vec_mov_rr_1(src, dst)    gen_vec_rr(movd, src, dst)

/* The _1 arithmetic forms expand to the same pfadd/pfmul as the
 * full-vector forms. */
#define vec_add_rr_1(src, dst)    gen_vec_rr(pfadd, src, dst)
#define vec_mul_rr_1(src, dst)    gen_vec_rr(pfmul, src, dst)

/*
 * vec_load_scal(mem, reg): load the float at mem[0] into `reg` and copy
 * it into both element positions of the register.
 *
 * The public macro forwards to vec_load_scal_wrap so that a macro passed
 * as `reg` is expanded before its name is stringified.
 */
#define vec_load_scal(mem,reg)  vec_load_scal_wrap(mem,reg)
#define vec_load_scal_wrap(src, mmreg)                                     \
        __asm__ __volatile__ (                                             \
            /* low element <- src[0] */                                    \
            "movd %0, %%" #mmreg "\n"                                      \
            /* high element <- low element */                              \
            "punpckldq %%" #mmreg ", %%" #mmreg                            \
            : /* no outputs */                                             \
            : "m" ((src)[0]))


/*
 * vec_load_apart(mem1, mem2, reg): build a vector in `reg` from two
 * separate memory locations - mem1[0] goes to the low element and
 * mem2[0] to the high element.
 *
 * The public macro forwards to vec_load_apart_wrap so that a macro
 * passed as `reg` is expanded before its name is stringified.
 */
#define vec_load_apart(mem1,mem2,reg) vec_load_apart_wrap(mem1,mem2,reg)
#define vec_load_apart_wrap(lo_src, hi_src, mmreg)                         \
        __asm__ __volatile__ (                                             \
            /* low element <- lo_src[0] */                                 \
            "movd %0, %%" #mmreg "\n"                                      \
            /* high element <- hi_src[0] */                                \
            "punpckldq %1, %%" #mmreg                                      \
            : /* no outputs */                                             \
            : "m" ((lo_src)[0]), "m" ((hi_src)[0]))


/* Clear a register by pxor-ing it with itself. */
#define vec_zero(reg)   gen_vec_rr(pxor, reg, reg)

/* Both entering and leaving vector code issue a "femms" instruction. */
#define vec_enter()     __asm__ __volatile__ ("femms")
#define vec_exit()      __asm__ __volatile__ ("femms")

/* Emit a ".align 16" assembler directive at the point of use. */
#define align()         __asm__ __volatile__ (".align 16")


#endif





#ifdef ALTIVEC

/*
 * AltiVec geometry: a vector register holds four single-precision floats
 * and there are 32 vector registers. VECLEN is required by the `vector`
 * typedef at the end of this file.
 */
#define VECLEN 4
#define NREGS 32

/*
 * gen_alti3: emit a three-register AltiVec instruction.
 *   op     - instruction mnemonic
 *   reg1   - first source register, written without the % prefix
 *   reg2   - second source register, written without the % prefix
 *   regout - destination register, written without the % prefix
 * AltiVec assembler syntax lists the destination FIRST ("op vD,vA,vB"),
 * so regout is emitted ahead of the two sources. Nothing is declared to
 * the compiler as an input, output or clobber.
 */
#define gen_alti3(op,reg1,reg2,regout) \
        __asm__ __volatile__ (#op " %%" #regout ", %%" #reg1 ", %%" #reg2 \
                              :  /* nothing */ \
                              : /* nothing */)

/*
 * gen_alti_muladd: emit a four-operand multiply-add style instruction
 * that accumulates into regout: "op regout, reg1, reg2, regout".
 * For vmaddfp (vD,vA,vC,vB: vD = vA*vC + vB) this computes
 * regout = reg1*reg2 + regout.
 */
#define gen_alti_muladd(op,reg1,reg2,regout) \
        __asm__ __volatile__ (#op " %%" #regout ", %%" #reg1 ", %%" #reg2 ", %%" #regout \
                              :  /* nothing */ \
                              : /* nothing */)


/* regout = reg1 + reg2 */
#define vec_add_rr(reg1,reg2,regout) gen_alti3(vaddfp,reg1,reg2,regout)
/* NOTE(review): "vmulfp" does not appear to be an AltiVec instruction
 * (float multiply is normally done with vmaddfp and a zero addend) -
 * confirm against the ISA manual before relying on this macro. */
#define vec_mul_rr(reg1,reg2,regout) gen_alti3(vmulfp,reg1,reg2,regout)
/* regout = reg1*reg2 + regout */
#define vec_muladd_rr(reg1,reg2,regout) gen_alti_muladd(vmaddfp,reg1,reg2,regout)





#endif


/* An array of VECLEN floats. VECLEN must already be defined at this
 * point by one of the architecture-specific sections. */
typedef float vector[VECLEN];








