/*
	1997-98 by H. Dietz and R. Fisher


*/

#ifndef _MMX_H
#define _MMX_H



/*	The type of a value that fits in an MMX register
	(note that long long constant values MUST be suffixed
	 by LL and unsigned long long values by ULL, lest
	 they be truncated by the compiler)

	Every member overlays the same storage, so one register image
	can be read or written as 1, 2, 4 or 8 lanes.  The first member
	(q) is the one a brace initializer without designators sets.
*/
typedef	union {
	long long		q;	/* Quadword 64-bit value */
	unsigned long long	uq;	/* Unsigned Quadword */
	int			d[2];	/* 2 Doubleword (32-bit) values */
	unsigned int		ud[2];	/* 2 Unsigned Doubleword */
	short			w[4];	/* 4 Word (16-bit) values */
	unsigned short		uw[4];	/* 4 Unsigned Word */
	char			b[8];	/* 8 Byte (8-bit) values */
	unsigned char		ub[8];	/* 8 Unsigned Byte */
	float			s[2];	/* 2 Single-precision (32-bit) values */
} mmx_t;

/*	The eight emulated MMX registers, mm0 through mm7.
	NOTE(review): these are file-scope definitions with neither
	`static` nor `extern`, placed in a header, so every translation
	unit that includes this file defines them again.  That only
	links when the toolchain merges such tentative definitions
	(e.g. it is expected to fail with -fno-common) -- TODO confirm
	the intended build flags or move the definitions to one .c file.
*/
mmx_t mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7;

/*	Apply binary operator `op` to the whole 64-bit value:
	dst = dst op src, using the signed (.q) or unsigned (.uq) view.
	`src` and `dst` are mmx_t lvalues; `op` is pasted textually
	between the two operands.  The value of the expression is the
	value stored into dst.
	(The result is assigned back to dst; without the assignment the
	 operation would be computed and thrown away.)
*/
#define mmx_q(src,op,dst)  ((dst).q = (dst).q op (src).q)
#define mmx_uq(src,op,dst) ((dst).uq = (dst).uq op (src).uq)

/*	Signed 2x32 lane-wise operation: for each doubleword lane,
	dst.d[n] = dst.d[n] op src.d[n].  Lane 0 is processed first and
	the expression yields the value stored in lane 1.
*/
#define mmx_lane_d(src,op,dst,n)\
  ((dst).d[n] = (dst).d[n] op (src).d[n])

#define mmx_d(src,op,dst)\
  (mmx_lane_d((src),op,(dst),0),\
   mmx_lane_d((src),op,(dst),1))

/*	Unsigned 2x32 lane-wise operation: for each doubleword lane,
	dst.ud[n] = dst.ud[n] op src.ud[n].  Lane 0 is processed first
	and the expression yields the value stored in lane 1.
*/
#define mmx_lane_ud(src,op,dst,n)\
  ((dst).ud[n] = (dst).ud[n] op (src).ud[n])

#define mmx_ud(src,op,dst)\
  (mmx_lane_ud((src),op,(dst),0),\
   mmx_lane_ud((src),op,(dst),1))

/*	Clamp x to the range of a signed 16-bit word.
	Returns 0x7FFF when x is above it, -0x8000 when x is below it,
	and x itself otherwise.

	The lower bound is written as a negative int on purpose: the
	constant 0xFFFF8000 does not fit in int, so it is unsigned and
	would turn the comparison into an unsigned one, sending every
	value in 0..0x7FFF to the negative limit.

	`static` gives each including translation unit its own copy, so
	the header neither needs nor duplicates an external definition.
*/
static inline short _mmx_sat_w(int x)
{
  if (x > 0x7FFF)
    return 0x7FFF;
  if (x < -0x8000)
    return -0x8000;
  return (short)x;
}
/*	Clamp x to the range of an unsigned 16-bit word.
	Returns 0xFFFF when x is above it, 0 when x is negative, and
	x itself otherwise.

	The return type is unsigned short so that values above 0x7FFF
	(including the 0xFFFF limit) come back unchanged instead of being
	squeezed through a signed short.

	`static` gives each including translation unit its own copy, so
	the header neither needs nor duplicates an external definition.
*/
static inline unsigned short _mmx_sat_uw(int x)
{
  if (x > 0xFFFF)
    return 0xFFFF;
  if (x < 0)
    return 0;
  return (unsigned short)x;
}

/*	Signed 4x16 lane-wise operation: for each word lane,
	dst.w[n] = dst.w[n] op src.w[n].  Lanes are processed 0..3 and
	the expression yields the value stored in lane 3.
*/
#define mmx_lane_w(src,op,dst,n)\
  ((dst).w[n] = (dst).w[n] op (src).w[n])

#define mmx_w(src,op,dst)\
  (mmx_lane_w((src),op,(dst),0),\
   mmx_lane_w((src),op,(dst),1),\
   mmx_lane_w((src),op,(dst),2),\
   mmx_lane_w((src),op,(dst),3))
/*	Signed 4x16 lane-wise operation with saturation: for each word
	lane, dst.w[n] = _mmx_sat_w(dst.w[n] op src.w[n]).  Lanes are
	processed 0..3 and the expression yields the value stored in
	lane 3.
*/
#define mmx_lane_sat_w(src,op,dst,n)\
  ((dst).w[n] = _mmx_sat_w((dst).w[n] op (src).w[n]))

#define mmx_sat_w(src,op,dst)\
  (mmx_lane_sat_w((src),op,(dst),0),\
   mmx_lane_sat_w((src),op,(dst),1),\
   mmx_lane_sat_w((src),op,(dst),2),\
   mmx_lane_sat_w((src),op,(dst),3))

/*	Unsigned 4x16 lane-wise operation: for each word lane,
	dst.uw[n] = dst.uw[n] op src.uw[n].  Lanes are processed 0..3
	and the expression yields the value stored in lane 3.
*/
#define mmx_lane_uw(src,op,dst,n)\
  ((dst).uw[n] = (dst).uw[n] op (src).uw[n])

#define mmx_uw(src,op,dst)\
  (mmx_lane_uw((src),op,(dst),0),\
   mmx_lane_uw((src),op,(dst),1),\
   mmx_lane_uw((src),op,(dst),2),\
   mmx_lane_uw((src),op,(dst),3))

/*	Unsigned 4x16 lane-wise operation with saturation: for each word
	lane, dst.uw[n] = _mmx_sat_uw(dst.uw[n] op src.uw[n]).  Lanes
	are processed 0..3 and the expression yields the value stored in
	lane 3.
*/
#define mmx_lane_sat_uw(src,op,dst,n)\
  ((dst).uw[n] = _mmx_sat_uw((dst).uw[n] op (src).uw[n]))

#define mmx_sat_uw(src,op,dst)\
  (mmx_lane_sat_uw((src),op,(dst),0),\
   mmx_lane_sat_uw((src),op,(dst),1),\
   mmx_lane_sat_uw((src),op,(dst),2),\
   mmx_lane_sat_uw((src),op,(dst),3))

/*	8x8 lane-wise operation on the plain-char view: for each byte
	lane, dst.b[n] = dst.b[n] op src.b[n].  Lanes are processed 0..7
	and the expression yields the value stored in lane 7.
*/
#define mmx_lane_b(src,op,dst,n)\
  ((dst).b[n] = (dst).b[n] op (src).b[n])

#define mmx_b(src,op,dst)\
  (mmx_lane_b((src),op,(dst),0),\
   mmx_lane_b((src),op,(dst),1),\
   mmx_lane_b((src),op,(dst),2),\
   mmx_lane_b((src),op,(dst),3),\
   mmx_lane_b((src),op,(dst),4),\
   mmx_lane_b((src),op,(dst),5),\
   mmx_lane_b((src),op,(dst),6),\
   mmx_lane_b((src),op,(dst),7))
/*	Unsigned 8x8 lane-wise operation: for each byte lane,
	dst.ub[n] = dst.ub[n] op src.ub[n].  Lanes are processed 0..7
	and the expression yields the value stored in lane 7.
*/
#define mmx_lane_ub(src,op,dst,n)\
  ((dst).ub[n] = (dst).ub[n] op (src).ub[n])

#define mmx_ub(src,op,dst)\
  (mmx_lane_ub((src),op,(dst),0),\
   mmx_lane_ub((src),op,(dst),1),\
   mmx_lane_ub((src),op,(dst),2),\
   mmx_lane_ub((src),op,(dst),3),\
   mmx_lane_ub((src),op,(dst),4),\
   mmx_lane_ub((src),op,(dst),5),\
   mmx_lane_ub((src),op,(dst),6),\
   mmx_lane_ub((src),op,(dst),7))




/*	1x64 MOVe Quadword
	(this is both a load and a store...
	 in fact, it is the only way to store)

	Emulated as an assignment to the destination's .q member:
	  movq_i2r  reg = (long long) immediate/integer expression
	  movq_m2r  reg = var      (var is an mmx_t lvalue)
	  movq_r2m  var = reg
	  movq_r2r  regd = regs
	Every argument is parenthesized so that operands written as
	expressions (for example *ptr or table[i]) bind correctly to
	the member access.
*/
#define	movq_i2r(var, reg)	((reg).q = (long long)(var))
#define	movq_m2r(var, reg)	((reg).q = (var).q)
#define	movq_r2m(reg, var)	((var).q = (reg).q)
#define	movq_r2r(regs, regd)	((regd).q = (regs).q)


/*	1x32 MOVe Doubleword
	(like movq, this is both load and store...
	 but is most useful for moving things between
	 mmx registers and ordinary registers)

	Emulated on the unsigned doubleword view (.ud):
	  movd_m2r  reg.ud[0] = var, reg.ud[1] = 0
	  movd_r2m  var = reg.ud[0]
	  movd_r2r  regd.ud[0] = regs.ud[0], regd.ud[1] = 0
	The 32-bit lanes are used (not the 16-bit .uw lanes) so the
	whole doubleword is transferred and the other doubleword of the
	destination register is cleared.
*/
#define	movd_m2r(var, reg)	((reg).ud[0] = (unsigned)(var), (reg).ud[1] = 0)
#define	movd_r2m(reg, var)	((var) = (reg).ud[0])
#define	movd_r2r(regs, regd)	((regd).ud[0] = (regs).ud[0], (regd).ud[1] = 0)


/*	2x32 (paddd), 4x16 (paddw) and 8x8 (paddb) Parallel ADDs.
	Each lane of the destination gets dst + src using the C `+`
	operator through mmx_d / mmx_w / mmx_b.  The memory-source
	form is the same operation as the register-source form.
*/
#define	paddd_r2r(regs, regd)	mmx_d(regs, +, regd)
#define	paddd_m2r(var, reg)	paddd_r2r(var, reg)


#define	paddw_r2r(regs, regd)	mmx_w(regs, +, regd)
#define	paddw_m2r(var, reg)	paddw_r2r(var, reg)


#define	paddb_r2r(regs, regd)	mmx_b(regs, +, regd)
#define	paddb_m2r(var, reg)	paddb_r2r(var, reg)



/*	4x16 Parallel ADD using (signed) Saturation arithmetic.
	Each word lane gets the saturated dst + src via mmx_sat_w.
	Only the 4x16 form is defined here.
*/
#define	paddsw_r2r(regs, regd)	mmx_sat_w(regs,+,regd)
#define	paddsw_m2r(var, reg)	paddsw_r2r(var, reg)

/*
#define	paddsb_m2r(var, reg)	mmx_m2r(paddsb, var, reg)
#define	paddsb_r2r(regs, regd)	mmx_r2r(paddsb, regs, regd)
*/

/*	4x16 Parallel ADD using Unsigned Saturation arithmetic.
	Each word lane gets the saturated dst + src via mmx_sat_uw.
	Only the 4x16 form is defined here.
*/
#define	paddusw_r2r(regs, regd)	mmx_sat_uw(regs,+,regd)
#define	paddusw_m2r(var, reg)	paddusw_r2r(var, reg)

/*
#define	paddusb_m2r(var, reg)	mmx_m2r(paddusb, var, reg)
#define	paddusb_r2r(regs, regd)	mmx_r2r(paddusb, regs, regd)
*/

/*	2x32 (psubd) and 4x16 (psubw) Parallel SUBs.
	Each lane of the destination gets dst - src using the C `-`
	operator through mmx_d / mmx_w.  Only these two widths are
	defined here.
*/
#define	psubd_r2r(regs, regd)	mmx_d(regs,-,regd)
#define	psubd_m2r(var, reg)	psubd_r2r(var, reg)

#define	psubw_r2r(regs, regd)	mmx_w(regs,-,regd)
#define	psubw_m2r(var, reg)	psubw_r2r(var, reg)

/*
#define	psubb_m2r(var, reg)	mmx_m2r(psubb, var, reg)
#define	psubb_r2r(regs, regd)	mmx_r2r(psubb, regs, regd)
*/


/*	4x16 Parallel SUB using (signed) Saturation arithmetic.
	Each word lane gets the saturated dst - src via mmx_sat_w.
	Only the 4x16 form is defined here.
*/
#define	psubsw_r2r(regs, regd)	mmx_sat_w(regs,-,regd)
#define	psubsw_m2r(var, reg)	psubsw_r2r(var, reg)

/*
#define	psubsb_m2r(var, reg)	mmx_m2r(psubsb, var, reg)
#define	psubsb_r2r(regs, regd)	mmx_r2r(psubsb, regs, regd)
*/


/*	4x16 Parallel SUB using Unsigned Saturation arithmetic.
	Each word lane gets the saturated dst - src via mmx_sat_uw.
	Only the 4x16 form is defined here.
*/
#define	psubusw_r2r(regs, regd)	mmx_sat_uw(regs,-,regd)
#define	psubusw_m2r(var, reg)	psubusw_r2r(var, reg)

/*
#define	psubusb_m2r(var, reg)	mmx_m2r(psubusb, var, reg)
#define	psubusb_r2r(regs, regd)	mmx_r2r(psubusb, regs, regd)
*/

/*	4x16 Parallel MULs giving Low 4x16 portions of results.
	Each word lane gets dst * src through mmx_w; storing the
	product back into the 16-bit lane keeps its low portion.
*/
#define	pmullw_r2r(regs, regd)	mmx_w(regs,*,regd)
#define	pmullw_m2r(var, reg)	pmullw_r2r(var, reg)



/*	4x16 Parallel MULs giving High 4x16 portions of results.
	For each signed word lane the product dst * src is shifted
	right by 16 and stored back into the destination lane.  Lanes
	are processed 0..3; the expression yields the value stored in
	lane 3.  The memory-source form is the same operation.
*/
#define	pmulhw_lane(src, dst, n)\
  ((dst).w[n] = ((dst).w[n] * (src).w[n])>>16)

#define	pmulhw_r2r(src, dst)\
  (pmulhw_lane((src), (dst), 0),\
   pmulhw_lane((src), (dst), 1),\
   pmulhw_lane((src), (dst), 2),\
   pmulhw_lane((src), (dst), 3))

#define	pmulhw_m2r(src, dst) pmulhw_r2r(src, dst)



/*	4x16->2x32 Parallel Mul-ADD
	(muls like pmullw, then adds adjacent 16-bit fields
	 in the multiply result to make the final 2x32 result)
*/

/*
#define	pmaddwd_m2r(var, reg)	mmx_m2r(pmaddwd, var, reg)
#define	pmaddwd_r2r(regs, regd)	mmx_r2r(pmaddwd, regs, regd)
*/


/*	1x64 bitwise AND
	reg = reg & var  (pand_m2r),  regd = regd & regs  (pand_r2r),
	computed on the unsigned 64-bit view (.uq).
	The assignment is written out here so the destination is
	actually updated; the value of the expression is the new
	destination value.
*/

#define	pand_m2r(var, reg)	((reg).uq = (reg).uq & (var).uq)
#define	pand_r2r(regs, regd)	((regd).uq = (regd).uq & (regs).uq)



/*	1x64 bitwise AND with Not the destination
*/
/*
#define	pandn_m2r(var, reg)	mmx_m2r(pandn, var, reg)
#define	pandn_r2r(regs, regd)	mmx_r2r(pandn, regs, regd)
#define	pandn(vars, vard)	mmx_m2m(pandn, vars, vard)
*/

/*	1x64 bitwise OR
	reg = reg | var  (por_m2r),  regd = regd | regs  (por_r2r),
	computed on the unsigned 64-bit view (.uq).
	The assignment is written out here so the destination is
	actually updated; the value of the expression is the new
	destination value.
*/
#define	por_m2r(var, reg)	((reg).uq = (reg).uq | (var).uq)
#define	por_r2r(regs, regd)	((regd).uq = (regd).uq | (regs).uq)



/*	1x64 bitwise eXclusive OR
	reg = reg ^ var  (pxor_m2r),  regd = regd ^ regs  (pxor_r2r),
	computed on the unsigned 64-bit view (.uq).
	The assignment is written out here so the destination is
	actually updated; the value of the expression is the new
	destination value.
*/
#define	pxor_m2r(var, reg)	((reg).uq = (reg).uq ^ (var).uq)
#define	pxor_r2r(regs, regd)	((regd).uq = (regd).uq ^ (regs).uq)



/*	2x32, 4x16, and 8x8 Parallel CoMPare for EQuality
	(resulting fields are either 0 or -1)

	Each macro only forwards the instruction name and its two
	operands to mmx_m2r, mmx_r2r or mmx_m2m.
	NOTE(review): none of those three helpers is defined in the
	visible part of this header, so these macros are usable only if
	the including code supplies them -- TODO confirm, or add a C
	emulation like the arithmetic macros have.
*/
#define	pcmpeqd_m2r(var, reg)	mmx_m2r(pcmpeqd, var, reg)
#define	pcmpeqd_r2r(regs, regd)	mmx_r2r(pcmpeqd, regs, regd)
#define	pcmpeqd(vars, vard)	mmx_m2m(pcmpeqd, vars, vard)

#define	pcmpeqw_m2r(var, reg)	mmx_m2r(pcmpeqw, var, reg)
#define	pcmpeqw_r2r(regs, regd)	mmx_r2r(pcmpeqw, regs, regd)
#define	pcmpeqw(vars, vard)	mmx_m2m(pcmpeqw, vars, vard)

#define	pcmpeqb_m2r(var, reg)	mmx_m2r(pcmpeqb, var, reg)
#define	pcmpeqb_r2r(regs, regd)	mmx_r2r(pcmpeqb, regs, regd)
#define	pcmpeqb(vars, vard)	mmx_m2m(pcmpeqb, vars, vard)


/*	2x32, 4x16, and 8x8 Parallel CoMPare for Greater Than
	(resulting fields are either 0 or -1)

	Each macro only forwards the instruction name and its two
	operands to mmx_m2r, mmx_r2r or mmx_m2m.
	NOTE(review): none of those three helpers is defined in the
	visible part of this header, so these macros are usable only if
	the including code supplies them -- TODO confirm, or add a C
	emulation like the arithmetic macros have.
*/
#define	pcmpgtd_m2r(var, reg)	mmx_m2r(pcmpgtd, var, reg)
#define	pcmpgtd_r2r(regs, regd)	mmx_r2r(pcmpgtd, regs, regd)
#define	pcmpgtd(vars, vard)	mmx_m2m(pcmpgtd, vars, vard)

#define	pcmpgtw_m2r(var, reg)	mmx_m2r(pcmpgtw, var, reg)
#define	pcmpgtw_r2r(regs, regd)	mmx_r2r(pcmpgtw, regs, regd)
#define	pcmpgtw(vars, vard)	mmx_m2m(pcmpgtw, vars, vard)

#define	pcmpgtb_m2r(var, reg)	mmx_m2r(pcmpgtb, var, reg)
#define	pcmpgtb_r2r(regs, regd)	mmx_r2r(pcmpgtb, regs, regd)
#define	pcmpgtb(vars, vard)	mmx_m2m(pcmpgtb, vars, vard)


/*	1x64, 2x32, and 4x16 Parallel Shift Left Logical

	Each macro only forwards the instruction name and its two
	operands to mmx_m2r, mmx_r2r or mmx_m2m; the immediate (_i2r)
	forms are routed through mmx_m2r exactly like the memory forms.
	NOTE(review): none of those three helpers is defined in the
	visible part of this header, so these macros are usable only if
	the including code supplies them -- TODO confirm, or add a C
	emulation like the arithmetic macros have.
*/
#define	psllq_i2r(imm, reg)	mmx_m2r(psllq, imm, reg)
#define	psllq_m2r(var, reg)	mmx_m2r(psllq, var, reg)
#define	psllq_r2r(regs, regd)	mmx_r2r(psllq, regs, regd)
#define	psllq(vars, vard)	mmx_m2m(psllq, vars, vard)

#define	pslld_i2r(imm, reg)	mmx_m2r(pslld, imm, reg)
#define	pslld_m2r(var, reg)	mmx_m2r(pslld, var, reg)
#define	pslld_r2r(regs, regd)	mmx_r2r(pslld, regs, regd)
#define	pslld(vars, vard)	mmx_m2m(pslld, vars, vard)

#define	psllw_i2r(imm, reg)	mmx_m2r(psllw, imm, reg)
#define	psllw_m2r(var, reg)	mmx_m2r(psllw, var, reg)
#define	psllw_r2r(regs, regd)	mmx_r2r(psllw, regs, regd)
#define	psllw(vars, vard)	mmx_m2m(psllw, vars, vard)


/*	1x64, 2x32, and 4x16 Parallel Shift Right Logical

	Each macro only forwards the instruction name and its two
	operands to mmx_m2r, mmx_r2r or mmx_m2m; the immediate (_i2r)
	forms are routed through mmx_m2r exactly like the memory forms.
	NOTE(review): none of those three helpers is defined in the
	visible part of this header, so these macros are usable only if
	the including code supplies them -- TODO confirm, or add a C
	emulation like the arithmetic macros have.
*/
#define	psrlq_i2r(imm, reg)	mmx_m2r(psrlq, imm, reg)
#define	psrlq_m2r(var, reg)	mmx_m2r(psrlq, var, reg)
#define	psrlq_r2r(regs, regd)	mmx_r2r(psrlq, regs, regd)
#define	psrlq(vars, vard)	mmx_m2m(psrlq, vars, vard)

#define	psrld_i2r(imm, reg)	mmx_m2r(psrld, imm, reg)
#define	psrld_m2r(var, reg)	mmx_m2r(psrld, var, reg)
#define	psrld_r2r(regs, regd)	mmx_r2r(psrld, regs, regd)
#define	psrld(vars, vard)	mmx_m2m(psrld, vars, vard)

#define	psrlw_i2r(imm, reg)	mmx_m2r(psrlw, imm, reg)
#define	psrlw_m2r(var, reg)	mmx_m2r(psrlw, var, reg)
#define	psrlw_r2r(regs, regd)	mmx_r2r(psrlw, regs, regd)
#define	psrlw(vars, vard)	mmx_m2m(psrlw, vars, vard)


/*	2x32 and 4x16 Parallel Shift Right Arithmetic

	Each macro only forwards the instruction name and its two
	operands to mmx_m2r, mmx_r2r or mmx_m2m; the immediate (_i2r)
	forms are routed through mmx_m2r exactly like the memory forms.
	NOTE(review): none of those three helpers is defined in the
	visible part of this header, so these macros are usable only if
	the including code supplies them -- TODO confirm, or add a C
	emulation like the arithmetic macros have.
*/
#define	psrad_i2r(imm, reg)	mmx_m2r(psrad, imm, reg)
#define	psrad_m2r(var, reg)	mmx_m2r(psrad, var, reg)
#define	psrad_r2r(regs, regd)	mmx_r2r(psrad, regs, regd)
#define	psrad(vars, vard)	mmx_m2m(psrad, vars, vard)

#define	psraw_i2r(imm, reg)	mmx_m2r(psraw, imm, reg)
#define	psraw_m2r(var, reg)	mmx_m2r(psraw, var, reg)
#define	psraw_r2r(regs, regd)	mmx_r2r(psraw, regs, regd)
#define	psraw(vars, vard)	mmx_m2m(psraw, vars, vard)


/*	2x32->4x16 and 4x16->8x8 PACK and Signed Saturate
	(packs source and dest fields into dest in that order)

	Each macro only forwards the instruction name and its two
	operands to mmx_m2r, mmx_r2r or mmx_m2m.
	NOTE(review): none of those three helpers is defined in the
	visible part of this header, so these macros are usable only if
	the including code supplies them -- TODO confirm, or add a C
	emulation like the arithmetic macros have.
*/
#define	packssdw_m2r(var, reg)	mmx_m2r(packssdw, var, reg)
#define	packssdw_r2r(regs, regd) mmx_r2r(packssdw, regs, regd)
#define	packssdw(vars, vard)	mmx_m2m(packssdw, vars, vard)

#define	packsswb_m2r(var, reg)	mmx_m2r(packsswb, var, reg)
#define	packsswb_r2r(regs, regd) mmx_r2r(packsswb, regs, regd)
#define	packsswb(vars, vard)	mmx_m2m(packsswb, vars, vard)


/*	4x16->8x8 PACK and Unsigned Saturate
	(packs source and dest fields into dest in that order)

	Each macro only forwards the instruction name and its two
	operands to mmx_m2r, mmx_r2r or mmx_m2m.
	NOTE(review): none of those three helpers is defined in the
	visible part of this header, so these macros are usable only if
	the including code supplies them -- TODO confirm, or add a C
	emulation like the arithmetic macros have.
*/
#define	packuswb_m2r(var, reg)	mmx_m2r(packuswb, var, reg)
#define	packuswb_r2r(regs, regd) mmx_r2r(packuswb, regs, regd)
#define	packuswb(vars, vard)	mmx_m2m(packuswb, vars, vard)


/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK Low
	(interleaves low half of dest with low half of source
	 as padding in each result field)

	Each macro only forwards the instruction name and its two
	operands to mmx_m2r, mmx_r2r or mmx_m2m.
	NOTE(review): none of those three helpers is defined in the
	visible part of this header, so these macros are usable only if
	the including code supplies them -- TODO confirm, or add a C
	emulation like the arithmetic macros have.
*/
#define	punpckldq_m2r(var, reg)	mmx_m2r(punpckldq, var, reg)
#define	punpckldq_r2r(regs, regd) mmx_r2r(punpckldq, regs, regd)
#define	punpckldq(vars, vard)	mmx_m2m(punpckldq, vars, vard)

#define	punpcklwd_m2r(var, reg)	mmx_m2r(punpcklwd, var, reg)
#define	punpcklwd_r2r(regs, regd) mmx_r2r(punpcklwd, regs, regd)
#define	punpcklwd(vars, vard)	mmx_m2m(punpcklwd, vars, vard)

#define	punpcklbw_m2r(var, reg)	mmx_m2r(punpcklbw, var, reg)
#define	punpcklbw_r2r(regs, regd) mmx_r2r(punpcklbw, regs, regd)
#define	punpcklbw(vars, vard)	mmx_m2m(punpcklbw, vars, vard)


/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK High
	(interleaves high half of dest with high half of source
	 as padding in each result field)

	Each macro only forwards the instruction name and its two
	operands to mmx_m2r, mmx_r2r or mmx_m2m.
	NOTE(review): none of those three helpers is defined in the
	visible part of this header, so these macros are usable only if
	the including code supplies them -- TODO confirm, or add a C
	emulation like the arithmetic macros have.
*/
#define	punpckhdq_m2r(var, reg)	mmx_m2r(punpckhdq, var, reg)
#define	punpckhdq_r2r(regs, regd) mmx_r2r(punpckhdq, regs, regd)
#define	punpckhdq(vars, vard)	mmx_m2m(punpckhdq, vars, vard)

#define	punpckhwd_m2r(var, reg)	mmx_m2r(punpckhwd, var, reg)
#define	punpckhwd_r2r(regs, regd) mmx_r2r(punpckhwd, regs, regd)
#define	punpckhwd(vars, vard)	mmx_m2m(punpckhwd, vars, vard)

#define	punpckhbw_m2r(var, reg)	mmx_m2r(punpckhbw, var, reg)
#define	punpckhbw_r2r(regs, regd) mmx_r2r(punpckhbw, regs, regd)
#define	punpckhbw(vars, vard)	mmx_m2m(punpckhbw, vars, vard)


/*	Empty MMx State
	(used to clean-up when going from mmx to float use
	 of the registers that are shared by both; note that
	 there is no float-to-mmx operation needed, because
	 only the float tag word info is corruptible)

	With MMX_TRACE defined, the call is also logged to stderr; that
	variant uses fprintf, so the including file must have pulled in
	<stdio.h>.  The traced form is wrapped in do { } while (0) so
	that `emms();` is a single statement and stays valid as the body
	of an unbraced if/else.
	NOTE(review): both variants still issue the real `emms`
	instruction through inline assembly, unlike the C-emulated
	macros in this header -- confirm that is intended.
*/
#ifdef	MMX_TRACE

#define	emms() \
	do { \
		fprintf(stderr, "emms()\n"); \
		__asm__ __volatile__ ("emms"); \
	} while (0)

#else

#define	emms()			__asm__ __volatile__ ("emms")

#endif

#endif

