// LatThpt3.h : c't/Andreas Stiller, Sep 2006 
// based on latency.c and LatThpt.h from Intel 
// see "Measuring Instruction Latency and Throughput"
//http://www.intel.com/cd/ids/developer/asmo-na/eng/dc/pentium4/optimization/20243.htm


//******************************************************************************
// DEFINES
//******************************************************************************

// Number of loop iterations per timed pass; Run() loads iLatThpt with this
// value before calling the generated code.
#define LatThpt_LOOPCOUNT		8000
// Number of template copies Run() asks copycode() to place back to back.
#define LatThpt_REP             25 
// Default instruction count per template expansion (most Lat_*/Thpt_* macros
// emit eight instructions).
#define LatThpt_INSTRUCTIONS    8 
// Number of timed passes in Run(); the smallest result is reported.
#define reploop 20
// When true, Run() calls Sleep(1) before every timed pass.
// Initialised with `false` (not the Win32 FALSE macro) to match WithWait.
bool WithSleep=false;
// Not referenced in this part of the file.
bool WithWait=false;
//
// XMM Macros
//
 
// Default operands for callers of the *Imm macros and of Thpt_Int_Xmm.
// None of them is referenced directly in this part of the file.
// NOTE(review): 0xE4 looks like an identity shuffle selector - confirm at the call sites.
#define imm1 0xE4
#define imm2 0 
#define imm3 0x12345678 
// Default integer instruction. Thpt_Int_Xmm has a parameter that is also
// called icmd; inside that macro the parameter takes precedence.
#define icmd imul

// Lat_Xmm(x, b): emits instruction x eight times on XMM registers, chained so
// that the first operand of each instruction is the second operand of the
// next one (xmm0,xmm7 / xmm1,xmm0 / ... / xmm7,xmm6). The chain closes on
// itself when the expansion is repeated. Parameter b is not used.
#define	Lat_Xmm( x,b )			__asm x	xmm0,xmm7		\
								__asm x	xmm1,xmm0		\
								__asm x	xmm2,xmm1		\
								__asm x	xmm3,xmm2		\
								__asm x	xmm4,xmm3		\
								__asm x	xmm5,xmm4		\
								__asm x	xmm6,xmm5		\
								__asm x	xmm7,xmm6

// Lat_XmmImm(x, imm): same chained register pattern as Lat_Xmm, for
// instructions that take a third operand; imm is passed as that operand to
// all eight instructions.
#define	Lat_XmmImm( x,imm )		    __asm x	xmm0,xmm7,imm	\
								__asm x	xmm1,xmm0,imm	\
								__asm x	xmm2,xmm1,imm	\
								__asm x	xmm3,xmm2,imm	\
								__asm x	xmm4,xmm3,imm	\
								__asm x	xmm5,xmm4,imm	\
								__asm x	xmm6,xmm5,imm	\
								__asm x	xmm7,xmm6,imm

// Lat_XmmMem(x, b): alternates "x xmmN,aZero" and "x aZero,xmmN" for N = 0..3,
// i.e. eight instructions that bounce a value between a register and the
// memory operand aZero (declared by LatThpt_Init()). Parameter b is not used.
#define	Lat_XmmMem( x,b )	        __asm x	xmm0,aZero		\
								__asm x	aZero,xmm0		\
								__asm x	xmm1,aZero		\
								__asm x	aZero,xmm1		\
								__asm x	xmm2,aZero		\
								__asm x	aZero,xmm2		\
								__asm x	xmm3,aZero		\
								__asm x	aZero,xmm3

// Thpt_Xmm(x, b): emits instruction x on the register pairs (xmm0,xmm1),
// (xmm2,xmm3), (xmm4,xmm5), (xmm6,xmm7); the four-instruction sequence is
// emitted twice, giving eight instructions. Parameter b is not used.
#define	Thpt_Xmm( x,b )			__asm x	xmm0,xmm1		\
								__asm x	xmm2,xmm3		\
								__asm x	xmm4,xmm5		\
								__asm x	xmm6,xmm7		\
								__asm x	xmm0,xmm1		\
								__asm x	xmm2,xmm3		\
								__asm x	xmm4,xmm5		\
								__asm x	xmm6,xmm7

// Thpt_XmmImm(x, imm): same register pairs as Thpt_Xmm, with imm passed as a
// third operand to each of the eight instructions.
#define	Thpt_XmmImm( x,imm )	    __asm x	xmm0,xmm1,imm	\
								__asm x	xmm2,xmm3,imm	\
								__asm x	xmm4,xmm5,imm	\
								__asm x	xmm6,xmm7,imm	\
								__asm x	xmm0,xmm1,imm	\
								__asm x	xmm2,xmm3,imm	\
								__asm x	xmm4,xmm5,imm	\
								__asm x	xmm6,xmm7,imm

// Thpt_XmmMemLoad(x, b): emits x with xmm0..xmm7 as first operand and a memory
// second operand: aZero, aZero+16, aZero+32, aZero+48 for xmm0..xmm3 and plain
// aZero for xmm4..xmm7. Parameter b is not used.
// NOTE(review): xmm4..xmm7 all reuse aZero instead of continuing the offsets -
// confirm this is intended.
#define	Thpt_XmmMemLoad( x,b )	__asm x	xmm0,aZero		\
								__asm x	xmm1,aZero+16	\
								__asm x	xmm2,aZero+32	\
								__asm x	xmm3,aZero+48	\
								__asm x	xmm4,aZero		\
								__asm x	xmm5,aZero		\
								__asm x	xmm6,aZero		\
								__asm x	xmm7,aZero

// Thpt_XmmImmMemLoad(x, imm): emits "x xmmN,aZero,imm" for N = 0..7; every
// instruction uses the same memory operand aZero and the same third operand.
#define	Thpt_XmmImmMemLoad( x,imm )	__asm x	xmm0,aZero,imm	\
								__asm x	xmm1,aZero,imm	\
								__asm x	xmm2,aZero,imm	\
								__asm x	xmm3,aZero,imm	\
								__asm x	xmm4,aZero,imm	\
								__asm x	xmm5,aZero,imm	\
								__asm x	xmm6,aZero,imm	\
								__asm x	xmm7,aZero,imm

// Thpt_XmmMemStore(x, b): emits x with a memory first operand and xmm0..xmm7 as
// second operand. The memory operands are aStore+0, +16, ... +112, one
// distinct slot per register (aStore is declared by LatThpt_Init()).
// Parameter b is not used.
#define	Thpt_XmmMemStore( x,b )	__asm x	aStore,xmm0		\
								__asm x	aStore+16,xmm1	\
								__asm x	aStore+32,xmm2	\
								__asm x	aStore+48,xmm3	\
								__asm x	aStore+64,xmm4	\
								__asm x	aStore+80,xmm5	\
								__asm x	aStore+96,xmm6	\
								__asm x	aStore+112,xmm7

// Thpt_MMx2Xmm(x, b): emits "x xmmN,mmN" for N = 0..7 (XMM register as first
// operand, the equally numbered MMX register as second). Parameter b is not used.
#define	Thpt_MMx2Xmm( x,b )		__asm x	xmm0,mm0		\
								__asm x	xmm1,mm1		\
								__asm x	xmm2,mm2		\
								__asm x	xmm3,mm3		\
								__asm x	xmm4,mm4		\
								__asm x	xmm5,mm5		\
								__asm x	xmm6,mm6		\
								__asm x	xmm7,mm7

// Thpt_Xmm2Mxx(x, b): emits "x mmN,xmmN" for N = 0..7 (MMX register as first
// operand, the equally numbered XMM register as second). Parameter b is not
// used. Note that the macro name is spelled "Mxx", not "Mmx".
#define	Thpt_Xmm2Mxx( x,b )		__asm x	mm0,xmm0		\
								__asm x	mm1,xmm1		\
								__asm x	mm2,xmm2		\
								__asm x	mm3,xmm3		\
								__asm x	mm4,xmm4		\
								__asm x	mm5,xmm5		\
								__asm x	mm6,xmm6		\
								__asm x	mm7,xmm7

// Thpt_Reg2Xmm(x, b): emits "x xmmN,eax" for N = 0..7; eax is the second
// operand of every instruction. Parameter b is not used.
#define	Thpt_Reg2Xmm( x,b )		__asm x	xmm0,eax		\
								__asm x	xmm1,eax		\
								__asm x	xmm2,eax		\
								__asm x	xmm3,eax		\
								__asm x	xmm4,eax		\
								__asm x	xmm5,eax		\
								__asm x	xmm6,eax		\
								__asm x	xmm7,eax

// Thpt_Xmm2Reg(x, b): emits "x eax,xmmN" for N = 0..7; eax is the first
// operand of every instruction. Parameter b is not used.
#define	Thpt_Xmm2Reg( x,b )		__asm x	eax,xmm0		\
								__asm x	eax,xmm1		\
								__asm x	eax,xmm2		\
								__asm x	eax,xmm3		\
								__asm x	eax,xmm4		\
								__asm x	eax,xmm5		\
								__asm x	eax,xmm6		\
								__asm x	eax,xmm7


// Thpt_Reg2XmmImm(x, imm): emits "x xmmN,eax,imm" for N = 0..7.
#define	Thpt_Reg2XmmImm( x,imm )  	__asm x	xmm0,eax,imm	\
								__asm x	xmm1,eax,imm	\
								__asm x	xmm2,eax,imm	\
								__asm x	xmm3,eax,imm	\
								__asm x	xmm4,eax,imm	\
								__asm x	xmm5,eax,imm	\
								__asm x	xmm6,eax,imm	\
								__asm x	xmm7,eax,imm

// Thpt_Xmm2RegImm(x, imm): emits "x eax,xmmN,imm" for N = 0..7.
#define	Thpt_Xmm2RegImm( x,imm )    __asm x	eax,xmm0,imm	\
								__asm x	eax,xmm1,imm	\
								__asm x	eax,xmm2,imm	\
								__asm x	eax,xmm3,imm	\
								__asm x	eax,xmm4,imm	\
								__asm x	eax,xmm5,imm	\
								__asm x	eax,xmm6,imm	\
								__asm x	eax,xmm7,imm

// Lat_Int(x, b): emits instruction x on the chained integer register pairs
// (eax,edx), (ebx,eax), (ecx,ebx), (edx,ecx); the four-instruction chain is
// emitted twice, giving eight instructions. Parameter b is not used.
#define	Lat_Int( x,b )			__asm x	eax,edx		\
								__asm x	ebx,eax		\
								__asm x	ecx,ebx		\
								__asm x	edx,ecx		\
								__asm x	eax,edx		\
								__asm x	ebx,eax		\
								__asm x	ecx,ebx		\
								__asm x	edx,ecx

// Thpt_Int(x, b): emits x with eax, ebx, ecx, edx as first operand and ebp as
// second operand; the four-instruction sequence is emitted twice.
// Run() loads ebp with 0x12345678 before the generated code is called.
// Parameter b is not used.
#define	Thpt_Int( x,b )			__asm x	eax,ebp		\
								__asm x	ebx,ebp		\
								__asm x	ecx,ebp		\
								__asm x	edx,ebp		\
								__asm x	eax,ebp		\
								__asm x	ebx,ebp		\
								__asm x	ecx,ebp		\
								__asm x	edx,ebp

// Thpt_IntImm(x, imm): emits "x reg,imm" for reg = eax, ebx, ecx, edx; the
// four-instruction sequence is emitted twice.
#define	Thpt_IntImm( x,imm )	   __asm x	eax,imm		\
								__asm x	ebx,imm		\
								__asm x	ecx,imm		\
								__asm x	edx,imm		\
								__asm x	eax,imm		\
								__asm x	ebx,imm		\
								__asm x	ecx,imm		\
								__asm x	edx,imm

// Thpt_IntMem(x, b): emits "x reg,aStore" for reg = eax, ebx, ecx, edx; the
// four-instruction sequence is emitted twice. All eight instructions use the
// same memory operand aStore (declared by LatThpt_Init()). Parameter b is not used.
#define	Thpt_IntMem( x,b )		__asm x	eax,aStore	\
								__asm x	ebx,aStore	\
								__asm x	ecx,aStore	\
								__asm x	edx,aStore	\
								__asm x	eax,aStore	\
								__asm x	ebx,aStore	\
								__asm x	ecx,aStore	\
								__asm x	edx,aStore


// Thpt_Int_Xmm(x, icmd): interleaves the XMM instruction x (register pairs as
// in Thpt_Xmm) with the integer instruction icmd (eax/ebx/ecx/edx with ebp as
// second operand). The expansion contains 16 instructions: 8 of x and 8 of
// icmd. The parameter icmd hides the file-level "icmd" macro inside this body.
#define	Thpt_Int_Xmm( x,icmd  )	__asm x	xmm0,xmm1		\
								__asm icmd eax,ebp         \
								__asm x	xmm2,xmm3		\
								__asm icmd ebx,ebp	        \
								__asm x	xmm4,xmm5		\
								__asm icmd ecx,ebp			\
								__asm x	xmm6,xmm7		\
								__asm icmd edx,ebp			\
								__asm x	xmm0,xmm1		\
								__asm icmd eax,ebp			\
								__asm x	xmm2,xmm3		\
								__asm icmd ebx,ebp			\
								__asm x	xmm4,xmm5		\
								__asm icmd ecx,ebp			\
								__asm x	xmm6,xmm7       \
								__asm icmd edx,ebp

// Thpt_Int_Int(x, y): alternates two integer instructions: "x eax,ebx",
// "y ecx,edx", "x esi,edi", "y ebp,esp". The group of four is emitted four
// times, giving 16 instructions (8 of x, 8 of y).
// NOTE(review): y is applied with ebp as first and esp as second operand -
// confirm the instructions passed as y are safe with these registers.
#define Thpt_Int_Int( x,y ) \
								__asm x eax,ebx \
                                __asm y ecx,edx \
                                __asm x esi,edi \
								__asm y ebp,esp \
								__asm x eax,ebx \
                                __asm y ecx,edx \
                                __asm x esi,edi \
								__asm y ebp,esp \
								__asm x eax,ebx \
                                __asm y ecx,edx \
                                __asm x esi,edi \
								__asm y ebp,esp \
								__asm x eax,ebx \
                                __asm y ecx,edx \
                                __asm x esi,edi \
								__asm y ebp,esp  


//
// MMX Macros
//
// Lat_Mmx(x, b): MMX counterpart of Lat_Xmm. Emits x eight times on the
// chained pairs (mm0,mm7), (mm1,mm0), ... (mm7,mm6). Parameter b is not used.
#define	Lat_Mmx( x,b )			__asm x	mm0,mm7			\
								__asm x	mm1,mm0			\
								__asm x	mm2,mm1			\
								__asm x	mm3,mm2			\
								__asm x	mm4,mm3			\
								__asm x	mm5,mm4			\
								__asm x	mm6,mm5			\
								__asm x	mm7,mm6

// Lat_MmxImm(x, imm): same chained MMX register pattern as Lat_Mmx, with imm
// passed as a third operand to each of the eight instructions.
#define	Lat_MmxImm( x,imm )  	__asm x	mm0,mm7,imm		\
								__asm x	mm1,mm0,imm		\
								__asm x	mm2,mm1,imm		\
								__asm x	mm3,mm2,imm		\
								__asm x	mm4,mm3,imm		\
								__asm x	mm5,mm4,imm		\
								__asm x	mm6,mm5,imm		\
								__asm x	mm7,mm6,imm

// Lat_MmxMem(x, b): alternates "x mmN,aZero" and "x aZero,mmN" for N = 0..3,
// bouncing a value between an MMX register and the memory operand aZero
// (declared by LatThpt_Init()). Parameter b is not used.
#define	Lat_MmxMem( x,b )		__asm x	mm0,aZero		\
								__asm x	aZero,mm0		\
								__asm x	mm1,aZero		\
								__asm x	aZero,mm1		\
								__asm x	mm2,aZero		\
								__asm x	aZero,mm2		\
								__asm x	mm3,aZero		\
								__asm x	aZero,mm3



// Thpt_Mmx(x, b): emits x on the pairs (mm0,mm1), (mm2,mm3), (mm4,mm5),
// (mm6,mm7); the four-instruction sequence is emitted twice.
// Parameter b is not used.
#define	Thpt_Mmx( x,b )			__asm x	mm0,mm1			\
								__asm x	mm2,mm3			\
								__asm x	mm4,mm5			\
								__asm x	mm6,mm7			\
								__asm x	mm0,mm1			\
								__asm x	mm2,mm3			\
								__asm x	mm4,mm5			\
								__asm x	mm6,mm7

// Thpt_MmxImm(x, imm): same register pairs as Thpt_Mmx, with imm passed as a
// third operand to each of the eight instructions.
#define	Thpt_MmxImm( x,imm )	__asm x	mm0,mm1,imm		\
								__asm x	mm2,mm3,imm		\
								__asm x	mm4,mm5,imm		\
								__asm x	mm6,mm7,imm		\
								__asm x	mm0,mm1,imm		\
								__asm x	mm2,mm3,imm		\
								__asm x	mm4,mm5,imm		\
								__asm x	mm6,mm7,imm

// Thpt_MmxMemLoad(x, b): emits "x mmN,aZero" for N = 0..7; all eight
// instructions use the same memory operand aZero. Parameter b is not used.
#define	Thpt_MmxMemLoad( x,b )	__asm x	mm0,aZero		\
								__asm x	mm1,aZero		\
								__asm x	mm2,aZero		\
								__asm x	mm3,aZero		\
								__asm x	mm4,aZero		\
								__asm x	mm5,aZero		\
								__asm x	mm6,aZero		\
								__asm x	mm7,aZero

// Thpt_MmxImmMemLoad(x, imm): emits "x mmN,aZero,imm" for N = 0..7.
#define	Thpt_MmxImmMemLoad( x,imm )	__asm x	mm0,aZero,imm	\
								__asm x	mm1,aZero,imm	\
								__asm x	mm2,aZero,imm	\
								__asm x	mm3,aZero,imm	\
								__asm x	mm4,aZero,imm	\
								__asm x	mm5,aZero,imm	\
								__asm x	mm6,aZero,imm	\
								__asm x	mm7,aZero,imm

// Thpt_MmxMemStore(x, b): emits x with a memory first operand and mm0..mm7 as
// second operand. The memory operands are aStore+0, +8, ... +56, one distinct
// slot per register. Parameter b is not used.
#define	Thpt_MmxMemStore( x,b )	__asm x	aStore,mm0		\
								__asm x	aStore+8,mm1	\
								__asm x	aStore+16,mm2	\
								__asm x	aStore+24,mm3	\
								__asm x	aStore+32,mm4	\
								__asm x	aStore+40,mm5	\
								__asm x	aStore+48,mm6	\
								__asm x	aStore+56,mm7

// Thpt_Reg2Mmx(x, b): emits "x mmN,eax" for N = 0..7. Parameter b is not used.
#define	Thpt_Reg2Mmx( x,b )		__asm x	mm0,eax			\
								__asm x	mm1,eax			\
								__asm x	mm2,eax			\
								__asm x	mm3,eax			\
								__asm x	mm4,eax			\
								__asm x	mm5,eax			\
								__asm x	mm6,eax			\
								__asm x	mm7,eax

// Thpt_Mmx2Reg(x, b): emits "x eax,mmN" for N = 0..7. Parameter b is not used.
#define	Thpt_Mmx2Reg( x,b )		__asm x	eax,mm0			\
								__asm x	eax,mm1			\
								__asm x	eax,mm2			\
								__asm x	eax,mm3			\
								__asm x	eax,mm4			\
								__asm x	eax,mm5			\
								__asm x	eax,mm6			\
								__asm x	eax,mm7

// Thpt_Reg2MmxImm(x, imm): emits "x mmN,eax,imm" for N = 0..7.
#define	Thpt_Reg2MmxImm( x,imm )__asm x	mm0,eax,imm		\
								__asm x	mm1,eax,imm		\
								__asm x	mm2,eax,imm		\
								__asm x	mm3,eax,imm		\
								__asm x	mm4,eax,imm		\
								__asm x	mm5,eax,imm		\
								__asm x	mm6,eax,imm		\
								__asm x	mm7,eax,imm

// Thpt_Mmx2RegImm(x, imm): emits "x eax,mmN,imm" for N = 0..7.
#define	Thpt_Mmx2RegImm( x,imm )__asm x	eax,mm0,imm		\
								__asm x	eax,mm1,imm		\
								__asm x	eax,mm2,imm		\
								__asm x	eax,mm3,imm		\
								__asm x	eax,mm4,imm		\
								__asm x	eax,mm5,imm		\
								__asm x	eax,mm6,imm		\
								__asm x	eax,mm7,imm


// ReadTSC(x): executes rdtsc and stores eax into the dword at x and edx into
// the dword at x+4. Run() passes the __int64 globals LatThptStartTime and
// LatThptEndTime. Overwrites eax and edx.
#define ReadTSC( x )            __asm rdtsc					\
								__asm mov dword ptr x,eax	\
								__asm mov dword ptr x+4,edx


// LatThpt_CLEAR_XMM_I128: zeroes xmm0..xmm7 by pxor-ing each register with itself.
#define LatThpt_CLEAR_XMM_I128	__asm pxor xmm0,xmm0	\
								__asm pxor xmm1,xmm1	\
								__asm pxor xmm2,xmm2	\
								__asm pxor xmm3,xmm3	\
								__asm pxor xmm4,xmm4	\
								__asm pxor xmm5,xmm5	\
								__asm pxor xmm6,xmm6	\
								__asm pxor xmm7,xmm7

// LatThpt_CLEAR_XMM_I64: zeroes the MMX registers mm0..mm7 by pxor-ing each
// with itself. Despite "XMM" in the name, no XMM register is touched.
#define LatThpt_CLEAR_XMM_I64	__asm pxor mm0,mm0		\
								__asm pxor mm1,mm1		\
								__asm pxor mm2,mm2		\
								__asm pxor mm3,mm3		\
								__asm pxor mm4,mm4		\
								__asm pxor mm5,mm5		\
								__asm pxor mm6,mm6		\
								__asm pxor mm7,mm7

// LatThpt_CLEAR_XMM_SPFP: loads aOnesPS (four floats of 1, declared by
// LatThpt_Init()) into xmm0 with movaps and copies xmm0 into xmm1..xmm7.
// Despite "CLEAR" in the name the registers end up holding ones, not zeros.
#define LatThpt_CLEAR_XMM_SPFP	__asm movaps xmm0,aOnesPS \
								__asm movaps xmm1,xmm0	\
								__asm movaps xmm2,xmm0	\
								__asm movaps xmm3,xmm0	\
								__asm movaps xmm4,xmm0	\
								__asm movaps xmm5,xmm0	\
								__asm movaps xmm6,xmm0	\
								__asm movaps xmm7,xmm0

// LatThpt_CLEAR_XMM_DPFP: loads aOnesPD (two doubles of 1, declared by
// LatThpt_Init()) into xmm0 with movapd and copies xmm0 into xmm1..xmm7.
// Despite "CLEAR" in the name the registers end up holding ones, not zeros.
#define LatThpt_CLEAR_XMM_DPFP	__asm movapd xmm0,aOnesPD \
								__asm movapd xmm1,xmm0	\
								__asm movapd xmm2,xmm0	\
								__asm movapd xmm3,xmm0	\
								__asm movapd xmm4,xmm0	\
								__asm movapd xmm5,xmm0	\
								__asm movapd xmm6,xmm0	\
								__asm movapd xmm7,xmm0


// Lat_Cmp_Jmp(x, y): emits eight "cmp ebp,K" / "jz $+10" pairs for K = 0..7
// (16 instructions). Neither parameter is used. Run() loads ebp with
// 0x12345678 before calling the generated code, and with that value none of
// the comparisons is equal.
// NOTE(review): the "$+10" target depends on how the assembler encodes cmp and
// jz; the existing comment above LatThpt_COPY reports that VC8 does not emit
// a short jump - verify the branch target if a jz can ever be taken.
#define Lat_Cmp_Jmp(x,y)        __asm cmp ebp,0 \
                                __asm jz $+10\
								__asm cmp ebp,1\
								__asm jz $+10\
								__asm cmp ebp,2\
								__asm jz $+10\
								__asm cmp ebp,3\
								__asm jz $+10\
								__asm cmp ebp,4 \
                                __asm jz $+10\
								__asm cmp ebp,5\
								__asm jz $+10\
								__asm cmp ebp,6\
								__asm jz $+10\
								__asm cmp ebp,7\
								__asm jz $+10

// Lat_Inc_Cmp_Jmp(x, y): emits eight "inc ebp" / "cmp ebp,0" / "jz $+10"
// triples (24 instructions). Neither parameter is used. ebp is incremented
// eight times per expansion.
// NOTE(review): the "$+10" target depends on the instruction encodings chosen
// by the assembler - verify it if a jz can ever be taken.
#define Lat_Inc_Cmp_Jmp(x,y)    __asm inc ebp\
                                __asm cmp ebp,0 \
                                __asm jz $+10\
								__asm inc ebp\
								__asm cmp ebp,0\
								__asm jz $+10\
								__asm inc ebp\
								__asm cmp ebp,0\
								__asm jz $+10\
								__asm inc ebp\
								__asm cmp ebp,0\
								__asm jz $+10\
								__asm inc ebp\
								__asm cmp ebp,0\
								__asm jz $+10\
								__asm inc ebp\
								__asm cmp ebp,0\
								__asm jz $+10\
								__asm inc ebp\
								__asm cmp ebp,0\
								__asm jz $+10\
								__asm inc ebp\
								__asm cmp ebp,0\
								__asm jz $+10
								

// Run-time copies of the repetition and instruction counts. LatThpt_GetClocks()
// divides by LatThpt_REPx and LatThpt_Inst, so the result follows any change
// made to these variables.
// NOTE(review): presumably callers adjust LatThpt_Inst for templates that emit
// more than LatThpt_INSTRUCTIONS instructions (e.g. Thpt_Int_Int) - confirm.
float  LatThpt_REPx=            LatThpt_REP;  
float  LatThpt_Inst=            LatThpt_INSTRUCTIONS;
// Precomputed LOOPCOUNT * REP * INSTRUCTIONS; only referenced by the
// commented-out variant of LatThpt_GetClocks() in this part of the file.
float  LatThpt_DIVISOR= (float) LatThpt_LOOPCOUNT*LatThpt_REP*LatThpt_INSTRUCTIONS; 			

// LatThpt_MACROx25(x): expands to 25 consecutive copies of x.
// Not referenced in this part of the file.
#define LatThpt_MACROx25( x )	x x x x x x x x x x x x x x x x x x x x x x x x x
// Loop counter: Run() sets it to LatThpt_LOOPCOUNT and the code emitted by
// LatThpt_COPY decrements it once per pass through the generated routine.
volatile int iLatThpt;							
// Time-stamp counter values written by ReadTSC() before and after the timed call.
__int64 LatThptStartTime, LatThptEndTime;

//
// General purpose macros.
//

// LatThpt_Init(): declares the 16-byte-aligned static operand buffers used by
// the templates in the scope where it is expanded:
//   aZero   - 16 ints, all zero
//   aOnesPS - 4 floats, all 1
//   aOnesPD - 2 doubles, all 1
//   aStore  - 32 ints, no explicit initialiser
// The expansion already ends with a ';'.
#define LatThpt_Init()																\
								__declspec( align( 16 ) ) static int aZero[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };		\
								__declspec( align( 16 ) ) static float aOnesPS[] = { 1, 1, 1, 1 };	\
								__declspec( align( 16 ) ) static double aOnesPD[] = { 1, 1 };		\
								__declspec( align( 16 ) ) static int aStore[ 32 ];

// LatThpt_Free(): executes emms, which releases the MMX state so that x87
// floating-point code can run afterwards. Run() uses it right after the timed call.
#define LatThpt_Free()			__asm emms

// Register preparation helpers; each wraps one LatThpt_CLEAR_XMM_* sequence in
// a brace block:
//   PrepInt64  - zero mm0..mm7
//   PrepInt128 - zero xmm0..xmm7
//   PrepSPFP   - fill xmm0..xmm7 from aOnesPS
//   PrepDPFP   - fill xmm0..xmm7 from aOnesPD
#define LatThpt_PrepInt64()		{ LatThpt_CLEAR_XMM_I64 }
#define LatThpt_PrepInt128()	{ LatThpt_CLEAR_XMM_I128 }
#define LatThpt_PrepSPFP()		{ LatThpt_CLEAR_XMM_SPFP }
#define LatThpt_PrepDPFP()		{ LatThpt_CLEAR_XMM_DPFP }

// LatThpt_GetClocks(): average clock count per instruction for the last timed
// run, i.e. (LatThptEndTime - LatThptStartTime) divided by
// LatThpt_LOOPCOUNT * LatThpt_REPx * LatThpt_Inst, evaluated as float.
// An earlier variant divided by the precomputed LatThpt_DIVISOR; this form
// uses the LatThpt_REPx / LatThpt_Inst variables so run-time changes to them
// are reflected in the result.
// The expansion is a plain parenthesised expression with no trailing ';', so
// it can be used inside larger expressions; the caller supplies the ';'.
#define	LatThpt_GetClocks()	((float)(LatThptEndTime-LatThptStartTime) / ((float) LatThpt_LOOPCOUNT*LatThpt_REPx*LatThpt_Inst))

// codeptr: destination buffer that copycode() fills and Run() calls; it is
//          not assigned anywhere in this part of the file.
// aptr / eptr / zptr: addresses of the L1 / L2 / L3 labels of the most recent
//          LatThpt_COPY expansion (template body start, template tail start,
//          end of template).
void *codeptr,*aptr, *eptr, *zptr;

// 32-bit unsigned integer used for sizes and counts in copycode().
typedef unsigned int U32; 

// Value 0xc3; not referenced in this part of the file.
// NOTE(review): the replacement text includes a trailing ';', so RET cannot be
// used inside an expression - confirm whether that is intended.
#define RET 0xc3; 
																
// copycode(anz): assembles the routine to be timed in the buffer at codeptr.
//
// The bytes between aptr and eptr (the template body captured by LatThpt_COPY)
// are copied anz times back to back, followed by one copy of the bytes between
// eptr and zptr (the template tail). Afterwards the instruction cache of the
// current process is flushed so that the freshly written bytes are executed.
//
// anz: number of body copies to emit.
//
// Nothing here checks codeptr: it must already point to a writable buffer
// large enough for anz * (eptr - aptr) + (zptr - eptr) bytes.
//
// Lengths are computed with byte-pointer arithmetic rather than by casting the
// pointers to U32, so no pointer value is truncated.
void copycode (U32 anz) {
	const char *body = (const char *) aptr;
	const char *tail = (const char *) eptr;
	size_t bodyLen = (size_t) (tail - body);
	size_t tailLen = (size_t) ((const char *) zptr - tail);
	char *dst = (char *) codeptr;
	U32 copy;

	for (copy = 0; copy < anz; ++copy) {
		CopyMemory(dst, body, bodyLen);
		dst += bodyLen;
	}
	CopyMemory(dst, tail, tailLen);

	// With a NULL base address the size argument is not used; pass 0 rather
	// than NULL because dwSize is an integer, not a pointer.
	FlushInstructionCache (GetCurrentProcess(), NULL, 0);
}
   
// LatThpt_COPY(m, x, y): places the code template for test macro m in line and
// records where it is, without executing it.
//
//   jmp L3            - skip over the template
//   L1: m(x, y)       - template body (copied repeatedly by copycode())
//   L2: dec [iLatThpt]   template tail: count down the loop counter,
//       0x74 0x06        "jz $+8" emitted as raw bytes, which skips the
//                        following jmp when the counter reaches zero,
//       jmp [codeptr]    otherwise restart at the beginning of the buffer,
//       ret              return to the caller once the counter is zero
//   L3: store the addresses of L1, L2, L3 in aptr, eptr, zptr
//
// The jz is emitted with _emit because VC8 does not assemble it as a short
// jump. The label names are built by pasting m and x onto L1/L2/L3, so each
// (m, x) combination can be expanded only once per function. Overwrites eax.
#define LatThpt_COPY( m, x,y) \
   __asm jmp L3##m##x \
   __asm L1##m##x: \
       m( x,y )    \
   __asm L2##m##x: \
   __asm dec [iLatThpt] \
   __asm _emit 0x74\
   __asm _emit 6\
   __asm jmp [codeptr] \
   __asm ret \
   __asm L3##m##x: \
   __asm mov eax,offset L1##m##x \
   __asm mov aptr,eax \
   __asm mov eax,offset L2##m##x \
   __asm mov eptr,eax \
   __asm mov eax,offset L3##m##x \
   __asm mov zptr,eax 


// Run(): times the template most recently captured by LatThpt_COPY and prints
// the result.
//
// The generated routine is built once with copycode(LatThpt_REP) and then
// called reploop times. Each pass reads the time-stamp counter before and
// after the call; the smallest per-instruction clock count over all passes is
// printed as "%.2f\t". If any pass raises a structured exception, the loop
// stops and "Except\t" is printed instead.
void Run(){	
	// mtime holds the best (smallest) result so far; it starts at a large sentinel.
	float f, mtime=100000.0; 
	// Start from zeroed MMX and XMM registers.
	{LatThpt_CLEAR_XMM_I64} 
	{LatThpt_CLEAR_XMM_I128} 
	BOOL ok=true;
    copycode (LatThpt_REP);
	for (int i=0;i<reploop;i++) {                            
		if (WithSleep) Sleep(1);					   
		__try{          
			// The generated code counts this down to zero and then returns.
			iLatThpt=LatThpt_LOOPCOUNT;
			// Save all general-purpose registers. ebp is overwritten next, so
			// only globals are accessed until the matching popad.
			__asm pushad 
			// Constant operand used by the templates that read ebp.
            __asm mov ebp,0x12345678 
			// cpuid (leaf 0) is a serializing instruction, executed before
			// the first time-stamp read.
			__asm xor eax,eax		    
	        __asm cpuid                 
			{ReadTSC( LatThptStartTime )}
			// ReadTSC left the counter value in eax/edx; clear them again.
			__asm xor eax,eax
            __asm xor edx,edx 
			__asm call [codeptr]
			{ReadTSC (LatThptEndTime)}
			__asm popad 
			// emms before the float arithmetic below.
			// NOTE(review): this is skipped when the call faults.
			{LatThpt_Free ()}
			f=LatThpt_GetClocks(); 
				
			if (f<mtime) mtime=f;
           
		// Filter value 1 means the handler below runs for every exception.
		}__except (1){               
			ok=false; 
			mtime=-1;
			break;}                   
	}							   
 if (ok) printf( "%.2f\t", mtime);
 else printf ("Except\t");
}

// measure(n, m, x, y): runs one measurement. n and m are pasted together with
// an underscore to form the template macro name (n = Lat, m = Xmm selects
// Lat_Xmm); the template is captured with LatThpt_COPY(n_m, x, y) and then
// timed and printed by Run().
#define measure( n,m,x,y)   \
{LatThpt_COPY (n##_##m,x,y)} \
{Run();} 

						 
						