// ctlatenz.cpp : c't/Andreas Stiller, Sep 2006 
// based on latency.c and LatThpt.h from Intel 
// see "Measuring Instruction Latency and Throughput"
//http://www.intel.com/cd/ids/developer/asmo-na/eng/dc/pentium4/optimization/20243.htm


#include "stdafx.h"
#include <stdio.h>
#include <conio.h>
#include <windows.h>
#include "LatThpt3.h"

   

int _tmain(int argc, _TCHAR* argv[])
{


	int      ch;
	HANDLE   Thandle; 
	unsigned int features,features2;
    __asm 
	{
		mov eax,1
		cpuid 
		mov features,edx 
		mov features2,ecx 
	}
    bool HasSSE = (features >> 25) && 1; 
	bool HasSSE2= (features >> 26) && 1;
	bool HasSSE3= (features2     ) && 1; 

    //SSE4 feature bit not yet specified by Intel so just try it  
	bool HasSSE4= false;  
	__try {
		__asm phaddd mm0,mm0; 
		__asm emms 
		HasSSE4= TRUE;
	}
    __except (1) {
		HasSSE4=FALSE; 
	}

    printf ("SSE= %d, SSE2= %d, SSE3= %d, SSE4= %d\n",HasSSE,HasSSE2,HasSSE3,HasSSE4);
	Thandle=GetCurrentThread; 
	SetThreadAffinityMask (Thandle,1); // use proc 1 
    SetThreadPriority (Thandle,THREAD_PRIORITY_TIME_CRITICAL);  // Realtime Priority for process not necessary  
 
	// 1 MByte Memory workspace should be enough 
    codeptr= VirtualAlloc (NULL,1024*1024,MEM_COMMIT, PAGE_EXECUTE_READWRITE );
	if (codeptr==NULL) {
		printf ("sorry can''t allocate 1 MByte Memory");
		return 1; 
	}
     
	for (int i=1;i <argc;i++) if ((argv[i][0]=='-' || argv[i][0]=='/') && (strlen (argv[i])==2)){
		if (argv[i][1]=='s') WithSleep=true;
		if (argv[i][1]=='w') WithWait=true;
	}

		
 
    //
    // Initialize latency/throughput macros.
    //
    LatThpt_Init();

	do{
    //
    // Perform xmm integer latency tests.
    //
    
    LatThpt_PrepInt128();
	/*
	printf ("Correction for emtpy Loop (XMM,MMX):");   
	measure ( Thpt,Zero,nul );  
	printf ("=> %.9f\n", mtime/LatThpt_REPx);  
    printf ("Correction for emtpy Loop (Integer):"); 
    measure (Thpt,ZeroInt,nul); 
    printf ("=> %.9f\n", mtime/LatThpt_REPx);  
	*/
	printf( "XMM i128 Latency:\n" );
    printf( "-----------------\n" );
    printf( "\t\tMOVDQA\tMOVDQU\tPSHUFD\tPMULLW\tPOR\tPMADDWD\tPUNPCKLQDQ\n" );
    printf( "Reg<-Reg\t" );
    measure ( Lat,Xmm,movdqa,NIX ); 
	measure ( Lat,Xmm,movdqu,NIX ); 
  	measure ( Lat,XmmImm,pshufd,0xe4); 
 	measure ( Lat,Xmm,pmullw,NIX ); 
 	measure ( Lat,Xmm,por,NIX    ); 
        measure ( Lat,Xmm,pmaddwd,NIX ); 
	if (HasSSE2) {measure (Lat,Xmm,punpcklqdq,NIX )}
	else printf ("na");

    printf( "\nMem<-Reg<-Mem\t" );
    measure ( Lat,XmmMem,movdqa,NIX ); 
	measure ( Lat,XmmMem,movdqu,NIX ); 
   
	printf( "xxx\t" );

    printf( "xxx\t" );

    printf( "xxx\t" );

    printf( "xxx\t" );

    printf( "xxx\t" );

    printf( "\n" );


    //
    // Perform xmm integer throughput tests.
    //
    LatThpt_PrepInt128();

    printf( "\nXMM i128 Throughput:\n" );
    printf(   "--------------------\n" );
    printf( "\t\tMOVDQA\tMOVDQU\tPSHUFD\tPMULLW\tPOR\tPMADDWD\tPUNPCKLQDQ\n" );
    printf( "Reg<-Reg\t" );

	measure ( Thpt,Xmm,movdqa,NIX ); 
	measure ( Thpt,Xmm,movdqu,NIX ); 
  	measure ( Thpt,XmmImm,pshufd,0xE4 ); 
 	measure ( Thpt,Xmm,pmullw,NIX ); 
 	measure ( Thpt,Xmm,por,NIX ); 
    measure ( Thpt,Xmm,pmaddwd,NIX ); 
	if (HasSSE2) {measure (Thpt,Xmm,punpcklqdq,NIX )}
	else printf ("na");

    printf( "\nMem<-Reg<-Mem\t" );
    measure ( Thpt,XmmMemLoad,movdqa,NIX ); 
	measure ( Thpt,XmmMemLoad,movdqu,NIX ); 
  	measure ( Thpt,XmmImmMemLoad,pshufd,0 ); 
 	measure ( Thpt,XmmMemLoad,pmullw,NIX ); 
 	measure ( Thpt,XmmMemLoad,por,NIX    ); 
	measure ( Thpt,XmmMemLoad,pmaddwd,NIX ); 
	if (HasSSE2) {measure ( Thpt,XmmMemLoad,Punpcklqdq,NIX )}
	else printf ("na");

    printf( "\nMem<-Reg\t" );
    measure ( Thpt,XmmMemStore,movdqa,NIX ); 
	measure ( Thpt,XmmMemStore,movdqu,NIX ); 

    printf( "xxx\t" );

    printf( "xxx\t" );

    printf( "xxx\t" );

    printf( "xxx\t" );

    printf( "xxx\t" );

    printf( "\n" );




    //
    // Perform xmm SPFP latency tests.
    //
    LatThpt_PrepSPFP();

    printf( "\nXMM SPFP Latency:\n" );
    printf(   "-----------------\n" );
    printf( "\t\tMOVAPS\tMOVUPS\tSHUFPS\tMULPS\tDIVPS\tMOVHLPS\tMOVLHPS\n" );
    printf( "Reg<-Reg\t" );
	measure ( Lat,Xmm,movaps,NIX ); 
	measure ( Lat,Xmm,movups,NIX ); 
  	measure ( Lat,XmmImm,shufps,0xE4 );
	measure ( Lat,Xmm,mulps,NIX );
 	measure ( Lat,Xmm,divps,NIX ); 
 	measure ( Lat,Xmm,movhlps,NIX ); 
    measure ( Lat,Xmm,movlhps,NIX );
	
	printf( "\nMem<-Reg<-Mem\t" );
    measure ( Lat,XmmMem,movaps,NIX ); 
	measure ( Lat,XmmMem,movups,NIX ); 
   
	printf( "xxx\t" );

    printf( "xxx\t" );

    printf( "xxx\t" );

    printf( "xxx\t" );

    printf( "xxx\t" );

    printf( "\n" );



    //
    // Perform xmm SPFP throughput tests.
    //
    LatThpt_PrepSPFP();

    printf( "\nXMM SPFP Throughput:\n" );
    printf(   "--------------------\n" );
    printf( "\t\tMOVAPS\tMOVUPS\tSHUFPS\tMULPS\tDIVPS\tMOVHLPS\tMOVLHPS\n" );
    printf( "Reg<-Reg\t" );

  	measure ( Thpt,Xmm,movaps,NIX ); 
	measure ( Thpt,Xmm,movups,NIX ); 
  	measure ( Thpt,XmmImm,shufps,0xE4 );
	measure ( Thpt,Xmm,mulps,NIX );
 	measure ( Thpt,Xmm,divps,NIX ); 
 	measure ( Thpt,Xmm,movhlps,NIX ); 
    measure ( Thpt,Xmm,movlhps,NIX );
	
	printf( "\nReg<-Mem\t" );
    measure ( Thpt,XmmMemLoad,movaps,NIX ); 
	measure ( Thpt,XmmMemLoad,movups,NIX ); 
  	measure ( Thpt,XmmImmMemLoad,shufps,0 ); 
 	measure ( Thpt,XmmMemLoad,mulps,NIX ); 
 	measure ( Thpt,XmmMemLoad,divps,NIX ); 
	
	printf( "xxx\t" );

    printf( "xxx\t" );

    printf( "\nMem<-Reg\t" );

	
	measure ( Thpt,XmmMemStore,movaps,NIX ); 
	measure ( Thpt,XmmMemStore,movups,NIX ); 
   
	printf( "xxx\t" );

    printf( "xxx\t" );

    printf( "xxx\t" );

    printf( "xxx\t" );

    printf( "xxx\t" );

    printf( "\n" );

	if (HasSSE2) {
    printf( "\nSSE2: Latency\n" );
    printf(   "--------------------\n" );
    printf( "\t\tMULPD" );
	printf( "\nReg<-Reg\t" );
	measure (Lat,Xmm,mulpd,NIX);
	printf ("\n"); 
    printf( "\nSSE2: Throughput\n" );
    printf(   "--------------------\n" );
    printf( "\t\tMULPD" );
	printf( "\nReg<-Reg\t" );
	measure (Thpt,Xmm,mulpd,NIX);
	
	printf ("\n"); 
	}

	if (HasSSE3) {
    printf( "\nSSE3: Latency\n" );
    printf(   "--------------------\n" );
    printf( "\t\thaddps movsldup\taddsubps" );
	printf( "\nReg<-Reg\t" );
	measure (Lat,Xmm,haddps,NIX);
	measure (Lat,Xmm,MOVSLDUP,NIX);
	measure (Lat,Xmm,ADDSUBPS,NIX);


	printf ("\n"); 

    printf( "\nSSE3: Throughphut\n" );
    printf(   "--------------------\n" );
    printf( "\t\thaddps movsldup\taddsubps" );
	printf( "\nReg<-Reg\t" );
	measure (Thpt,Xmm,haddps,NIX);
	measure (Thpt,Xmm,MOVSLDUP,NIX);
	measure (Thpt,Xmm,ADDSUBPS,NIX);


	printf ("\n"); 
	}
	if (HasSSE4){
    printf( "\nSSE4: Latency\n" );
    printf( "--------------------\n" );
	printf ("\t\tphaddd\tpabsb  pmulhrsw\tpshufb\tpsignb\tpalignr");
	printf( "\nReg<-Reg\t" );
    measure (Lat,Xmm,phaddd,NIX);
    measure (Lat,Xmm,pabsb,NIX);
	measure (Lat,Xmm,pmulhrsw,NIX);
	measure (Lat,Xmm,pshufb,NIX);
	measure (Lat,Xmm,psignb,NIX);
	measure (Lat,XmmImm,palignr,0xe4);
	printf ("\n"); 

    printf( "\nSSE4: Throughput\n" );
    printf(   "--------------------\n" );
    printf ("\t\tphaddd\tpabsb  pmulhrsw\tpshufb\tpsignb\tpalignr");
	printf( "\nReg<-Reg\t" );
   	measure (Thpt,Xmm,phaddd,NIX);
    measure (Thpt,Xmm,pabsb,NIX);
	measure (Thpt,Xmm,pmulhrsw,NIX);
	measure (Thpt,Xmm,pshufb,NIX);
	measure (Thpt,Xmm,psignb,NIX);
	measure (Thpt,XmmImm,palignr,0xe4);
	printf ("\n"); 
	}

    printf( "\nInteger: Latency\n" );
    printf(   "--------------------\n" );
    printf( "\t\tIMUL\tadd\taddc" );
	printf( "\nReg<-Reg\t" );
	measure ( Lat,Int,imul,NIX ); 
	measure ( Lat,Int,add,NIX ); 
	measure ( Lat,Int,adc,NIX ); 

    printf( "\n" );

    printf( "\nInteger: Throughput\n" );
	printf(   "--------------------\n" );
    printf( "\t\tIMUL\tadd\taddc" );
	printf( "\nReg<-Reg\t" );
	measure ( Thpt,Int,imul,NIX ); 
   	measure ( Thpt,Int,add,NIX );
    measure ( Thpt,Int,adc,NIX );

	printf( "\nReg<-Imm\t" );
	measure ( Thpt,IntImm,imul,0x12345678 );
	measure ( Thpt,IntImm,add,0x12345678 );
    measure ( Thpt,IntImm,adc,0x12345678 );

	
	printf( "\nReg<-mem\t" );
	measure ( Thpt,IntMem,imul,NIX );
    measure ( Thpt,IntMem,add,NIX );
	measure ( Thpt,IntMem,adc,NIX );

	//measure ( Thpt,Intm,idiv );

	printf( "\n" );
    printf( "\nInteger+SSE: Throughput\n" );
    printf(   "--------------------\n" );
    printf( "\t\tIMUL+mulps\tadd+or" );
    printf( "\nReg<-Reg\t" );
	measure ( Thpt,Int_Xmm,mulps,imul );
	printf ("\t");
	measure ( Thpt,Int_Int,add,or );  
	printf( "\n" );
    printf( "\nCMP + Jnz(nT)\t" );
    measure (Lat,Cmp_Jmp,NIX,NIX); 
    measure (Lat,Inc_Cmp_Jmp,NIX,NIX); 
    printf( "\n" );

    //
    // Initialize latency/throughput macro resources.
    //
    LatThpt_Free();
    if (!WithWait) ch='x';
	else ch=_getch();
}

while (ch == 'n'); 
VirtualFree (codeptr,1024*1024,MEM_DECOMMIT);
return 0;


}
