#include <Windows.h> 
#include <iostream>
#include <bitset>
#include <intrin.h>
#include "compilerinfo.h"
#include <boost/dynamic_bitset.hpp>
// Work size of one benchmark run: bench() passes it to every loop as `size`,
// and runsetandget_std() uses it as the compile-time width of its std::bitset.
#define SIZE 0x1000000
// 64-bit test patterns. main() fills M[0..63] with the single-bit values 1<<i
// and M[64..67] with extra patterns for the -t self test; the rest stays zero.
ULONGLONG M[128];
typedef boost::dynamic_bitset<> Bitset;
// boost bitset copies of M[0..63] (filled bit by bit in main()); scanned by runboost().
Bitset MF[64]; 

using namespace std;

// Benchmark loop: sets the first `size` bits of a boost::dynamic_bitset one by
// one, then reads them back and adds them up.
// Returns the number of bits that read back as 1.
__int64 runsetandget_boost (int size) {
	boost::dynamic_bitset<> bits(size);
	int total = 0;

	for (int pos = 0; pos < size; ++pos) {
		bits[pos] = 1;
	}
	for (int pos = 0; pos < size; ++pos) {
		total += bits[pos];
	}

	bits.clear();
	return total;
}

// Benchmark loop: sets the first `size` bits of a std::bitset<SIZE> one by one,
// then reads them back and adds them up.
//
// size  number of bits to touch; values above SIZE are clamped to SIZE because
//       the bitset has a fixed width and operator[] is not range checked.
// Returns the number of bits that read back as 1.
__int64 runsetandget_std ( int size) {
	if (size > SIZE) size = SIZE;

	// bitset<SIZE> occupies SIZE/8 bytes (2 MiB for SIZE == 0x1000000). That is
	// larger than the default 1 MB stack reserve of an MSVC-linked program, so
	// the object lives on the heap instead of being an automatic variable.
	std::bitset<SIZE>* storage = new std::bitset<SIZE>(); // all 0's by default
	std::bitset<SIZE>& x = *storage;

	int sum = 0;
	for (int i = 0; i < size; i++) x[i] = 1;
	for (int i = 0; i < size; i++) sum += x[i];

	delete storage;
	return sum;
}

// Benchmark loop: sets the first `size` bits of a zeroed 64-bit word array with
// _bittestandset64, then reads them back with _bittest64 and adds them up.
//
// size  number of bits to touch.
// Returns the number of bits that read back as 1, or 0 if size <= 0 or the
// buffer could not be allocated.
__int64 runsetandget_intrinsic ( int size) {
	if (size <= 0) return 0;

	// Round up to whole 64-bit words; size/64 would be one word short whenever
	// size is not a multiple of 64, and the loops below would run off the end.
	const size_t wordCount = ((size_t)size + 63) / 64;
	LONG64* words = (LONG64*) calloc(wordCount, sizeof(LONG64));
	if (!words) return 0;

	int sum = 0;
	for (int i = 0; i < size; i++) _bittestandset64(words, i);
	for (int i = 0; i < size; i++) sum += _bittest64(words, i);

	free(words);
	return sum;
}

// Benchmark loop: uses one bool per flag. Writes true into `size` zeroed
// elements, then reads them back and adds them up.
// Returns the number of elements that read back as true.
__int64 runsetandget_bool (int size) {
	bool* flags = static_cast<bool*>(calloc(size, sizeof(bool)));

	for (int k = 0; k < size; ++k) {
		flags[k] = true;
	}

	int total = 0;
	for (int k = 0; k < size; ++k) {
		total += flags[k];
	}

	free(flags);
	return total;
}

// Benchmark loop: uses one UCHAR per flag. Writes 1 into `size` zeroed
// elements, then reads them back and adds them up.
// Returns the sum of the elements read back.
__int64 runsetandget_uchar (int size) {
	UCHAR* flags = static_cast<UCHAR*>(calloc(size, sizeof(UCHAR)));

	for (int k = 0; k < size; ++k) {
		flags[k] = 1;
	}

	int total = 0;
	for (int k = 0; k < size; ++k) {
		total += flags[k];
	}

	free(flags);
	return total;
}


// Benchmark loop: calls std::bitset<64>::count() `size` times on one pattern
// taken from the low six bits of rand(), and adds up the results.
// Returns size * popcount(pattern), or 0 when size <= 0.
int runpopcount_std (int size){
  std::bitset<64> pattern(rand() & 0x3F);
  int total = 0;
  int remaining = size;
  while (remaining > 0) {
    total += (int)pattern.count();
    --remaining;
  }
  return total;
}


//******************* Bitscan *****************************


// Index of the lowest set bit of v, obtained with the _BitScanForward64 intrinsic.
// The intrinsic's "no bit found" return value is ignored here, so the result
// is only meaningful for v != 0.
inline int findfirstbsf(ULONGLONG v) {
	DWORD bitPos;
	_BitScanForward64(&bitPos, v);
	return static_cast<int>(bitPos);
}

// Benchmark loop for BitScanForward64 with a check for 0: scans M[0..63]
// size/64 times. A zero word contributes 64 to the sum, any other word
// contributes the index of its lowest set bit.
__int64 runbsf0(int size) { // With a check for 0
	const int rounds = size / 64;
	int total = 0;
	DWORD bitPos;

	for (int r = 0; r < rounds; ++r) {
		for (int k = 0; k < 64; ++k) {
			if (!BitScanForward64(&bitPos, M[k])) total += 64;
			else total += bitPos;
		}
	}
	return total;
}

// Benchmark loop for findfirstbsf() without a check for 0: scans M[0..63]
// size/64 times and adds up the returned bit indices.
__int64 runbsf(int size) { // Without a check for 0
	const int rounds = size / 64;
	int total = 0;

	for (int r = 0; r < rounds; ++r) {
		for (int k = 0; k < 64; ++k) {
			total += findfirstbsf(M[k]);
		}
	}
	return total;
}



// Benchmark loop for boost::dynamic_bitset::find_first(): scans MF[0..63]
// size/64 times and adds up the returned positions.
__int64 runboost (int size) {
	const int rounds = size / 64;
	int total = 0;

	for (int r = 0; r < rounds; ++r) {
		for (int k = 0; k < 64; ++k) {
			total += static_cast<int>(MF[k].find_first());
		}
	}
	return total;
}


// Index of the lowest set bit of v, found by a linear shift loop.
// Returns the bit width of v (CHAR_BIT * sizeof(v)) when v is 0.
inline int findfirstlin (ULONGLONG v) {
	if (!v) {
		return (int)(CHAR_BIT * sizeof(v));
	}

	// Turn v's trailing 0s into 1s and clear everything else ...
	ULONGLONG trailing = (v ^ (v - 1)) >> 1;

	// ... then count those ones.
	int count = 0;
	while (trailing) {
		trailing >>= 1;
		++count;
	}
	return count;
}


// Benchmark loop for findfirstlin(): scans M[0..63] size/64 times and adds up
// the returned bit indices.
__int64 runlin(int size) {
	const int rounds = size / 64;
	int total = 0;

	for (int r = 0; r < rounds; ++r) {
		for (int k = 0; k < 64; ++k) {
			total += findfirstlin(M[k]);
		}
	}
	return total;
}


// Lookup table for findfirstdebruijn32(): indexed by the top 5 bits of
// (isolated lowest set bit * 0x077CB531U), yields the position of that bit.
static const int DeBruijn32[32] = {
  0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 
  31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
};


 // Lookup table for findfirstdebruijn64() and findfirstdebruijn64_bmi(): indexed
 // by the top 6 bits of (isolated lowest set bit * 0x022fdd63cc95386d), yields
 // the position of that bit.
 static const int DeBruijn64 [64] = {
    0,  1,  2, 53,  3,  7, 54, 27,
    4, 38, 41,  8, 34, 55, 48, 28,
   62,  5, 39, 46, 44, 42, 22,  9,
   24, 35, 59, 56, 49, 18, 29, 11,
   63, 52,  6, 26, 37, 40, 33, 47,
   61, 45, 43, 21, 23, 58, 17, 10,
   51, 25, 36, 32, 60, 20, 57, 16,
   50, 31, 19, 15, 30, 14, 13, 12,
};
 

// Index of the lowest set bit of a 32-bit value via multiply-and-lookup:
// isolate the lowest set bit, multiply by the De Bruijn constant and use the
// top 5 bits of the product as an index into DeBruijn32.
inline int findfirstdebruijn32 (ULONG v) {
	const ULONG lowestBit = v & (0 - v);
	const ULONG hash = lowestBit * 0x077CB531U;
	return DeBruijn32[hash >> 27];
}

// Index of the lowest set bit of a 64-bit value via multiply-and-lookup:
// isolate the lowest set bit, multiply by the De Bruijn constant and use the
// top 6 bits of the product as an index into DeBruijn64.
inline int findfirstdebruijn64 (ULONGLONG v) {
	const ULONGLONG lowestBit = v & (0 - v);
	const ULONGLONG hash = lowestBit * 0x022fdd63cc95386d;
	return DeBruijn64[hash >> 58];
}

// Benchmark loop for findfirstdebruijn64(): scans M[0..63] size/64 times and
// adds up the returned bit indices.
_int64 rundebruijn(int size) {
	const int rounds = size / 64;
	int total = 0;

	for (int r = 0; r < rounds; ++r) {
		for (int k = 0; k < 64; ++k) {
			total += findfirstdebruijn64(M[k]);
		}
	}
	return total;
}

// Index of the lowest set bit of v, read from the exponent of a float:
// the isolated lowest bit is converted to float, and the upper 16 bits of that
// float, shifted right by 7, give the biased exponent, from which the bias
// 0x7f is subtracted.
// Precondition (asserted): v != 0.
// NOTE(review): reading element [1] as the upper half presumes a little-endian
// layout - confirm if this is ever built for another target.
inline int findfirstfloat (ULONGLONG v) {
	assert(v);
	const ULONGLONG lowestBit = v & (0 - v);
	float asFloat = (float)lowestBit;
	const WORD* halves = (WORD*) &asFloat;
	return (halves[1] >> 7) - 0x7f;
}

// Benchmark loop for findfirstfloat(): scans M[0..63] size/64 times and adds
// up the returned bit indices.
_int64 runfloat(int size) {
	const int rounds = size / 64;
	int total = 0;

	for (int r = 0; r < rounds; ++r) {
		for (int k = 0; k < 64; ++k) {
			total += findfirstfloat(M[k]);
		}
	}
	return total;
}


// Index of the lowest set bit of v, read from the exponent of a double:
// the isolated lowest bit is converted to double, and its 11-bit exponent
// field is read through a bit-field overlay and reduced by the bias 1023.
// Precondition (asserted): v != 0.
inline int findfirstdouble(ULONGLONG v)
{
   assert(v);
   union {
      double value;
      struct {
         unsigned int fracLow : 32;
         unsigned int fracHigh : 20;
         unsigned int biasedExp : 11;
         unsigned int signBit : 1;
      };
   } pun;
   pun.value = (double)(v & (0 - v)); // isolated lowest set bit as a double
   return pun.biasedExp - 1023;
}

// Benchmark loop for findfirstdouble(): scans M[0..63] size/64 times and adds
// up the returned bit indices.
_int64 rundouble(int size) {
	const int rounds = size / 64;
	int total = 0;

	for (int r = 0; r < rounds; ++r) {
		for (int k = 0; k < 64; ++k) {
			total += findfirstdouble(M[k]);
		}
	}
	return total;
}

// Index of the lowest set bit of v, computed as the population count of the
// bits below it (~v & (v - 1) keeps exactly the trailing zero positions).
// Precondition (asserted): v != 0.
inline int findfirstpopcnt(ULONGLONG v){
	assert(v);
	const ULONGLONG belowLowest = ~v & (v - 1);
	return static_cast<int>(_mm_popcnt_u64(belowLowest));
}

// Benchmark loop for findfirstpopcnt(): scans M[0..63] size/64 times and adds
// up the returned bit indices.
_int64 runpopcnt(int size) {
	const int rounds = size / 64;
	int total = 0;

	for (int r = 0; r < rounds; ++r) {
		for (int k = 0; k < 64; ++k) {
			total += findfirstpopcnt(M[k]);
		}
	}
	return total;
}

//*************** with tbm, bmi *******************************

// Thin wrapper around the _lzcnt_u64 intrinsic, narrowing its result to int.
// Note that, unlike the other findfirst* helpers, this one is built on the
// leading-zero count intrinsic rather than a trailing/lowest-bit search.
inline int findfirstlzcnt(ULONGLONG v) {
	const unsigned __int64 zeros = _lzcnt_u64(v);
	return static_cast<int>(zeros);
}

// Benchmark loop for findfirstlzcnt(): scans M[0..63] size/64 times and adds
// up the returned counts.
__int64 runlzcnt(int size) {
	const int rounds = size / 64;
	int total = 0;

	for (int r = 0; r < rounds; ++r) {
		for (int k = 0; k < 64; ++k) {
			total += findfirstlzcnt(M[k]);
		}
	}
	return total;
}

// Thin wrapper around the _tzcnt_u64 intrinsic, narrowing its result to int.
inline int findfirsttzcnt(ULONGLONG v) {
	const unsigned __int64 zeros = _tzcnt_u64(v);
	return static_cast<int>(zeros);
}

// Benchmark loop for findfirsttzcnt(): scans M[0..63] size/64 times and adds
// up the returned counts.
__int64 runtzcnt(int size) {
	const int rounds = size / 64;
	int total = 0;

	for (int r = 0; r < rounds; ++r) {
		for (int k = 0; k < 64; ++k) {
			total += findfirsttzcnt(M[k]);
		}
	}
	return total;
}


// Same multiply-and-lookup as findfirstdebruijn64(), but the lowest set bit is
// isolated with the _blsi_u64 intrinsic instead of v & -v.
// Precondition (asserted): v != 0.
inline int findfirstdebruijn64_bmi (ULONGLONG v) {
	assert(v);
	const ULONGLONG lowestBit = _blsi_u64(v);
	const ULONGLONG hash = lowestBit * 0x022fdd63cc95386d;
	return DeBruijn64[hash >> 58];
}

// Benchmark loop for findfirstdebruijn64_bmi(): scans M[0..63] size/64 times
// and adds up the returned bit indices.
_int64 rundebruijn_bmi(int size) {
	const int rounds = size / 64;
	int total = 0;

	for (int r = 0; r < rounds; ++r) {
		for (int k = 0; k < 64; ++k) {
			total += findfirstdebruijn64_bmi(M[k]);
		}
	}
	return total;
}


// Same float-exponent trick as findfirstfloat(), but the lowest set bit is
// isolated with the _blsi_u64 intrinsic instead of v & -v.
// Precondition (asserted): v != 0.
inline int findfirstfloat_bmi (ULONGLONG v) {
	assert(v);
	float asFloat = (float) _blsi_u64(v);
	const WORD* halves = (WORD*) &asFloat;
	return (halves[1] >> 7) - 0x7f;
}

// Benchmark loop for findfirstfloat_bmi(): scans M[0..63] size/64 times and
// adds up the returned bit indices.
_int64 runfloat_bmi(int size) {
	int sum=0;
	for (int i=0;i< size/64;i++)
		// Must call the _bmi helper; calling plain findfirstfloat() here would
		// just repeat the runfloat measurement under a different label.
		for (int j=0; j <64; j++) sum+=findfirstfloat_bmi(M[j]);
	return sum;
}

// Index of the lowest set bit of v: population count of the _blsmsk_u64 mask
// (all bits up to and including the lowest set bit), minus one.
// Precondition (asserted): v != 0.
inline int findfirstpopcnt_bmi(ULONGLONG v){
	assert(v);
	const ULONGLONG upToLowest = _blsmsk_u64(v);
	const int ones = (int) _mm_popcnt_u64(upToLowest);
	return ones - 1;
}

// Benchmark loop for findfirstpopcnt_bmi(): scans M[0..63] size/64 times and
// adds up the returned bit indices.
_int64 runpopcnt_bmi(int size) {
	const int rounds = size / 64;
	int total = 0;

	for (int r = 0; r < rounds; ++r) {
		for (int k = 0; k < 64; ++k) {
			total += findfirstpopcnt_bmi(M[k]);
		}
	}
	return total;
}

#ifndef __INTEL_COMPILER
// Index of the lowest set bit of v: population count of the bits that
// _blsfill_u64 adds to v (the positions below the lowest set bit).
// Precondition (asserted): v != 0.
inline int findfirstpopcnt_tbm(ULONGLONG v){
	assert(v);
	const ULONGLONG filledBelow = _blsfill_u64(v) ^ v;
	return (int) _mm_popcnt_u64(filledBelow);
}

// Benchmark loop for findfirstpopcnt_tbm(): scans M[0..63] size/64 times and
// adds up the returned bit indices.
_int64 runpopcnt_tbm(int size) {
	const int rounds = size / 64;
	int total = 0;

	for (int r = 0; r < rounds; ++r) {
		for (int k = 0; k < 64; ++k) {
			total += findfirstpopcnt_tbm(M[k]);
		}
	}
	return total;
}
#endif


void bench (__int64 (*aloop)(int),char name[80]) { 	
	__int64 ta,te,td,tdmin;
	volatile _int64 res=0;
	volatile int vn=SIZE;
	LARGE_INTEGER qa,qe,qf;
	__int64 d,dmin;
	unsigned int id; 
	int reg[4];
		
	dmin =MAXLONGLONG;
	tdmin=MAXLONGLONG; 
	for(int i=0; i< 100; i++) {
		QueryPerformanceCounter (&qa);
		ta=__rdtsc ();
	    res=(*aloop)(vn);
	    te=__rdtscp(&id);
	    __cpuid(reg,0);
		
		QueryPerformanceCounter (&qe);
		QueryPerformanceFrequency(&qf);
		d=qe.QuadPart-qa.QuadPart;
		if (d< dmin) dmin=d;
	
		td=te-ta;
		if (td< tdmin) tdmin=td; 
		if (d< dmin) dmin=d;
    	//printf_s ("%s %4.2f Clocks %f ns\n",name,(double)td/(SIZE),(double)d/(SIZE)/qf.QuadPart*1e9 );

	}
	//printf ("\n");
	printf_s ("%s %4.2f Clocks %f ns %f\n",name,(double)tdmin/(SIZE),(double)dmin/(SIZE)/qf.QuadPart*1e9,(double)res );
 
}

  
int main(int argc, CHAR* argv[]) {   
	int reg[4];
	bool BMI,TBM,ABM;
	bool dotest=false;
	bool dowarmup=true;
	if (argc >1) { 
		 if (strcmp(argv[1],"-t")==0) dotest=true; 
		 if (strcmp(argv[1],"-n")==0) dowarmup=false; 
	}
		__cpuidex(reg,7,0);
	BMI=(reg[1]>> 3) &1;
	__cpuidex(reg,0x80000001,0);
	TBM=(reg[2] >>21) &1; 
	ABM=(reg[2] >>5) & 1;

    SetThreadAffinityMask(GetCurrentThread(),0x1);
	printf_s ("Benchprogramm bitmanipulation c't 7/13, as\n");
		printf_s ("\n");
    printf_s ("Compilerinfo:\n");
	print_compilerinfo();
	printf_s ("\n");
	if (ABM) printf ("ABM supported\n"); else printf("ABM not supported \n");
	if (BMI) printf ("BMI supported\n"); else printf("BMI not supported \n");
	if (TBM) printf ("TBM supported\n"); else printf("TBM not supported \n");




	M[0]=0;
	for (int i=0; i<64;i++) M[i]=1ULL << i; 
	for (int i=0; i<64; i++) 
	{
	 MF[i].resize(64); 
	 for ( int j=0; j < 64; j++) MF[i][j]= (M[i]>>j) &1;
	}
	M[64]=0; 
	M[65]=0x5555555555555550;
	M[66]=0xAAAAAAAAAAAAAAA0;
	M[67]=0xFFFFFFFFFFFFFFF0;
	if (dotest) {
	for (int i=0; i<68; i++) 
	{
		
	printf ("%3d,%16I64x, all: %3d,%3d,%3d,%3d,%3d",i,M[i],findfirstlin(M[i]), findfirstdebruijn64(M[i]), findfirstbsf(M[i]),findfirstfloat(M[i]),findfirstpopcnt(M[i]));
 	if (BMI) printf (" bmi: %3d,%3d,%3d,%3d",findfirsttzcnt(M[i]), findfirstdebruijn64_bmi(M[i]), findfirstfloat_bmi(M[i]),findfirstpopcnt_bmi(M[i]));
	#ifndef __INTEL_COMPILER
	if (TBM) printf (" tbm: %3d",findfirstpopcnt_tbm(M[i]));
	#endif;
	printf ("\n");
	}


	return (0);
	}
	volatile __int64 res=0; 
    
		SetPriorityClass(GetCurrentProcess(),HIGH_PRIORITY_CLASS);
	if (dowarmup) {
	printf_s("Kern fuer Turbo Mode ein paar s hochfahren\n");   
	for (_int64 i=0; i< 3300000000; i++) res++;  
	}
	printf_s("ok, Messung startet\n"); 

	

	bench(&runsetandget_bool,         "runboolsetandget     : default   :");
	bench(&runsetandget_uchar,        "runucharsetandget    : default   :");

	bench(&runsetandget_intrinsic,    "runintrinsicsetandget: default   :");
	bench(&runsetandget_boost,        "runboostsetandget    : default   :");
	bench(&runsetandget_std,          "runstdsetandget      : default   :");
	bench(&runfloat,				  "runfloat             : default   :");
	bench(&rundouble,				  "rundouble            : default   :");
	bench(&runpopcnt,                 "runpopcnt            : default   :");


	bench(&runlin,                    "runlin               : default   :");
	bench(&runbsf,                    "runbsf               : default   :");
	bench(&runbsf0,                   "runbsf0              : default   :");

	bench(&rundebruijn,               "rundebruijn          : default   :");
	bench(&runboost,                  "runboost             : default   :");
	if (BMI || ABM)  bench(&runlzcnt, "runlzcnt             : default   :");
	if (BMI) bench(&runtzcnt,         "runtzcnt             : default   :");
	if (BMI) bench(&rundebruijn_bmi,  "rundebruijn_bmi      : default   :");
	if (BMI) bench(&runfloat_bmi,     "runfloat_bmi         : default   :");
	if (BMI) bench(&runpopcnt_bmi,    "runpopcnt_bmi        : default   :");

	#ifndef __INTEL_COMPILER
	if (TBM) bench(&runpopcnt_tbm,    "runpopcnt_tbm        : default   :");
#endif;
	SetThreadPriority(GetCurrentThread(),THREAD_PRIORITY_NORMAL);	
	return 0;
}

