// apfeldll.cpp : Defines the entry point for the DLL application.
//

#include "stdafx.h"
#include <xmmintrin.h>
// Signature shared by the line routines that can be handed to OpenMP_Frame:
// (r0, i0, dr, xmax, g, maxi, pCount) -> sum of the counters written to pCount.
typedef __int64 (*Line_D) (const double r0, const double i0, const double dr, int xmax, double g, int maxi, DWORD *pCount); 
// Pair of doubles that is loaded into one XMM register by the inline assembly.
// f0 has the lower address and therefore ends up in the low lane, f1 in the
// high lane. The assembly refers to the members by name (cR.f0 / cR.f1).
typedef struct 
{
	double f0;
	double f1;
} double2;

// Limit that re^2 + im^2 is compared against; the exported wrappers set it to g*g
// before calling the assembly routines.
double G2; 
// Iteration bound read by the assembly routines; the exported wrappers set it
// from their maxi argument.
int maxiter; 

// In-order pipeline: iterates two points at once, one per XMM lane.
// Original: Intel -- Complex Arithmetic with SSE3
//
//   cR       real parts of the two constants c (f0 -> low lane, f1 -> high lane)
//   cI       imaginary parts of the two constants c (same lane order)
//   pCount0  receives the counter of the low-lane point
//   pCount1  receives the counter of the high-lane point
//
// Reads the globals G2 (limit for re^2 + im^2) and maxiter (bound for the loop
// counter). Both result counters start at 1 and grow by one for every pass in
// which their lane still satisfies re^2 + im^2 < G2. The loop ends when neither
// lane satisfies the test or when the loop counter (which starts at 1) reaches
// maxiter.
//
// Register comments list the high lane first, then the low lane.
void SSE3_cal_pixel_DP (double2 cR, double2 cI, DWORD * pCount0, DWORD * pCount1)
{
	double2 * pCReal = &cR;
	double2 * pCImag = &cI;
    double fours[2] = {G2, G2 };    // exit limit, one copy per lane

	double * pFours = fours;
	_asm {
		push edi
			push esi
			// NOTE(review): ebx is also written below but is not pushed here;
			// presumably the compiler preserves it for inline assembly -- confirm.
			//----- Setup -----
			// initial values for Z are zero
			xorpd xmm0,xmm0 // z.real:  r1 r0
			xorpd xmm1,xmm1 // z.imag:  i1 i0
			xorpd xmm2,xmm2 // pipelined next z.real (later also scratch for r0^2, i0^2)
			xorpd xmm3,xmm3 // scratch for r1^2, i1^2
			// Load Constants
			mov eax,pCReal
			movupd xmm4,[eax] // Cr1 Cr0
		mov eax,pCImag
			movupd xmm5,[eax] // Ci1 Ci0
		// xmm6 - both lanes hold the exit limit G2
		mov eax,pFours
			movupd xmm6,[eax]
		// Result counters start at 1
		    mov edi,1       // counter of the high-lane point
			mov esi,1       // counter of the low-lane point
			// setup the counters
			mov ecx,1 // loop counter
			// pipelining setup: first "next z.real" is 0 + Cr
			addpd xmm2,xmm4 // add Cr ---> r1 r0
			//----- Test Pixels -----
TestPixels:
		// max iteration counter
		    inc ecx
			// temp = z.real^2 - z.imag^2 + Cr was already computed in the
			// previous pass (pipelined) and is waiting in xmm2
			// z.imag = z.real*z.imag + z.real*z.imag + Ci
			mulpd xmm1,xmm0 // result=r1*i1 r0*i0 (xmm0 still holds the old z.real)
			addpd xmm1,xmm1 // *2
			addpd xmm1,xmm5 // add Ci ---> i1 i0
			// z.real = pipelined value
			movapd xmm0,xmm2 // r1 r0
			// re-arrange to prepare for lengthsq
			movapd xmm3,xmm2 // r1 r0
			// (the following line appears to be a stray page footer of the Intel paper)
			// Using Streaming SIMD Extensions 3 in Algorithms with Complex Arithmetic 21
			shufpd xmm3,xmm1,3 // xmm3 = i1 r1 (high halves of xmm1 and xmm3)
			shufpd xmm2,xmm1,0 // xmm2 = i0 r0 (low halves of xmm1 and xmm2)
			// lengthsq = z.real^2 + z.imag^2
			mulpd xmm3,xmm3 // I1^2 r1^2 - need to have r at low end
			// as hsubpd does low-high
			mulpd xmm2,xmm2 // I0^2 r0^2
			movapd xmm7,xmm2
			haddpd xmm7,xmm3 // L1 L0  (L = r^2 + i^2 per point)
			// test: lane becomes all ones where L < G2
			cmpltpd xmm7,xmm6
			// temp = z.real^2 - z.imag^2 + Cr (pipelining for the next pass)
			hsubpd xmm2,xmm3 // (A) xmm2=I0^2 r0^2 xmm3=I1^2 r1^2 ---> diff1 diff0
			addpd xmm2,xmm4 // (B) add Cr ---> r1 r0
			movmskpd edx,xmm7 // bit 0 = low lane passed, bit 1 = high lane passed
			cmp edx, 0
			je Done           // neither lane is below the limit any more
			mov ebx,edx
			and edx,1         // edx = 1 if the low lane passed
			shr ebx,1         // ebx = 1 if the high lane passed
			add esi,edx
			add edi,ebx
			cmp ecx,maxiter
			jl TestPixels
			//----- Counters -----
Done:
		mov eax,pCount0
			mov edx,pCount1
			mov [eax],esi     // low-lane counter
			mov [edx],edi     // high-lane counter
			pop esi
			pop edi
	}
}


//Out of Order Pipeline for a row; extended as,c't Aug 2006 
void xLine_SSE3_DP(const double r, const double i, const double dr, int xmax, DWORD *pCount) 
{   
	double2  cR ={r+dr,r};
    double   rn = r+2*dr;
    double2  cI ={i,i};  
	double fours[2] = { G2, G2 };
	DWORD *maxptr= &pCount[xmax+1];
      
	static const _MM_ALIGN16 double  zero[] = {0,0};

	_asm {
		
			
			//----- Setup -----
			// initial values R, I 
  			
	  
	
			// Load Constants
			
			movupd xmm4,cR       // Cr1 ; Cr0
		    movupd xmm5,cI       // Ci1 ; Ci0
		   	movupd xmm6,fours    // G^2 ; G^2 
		// Counter - start
            mov edi,pCount
			mov esi,edi 
			add esi,4
			mov edx,edi 
			add edx,8

start:
			xorpd xmm0,xmm0   // r1 ; r0 
			xorpd xmm1,xmm1   // i1 ; i0

           
			// setup the counters
	        mov ebx,0
			mov ecx,0
		
			// pipelining setup
			movapd xmm2,xmm4 //   r1'=Cr1; r0'=Cr0 
			//----- Test Pixels -----
           			
TestPixels: 
             inc ebx
             inc ecx 

  		// max iteration counter
	        
			// temp = z.real^2 - z.imag^2 + Cr (pipelined) - moved
			// z.imag = z.real*z.imag + z.real*z.imag + Ci
			mulpd xmm1,xmm0    // r1*i1            ; r0*i0
			addpd xmm1,xmm1    // 2*r1*i1          ; 2*r0*i0 
			addpd xmm1,xmm5    // i1=2*r1*i1+Ci1  ; i0=2*r0+i0+ Ci0
		    
			    //      xmm1   // i1          ; i0    
                //      xmm2   // r1           : r0 
			movapd xmm0,xmm2   // r1           ; r0
			movapd xmm3,xmm2   // r1           ; r0 
		
			shufpd xmm3,xmm1,3 //  i1          ; r1  
			shufpd xmm2,xmm1,0 //  i0          ; r0
			// lengthsq = z.real^2 + z.imag^2
			mulpd  xmm3,xmm3   // i1^2       ; r1^2 - need to have r at low end
			// as hsubpd does low-high
			mulpd  xmm2,xmm2   // i0^2       ; r0^2
			movapd xmm7,xmm2   // i0^2       ; r0^2 
			haddpd xmm7,xmm3   // i1^2+r1^2  ; i0^2+r0^2     
			// test
			
			cmpltpd xmm7,xmm6  // i1^2+r1^2 - G; i0^2+r0^2 -G 

			// temp = z.real^2 - z.imag^2 + Cr (pipelining)
			hsubpd xmm2,xmm3   // r1^2-i1^2  ; r0^2-i0^2  
			
			addpd xmm2,xmm4    // r1'=r1^2-i1^2+Cr1   ; r0'=r0^2-i02+Cr0 
			movmskpd eax,xmm7  // signs => edx.1 und edx.0  
			cmp eax, 3      
			jnz checkmask     // <>3 then  Pipe0 and/or Pipe2 > limit 
		 	cmp ebx,maxiter         
            jge   maxit0       //  Pipe0 >=maxiter, Pipe1 =? 
	        cmp ecx,maxiter 
			jl  TestPixels  
			and eax,1        // Mask for Pipe 0 ready
checkmask:  or  eax,eax      // both Pipes ready? 
			jz  next01       // then restart both
            test eax,1       // pipe 0 ready? 
            jz   next0       // then get next Point for Pipe 0   
			                 // else for Pipe 1 
next1:      // next point for Pipe 0
            or edi,edi 
            jz error 

            mov [edi],ecx       // store Counter for Pipe 1          
			or esi,esi          // pipe 0 blocked? 
			jz  done            // then all points calculated => done 
            cmp edx,[maxptr]    // last point in row?  
			jz  block1          // then block  pipe1 
				                // else restart pipe1 
start1:     xor ecx,ecx   
			movhpd XMM0,zero 
			movhpd XMM1,zero

			mov edi,edx        //pointer to next point
			fld  qword ptr rn 
            fst  qword ptr cR.f1   
			fadd qword ptr dr 
			fstp qword ptr rn  //update rn
			
 			movupd xMM4,cR      // Crn, Cr0 
			movhpd XMM2,cR.f1   // Crn ; XMM2.lo                  		
			add edx,4 
			jmp TestPixels 

block1:     mov edi,0         // pointer to nil (blocked)
			mov ecx,-1
			movhpd XMM0,zero 
			movhpd XMM1,zero
			movhpd XMM2,zero
			movhpd XMM4,zero
			movhpd XMM5,zero 
            jmp TestPixels 

maxit0:     and eax,2        // set mask for Pipe 0 ready  
			cmp ecx,maxiter  // Pipe1 is ready too? 
			jl checkmask    // if no 
			xor eax,eax      // if yes 
			jmp checkmask 
            
next0:		//next point for pipe 0
			or esi,esi 
            jz error  
            mov [esi],ebx  // store counter for Pipe0 
            or edi,edi  // pipe 1 blocked?
            jz  done             // then all points calculated => done 
			cmp edx,[maxptr]     // last point of row? 
			jz  block0           // block Pipe 0 

start0:     xor  ebx,ebx  
			movlpd XMM0,zero 
			movlpd XMM1,zero 
		    mov esi,edx         //
			fld qword ptr rn    // rn           
            fst qword ptr cR.f0 // (cr1,rn)  
			fadd qword ptr dr   //  rn+dr
			fstp qword ptr rn   // update rn 
			movupd xMM4,cR      // Crn, Cr0 
			movlpd XMM2,cR.f0   //XMM2.hi  ; Crn             
			add edx,4 		
			jmp TestPixels

block0:     mov esi,0            // pointer blocked
			mov ebx,-1 
			movlpd XMM0,zero 
			movlpd XMM1,zero 
			movlpd XMM2,zero 
			movlpd XMM4,zero
			movlpd XMM5,zero 
            jmp TestPixels  

next01:     or edi,edi 
			jz error 
			or esi,esi 
			jz error


			mov [esi],ebx    // store counter for  Pipe 0
		    mov [edi],ecx    // store counter for Pipe 1

            // at least two points left?
			mov eax,maxptr 
			sub eax,edx
			jz  done      // no, alls point calculated => done 
			cmp eax,4     // one single point is left so
			              // block pipe 1, restart Pipe0  
			jnz start01   // else restart both pipes

		    mov   edi,0      /// block Pipe 1 
			    
			movq   XMM4,rn       //0,rn
			movapd XMM2,XMM4    
			xorpd  XMM0,XMM0 
            xorpd  XMM1,XMM1
			movhpd XMM5,zero 
			mov    esi,edx 
            add    edx,4
			xor    ebx,ebx
			mov    ecx,-1  
            jmp   TestPixels      // and restart Pipe 0     

start01:    mov esi,edx
			add edx,4
			mov edi,edx
			add edx,4
			movddup XMM4,rn     //rn   ; rn
            xorpd   XMM0,XMM0   // 0    ; 0 
			movhpd  XMM0,dr     // dr   ; 0 
			addpd   XMM4,XMM0   //rn+dr ;rn 
			addpd   XMM0,XMM4   //rn+2dr; rn 
			movhpd  rn,XMM0     //rn=rn+2*dr
			movupd  cR,XMM4 
			jmp start
error:      mov     eax,5 

done:
			
	}

} // end SSE3_cal_pixel_D








// Plain C++ reference implementation of one row.
//
// For each of the xmax points c = (r0 + x*dr accumulated by repeated addition,
// i0) the iteration z = z^2 + c is run, starting from z = 0. The body is
// executed at least once; it is repeated while the pass counter is below
// maxiter and re^2 + im^2 < g*g. The pass counter of every point is stored in
// pCount and the sum of all counters is returned.
// Note: the parameter maxiter and the local limit hide the file-level globals
// of the same purpose; this routine does not touch the globals.
extern "C"  __declspec( dllexport )   __int64 Line_Gen_D_C (const double r0, const double i0, const double dr, int xmax, const double g, int maxiter, DWORD *pCount) 
{
	const double limit2 = g*g;
	__int64 total = 0;
	double cReal = r0;

	for (int x = 0; x < xmax; ++x) {
		double re = 0.0, im = 0.0;
		double re2 = 0.0, im2 = 0.0;   // squares of re and im from the previous pass
		int passes = 0;

		for (;;) {
			im = re*im*2+i0;
			re = re2-im2+cReal;
			re2 = re*re;
			im2 = im*im;

			++passes;
			if (!(passes < maxiter)) break;      // iteration bound reached
			if (!(re2+im2 < limit2)) break;      // left the limit circle
		}

		pCount[x] = passes;
		total += passes;
		cReal += dr;
	}
	return total;
}


// One row with the in-order two-point SSE3 kernel SSE3_cal_pixel_DP.
//
// Sets the globals G2 = g*g and maxiter = maxi, then processes the row in
// pairs of neighbouring points (real parts r and r+dr, stepping by 2*dr) and
// writes their counters to consecutive pCount entries. Returns the sum of all
// counters written.
//
// Changes compared to the previous version:
//  - an odd xmax no longer leaves the last point uncomputed: the kernel is run
//    once more and only the counter of its first point is used;
//  - the sum is accumulated in 64 bits, matching the return type, instead of a
//    32-bit int that could wrap.
extern "C" __declspec  (dllexport)  __int64 Line_SSE3_DP_C (const double r0, const double i0, const double dr, int xmax, const double g, int maxi, DWORD *pCount) 
{
	__int64 sumcount=0;
	G2=g*g; 
    maxiter=maxi; 
	DWORD p1,p2;
	double2 i2 = {i0,i0};
	double2 r2 = {r0,r0+dr};      // f0 -> p1, f1 -> p2
	for (int j=0;j<xmax/2;j++) {
		SSE3_cal_pixel_DP(r2,i2,&p1,&p2);
		r2.f0+=dr+dr;
		r2.f1+=dr+dr; 
		*pCount++=p1; 
		*pCount++=p2;
		sumcount += p1;
		sumcount += p2;
	}
	if (xmax > 0 && (xmax & 1)) {
		// Odd row length: r2.f0 is the real part of the remaining last point.
		// The second lane is computed as well but its result is discarded.
		SSE3_cal_pixel_DP(r2,i2,&p1,&p2);
		*pCount=p1;
		sumcount += p1;
	}
	__asm emms; 
	return sumcount; 

}


// SSE version using scalar operations (one point per call; the double-precision
// scalar instructions used here, MULSD/ADDSD/COMISD, belong to SSE2).
//
//   Cr, Ci   real and imaginary part of the constant c
//   pCount0  receives the number of completed passes
//
// Reads the globals G2 (limit for r^2 + i^2) and maxiter. Each pass first tests
// the current z: if r^2 + i^2 is not below G2 the loop is left, otherwise
// z = z^2 + c is applied and the remaining-pass counter ecx is decremented.
// The stored result is maxiter minus the remaining count.
// The bracketed figures at the end of some comments are the original author's
// timing annotations (presumably latency/throughput) and are unverified.
void SSE_cal_pixel_D1 (double Cr, double Ci,  DWORD * pCount0)
{
	
	_asm {
		
			//----- Setup -----
			// initial values for Z are zero
			xorpd xmm0,xmm0 // r = 0
			xorpd xmm1,xmm1 // i = 0

			// Load Constants
			
			movq xmm4,Cr 
		    movq xmm5,Ci
			movq xmm6,G2 

		// setup the counters
		    mov ecx,maxiter   // remaining passes
			
TestPixels:

		                              // On entry of every pass (high lane ; low lane):
		                              // XMM0=(0    : r )
		                              // XMM1=(0    : i )  
		                              // XMM4=(0    : Cr)
		                              // XMM5=(0    : Ci) 
		                              // XMM6=(0    : G2) 

		    MOVQ     XMM2,XMM1        //XMM2=(0    : i )          [2/2]
			MULSD    XMM2,XMM0        //XMM2=(0    : r*i)         [7(6)/2]
			MULSD    XMM1,XMM1        //XMM1=(0    : i^2)         [7(6)/2]
			MULSD    XMM0,XMM0        //XMM0=(0    : r^2)         [7(6)/2]
			ADDSD    XMM2,XMM2        //XMM2=(0    : 2ri)         [5(4)/2]
			MOVQ     XMM3,XMM0        //XMM3=(0    : r^2)
			ADDSD    XMM3,XMM1        //XMM3=(0    : r^2+i^2)     [5(4)/2]
			SUBSD    XMM0,XMM1        //XMM0=(0    : r^2-i^2)     [5(4)/2]
			ADDSD    XMM2,XMM5        //XMM2=(0    : 2ri+Ci)      [5(4)/2]
			COMISD   XMM3,XMM6        //flags = compare r^2+i^2 with G2   [7(6)/2]
			jnc       Done            //carry clear: not below G2 => leave the loop
			ADDSD    XMM0,XMM4        //XMM0=(0    : r^2-i^2+Cr) = new r
			MOVQ     XMM1,XMM2        //XMM1=(0    : 2ri+Ci)     = new i
			dec      ecx 
			jnz      TestPixels       //continue while passes remain
			//----------------------------------------------------------------- 
	
Done:
		    // NOTE(review): ebx is written without being saved in this block;
		    // presumably the compiler preserves it for inline assembly -- confirm.
		    Mov      ebx,maxiter 
            sub      ebx,ecx          // completed passes = maxiter - remaining
			Mov      eax,pCount0
			Mov      [eax],ebx
			
	}
} // end SSE_cal_pixel_D1

// SSE2 single-precision routine by Alex Klimakovski.
//
// Processes one row four points at a time (one point per 32-bit float lane).
//
//   buf   destination for the counters, four DWORDs are stored per group
//   xmax  number of points; groups are processed while the remaining count
//         (decremented by 4 per group) is greater than zero
//   del   step of the real part between neighbouring points
//   r, i  real part of the first point / imaginary part of the whole row
//   g     limit; NOTE(review): it is truncated to an integer (gi) before it is
//         squared, so a fractional part of g is ignored
//   t     iteration bound per group
//
// Returns the sum of all counters stored (added up per lane as 32-bit integers).
//
// NOTE(review): because a full group of four counters is always stored, an
// xmax that is not a multiple of 4 writes past buf[xmax-1] -- confirm that
// callers size the buffer accordingly.
//
// Aligned scratch area at ebx:
//   [ebx]      i       in all four lanes
//   [ebx+16]   4*del   in all four lanes
//   [ebx+32]   running per-lane counter sums (int32)
DWORD CalcRow_SSE2(DWORD *buf, int xmax, float del, float r, float i, float g, int t)
{
//	int x = 0;
	DWORD iter = 0;
	int gi=(int)g; 
   
	float space[4 * 3 + 4];   // 3 vectors of 4 floats plus slack for 16-byte alignment

	static const _MM_ALIGN16 DWORD smask[] = { ~(0x1 << 31), ~(0x1 << 31), ~(0x1 << 31), ~(0x1 << 31) };   // not referenced by the assembly below
	static const _MM_ALIGN16 float const1[] = { 1.0f, 1.0f, 1.0f, 1.0f };
	static const _MM_ALIGN16 float const3210[] = { 0.0f, 1.0f, 2.0f, 3.0f };
	static const _MM_ALIGN16 int iconst1[] = { 1, 1, 1, 1 };   // not referenced by the assembly below

	__asm
	{
		push		ebx
		push		edi
		push		esi

		lea			ebx, [space + 0xf]
		and			ebx, ~0xf  // aligned local space

		mov			edi, buf
		mov			edx, xmax

		mov			esi, ~0x0  // all-ones, used by the cmovnz below

		// set i and g since these do not change
		xorps		xmm7, xmm7
		mov			eax, gi
		cvtsi2ss	xmm7, eax
		shufps		xmm7, xmm7, _MM_SHUFFLE(0,0,0,0)  // broadcast g to all four lanes

		movss		xmm5, i
		shufps		xmm5, xmm5, _MM_SHUFFLE(0,0,0,0)  // broadcast i to all four lanes
		
		mulps		xmm7, xmm7		// g^2

		movss		xmm4, del  // 0,0,0,del
		shufps		xmm4, xmm4, _MM_SHUFFLE(0,0,0,0)
		mulps		xmm4, const3210  // del*3,del*2,del,0
		movhlps		xmm3, xmm4  // low half of xmm3 = del*3,del*2 (upper half is left as it was)

		movss		xmm6, r
		shufps		xmm6, xmm6, _MM_SHUFFLE(0,0,0,0)  // r
		addps		xmm6, xmm4  // r+del*3,r+del*2,r+del,r

		shufps		xmm3, xmm3, _MM_SHUFFLE(0,0,0,0)  // del*2,del*2,del*2,del*2
		addps		xmm3, xmm3  // del*4,del*4,del*4,del*4
		
		movaps		[ebx], xmm5  // i
		movaps		[ebx + 16], xmm3  // del*4,del*4,del*4,del*4
		pxor		xmm5, xmm5
		movdqa		[ebx + 32], xmm5  // iter (per-lane sums) = 0

		//do {
l_line_loop:
/*
		Scalar code the author kept for reference:

		int count = 0;
		//count = pointiter(i, r, g, t);
		{
			float a = 0.0f, b = 0.0f, c;

			do
			{
				c = a * a - b * b + r;
				b = a * b * 2 + i;
				a = c;
				count++;
			} while (fabs(a) <= fg && fabs(b) <= fg && count < t);
		}
*/			
		xorps		xmm5, xmm5		// count (kept as float, 1.0 is added per pass)
		xorps		xmm0, xmm0		// 2ab of the previous pass (0 at start)
		xorps		xmm1, xmm1		// a2 - b2 of the previous pass (0 at start)
		mov			ecx, t			// remaining passes

		//xorps		xmm4, xmm4
		movaps		xmm3, [ebx]		// i

		// At the top of every pass: xmm0 = 2ab, xmm1 = a2 - b2, xmm3 = i

l_iter_loop:
			addps		xmm0, xmm3		// 2ab + i == B
			movaps		xmm2, xmm7		// g2

			addps		xmm1, xmm6		// a2 - b2 + r == A
			movaps		xmm4, xmm0		// B

			mulps		xmm0, xmm1		// ab (= A*B)
			mulps		xmm1, xmm1		// a2 (= A*A)
			mulps		xmm4, xmm4		// b2 (= B*B)
			movaps		xmm3, const1

			cmpnleps	xmm2, xmm1		// mask: NOT (g2 <= A2)
			subps		xmm1, xmm4		// a2 - b2
			cmpleps		xmm4, xmm7		// mask: B2 <= g2
			sub			ecx, 1
			andps		xmm4, xmm2		// lane mask: both tests passed
			addps		xmm0, xmm0		// 2ab
			movmskps	eax, xmm4		// one bit per lane that passed

			andps		xmm4, xmm3		// 1.0 in lanes that passed, 0.0 elsewhere
			movaps		xmm3, [ebx]	// i
			test		eax, eax
			addps		xmm5, xmm4		// count += 1.0 in lanes that passed
			cmovnz		eax, esi		// eax = all-ones if any lane passed, else 0
			and			ecx, eax		// zero when no lane passed or no passes remain

			jnz			l_iter_loop

//l_break_iter:
		movdqa		xmm0, [ebx + 32]	// running per-lane sums
		sub			edx, 4  // note: nothing affects the flags until jg
		addps		xmm5, xmm4		// add the mask of the last pass once more
		cvtps2dq	xmm5, xmm5		// counters as int32

		//r += del;  (advance all four lanes by 4*del)
		addps		xmm6, [ebx + 16]
		
		//iter += count;
		paddd		xmm0, xmm5

		//packssdw	xmm5, xmm5
		
		/*ASSERT((count >= 1) && (count <= t));*/
		//buf[x] = count;  (four counters at once, unaligned store)
		movupd		[edi], xmm5
		movdqa		[ebx + 32], xmm0
	
		//x++;
		lea			edi, [edi + 4 * 4] /*sizeof(int) * 4*/

		//} while (x < xmax);  (flags still come from "sub edx, 4")
		jg			l_line_loop

		// horizontal sum of the four per-lane totals
		movq		xmm0, qword ptr [ebx + 32]		// lanes 1,0
		movq		xmm1, qword ptr [ebx + 32 + 8]	// lanes 3,2
		paddd		xmm0, xmm1					// lane1+lane3, lane0+lane2
		//pshufd		xmm1, xmm0, _MM_SHUFFLE(3,2,1,1)
		pshuflw		xmm1, xmm0, _MM_SHUFFLE(3,2,3,2)	// move dword 1 down to dword 0
		paddd		xmm0, xmm1					// dword 0 = total of all lanes

		movd		iter, xmm0

		// emms not needed
		pop			esi
		pop			edi
		pop			ebx
	}

	return iter;
}

// Exported wrapper with the common line-routine signature around the
// single-precision routine CalcRow_SSE2. All double arguments are narrowed to
// float before the call; the DWORD result is widened to __int64.
extern "C" __declspec  (dllexport)  __int64 Line_SSE2_SP4_C (const double r0, const double i0, const double dr, int xmax, const double g, int maxi, DWORD *pCount) 

{
	const float step  = (float)dr;
	const float real0 = (float)r0;
	const float imag0 = (float)i0;
	const float limit = (float)g;

	const DWORD iterations = CalcRow_SSE2(pCount, xmax, step, real0, imag0, limit, maxi);
	return iterations;
}

// One row with the out-of-order two-pipe SSE3 routine xLine_SSE3_DP.
//
// Sets the globals G2 = g*g and maxiter = maxi, lets xLine_SSE3_DP fill pCount
// and returns the sum of pCount[0 .. xmax-1].
// The sum is accumulated in 64 bits, matching the return type; it used to be a
// 32-bit int that could wrap for long rows or large iteration bounds.
extern "C" __declspec  (dllexport)  __int64 Line_SSE3_DP2_C (const double r0, const double i0, const double dr, int xmax, const double g, int maxi, DWORD *pCount) 
{
	
	G2=g*g; 
	maxiter=maxi;
   	//Out of Order Pipeline 
    xLine_SSE3_DP(r0, i0, dr, xmax, pCount);
    __int64 sumcount=0; 
    for (int j=0;j< xmax;j++) sumcount +=pCount[j]; 
	__asm emms; 
	return sumcount; 
}

// One row with the scalar SSE kernel SSE_cal_pixel_D1, one point per call.
//
// Sets the globals G2 = g*g and maxiter = maxi, then walks the row from real
// part r0 in steps of dr (accumulated by repeated addition), storing one
// counter per point in pCount. Returns the sum of all counters.
// The sum is accumulated in 64 bits, matching the return type; it used to be a
// 32-bit int that could wrap for long rows or large iteration bounds.
extern "C"_declspec (dllexport) __int64 Line_SSE_D_Scalar (const double r0, const double i0, const double dr, int xmax, const double g, int maxi,  DWORD *pCount) 
{
    G2=g*g; 
	maxiter=maxi;
	__int64 sumcount=0; 
	double r=r0;
	for (int j=0;j<xmax;j++) {
		SSE_cal_pixel_D1(r,i0,pCount);   // writes the counter to *pCount
		r+=dr; 
		sumcount += *pCount++;
	}
	return sumcount; 

}	

// Whole frame with the scalar SSE line routine, rows distributed over OpenMP
// threads.
//
// Row y uses the imaginary part i0 + di*y and writes its xmax counters starting
// at pCount[y*xmax]. Rows are handed out dynamically in chunks of 8; the
// per-row sums are combined with an OpenMP reduction and returned.
// deltap is not used.
extern "C"_declspec (dllexport) __int64 Frame_SSE_D_Scalar (const double r0, const double i0, const double dr, const double di, int xmax, int ymax, int deltap,  const double g, int maxi,  DWORD *pCount) 
{
	__int64 frameTotal = 0;

#pragma omp parallel for schedule(dynamic, 8) reduction(+: frameTotal)
	for (int row = 0; row < ymax; row++) {
		DWORD *rowCounts = &pCount[row*xmax];
		frameTotal += Line_SSE_D_Scalar(r0, i0+di*row, dr, xmax, g, maxi, rowCounts);
	}

	__asm emms; 
	return frameTotal; 
}

// Whole frame with a caller-supplied line routine, rows distributed over
// OpenMP threads.
//
// Line_Proc is called once per row y with the imaginary part i0 + di*y and the
// destination &pCount[y*xmax]. Rows are handed out dynamically in chunks of 8;
// the values returned by Line_Proc are combined with an OpenMP reduction and
// returned. deltap is not used.
extern "C"_declspec (dllexport) __int64 OpenMP_Frame (Line_D Line_Proc, const double r0, const double i0, const double dr, const double di, int xmax, int ymax, int deltap,  const double g, int maxi,  DWORD *pCount) 
{
	__int64 frameTotal = 0;

#pragma omp parallel for schedule(dynamic, 8) reduction(+: frameTotal)
	for (int row = 0; row < ymax; row++) {
		DWORD *rowCounts = &pCount[row*xmax];
		frameTotal += Line_Proc(r0, i0+di*row, dr, xmax, g, maxi, rowCounts);
	}

	__asm emms; 
	return frameTotal; 
}

// DLL entry point. None of the arguments is inspected and no setup or cleanup
// is performed; every notification is answered with TRUE.
BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
{
	const BOOL result = TRUE;
	return result;
}

