// mmtest.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"
#include <intrin.h> 
#include <Windows.h> 
#include <string>
#include <math.h>
#include <intrin.h>
#include <xmmintrin.h>
#ifdef MKL
#include <mkl.h>
#endif 
#include "compilerinfo.h"

// DIM0: extent of the statically allocated matrices; defaults to 1024 unless
// it has already been defined (e.g. on the compiler command line).
#ifndef DIM0 
#define DIM0 1024
#endif

// DIM: default problem size; falls back to DIM0 unless already defined.
#ifndef DIM 
#define DIM DIM0
#endif


#ifndef NOMSR
#include "mymsr.h"
#endif


// When true, run() prints the raw counter values it read.
bool verbose = false;






// Square matrix of doubles with the compile-time extent DIM0 in both dimensions.
typedef double mat[DIM0][DIM0];
// Statically allocated operands (a, b) and result (c), declared with 128-byte alignment.
__declspec(align(128)) mat a;
__declspec(align(128)) mat b;
__declspec(align(128)) mat c;

// Flat double* views onto the first element of each matrix; these are what the
// pointer-based kernels receive.
double* ap = *a; // = (double*) &a 
double* bp = *b; // = (double*) &b
double* cp = *c; // = (double*) &c

// Run-time matrix extent used by the kernels (initialised to DIM).
int dim = DIM;
// Row stride in elements handed to the pointer-based kernels (initialised to DIM0).
int dim0 = DIM0;

// Initialises the leading dim x dim part of three row-major matrices with row
// stride dim0: a and b receive the product of row and column index, c is cleared.
// Elements outside the dim x dim area are left untouched.
void fill(double *a, double *b, double *c, int dim, int dim0) {
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			const int pos = row*dim0 + col;
			a[pos] = row*col;
			b[pos] = row*col;
			c[pos] = 0;
		}
	}
}

// Returns the sum of the leading dim x dim elements of the row-major matrix c
// (row stride dim0), accumulated row by row.
double sumup(double * c, int dim, int dim0) {
	double total = 0.0;
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			total += c[row*dim0 + col];
		}
	}
	return total;
}


// Clears the leading dim x dim part of the global result matrix c,
// where dim is the global run-time extent.
void setNull() {
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}
}


//************************** with static global arrays, vari dim (global) <=DIM0 
#define mm_ijk 0
// c = a * b on the global matrices with the global run-time extent dim,
// loop nest ordered i, j, k.
void matrixmul_ijk() {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) { // author's compiler note: Intel: Perm+Vec
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int inner = 0; inner < dim; ++inner) {
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}

#define mm_jik 1
// c = a * b on the global matrices with the global run-time extent dim,
// loop nest ordered j, i, k.
void matrixmul_jik() {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int col = 0; col < dim; ++col) { // author's compiler note: Intel: Perm+Vec
		for (int row = 0; row < dim; ++row) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int inner = 0; inner < dim; ++inner) {
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}

#define mm_kij 2
// c = a * b on the global matrices with the global run-time extent dim,
// loop nest ordered k, i, j. (Author's timing note: 1,6T)
void matrixmul_kij() {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int inner = 0; inner < dim; ++inner) {
		for (int row = 0; row < dim; ++row) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int col = 0; col < dim; ++col) { // author's compiler note: MS: vector, Intel: Vec, matmul
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}

#define mm_ikj 3
// c = a * b on the global matrices with the global run-time extent dim,
// loop nest ordered i, k, j. (Author's timing note: 1,6 T)
void matrixmul_ikj() {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int row = 0; row < dim; ++row) {
		for (int inner = 0; inner < dim; ++inner) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int col = 0; col < dim; ++col) { // author's compiler note: MS: vector, Intel: vec, matmul
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}

#define mm_kji 4
// c = a * b on the global matrices with the global run-time extent dim,
// loop nest ordered k, j, i. (Author's timing note: 45,5T)
void matrixmul_kji() {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int inner = 0; inner < dim; ++inner) {
		for (int col = 0; col < dim; ++col) { // author's compiler note: Intel: Perm+Vec, matmul
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int row = 0; row < dim; ++row) {
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}

#define mm_jki 5
// c = a * b on the global matrices with the global run-time extent dim,
// loop nest ordered j, k, i. (Author's timing note: 45T)
void matrixmul_jki() {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int col = 0; col < dim; ++col) { // author's compiler note: Intel: Perm+Vec, matmul
		for (int inner = 0; inner < dim; ++inner) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int row = 0; row < dim; ++row) {
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}


//************************ with zeroing of c in main loop,static global arrays, vari dim <=DIM0 
#define mmX_ijk 6
// c = a * b on the global matrices (extent dim), loop order i, j, k.
// Each result element is cleared right before its dot product is accumulated,
// so there is no separate clearing pass. (Author's timing note: 20T)
void matrixmulX_ijk() {
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) { // author's compiler note: Intel: Perm+Vec, Partial, Matmul
			c[row][col] = 0;
#ifdef novec
#pragma loop (no_vector)
#endif
			for (int inner = 0; inner < dim; ++inner) {
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}
#define mmX_jik 7
// c = a * b on the global matrices (extent dim), loop order j, i, k.
// Each result element is cleared right before its dot product is accumulated.
void matrixmulX_jik() {
	for (int col = 0; col < dim; ++col) { // author's compiler note: Intel: Perm+Vec, Partial, Matmul
		for (int row = 0; row < dim; ++row) {
			c[row][col] = 0;
#ifdef novec
#pragma loop (no_vector)
#endif
			for (int inner = 0; inner < dim; ++inner) { // innermost loop
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}


//************************ with  static global arrays, dim fixed to DIM,
#define mmF_ijk 10
// c = a * b on the global matrices with the compile-time extent DIM,
// loop nest ordered i, j, k. (Author's timing note: 1,6 T)
void matrixmulF_ijk() {
	// Clear the result area first.
	for (int row = 0; row < DIM; ++row) {
		for (int col = 0; col < DIM; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int row = 0; row < DIM; ++row) {
		for (int col = 0; col < DIM; ++col) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int inner = 0; inner < DIM; ++inner) { // author's compiler note: MS: vector, Intel: Matmul
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}

#define mmF_jik 11
// c = a * b on the global matrices with the compile-time extent DIM,
// loop nest ordered j, i, k. (Author's timing note: 1,6 T)
void matrixmulF_jik() {
	// Clear the result area first.
	for (int row = 0; row < DIM; ++row) {
		for (int col = 0; col < DIM; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int col = 0; col < DIM; ++col) { // author's compiler note: Intel Perm+vec, matmul
		for (int row = 0; row < DIM; ++row) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int inner = 0; inner < DIM; ++inner) { // author's compiler note: MS: vector
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}

#define mmF_kij 12
// c = a * b on the global matrices with the compile-time extent DIM,
// loop nest ordered k, i, j. (Author's timing note: 1,6 T)
void matrixmulF_kij() {
	// Clear the result area first.
	for (int row = 0; row < DIM; ++row) {
		for (int col = 0; col < DIM; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int inner = 0; inner < DIM; ++inner) {
		for (int row = 0; row < DIM; ++row) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int col = 0; col < DIM; ++col) { // author's compiler note: MS: vector, Intel: Vec, Matmul
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}

#define mmF_ikj 13
// c = a * b on the global matrices with the compile-time extent DIM,
// loop nest ordered i, k, j. (Author's timing note: 1,6 T)
void matrixmulF_ikj() {
	// Clear the result area first.
	for (int row = 0; row < DIM; ++row) {
		for (int col = 0; col < DIM; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int row = 0; row < DIM; ++row) {
		for (int inner = 0; inner < DIM; ++inner) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int col = 0; col < DIM; ++col) { // author's compiler note: MS: vector, Intel: Vec, Matmul
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}

#define mmF_kji 14
// c = a * b on the global matrices with the compile-time extent DIM,
// loop nest ordered k, j, i. (Author's timing note: 1,6 T)
void matrixmulF_kji() {
	// Clear the result area first.
	for (int row = 0; row < DIM; ++row) {
		for (int col = 0; col < DIM; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int inner = 0; inner < DIM; ++inner) {
		for (int col = 0; col < DIM; ++col) { // author's compiler note: Intel: Perm+Vec
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int row = 0; row < DIM; ++row) { // author's compiler note: MS: vector
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}

#define mmF_jki 15
// c = a * b on the global matrices with the compile-time extent DIM,
// loop nest ordered j, k, i. (Author's timing note: 1,6 T)
void matrixmulF_jki() {
	// Clear the result area first.
	for (int row = 0; row < DIM; ++row) {
		for (int col = 0; col < DIM; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int col = 0; col < DIM; ++col) { // author's compiler note: Intel perm+vec, matmul
		for (int inner = 0; inner < DIM; ++inner) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int row = 0; row < DIM; ++row) {
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}


//****************************** Mul with transposition, static global arrays, vari dim <= DIM0
#define mmT_ijk 20 
// c = a * transpose(b) on the global matrices (extent dim): b is read row-wise
// as b[j][k]. Loop nest ordered i, j, k.
void matrixmulT_ijk() {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int inner = 0; inner < dim; ++inner) { // author's compiler note: Intel: Vec
				c[row][col] += a[row][inner] * b[col][inner];
			}
		}
	}
}

#define mmT_jik 21 
// c = a * transpose(b) on the global matrices (extent dim): b is read row-wise
// as b[j][k]. Loop nest ordered j, i, k.
void matrixmulT_jik() {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int col = 0; col < dim; ++col) {
		for (int row = 0; row < dim; ++row) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int inner = 0; inner < dim; ++inner) { // author's compiler note: Intel: Vec
				c[row][col] += a[row][inner] * b[col][inner];
			}
		}
	}
}

#define mmT_kij 22 
// c = a * transpose(b) on the global matrices (extent dim): b is read row-wise
// as b[j][k]. Loop nest ordered k, i, j.
void matrixmulT_kij() {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int inner = 0; inner < dim; ++inner) { // author's compiler note: Intel: Perm+Vec
		for (int row = 0; row < dim; ++row) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int col = 0; col < dim; ++col) {
				c[row][col] += a[row][inner] * b[col][inner];
			}
		}
	}
}

#define mmT_ikj 23
// c = a * transpose(b) on the global matrices (extent dim): b is read row-wise
// as b[j][k]. Loop nest ordered i, k, j.
void matrixmulT_ikj() {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int row = 0; row < dim; ++row) {
		for (int inner = 0; inner < dim; ++inner) { // author's compiler note: Intel: Perm+Vec
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int col = 0; col < dim; ++col) {
				c[row][col] += a[row][inner] * b[col][inner];
			}
		}
	}
}

#define mmT_kji 24
// c = a * transpose(b) on the global matrices (extent dim): b is read row-wise
// as b[j][k]. Loop nest ordered k, j, i.
void matrixmulT_kji() {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int inner = 0; inner < dim; ++inner) { // author's compiler note: Intel: Perm+Vec
		for (int col = 0; col < dim; ++col) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int row = 0; row < dim; ++row) {
				c[row][col] += a[row][inner] * b[col][inner];
			}
		}
	}
}

#define mmT_jki 25
// c = a * transpose(b) on the global matrices (extent dim): b is read row-wise
// as b[j][k]. Loop nest ordered j, k, i.
void matrixmulT_jki() {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int col = 0; col < dim; ++col) {
		for (int inner = 0; inner < dim; ++inner) { // author's compiler note: Intel: Perm+Vec
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int row = 0; row < dim; ++row) {
				c[row][col] += a[row][inner] * b[col][inner];
			}
		}
	}
}

#define mmTX_ijk 26
// c = a * transpose(b) on the global matrices (extent dim), loop order i, j, k.
// Each result element is cleared right before its dot product is accumulated.
//
// Fix: the inner loop used to start at k = 1 although the element is
// initialised to 0, which silently dropped the k = 0 product. That went
// unnoticed only because fill() makes column 0 of a all zeros.
void matrixmulTX_ijk() {
	for (int i = 0; i< dim; i++)
		for (int j = 0; j< dim; j++)
		{
			c[i][j] = 0.0;
#ifdef novec 
#pragma loop(no_vector)
#endif
			for (int k = 0; k < dim; k++)
				c[i][j] += a[i][k] * b[j][k]; // author's compiler note: Intel: NoVec
		}
}

#define mmTX_jik 27
// c = a * transpose(b) on the global matrices (extent dim), loop order j, i, k.
// Each result element is cleared right before its dot product is accumulated.
//
// Fix: the inner loop used to start at k = 1 although the element is
// initialised to 0, which silently dropped the k = 0 product. That went
// unnoticed only because fill() makes column 0 of a all zeros.
void matrixmulTX_jik() {
	for (int j = 0; j< dim; j++)
		for (int i = 0; i< dim; i++)
		{
			c[i][j] = 0.0;
#ifdef novec 
#pragma loop(no_vector)
#endif
			for (int k = 0; k < dim; k++) // author's compiler note: Intel: Vec
				c[i][j] += a[i][k] * b[j][k];
		}
}


//******************************  with static arrays as parameter, vari dim <= DIM0 
#define mmS_ijk 30
// c = a * b for matrices passed as parameters, restricted to the leading
// dim x dim part. Loop nest ordered i, j, k.
void matrixmulS_ijk(const mat a, const mat b, mat c, int dim) {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int inner = 0; inner < dim; ++inner) { // author's compiler note: Intel: Vec
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}

#define mmS_jik 31
// c = a * b for matrices passed as parameters, restricted to the leading
// dim x dim part. Loop nest ordered j, i, k. (Author's timing note: 1,6 T)
void matrixmulS_jik(const mat a, const mat b, mat c, int dim) {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int col = 0; col < dim; ++col) { // author's compiler note: Intel: Perm+Vec
		for (int row = 0; row < dim; ++row) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int inner = 0; inner < dim; ++inner) { // author's compiler note: MS: vector
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}

#define mmS_kij 32
// c = a * b for matrices passed as parameters, restricted to the leading
// dim x dim part. Loop nest ordered k, i, j. (Author's timing note: 1,6 T)
void matrixmulS_kij(const mat a, const mat b, mat c, int dim) {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int inner = 0; inner < dim; ++inner) {
		for (int row = 0; row < dim; ++row) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int col = 0; col < dim; ++col) { // author's compiler note: MS: vector, Intel: Vec
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}

#define mmS_ikj 33
// c = a * b for matrices passed as parameters, restricted to the leading
// dim x dim part. Loop nest ordered i, k, j. (Author's timing note: 1,6 T)
void matrixmulS_ikj(const mat a, const mat b, mat c, int dim) {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int row = 0; row < dim; ++row) {
		for (int inner = 0; inner < dim; ++inner) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int col = 0; col < dim; ++col) { // author's compiler note: MS: vector, Intel: Vec
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}

#define mmS_kji 34
// c = a * b for matrices passed as parameters, restricted to the leading
// dim x dim part. Loop nest ordered k, j, i. (Author's timing note: 1,6 T)
void matrixmulS_kji(const mat a, const mat b, mat c, int dim) {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int inner = 0; inner < dim; ++inner) {
		for (int col = 0; col < dim; ++col) { // author's compiler note: Intel: Perm+Vec
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int row = 0; row < dim; ++row) { // author's compiler note: MS: vector
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}

#define mmS_jki 35
// c = a * b for matrices passed as parameters, restricted to the leading
// dim x dim part. Loop nest ordered j, k, i. (Author's timing note: 1,6 T)
void matrixmulS_jki(const mat a, const mat b, mat c, int dim) {
	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int col = 0; col < dim; ++col) { // author's compiler note: Intel: Perm+Vec
		for (int inner = 0; inner < dim; ++inner) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int row = 0; row < dim; ++row) { // author's compiler note: MS: vector
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}


#define mmSX_ijk 36
// c = a * b for matrices passed as parameters (leading dim x dim part),
// loop order i, j, k. Each result element is cleared right before its dot
// product is accumulated.
void matrixmulSX_ijk(const mat a, const mat b, mat c, int dim) {
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0;
#ifdef novec
#pragma loop (no_vector)
#endif
			for (int inner = 0; inner < dim; ++inner) {
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}
#define mmSX_jik 37
// c = a * b for matrices passed as parameters (leading dim x dim part),
// loop order j, i, k. Each result element is cleared right before its dot
// product is accumulated.
void matrixmulSX_jik(mat a, mat b, mat c, int dim) {
	for (int col = 0; col < dim; ++col) {
		for (int row = 0; row < dim; ++row) {
			c[row][col] = 0;
#ifdef novec
#pragma loop (no_vector)
#endif
			for (int inner = 0; inner < dim; ++inner) {
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}

#define mmSP_ikj 38

//****************************** with tiling of second loop, static global arrays, vari dim <= DIM0
//****************************** restricted version (dim%n==0) to allow vectorization
#define mmB_ikj 40
// c = a * b on the global matrices, with the k loop tiled into blocks of n
// (loop order k-block, i, k, j).
//
// Parameters:
//   n   - block size for the k loop; must be positive.
//   dim - matrix extent to process.
// Returns 0 on success, 1 if the arguments are rejected. In the default
// (vectorizable) build, dim must be a multiple of n; the `novec` build clamps
// the last block with min() and accepts any dim.
//
// Changes against the previous version: the loop-invariant divisibility check
// was hoisted out of the loop nest (it ran once per (k0, i) pair, and only
// after c had already been cleared), and n <= 0 is rejected instead of
// dividing by zero or stepping the block loop by a non-positive amount.
int matrixmulB_ikj(int n, int dim) { //
	if (n <= 0) return 1;
#ifndef novec
	if (dim%n != 0) return 1;           // simple variant: only whole blocks
#endif

	for (int i = 0; i< dim; i++)
		for (int j = 0; j< dim; j++)
			c[i][j] = 0.0;


	for (int k0 = 0; k0 < dim; k0 += n)
		for (int i = 0; i < dim; i++) {

#ifdef novec 
			for (int k = k0; k<min(k0 + n, dim); k++)
#pragma loop(no_vector)
#else 
			for (int k = k0; k < k0 + n; k++)   // fixed trip count for the vectorizer
#endif
				for (int j = 0; j < dim; j++)  // author's compiler note: MS: Vector, Intel: Vec
					c[i][j] += a[i][k] * b[k][j];

		}
	return 0;
}


//****************************** with tiling of all loops, static global arrays, vari dim <= DIM0
//****************************** restricted version (dim%n==0) to allow vectorization
#define mmN_ijk 50
// c = a * b on the global matrices with all three loops tiled into n-sized
// blocks; block order i0, j0, k0 and element order i, j, k inside a block.
// Returns 1 without touching c when dim is not a multiple of n, else 0.
int matrixmulN_ijk(int n, int dim) {
	if (dim%n != 0) {
		return 1;
	}

	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int rowBlk = 0; rowBlk < dim; rowBlk += n) {
		for (int colBlk = 0; colBlk < dim; colBlk += n) {
			for (int innerBlk = 0; innerBlk < dim; innerBlk += n) {
				for (int row = rowBlk; row < rowBlk + n; ++row) {
					for (int col = colBlk; col < colBlk + n; ++col) {
#ifdef novec
#pragma loop(no_vector)
#endif
						for (int inner = innerBlk; inner < innerBlk + n; ++inner) { // author's compiler note: Intel: Vec
							c[row][col] += a[row][inner] * b[inner][col];
						}
					}
				}
			}
		}
	}
	return 0;
}

#define mmN_ikj 51
// c = a * b on the global matrices with all three loops tiled into n-sized
// blocks; block order i0, k0, j0 and element order i, k, j inside a block.
// Simple variant: only whole divisors of dim are accepted as n. Returns 1
// without touching c when dim is not a multiple of n, else 0.
int matrixmulN_ikj(int n, int dim) {
	if (dim%n != 0) {
		return 1;
	}

	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int rowBlk = 0; rowBlk < dim; rowBlk += n) {
		for (int innerBlk = 0; innerBlk < dim; innerBlk += n) {
			for (int colBlk = 0; colBlk < dim; colBlk += n) {
				for (int row = rowBlk; row < rowBlk + n; ++row) {
					for (int inner = innerBlk; inner < innerBlk + n; ++inner) {
#ifdef novec
#pragma loop(no_vector)
#endif
						for (int col = colBlk; col < colBlk + n; ++col) { // author's compiler note: MS vector, Intel: Vec
							c[row][col] += a[row][inner] * b[inner][col];
						}
					}
				}
			}
		}
	}
	return 0;
}

#define mmN2_ikj 52
// c = a * b on the global matrices with the i and k loops tiled into n-sized
// blocks while the j loop always runs over the full row; order i0, k0, i, k, j.
// Simple variant: only whole divisors of dim are accepted as n. Returns 1
// without touching c when dim is not a multiple of n, else 0.
int matrixmulN2_ikj(int n, int dim) {
	if (dim%n != 0) {
		return 1;
	}

	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

	for (int rowBlk = 0; rowBlk < dim; rowBlk += n) {
		for (int innerBlk = 0; innerBlk < dim; innerBlk += n) {
			for (int row = rowBlk; row < rowBlk + n; ++row) {
				for (int inner = innerBlk; inner < innerBlk + n; ++inner) {
#ifdef novec
#pragma loop(no_vector)
#endif
					for (int col = 0; col < dim; ++col) { // author's compiler note: MS vector, Intel: Vec
						c[row][col] += a[row][inner] * b[inner][col];
					}
				}
			}
		}
	}
	return 0;
}


//****************************** with OpenMP vari dim <= DIM0
#define mmSOS_ikj 60
// c = a * b for matrices passed as parameters (leading dim x dim part),
// loop order i, k, j. Both the clearing pass and the multiplication carry an
// OpenMP "parallel for" on the outer i loop; the multiplication requests
// static scheduling. (Author's timing note: 1,6 T)
void matrixmulSOS_ikj(mat a, mat b, mat c, int dim) {
#pragma omp parallel for 
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

#pragma omp parallel for schedule (static)
	for (int row = 0; row < dim; ++row) {
		for (int inner = 0; inner < dim; ++inner) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int col = 0; col < dim; ++col) {
				// author's compiler note: MS: No autovectorization in openMP for, Intel: Vec
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}

#define mmSOD_ikj 61
// c = a * b for matrices passed as parameters (leading dim x dim part),
// loop order i, k, j. Both the clearing pass and the multiplication carry an
// OpenMP "parallel for" on the outer i loop; the multiplication requests
// dynamic scheduling. (Author's timing note: 1,6 T)
void matrixmulSOD_ikj(const mat a, const mat b, mat c, int dim) {
#pragma omp parallel for 
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row][col] = 0.0;
		}
	}

#pragma omp parallel for schedule (dynamic)
	for (int row = 0; row < dim; ++row) {
		for (int inner = 0; inner < dim; ++inner) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int col = 0; col < dim; ++col) {
				// author's compiler note: MS: No autovectorization in openMP for, Intel: Vec
				c[row][col] += a[row][inner] * b[inner][col];
			}
		}
	}
}


#define mmBOS_ikj 62
// c = a * b on the global matrices with the k loop tiled into blocks of n and
// OpenMP static scheduling (order k-block, i, k, j). The last block is clamped
// with min(), so dim need not be a multiple of n.
//
// Parameters:
//   n   - block size for the k loop; nothing is computed when n <= 0.
//   dim - matrix extent to process.
//
// Fix: the "parallel for" used to sit on the k-block loop, so different
// threads accumulated into the same c[i][j] concurrently (a data race with
// nondeterministic results). The work-sharing now happens on the i loop, where
// every thread owns whole rows of c; the per-element accumulation order over k
// is the same as in a sequential run. n <= 0 is rejected instead of dividing
// by zero.
void matrixmulBOS_ikj(int n, int dim) { //
	for (int i = 0; i< dim; i++)
		for (int j = 0; j< dim; j++)
			c[i][j] = 0.0;

	if (n <= 0) return;

	for (int kx = 0; kx <= dim / n; kx++) {
		const int kBegin = n*kx;
		const int kEnd = min(n*kx + n, dim); // clamp the trailing partial block
#pragma omp parallel for schedule (static)
		for (int i = 0; i < dim; i++)
			for (int k = kBegin; k < kEnd; k++)
#ifdef novec 
#pragma loop(no_vector)
#endif
				for (int j = 0; j< dim; j++)  // author's compiler note: MS: Vector, Intel: Vec
					c[i][j] += a[i][k] * b[k][j];
	}
}

#define mmBOD_ikj 63
// c = a * b on the global matrices with the k loop tiled into blocks of n and
// OpenMP dynamic scheduling (order k-block, i, k, j). The last block is
// clamped with min(), so dim need not be a multiple of n.
//
// Parameters:
//   n   - block size for the k loop; nothing is computed when n <= 0.
//   dim - matrix extent to process.
//
// Fixes:
//  * The "parallel for" used to sit on the k-block loop, so different threads
//    accumulated into the same c[i][j] concurrently (a data race). The
//    work-sharing now happens on the i loop, where every thread owns whole
//    rows of c.
//  * The body was a verbatim copy of the static variant; the schedule is now
//    dynamic, matching the SOS/SOD pair.
//  * n <= 0 is rejected instead of dividing by zero.
void matrixmulBOD_ikj(int n, int dim) { //
	for (int i = 0; i< dim; i++)
		for (int j = 0; j< dim; j++)
			c[i][j] = 0.0;

	if (n <= 0) return;

	for (int kx = 0; kx <= dim / n; kx++) {
		const int kBegin = n*kx;
		const int kEnd = min(n*kx + n, dim); // clamp the trailing partial block
#pragma omp parallel for schedule (dynamic)
		for (int i = 0; i < dim; i++)
			for (int k = kBegin; k < kEnd; k++)
#ifdef novec 
#pragma loop(no_vector)
#endif
				for (int j = 0; j< dim; j++)  // author's compiler note: MS: Vector, Intel: Vec
					c[i][j] += a[i][k] * b[k][j];
	}
}



//****************************** all dynamic, with pointers to arrays, for dim=dim0

#define mmP_ijk 70
// c = a * b for densely packed row-major dim x dim matrices given as flat
// pointers, loop order i, j, k. Does nothing unless dim == dim0.
void matrixmulP_ijk(double *a, double *b, double *c, int dim, int dim0) {
	if (dim != dim0) {
		return;
	}

	// Clear the result first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row*dim + col] = 0.0;
		}
	}

	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int mid = 0; mid < dim; ++mid) { // author's compiler note: MS: noVec, Intel Vec: Vmulpd
				c[row*dim + col] += a[row*dim + mid] * b[mid*dim + col];
			}
		}
	}
}

#define mmP_jik 71
// c = a * b for densely packed row-major dim x dim matrices given as flat
// pointers, loop order j, i, k. Does nothing unless dim == dim0.
void matrixmulP_jik(double *a, double *b, double *c, int dim, int dim0) {
	if (dim != dim0) {
		return;
	}

	// Clear the result first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row*dim + col] = 0.0;
		}
	}

	for (int col = 0; col < dim; ++col) {
		for (int row = 0; row < dim; ++row) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int mid = 0; mid < dim; ++mid) { // author's compiler note: MS: noVec, Intel Vec: vmulpd
				c[row*dim + col] += a[row*dim + mid] * b[mid*dim + col];
			}
		}
	}
}

#define mmP_kij 72
// c = a * b for densely packed row-major dim x dim matrices given as flat
// pointers, loop order k, i, j. Does nothing unless dim == dim0.
void matrixmulP_kij(double *a, double *b, double *c, int dim, int dim0) {
	if (dim != dim0) {
		return;
	}

	// Clear the result first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row*dim + col] = 0.0;
		}
	}

	for (int mid = 0; mid < dim; ++mid) {
		for (int row = 0; row < dim; ++row) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int col = 0; col < dim; ++col) { // author's compiler note: MS: noVec, Intel Vec: vfmadd213pd
				c[row*dim + col] += a[row*dim + mid] * b[mid*dim + col];
			}
		}
	}
}

#define mmP_ikj 73
// c = a * b for densely packed row-major dim x dim matrices given as flat
// pointers, loop order i, k, j. Does nothing unless dim == dim0.
void matrixmulP_ikj(double *a, double *b, double *c, int dim, int dim0) {
	if (dim != dim0) {
		return;
	}

	// Clear the result first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row*dim + col] = 0.0;
		}
	}

	for (int row = 0; row < dim; ++row) {
		for (int mid = 0; mid < dim; ++mid) {
#ifdef novec
#pragma loop (no_vector)
#endif
			for (int col = 0; col < dim; ++col) { // author's compiler note: MS: noVec, Intel Vec: vfmadd213pd
				c[row*dim + col] += a[row*dim + mid] * b[mid*dim + col];
			}
		}
	}
}

#define mmP_kji 74
// c = a * b for densely packed row-major dim x dim matrices given as flat
// pointers, loop order k, j, i. Does nothing unless dim == dim0.
//
// Consistency fix: the clearing pass indexed c with stride dim0 while the
// multiplication (and every sibling mmP_* kernel) uses dim. The two are equal
// past the guard, so results are unchanged; the stride is now uniform.
void matrixmulP_kji(double *a, double *b, double *c, int dim, int dim0) {
	if (dim != dim0) return;
	for (int i = 0; i< dim; i++)
		for (int j = 0; j< dim; j++)
			c[i*dim + j] = 0.0;

	for (int k = 0; k < dim; k++)
		for (int j = 0; j < dim; j++)
#ifdef novec 
#pragma loop (no_vector)
#endif
			for (int i = 0; i < dim; i++) // author's compiler note: MS: noVec, Intel noVec
				c[i*dim + j] += a[i*dim + k] * b[k*dim + j];
}

#define mmP_jki 75
// c = a * b for densely packed row-major dim x dim matrices given as flat
// pointers, loop order j, k, i. Does nothing unless dim == dim0.
void matrixmulP_jki(double *a, double *b, double *c, int dim, int dim0) {
	if (dim != dim0) {
		return;
	}

	// Clear the result first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row*dim + col] = 0.0;
		}
	}

	for (int col = 0; col < dim; ++col) {
		for (int mid = 0; mid < dim; ++mid) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int row = 0; row < dim; ++row) { // author's compiler note: MS: noVec, Intel noVec
				c[row*dim + col] += a[row*dim + mid] * b[mid*dim + col];
			}
		}
	}
}

//****************************** all dynamic, with pointers to arrays, with dim <= dim0 (variable)
#define mmP0_ijk 80
// c = a * b for row-major matrices given as flat pointers with row stride
// dim0, restricted to the leading dim x dim part. Loop order i, j, k.
// Does nothing when dim > dim0.
//
// Fix: the result used to be cleared through setNull(), which clears the
// GLOBAL matrix c using the GLOBAL dim rather than the buffer and extent
// passed in, so any other output buffer was accumulated into without being
// cleared. The local guard and clearing pass (previously commented out) are
// restored, and the `novec` pragma is added to match the sibling mmP0_* kernels.
void matrixmulP0_ijk(const double *a, const double *b, double *c, int dim, int dim0) {
	if (dim > dim0) return;
	for (int i = 0; i< dim; i++)
		for (int j = 0; j< dim; j++)
			c[i*dim0 + j] = 0.0;

	for (int i = 0; i < dim; i++)
		for (int j = 0; j < dim; j++)
#ifdef novec 
#pragma loop(no_vector)
#endif
			for (int k = 0; k < dim; k++) // author's compiler note: MS: noVec, Intel Vec: Vmulpd
				c[i*dim0 + j] += a[i*dim0 + k] * b[k*dim0 + j];
}

#define mmP0_jik 81
// c = a * b for row-major matrices given as flat pointers with row stride
// dim0, restricted to the leading dim x dim part. Loop order j, i, k.
// Does nothing when dim > dim0.
void matrixmulP0_jik(double *a, double *b, double *c, int dim, int dim0) {
	if (dim > dim0) {
		return;
	}

	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row*dim0 + col] = 0.0;
		}
	}

	for (int col = 0; col < dim; ++col) {
		for (int row = 0; row < dim; ++row) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int mid = 0; mid < dim; ++mid) { // author's compiler note: MS: noVec, Intel Vec: vmulpd
				c[row*dim0 + col] += a[row*dim0 + mid] * b[mid*dim0 + col];
			}
		}
	}
}

#define mmP0_kij 82
// c = a * b for row-major matrices given as flat pointers with row stride
// dim0, restricted to the leading dim x dim part. Loop order k, i, j.
// Does nothing when dim > dim0.
void matrixmulP0_kij(double *a, double *b, double *c, int dim, int dim0) {
	if (dim > dim0) {
		return;
	}

	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row*dim0 + col] = 0.0;
		}
	}

	for (int mid = 0; mid < dim; ++mid) {
		for (int row = 0; row < dim; ++row) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int col = 0; col < dim; ++col) { // author's compiler note: MS: noVec, Intel Vec: vfmadd213pd
				c[row*dim0 + col] += a[row*dim0 + mid] * b[mid*dim0 + col];
			}
		}
	}
}

#define mmP0_ikj 83
// c = a * b for row-major matrices given as flat pointers with row stride
// dim0, restricted to the leading dim x dim part. Loop order i, k, j.
// Does nothing when dim > dim0.
void matrixmulP0_ikj(double *a, double *b, double *c, int dim, int dim0) {
	if (dim > dim0) {
		return;
	}

	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row*dim0 + col] = 0.0;
		}
	}

	for (int row = 0; row < dim; ++row) {
		for (int mid = 0; mid < dim; ++mid) {
#ifdef novec
#pragma loop (no_vector)
#endif
			for (int col = 0; col < dim; ++col) { // author's compiler note: MS: noVec, Intel Vec: vfmadd213pd
				c[row*dim0 + col] += a[row*dim0 + mid] * b[mid*dim0 + col];
			}
		}
	}
}

#define mmP0_kji 84
// c = a * b for row-major matrices given as flat pointers with row stride
// dim0, restricted to the leading dim x dim part. Loop order k, j, i.
// Does nothing when dim > dim0.
void matrixmulP0_kji(double *a, double *b, double *c, int dim, int dim0) {
	if (dim > dim0) {
		return;
	}

	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row*dim0 + col] = 0.0;
		}
	}

	for (int mid = 0; mid < dim; ++mid) {
		for (int col = 0; col < dim; ++col) {
#ifdef novec
#pragma loop (no_vector)
#endif
			for (int row = 0; row < dim; ++row) { // author's compiler note: MS: noVec, Intel noVec
				c[row*dim0 + col] += a[row*dim0 + mid] * b[mid*dim0 + col];
			}
		}
	}
}

#define mmP0_jki 85
// c = a * b for row-major matrices given as flat pointers with row stride
// dim0, restricted to the leading dim x dim part. Loop order j, k, i.
// Does nothing when dim > dim0.
void matrixmulP0_jki(double *a, double *b, double *c, int dim, int dim0) {
	if (dim > dim0) {
		return;
	}

	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row*dim0 + col] = 0.0;
		}
	}

	for (int col = 0; col < dim; ++col) {
		for (int mid = 0; mid < dim; ++mid) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int row = 0; row < dim; ++row) { // author's compiler note: MS: noVec, Intel noVec
				c[row*dim0 + col] += a[row*dim0 + mid] * b[mid*dim0 + col];
			}
		}
	}
}


#define mmPP_ikj 86

// c = a * b for row-major matrices given as flat pointers with row stride
// dim0, restricted to the leading dim x dim part. Loop order i, k, j with the
// innermost loop written as explicit pointer walks over a row of c and a row
// of b. Does nothing when dim > dim0.
void matrixmulPP_ikj(double *a, double* b, double* c, int dim, int dim0) {
	if (dim > dim0) {
		return;
	}

	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row*dim0 + col] = 0.0;
		}
	}

	for (int row = 0; row < dim; ++row) {
		const int rowStart = row*dim0;
		double* const outRow = &c[rowStart];
		for (int mid = 0; mid < dim; ++mid) {
			double* dst = outRow;          // walks along row `row` of c
			double* src = &b[mid*dim0];    // walks along row `mid` of b
			const double scale = a[rowStart + mid];
#ifdef novec
#pragma loop (no_vector)
#endif
			for (int col = 0; col < dim; ++col) {
				*dst += scale * *src;
				++dst;
				++src;
			}
		}
	}
}


#define mmPR_ijk 90
// c = a * b for row-major matrices given as __restrict-qualified flat pointers
// with row stride dim0, restricted to the leading dim x dim part.
// Loop order i, j, k. Does nothing when dim > dim0.
// NOTE(review): of the mmPR_* kernels only this one carries an OpenMP
// "parallel for" on the multiplication loop - confirm that this is intended.
void matrixmulPR_ijk(const double*  __restrict a, const double * __restrict b, double* __restrict c, int dim, int dim0) {
	if (dim > dim0) {
		return;
	}

	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row*dim0 + col] = 0.0;
		}
	}

#pragma omp parallel for
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int mid = 0; mid < dim; ++mid) { // author's compiler note: MS: noVec, Intel Vec: Vmulpd
				c[row*dim0 + col] += a[row*dim0 + mid] * b[mid*dim0 + col];
			}
		}
	}
}

#define mmPR_jik 91
// c = a * b for row-major matrices given as __restrict-qualified flat pointers
// with row stride dim0, restricted to the leading dim x dim part.
// Loop order j, i, k. Does nothing when dim > dim0.
void matrixmulPR_jik(const double*  __restrict a, const double * __restrict b, double* __restrict c, int dim, int dim0) {
	if (dim > dim0) {
		return;
	}

	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row*dim0 + col] = 0.0;
		}
	}

	for (int col = 0; col < dim; ++col) {
		for (int row = 0; row < dim; ++row) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int mid = 0; mid < dim; ++mid) { // author's compiler note: MS: noVec, Intel Vec: vmulpd
				c[row*dim0 + col] += a[row*dim0 + mid] * b[mid*dim0 + col];
			}
		}
	}
}

#define mmPR_kij 92
// c = a * b for row-major matrices given as __restrict-qualified flat pointers
// with row stride dim0, restricted to the leading dim x dim part.
// Loop order k, i, j. Does nothing when dim > dim0.
void matrixmulPR_kij(const double*  __restrict a, const double * __restrict b, double* __restrict c, int dim, int dim0) {
	if (dim > dim0) {
		return;
	}

	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row*dim0 + col] = 0.0;
		}
	}

	for (int mid = 0; mid < dim; ++mid) {
		for (int row = 0; row < dim; ++row) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int col = 0; col < dim; ++col) { // author's compiler note: MS: noVec, Intel Vec: vfmadd213pd
				c[row*dim0 + col] += a[row*dim0 + mid] * b[mid*dim0 + col];
			}
		}
	}
}

#define mmPR_ikj 93
// c = a * b for row-major matrices given as __restrict-qualified flat pointers
// with row stride dim0, restricted to the leading dim x dim part.
// Loop order i, k, j. Does nothing when dim > dim0.
void matrixmulPR_ikj(const double*  __restrict a, const double * __restrict b, double* __restrict c, int dim, int dim0) {
	if (dim > dim0) {
		return;
	}

	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row*dim0 + col] = 0.0;
		}
	}

	for (int row = 0; row < dim; ++row) {
		for (int mid = 0; mid < dim; ++mid) {
#ifdef novec
#pragma loop (no_vector)
#endif
			for (int col = 0; col < dim; ++col) { // author's compiler note: MS: noVec, Intel Vec: vfmadd213pd
				c[row*dim0 + col] += a[row*dim0 + mid] * b[mid*dim0 + col];
			}
		}
	}
}

#define mmPR_kji 94
// c = a * b for row-major matrices given as __restrict-qualified flat pointers
// with row stride dim0, restricted to the leading dim x dim part.
// Loop order k, j, i. Does nothing when dim > dim0.
void matrixmulPR_kji(const double*  __restrict a, const double * __restrict b, double* __restrict c, int dim, int dim0) {
	if (dim > dim0) {
		return;
	}

	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row*dim0 + col] = 0.0;
		}
	}

	for (int mid = 0; mid < dim; ++mid) {
		for (int col = 0; col < dim; ++col) {
#ifdef novec
#pragma loop (no_vector)
#endif
			for (int row = 0; row < dim; ++row) { // author's compiler note: MS: noVec, Intel noVec
				c[row*dim0 + col] += a[row*dim0 + mid] * b[mid*dim0 + col];
			}
		}
	}
}

#define mmPR_jki 95
// c = a * b for row-major matrices given as __restrict-qualified flat pointers
// with row stride dim0, restricted to the leading dim x dim part.
// Loop order j, k, i. Does nothing when dim > dim0.
void matrixmulPR_jki(const double*  __restrict a, const double * __restrict b, double* __restrict c, int dim, int dim0) {
	if (dim > dim0) {
		return;
	}

	// Clear the result area first.
	for (int row = 0; row < dim; ++row) {
		for (int col = 0; col < dim; ++col) {
			c[row*dim0 + col] = 0.0;
		}
	}

	for (int col = 0; col < dim; ++col) {
		for (int mid = 0; mid < dim; ++mid) {
#ifdef novec
#pragma loop(no_vector)
#endif
			for (int row = 0; row < dim; ++row) { // author's compiler note: MS: noVec, Intel noVec
				c[row*dim0 + col] += a[row*dim0 + mid] * b[mid*dim0 + col];
			}
		}
	}
}






//****************************** with Library, pointer to (dynamic) arrays, vari dim <= DIM0
#define mm_DGEMM 100
#ifdef MKL
// Library reference kernel: forwards to cblas_dgemm with row-major layout,
// neither operand transposed, M = N = K = dim, leading dimension dim0 for all
// three matrices, alpha = 1 and beta = 0.
void matrixmulDGEMM(double *a, double *b, double *c, int dim, int dim0) {
	const double alpha = 1.0; // scale factor applied to the product a * b
	const double beta = 0.0;  // scale factor applied to the previous contents of c

	cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
		dim, dim, dim,   // M, N, K
		alpha,
		a, dim0,         // leading dimension = distance between consecutive rows, in elements
		b, dim0,
		beta,
		c, dim0);
}




#endif




bool run(int benchnr, int rep, long long &clocks, double &time, double &afreq) {
	int reg[4] = { -1 };
	unsigned int id;
	unsigned long long t0, t1;
	LARGE_INTEGER Qa, Qe, Qf;
	bool result = true;
	SetThreadAffinityMask(GetCurrentThread(), 1);
#ifndef NOMSR
	uint64  CLK_UNHALTED_REFa = 0;
	uint64  CLK_UNHALTED_THREADa = 0;
	uint64  CLK_UNHALTED_REFe = 0;
	uint64  CLK_UNHALTED_THREADe = 0;
	readMSR(CPU_CLK_UNHALTED_REF_ADDR, &CLK_UNHALTED_REFa);
	readMSR(CPU_CLK_UNHALTED_THREAD_ADDR, &CLK_UNHALTED_THREADa);

#endif 

	QueryPerformanceCounter(&Qa);
	t0 = __rdtsc();
	__cpuid(reg, 1);
	SetThreadAffinityMask(GetCurrentThread(), 0xff);
	for (int r = 0; r < rep; r++) {
		switch (benchnr) {



			//Autovectorization Default Intel 14   
		case mm_ijk: matrixmul_ijk(); break; //PERMUTED LOOP WAS VECTORIZED.
		case mm_jik: matrixmul_jik(); break; //PERMUTED LOOP WAS VECTORIZED.
		case mm_kij: matrixmul_kij(); break; //LOOP WAS VECTORIZED.
		case mm_ikj: matrixmul_ikj(); break; //LOOP WAS VECTORIZED.
		case mm_kji: matrixmul_kji(); break; //PERMUTED LOOP WAS VECTORIZED.
		case mm_jki: matrixmul_jki(); break; //PERMUTED LOOP WAS VECTORIZED.

		case mmX_ijk: matrixmulX_ijk(); break;    //PERMUTED LOOP WAS VECTORIZED.
		case mmX_jik: matrixmulX_jik(); break;    //PERMUTED LOOP WAS VECTORIZED.

		case mmF_ijk: if (dim != DIM) return false;  //only for dim=DIM 
			matrixmulF_ikj(); break;
		case mmF_jik: if (dim != DIM) return false;  //only for dim=DIM 
			matrixmulF_jik(); break;
		case mmF_kij: if (dim != DIM) return false;  //only for dim=DIM 
			matrixmulF_kij(); break;
		case mmF_ikj: if (dim != DIM) return false;  //only for dim=DIM 
			matrixmulF_ikj(); break;
		case mmF_kji: if (dim != DIM) return false;  //only for dim=DIM 
			matrixmulF_kji(); break;
		case mmF_jki: if (dim != DIM) return false;  //only for dim=DIM 
			matrixmulF_jki(); break;

		case mmS_ijk: matrixmulS_ijk(a, b, c, dim); break;
		case mmS_jik: matrixmulS_jik(a, b, c, dim); break;
		case mmS_kij: matrixmulS_kij(a, b, c, dim); break;
		case mmS_ikj: matrixmulS_ikj(a, b, c, dim); break;
		case mmS_kji: matrixmulS_kji(a, b, c, dim); break;
		case mmS_jki: matrixmulS_jki(a, b, c, dim); break;
		case mmSX_ijk: matrixmulSX_ijk(a, b, c, dim); break;
		case mmSX_jik: matrixmulSX_jik(a, b, c, dim); break;

		case mmB_ikj: matrixmulB_ikj(2, dim); break;
		case mmB_ikj + 1: matrixmulB_ikj(4, DIM); break;
		case mmB_ikj + 2: matrixmulB_ikj(8, DIM); break;
		case mmB_ikj + 3: matrixmulB_ikj(16, DIM); break;
		case mmB_ikj + 4: matrixmulB_ikj(32, DIM); break;

		case mmN_ijk: matrixmulN_ijk(32, dim); break;
		case mmN_ikj: matrixmulN_ikj(32, dim); break;
		case mmN2_ikj: matrixmulN2_ikj(32, dim); break;
		case mmSOS_ikj: matrixmulSOS_ikj(a, b, c, dim); break;
		case mmSOD_ikj: matrixmulSOD_ikj(a, b, c, dim); break;

		case mmBOS_ikj: matrixmulBOS_ikj(16, dim); break;
		case mmBOD_ikj: matrixmulBOD_ikj(16, dim); break;

		case mmT_ijk: matrixmulT_ijk(); break;
		case mmT_jik: matrixmulT_jik(); break;
		case mmT_kij: matrixmulT_kij(); break;
		case mmT_ikj: matrixmulT_ikj(); break;
		case mmT_kji: matrixmulT_kji(); break;
		case mmT_jki: matrixmulT_jki(); break;
		case mmTX_ijk:matrixmulTX_ijk(); break;
		case mmTX_jik:matrixmulTX_jik(); break;


#ifdef MKL
		case mm_DGEMM: matrixmulDGEMM(ap, bp, cp, dim, dim0); break;
#endif
		case mmP_ijk: matrixmulP_ijk(ap, bp, cp, dim, dim0); break;
		case mmP_jik: matrixmulP_jik(ap, bp, cp, dim, dim0); break;
		case mmP_jki: matrixmulP_jki(ap, bp, cp, dim, dim0); break;
		case mmP_ikj: matrixmulP_ikj(ap, bp, cp, dim, dim0); break;
		case mmP_kji: matrixmulP_kji(ap, bp, cp, dim, dim0); break;
		case mmP_kij: matrixmulP_kij(ap, bp, cp, dim, dim0); break;

		case mmP0_ijk: matrixmulP0_ijk(ap, bp, cp, dim, dim0); break;
		case mmP0_jik: matrixmulP0_jik(ap, bp, cp, dim, dim0); break;
		case mmP0_jki: matrixmulP0_jki(ap, bp, cp, dim, dim0); break;
		case mmP0_ikj: matrixmulP0_ikj(ap, bp, cp, dim, dim0); break;
		case mmP0_kji: matrixmulP0_kji(ap, bp, cp, dim, dim0); break;
		case mmP0_kij: matrixmulP0_kij(ap, bp, cp, dim, dim0); break;

		case mmPR_ijk: matrixmulPR_ijk(ap, bp, cp, dim, dim0); break;
		case mmPR_jik: matrixmulPR_jik(ap, bp, cp, dim, dim0); break;
		case mmPR_jki: matrixmulPR_jki(ap, bp, cp, dim, dim0); break;
		case mmPR_ikj: matrixmulPR_ikj(ap, bp, cp, dim, dim0); break;
		case mmPR_kji: matrixmulPR_kji(ap, bp, cp, dim, dim0); break;
		case mmPR_kij: matrixmulPR_kij(ap, bp, cp, dim, dim0); break;

		case mmSP_ikj: matrixmulS_ikj(*(mat*)ap, *(mat*)bp, *(mat*)cp, dim); break;
		case mmPP_ikj: matrixmulPP_ikj(ap, bp, cp, dim, dim0); break;

		default: result = false;
		}
	}
	SetThreadAffinityMask(GetCurrentThread(), 1);
	t1 = __rdtscp(&id);
	__cpuid(reg, 0);
	QueryPerformanceCounter(&Qe);
	QueryPerformanceFrequency(&Qf);
#ifndef NOMSR
	readMSR(CPU_CLK_UNHALTED_REF_ADDR, &CLK_UNHALTED_REFe);
	readMSR(CPU_CLK_UNHALTED_THREAD_ADDR, &CLK_UNHALTED_THREADe);
	afreq = (double)(CLK_UNHALTED_THREADe - CLK_UNHALTED_THREADa) / (double)(CLK_UNHALTED_REFe - CLK_UNHALTED_REFa);
	if (verbose) printf("CLK unhalted %I64d %I64d ref=%I64d %I64d\n", CLK_UNHALTED_THREADe, CLK_UNHALTED_THREADa,
		CLK_UNHALTED_REFe, CLK_UNHALTED_REFa);

#else
	afreq = 1.4; // Defaultwert fr i7-4750HQ, 
#endif
	clocks = (t1 - t0);
	time = (double)(Qe.QuadPart - Qa.QuadPart) / (double)Qf.QuadPart;
	return result;
}


int _tmain(int argc, _TCHAR* argv[])
{

	volatile __int64 dummy = 0;
	bool skip = true;
	int benchstart = 80;
	int benchend = 80;
	int minsecs = 1000; // 1000 ms 
	int rep;
	long long clocks;
	double secs;
	double afreq;

	for (int p = 1; p <argc; p++) {
		if (_tcscmp(argv[p], _TEXT("-b")) == 0)  { benchstart = _tstoi(argv[++p]); benchend = benchstart; continue; }
		if (_tcscmp(argv[p], _TEXT("-bs")) == 0)  { benchstart = _tstoi(argv[++p]); benchend = benchstart; continue; }
		if (_tcscmp(argv[p], _TEXT("-be")) == 0)  { benchend = _tstoi(argv[++p]); continue; }
		if (_tcscmp(argv[p], _TEXT("-t")) == 0)   { minsecs = _tstoi(argv[++p]); continue; }

		if (_tcscmp(argv[p], _TEXT("-d")) == 0)   { dim = _tstoi(argv[++p]); continue; }
		if (_tcscmp(argv[p], _TEXT("-d0")) == 0)  { dim0 = _tstoi(argv[++p]); continue; }
		if (_tcscmp(argv[p], _TEXT("-skip")) == 0) skip = true;
		if (_tcscmp(argv[p], _TEXT("-v")) == 0) verbose = true;
	}

	print_compilerinfo();
	//#define showmatrices
	SetThreadAffinityMask(GetCurrentThread(),1);
#ifndef NOMSR 
	if (!InitDrv()) {
		printf(" no access to msr-driver, start with admin rights\n");
		return 1;
		EnableFixCtrl(); // fr Core 0 
	}
#endif;
	printf("Hochfahren\n");
#ifdef _DEBUG 
	skip = true;
#endif
	if (!skip) for (__int64 i = 0; i < 3000000000; i++) dummy++;
#ifdef _SSE3 
	printf("Option=SSE3\n");
#endif

	dim = DIM;
	dim0 = DIM0;

	printf("dim=%d, dim0=%d", dim, dim0);
	int size = dim*dim * 8;
	if (size < 1024) printf(" %d B ", size);
	else if (size < 1024 * 1024) printf(" %d KB ", size / 1024);
	else if (size < 1024 * 1024 * 1024) printf(" %d MB\n", size / (1024 * 1024));
	printf("\n");

	for (int benchnr = benchstart; benchnr <= benchend; benchnr++){
		ap = *a; // = (double*) &a 
		bp = *b; // = (double*) &b
		cp = *c; // = (double*) &c

		fill(ap, bp, cp, dim, dim0);
		rep = 1;

		if (!run(benchnr, rep, clocks, secs, afreq)) continue;

		double s = sumup(cp, dim, dim0);
		double d = (dim*(dim - 1)) / 2;
		double sx = (d*d*d*(2 * dim - 1)) / 3;
		double eps = (1 - s / sx);
		double mults = (double)dim*(double)dim*(double)dim;
		double adds = (double)(dim - 1)*(double)dim*(double)dim;
		bool ok = abs(eps) < (1e-10)*mults;
		if (verbose) printf("res=%f ref=%f diff=%f\n", s, sx, eps);
		if (ok) {
			if (secs < 0.1) run(benchnr, rep, clocks, secs, afreq);
			if (1000 * secs < minsecs) {
				rep = (int)(minsecs / (1000 * secs) + 0.5);
				if (rep == 0) rep = 1;
				run(benchnr, rep, clocks, secs, afreq);
			}
			if (1100 * secs < minsecs) {
				rep *= 2;
				run(benchnr, rep, clocks, secs, afreq);
			}
		}

		double ops = (mults + adds)*(double)rep;

		char* com;

		switch (benchnr) {               //Autovectorization Default Intel 14  

		case  mm_ijk:   com = "mm_ijk()      "; break; //PERMUTED LOOP WAS VECTORIZED.
		case  mm_jik:   com = "mm_jik()      "; break; //PERMUTED LOOP WAS VECTORIZED.
		case  mm_kij:   com = "mm_kij()      "; break; //LOOP WAS VECTORIZED.
		case  mm_ikj:   com = "mm_ikj()      "; break; //LOOP WAS VECTORIZED.
		case  mm_kji:   com = "mm_kji()      "; break; //PERMUTED LOOP WAS VECTORIZED
		case  mm_jki:   com = "mm_jki()      "; break; //PERMUTED LOOP WAS VECTORIZED.
		case  mmX_ijk:  com = "mmX_ijk()     "; break;
		case  mmX_jik:  com = "mmX_jik()     "; break;

		case  mmF_ijk:  com = "mmF_ijk()     "; break;
		case  mmF_jik:  com = "mmF_jik()     "; break;
		case  mmF_ikj:  com = "mmF_ikj()     "; break;
		case  mmF_kij:  com = "mmF_kij()     "; break;
		case  mmF_kji:  com = "mmF_kji()     "; break;
		case  mmF_jki:  com = "mmF_ikj()	 "; break; //ignores var dim, uses fixed DIM0 

		case  mmS_ijk:  com = "mmS_ijk(a..)  "; break;
		case  mmS_jik:  com = "mmS_jik(a..)  "; break;
		case  mmS_kij:  com = "mmS_kij(a..)  "; break;
		case  mmS_ikj:  com = "mmS_ikj(a..)  "; break;
		case  mmS_kji:  com = "mmS_kji(a..)  "; break;
		case  mmS_jki:  com = "mmS_jki(a..)  "; break;
		case  mmSX_ijk: com = "mmSX_ijk(a..) "; break;
		case  mmSX_jik: com = "mmSX_ijk(a..) "; break;


		case mmB_ikj:   com = "mmB_ikj(2)    "; break;
		case mmB_ikj + 1: com = "mmB_ikj(4)    "; break;
		case mmB_ikj + 2: com = "mmB_ikj(8)    "; break;
		case mmB_ikj + 3: com = "mmB_ikj(16)   "; break;
		case mmB_ikj + 4: com = "mmB_ikj(32)   "; break;

		case mmN_ijk:   com = "mmN_ijk(32)   "; break;
		case mmN_ikj:   com = "mmN_ikj(32)   "; break;
		case mmN2_ikj:  com = "mmN2_ikj(32)   "; break;

		case mmSOS_ikj: com = "mmSOS_ikj(a..)"; break;
		case mmSOD_ikj: com = "mmSOD_ikj(a..)"; break;

		case mmBOS_ikj: com = "mmBOS_ikj()   "; break;
		case mmBOD_ikj: com = "mmBOD_ikj()   "; break;

		case mmT_ijk:   com = "mmT_ijk()     "; break;
		case mmT_jik:   com = "mmT_jik()     "; break;
		case mmT_kij:   com = "mmT_kij()     "; break;
		case mmT_ikj:   com = "mmT_ikj()     "; break;
		case mmT_kji:   com = "mmT_kji()     "; break;
		case mmT_jki:   com = "mmT_jki()     "; break;

		case mmTX_ijk:  com = "mmTX_ijk()    "; break;
		case mmTX_jik:  com = "mmTX_jki()    "; break;


#ifdef MKL
		case mm_DGEMM:  com = "mm_DGEMM(..)  "; break;
#endif
		case mmP_ijk:   com = "mmP_ijk(ap..) "; break;
		case mmP_jik:   com = "mmP_jik(ap..) "; break;
		case mmP_kij:   com = "mmP_kij(ap..) "; break;
		case mmP_ikj:   com = "mmP_ikj(ap..) "; break;
		case mmP_kji:   com = "mmP_kji(ap..) "; break;
		case mmP_jki:   com = "mmP_jki(ap..) "; break;
		case mmP0_ijk:  com = "mmP0_ijk(ap..) "; break;
		case mmP0_jik:  com = "mmP0_jik(ap..) "; break;
		case mmP0_kij:  com = "mmP0_kij(ap..) "; break;
		case mmP0_ikj:  com = "mmP0_ikj(ap..) "; break;
		case mmP0_kji:  com = "mmP0_kji(ap..) "; break;
		case mmP0_jki:  com = "mmP0_jki(*..) "; break;
		case mmPR_ijk:  com = "mmPR_ijk(*..) "; break;
		case mmPR_jik:  com = "mmPR_jik(*..) "; break;
		case mmPR_kij:  com = "mmPR_kij(*..) "; break;
		case mmPR_ikj:  com = "mmPR_ikj(*..) "; break;
		case mmPR_kji:  com = "mmPR_kji(*..) "; break;
		case mmPR_jki:  com = "mmPR_jki(*..) "; break;
		case mmSP_ikj:  com = "mmSP_ikj(*..) "; break;
		case mmPP_ikj:  com = "mmPP_ikj(*..) "; break;

		default:        com = "not defined   ";
		}


		printf("%2d %s", benchnr, com);

		if (benchnr == mm_DGEMM) ok = true;
		if (ok) {
			double currfreq = (double)clocks / secs*afreq;
			printf("%7.3lf s %7d reps, %5.2f GF/s", secs, rep, ops / secs / 1e9);
			printf(" %6.3lf Fpc", (double)ops / ((double)clocks*afreq));
			printf(" %5.0lf MHz", currfreq / 1e6);
		}
		else printf(" ## not ok");
		printf("\n");
#ifdef showmatrices 
		printf("\n");
		for (int i = 0; i <dim; i++) {
			for (int j = 0; j <dim; j++) printf("%d ", (int)a[i][j]);
			printf("\n");
		}
		printf("\n");
		for (int i = 0; i <dim; i++) {
			for (int j = 0; j <dim; j++) printf("%d ", (int)c[i][j]);
			printf("\n");
		}
#endif



	}


	return 0;
}

