/* Expands to a call of matrixmul(); the identifiers ap, bp, cp, dim and dim0
 * must be in scope wherever RUNPROC is used. */
#define RUNPROC matrixmul(ap, bp, cp, dim, dim0)
// i-k-j with AVX intrinsics
/*
 * Accumulates the product of two row-major matrices into c, using the i-k-j
 * loop order and AVX/FMA intrinsics:
 *
 *     c[i][j] += a[i][k] * b[k][j]    for 0 <= i, k, j < dim
 *
 * Parameters:
 *   a, b  input matrices; element (r, s) is read at index r*dim0 + s
 *   c     output matrix with the same layout; updated in place
 *   dim   number of rows and columns processed
 *   dim0  distance between consecutive rows, in elements
 *
 * setNull() is defined elsewhere in this file.
 * NOTE(review): since this routine only accumulates into c, setNull()
 * presumably zeroes the output matrix first -- confirm against its definition.
 *
 * Alignment: the vector part uses _mm256_load_pd/_mm256_store_pd, which
 * require 32-byte aligned addresses. Every row start of b and c must therefore
 * be 32-byte aligned (aligned base pointers and dim0 a multiple of 4).
 *
 * Columns beyond the last full group of four are handled by a scalar tail
 * loop, so dim does not need to be a multiple of 4. The tail uses a plain
 * multiply-add, which may round differently from the fused vector operation
 * in the last bit.
 */
void matrixmul(const double* __restrict a, const double* __restrict b, double* __restrict c, int dim, int dim0) {
	setNull();

	/* Widen the stride so row offsets are not computed in (overflow-prone) int. */
	const long long stride = dim0;
	/* Number of columns covered by whole 4-wide vectors. */
	const int vec_end = dim - dim % 4;

	for (int i = 0; i < dim; i++) {
		const double *a_row = a + i * stride;
		double *c_row = c + i * stride;

		for (int k = 0; k < dim; k++) {
			const double *b_row = b + k * stride;
			const double a_ik = a_row[k];
			/* Replicate a[i][k] into all four lanes. */
			const __m256d mm_a = _mm256_broadcast_sd(&a_ik);

			int j = 0;
			for (; j < vec_end; j += 4) {
				__m256d mm_c = _mm256_load_pd(c_row + j);
				const __m256d mm_b = _mm256_load_pd(b_row + j);
				mm_c = _mm256_fmadd_pd(mm_a, mm_b, mm_c);
				_mm256_store_pd(c_row + j, mm_c);
			}

			/* Scalar tail for the remaining dim % 4 columns. */
			for (; j < dim; j++) {
				c_row[j] += a_ik * b_row[j];
			}
		}
	}
}
