/*
 * c't 2003/7 p 234
 *
 *
 * w9a ... w9 - values instead of pointers
 *
 * w9 ... 493 Gs (Athlon)
 *
 *
 * Please note: terms hibit/lobit are swapped
 */

/* statistics counters */
/* XXX the program runs faster with these enabled!
 * probably caused by poor code alignment from gcc 2.95.2
 */
 #define ADB

/* only 1 piece at the first level => 68735 solutions */
// #define TEST

#ifdef JUDY
#include "J/src/linux_ia32/product/deliver/usr/include/Judy.h"
#endif

/* ugly hack skip first bit */
#define P1 1

/* 3,4 or 6 bits for next part */
#define TYPS 3

/* record and check solution positions */
// #define POS

/* per piece AND_MASK */
 #define AND_MASK

/* global and mask */
 #define AND_LIST

/* simple test for 2x1 holes */
 #define LOCH_CHECK

/* simple test for pieces */
 #define ANY_LIST

/* do the remaining pieces fit, per set of used pieces */
 #define USED_LIST


#define BSF
// #define BSF_HI

// #define REV_BSF (3.985 vs 3.979 so no...

// #define SYM1
// #define SYM2
/* optimal? piece reduction for piece SYMi */
 #define SYM3

/* use 256 byte aligned pool */
// #define MEMALIGN

/* align vals/hibits in pool if (n) */
// #define POOL_ALIGN

#if !P1 && TYPS
#error "p1 typs"
#endif
#ifdef TEST
# ifndef ADB
#  define ADB
# endif
#endif


#include <assert.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#ifdef DEBUG2
# define DEBUG
#endif

#ifdef DEBUG
#ifndef POS
#define POS
#endif
#define debug(x) printf x
#define debug2(x)
#else
#define debug(x)
#define debug2(x)
#endif
#define DBP 0	/* debug_piece */

#define Ci 8
#define Ti 2
#define Vi 9
#define SYMi 1
char *parts[] = {
    " X  "
    "XXX "
    " X  ",
/* asymmetric part Nr SYMi */

    "XX  "
    "Xz  ",

/* long parts Ti .. Ti+2 */
    "X   "
    "XX  "
    "X   "
    "XX  ",

    " X  "
    " X  "
    " X  "
    "XX  ",

    "X   "
    "XX  "
    "X   "
    "X   ",

    "  X "
    " XX "
    "XX  ",

    "XXX "
    " X  "
    " X  ",

    " XX "
    " X  "
    "XX  ",

    "XX  "
    "X   "
    "XX  ",

    "z   "
    "XX  ",

    "X   "
    "XXX "
    " X  ",

    " X  "
    "XX  "
    "XX  ",

};

typedef union {
    unsigned long long v;
    struct {
	unsigned int lo;
	unsigned int hi;
    } u;
} val;
val C, T;
/*
 * part bits
 */
static unsigned long long pbits[12][24];
/* n cubes */
static int nc[12];
/* n rotated parts */
static int nrot[12];
/* n total rotated/shifted parts */
static int ntot[12];
/* count   i  rot  b   */
#define NROT 48*8
#define N_MSK (1<<TYPS)    // bits
struct {
    int n;
    val v[NROT];
} shifts[12][64*N_MSK];	/* s. mask below */
struct {
    int n;
    unsigned int v[NROT];
} lshifts[12][64*N_MSK];	/* s. mask below */
/* highest bit */
val *hibit[64*N_MSK][16];
unsigned int used;
int solutions, max;
static clock_t start, now, last;
static void *pool, *pptr;

#define NALL 500
static struct {
    int n;
    val v[NALL];
} lbits [60][16];
/* dimensions */
static int dim[12][3];
/* parts are in 4x4 flat space (X), z denoting
 * a cube additionally in that dir
 * convert to X(5) * Y(4) * Z(3) room bits
 */
static int part_to_bits(int i)
{
    int j, x, y, bits, xm, ym;
    dim[i][2] = 1;
    for (j = bits = xm = ym = 0; j < strlen(parts[i]); j++) {
	x = j % 4;
	y = j / 4;
	if (parts[i][j] == 'X') {
	    bits |= 1 << (x*12 + y * 3);
	    if (x > xm)
		xm = x;
	    if (y > ym)
		ym = y;
	    nc[i]++;
	}
	else if (parts[i][j] == 'z') {
	    bits |= 1 << (x *12+ y * 3);
	    bits |= 1 << (x *12+ y * 3 + 1);
	    if (x > xm)
		xm = x;
	    if (y > ym)
		ym = y;
	    dim[i][2] = 2;
	    nc[i] += 2;
	}
    }
    dim[i][0] = xm + 1;
    dim[i][1] = ym + 1;
    return bits;
}

static inline void
b2xyz(int b, int *x, int *y, int *z)
{
    *z = b % 3;
    *x = b / 12;
    *y = (b - *x*12) / 3;
}

#if 0
static int xyz2b(int x, int y, int z)
{
    return z + 3*y + 12*x;
}
#endif

#ifdef DEBUG2
static void
coors2xy(int i, int coors[6])
{
    int k, x, y, z;
    char res[4][20];
    for (y = 0; y < 4; y++)
        strcpy(res[y], "..... ..... .....");
    for (k = 0; k < nc[i]; k++) {
	x = (coors[k] & 0xf);
	y = ((coors[k] >> 4) & 0xf);
	z = ((coors[k] >> 8) & 0xf);
	res[y][x] = 'X';
	if (z)
	    res[y][x]= '0'+z;
	res[z][x+6] = 'X';
	res[z][y+12] = 'X';
    }
    for (y = 0; y < 4; y++)
	puts(res[y]);
    puts("");
}

static void v2xy(int i, unsigned long long v) {
    int coors[6];
    int x, y, z, b, c;
    for (b = c = 0; b < 3*4*5; b++, v >>= 1) {
	if (v & 1) {
	    z = b % 3;
	    x = b / 12;
	    y = (b - x*12) / 3;
	    coors[c++] = x + 16 * y + 256 * z;
#ifdef DEBUG3
	    debug(("p%d %d (%d,%d,%d)\n", i, c-1, x, y, z));
#endif
	}
    }
    coors2xy(i, coors);
}
static void pbits2xy(int i, int max) {
    int coors[6];
    unsigned long long v;
    int j, x, y, z, b, c;
    for (j = 0; j < ntot[i]; j++) {
	if (max >= 0 && max != j)
	    continue;
	v = pbits[i][j];
	printf(" = %16llx\n", v);
	for (b = c = 0; b < 3*4*5; b++, v >>= 1) {
	    if (v & 1) {
		z = b % 3;
		x = b / 12;
		y = (b - x*12) / 3;
		coors[c++] = x + 16 * y + 256 * z;
#ifdef DEBUG3
		if (i == DBP)
		    debug(("p%d %d (%d,%d,%d)\n", i, c-1, x, y, z));
#endif
	    }
	}
	printf("f %d, r%d\n", i, j);
	assert(c == nc[i]);
	coors2xy(i, coors);
    }
}
#endif

static void rotz(int *x, int *y, int *z)
{
    int nx, ny;

    ny = *x;
    nx = 10-*y;
    *x = nx;
    *y = ny;
}
static void roty(int *x, int *y, int *z)
{
    int nx, nz;

    nz = *x;
    nx = 10-*z;
    *x = nx;
    *z = nz;
}
static void rotx(int *x, int *y, int *z)
{
    int ny, nz;

    nz = *y;
    ny = 10-*z;
    *y = ny;
    *z = nz;
}

static void rotxyz(int c, int dir, int *dst, int *src)
{
    int k, x, y, z, mx, my, mz;
    void (*rotf[3])(int *, int *, int*) = {
	rotz, roty, rotx
    };
    for (k = 0; k < c; k++)
	dst[k] = 0;
    mx = my = mz = 10;
    for (k = 0; k < c; k++) {
	/* shift by 5/5 */
	x = 5 + (src[k] & 0xf);
	y = 5 + ((src[k] >> 4) & 0xf);
	z = 5 + ((src[k] >> 8) & 0xf);
	(rotf[dir])(&x, &y, &z);
	/* remember min coors */
	if (x < mx)
	    mx = x;
	if (y < my)
	    my = y;
	if (z < mz)
	    mz = z;
	dst[k] = x + 16 * y + 256 * z;
    }
    /* normalize to corner */
    for (k = 0; k < c; k++) {
	x = dst[k] & 0xf;
	y = (dst[k] >> 4) & 0xf;
	z = (dst[k] >> 8) & 0xf;
	x -= mx;
	y -= my;
	z -= mz;
	dst[k] = x + 16 * y + 256 * z;
    }
}
static int sortf(const void *a, const void *b)
{
    unsigned long long ba, bb;
    ba = *(unsigned long long*) a;
    bb = *(unsigned long long*) b;
    if (ba < bb)
	return -1;
    else if (ba > bb)
	return 1;
    return 0;
}
/* convert rotated coors to unique bitlist */
static void coors2bits(int i, int coors[64][6])
{
    int j, k, l, n, x, y, z, fit;
    unsigned long long bits[64];
    for (j = n = 0; j < 64; j++) {
	bits[n] = 0ULL;
	fit = 1;
#ifdef SYM1
	if (i == SYMi &&
		!(j == 0 || j == 1 || j == 4 || j== 5 || j == 16 || j == 17))
	    continue;
#endif
	for (k = 0; k < nc[i]; k++) {
	    x = (coors[j][k] & 0xf);
	    if (x >= 5) {
		fit = 0;
		break;
	    }
	    y = ((coors[j][k] >> 4) & 0xf);
	    if (y >= 4) {
		fit = 0;
		break;
	    }
	    z = ((coors[j][k] >> 8) & 0xf);
	    if (z >= 3) {
		fit = 0;
		break;
	    }
	    assert(x >= 0 && y >= 0 && z >= 0);
	    bits[n] |= 1LL << (z + y * 3 + x * 4 * 3);
	}
	if (fit) {
#ifdef DEBUG2
	    if (i == DBP) {
		debug(("f%d rot %d->%d %llx\n", i, j, n, bits[n]));
		coors2xy(i, coors[j]);
	    }
#endif
	    n++;
	}
    }
    qsort(bits, n, sizeof(long long), sortf);
    pbits[i][0] = bits[0];
#ifdef DEBUG2
    if (i == DBP)
	debug(("uniq f%d rot %d %llx\n", i, 0, bits[0]));
#endif
    for (j = k = 1, l = 0; j < n; j++) {
	if (bits[j] != bits[l]) {
	    pbits[i][k++] = bits[j];
	    l = j;
#ifdef DEBUG2
	    if (i == DBP) {
		debug(("uniq f%d rot %d %llx\n", i, j, bits[j]));
	    }
#endif
	}
    }
    nrot[i] = k;
    ntot[i] = nrot[i];
#ifdef DEBUG2
    debug(("f%d nrot %d\n", i, k));
    if (i == DBP)
	pbits2xy(i, -1);
#endif
}
#if 0
static inline unsigned long
ld(unsigned long x)
{
    unsigned long m; /* bit position of highest set bit of x */

    __asm__("bsrl %1,%0\n\t" : "=r"(m) : "g"(x));

    return m;
}


static unsigned long
ld64(unsigned long long v) {
    int b;
    if ((v>>32))
	b = 32 + ld(v >> 32);
    else
	b = ld(v & 0xffffffffULL);
    return b;
}
#endif

static inline unsigned long
bsf(unsigned long x)
{
    unsigned long m;  /* bit position of lowest set bit of x */

    __asm__("bsf %1,%0\n\t" : "=r"(m) : "g"(x));

    return m;
}

static unsigned long
bsf64(unsigned long long v) {
    int b;
    if (! (v & 0xffffffffULL))
	b = 32 + bsf(v >> 32);
    else
	b = bsf(v & 0xffffffffULL);
    return b;
}
/* return eith most lo bits
 * or on same lo bits that with most lo bits set
 */
static unsigned long long
bsf_cmp(unsigned long long r1,unsigned long long r2)
{
    int b;
    if (bsf64(r1) < bsf64(r2))
	return r2;
    if ((b =bsf64(r1)) > bsf64(r2))
	return r1;
    for ( ; b < 3*4*5; b++) {
	if (! (r2 & (1ULL << b)) && (r1 & (1ULL <<b)))
#ifdef REV_BSF
	    return r2;
#else
	    return r1;
#endif
	if (! (r1 & (1ULL << b)) && (r2 & (1ULL <<b)))
#ifdef REV_BSF
	    return r1;
#else
	    return r2;
#endif
    }
    assert("never");
    return 0;
}


#ifdef SYM3
static unsigned long long
rev(unsigned long long v)
{
    /* return mirror with most lo bits */
    int x, y, z, b;
    unsigned long long r1, r2, r3;
    r1 = r2 = r3 = 0ULL;
    for (b = 0; b < 3*4*5; b++, v >>= 1) {
	if (v & 1) {
	    b2xyz(b, &x, &y, &z);
	    /* yz */
	    r1 |= 1ULL << ( (2-z) + (3-y)*3 + x*12 );
	    /* xz */
	    r2 |= 1ULL << ( (2-z) + y*3 + (4-x)*12 );
	    /* xy */
	    r3 |= 1ULL << ( z + (3-y)*3 + (4-x)*12 );
	}
    }
    r1 = bsf_cmp(r1, r2);
    r1 = bsf_cmp(r1, r3);
    return r1;
}
#endif


static inline int get_typ(unsigned int v)
{
    int typ;
#if TYPS == 6
    typ = v & 0x27;    // 2y2z  bit 0..2, 5
    typ |= (v >> 8) & 0x8;       // x bit 3
    typ |= (v >> (8+11)) & 0x10;       // x bit 4
#elif TYPS == 3
    typ = v & 0x5;    // yz  bit 0,2
    typ |= (v>>10) & 0x2;       // x bit 1
#elif TYPS == 4
    typ = v & 0x7;    // yzz  bit 0..2
    typ |= (v>>8) & 0x8;       // x bit 3
#elif TYPS == 5
    typ = v & 0xf;    // yzz  bit 0..2 + dia
    typ |= (v>>7) & 0x10;       // x bit 4
#elif TYPS == 0
    typ = 0;
#else
#error "no typ"
#endif
    return typ;
}

/* shift all parts around in the 5*4*3 room */
static void
shift_bits(int i)
{
    int x,y,z, j,k,l,m, n, nn, sx, sy, sz;
    unsigned long long v;
    static double p = 1.0;
    int b;
    int mx, my, mz;

#ifdef SYM2
    if (i == SYMi)
	opt1();
#endif
    ntot[i] = 0;
    for (j = 0; j < nrot[i]; j++) {
	v = pbits[i][j];
	mx = my = mz = 0;
	for (b = 0; b < 3*4*5; b++, v >>= 1) {
	    if (v & 1) {
		b2xyz(b, &x, &y, &z);
		if (x > mx)
		    mx = x;
		if (y > my)
		    my = y;
		if (z > mz)
		    mz = z;
	    }
	}
	v = pbits[i][j];
	if (i == DBP) {
	    debug(("s%d rot %d %16llx\n",
                        i, j, v));
	}
	sx = sy = sz = 0;
#if defined(SYM2)
	if (i == SYMi) {
	    sx = 2;
	    sy = 0;
	    sz = 0;
	}
#endif
	for (k = sx, nn = 0; k < 5-mx; k++)
	    for (l = sy; l < 4-my; l++)
		for (m = sz; m < 3-mz; m++) {
                    int shift = m +l*3 + k*12;
                    unsigned long long r, v1, msk;
                    // unsigned long long col;
                    int o;

                    v1 = v << shift;
                    msk = v1;
#ifdef SYM3
                    if (i == SYMi) {
                        /* check if mirror has more lo bits unset */
                        r = rev(v1);
                        if (bsf_cmp(v1, r) == r) {
                            msk = r;
                        }
                    }
#endif
                    b = bsf64(msk);
                    n = lbits[b][i].n;
                    for (o = 0; o < n; o++)
                        if (lbits[b][i].v[o].v == msk)
                            goto done;
                    lbits[b][i].v[n].v = msk;
                    lbits[b][i].n++;
                    nn++;
done:
		}
	ntot[i] += nn;
    }
    p *= ntot[i];

    debug(("total f%d nrot %d ntot %d  p %.0g\n", i, nrot[i], ntot[i], p));
}

static void make_shifts()
{
    int i, b, n, j, typ, tt, bb, o;
    unsigned long long v1;
    int us[12][24], f;
    for (i = 0; i < 12; i++)
        for (j = 0; j < 24; j++)
            us[i][j] = 0;
    f = 1;
    for (b = 0; b < 60; b++)
        for (i = 0; i < 12; i++) {
            n = lbits[b][i].n;
            for (j = 0; j < n; j++) {
                v1 = lbits[b][i].v[j].v;
                bb = bsf64(v1);
                if (b != bb)
                    continue;
                bb = bsf64(v1) + P1;
                v1 >>= bb;
                typ = get_typ(v1);
                if (i == DBP)
                    debug2(("%16llx v1 bb %2d typ %2d\n", v1, bb, typ));
                //assert(typ);
                assert(typ < N_MSK);
                for (tt = N_MSK-1; tt >= 0;  tt--) {
                    int nt, bi;
                    if ((typ & tt) != typ)
                        continue;
                    if (i == DBP)
                        debug2(("\ttt %2d\n", tt));
                    bi = bb+ tt*64;
                    nt = shifts[i][bi].n;
                    assert(nt < NROT);
                    for (o = 0; o < nt; o++)
                        if (shifts[i][bi].v[o].v == v1)
                            goto done;
                    shifts[i][bi].v[nt].v = v1;
                    shifts[i][bi].n++;
done:
                }
            }
        }
}

static void
make_hibit()
{
    int i, j, b, n;
    int bs[12], s, nr, col;
    val val0;

    val0.v = 0ULL;
    for (i = 0; i < 12; i++)
        bs[i] = 0;
    col = 0;

    for (b = 0; b < 64*N_MSK; b++) {
        //int pos = b & 0x3f;
        debug(("%2d ", b));
        for (i = s = 0; i < 12; i++) {
            n = shifts[i][b].n;
#ifdef POOL_ALIGN
            while ( ((unsigned long) pptr) & 0x7)
                (char*) pptr += 1;
#endif
            hibit[b][i] = pptr;
#ifdef AND_MASK
            *((val*) pptr)++ = val0;	/* space for and_mask */
#endif
            nr = 0;
            for (j = 0; j < n; j++) {
                unsigned long long m = shifts[i][b].v[j].v;
                *((unsigned long long*) pptr)++ = m;
                nr++;
#ifdef DEBUG
                if (i == DBP)
                    debug2(("i%2d j%3d n %4d b%2d %16llx\n",
                                i, j, n, b, m));
#endif
            }
            debug(("%4d", nr));
            s += nr;
            bs[i] += nr;
            *((val*) pptr)++ = val0;
        } // for i
        debug((" %4d\n", s));
    }
    debug(("sum"));
    for (i = s = 0; i < 12; i++) {
        debug(("%4d", bs[i]));
        s += bs[i];
    }
    debug(("Total %d\n", s));

    debug(("pool used %d\n", pptr - pool));

}

unsigned int and_list[64*N_MSK];
unsigned int any_list[64*N_MSK];
unsigned int used_list[60][4096];

static void make_used() {
    int i, j, k;
    unsigned int and;
    val *vp;

    for (j = 64*(N_MSK-1); j < 64*N_MSK; j++) {
        for (k = 0; k < 4096; k++) {
            and = ~0U;
            for (i = 0; i < 12; i++) {
                if (! (k & (1 << i))) {
                    vp = hibit[j][i];
                    and &= vp->u.lo;
                }
            }
            if ((j & 0x3f) >= 60)
                break;
            used_list[j & 0x3f][k] = and | and_list[j];
            if (!any_list[j])
                used_list[j & 0x3f][k] = 0xffffffff;

        }
    }
}


static void check_hibit() {
    int i, j, n;
    unsigned long long and, or;
    unsigned int andi;
    val *vp;
    int maxpos = 0;
    int any;

    //make_Vi();
    for (j = 0; j < 64*N_MSK; j++) {
        any = 0;
	and = ~0ULL;
        or = 0;
	for (i = 0; i < 12; i++) {
	    andi = ~0U;
	    vp = hibit[j][i];
	    n = 0;
            // first is always and mask
#ifdef AND_MASK
            vp++;
#endif
	    for (; vp->u.lo ; vp++) {
                and &= vp->v;
                or |= vp->v;
                andi &= vp->u.lo;
                if (i == DBP)
                    debug2(("cbit %16llx v1 bb %2d typ %2d\n",
                                vp->v, j, get_typ(vp->v)));
                n++;
	    }
	    //assert(n == shifts[i][j].n);
            if (n)
                any++;
#ifdef AND_MASK
	    if (n && andi != 1-P1) {
		vp = hibit[j][i];	/* get start */
		debug(("\tp %2d bit %2d n=%2d and %8x\n", i, j, n, andi));
		/* insert and_mask */
		*((unsigned int*)vp) = andi;
	    }
            else if (!n) {
		vp = hibit[j][i];	/* get start */
		*((unsigned int*)vp) = 0xffffffffU;
            }
#endif
            if (n && j > maxpos)
                maxpos = j;
	}
        if (or) {
            debug2(("bit %3d or %16llx\n", j, or << j));
        }
	if (and != ~0ULL && and != 1-P1) {
            and_list[j] = and;
            debug(("bit %3d and %16llx pos %2d any %d\n",
                        j, and, j & 0x3f, any));
	}
        if (any) {
            debug2(("not any b%2d\n", j));
            any_list[j] = 1;
        }
    }
#ifdef AND_MASK
    make_used();
#endif
}



#ifdef POS
static val res[12];
#endif

#ifdef ADB
static long long l1, l2, l3;
static int lc[12];
#endif
int level;

static int mhz;
void
p_trys(void)
{
#if 1
    int i, nl = 0;
    static int j = 0, osol = 0;
    ++solutions;
    now = clock()/CLOCKS_PER_SEC;
    i = (now != start) ? solutions / (now - start) : 0;
    if (solutions % 10000 == 0) {
	nl = 1;
    }
    if (now != last) {
	j = (solutions-osol) / (now - last);
	osol = solutions;
	last = now;
    }
    if (j > max) {
	max = j;
	nl = 1;
    }
#ifdef ADB
#define D (1000*1000)
    fprintf(stderr,
            "%7d %4d/s%4d %5d%5d%5d%5d%5d%5d%5d%5d%5d%5d%5d%5d%c",
            solutions,
	    i, j,
            (int)(l1/D),(int)(l2/D),(int)(l3/D),lc[3]/D,lc[4]/D,lc[5]/D,
            lc[6]/D, lc[7]/D,lc[8]/D,lc[9]/D,lc[10]/D,lc[11]/D,
            nl ? '\n' : '\r');
#else
    fprintf(stderr, "%7d %4d/s (%4d)%c",
	    solutions, i, j, nl ? '\n' : '\r');
#endif
#ifdef POS
    {
	int k;
	val s;
        unsigned long long sym;
	s.v = 0ULL;
	debug(("\nsolution %d\n", solutions));
	for (k = 0; k < 12; k++) {
	    debug(("part %2d = %16llx\n", k, res[k].v));
	    s.v |= res[k].v;
	}
	debug(("      or| %16llx\n", s.v));
	assert((s.v & 0x0fffffffffffffffULL) == 0x0fffffffffffffffULL);
	fflush(stdout);
    }
#endif
#else
    ++solutions;
    fprintf(stderr, "\r%7d", solutions);
#endif
}


#ifdef LOCH_CHECK
/* any zz zy zx loch */
#ifdef ADB
#define loch_check(v)  \
    if (((v) | ~(1 << 1 | 1 << 3 | 1 << 12)) == 0xffffffffU) { \
	continue; \
    }
#else
#define loch_check(v)  \
    if (((v) | ~(1 << 1 | 1 << 3 | 1 << 12)) == 0xffffffffU) { \
	continue; \
    }

#endif
#else
#define loch_check(v)
#endif	// LOCH_CHECK




void * valid[4096];
void solve_l(unsigned int v, int pos)
{
    int i, npos;
    val *vp, **vpp;
    unsigned int v1;
#ifdef AND_MASK
    unsigned int and;
#endif
    int typ;
#ifdef JUDY
    int sol = solutions;
#define PP 32
    if ((pos & 0x3f) >= PP) {
        int *pi;
        JLG(pi, valid[used], v);
        if (pi) {
            solutions += *pi-1;
            p_trys();
            l3++;
            return;
        }
    }
#endif

#ifdef ADB
    ++level;
    ++l2;
    ++lc[level];
#endif
    debug(("L%02d %16x pos %2d typ %2d L\n",
                level, v, pos & 0x3f, pos >> 6));
    vpp = hibit[pos];
    for (i = 0; i < 12; ++i) {
        if (used & (1 << i))
            continue;
        vp = vpp[i];
#ifdef AND_MASK
        and = vp++ ->u.lo;
        if (v & and) {
#ifdef ADB
            ++l3;
#endif
            continue;
        }
#endif
        for (; vp->u.lo  ; ++vp) {
            if (! (v & vp->u.lo)) {
                v1 = v | vp->u.lo;
                debug(("|%2d %16x   %16llx => %16x\n", i, v, vp->v, v1));
#ifdef POS
#if P1
                res[i].v = ((vp->v) << 1 | 1ULL) << ((pos & 0x3f)-1);
#else
                res[i].v = vp->v << (pos & 0x3f);
#endif
#endif
                npos = bsf(~v1);
                v1 >>= npos;
                npos += (pos & 0x3f);
                if (npos == 60) {
                    p_trys();
                    continue;
                }
                loch_check(v1);
#if P1
                v1 >>= 1;
#endif
                typ = get_typ(~v1);
#if TYPS
                if (!typ)
                    continue;
#endif
                npos = npos + P1 + typ*64;
#ifdef ANY_LIST
                if (!any_list[npos]) {
#ifdef ADB
                    l3++;
#endif
                    continue;
                }
#endif
#ifdef AND_LIST
                if ((v1 & and_list[npos])) {
#ifdef ADB
                    l3++;
#endif
                    continue;
                }
#endif
                used  |= 1 << i;
#ifdef USED_LIST
                if ( (v1 & used_list[npos & 0x3f][used])) {
#ifdef ADB
                    l3++;
#endif
                }
                else
#endif
                    solve_l(v1, npos);
                used &= ~(1 << i);
            }
        }
    }
#ifdef ADB
    level--;
#endif
    debug(("\n"));
#ifdef JUDY
    if ((pos & 0x3f) >= PP && solutions != sol) {
        int *pi;
        JLI(pi, valid[used], v);
        *pi = solutions - sol;
    }
#endif
}
void solve1(int pos, val v)
{
    int i, npos;
    int typ;
    val v1, *vp;
#ifdef AND_MASK
    unsigned int and;
#endif


#ifdef ADB
    ++level;
    ++l1;
    ++lc[level];
#endif
    debug(("L%02d %16llx pos %2d typ %2d\n",
                level, v.v, pos & 0x3f, pos >> 6));
#ifdef TEST
    for (i = (level == 1) ? 2 :0; i < (level == 1 ? 3:12); ++i) {
#else
    for (i = 0; i < 12; ++i) {
#endif
        if (used & (1 << i))
            continue;
        vp = hibit[pos][i];
#ifdef AND_MASK
        and = vp++ ->u.lo;
        if (v.u.lo & and) {
#ifdef ADB
            ++l3;
#endif
            debug2(("v %8x and %8x pos %2d i %2d\n", v.u.lo, and, pos, i));
            continue;
        }
#endif
        for (; vp->u.lo ; ++vp) {
            if (! (v.u.lo & vp->u.lo) && !(v.u.hi & vp->u.hi)) {
                v1.v = v.v | vp->v;
                debug(("|%2d %16llx   %16llx => %16llx\n", i, v.v, vp->v, v1.v));
#ifdef POS
#if P1
                res[i].v = ((vp->v) << 1 | 1ULL) << ((pos & 0x3f)-1);
#else
                res[i].v = vp->v << (pos & 0x3f);
#endif
#endif
#ifdef BSF_HI
                npos = bsf(~v1.u.lo);
                v1.v >>= npos;  /* this is really and slow ugly in asm */
                npos += (pos & 0x3f);
#else
                for (npos = pos & 0x3f; ; ++npos, v1.v >>= 1) {
                    if (!(v1.u.lo & 1))
                        break;
                }
#endif
                loch_check(v1.u.lo);
#if P1
                v1.v >>= 1;
#endif
                typ = get_typ(~v1.u.lo);
#if TYPS
                if (!typ) {
                    debug(("no typ for %16llx (%8x)\n", v1, ~v1.u.lo));
                    continue;
                }
#endif
                npos = npos + P1 + typ*64;
#ifdef AND_LIST
                if ((v1.u.lo & and_list[npos])) {
#ifdef ADB
                    l3++;
#endif
                    continue;
                }
#endif
#ifdef ANY_LIST
                if (!any_list[npos]) {
#ifdef ADB
                    l3++;
#endif
                    continue;
                }
#endif
                used |= 1 << i;
#ifdef USED_LIST
                if ( (v1.u.lo & used_list[npos & 0x3f][used])) {
#ifdef ADB
                    l3++;
#endif
                }
                else
#endif
                {
                    if ((npos & 0x3f) < 28) 	/* 28..59 are 32 hibits */
                        solve1(npos, v1);
                    else
                        solve_l(v1.u.lo, npos);
                }
                used &= ~(1 << i);
            }
        }
    }
#ifdef ADB
    level--;
#endif
    debug(("\n"));
}


static void
solve(int takt)
{
    int s, q;
    val v;

    v.v = 0ULL;
    solutions = 0;
    last = start = clock()/CLOCKS_PER_SEC;
    if (takt)
	mhz = takt;
    fflush(stdout);
    if (0) {
	v.v &= (C.v << 40);	// FIXME
	//used[Ci] = 1;
	v.v &= (T.v << (40+(3*4)));	// FIXME
	//used[Ti] = 1;
    }
    solve1(P1+64*(N_MSK-1), v);
    for (q = 0, s = solutions; s ; s /= 10)
	q += s % 10;
    printf("\n%d solutions q=%d\n", solutions, q);
    printf("Features "
#ifdef SYM1
	    "sym1 "
#endif
#ifdef SYM2
	    "sym2 "
#endif
#ifdef SYM3
	    "sym3 "
#endif
#ifdef REV_BSF
	    "rev_bsf "
#endif
#ifdef LOCH_CHECK
	    "loch_check "
#endif
#ifdef AND_LIST
	    "and_list "
#endif
#ifdef AND_MASK
	    "and_mask "
#endif
#ifdef POS
	    "pos "
#endif
#ifdef ADB
	    "adb "
#endif
#ifdef BSF
	    "bsf "
#endif
#ifdef BSF_HI
	    "bsf_hi "
#endif
#ifdef ANY_LIST
	    "any_list "
#endif
#ifdef USED_LIST
	    "used_list "
#endif
#ifdef OPT_L
	    "opt_l "
#endif
       "typs_%d"

	    "\n", TYPS);
#ifdef ADB
    printf("l1 = %lld l2 = %lld l3 = %lld tot %lld\n", l1, l2, l3, l1+l2);
#endif
}
/*
 * make all possible permutations of parts
 * by turning and moving them in the 3*4*5 room
 *
 */
static void
make_parts_list(int n) {
    int i, j, k, l, b, x, y, z;
    int coors[64][6];
    int c;
    unsigned long long v;

    for (i = 0; i < 12; i++) {
	v = part_to_bits(i);
	if (i == Ti)
	    T.v = v;
	else if (i == Ci)
	    C.v = v;
	debug2(("pbits[%2d] %8llx %d (%d,%d,%d)\n", i, v, nc[i],
		    dim[i][0],dim[i][1],dim[i][2]));
	for (b = c = 0; b < 3*4*5; b++, v >>= 1) {
	    if (v & 1) {
	    z = b % 3;
	    x = b / 12;
	    y = (b - x*12) / 3;
		debug2(("f%2d (%d,%d,%d)\n", i, x,y,z));
		/* place it in a 16x16x16 cube */
		coors[0][c++] = x + 16 * y + 256 * z;
	    }
	}
	assert(c == nc[i]);
	/* rotate around Z in x-y plane towards left */
	for (j = 1; j < 4; j++)
	    rotxyz(c, 0, coors[j], coors[j-1]);
	/* rotate around Y in x-z plane */
	for (k = 1; k < 4; k++)
	    for (j = 0; j < 4; j++)
		rotxyz(c, 1, coors[k*4+j], coors[(k-1)*4+j]);
	/* rotate around X in y-z plane  - duplicates are eliminated
	 * later
	 */
	for (l = 1; l < 4; l++)
	    for (k = 0; k < 4; k++)
		for (j = 0; j < 4; j++)
		    rotxyz(c, 2, coors[l*16+k*4+j], coors[(l-1)*16+k*4+j]);
#ifdef DEBUG2
	if (i == DBP)
	    for (j = 0; j < 64; j++) {
		debug(("f%d rot %d\n", i, j));
		coors2xy(i, coors[j]);
	    }
#endif
	/* convert rotated to uniq and fitting bitlist */
	coors2bits(i, coors);
	/* shift all around in the 5*4*3 room */
	shift_bits(i);
	/* make a list of highest set bit */
    }
#define SIZE 2000000
#ifdef MEMALIGN
    pool = pptr = memalign(256, SIZE);
    memset(pool, 0, SIZE);
#else
    pool = pptr = calloc(1, SIZE);
#endif
    make_shifts();
    make_hibit();
    for (j = k = 0; j < 12; j++)
	k += ntot[j];
    debug(("Total %d pool used %d\n", k, pptr-pool));
    check_hibit();
}


int main(int argc, char *argv[])
{
    int n = sizeof(parts)/sizeof(parts[0]);
    assert(n == 12);
    make_parts_list(n);
    solve(argc > 1 ? atoi(argv[1]) : 0);
    return 0;
}

/*
 * vim: expandtab shiftwidth=4:
 */
