TalkChess.com

Posted: **Wed Apr 17, 2019 7:16 am**

I have wanted to do this for years but it was too complicated. About two weeks ago I found out that I had an iodine deficiency! Now what was too complicated before turns out to be extremely simple!!

My new almost bb engine does not use bbs to generate moves. It uses them for other things though. Each piece in the SOA has a bb stack. When a move is made the board is scanned to determine which pieces bb has been invalidated. The complete bb for pawns, knights and the kings. The rays for the sliders. I'll just explain the sliders. I set a bit for each invalidated ray. On needing moves for an invalidated ray that bb is pushed onto the piece bb stack. Then the ray is removed by a ray zero mask. The new ray generated by classic bb ray generation is ored in. Popcount is used on the new ray to get the number of moves. That number is saved in the first element of a ray move array. Example, unsigned char rayNE[64][8]. The rest of the elements contain actual to_squares in the direction of the array. For move generation a for_loop is used. For capture only generation only the last ts is checked. So move generation is done by precomputed move list and the bbs are used for evaluation, move from tt verification and other things. And no bb is generated until needed. It seems imho to be the best of all worlds.

Sample code.

Code: Select all

void InitBBGen() {
  s32 i, j, s;

  for (i = 0; i < 40; i++) { 
    t->top[i] = 0;
    t->inv[i] = 0;
    for (j = 0; j < 200; j++)
      t->bbStack[i][j] = 0;
  }

  for (s = 0; s <= H8; s++) {
    i = t->brd[s];
    j = t->typ[i];
    switch (j) {
    case NONE:
      break;
    case WP:
      t->inv[i] = 0b00000000;
      WPBBG(t,s,&t->bbStack[i][t->top[i]]);
      break;
    case WN:
      t->inv[i] = 0b00000000;
      WNBBG(t,s,&t->bbStack[i][t->top[i]]);
      break;
    case WB:
      t->inv[i] = 0b00001111;
      WSBBG(t,s,&t->inv[i], &t->bbStack[i][t->top[i]]);
      break;
    case WR:
      t->inv[i] = 0b11110000;
      WSBBG(t,s,&t->inv[i], &t->bbStack[i][t->top[i]]);
      break;
    case WQ:
      t->inv[i] = 0b11111111;
      WSBBG(t,s,&t->inv[i], &t->bbStack[i][t->top[i]]);
      break;
    case WK:
      t->inv[i] = 0b00000000;
      WKBBG(t,s,&t->bbStack[i][t->top[i]]);
      break;
    case WCKS: case WCQS: case WTER:
      break;
    case BP:
      t->inv[i] = 0b00000000;
      BPBBG(t,s,&t->bbStack[i][t->top[i]]);
      break;
    case BN:
      t->inv[i] = 0b00000000;
      BNMMG(t,s,&t->bbStack[i][t->top[i]]);
      break;
    case BB:
      t->inv[i] = 0b00001111;
      BSBBG(t,s,&t->inv[i], &t->bbStack[i][t->top[i]]);
      break;
    case BR:
      t->inv[i] = 0b11110000;
      BSMMG(t,s,&t->inv[i], &t->bbStack[i][t->top[i]]);
      break;
    case BQ:
      t->inv[i] = 0b11111111;
      BSBBG(t,s,&t->inv[i], &t->bbStack[i][t->top[i]]);
      break;
    case BK:
      t->inv[i] = 0b00000000;
      BKBBG(t,s,&t->bbStack[i][t->top[i]]);
      break;
    }
  }
}

// White Pawn bb Generator
void WPBBG(threadS *t, s32 fs, u64 *bb) {
  s32 row = fs >> 3;
  switch (row) {
  case 0:
    break;
  case 1:
    *bb = ((sqWPc[fs] & t->bPieces) | (sqWPm[fs] & below[FirstBit64(sqWPm[fs] & t->aPieces)]));
    break;
  case 2:
  case 3: 
    *bb = ((sqWPc[fs] & t->bPieces) | (sqWPm[fs] & t->nPieces));
    break;
  case 4:
    *bb = ((sqWPc[fs] & t->bPieces | t->esq) | (sqWPm[fs] & t->nPieces));
    break;
  case 5: case 6:
    *bb = ((sqWPc[fs] & t->bPieces) | (sqWPm[fs] & t->nPieces));
    break;
  }
}

// White Knight bb Generator
void WNBBG(threadS *t, s32 fs, u64 *bb) {
  *bb = (sqrKN[fs] & ~t->wPieces);
}

// White King bb Generator
void WKBBG(threadS *t, s32 fs, u64 *bb) {
  *bb = (sqrKG[fs] & ~t->wPieces);
}

// White Slider bb Generator
void WSBBG(threadS *t, s32 fs, u32 *inv, u64 *bb) {
  s32 i;
  do {
    i = FirstBit32(*inv);
    *inv ^= (1 << i);
    switch (i) {
    case NW:
      *bb &= andNW[fs];
      *bb |= (rayNW[fs] & belowInc[FirstBit64(rayNW[fs] & t->aPieces)]);
      break;
    case NE:
      *bb &= andNE[fs];
      *bb |= (rayNE[fs] & belowInc[FirstBit64(rayNE[fs] & t->aPieces)]);
      break;
    case SW:
      *bb &= andSW[fs];
      *bb |= (raySW[fs] & aboveInc[LastBit64(raySW[fs] & t->aPieces)]);
      break;
    case SE:
      *bb &= andSE[fs];
      *bb |= (raySE[fs] & aboveInc[LastBit64(raySE[fs] & t->aPieces)]);
      break;
    case NN:
      *bb &= andNN[fs];
      *bb |= (rayNN[fs] & belowInc[FirstBit64(rayNN[fs] & t->aPieces)]);
      break;
    case EE:
      *bb &= andEE[fs];
      *bb |= (rayEE[fs] & belowInc[FirstBit64(rayEE[fs] & t->aPieces)]);
      break;
    case SS:
      *bb &= andSS[fs];
      *bb |= (raySS[fs] & aboveInc[LastBit64(raySS[fs] & t->aPieces)]);
      break;
    case WW:
      *bb &= andWW[fs];
      *bb |= (rayWW[fs] & aboveInc[LastBit64(rayWW[fs] & t->aPieces)]);
      break;
    }
  } while (*inv);
  *bb &= ~t->wPieces;
}

Posted: **Wed Apr 17, 2019 4:43 pm**

My new engine is still in the very early stages. I am particularly happy with the MakeMove() routine. Except for one switch statement it is totally branchless, i.e. not a single if() statement.

Code: Select all

void MakeMove(threadS *t, moveU *m) {
  s32 id;
  u08 typ;

  t->ply++;

  t->wtm = 1 - t->wtm;

  t->fifty[ply] = t->fifty[ply - 1] + 1;

  id = t->brd[m->fs];

  switch (m->typ & 0b1111) {
  case WCAP:
    t->nxt[t->prv[m->id]] = t->nxt[m->id];
    t->prv[t->nxt[m->id]] = t->prv[id];
    t->bmt -= t->val[m->id];
    t->bPieces ^= one << m->ts;
    t->fifty[ply] = 0;
  case WMOV:
    t->brd[m->ts] = m->id;
    t->brd[m->fs] = NONE;
    t->sqr[id] = m->ts;
    t->wPieces ^= one << m->fs;
    t->wPieces |= one << m->ts;
    t->aPieces = t->wPieces | t->bPieces;
    t->nPieces = ~t->aPieces;
    return;
  case WCSH:
    t->brd[m->ts] = WKID;
    t->brd[m->fs] = NONE;
    t->sqr[WKID] = G1;
    t->wPieces ^= one << E1;
    t->wPieces |= one << G1;
    id = t->brd[H1];
    t->brd[F1] = id;
    t->brd[H1] = NONE;
    t->sqr[id] = F1;
    t->wPieces ^= one << H1;
    t->wPieces |= one << F1;
    t->aPieces = t->wPieces | t->bPieces;
    t->nPieces = ~t->aPieces;
    t->fifty[ply] = 0;
    return;
  case WCLG:
    t->brd[m->ts] = WKID;
    t->brd[m->fs] = NONE;
    t->sqr[WKID] = C1;
    t->wPieces ^= one << m->fs;
    t->wPieces |= one << m->ts;
    id = t->brd[A1];
    t->brd[D1] = id;
    t->brd[A1] = NONE;
    t->sqr[id] = D1;
    t->wPieces ^= one << A1;
    t->wPieces |= one << D1;
    t->aPieces = t->wPieces | t->bPieces;
    t->nPieces = ~t->aPieces;
    t->fifty[ply] = 0;
    return;
  case WPCE:
    t->nxt[t->prv[m->id]] = t->nxt[m->id];
    t->prv[t->nxt[m->id]] = t->prv[m->id];
    t->bmt -= t->val[m->id];
    t->bPieces ^= one << m->ts;
    t->brd[m->ts - 8] = NONE;
    t->brd[m->ts] = id;
    t->brd[m->fs] = NONE;
    t->sqr[id] = m->ts;
    t->wPieces ^= one << m->fs;
    t->wPieces |= one << m->ts;
    t->aPieces = t->wPieces | t->bPieces;
    t->nPieces = ~t->aPieces;
    t->fifty[ply] = 0;
    return;
  case WPPC:
    t->nxt[t->prv[m->id]] = t->nxt[m->id];
    t->prv[t->nxt[m->id]] = t->prv[m->id];
    t->bmt -= t->val[m->id];
    t->bPieces ^= one << m->ts;
  case WPPM:
    t->brd[m->ts] = id;
    t->brd[m->fs] = NONE;
    t->sqr[id] = m->ts;
    typ = m->typ >> 4;
    t->typ[id] = wTypM[typ];
    t->wmt += (valM[typ] - t->val[id]);
    t->val[id] = valM[typ];
    t->wPieces ^= one << m->fs;
    t->wPieces |= one << m->ts;
    t->aPieces = t->wPieces | t->bPieces;
    t->nPieces = ~t->aPieces;
    t->fifty[ply] = 0;
    return;
  case BCAP:
    t->nxt[t->prv[m->id]] = t->nxt[m->id];
    t->prv[t->nxt[m->id]] = t->prv[m->id];
    t->wmt -= t->val[m->id];
    t->wPieces ^= one << m->ts;
    t->fifty[ply] = 0;
  case BMOV:
    t->brd[m->ts] = id;
    t->brd[m->fs] = NONE;
    t->sqr[m->id] = m->ts;
    t->bPieces ^= one << m->fs;
    t->bPieces |= one << m->ts;
    t->aPieces = t->wPieces | t->bPieces;
    t->nPieces = ~t->aPieces;
    t->fifty[ply]++;
    return;
  case BCSH:
    t->brd[m->ts] = BKID;
    t->brd[m->fs] = NONE;
    t->sqr[BKID] = G8;
    t->wPieces ^= one << E8;
    t->wPieces |= one << G8;
    id = t->brd[H8];
    t->brd[F8] = id;
    t->brd[H8] = NONE;
    t->sqr[id] = F8;
    t->bPieces ^= one << H8;
    t->bPieces |= one << F8;
    t->aPieces = t->wPieces | t->bPieces;
    t->nPieces = ~t->aPieces;
    t->fifty[ply] = 0;
    return;
  case BCLG:
    t->brd[m->ts] = BKID;
    t->brd[m->fs] = NONE;
    t->sqr[BKID] = C8;
    t->wPieces ^= one << E8;
    t->wPieces |= one << C8;
    id = t->brd[A8];
    t->brd[D8] = id;
    t->brd[A8] = NONE;
    t->sqr[id] = D8;
    t->bPieces ^= one << A8;
    t->bPieces |= one << D8;
    t->aPieces = t->wPieces | t->bPieces;
    t->nPieces = ~t->aPieces;
    t->fifty[ply] = 0;
    return;
  case BPCE:
    t->nxt[t->prv[m->id]] = t->nxt[m->id];
    t->prv[t->nxt[m->id]] = t->prv[m->id];
    t->wmt -= t->val[m->id];
    t->wPieces ^= one << m->ts;
    t->brd[m->ts + 8] = NONE;
    t->brd[m->ts] = id;
    t->brd[m->fs] = NONE;
    t->sqr[id] = m->ts;
    t->bPieces ^= one << m->fs;
    t->bPieces |= one << m->ts;
    t->aPieces = t->wPieces | t->bPieces;
    t->nPieces = ~t->aPieces;
    t->fifty[ply] = 0;
    return;
  case BPPC:
    t->nxt[t->prv[m->id]] = t->nxt[m->id];
    t->prv[t->nxt[m->id]] = t->prv[m->id];
    t->wmt -= t->val[m->id];
    t->wPieces ^= one << m->ts;
  case BPPM:
    t->brd[m->ts] = id;
    t->brd[m->fs] = NONE;
    t->sqr[id] = m->ts;
    typ = m->typ >> 4;
    t->typ[id] = bTypM[typ];
    t->bmt += (valM[typ] - t->val[id]);
    t->val[id] = valM[typ];
    t->bPieces ^= one << m->fs;
    t->bPieces |= one << m->ts;
    t->aPieces = t->wPieces | t->bPieces;
    t->nPieces = ~t->aPieces;
    t->fifty[ply] = 0;
    return;
  }
  return;
}

Posted: **Wed Apr 17, 2019 5:51 pm**

In my old engine (the one I'm still playing with) I also have a switch statement in my domove() with 15 branchless cases (move types), but I find it difficult to maintain. Each time I change or add something I have to modify most of these cases. In a new version of the engine I'm working on I just have 3 routines with some branches: remove_piece(), move_piece() and add_piece(). I expected it to be slower, but in practice the difference in performance is negligible, and it is much easier to maintain.

Posted: **Wed Apr 17, 2019 6:11 pm**

I suppose that I go overboard looking for better performance. And after all is said and done there may not be much gained. And probably I make things harder on myself than need be. And there is something to say for smaller code when it comes to performance.

I am curious about one thing though. Why an odd number of case statements? Is there one common to both sides?

Posted: **Wed Apr 17, 2019 6:42 pm**

Michael Sherwin wrote: ↑Wed Apr 17, 2019 6:11 pm I suppose that I go overboard looking for better performance. And after all is said and done there may not be much gained. And probably I make things harder on myself than need be. And there is something to say for smaller code when it comes to performance.

I am curious about one thing though. Why an odd number of case statements? Is there one common to both sides?

Of course you are right, I also have a move type 'mtNone', out of consistency I included that in the switch, but it does nothing at all. I my case I have templates for White and Black so it doesn't have to be even at all, the switch is just for one side.

Back in 2004 when I programmed this move-generator/do-move it did with Perft() on average 25 mln. moves/sec. Nowadays it is about 6 times as fast, this is the hardware improvement in 15 years time.

It is somewhat position dependend too, in some positions I get 70 mln. moves/sec. and some others go over 200 mln. moves/sec. (on a 3.8 GHz. Intel Broadwell).

I have to add that I also look to get the best performance speed wise. I remember spending many weeks on just the do-move() to get another 25% in speed. Now I realize that it is not very important because it translates to maybe 1% in engine speed (~1Elo).

Posted: **Thu Apr 18, 2019 4:55 am**

I think that this is the final form of the slider BB generator. I will try to explain the time savings.

Code: Select all

// White Slider BB Generator
void WSBBG(threadS *t, s32 fs, u08 *inv, u64 *bb) {
  s32 i;
  u64 ray;

  do {
    i = FirstBit32(*inv);
    *inv ^= (1 << i);
    switch (i) {
    case NW:
      ray = (rayNW[fs] & belowInc[FirstBit64(rayNW[fs] & t->aPieces)]);
      *bb &= andNW[fs]; *bb |= ray; mvsNW[fs][0] = (u08)__popcnt64(ray);
      break;
    case NE:
      ray = (rayNE[fs] & belowInc[FirstBit64(rayNE[fs] & t->aPieces)]);
      *bb &= andNW[fs]; *bb |= ray; mvsNE[fs][0] = (u08)__popcnt64(ray);
      break;
    case SW:
      ray = (raySW[fs] & aboveInc[LastBit64(raySW[fs] & t->aPieces)]);
      *bb &= andNW[fs]; *bb |= ray; mvsSW[fs][0] = (u08)__popcnt64(ray);
      break;
    case SE:
      ray = (raySE[fs] & aboveInc[LastBit64(raySE[fs] & t->aPieces)]);
      *bb &= andSE[fs]; *bb |= ray; mvsSE[fs][0] = (u08)__popcnt64(ray);
      break;
    case NN:
      ray = (rayNN[fs] & belowInc[FirstBit64(rayNN[fs] & t->aPieces)]);
      *bb &= andSE[fs]; *bb |= ray; mvsNN[fs][0] = (u08)__popcnt64(ray);
      break;
    case EE:
      ray = (rayEE[fs] & belowInc[FirstBit64(rayEE[fs] & t->aPieces)]);
      *bb &= andSE[fs]; *bb |= ray; mvsEE[fs][0] = (u08)__popcnt64(ray);
      break;
    case SS:
      ray = (raySS[fs] & aboveInc[LastBit64(raySS[fs] & t->aPieces)]);
      *bb &= andSE[fs]; *bb |= ray; mvsSS[fs][0] = (u08)__popcnt64(ray);
      break;
    case WW:
      ray = (rayWW[fs] & aboveInc[LastBit64(rayWW[fs] & t->aPieces)]);
      *bb &= andSE[fs]; *bb |= ray; mvsWW[fs][0] = (u08)__popcnt64(ray);
      break;
    }
  } while (*inv);
  *bb &= ~t->wPieces;
}

The time savings over regular classic bitboards is for a couple of reasons. In classic bitboards the bit scan instruction in the worst case , a corner square, will need as many as 8 executions to gen 7 moves. And the bsf and bsr instructions are clock tick inefficient. In this method only one bitscan and one popcount is needed. Also rays at the edge of the board that go immediately off of the board are constantly expensive. So when a ray has not been invalidated by a move the edge detection is as simple as while ( mvsNW[fs][0] ) {}. And when mvsNW[fs][0] has a number of moves a move loop need not do any further edge detection. And when captures only are being generated the last ts needed to be looked at is mvsNW[fs][0] as it contains the index of the last valid ts along the ray.

And the Invalidate function is only needed for the moved piece and affected rays.

Code: Select all

void Invalidate(threadS *t, moveU *m) {
  s32 ts, id, sq, i;
  u32 top;

  // Invalidate the moved piece
  id = t->brd[m->fs];
  top = t->top[id];
  i = t->typ[id];
  t->inv[id][top] = inv[i];


  // Invalidate the rays affected
  id = t->brd[m->fs];
  top = t->top[id];

  for (i = 0, sq = m->fs; i < 2; sq = m->ts, i++) { 

    // Bishop || Queen on square
    if ((ts = FirstBit64(rayNW[sq] & t->aPieces)) != 64) { 
      id = t->brd[ts];
      if (t->fig[id] == B || t->fig[id] == Q) t->inv[id][top] |= 0b00001000;
    }

    if ((ts = FirstBit64(rayNE[sq] & t->aPieces)) != 64) {
      id = t->brd[ts];
      if (t->fig[id] == B || t->fig[id] == Q) t->inv[id][top] |= 0b00000100;
    }

    if ((ts = LastBit64(raySW[sq] & t->aPieces)) != 64) {
      id = t->brd[ts];
      if (t->fig[id] == B || t->fig[id] == Q) t->inv[id][top] |= 0b00000010;
    }

    if ((ts = LastBit64(raySE[sq] & t->aPieces)) != 64) {
      id = t->brd[ts];
      if (t->fig[id] == B || t->fig[id] == Q) t->inv[id][top] |= 0b00000001;
    }

    // Rook || Queen on square
    if ((ts = FirstBit64(rayNN[sq] & t->aPieces)) != 64) {
      id = t->brd[ts];
      if (t->fig[id] == R || t->fig[id] == Q) t->inv[id][top] |= 0b00100000;
    }

    if ((ts = FirstBit64(rayEE[sq] & t->aPieces)) != 64) {
      id = t->brd[ts];
      if (t->fig[id] == R || t->fig[id] == Q) t->inv[id][top] |= 0b10000000;
    }

    if ((ts = LastBit64(raySS[sq] & t->aPieces)) != 64) {
      id = t->brd[ts];
      if (t->fig[id] == R || t->fig[id] == Q) t->inv[id][top] |= 0b00000001;
    }

    if ((ts = LastBit64(rayWW[sq] & t->aPieces)) != 64) {
      id = t->brd[ts];
      if (t->fig[id] == R || t->fig[id] == Q) t->inv[id][top] |= 0b00000010;
    }
  }
}

Any opinions?

Posted: **Thu Apr 18, 2019 10:24 am**

Michael Sherwin wrote: ↑Thu Apr 18, 2019 4:55 am And the bsf and bsr instructions are clock tick inefficient.

Does this still hold on modern CPUs? Anyway, since you're hunting for cycles, one more low hanging fruit for you:
mark default case in your switches unreachable to eliminate bounds check, turning the switch into plain jump table,
making your move generator even more awesome

You can try this in godbolt.org:

Code: Select all

#include <iostream>
#include <stdlib.h>

volatile int x = rand();
volatile int y = rand();


#ifdef _MSC_VER
#	define UNREACHABLE __assume(0)
#else
#	define UNREACHABLE __builtin_unreachable()
#endif

int main()
{
    switch(x)
    {
        case 0:
            y=5;
            break;
        case 1:
            y++;
            break;
        case 2:
            y--;
            break;
        case 3:
            y <<= 1;
            break;
        case 4:
            y >>= 3;
            break;
        case 5:
            y += x;
            break;
        case 6:
            y -= x;
            break;
        case 7:
            y *= x;
            break;
        default:
            UNREACHABLE;
    }
    std::cout << "x=" << x << " y=" << y << std::endl;
	return 0;
}

Posted: **Thu Apr 18, 2019 4:28 pm**

mar wrote: ↑Thu Apr 18, 2019 10:24 am
Michael Sherwin wrote: ↑Thu Apr 18, 2019 4:55 am And the bsf and bsr instructions are clock tick inefficient.
Does this still hold on modern CPUs? Anyway, since you're hunting for cycles, one more low hanging fruit for you:
mark default case in your switches unreachable to eliminate bounds check, turning the switch into plain jump table,
making your move generator even more awesome

You can try this in godbolt.org:
Code: Select all
#include <iostream>
#include <stdlib.h>

volatile int x = rand();
volatile int y = rand();


#ifdef _MSC_VER
#	define UNREACHABLE __assume(0)
#else
#	define UNREACHABLE __builtin_unreachable()
#endif

int main()
{
    switch(x)
    {
        case 0:
            y=5;
            break;
        case 1:
            y++;
            break;
        case 2:
            y--;
            break;
        case 3:
            y <<= 1;
            break;
        case 4:
            y >>= 3;
            break;
        case 5:
            y += x;
            break;
        case 6:
            y -= x;
            break;
        case 7:
            y *= x;
            break;
        default:
            UNREACHABLE;
    }
    std::cout << "x=" << x << " y=" << y << std::endl;
	return 0;
}

That is awesome! I was wondering how to do that. Thanks! I assume bsf and bsr are better than they used to be. I looked for timing info the other day and did not find any. Anyway not everyone has a newer cpu.

Posted: **Thu Apr 18, 2019 8:30 pm**

I've been thinking what to call this move generation method. So before anyone else coins a name for it I have decided to call it the BITBOARD ON DEMAND INCREMENTAL VECTORIZATION MOVE GENERATOR.

Posted: **Thu Apr 18, 2019 11:10 pm**

Hey Michael,

This looks interesting. How does it perform in perf tests?

BTW: if you'd like to write it up I'd be happy to publish it as a blog article on chessprogramming.net

Steve

TalkChess.com

My newest almost bb move generator is wonderful

My newest almost bb move generator is wonderful

Re: My newest almost bb move generator is wonderful

Re: My newest almost bb move generator is wonderful

Re: My newest almost bb move generator is wonderful

Re: My newest almost bb move generator is wonderful

Re: My newest almost bb move generator is wonderful

Re: My newest almost bb move generator is wonderful

Re: My newest almost bb move generator is wonderful

Re: My newest almost bb move generator is wonderful

Re: My newest almost bb move generator is wonderful