Magic Move Generation

Tord Romstad · Post by **Tord Romstad** » Wed Feb 20, 2008 10:36 am

Gerd Isenberg wrote:I like snoob, same number of one bits, for magics with 4 to 8 or 56..60 bit sets.
http://chessprogramming.wikispaces.com/ ... 20a%20Set4
You may try random numbers as well or xor them with some constant, like a De Bruijn sequence or whatever else.

Just trying out random numbers with a low number of nonzero bits until you find a number which works is by far the fastest and easiest way to generate the magic numbers, in my experience. On my Core Duo 2.8 GHz, it takes less than a second to find magic numbers for rooks and bishops for all squares (and I have made no attempt to optimize the code, it should be easy to make it much faster).

Here is the source code:

Code: Select all

#include <stdio.h>
#include <stdlib.h>

#define USE_32_BIT_MULTIPLICATIONS

typedef unsigned long long uint64;

uint64 random_uint64() {
  uint64 u1, u2, u3, u4;
  u1 = (uint64)(random()) & 0xFFFF; u2 = (uint64)(random()) & 0xFFFF;
  u3 = (uint64)(random()) & 0xFFFF; u4 = (uint64)(random()) & 0xFFFF;
  return u1 | (u2 << 16) | (u3 << 32) | (u4 << 48);
}

uint64 random_uint64_fewbits() {
  return random_uint64() & random_uint64() & random_uint64();
}

int count_1s(uint64 b) {
  int r;
  for(r = 0; b; r++, b &= b - 1);
  return r;
}

const int BitTable[64] = {
  63, 30, 3, 32, 25, 41, 22, 33, 15, 50, 42, 13, 11, 53, 19, 34, 61, 29, 2, 
  51, 21, 43, 45, 10, 18, 47, 1, 54, 9, 57, 0, 35, 62, 31, 40, 4, 49, 5, 52, 
  26, 60, 6, 23, 44, 46, 27, 56, 16, 7, 39, 48, 24, 59, 14, 12, 55, 38, 28, 
  58, 20, 37, 17, 36, 8
};

int pop_1st_bit(uint64 *bb) {
  uint64 b = *bb ^ (*bb - 1);
  unsigned int fold = (unsigned) ((b & 0xffffffff) ^ (b >> 32));
  *bb &= (*bb - 1);
  return BitTable[(fold * 0x783a9b23) >> 26];
}

uint64 index_to_uint64(int index, int bits, uint64 m) {
  int i, j, k;
  uint64 result = 0ULL;
  for(i = 0; i < bits; i++) {
    j = pop_1st_bit(&m);
    if(index & (1 << i)) result |= (1ULL << j);
  }
  return result;
}

uint64 rmask(int sq) {
  uint64 result = 0ULL;
  int rk = sq/8, fl = sq%8, r, f;
  for(r = rk+1; r <= 6; r++) result |= (1ULL << (fl + r*8));
  for(r = rk-1; r >= 1; r--) result |= (1ULL << (fl + r*8));
  for(f = fl+1; f <= 6; f++) result |= (1ULL << (f + rk*8));
  for(f = fl-1; f >= 1; f--) result |= (1ULL << (f + rk*8));
  return result;
}

uint64 bmask(int sq) {
  uint64 result = 0ULL;
  int rk = sq/8, fl = sq%8, r, f;
  for(r=rk+1, f=fl+1; r<=6 && f<=6; r++, f++) result |= (1ULL << (f + r*8));
  for(r=rk+1, f=fl-1; r<=6 && f>=1; r++, f--) result |= (1ULL << (f + r*8));
  for(r=rk-1, f=fl+1; r>=1 && f<=6; r--, f++) result |= (1ULL << (f + r*8));
  for(r=rk-1, f=fl-1; r>=1 && f>=1; r--, f--) result |= (1ULL << (f + r*8));
  return result;
}

uint64 ratt(int sq, uint64 block) {
  uint64 result = 0ULL;
  int rk = sq/8, fl = sq%8, r, f;
  for(r = rk+1; r <= 7; r++) {
    result |= (1ULL << (fl + r*8));
    if(block & (1ULL << (fl + r*8))) break;
  }
  for(r = rk-1; r >= 0; r--) {
    result |= (1ULL << (fl + r*8));
    if(block & (1ULL << (fl + r*8))) break;
  }
  for(f = fl+1; f <= 7; f++) {
    result |= (1ULL << (f + rk*8));
    if(block & (1ULL << (f + rk*8))) break;
  }
  for(f = fl-1; f >= 0; f--) {
    result |= (1ULL << (f + rk*8));
    if(block & (1ULL << (f + rk*8))) break;
  }
  return result;
}

uint64 batt(int sq, uint64 block) {
  uint64 result = 0ULL;
  int rk = sq/8, fl = sq%8, r, f;
  for(r = rk+1, f = fl+1; r <= 7 && f <= 7; r++, f++) {
    result |= (1ULL << (f + r*8));
    if(block & (1ULL << (f + r * 8))) break;
  }
  for(r = rk+1, f = fl-1; r <= 7 && f >= 0; r++, f--) {
    result |= (1ULL << (f + r*8));
    if(block & (1ULL << (f + r * 8))) break;
  }
  for(r = rk-1, f = fl+1; r >= 0 && f <= 7; r--, f++) {
    result |= (1ULL << (f + r*8));
    if(block & (1ULL << (f + r * 8))) break;
  }
  for(r = rk-1, f = fl-1; r >= 0 && f >= 0; r--, f--) {
    result |= (1ULL << (f + r*8));
    if(block & (1ULL << (f + r * 8))) break;
  }
  return result;
}


int transform(uint64 b, uint64 magic, int bits) {
#if defined(USE_32_BIT_MULTIPLICATIONS)
  return
    (unsigned)((int)b*(int)magic ^ (int)(b>>32)*(int)(magic>>32)) >> (32-bits);
#else
  return (int)((b * magic) >> (64 - bits));
#endif
}

uint64 find_magic(int sq, int m, int bishop) {
  uint64 mask, b[4096], a[4096], used[4096], magic;
  int i, j, k, n, mbits, fail;

  mask = bishop? bmask(sq) : rmask(sq);
  n = count_1s(mask);

  for(i = 0; i < (1 << n); i++) {
    b[i] = index_to_uint64(i, n, mask);
    a[i] = bishop? batt(sq, b[i]) : ratt(sq, b[i]);
  }
  for(k = 0; k < 100000000; k++) {
    magic = random_uint64_fewbits();
    if(count_1s((mask * magic) & 0xFF00000000000000ULL) < 6) continue;
    for(i = 0; i < 4096; i++) used[i] = 0ULL;
    for(i = 0, fail = 0; !fail && i < (1 << n); i++) {
      j = transform(b[i], magic, m);
      if(used[j] == 0ULL) used[j] = a[i];
      else if(used[j] != a[i]) fail = 1;
    }
    if(!fail) return magic;
  }
  printf("***Failed***\n");
  return 0ULL;
}

int RBits[64] = {
  12, 11, 11, 11, 11, 11, 11, 12, 11, 10, 10, 10, 10, 10, 10, 11,
  11, 10, 10, 10, 10, 10, 10, 11, 11, 10, 10, 10, 10, 10, 10, 11,
  11, 10, 10, 10, 10, 10, 10, 11, 11, 10, 10, 10, 10, 10, 10, 11,
  11, 10, 10, 10, 10, 10, 10, 11, 12, 11, 11, 11, 11, 11, 11, 12
};

int BBits[64] = {
  6, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5,
  5, 5, 7, 7, 7, 7, 5, 5, 5, 5, 7, 9, 9, 7, 5, 5,
  5, 5, 7, 9, 9, 7, 5, 5, 5, 5, 7, 7, 7, 7, 5, 5,
  5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 6
};

int main() {
  int square;
  uint64 magic;

  printf("const uint64 RMagic[64] = {\n");
  for(square = 0; square < 64; square++)
    printf("  0x%llxULL,\n", find_magic(square, RBits[square], 0));
  printf("};\n\n");

  printf("const uint64 BMagic[64] = {\n");
  for(square = 0; square < 64; square++)
    printf("  0x%llxULL,\n", find_magic(square, BBits[square], 1));
  printf("};\n\n");

  return 0;
}

Tord

Gerd Isenberg · Post by **Gerd Isenberg** » Wed Feb 20, 2008 1:03 pm

Tord Romstad wrote:
Gerd Isenberg wrote:I like snoob, same number of one bits, for magics with 4 to 8 or 56..60 bit sets.
http://chessprogramming.wikispaces.com/ ... 20a%20Set4
You may try random numbers as well or xor them with some constant, like a De Bruijn sequence or whatever else.
Just trying out random numbers with a low number of nonzero bits until you find a number which works is by far the fastest and easiest way to generate the magic numbers, in my experience. On my Core Duo 2.8 GHz, it takes less than a second to find magic numbers for rooks and bishops for all squares (and I have made no attempt to optimize the code, it should be easy to make it much faster).

Tord

Yes, if you postulate known shifts, and don't try to reduce squares further, it is very fast. It even works under old msvc6 and it takes about 45 seconds for the rooks and one second for the bishops in 32 bit mode. Defining USE_32_BIT_MULTIPLICATIONS is a few seconds only (K8 2GHz).

Some modifications necessary for this antique compiler:

Code: Select all

#include <stdio.h>
#include <stdlib.h>

// #define USE_32_BIT_MULTIPLICATIONS

typedef unsigned __int64 U64;

const U64 one = 1; // replace 1ULL by one

U64 random_uint64() {
  U64 u1, u2, u3, u4;
  u1 = (U64)(rand()) & 0xFFFF; u2 = (U64)(rand()) & 0xFFFF;
  u3 = (U64)(rand()) & 0xFFFF; u4 = (U64)(rand()) & 0xFFFF;
  return u1 | (u2 << 16) | (u3 << 32) | (u4 << 48);
}
...

int main() {
  int square;

  printf("const U64 RMagic[64] = {\n");
  for(square = 0; square < 64; square++)
    printf("  0x%016I64x\n", find_magic(square, RBits[square], 0));
  printf("};\n\n");

  printf("const U64 BMagic[64] = {\n");
  for(square = 0; square < 64; square++)
    printf("  0x%016I64x\n", find_magic(square, BBits[square], 1));
  printf("};\n\n");

  return 0;
}

Gerd

Aleks Peshkov · Post by **Aleks Peshkov** » Wed Feb 20, 2008 3:02 pm

I thought that count of significant bits in a magic hash multiplication factor should be not less then a number of significant bits in a relevant bitboard mask. But I discovered that many known unoptimized magic multiplicators for bishops and rooks contains as few as 4 significant bits.

Can mathematicians tell why 4 bits multiplication factor is sufficient for creating up to 12 significant bits result?

Gerd Isenberg · Post by **Gerd Isenberg** » Wed Feb 20, 2008 3:42 pm

Aleks Peshkov wrote:I thought that count of significant bits in a magic hash multiplication factor should be not less then a number of significant bits in a relevant bitboard mask. But I discovered that many known unoptimized magic multiplicators for bishops and rooks contains as few as 4 significant bits.

Can mathematicians tell why 4 bits multiplication factor is sufficient for creating up to 12 significant bits result?

You have to consider that we mask off the outer squares. Thus for a rook on h1 one bit for the six rank-bits, and three further bit-factors to somehow shift and squeeze the six file-bit to the upper 12 seems the minimum.

Code: Select all

0x008080808080807E
 . . . . . . . .
 . . . . . . . 1
 . . . . . . . 1
 . . . . . . . 1
 . . . . . . . 1
 . . . . . . . 1
 . . . . . . . 1
 . 1 1 1 1 1 1 .

Aleks Peshkov · Post by **Aleks Peshkov** » Wed Feb 20, 2008 4:32 pm

Gerd Isenberg wrote:You have to consider that we mask off the outer squares. Thus for a rook on h1 one bit for the six rank-bits, and three further bit-factors to somehow shift and squeeze the six file-bit to the upper 12 seems the minimum.
Code: Select all
0x008080808080807E
 . . . . . . . .
 . . . . . . . 1
 . . . . . . . 1
 . . . . . . . 1
 . . . . . . . 1
 . . . . . . . 1
 . . . . . . . 1
 . 1 1 1 1 1 1 .

Thanks, picture makes me understand why 1 bit is sufficient for horizontal rook attacks. But diagonals and verticals still unclear.

My question is not idle. Each square attack table have a set significant attack bitboards separated by unused space. Used and unused bitboards seems to have a regular pattern, so it is probably possible to find a simple way to gather all separate squares hash tables in a one big hash table with less total size. At least bishops have a simple odd/even regularity, so it may be possible to interlace light/dark correspondent squares data together.

cms271828 · Post by **cms271828** » Thu Feb 21, 2008 4:15 am

I looked at the code, but the indexes for rooks are still all 10 to 12 bits.
So thats 1024 to 4096 entries per square needed.

With the rotated bitboards I use (Rooks), for each square I require an array of 6-bit entries (64 entries) to fetch the moves.

Of course, the rotated bitboards need to be used twice for each piece you're generating moves for, and you have to keep updating the total board occupancy for 3 rotated boards.

I haven't tried magic bitboards approach yet, but wouldn't the rotated bitboards work out faster since you're indexing smaller arrays?
Or is it too hard to tell which would be faster without testing both?

Thanks

wgarvin · Post by **wgarvin** » Thu Feb 21, 2008 4:08 pm

Aleks Peshkov wrote:My question is not idle. Each square attack table have a set significant attack bitboards separated by unused space. Used and unused bitboards seems to have a regular pattern, so it is probably possible to find a simple way to gather all separate squares hash tables in a one big hash table with less total size. At least bishops have a simple odd/even regularity, so it may be possible to interlace light/dark correspondent squares data together.

For some odd reason I thought you guys were already doing that. So the magic for each square goes to a separate table? Of which only some fraction of the entries contain something useful?

You might try a sparse-array-fitting approach. I.e. imagine you had a 2-dimensional array where most of the entries were zero (or something equally predictable). That sparse array could be compressed like this:

(1) slice the array into rows. Have a 1-dimensional array of pointers, with one pointer per row, that shows you where the "start" of the data for each row is.
(2) the data itself would store an additional value (a row ID) next to each used slot; this extra value says what row the slot is being used by.
(3) Now build up a 1-d array of the data by adding each row one by one in a greedy manner. All you have to do is find a starting offset such that none of the non-zero elements of the row are already being used by some element of a different row (when the data is sparse, this is pretty easy to do). Once you find the right spot, you set the pointer for that row to point to the start of its data, and go through and mark all the non-zero elements as belonging to this row and put the data in them.

For a sparse 2-d array, the resulting data structure can be a lot smaller than the original full-sized array. If the number of non-zero elements N is much lower than the total array size, the resulting data structure tends to be O(kN) in size for some smallish constant k.

When querying this thing, you would:
(1) get the pointer for the row you want to query.
(2) add the column offset to that pointer.
(3) check if the cell pointed to "belongs to that row". If it has the row ID of the row you are querying, then return the data value from the cell. If it has any other row ID, return zero.

Anyways, maybe a similar approach could be applied to the magic hash tables? Just greedily fit them together in an overlapping way, such that none of the cells is needed by the magic of more than one square. You don't even need a row ID, since the hash functions can only map to certain slots and you made sure up front that there were no collisions at those slots.

Aleks Peshkov · Post by **Aleks Peshkov** » Thu Feb 21, 2008 6:31 pm

wgarvin wrote:You might try a sparse-array-fitting approach. I.e. imagine you had a 2-dimensional array where most of the entries were zero (or something equally predictable). That sparse array could be compressed like this:...

The code of magic bitboard method is:

Code: Select all

return lookup[(occupied & mask) * magic >> shift];

The freedom only in choosing lookup and magic constants.

Harald · Post by **Harald** » Thu Feb 21, 2008 8:35 pm

cms271828 wrote:I haven't tried magic bitboards approach yet, but wouldn't the rotated bitboards work out faster since you're indexing smaller arrays?
Or is it too hard to tell which would be faster without testing both?

In a thread starting here
http://www.talkchess.com/forum/viewtopi ... s+compared
I tried to find some answers to this question.
http://www.talkchess.com/forum/viewtopi ... 11&t=16002

Harald

bob · Post by **bob** » Thu Feb 21, 2008 8:55 pm

Harald wrote:
cms271828 wrote:I haven't tried magic bitboards approach yet, but wouldn't the rotated bitboards work out faster since you're indexing smaller arrays?
Or is it too hard to tell which would be faster without testing both?
In a thread starting here
http://www.talkchess.com/forum/viewtopi ... s+compared
I tried to find some answers to this question.
http://www.talkchess.com/forum/viewtopi ... 11&t=16002

Harald

I found zero difference in the speed of the two, but magic bitboards eliminate the need for the rotated bitboards, which means you can generate attacks whenever you want, with whatever piece configuration you want, without having to update 3 other boards in addition to the primary one.

But if speed is all you are looking at, I found zero difference in speed when I removed all the rotated bitboard stuff (including rotated updates in make/unmake) and replaced it by magic generation.

Magic Move Generation

Re: Magic Move Generation

Re: Magic Move Generation

Re: Magic Move Generation

Re: Magic Move Generation

Re: Magic Move Generation

Re: Magic Move Generation

Re: Magic Move Generation

Re: Magic Move Generation

Re: Magic Move Generation

Re: Magic Move Generation