Even better would be to drop the assembly language and replace it with C code. The following code will probably execute faster than your assembly code because the compiler will be better able to optimize around all the calls.
Code: Select all
/* lsz64_tbl: lookup table used by FirstPieceAndClear().
 *
 * Indexed by the hash that FirstPieceAndClear() derives from the bitboard;
 * the entry stored at that index is the value the function returns.
 * The 64 entries are the numbers 0 to 63, each appearing exactly once.
 *
 * source: Matt Taylor
 */
static const int lsz64_tbl[64] = {
    /*  0.. 7 */  0, 31,  4, 33, 60, 15, 12, 34,
    /*  8..15 */ 61, 25, 51, 10, 56, 20, 22, 35,
    /* 16..23 */ 62, 30,  3, 54, 52, 24, 42, 19,
    /* 24..31 */ 57, 29,  2, 44, 47, 28,  1, 36,
    /* 32..39 */ 63, 32, 59,  5,  6, 50, 55,  7,
    /* 40..47 */ 16, 53, 13, 41,  8, 43, 46, 17,
    /* 48..55 */ 26, 58, 49, 14, 11, 40,  9, 45,
    /* 56..63 */ 21, 48, 39, 23, 18, 38, 37, 27
};
/* FORCEINLINE: inlining specifier written in front of a function definition.
 * Expands to '__forceinline' when _MSC_VER is defined and to '__inline'
 * for every other compiler.
 */
#if !defined(_MSC_VER)
#  define FORCEINLINE __inline
#else
#  define FORCEINLINE __forceinline
#endif
//______________________________________________________________________________
/* FirstPieceAndClear():
*
* Return the square number (0 to 63) of the least significant set bit
* in bitboard '*bb' and clear that bit from '*bb'.
*
* bb: pointer to the bitboard to scan; it is modified in place.
*
* Empty bitboard: assuming BITBOARD is an unsigned 64-bit type, a '*bb' of 0
* yields a return value of 0 and leaves '*bb' at 0. That result cannot be
* told apart from "bit 0 was set", so callers must test for an empty
* bitboard before calling.
*
* The mask of bits below the lowest set bit is built as '~x & (x - 1)'.
* It is the same mask as '(x & -x) - 1', but it needs no negation: negating
* the value as a signed 64-bit integer overflows (undefined behaviour) when
* only the top bit is set.
*
* NOTE(review): the function is not declared 'static'; if it lives in a
* header, or is built with C99 inline semantics, confirm the linkage is
* what you want.
*
* source: Matt Taylor's "de Bruijn method" implementation
*/
//______________________________________________________________________________
FORCEINLINE int FirstPieceAndClear(BITBOARD *bb)
{
    const BITBOARD board = *bb;
    const BITBOARD boardMinusOne = board - 1;
    /* ones in every position below the lowest set bit of 'board' */
    const BITBOARD belowLSB = ~board & boardMinusOne;
    /* fold the upper half onto the lower half so the hash works on a u32 */
    const u32 folded = ((u32) belowLSB) ^ ((u32) (belowLSB >> 32));

    *bb = board & boardMinusOne; // clear least significant set bit from bb
    return lsz64_tbl[folded * 0x78291ACFu >> 26];
}
If your square numbering (the mapping from bits to squares) is different, you'll have to rearrange the entries of the lsz64_tbl array.