Some further (final?) remarks on those didactical 64*64-lookup replacement-routines to get 1. the kind of ray (if any) of two squares (0..63) and/or 2. the inbetween bitsets. Again they are intended to initialize the lookup arrays or to replace them in high memory traffic situations and/or to relax cache pollution. The routines are based on building boolean {-1,0}-masks of nibble-differences for ...
Code: Select all
fileDifference == 0 -> -1 if same file, else < 15
rankDifference == 0 -> -1 if same rank, else < 15
fileDifference ^ rankDifference == 0 -> -1 if same diagonal, else < 15
fileDifference + rankDifference == 0 -> -1 if same antidiagonal, else < 15.
... and to perform further ~nibble-0-ands to combine (add,xor,or) them to a final result.
There are four independent
and,dec,and chains - interesting to see what assembly the compiler produces - a kind of benchmark. VC2005 is sensible to minor source code changes and really acts like a high level assembler
This seems shortest for 32-bit mode - thanks to the one-byte dec-instruction, which is not available in 64-bit mode. To get the dense 5 (instead of 7) as index for same squares, the strange "+^|" -sequence is used, instead of three times "or".
Code: Select all
// return: 1 if same rank
// 2 if same file
// 3 if same diagonal
// 4 if same antidiagonal
// 5 if same square
// 0 otherwise
u32 onSameRay(u32 sq1, u32 sq2) {
u32 rankDiff, fileDiff,
antiDiff, diaxDiff, rayindex;
rankDiff = ((sq2 | 7) - sq1) >>3;
fileDiff = (sq2 & 7) - (sq1 & 7);
antiDiff = rankDiff + fileDiff;
// differences as signed nibbles
rankDiff = rankDiff & 15;
fileDiff = fileDiff & 15;
antiDiff = antiDiff & 15;
diaxDiff = rankDiff ^ fileDiff;
rayindex = antiDiff-1 & 4 * 16;
rayindex += diaxDiff-1 & 3 * 16;
rayindex ^= fileDiff-1 & 2 * 16;
rayindex |= rankDiff-1 & 1 * 16;
return rayindex >> 4;
}
Applying De Morgan results in slightly better scheduling as well as shorter code in 64-bit mode, despite the additional not-instruction ...
Code: Select all
// return: 1 if same rank
// 2 if same file
// 3 if same diagonal
// 4 if same antidiagonal
// 7 if same square
// 0 otherwise
u32 onSameRay(u32 sq1, u32 sq2) {
u32 rankDiff, fileDiff,
antiDiff, diaxDiff, rayindex;
rankDiff = ((sq2| 7) - sq1) >>3;
fileDiff = (sq2 & 7) -(sq1 & 7);
antiDiff = rankDiff + fileDiff;
rankDiff = rankDiff & 15;
fileDiff = fileDiff & 15;
antiDiff = antiDiff & 15;
diaxDiff = rankDiff ^ fileDiff;
rayindex = -antiDiff | ~64;
rayindex &= -diaxDiff | ~48;
rayindex &= -fileDiff | ~32;
rayindex &= -rankDiff | ~16;
return ~rayindex >> 4;
}
The onSameRay-routine might be used to lookup an 8-bitboard array - to calculate the final inbetween-bitboard by shift and mask. Two additional shifts, xors safe calculating min and max of the two squares ...
Code: Select all
u64 inBetween(u32 sq1, u32 sq2)
{
const u64 o = 1;
static const u64 rays[8] =
{
0x0000000000000000,
0x000000000000007e, // 1.rank, inner six
0x0001010101010100, // a-file, inner six
0x0040201008040200, // a1h8-diagonal, inner six
0x0000040810204080, // h1a8-antidiagonal, inner six, considering h1 == 7
0x0000000000000000,
0x0000000000000000,
0x0000000000000000,
};
u64 ray, btw;
ray = rays[onSameRay(sq1, sq2)];
ray = (ray<<sq1) ^ (ray<<sq2);
btw = ((o<<sq1)-o) ^ ((o<<sq2)-o);
return ray & btw;
}
how ((1<<sq1)-1) ^ ((1<<sq2)-1) works with b3,f7 to get all bits "between" them including b3:
Code: Select all
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
0 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
... but applying min/max seems to result in shorter and faster code (vc2005 needs to load four times cl prior to the variable shifts), since it also schedules with the independent code of the inlined onSameRay-routine:
Code: Select all
u64 inBetween(u32 sq1, u32 sq2)
{
const u64 o = 1;
static const u64 rays[8] =
{
0x0000000000000000,
0x000000000000007e, // 1.rank, inner six
0x0001010101010100, // a-file, inner six
0x0040201008040200, // a1h8-diagonal, inner six
0x0000040810204080, // h1a8-antidiagonal, inner six, considering h1 == 7
0x0000000000000000,
0x0000000000000000,
0x0000000000000000,
};
u64 ray, btw;
ray = rays[onSameRay(sq1, sq2)];
int delta = sq2 - sq1;
sq1 += delta & (delta >> 31); // min
sq2 -= delta & (delta >> 31); // max
btw = (o << sq2) - o;
return (ray << sq1) & btw;
}
Direct calculation with immediate 64-bit ray-masks - without any lookups - results in a few more code-bytes (the 10-byte opcodes for the immediate 64-bit loads). It seems better to calculate min/max in advance here. A dependency during startup safes one &15 nibblemask, since rankDiff is always positive. Trying to calculate min/max independently from the ray-Differences to gain more parallelism seems to confuse the optimizer of vc2005
Code: Select all
u64 inBetween(u32 sq1, u32 sq2)
{ // ensure sq2 >= sq1
int delta = sq2 - sq1;
sq1 += delta & (delta >> 31); // min
sq2 -= delta & (delta >> 31); // max
// differences as signed nibbles (4 bit)
// rankDiff always positive (0..7) sq2>=sq1
// fileDiff, antiDiff may negative and require & 15
u32 rankDiff, fileDiff, antiDiff, diaxDiff;
rankDiff =((sq2 | 7) - sq1) >>3,
fileDiff = (sq2 & 7) - (sq1 & 7);
antiDiff = (rankDiff + fileDiff);
fileDiff &= 15, antiDiff &= 15;
diaxDiff = (rankDiff ^ fileDiff);
// if difference == 0 -> -1-mask
// otherwise -> 0..14 -> 0-mask
const u64 o = 1, m1 = -1; u64 between;
between = 2 * ((rankDiff - 1) >> 26);
between |= (m1 + fileDiff) & 0x0001010101010100,
between |= (m1 + diaxDiff) & 0x8040201008040200,
between |= (m1 + antiDiff) & 0x0002040810204080;
between = (m1 + (o<<sq2)) & ( between << sq1 );
return between;
}
One may consider the vertical flip of the 7-bit diagonal/antidiagonal masks, using bswap aka _byteswap_uint64 instead of a third immediate load. The one square "longer" rays get masked off anyway...
Some assembly by vc2005:
32-bit mode - hmm, OOE may help a lot.
Code: Select all
?onSameRay@@YAIII@Z PROC
; _sq1$ = edx
; _sq2$ = eax
00000 8b c8 mov ecx, eax
00002 83 e0 07 and eax, 7
00005 83 c9 07 or ecx, 7
00008 2b ca sub ecx, edx
0000a 83 e2 07 and edx, 7
0000d 2b c2 sub eax, edx
0000f c1 e9 03 shr ecx, 3
00012 56 push esi
00013 8b f0 mov esi, eax
00015 8d 14 0e lea edx, DWORD PTR [esi+ecx]
00018 83 e6 0f and esi, 15
0001b 83 e1 0f and ecx, 15
0001e 8b c6 mov eax, esi
00020 33 c1 xor eax, ecx
00022 48 dec eax
00023 83 e0 30 and eax, 48
00026 83 e2 0f and edx, 15
00029 4a dec edx
0002a 83 e2 40 and edx, 64
0002d 03 c2 add eax, edx
0002f 4e dec esi
00030 83 e6 20 and esi, 32
00033 33 c6 xor eax, esi
00035 49 dec ecx
00036 83 e1 10 and ecx, 16
00039 0b c1 or eax, ecx
0003b c1 e8 04 shr eax, 4
0003e 5e pop esi
0003f c3 ret 0
64-bit mode: longer opcode due to rex-prefix using r8 and one byte increment opcodes are reserved for prefixes and not available - thus neg is one byte shorter opcode than it's one's complement counterpart sub 1.
Code: Select all
sq1$ = 8
sq2$ = 16
?onSameRay@@YAIII@Z PROC
00000 44 8b c2 mov r8d, edx
00003 83 e2 07 and edx, 7
00006 41 83 c8 07 or r8d, 7
0000a 44 2b c1 sub r8d, ecx
0000d 83 e1 07 and ecx, 7
00010 2b d1 sub edx, ecx
00012 41 c1 e8 03 shr r8d, 3
00016 42 8d 0c 02 lea ecx, DWORD PTR [rdx+r8]
0001a 83 e2 0f and edx, 15
0001d 41 83 e0 0f and r8d, 15
00021 8b c2 mov eax, edx
00023 83 e1 0f and ecx, 15
00026 f7 da neg edx
00028 41 33 c0 xor eax, r8d
0002b 83 ca 10 or edx, 16
0002e 41 f7 d8 neg r8d
00031 f7 d8 neg eax
00033 41 83 c8 ef or r8d, -17
00037 f7 d9 neg ecx
00039 23 c2 and eax, edx
0003b 83 c9 bf or ecx, -65
0003e 41 23 c0 and eax, r8d
00041 83 c8 cf or eax, -49
00044 23 c1 and eax, ecx
00046 f7 d0 not eax
00048 c1 e8 04 shr eax, 4
0004b c3 ret 0
Code: Select all
sq1$ = 8
sq2$ = 16
?inBetween@@YA_KII@Z PROC
00000 8b c1 mov eax, ecx
00002 44 8b c2 mov r8d, edx
00005 44 8b d1 mov r10d, ecx
00008 83 e0 07 and eax, 7
0000b 44 8b ca mov r9d, edx
0000e 44 8b da mov r11d, edx
00011 41 83 e0 07 and r8d, 7
00015 41 83 c9 07 or r9d, 7
00019 44 2b c0 sub r8d, eax
0001c 44 2b c9 sub r9d, ecx
0001f 8b c2 mov eax, edx
00021 41 2b c2 sub eax, r10d
00024 41 c1 e9 03 shr r9d, 3
00028 8b d0 mov edx, eax
0002a 43 8d 0c 08 lea ecx, DWORD PTR [r8+r9]
0002e 41 83 e0 0f and r8d, 15
00032 83 e1 0f and ecx, 15
00035 c1 fa 1f sar edx, 31
00038 41 83 e1 0f and r9d, 15
0003c 23 d0 and edx, eax
0003e 41 8b c0 mov eax, r8d
00041 f7 d9 neg ecx
00043 41 33 c1 xor eax, r9d
00046 83 c9 bf or ecx, -65
00049 41 f7 d8 neg r8d
0004c f7 d8 neg eax
0004e 41 83 c8 10 or r8d, 16
00052 41 f7 d9 neg r9d
00055 41 23 c0 and eax, r8d
00058 41 83 c9 ef or r9d, -17
0005c 4c 8d 05 00 00
00 00 lea r8, OFFSET FLAT:?rays@?1??inBetween
00063 41 23 c1 and eax, r9d
00066 83 c8 cf or eax, -49
00069 23 c1 and eax, ecx
0006b 42 8d 0c 12 lea ecx, DWORD PTR [rdx+r10]
0006f f7 d0 not eax
00071 48 c1 e8 04 shr rax, 4
00075 49 8b 04 c0 mov rax, QWORD PTR [r8+rax*8]
00079 48 d3 e0 shl rax, cl
0007c 41 8b cb mov ecx, r11d
0007f 2b ca sub ecx, edx
00081 ba 01 00 00 00 mov edx, 1
00086 48 d3 e2 shl rdx, cl
00089 48 83 ea 01 sub rdx, 1
0008d 48 23 c2 and rax, rdx
00090 c3 ret 0
Code: Select all
sq1$ = 8
sq2$ = 16
?inBetween@@YA_KII@Z PROC
00000 8b c2 mov eax, edx
00002 49 bb 80 40 20
10 08 04 02 00 mov r11, 0002040810204080H
0000c 2b c1 sub eax, ecx
0000e 44 8b c0 mov r8d, eax
00011 41 c1 f8 1f sar r8d, 31
00015 44 23 c0 and r8d, eax
00018 41 2b d0 sub edx, r8d
0001b 45 8d 14 08 lea r10d, DWORD PTR [r8+rcx]
0001f 8b ca mov ecx, edx
00021 44 8b c2 mov r8d, edx
00024 83 e1 07 and ecx, 7
00027 41 83 c8 07 or r8d, 7
0002b 41 8b c2 mov eax, r10d
0002e 83 e0 07 and eax, 7
00031 45 2b c2 sub r8d, r10d
00034 2b c8 sub ecx, eax
00036 41 c1 e8 03 shr r8d, 3
0003a 42 8d 04 01 lea eax, DWORD PTR [rcx+r8]
0003e 83 e1 0f and ecx, 15
00041 45 8b c8 mov r9d, r8d
00044 83 e0 0f and eax, 15
00047 4c 33 c9 xor r9, rcx
0004a 48 83 c1 ff add rcx, -1
0004e 48 83 e8 01 sub rax, 1
00052 49 83 e9 01 sub r9, 1
00056 49 23 c3 and rax, r11
00059 4d 23 cb and r9, r11
0005c 49 0f c9 bswap r9
0005f 4c 0b c8 or r9, rax
00062 41 8d 40 ff lea eax, DWORD PTR [r8-1]
00066 c1 e8 1a shr eax, 26
00069 03 c0 add eax, eax
0006b 4c 0b c8 or r9, rax
0006e 48 b8 00 01 01
01 01 01 01 00 mov rax, 0001010101010100H
00078 48 23 c1 and rax, rcx
0007b 41 8b ca mov ecx, r10d
0007e 49 0b c1 or rax, r9
00081 48 d3 e0 shl rax, cl
00084 8b ca mov ecx, edx
00086 ba 01 00 00 00 mov edx, 1
0008b 48 d3 e2 shl rdx, cl
0008e 48 83 ea 01 sub rdx, 1
00092 48 23 c2 and rax, rdx
00095 c3 ret 0
Cheers,
Gerd