Update on Bobs lookup method
the non portable _bitscanforward64 made it much slower than the algo really is. I made the initializsation consteval and now it compatible with gcc/clang.
Is 2x than before which is really good because it only takes 8Kib of lookup and thus fits inside L1. Also its 2x faster than Kogge-Stone Maybe we can optimize it more?
Code: Select all
#include <cstdint>
#include <bit>
#include <array>
struct Rays {
uint64_t rayNW;
uint64_t rayNN;
uint64_t rayNE;
uint64_t rayEE;
uint64_t raySE;
uint64_t raySS;
uint64_t raySW;
uint64_t rayWW;
uint64_t rwsNW;
uint64_t rwsNN;
uint64_t rwsNE;
uint64_t rwsEE;
uint64_t rwsSE;
uint64_t rwsSS;
uint64_t rwsSW;
uint64_t rwsWW;
};
consteval std::array<Rays,64> Initialize() {
enum { FILE1, FILE2, FILE3, FILE4, FILE5, FILE6, FILE7, FILE8 };
enum { RANK1, RANK2, RANK3, RANK4, RANK5, RANK6, RANK7, RANK8 };
int sq, ts, file, rank, c;
std::array<Rays, 64> ray{};
for (sq = 0; sq <= 63; sq++) {
file = sq & 7;
rank = sq >> 3;
// Northwest
ray[sq].rayNW = 0;
for (c = 1, ts = sq + 7; file - c >= FILE1 && rank + c <= RANK8; c++, ts += 7) ray[sq].rayNW |= 1ull << ts;
ray[sq].rwsNW = ray[sq].rayNW | 0x8000000000000000;
// Northeast
ray[sq].rayNE = 0;
for (c = 1, ts = sq + 9; file + c <= FILE8 && rank + c <= RANK8; c++, ts += 9) ray[sq].rayNE |= 1ull << ts;
ray[sq].rwsNE = ray[sq].rayNE | 0x8000000000000000;
// Southeast
ray[sq].raySE = 0;
for (c = 1, ts = sq - 7; file + c <= FILE8 && rank - c >= RANK1; c++, ts -= 7) ray[sq].raySE |= 1ull << ts;
ray[sq].rwsSE = ray[sq].raySE | 0x0000000000000001;
// Southwest
ray[sq].raySW = 0;
for (c = 1, ts = sq - 9; file - c >= FILE1 && rank - c >= RANK1; c++, ts -= 9) ray[sq].raySW |= 1ull << ts;
ray[sq].rwsSW = ray[sq].raySW | 0x0000000000000001;
// North
ray[sq].rayNN = 0;
for (c = 1, ts = sq + 8; rank + c <= RANK8; c++, ts += 8) ray[sq].rayNN |= 1ull << ts;
ray[sq].rwsNN = ray[sq].rayNN | 0x8000000000000000;
// East
ray[sq].rayEE = 0;
for (c = 1, ts = sq + 1; file + c <= FILE8; c++, ts += 1) ray[sq].rayEE |= 1ull << ts;
ray[sq].rwsEE = ray[sq].rayEE | 0x8000000000000000;
// South
ray[sq].raySS = 0;
for (c = 1, ts = sq - 8; rank - c >= RANK1; c++, ts -= 8) ray[sq].raySS |= 1ull << ts;
ray[sq].rwsSS = ray[sq].raySS | 0x0000000000000001;
// West
ray[sq].rayWW = 0;
for (c = 1, ts = sq - 1; file - c >= FILE1; c++, ts -= 1) ray[sq].rayWW |= 1ull << ts;
ray[sq].rwsWW = ray[sq].rayWW | 0x0000000000000001;
}
return ray;
}
constexpr std::array<Rays, 64> ray = Initialize();
constexpr auto size = sizeof(ray);
uint64_t Queen(int sq, uint64_t occ) {
unsigned long iNW, iNN, iNE, iEE, iSE, iSS, iSW, iWW = 0;
uint64_t r1, r2, r3, r4, r5, r6;
occ |= 0x8000000000000001;
r1 = ray[sq].rwsNW & occ;
r2 = ray[sq].rwsNN & occ;
r3 = ray[sq].rwsNE & occ;
r4 = ray[sq].rwsEE & occ;
iNW = std::countr_zero(r1);
iNN = std::countr_zero(r2);
iNE = std::countr_zero(r3);
iEE = std::countr_zero(r4);
r1 = ray[sq].rayNW ^ ray[iNW].rayNW;
r2 = ray[sq].rayNN ^ ray[iNN].rayNN;
r3 = ray[sq].rayNE ^ ray[iNE].rayNE;
r4 = ray[sq].rayEE ^ ray[iEE].rayEE;
r5 = r1 | r2;
r6 = r3 | r4;
r1 = ray[sq].rwsSE & occ;
r2 = ray[sq].rwsSS & occ;
r3 = ray[sq].rwsSW & occ;
r4 = ray[sq].rwsWW & occ;
iSE = 63 - std::countl_zero(r1);
iSS = 63 - std::countl_zero(r2);
iSW = 63 - std::countl_zero(r3);
iWW = 63 - std::countl_zero(r4);
r1 = ray[sq].raySE ^ ray[iSE].raySE;
r2 = ray[sq].raySS ^ ray[iSS].raySS;
r3 = ray[sq].raySW ^ ray[iSW].raySW;
r4 = ray[sq].rayWW ^ ray[iWW].rayWW;
r1 = r1 | r3;
r2 = r2 | r4;
r3 = r5 | r6;
return r1 | r2 | r3;
}
Produces 244.82MOps performance on queen lookups:
Code: Select all
Megalookups/s:
Reference: 118.25MOps
KoggeStone: 112.38MOps
BobMike: 244.82MOps
FancyHash: 506.22MOps
Pext : 852.53MOps
HyperLookup: 926.42MOps
With the assembly here:
https://godbolt.org/z/qdbsYdnWY
Code: Select all
Queen(int, unsigned long): # @Queen(int, unsigned long)
push r14
push rbx
movsxd rax, edi
movabs rcx, -9223372036854775807
shl rax, 7
or rcx, rsi
mov rsi, qword ptr [rax + ray+72]
mov rdi, qword ptr [rax + ray+80]
mov r8, qword ptr [rax + ray+64]
mov rdx, qword ptr [rax + ray+88]
mov rbx, qword ptr [rax + ray+96]
and rsi, rcx
and rdi, rcx
and r8, rcx
and rdx, rcx
and rbx, rcx
tzcnt r9, rsi
tzcnt r10, rdi
mov rsi, qword ptr [rax + ray+104]
mov rdi, qword ptr [rax + ray+112]
tzcnt r11, rdx
lzcnt r14, rbx
mov edx, 63
mov ebx, 63
tzcnt r8, r8
sub rdx, r14
shl rdx, 7
shl r10, 7
shl r11, 7
shl r8, 7
shl r9, 7
vmovq xmm2, qword ptr [rdx + ray+32] # xmm2 = mem[0],zero
vmovq xmm3, qword ptr [r8 + ray] # xmm3 = mem[0],zero
and rsi, rcx
and rdi, rcx
and rcx, qword ptr [rax + ray+120]
lzcnt r14, rsi
mov esi, 63
sub rsi, r14
lzcnt r14, rdi
mov edi, 63
sub rdi, r14
shl rsi, 7
shl rdi, 7
lzcnt rcx, rcx
vmovq xmm1, qword ptr [rdi + ray+48] # xmm1 = mem[0],zero
sub rbx, rcx
shl rbx, 7
vmovq xmm0, qword ptr [rbx + ray+56] # xmm0 = mem[0],zero
vpunpcklqdq xmm0, xmm1, xmm0 # xmm0 = xmm1[0],xmm0[0]
vmovq xmm1, qword ptr [rsi + ray+40] # xmm1 = mem[0],zero
vpunpcklqdq xmm1, xmm2, xmm1 # xmm1 = xmm2[0],xmm1[0]
vmovq xmm2, qword ptr [r10 + ray+16] # xmm2 = mem[0],zero
vinserti128 ymm0, ymm1, xmm0, 1
vmovq xmm1, qword ptr [r11 + ray+24] # xmm1 = mem[0],zero
vpxor ymm0, ymm0, ymmword ptr [rax + ray+32]
vpunpcklqdq xmm1, xmm2, xmm1 # xmm1 = xmm2[0],xmm1[0]
vmovq xmm2, qword ptr [r9 + ray+8] # xmm2 = mem[0],zero
vpunpcklqdq xmm2, xmm3, xmm2 # xmm2 = xmm3[0],xmm2[0]
vinserti128 ymm1, ymm2, xmm1, 1
vpxor ymm1, ymm1, ymmword ptr [rax + ray]
vpor ymm0, ymm1, ymm0
vextracti128 xmm1, ymm0, 1
vpor xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 238 # xmm1 = xmm0[2,3,2,3]
vpor xmm0, xmm0, xmm1
vmovq rax, xmm0
pop rbx
pop r14
vzeroupper
ret