tcusr wrote: ↑Sun Dec 12, 2021 10:45 pm
Update: By combining the elements of all 4 rays into a single lookup table and placing them right next to each other,
I gained a few % of speed. (Because the elements are now adjacent in memory during the lookup.)
347.67MOps -> 364.83MOps
Code: Select all
Megalookups/s:
Exploading: 116.98MOps
Reference: 123.29MOps
KoggeStone: 177.18MOps
RotatedBoard: 158.44MOps
BobMike: 340.02MOps
SlideArithm: 364.83MOps
XorRookSub: 505.78MOps
FancyHash: 656.95MOps
Pext : 926.80MOps
HyperLookup: 1552.29MOps
Assembly comparison:
Before:
Code: Select all
# Before: scalar clang output (Intel syntax).  Each of the four inlined
# slide_arithmetic calls uses blsmsk for blocked_up = x ^ (x - 1) and
# bzhi / or 1 / lzcnt / shrx for blocked_down = 0x7FFF... >> clz.  The four
# line masks are loaded from four separate 64-entry tables.
Chess_Lookup::SlideArithm::Queen(int, unsigned long): # @Chess_Lookup::SlideArithm::Queen(int, unsigned long)
# setup: r8 = square s, r9 = 0x7FFFFFFFFFFFFFFF, r10 = mask of bits above s
movsxd r8, edi
mov ecx, 1
movabs r9, 9223372036854775807
mov rdi, qword ptr [8*r8 + Chess_Lookup::SlideArithm::rank_mask]
shlx r10, rcx, r8
mov r11, qword ptr [8*r8 + Chess_Lookup::SlideArithm::file_mask]
neg r10
btc r10, r8 # r10 = -(1 << s) with bit s cleared, i.e. ~bottom & ~piece
mov rcx, r10
mov rax, r10
# rank ray -> rdx
mov rdx, rdi
and rdx, rsi # blockers on the rank line
and rcx, rdx # masked_up
bzhi rdx, rdx, r8 # masked_down = blockers below s
or rdx, 1 # guard so lzcnt never sees 0
blsmsk rcx, rcx # blocked_up = x ^ (x - 1)
lzcnt rdx, rdx
shrx rdx, r9, rdx # blocked_down
xor rdx, rcx
and rdx, rdi # rank attack set
# file ray -> rdi
mov rdi, r11
and rdi, rsi
and rax, rdi
bzhi rdi, rdi, r8
or rdi, 1
blsmsk rax, rax
lzcnt rdi, rdi
shrx rdi, r9, rdi
xor rdi, rax
mov rax, qword ptr [8*r8 + Chess_Lookup::SlideArithm::a1h8_mask]
and rdi, r11
mov r11, qword ptr [8*r8 + Chess_Lookup::SlideArithm::h1a8_mask]
xor rdi, rdx # rdi = rank ^ file attacks
# a1h8 diagonal (-> rcx) and h1a8 anti-diagonal (-> rax), interleaved by the scheduler
mov rdx, r10
mov rcx, rax
and rcx, rsi
and rdx, rcx
bzhi rcx, rcx, r8
and rsi, r11
or rcx, 1
blsmsk rdx, rdx
and r10, rsi
lzcnt rcx, rcx
blsmsk r10, r10
shrx rcx, r9, rcx
xor rcx, rdx
and rcx, rax
bzhi rax, rsi, r8
or rax, 1
lzcnt rax, rax
shrx rax, r9, rax
xor rax, r10
and rax, r11
# combine all four ray attack sets
xor rax, rcx
xor rax, rdi
ret
After:
Code: Select all
# After: all four line masks live consecutively in one 256-entry table, so a
# single 32-byte vmovdqu (index = s * 32) fetches them together and clang
# evaluates all four rays in parallel in ymm registers.
Chess_Lookup::SlideArithm::Queen(unsigned long, unsigned long): # @Chess_Lookup::SlideArithm::Queen(unsigned long, unsigned long)
# setup: rax = bits above s (~bottom & ~piece), rcx = bits below s; broadcast occ
mov eax, 1
vmovq xmm1, rsi
vpcmpeqd ymm3, ymm3, ymm3 # all-ones (-1 in every lane)
vpxor xmm6, xmm6, xmm6 # zero, reused by the ==0 compares below
shlx rax, rax, rdi
vpbroadcastq ymm1, xmm1
lea rcx, [rax - 1] # bottom = (1 << s) - 1
neg rax
btc rax, rdi # rax = ~bottom & ~piece
shl rdi, 5 # 4 masks * 8 bytes per square
vmovdqu ymm0, ymmword ptr [rdi + Chess_Lookup::SlideArithm::rank_mask] # one load, all 4 line masks
vmovq xmm2, rax
vpbroadcastq ymm2, xmm2
vpand ymm1, ymm0, ymm1 # block = occ & line, per ray
vpand ymm2, ymm2, ymm1 # masked_up
vpaddq ymm3, ymm2, ymm3 # masked_up + (-1) = masked_up - 1
vpxor ymm2, ymm3, ymm2 # blocked_up = x ^ (x - 1)
vmovq xmm3, rcx
vpbroadcastq ymm3, xmm3
vpand ymm1, ymm1, ymm3 # masked_down
vpbroadcastq ymm3, qword ptr [rip + .LCPI0_0] # ymm3 = [1,1,1,1]
vpor ymm1, ymm1, ymm3 # | 1 guard for countl_zero
# vectorized countl_zero (no vplzcntq on AVX2): per-nibble clz LUT via
# vpshufb, partial counts merged upward bytes -> words -> dwords -> qwords
vmovdqa ymm3, ymmword ptr [rip + .LCPI0_1] # ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
vpsrlw ymm5, ymm1, 4
vpand ymm5, ymm5, ymmword ptr [rip + .LCPI0_2]
vpshufb ymm4, ymm3, ymm1
vpcmpeqb ymm7, ymm5, ymm6
vpshufb ymm3, ymm3, ymm5
vpand ymm4, ymm4, ymm7
vpaddb ymm3, ymm4, ymm3
vpcmpeqb ymm4, ymm1, ymm6
vpsrlw ymm4, ymm4, 8
vpand ymm4, ymm3, ymm4
vpsrlw ymm3, ymm3, 8
vpaddw ymm3, ymm3, ymm4
vpcmpeqw ymm4, ymm1, ymm6
vpcmpeqd ymm1, ymm1, ymm6
vpsrld ymm4, ymm4, 16
vpsrlq ymm1, ymm1, 32
vpand ymm4, ymm3, ymm4
vpsrld ymm3, ymm3, 16
vpaddd ymm3, ymm3, ymm4
vpbroadcastq ymm4, qword ptr [rip + .LCPI0_3] # ymm4 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
vpand ymm1, ymm3, ymm1
vpsrlq ymm3, ymm3, 32
vpaddq ymm1, ymm3, ymm1 # ymm1 = countl_zero(masked_down | 1), per ray
vpsrlvq ymm1, ymm4, ymm1 # blocked_down = 0x7FFF... >> clz
vpxor ymm1, ymm2, ymm1 # blocked_up ^ blocked_down
vpand ymm0, ymm1, ymm0 # clip each ray to its line mask
# horizontal XOR of the four 64-bit lanes -> final attack set in rax
vextracti128 xmm1, ymm0, 1
vpxor xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 238 # xmm1 = xmm0[2,3,2,3]
vpxor xmm0, xmm0, xmm1
vmovq rax, xmm0
vzeroupper
ret
Note that because the elements are packed in groups that are a multiple of 4 you suddenly get AVX2 instructions
Also the code shrank by 15 lines.
Code: Select all
#include <array>
#include <cstdint>
#include <bit>
namespace Chess_Lookup::SlideArithm
{
/* Init */
// True iff (f, r) is a valid file/rank coordinate on an 8x8 board.
constexpr bool safe_coord(int f, int r)
{
    return f >= 0 && f <= 7 && r >= 0 && r <= 7;
}
// Bitboard of the squares reached from square s by repeatedly stepping
// (df, dr) until the board edge; square s itself is excluded.
constexpr uint64_t init_mask(int s, int df, int dr)
{
    uint64_t ray = 0;
    for (int f = (s & 7) + df, r = (s >> 3) + dr; safe_coord(f, r); f += df, r += dr)
        ray |= 1ull << (r * 8 + f);
    return ray;
}
// Four full line masks per square, stored consecutively so one lookup pulls
// all of them in: horizontal, vertical, a1-h8 diagonal, h1-a8 anti-diagonal.
constexpr std::array<uint64_t, 256> init_array()
{
    std::array<uint64_t, 256> table{};
    for (int s = 0; s < 64; s++)
    {
        table[4 * s + 0] = init_mask(s, 1, 0) | init_mask(s, -1, 0);
        table[4 * s + 1] = init_mask(s, 0, 1) | init_mask(s, 0, -1);
        table[4 * s + 2] = init_mask(s, 1, 1) | init_mask(s, -1, -1);
        table[4 * s + 3] = init_mask(s, -1, 1) | init_mask(s, 1, -1);
    }
    return table;
}
static constexpr std::array<uint64_t, 256> rank_mask = init_array();
/* Start of code */
// Sliding attacks along one line: every square reachable from p along `line`,
// up to and including the first blocker of `block` on either side.
constexpr uint64_t slide_arithmetic(int p, uint64_t line, uint64_t block) {
    const uint64_t piece = 1ull << p;
    const uint64_t lower = piece - 1ull;   // every square below p
    // only blockers that sit on this line (and are not p itself) matter
    block &= line & ~piece;
    // upper ray: x ^ (x - 1) fills everything up to the lowest blocker
    const uint64_t up = block & ~lower;
    const uint64_t blocked_up = up ^ (up - 1ull);
    // lower ray: leading-zero count + shift fills down from the highest
    // blocker; the | 1 keeps countl_zero's argument nonzero (clz <= 63)
    const uint64_t down = block & lower;
    const uint64_t blocked_down = 0x7FFFFFFFFFFFFFFFull >> std::countl_zero(down | 1ull);
    // their XOR, clipped back to the line, is the attack set
    return (blocked_up ^ blocked_down) & line;
}
// Queen attacks from square s with occupancy occ: XOR of the four line
// attack sets (the four masks for s are adjacent in rank_mask).
constexpr uint64_t Queen(uint64_t s, uint64_t occ)
{
    const uint64_t* line = rank_mask.data() + 4 * s;
    uint64_t attacks = 0;
    for (int i = 0; i < 4; i++)
        attacks ^= slide_arithmetic(static_cast<int>(s), line[i], occ);
    return attacks;
}
}