dangi12012 wrote: ↑Mon Jan 23, 2023 8:15 pm
Since you have AVX512, you may want to take a closer look at this one - the native code is in the comments, but I couldn't verify it. Your 13900K has the AVX512 + GFNI extensions needed to run it.
I couldn't figure out the correct steps to get the AVX512 instructions working; however, we only need the 256-bit GFNI versions for queens, which seem to work:
Code: Select all
Verify Engines...OK!
13th Gen Intel(R) Core(TM) i9-13900K
Million Lookups/s Random Squares, Random Occupation/s:
Name Performance [MQueens/s] Tablesize Dependencies Template Author Reference
SBAMG o^(o-3cbn) 332.169816 576 [4kb] countl_zero, bswap yes Syed Fahad http://www.talkchess.com/forum3/viewtopic.php?t=59845
SBAMG Inline 215.952793 0 [0kb] countl_zero, bswap yes Syed Fahad and Daniel Inführ http://www.talkchess.com/forum3/viewtopic.php?t=59845
GaloisField - AVX512 773.161004 0 [0kb] AVX512F_GFNI no Daniel Inführ (dangi12012) http://www.talkchess.com/forum3/viewtopic.php?f=7&t=81335
Hyperbola Quintessence o^(o-2r) 335.816510 256 [2kb] bswap no Ryan Mack https://www.chessprogramming.org/Hyperbola_Quintessence
Hyperbola Quintessence Inline 107.513876 0 [0kb] bswap yes Ryan Mack https://www.chessprogramming.org/Hyperbola_Quintessence
Genetic 8 Ray 58.763729 0 [0kb] bswap no Daniel Inführ (dangi12012) Abstract C++ Syntax Tree Sifter (c) Daniel Infuehr
Bitrotation 52.830621 0 [0kb] ReverseBits no Daniel Inführ (dangi12012) http://www.talkchess.com/forum3/viewtopic.php?f=7&t=79078&start=20
Binary Neural Network 44.667767 5852 [45kb] pdep_u64, AVX2 no Daniel Inführ (dangi12012) http://www.talkchess.com/forum3/viewtopic.php?f=7&t=79332
Exploding Bitboards 75.366895 768 [6kb] imul64 no Harald Lüßen http://www.open-aurec.com/wbforum/viewtopic.php?f=4&t=4523&start=80
Reference (Switch Lookup) 57.220132 0 [0kb] none yes Daniel Inführ (dangi12012) http://www.talkchess.com/forum3/viewtopic.php?f=7&t=78235&p=907362&hilit=espresso#p907362
AVX Branchless Shift 247.297757 0 [0kb] AVX2 no Daniel Inführ (dangi12012) http://www.talkchess.com/forum3/viewtopic.php?f=7&t=79005&start=60
Pext Emulated 86.823393 107904 [843kb] none no Zach Wegner https://randombit.net/bitbashing/posts/haswell_bit_permutations.html
Dumb7 Fill 90.561322 0 [0kb] none no Gunnar Andersson https://www.chessprogramming.org/Dumb7Fill
Kogge-Stone 143.259463 0 [0kb] none no Peter M. Kogge, Harold S. Stone https://www.chessprogramming.org/Kogge-Stone_Algorithm
Rotated Bitboards 48.882147 1848 [14kb] none no Robert Hyatt https://www.chessprogramming.org/Rotated_Bitboards
QBBEngine 245.401282 0 [0kb] countr_zero, countl_zero yes Fabio Gobbato https://www.chessprogramming.org/QBBEngine
QBBEngine - Shifted Mask 248.537255 0 [0kb] countr_zero, countl_zero no Fabio Gobbato http://www.talkchess.com/forum3/viewtopic.php?f=7&t=79005&start=90#p924623
Classical Bob-Mike 317.352015 1024 [8kb] countr_zero, countl_zero yes Robert Hyatt and Michael Sherwin https://www.chessprogramming.org/Classical_Approach
Advanced Bob-Mike 360.594259 520 [4kb] countr_zero, countl_zero no Michael Sherwin and Daniel Inführ http://www.talkchess.com/forum3/viewtopic.php?f=7&t=79078&start=50#p924653
Leorik 317.960811 128 [1kb] countl_zero no Thomas Jahn (lithander) https://github.com/lithander/MinimalChessEngine
Leorik Inline 125.383071 0 [0kb] countl_zero no Thomas Jahn (lithander) https://github.com/lithander/MinimalChessEngine
Obstruction Difference 348.109620 768 [6kb] countl_zero no Michael Hoffmann http://www.talkchess.com/forum3/viewtopic.php?t=29087
Obstruction Difference Inline 111.217179 0 [0kb] countl_zero yes Michael Hoffmann http://www.talkchess.com/forum3/viewtopic.php?t=29087
Genetic Obstruction Difference 335.078534 384 [3kb] countl_zero no Daniel Inführ and Michael Hoffmann http://www.talkchess.com/forum3/viewtopic.php?f=7&t=79701
Genetic Obstruction Difference V2 388.473977 768 [6kb] countl_zero no Daniel Inführ http://www.talkchess.com/forum3/viewtopic.php?f=7&t=79701
Slide Arithmetic 317.034250 256 [2kb] bzhi_u64, blsmsk_u64 no Jakob Progsch and Daniel Inführ http://www.talkchess.com/forum3/viewtopic.php?f=7&t=78693&p=914767&hilit=SlideArithm#p914767
Slide Arithmetic Inline 120.568965 0 [0kb] bzhi_u64, blsmsk_u64 no Jakob Progsch and Daniel Inführ http://www.talkchess.com/forum3/viewtopic.php?f=7&t=78693&p=914767&hilit=SlideArithm#p914767
Kindergarten 644.520235 16640 [130kb] imul64 no Urban Koistinen https://www.chessprogramming.org/Kindergarten_Bitboards
SISSY Bitboards 460.233876 180416 [1409kb] none no Michael Sherwin http://www.talkchess.com/forum3/viewtopic.php?f=7&t=73083
Fancy Magic BB - Variable shift 669.429925 93376 [729kb] imul64 yes Pradu Kannan https://www.chessprogramming.org/Magic_Bitboards#Fancy
FoldingHash - 4x fancy magic 313.984887 6468 [50kb] none no Daniel Inführ tbd
Plain Magic BB 729.709515 295168 [2306kb] imul64 no Lasse Hansen https://www.chessprogramming.org/Magic_Bitboards#Plain
Black Magic BB - Fixed shift 911.632430 88891 [694kb] imul64 no Onno Garms and Volker Annuss https://www.chessprogramming.org/Magic_Bitboards#Fixed_shift_Fancy
Pext constexpr 1392.418282 107904 [843kb] pext_u64 yes Zach Wegner https://www.chessprogramming.org/BMI2#PEXTBitboards
HyperCube 64.780195 107680 [841kb] none yes Daniel Inführ (dangi12012) http://www.talkchess.com/forum3/viewtopic.php?f=7&t=79004&p=916723&hilit=hypercube#p916723
Here is the updated code:
Code: Select all
namespace Chess_Lookup::GaloisField
{
// Size constant of this lookup implementation, fixed at 0.
// NOTE(review): presumably the table footprint reported by the benchmark
// ("Tablesize" column) - confirm against the harness. boardMask below still
// holds 64 __m256i entries that are not counted here.
constexpr auto Size = 0;
/// Shifts the compile-time bitboard `bb` by a whole number of ranks
/// (8 bits per rank).
///
/// @tparam bb    Bitboard constant to shift.
/// @param  ranks Signed rank count with |ranks| < 8. A positive value shifts
///               right by `ranks * 8` bits; zero or a negative value shifts
///               left by `-ranks * 8` bits.
/// @return The shifted bitboard; bits pushed past either end are discarded.
template<uint64_t bb>
constexpr uint64_t mask_shift(int ranks) {
    // Scale the already-negated count instead of writing `-(ranks << 3)`:
    // left-shifting a negative int is undefined behaviour before C++20 and
    // is rejected outright inside constant expressions.
    return ranks > 0 ? bb >> (ranks * 8) : bb << (-ranks * 8);
}
// Line masks through square X (0..63; X & 7 selects the column, X >> 3 the row).
//   dir_HO: the 8 bits of the row containing X (0xFF moved to that row).
//   dir_VE: the column containing X (bit 0 of every byte moved to that column).
//   dir_D1: the 0x8040201008040201 diagonal shifted onto X.
//   dir_D2: the 0x0102040810204080 diagonal shifted onto X.
// Every use of the argument is parenthesised so that an expression argument
// such as `a | b` is evaluated as a whole before the mask/shift is applied.
#define dir_HO(X) (0xFFull << ((X) & 56))
#define dir_VE(X) (0x0101010101010101ull << ((X) & 7))
#define dir_D1(X) (mask_shift<0x8040201008040201ull>(((X) & 7) - ((X) >> 3)))
#define dir_D2(X) (mask_shift<0x0102040810204080ull>(7 - ((X) & 7) - ((X) >> 3)))
// Per-square mask table with 64 entries, indexed by square and filled by
// InitMask(). Kept in static storage rather than `new __m256i[64]`: the heap
// block was never released, and a plain array removes one pointer
// indirection while `boardMask[i]` and `boardMask + i` keep working.
static __m256i boardMask[64];
static void InitMask() {
for (int square = 0; square < 64; ++square) {
boardMask[square] = _mm256_set_epi64x(dir_HO(square) ^ (1ull << square), dir_VE(square) ^ (1ull << square), dir_D1(square) ^ (1ull << square), dir_D2(square) ^ (1ull << square));
}
}
// Reverses the bit order of every 64-bit lane of `input` (all 32 bytes of
// the 256-bit vector are processed at once).
static __m256i bit_reverse(__m256i input) {
    // Step 1: GF(2) affine transform with the anti-diagonal bit matrix flips
    // the bit order inside each individual byte.
    const __m256i flip_matrix = _mm256_set1_epi64x(0x8040201008040201);
    const __m256i bytes_flipped = _mm256_gf2p8affine_epi64_epi8(input, flip_matrix, 0x00);

    // Step 2: reverse the order of the 8 bytes inside each 64-bit lane.
    // vpshufb works per 128-bit half and only uses the low 4 index bits.
    const __m256i byte_order = _mm256_set_epi8(
        8, 9, 10, 11, 12, 13, 14, 15,
        0, 1, 2, 3, 4, 5, 6, 7,
        24, 25, 26, 27, 28, 29, 30, 31,
        16, 17, 18, 19, 20, 21, 22, 23);
    return _mm256_shuffle_epi8(bytes_flipped, byte_order);
}
// Computes sliding attacks along the four line masks held in `mask`, one
// line per 64-bit lane (two opposite rays per line, 8 rays in total).
//
//   occ    - occupancy bitboard, broadcast to every lane.
//   square - index (0..63) of the sliding piece.
//   mask   - four line masks through `square` with the square's own bit
//            cleared (an entry of boardMask).
//
// Per lane this evaluates ((o - sq) ^ reverse(reverse(o) - sqRev)) & mask,
// where o is the occupancy restricted to the line: the plain subtraction
// handles the ray towards higher bits, the bit-reversed one the ray towards
// lower bits.
//
// Only AVX2 + 256-bit GFNI are required. The bitwise operations use the
// `_si256` forms rather than `_mm256_and_epi32` / `_mm256_xor_epi32`, which
// are AVX-512VL intrinsics, compute exactly the same bits, and would make
// the code depend on AVX-512 support.
static __m256i attack8(uint64_t occ, int square, __m256i mask) {
    const __m256i o = _mm256_and_si256(_mm256_set1_epi64x(occ), mask);
    const __m256i sq = _mm256_set1_epi64x(1ull << square);
    // Position of the square's bit after a 64-bit bit reversal.
    const __m256i sqRev = _mm256_set1_epi64x(0x8000000000000000ull >> square);
    const __m256i forward = _mm256_sub_epi64(o, sq);
    const __m256i backward = bit_reverse(_mm256_sub_epi64(bit_reverse(o), sqRev));
    return _mm256_and_si256(_mm256_xor_si256(forward, backward), mask);
}
// Returns the queen attack bitboard for square `sq` given occupancy `occ`.
// InitMask() must have been called so that boardMask[sq] is populated.
//
// attack8 yields one attack set per 64-bit lane; the lanes are OR-folded
// into lane 0, which is then extracted as the scalar result.
//
// Uses `_mm256_or_si256` (AVX2) rather than the AVX-512VL `_mm256_or_epi32`,
// and extracts lane 0 through intrinsics rather than the MSVC-only
// `.m256i_u64[0]` member so the function also builds with GCC and Clang.
static uint64_t Queen(int sq, uint64_t occ) {
    const __m256i lines = attack8(occ, sq, _mm256_loadu_si256(boardMask + sq));
    // 0x4E swaps the two 128-bit halves: lane 0 becomes lane0 | lane2 and
    // lane 1 becomes lane1 | lane3.
    const __m256i halves = _mm256_or_si256(lines, _mm256_permute4x64_epi64(lines, 0x4E));
    // 0x4D routes lane 1 into lane 0, completing the fold of all four lanes.
    const __m256i folded = _mm256_or_si256(halves, _mm256_permute4x64_epi64(halves, 0x4D));
    return static_cast<uint64_t>(_mm_cvtsi128_si64(_mm256_castsi256_si128(folded)));
}
// Remove the line-mask helper macros so they are not visible to any code
// that follows this point.
#undef dir_HO
#undef dir_VE
#undef dir_D1
#undef dir_D2
}
and corresponding assembly:
Code: Select all
00007FF7D8BF7C60 movzx edx,byte ptr [r8]
00007FF7D8BF7C64 movsxd rax,edx
00007FF7D8BF7C67 shl rax,5
00007FF7D8BF7C6B vmovdqu ymm4,ymmword ptr [rax+r13]
00007FF7D8BF7C71 vpand ymm3,ymm7,ymm4
00007FF7D8BF7C75 vgf2p8affineqb ymm0,ymm3,ymm8,0
00007FF7D8BF7C7B vpshufb ymm1,ymm0,ymm9
00007FF7D8BF7C80 shrx rcx,r14,rdx
00007FF7D8BF7C85 vmovq xmm0,rcx
00007FF7D8BF7C8A vpbroadcastq ymm0,xmm0
00007FF7D8BF7C8F vpsubq ymm0,ymm1,ymm0
00007FF7D8BF7C93 vgf2p8affineqb ymm1,ymm0,ymm8,0
00007FF7D8BF7C99 vpshufb ymm2,ymm1,ymm9
00007FF7D8BF7C9E shlx rax,r12,rdx
00007FF7D8BF7CA3 vmovq xmm0,rax
00007FF7D8BF7CA8 vpbroadcastq ymm0,xmm0
00007FF7D8BF7CAD vpsubq ymm0,ymm3,ymm0
00007FF7D8BF7CB1 vpxor ymm1,ymm0,ymm2
00007FF7D8BF7CB5 vpand ymm2,ymm1,ymm4
00007FF7D8BF7CB9 vpermq ymm0,ymm2,4Eh
00007FF7D8BF7CBF vpor ymm3,ymm2,ymm0
00007FF7D8BF7CC3 vpermq ymm1,ymm3,4Dh
00007FF7D8BF7CC9 vpor ymm0,ymm3,ymm1
00007FF7D8BF7CCD vmovq rax,xmm0
00007FF7D8BF7CD2 xor rdi,rax