tcusr wrote: ↑Sun Dec 12, 2021 10:45 pm
Update: By combining the elements of all 4 rays into a single lookup table and placing them right next to each other,
I gained a few % of speed. (Because the elements are now adjacent in memory during the lookup.)
347.67MOps -> 364.83MOps
Code: Select all
Megalookups/s:
Exploading: 116.98MOps
Reference: 123.29MOps
KoggeStone: 177.18MOps
RotatedBoard: 158.44MOps
BobMike: 340.02MOps
SlideArithm: 364.83MOps
XorRookSub: 505.78MOps
FancyHash: 656.95MOps
Pext : 926.80MOps
HyperLookup: 1552.29MOps
Assembly comparison:
Before:
Code: Select all
# Before: scalar clang output (Intel syntax).  Each of the four inlined
# slide_arithmetic calls uses blsmsk for blocked_up = x ^ (x - 1) and
# bzhi / or 1 / lzcnt / shrx for blocked_down = 0x7FFF... >> clz.  The four
# line masks are loaded from four separate 64-entry tables.
Chess_Lookup::SlideArithm::Queen(int, unsigned long): # @Chess_Lookup::SlideArithm::Queen(int, unsigned long)
# setup: r8 = square s, r9 = 0x7FFFFFFFFFFFFFFF, r10 = mask of bits above s
movsxd r8, edi
mov ecx, 1
movabs r9, 9223372036854775807
mov rdi, qword ptr [8*r8 + Chess_Lookup::SlideArithm::rank_mask]
shlx r10, rcx, r8
mov r11, qword ptr [8*r8 + Chess_Lookup::SlideArithm::file_mask]
neg r10
btc r10, r8 # r10 = -(1 << s) with bit s cleared, i.e. ~bottom & ~piece
mov rcx, r10
mov rax, r10
# rank ray -> rdx
mov rdx, rdi
and rdx, rsi # blockers on the rank line
and rcx, rdx # masked_up
bzhi rdx, rdx, r8 # masked_down = blockers below s
or rdx, 1 # guard so lzcnt never sees 0
blsmsk rcx, rcx # blocked_up = x ^ (x - 1)
lzcnt rdx, rdx
shrx rdx, r9, rdx # blocked_down
xor rdx, rcx
and rdx, rdi # rank attack set
# file ray -> rdi
mov rdi, r11
and rdi, rsi
and rax, rdi
bzhi rdi, rdi, r8
or rdi, 1
blsmsk rax, rax
lzcnt rdi, rdi
shrx rdi, r9, rdi
xor rdi, rax
mov rax, qword ptr [8*r8 + Chess_Lookup::SlideArithm::a1h8_mask]
and rdi, r11
mov r11, qword ptr [8*r8 + Chess_Lookup::SlideArithm::h1a8_mask]
xor rdi, rdx # rdi = rank ^ file attacks
# a1h8 diagonal (-> rcx) and h1a8 anti-diagonal (-> rax), interleaved by the scheduler
mov rdx, r10
mov rcx, rax
and rcx, rsi
and rdx, rcx
bzhi rcx, rcx, r8
and rsi, r11
or rcx, 1
blsmsk rdx, rdx
and r10, rsi
lzcnt rcx, rcx
blsmsk r10, r10
shrx rcx, r9, rcx
xor rcx, rdx
and rcx, rax
bzhi rax, rsi, r8
or rax, 1
lzcnt rax, rax
shrx rax, r9, rax
xor rax, r10
and rax, r11
# combine all four ray attack sets
xor rax, rcx
xor rax, rdi
ret
After:
Code: Select all
# After: all four line masks live consecutively in one 256-entry table, so a
# single 32-byte vmovdqu (index = s * 32) fetches them together and clang
# evaluates all four rays in parallel in ymm registers.
Chess_Lookup::SlideArithm::Queen(unsigned long, unsigned long): # @Chess_Lookup::SlideArithm::Queen(unsigned long, unsigned long)
# setup: rax = bits above s (~bottom & ~piece), rcx = bits below s; broadcast occ
mov eax, 1
vmovq xmm1, rsi
vpcmpeqd ymm3, ymm3, ymm3 # all-ones (-1 in every lane)
vpxor xmm6, xmm6, xmm6 # zero, reused by the ==0 compares below
shlx rax, rax, rdi
vpbroadcastq ymm1, xmm1
lea rcx, [rax - 1] # bottom = (1 << s) - 1
neg rax
btc rax, rdi # rax = ~bottom & ~piece
shl rdi, 5 # 4 masks * 8 bytes per square
vmovdqu ymm0, ymmword ptr [rdi + Chess_Lookup::SlideArithm::rank_mask] # one load, all 4 line masks
vmovq xmm2, rax
vpbroadcastq ymm2, xmm2
vpand ymm1, ymm0, ymm1 # block = occ & line, per ray
vpand ymm2, ymm2, ymm1 # masked_up
vpaddq ymm3, ymm2, ymm3 # masked_up + (-1) = masked_up - 1
vpxor ymm2, ymm3, ymm2 # blocked_up = x ^ (x - 1)
vmovq xmm3, rcx
vpbroadcastq ymm3, xmm3
vpand ymm1, ymm1, ymm3 # masked_down
vpbroadcastq ymm3, qword ptr [rip + .LCPI0_0] # ymm3 = [1,1,1,1]
vpor ymm1, ymm1, ymm3 # | 1 guard for countl_zero
# vectorized countl_zero (no vplzcntq on AVX2): per-nibble clz LUT via
# vpshufb, partial counts merged upward bytes -> words -> dwords -> qwords
vmovdqa ymm3, ymmword ptr [rip + .LCPI0_1] # ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
vpsrlw ymm5, ymm1, 4
vpand ymm5, ymm5, ymmword ptr [rip + .LCPI0_2]
vpshufb ymm4, ymm3, ymm1
vpcmpeqb ymm7, ymm5, ymm6
vpshufb ymm3, ymm3, ymm5
vpand ymm4, ymm4, ymm7
vpaddb ymm3, ymm4, ymm3
vpcmpeqb ymm4, ymm1, ymm6
vpsrlw ymm4, ymm4, 8
vpand ymm4, ymm3, ymm4
vpsrlw ymm3, ymm3, 8
vpaddw ymm3, ymm3, ymm4
vpcmpeqw ymm4, ymm1, ymm6
vpcmpeqd ymm1, ymm1, ymm6
vpsrld ymm4, ymm4, 16
vpsrlq ymm1, ymm1, 32
vpand ymm4, ymm3, ymm4
vpsrld ymm3, ymm3, 16
vpaddd ymm3, ymm3, ymm4
vpbroadcastq ymm4, qword ptr [rip + .LCPI0_3] # ymm4 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
vpand ymm1, ymm3, ymm1
vpsrlq ymm3, ymm3, 32
vpaddq ymm1, ymm3, ymm1 # ymm1 = countl_zero(masked_down | 1), per ray
vpsrlvq ymm1, ymm4, ymm1 # blocked_down = 0x7FFF... >> clz
vpxor ymm1, ymm2, ymm1 # blocked_up ^ blocked_down
vpand ymm0, ymm1, ymm0 # clip each ray to its line mask
# horizontal XOR of the four 64-bit lanes -> final attack set in rax
vextracti128 xmm1, ymm0, 1
vpxor xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 238 # xmm1 = xmm0[2,3,2,3]
vpxor xmm0, xmm0, xmm1
vmovq rax, xmm0
vzeroupper
ret
Note that because the elements are packed in groups that are a multiple of 4 you suddenly get AVX2 instructions
Also the code shrank by 15 lines.
Code: Select all
#include <array>
#include <cstdint>
#include <bit>
namespace Chess_Lookup::SlideArithm
{
/* Init */
// True iff (f, r) is a valid file/rank coordinate on an 8x8 board.
constexpr bool safe_coord(int f, int r)
{
    return f >= 0 && f <= 7 && r >= 0 && r <= 7;
}
// Bitboard of the squares reached from square s by repeatedly stepping
// (df, dr) until the board edge; square s itself is excluded.
constexpr uint64_t init_mask(int s, int df, int dr)
{
    uint64_t ray = 0;
    for (int f = (s & 7) + df, r = (s >> 3) + dr; safe_coord(f, r); f += df, r += dr)
        ray |= 1ull << (r * 8 + f);
    return ray;
}
// Four full line masks per square, stored consecutively so one lookup pulls
// all of them in: horizontal, vertical, a1-h8 diagonal, h1-a8 anti-diagonal.
constexpr std::array<uint64_t, 256> init_array()
{
    std::array<uint64_t, 256> table{};
    for (int s = 0; s < 64; s++)
    {
        table[4 * s + 0] = init_mask(s, 1, 0) | init_mask(s, -1, 0);
        table[4 * s + 1] = init_mask(s, 0, 1) | init_mask(s, 0, -1);
        table[4 * s + 2] = init_mask(s, 1, 1) | init_mask(s, -1, -1);
        table[4 * s + 3] = init_mask(s, -1, 1) | init_mask(s, 1, -1);
    }
    return table;
}
static constexpr std::array<uint64_t, 256> rank_mask = init_array();
/* Start of code */
// Sliding attacks along one line: every square reachable from p along `line`,
// up to and including the first blocker of `block` on either side.
constexpr uint64_t slide_arithmetic(int p, uint64_t line, uint64_t block) {
    const uint64_t piece = 1ull << p;
    const uint64_t lower = piece - 1ull;   // every square below p
    // only blockers that sit on this line (and are not p itself) matter
    block &= line & ~piece;
    // upper ray: x ^ (x - 1) fills everything up to the lowest blocker
    const uint64_t up = block & ~lower;
    const uint64_t blocked_up = up ^ (up - 1ull);
    // lower ray: leading-zero count + shift fills down from the highest
    // blocker; the | 1 keeps countl_zero's argument nonzero (clz <= 63)
    const uint64_t down = block & lower;
    const uint64_t blocked_down = 0x7FFFFFFFFFFFFFFFull >> std::countl_zero(down | 1ull);
    // their XOR, clipped back to the line, is the attack set
    return (blocked_up ^ blocked_down) & line;
}
// Queen attacks from square s with occupancy occ: XOR of the four line
// attack sets (the four masks for s are adjacent in rank_mask).
constexpr uint64_t Queen(uint64_t s, uint64_t occ)
{
    const uint64_t* line = rank_mask.data() + 4 * s;
    uint64_t attacks = 0;
    for (int i = 0; i < 4; i++)
        attacks ^= slide_arithmetic(static_cast<int>(s), line[i], occ);
    return attacks;
}
}