Comparison of all known Sliding lookup algorithms

Discussion of chess software programming and technical issues.

Moderator: Ras

tcusr
Posts: 325
Joined: Tue Aug 31, 2021 10:32 pm
Full name: Mateo

Re: Comparison of all known Sliding lookup algorithms

Post by tcusr »

Kogge-Stone vectorized seems promising, see https://zeta-chess.app26.de/post/64-bit ... tor-based/
I implemented Kogge-Stone with GCC's builtin vectors in my CPU engine, and while it was much faster than normal Kogge-Stone it only gave a 7% speedup in search. It's not worth it.
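For reference, a fill along these lines (a minimal sketch with GCC's vector_size extension - lane layout and names are illustrative, not the engine's actual code) handles the four positive ray directions in one pass; the four negative directions need a mirrored fill with right shifts:

Code: Select all

#include <cstdint>

// 4 x 64-bit lanes via GCC's vector extension
typedef uint64_t v4u64 __attribute__((vector_size(32)));

// positive ray directions per lane: east(+1), north(+8), north-east(+9), north-west(+7)
static const v4u64 SH1   = { 1, 8, 9, 7 };
static const v4u64 SH2   = { 2, 16, 18, 14 };
static const v4u64 SH4   = { 4, 32, 36, 28 };
static const v4u64 AVOID = { 0xfefefefefefefefeULL,   // east: block wrap from H- to A-file
                             0xffffffffffffffffULL,   // north never wraps
                             0xfefefefefefefefeULL,   // north-east
                             0x7f7f7f7f7f7f7f7fULL }; // north-west: block wrap from A- to H-file

// Kogge-Stone occluded fill plus the final one-step shift, all four directions at once.
// Lanes 0,1 are the rook rays, lanes 2,3 the bishop rays.
static inline v4u64 positiveRayAttacks(uint64_t sliders, uint64_t empty) {
    v4u64 gen = { sliders, sliders, sliders, sliders };
    v4u64 pro = { empty, empty, empty, empty };
    pro &= AVOID;
    gen |= pro & (gen << SH1);
    pro &=       (pro << SH1);
    gen |= pro & (gen << SH2);
    pro &=       (pro << SH2);
    gen |= pro & (gen << SH4);
    return (gen << SH1) & AVOID;
}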
tcusr
Posts: 325
Joined: Tue Aug 31, 2021 10:32 pm
Full name: Mateo

Re: Comparison of all known Sliding lookup algorithms

Post by tcusr »

dangi12012 wrote: Thu Jan 06, 2022 4:43 pm If you want to use cuda yourself:
cu_common.h

Code: Select all

#pragma once
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <curand.h>
#include <curand_kernel.h>
#include <device_functions.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <vector_functions.h>

#ifndef __CUDACC__
#define __CUDACC__
#endif
#include "device_atomic_functions.h"


#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

#ifdef __INTELLISENSE__
//    Reverse the bit order of a 32 bit unsigned integer. 
__device__ unsigned int __brev(unsigned int  x) {};

//Reverse the bit order of a 64 bit unsigned integer.
__device__ unsigned long long int __brevll(unsigned long long int x) {};


//Return selected bytes from two 32 bit unsigned integers.
__device__ unsigned int __byte_perm(unsigned int  x, unsigned int  y, unsigned int  s) {};


//Return the number of consecutive high-order zero bits in a 32 bit integer.
__device__ int __clz(int  x) {};


//Count the number of consecutive high-order zero bits in a 64 bit integer.
__device__ int __clzll(long long int x) {};


//Find the position of the least significant bit set to 1 in a 32 bit integer.
__device__ int __ffs(int  x) {};


//Find the position of the least significant bit set to 1 in a 64 bit integer.
__device__ int __ffsll(long long int x) {};


//Concatenate hi : lo, shift left by shift & 31 bits, return the most significant 32 bits.
__device__ unsigned int __funnelshift_l(unsigned int  lo, unsigned int  hi, unsigned int  shift) {};


//Concatenate hi : lo, shift left by min(shift, 32) bits, return the most significant 32 bits.
__device__ unsigned int __funnelshift_lc(unsigned int  lo, unsigned int  hi, unsigned int  shift) {};


//Concatenate hi : lo, shift right by shift & 31 bits, return the least significant 32 bits.
__device__ unsigned int __funnelshift_r(unsigned int  lo, unsigned int  hi, unsigned int  shift) {};


//Concatenate hi : lo, shift right by min(shift, 32) bits, return the least significant 32 bits.
__device__ unsigned int __funnelshift_rc(unsigned int  lo, unsigned int  hi, unsigned int  shift) {};


//Compute average of signed input arguments, avoiding overflow in the intermediate sum.
__device__ int __hadd(int, int) {};


//Calculate the least significant 32 bits of the product of the least significant 24 bits of two integers.
__device__ int __mul24(int  x, int  y) {};


//Calculate the most significant 64 bits of the product of the two 64 bit integers.
__device__ long long int __mul64hi(long long int x, long long int y) {};


//Calculate the most significant 32 bits of the product of the two 32 bit integers.
__device__ int __mulhi(int  x, int  y) {};


//Count the number of bits that are set to 1 in a 32 bit integer.
__device__ int __popc(unsigned int  x) {};


//Count the number of bits that are set to 1 in a 64 bit integer.
__device__ int __popcll(unsigned long long int x) {};


//Compute rounded average of signed input arguments, avoiding overflow in the intermediate sum.
__device__ int __rhadd(int, int) {};


//Calculate |x - y| + z, the sum of absolute difference.
__device__ unsigned int __sad(int  x, int  y, unsigned int  z) {};


//Compute average of unsigned input arguments, avoiding overflow in the intermediate sum.
__device__ unsigned int __uhadd(unsigned int, unsigned int) {};


//Calculate the least significant 32 bits of the product of the least significant 24 bits of two unsigned integers.
__device__ unsigned int __umul24(unsigned int  x, unsigned int  y) {};


//Calculate the most significant 64 bits of the product of the two 64 unsigned bit integers.
__device__ unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y) {};


//Calculate the most significant 32 bits of the product of the two 32 bit unsigned integers.
__device__ unsigned int __umulhi(unsigned int  x, unsigned int  y) {};


//Compute rounded average of unsigned input arguments, avoiding overflow in the intermediate sum.
__device__ unsigned int __urhadd(unsigned int, unsigned int) {};


//Calculate |x - y| + z, the sum of absolute difference.
__device__ unsigned int __usad(unsigned int  x, unsigned int  y, unsigned int  z) {};

//////////////////////////////////////////////////////
//atomic functions

int atomicAdd(int* address, int val) {};
unsigned int atomicAdd(unsigned int* address, unsigned int val) {};
unsigned long long int atomicAdd(unsigned long long int* address, unsigned long long int val) {};
float atomicAdd(float* address, float val) {};
double atomicAdd(double* address, double val) {};

typedef int __half2;
typedef short __half;
__half2 atomicAdd(__half2* address, __half2 val) {};
__half atomicAdd(__half* address, __half val) {};

int atomicSub(int* address, int val) {};
unsigned int atomicSub(unsigned int* address, unsigned int val) {};

int atomicExch(int* address, int val) {};
unsigned int atomicExch(unsigned int* address, unsigned int val) {};
unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val) {};
float atomicExch(float* address, float val) {};

int atomicMin(int* address, int val) {};
unsigned int atomicMin(unsigned int* address, unsigned int val) {};
unsigned long long int atomicMin(unsigned long long int* address, unsigned long long int val) {};

int atomicMax(int* address, int val) {};
unsigned int atomicMax(unsigned int* address, unsigned int val) {};
unsigned long long int atomicMax(unsigned long long int* address, unsigned long long int val) {};

unsigned int atomicInc(unsigned int* address, unsigned int val) {};

unsigned int atomicDec(unsigned int* address, unsigned int val) {};

int atomicCAS(int* address, int compare, int val) {};
unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val) {};
unsigned long long int atomicCAS(unsigned long long int* address,
    unsigned long long int compare,
    unsigned long long int val) {};
unsigned short int atomicCAS(unsigned short int* address,
    unsigned short int compare,
    unsigned short int val) {};

int atomicAnd(int* address, int val) {};
unsigned int atomicAnd(unsigned int* address,
    unsigned int val) {};
unsigned long long int atomicAnd(unsigned long long int* address,
    unsigned long long int val) {};

int atomicOr(int* address, int val) {};
unsigned int atomicOr(unsigned int* address,
    unsigned int val) {};
unsigned long long int atomicOr(unsigned long long int* address,
    unsigned long long int val) {};

int atomicXor(int* address, int val) {};
unsigned int atomicXor(unsigned int* address, unsigned int val) {};
unsigned long long int atomicXor(unsigned long long int* address, unsigned long long int val) {};

template <typename T>
unsigned int __match_any_sync(unsigned mask, T value) {};
template <typename T>
unsigned int __match_all_sync(unsigned mask, T value, int* pred) {};

template <typename T>
T __shfl_sync(unsigned mask, T var, int srcLane, int width = warpSize) {};
template <typename T>
T __shfl_up_sync(unsigned mask, T var, unsigned int delta, int width = warpSize) {};
template <typename T>
T __shfl_down_sync(unsigned mask, T var, unsigned int delta, int width = warpSize) {};
template <typename T>
T __shfl_xor_sync(unsigned mask, T var, int laneMask, int width = warpSize) {};
#endif
SlideArithm.h

Code: Select all

#pragma once
#include "Cu_Common.h"

namespace SlideArithm {
    __device__ uint64_t MSB(uint64_t value)
    {
        return 63ull - __clzll(value);
    }

    __device__ uint64_t LSB(uint64_t value)
    {
        return __ffsll(value);
    }

    /* return the bitboard with the rook destinations */
    __device__ uint64_t Rook(uint64_t sq, uint64_t occupation)
    {
        uint64_t piece = 1ULL << sq;
        occupation ^= piece; /* remove the selected piece from the occupation */
        uint64_t piecesup = (0x0101010101010101ULL << sq) & (occupation | 0xFF00000000000000ULL); /* find the pieces up */
        uint64_t piecesdo = (0x8080808080808080ULL >> (63 - sq)) & (occupation | 0x00000000000000FFULL); /* find the pieces down */
        uint64_t piecesri = (0x00000000000000FFULL << sq) & (occupation | 0x8080808080808080ULL); /* find pieces on the right */
        uint64_t piecesle = (0xFF00000000000000ULL >> (63 - sq)) & (occupation | 0x0101010101010101ULL); /* find pieces on the left */
        return (((0x8080808080808080ULL >> (63 - LSB(piecesup))) & (0x0101010101010101ULL << MSB(piecesdo))) |
            ((0xFF00000000000000ULL >> (63 - LSB(piecesri))) & (0x00000000000000FFULL << MSB(piecesle)))) ^ piece;
        /* From every direction find the first piece and from that piece put a mask in the opposite direction.
           Put together all the 4 masks and remove the moving piece */
    }

    /* return the bitboard with the bishops destinations */
    __device__ uint64_t Bishop(uint64_t sq, uint64_t occupation)
    {  /* it's the same as the rook */
        uint64_t piece = 1ULL << sq;
        occupation ^= piece;
        uint64_t piecesup = (0x8040201008040201ULL << sq) & (occupation | 0xFF80808080808080ULL);
        uint64_t piecesdo = (0x8040201008040201ULL >> (63 - sq)) & (occupation | 0x01010101010101FFULL);
        uint64_t piecesle = (0x8102040810204081ULL << sq) & (occupation | 0xFF01010101010101ULL);
        uint64_t piecesri = (0x8102040810204081ULL >> (63 - sq)) & (occupation | 0x80808080808080FFULL);
        return (((0x8040201008040201ULL >> (63 - LSB(piecesup))) & (0x8040201008040201ULL << MSB(piecesdo))) |
            ((0x8102040810204081ULL >> (63 - LSB(piecesle))) & (0x8102040810204081ULL << MSB(piecesri)))) ^ piece;
    }

    __device__ uint64_t Queen(uint64_t sq, uint64_t occupation) {
        return Rook(sq, occupation) | Bishop(sq, occupation);
    }
}
You can see that when no lookup table is needed it is really simple to translate! I think every algo can be translated with high performance, except pext - and maybe Hyperbola Qsc will be slow with bswap64. These 100% won't translate (useless for the GPU): SISSY BB, Hash Plain, Hash Variable, RotatedBoard

Bonus:
If the algorithm's lookup fits in 64KB of space then a __constant__ memory location can be used - which is served from a dedicated on-chip cache on each SM. That will be extremely fast!

So I will continue to translate c++ to cuda and keep updates here.
I just noticed that in LSB you return __ffsll(value), but shouldn't you return __ffsll(value) - 1 to get the number of trailing zeros?
dangi12012
Posts: 1062
Joined: Tue Apr 28, 2020 10:03 pm
Full name: Daniel Infuehr

Re: Comparison of all known Sliding lookup algorithms

Post by dangi12012 »

tcusr wrote: Thu Jan 06, 2022 5:18 pm Kogge-Stone vectorized seems promising
Kogge-Stone has a strong dependency chain and is not the best algo in any category. For the CPU and the GPU there are better algorithms that also use zero (or a small) lookup.
IMO if you want to stay below 8 kB of lookup (any of these is faster than Kogge-Stone even with fallbacks):

Code: Select all

QBB Algo: 	119.15MOps	0 kB	Optimal perf: countr_zero, countl_zero
BobMike: 	126.89MOps	8 kB	Optimal perf: countr_zero, countl_zero
SlideArithm: 	129.22MOps	2 kB	Optimal perf: bzhi_u64, blsmsk_u64
Hyperbola Qsc: 	201.79MOps	2 kB	Optimal perf: bswap
1 MB lookup: Fancy, Pext (the wiki still hasn't fixed the source code - it uses 1 op more than my implementation)
https://www.chessprogramming.org/BMI2#PEXTBitboards
- you can consolidate arrAttacks and arrRookBase[sq] into a single attacks[sq] pointer (see the sketch below)
Mailslot: Hypercube
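Here is an illustrative sketch of that consolidation (names are made up and the table fill is the usual PEXT init, omitted here): every square stores one pointer straight into the shared attack table, so the lookup is a single indexed load with no extra base add.

Code: Select all

#include <cstdint>
#include <immintrin.h>   // _pext_u64, requires BMI2

struct PextEntry {
    const uint64_t* attacks;   // = sharedAttackTable + precomputed offset for this square
    uint64_t        mask;      // relevant rook occupancy mask, board edges excluded
};

static uint64_t  sharedAttackTable[102400];  // 800 kB worst case for the rook tables
static PextEntry rookEntry[64];              // filled once at startup by the usual init

static inline uint64_t rookAttacks(int sq, uint64_t occ) {
    const PextEntry& e = rookEntry[sq];
    return e.attacks[_pext_u64(occ, e.mask)];
}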
Worlds-fastest-Bitboard-Chess-Movegenerator
Daniel Inführ - Software Developer
dangi12012
Posts: 1062
Joined: Tue Apr 28, 2020 10:03 pm
Full name: Daniel Infuehr

Re: Comparison of all known Sliding lookup algorithms

Post by dangi12012 »

tcusr wrote: Thu Jan 06, 2022 6:14 pm I just noticed that in LSB you return __ffsll(value), but shouldn't you return __ffsll(value) - 1 to get the number of trailing zeros?
I'm not sure. If __ffsll === std::countr_zero then it should be ok. Is Fabio Gobbato still around here?
Because if you look at the code you can also do this:

Code: Select all

 static constexpr uint64_t MSB(uint64_t value)
    {
        return 63ull - std::countl_zero(value);
    }

    //Here we also add 63 - and change the Rook/Bishop code accordingly
    static constexpr uint64_t LSB(uint64_t value)
    {
        return 63ull - std::countr_zero(value);
    }
And then I wonder if we even need the 63- at all. Because IMO you would just need to change a left shift to a right shift with a different constant, and then you could drop all the 63- terms and be yet a small amount faster.

This probably has one of the best "lines of code" to MLU/s ratios. It is the best no-lookup algo by far.
Worlds-fastest-Bitboard-Chess-Movegenerator
Daniel Inführ - Software Developer
tcusr
Posts: 325
Joined: Tue Aug 31, 2021 10:32 pm
Full name: Mateo

Re: Comparison of all known Sliding lookup algorithms

Post by tcusr »

dangi12012 wrote: Thu Jan 06, 2022 6:24 pm
tcusr wrote: Thu Jan 06, 2022 6:14 pm I just noticed that in LSB you return __ffsll(value), but shouldn't you return __ffsll(value) - 1 to get the number of trailing zeros?
I'm not sure. If __ffsll === std::countr_zero then it should be ok. Is Fabio Gobbato still around here?
Because if you look at the code you can also do this:

Code: Select all

 static constexpr uint64_t MSB(uint64_t value)
    {
        return 63ull - std::countl_zero(value);
    }

    //Here we also add 63 - and change the Rook/Bishop code accordingly
    static constexpr uint64_t LSB(uint64_t value)
    {
        return 63ull - std::countr_zero(value);
    }
And then I wonder if we even need the 63- at all. Because IMO you would just need to change a left shift to a right shift with a different constant, and then you could drop all the 63- terms and be yet a small amount faster.

This probably has one of the best "lines of code" to MLU/s ratios. It is the best no-lookup algo by far.
ffs == ctz + 1, at least with GCC's builtins
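A quick host-side check of that relationship (C++20 <bit> with GCC's builtin; CUDA's __ffsll follows the same 1-based convention and returns 0 for a zero input):

Code: Select all

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
    uint64_t b = 0b101000ULL;                                 // lowest set bit is bit 3
    assert(__builtin_ffsll(b) == std::countr_zero(b) + 1);    // 4 == 3 + 1
    // zero is the special case: ffs(0) == 0, while countr_zero(0) == 64
    assert(__builtin_ffsll(0) == 0 && std::countr_zero(uint64_t{0}) == 64);
}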
User avatar
hgm
Posts: 28331
Joined: Fri Mar 10, 2006 10:06 am
Location: Amsterdam
Full name: H G Muller

Re: Comparison of all known Sliding lookup algorithms

Post by hgm »

Did you try the incrementally updated attack sets?
dangi12012
Posts: 1062
Joined: Tue Apr 28, 2020 10:03 pm
Full name: Daniel Infuehr

Re: Comparison of all known Sliding lookup algorithms

Post by dangi12012 »

CUDA Performance Preview (preliminary)

This is the kernel that randomly samples sq in [0..63] and occupancy in [0..uint64::max].

If you find something wrong with the code below - let me know! :)

Code: Select all

__global__ void cu_GetQueenAttacks(Cuda_Chessprocessor params)
{
    const int gid = getIdx();
    uint32_t x = 123456789 * gid, y = 362436069 ^ gid, z = 521288629; //Seed for Xorshift. curandStateXORWOW too slow

    volatile uint64_t* occs = params.attacks;
    for (int i = 0; i < params.loops; i++) {
        //occs[gid] = cu_Queen(cu_rand32(x, y, z) & 63, cu_rand64(x, y, z)); //7Billion
        //occs[gid] = SlideArithm::Queen(cu_rand32(x, y, z) & 63, cu_rand64(x, y, z)); //60 Billion
        //occs[gid] = BobLU::Queen(cu_rand32(x, y, z) & 63, cu_rand64(x, y, z)); //0.7 Billion?!
        //occs[gid] = KoggeStone::Queen(cu_rand32(x, y, z) & 63, cu_rand64(x, y, z)); //40 Billion
        //occs[gid] = HyperbolaQsc::Queen(cu_rand32(x,y,z) & 63, cu_rand64(x,y,z)); //16 Billion
    }
}
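The cu_rand32/cu_rand64 helpers are not shown here; a hypothetical stand-in that matches the three seed words above (a Marsaglia-style xorshift, purely illustrative and not the repo's code) could look like this:

Code: Select all

// Hypothetical helpers, not the actual repo code: one xorshift step over the
// per-thread state words x, y, z (passed by reference and mutated each call).
__device__ inline uint32_t cu_rand32(uint32_t& x, uint32_t& y, uint32_t& z)
{
    x ^= x << 16;
    x ^= x >> 5;
    x ^= x << 1;
    uint32_t t = x; x = y; y = z; z = t ^ x ^ y;
    return z;
}

__device__ inline uint64_t cu_rand64(uint32_t& x, uint32_t& y, uint32_t& z)
{
    // two 32-bit draws glued together
    uint64_t hi = cu_rand32(x, y, z);
    uint64_t lo = cu_rand32(x, y, z);
    return (hi << 32) | lo;
}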
So take it with a grain of salt so far - I will probably have more time on the weekend, and there is a bug to fix in Bob lookup. I also want to verify whether the algos that fit inside 64 kB really are the best fit for CUDA (then it's constant read-only memory, which is very, very fast).
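For the small-table algorithms the constant-memory variant could look roughly like this (a sketch, not the repo's code; the size and names are illustrative). The whole constant bank is 64 kB per device and reads go through a per-SM cache:

Code: Select all

#include <cstdint>
#include "cuda_runtime.h"

__constant__ uint64_t c_lookup[2048];             // 16 kB example table in the constant bank

// Host side: copy the precomputed table into constant memory once at startup.
void uploadLookup(const uint64_t (&hostTable)[2048])
{
    cudaMemcpyToSymbol(c_lookup, hostTable, sizeof(hostTable));
}

// Device side: the read is served from the constant cache.
__device__ uint64_t lookupExample(int idx)
{
    return c_lookup[idx & 2047];
}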

Code: Select all

FancyMagic::Queen - //7Billion
SlideArithm::Queen - //60 Billion
BobLU::Queen - //0.7 Billion?!
KoggeStone::Queen - //40 Billion
HyperbolaQsc::Queen - //16 Billion
I am very happy that I wrote the comparison code the way I did, because all the headers are dependency-free and translate really well into CUDA. (It might even be that there are GPU-only algorithms that have not been discovered yet and are a lot faster still.)
Keep in mind that a queen is the worst case; rooks/bishops are generally about twice as fast.

For example, reversing the bit order (not the byte order) is a luxury we do not have as a single instruction on x64:
https://docs.nvidia.com/cuda/cuda-math- ... __INT.html
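To illustrate the gap: a full bit reversal on x64 needs a byte swap plus a few mask-and-shift rounds (a sketch below, assuming GCC/Clang builtins), while on the GPU it is the single __brevll intrinsic:

Code: Select all

#include <cstdint>

static inline uint64_t brev64_on_cpu(uint64_t x) {
    x = __builtin_bswap64(x);                                                       // reverse the 8 bytes
    x = ((x & 0x0F0F0F0F0F0F0F0FULL) << 4) | ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);    // swap nibbles
    x = ((x & 0x3333333333333333ULL) << 2) | ((x >> 2) & 0x3333333333333333ULL);    // swap bit pairs
    x = ((x & 0x5555555555555555ULL) << 1) | ((x >> 1) & 0x5555555555555555ULL);    // swap single bits
    return x;
}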
Worlds-fastest-Bitboard-Chess-Movegenerator
Daniel Inführ - Software Developer
dangi12012
Posts: 1062
Joined: Tue Apr 28, 2020 10:03 pm
Full name: Daniel Infuehr

Re: Comparison of all known Sliding lookup algorithms

Post by dangi12012 »

CUDA Performance Release

I took this Saturday to translate all the algos to CUDA, and now I am very interested in the performance you get.
If you read this, run this CUDA executable and post your results here: https://github.com/Gigantua/Chess_Moveg ... ompare.exe
Nvidia only for now.

Results:

Code: Select all

NVIDIA GeForce RTX 3080
Fancy Magic:    6.82 GigaQueens/s
QBB Algo:       58.02 GigaQueens/s
Bob Lookup:     0.77 GigaQueens/s
Kogge Stone:    40.43 GigaQueens/s
Hyperbola Qsc:  17.43 GigaQueens/s
Switch Lookup:  4.40 GigaQueens/s
Slide Arithm:   18.38 GigaQueens/s
Pext Lookup:    16.85 GigaQueens/s
SISSY Lookup:   8.08 GigaQueens/s
Hypercube Alg:  1.31 GigaQueens/s
I am very surprised how some algos performed. It seems to me that CUDA really does not care about dependency chains (makes sense, because it doesn't do uOp fusing like modern x64), so Kogge-Stone and QBB really are good performers here.

Now everyone here can see that switching hardware from the usual x64 to some other architecture really changes the usual performance metrics we are used to. Emulated PEXT is faster than fancy magic! This is something I did not expect.

The general performance is much, much higher than the maximum on 32 threads with a Zen 3 Ryzen CPU, which was 10 gigalookups/s.
I am looking forward to some discussion. All memory-accessing algorithms seem to be much slower than the zero-lookup algorithms we have.
Sourcecode: https://github.com/Gigantua/Chess_Movegen

Winners: QBB, Kogge Stone
Losers: Fancy Magic, Bob Lookup

Edit: Separation of topics warrants a separate GPU thread, where GPU specifics can be discussed. This should stay a CPU (specifically x64) discussion thread, so multithreading, L2, etc. play a role here.

If you are missing a CPU slider lookup algorithm, let me know here.
Worlds-fastest-Bitboard-Chess-Movegenerator
Daniel Inführ - Software Developer
dangi12012
Posts: 1062
Joined: Tue Apr 28, 2020 10:03 pm
Full name: Daniel Infuehr

Re: Comparison of all known Sliding lookup algorithms

Post by dangi12012 »

UPDATE: Obstruction Difference Added

Thanks to Michael Hoffmann for sending me some source code. I optimized it a lot and made it a header-only constexpr/consteval library. So it is now like all the others - a drop-in header with no dependencies and no initialisation needed.
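As a toy illustration of the zero-initialisation idea (the table and names here are made up, not from the repo): the data is built by a constexpr function, so including the header is all the setup there is.

Code: Select all

#include <array>
#include <cstdint>

// Per-square file masks, computed at compile time - no runtime init call needed.
static constexpr std::array<uint64_t, 64> makeFileMasks() {
    std::array<uint64_t, 64> m{};
    for (int sq = 0; sq < 64; ++sq)
        m[sq] = 0x0101010101010101ULL << (sq & 7);   // the file containing sq
    return m;
}
inline constexpr auto FILE_MASK = makeFileMasks();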

I optimized it for modern C++, and I had the thought that I want to create a 0 kB version of all algorithms where the initialisation is sufficiently fast to be done at runtime (so no lookup table, no cache pressure etc. - pure register math only). This has implications for CUDA and I will release the results there soon! These algos will have the suffix _IL or Inline.

Run this on Windows (or build from source), but please share your results here!
https://github.com/Gigantua/Chess_Moveg ... ompare.exe

Other findings:
CLANG produces a much faster executable - but that finding may not translate to your hardware (I confirmed it for Zen3).

Results:

Code: Select all

AMD Ryzen 9 5950X 16-Core Processor
Megalooks Random Positions/s:
Exploading:     137.34MOps      6 kB    Optimal perf: imul64
Reference:      127.48MOps      8 kB    Optimal perf: none
KoggeStone:     171.57MOps      0 kB    Optimal perf: none
RotatedBoard:   107.07MOps      14 kB   Optimal perf: none
QBB Algo:       209.19MOps      0 kB    Optimal perf: countr_zero, countl_zero
BobMike:        316.85MOps      8 kB    Optimal perf: countr_zero, countl_zero
Obstr. Diff:    381.39MOps      6 kB    Optimal perf: countl_zero
Obstr. Inline:  196.87MOps      0 kB    Optimal perf: countl_zero
SlideArithm:    342.51MOps      2 kB    Optimal perf: bzhi_u64, blsmsk_u64
SISSY BB:       230.89MOps      1409 kB Optimal perf: none
Hyperbola Qsc:  525.39MOps      2 kB    Optimal perf: bswap
Hash Variable:  614.58MOps      729 kB  Optimal perf: imul64
Hash Plain:     638.67MOps      2306 kB Optimal perf: imul64
Hash Fancy:     821.48MOps      694 kB  Optimal perf: imul64
Pext  :         903.96MOps      843 kB  Optimal perf: pext_u64
HyperCube:      260.24MOps      841 kB  Optimal perf: none
Multithreaded:

Code: Select all

Summary:
Exploading:     2270.47MOps     32Threads
Reference:      1830.46MOps     31Threads
KoggeStone:     2195.68MOps     31Threads
RotatedBoard:   2153.06MOps     32Threads
QBB Algo:       3013.69MOps     32Threads
BobMike:        4140.99MOps     31Threads
Obstr. Diff:    5040.64MOps     32Threads
Obstr. Inline:  2667.82MOps     32Threads
SlideArithm:    4783.24MOps     32Threads
SISSY BB:       2753.38MOps     27Threads
Hyperbola Qsc:  6795.23MOps     31Threads
Hash Variable:  8051.02MOps     32Threads
Hash Plain:     6117.87MOps     31Threads
Hash Fancy:     9161.70MOps     31Threads
Pext  :         10671.81MOps    31Threads
HyperCube:      4210.65MOps     31Threads
Worlds-fastest-Bitboard-Chess-Movegenerator
Daniel Inführ - Software Developer
dangi12012
Posts: 1062
Joined: Tue Apr 28, 2020 10:03 pm
Full name: Daniel Infuehr

Re: Comparison of all known Sliding lookup algorithms

Post by dangi12012 »

Interesting observation:

Slide Arithmetic:

Code: Select all

	static const inline uint64_t slide_arithmetic(uint32_t p, uint64_t block) {
		//BZHI
		//[src & (1 << inx) - 1] ;
		// split the line into upper and lower rays
		uint64_t mask = bzhi(block, p);

		// for the bottom we use CLZ + a shift to fill in from the top
		uint64_t blocked_down = 0x7FFFFFFFFFFFFFFFull >> std::countl_zero(block & mask | 1ull);

		//_blsmsk_u64 = X^X-1
		// the intersection of the two is the move set after masking with the line
		return (blsmsk(block & ~mask) ^ blocked_down);
	}

	static const inline uint64_t Queen(uint32_t s, uint64_t occ)
	{
		const uint64_t* r = rank_mask.data() + 4 * s;
		return slide_arithmetic(s, r[0] & occ) & r[0]
			 ^ slide_arithmetic(s, r[1] & occ) & r[1]
			 ^ slide_arithmetic(s, r[2] & occ) & r[2]
			 ^ slide_arithmetic(s, r[3] & occ) & r[3];
	}
}
Obstruction Difference:

Code: Select all

    static constexpr uint64_t line_attack(int sq, uint64_t occ, int dir)
    {
        const lineEx& line = lines[4 * sq + dir];
        uint64_t lower = line.lower & occ;
        uint64_t upper = line.upper & occ;
        uint64_t msb = (1ull << 63ull) >> std::countl_zero(lower | 1);  //Extract Highest Set Isolated Bit
        uint64_t lsb = upper & -upper;                                  //Extract Lowest Set Isolated Bit
        uint64_t oDif = lsb * 2 - msb;
        return line.uni & oDif;
    }
    #pragma warning(pop)

    static constexpr uint64_t Bishop(int sq, uint64_t occ)
    {
        return line_attack(sq, occ, 2) | line_attack(sq, occ, 3);
    }

    static constexpr uint64_t Rook(int sq, uint64_t occ)
    {
        return line_attack(sq, occ, 0) | line_attack(sq, occ, 1);
    }

    static constexpr uint64_t Queen(int sq, uint64_t occ)
    {
        return Bishop(sq, occ) | Rook(sq, occ);
    }
Either they are the same algorithm, or they have subtle differences. For example, Obstruction Difference does not contain any XOR operations - yet they seem very similar to me. Maybe a blsmsk optimisation is also possible for Obstruction Difference.
Worlds-fastest-Bitboard-Chess-Movegenerator
Daniel Inführ - Software Developer