Bit Scan (equivalent to ASM instructions bsr and bsf)

Discussion of chess software programming and technical issues.

Moderators: hgm, Rebel, chrisw

pgeorges

Bit Scan (equivalent to ASM instructions bsr and bsf)

Post by pgeorges »

Hi,

In order to port an engine from x86 to ARM I had to get rid of assembly code. Various solutions are available, particularly here :
http://chessprogramming.wikispaces.com/BitScan
and on this forum.
The conversion of bsr to C code led to a 3% penalty so I considered various solutions and here are my results (for 1,000,000,000 iterations on a random dataset) :

------------------ BSR_asm ----------------
time BSR_asm 2.510000 sec
------------------ BSR_in_C ----------------
time BSR_in_C 6.730000 sec
------------------ BSR_double ----------------
time BSR_double 20.010000 sec
------------------ BSF_asm ----------------
time BSF_asm 2.500000 sec
------------------ BSF_folded ----------------
time BSF_folded 6.230000 sec

The code is attached below, hope this will save time in research for others ;-)

So here is my question : can you give a better solution for the two best C functions (BSR_in_C and BSF_folded) ?

Code: Select all

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define uint64 unsigned long long 
#define uint32 unsigned int

const int lsb_64_table&#91;64&#93; = &#123;
   63, 30,  3, 32, 59, 14, 11, 33,
   60, 24, 50,  9, 55, 19, 21, 34,
   61, 29,  2, 53, 51, 23, 41, 18,
   56, 28,  1, 43, 46, 27,  0, 35,
   62, 31, 58,  4,  5, 49, 54,  6,
   15, 52, 12, 40,  7, 42, 45, 16,
   25, 57, 48, 13, 10, 39,  8, 44,
   20, 47, 38, 22, 17, 37, 36, 26
&#125;;

/* ============================================ */
static inline int BSR_double &#40;uint64 bb&#41; &#123;
   union &#123;
      double d;
      struct &#123;
         unsigned int mantissal &#58; 32;
         unsigned int mantissah &#58; 20;
         unsigned int exponent &#58; 11;
         unsigned int sign &#58; 1;
      &#125;;
   &#125; ud;
   ud.d = &#40;double&#41;&#40;bb & ~&#40;bb >> 32&#41;);
   return ud.exponent - 1023;
&#125;

/* ============================================ */

const char msb_256_table&#91;256&#93; = &#123;
  0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
  4, 4, 4, 4, 4, 4, 4, 4,4, 4, 4, 4,4, 4, 4, 4,
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,   
&#125;;

static inline int BSR32&#40;uint32 bb&#41; &#123;
  int result = 0;

   if &#40;bb > 0xFFFF&#41; &#123;
      bb >>= 16;
      result += 16;
   &#125;
   if &#40;bb > 0xFF&#41; &#123;
      bb >>= 8;
      result += 8;
   &#125;

   return &#40;result + msb_256_table&#91;bb&#93;);
&#125;

/* ============================================ */
static inline int BSR_in_C&#40;uint64 bb&#41; &#123;
  const uint32 hb = bb >> 32;
  return hb ? 32 + BSR32&#40;&#40;uint32&#41;hb&#41; &#58; BSR32&#40;&#40;uint32&#41;bb&#41;;
   
&#125;

/* ============================================ */
static inline int BSR_asm &#40;uint64 w&#41; &#123;
  int x1, x2;
asm ("bsr %1,%0\n" "jnz 1f\n" "bsr %0,%0\n" "subl $32,%0\n"
     "1&#58; addl $32,%0\n"&#58; "=&q" &#40;x1&#41;, "=&q" &#40;x2&#41;&#58;"1" (&#40;int&#41; &#40;w >> 32&#41;),
     "0" (&#40;int&#41; w&#41;);
  return x1;
&#125;

/* ============================================ */
static inline int BSF_folded &#40;uint64 bb&#41; &#123;
   unsigned int folded;
   bb ^= bb - 1;
   folded = &#40;int&#41; bb ^ &#40;bb >> 32&#41;;
   return lsb_64_table&#91;folded * 0x78291ACF >> 26&#93;;
&#125;

/* ============================================ */
static inline int BSF_asm &#40;uint64 w&#41; &#123;
  int x1, x2;
asm ("bsf %0,%0\n" "jnz 1f\n" "bsf %1,%0\n" "jz 1f\n" "addl $32,%0\n"
     "1&#58;"&#58; "=&q" &#40;x1&#41;, "=&q" &#40;x2&#41;&#58;"1" (&#40;int&#41; &#40;w >> 32&#41;),
     "0" (&#40;int&#41; w&#41;);
  return x1;
&#125;

#define TEST 1000
uint64 test&#91;TEST&#93;;
int restab&#91;TEST&#93;;

#define REPEAT 1000000
  clock_t start, finish;
  int i, j, res;
  double duration;
  
void init_data&#40;) &#123;  
  int i = 0;
  srand&#40;time&#40;NULL&#41;);
  for &#40;i = 0 ; i< TEST; i++) &#123;
    uint64 x = rand&#40;);
    uint64 y = (&#40;uint64&#41; rand&#40;)) << 32 ;
    test&#91;i&#93; = x + y;
  &#125;
&#125;
// ============================================================
void print_result&#40;char *s&#41; &#123;
  int i = 0;
  finish = clock&#40;);
  duration = &#40;double&#41;&#40;finish - start&#41; / CLOCKS_PER_SEC;
  printf&#40;"------------------ %s ----------------\n", s&#41;;
  printf&#40;"time %s %f sec\n", s, duration&#41;;
  printf&#40;"\n");
&#125;
// ============================================================
int main&#40;) &#123;
  
  init_data&#40;);

/* BSR */
  start = clock&#40;);
  for &#40;j=0; j < REPEAT; j++)
    for &#40;i=0; i< TEST; i++) 
     restab&#91;i&#93; = BSR_asm&#40;test&#91;i&#93;);
  
  print_result&#40;"BSR_asm");

  start = clock&#40;);
  for &#40;j=0; j < REPEAT; j++)
    for &#40;i=0; i< TEST; i++) 
     restab&#91;i&#93; = BSR_in_C&#40;test&#91;i&#93;);

  print_result&#40;"BSR_in_C");
  
  start = clock&#40;);
  for &#40;j=0; j < REPEAT; j++)
    for &#40;i=0; i< TEST; i++) 
     restab&#91;i&#93; = BSR_double&#40;test&#91;i&#93;);

  print_result&#40;"BSR_double");

    /* BSF */
  start = clock&#40;);
  for &#40;j=0; j < REPEAT; j++)
    for &#40;i=0; i< TEST; i++) 
     restab&#91;i&#93; = BSF_asm&#40;test&#91;i&#93;);
  
  print_result&#40;"BSF_asm");
    
  start = clock&#40;);
  for &#40;j=0; j < REPEAT; j++)
    for &#40;i=0; i< TEST; i++) 
     restab&#91;i&#93; = BSF_folded&#40;test&#91;i&#93;);

  print_result&#40;"BSF_folded");

  return 0;
&#125;

Pascal Georges
Gerd Isenberg
Posts: 2250
Joined: Wed Mar 08, 2006 8:47 pm
Location: Hattingen, Germany

Re: Bit Scan (equivalent to ASM instructions bsr and bsf)

Post by Gerd Isenberg »

pgeorges wrote:Hi,

In order to port an engine from x86 to ARM I had to get rid of assembly code. Various solutions are available, particularly here :
http://chessprogramming.wikispaces.com/BitScan
and on this forum.
The conversion of bsr to C code led to a 3% penalty so I considered various solutions and here are my results (for 1,000,000,000 iterations on a random dataset) :

------------------ BSR_asm ----------------
time BSR_asm 2.510000 sec
------------------ BSR_in_C ----------------
time BSR_in_C 6.730000 sec
------------------ BSR_double ----------------
time BSR_double 20.010000 sec
------------------ BSF_asm ----------------
time BSF_asm 2.500000 sec
------------------ BSF_folded ----------------
time BSF_folded 6.230000 sec

The code is attached below, hope this will save time in research for others ;-)

So here is my question : can you give a better solution for the two best C functions (BSR_in_C and BSF_folded) ?
Walter Faxon's routine might be worth a try as bsf replacement on ARM. What processor did you use for your bench? Guess intel core2.
pgeorges

Re: Bit Scan (equivalent to ASM instructions bsr and bsf)

Post by pgeorges »

I made the tests with an Intel T2300 @ 1.66 GHz.

The BSF function you propose is 35% slower than the BSF_folded one (x86). Do you think on ARM it could be the opposite, and prove to be faster ?

Pascal Georges
Gerd Isenberg
Posts: 2250
Joined: Wed Mar 08, 2006 8:47 pm
Location: Hattingen, Germany

Re: Bit Scan (equivalent to ASM instructions bsr and bsf)

Post by Gerd Isenberg »

pgeorges wrote:I made the tests with an Intel T2300 @ 1.66 GHz.

The BSF function you propose is 35% slower than the BSF_folded one (x86). Do you think on ARM it could be the opposite, and prove to be faster ?

Pascal Georges
No idea how fast multiplication on ARM is. Walter's routine has only simple instructions and might be faster.

Gerd
Michael Sherwin
Posts: 3196
Joined: Fri May 26, 2006 3:00 am
Location: WY, USA
Full name: Michael Sherwin

Re: Bit Scan (equivalent to ASM instructions bsr and bsf)

Post by Michael Sherwin »

Gerd Isenberg wrote:
pgeorges wrote:I made the tests with an Intel T2300 @ 1.66 GHz.

The BSF function you propose is 35% slower than the BSF_folded one (x86). Do you think on ARM it could be the opposite, and prove to be faster ?

Pascal Georges
No idea how fast multiplication on ARM is. Walter's routine has only simple instructions and might be faster.

Gerd
Because, the ARM is a RISC style processor and in theory can run more simple instructions faster than an x86 (given the same clock rate) can run fewer complicated instructions!
If you are on a sidewalk and the covid goes beep beep
Just step aside or you might have a bit of heat
Covid covid runs through the town all day
Can the people ever change their ways
Sherwin the covid's after you
Sherwin if it catches you you're through
bnemias
Posts: 373
Joined: Thu Aug 14, 2008 3:21 am
Location: Albuquerque, NM

Re: Bit Scan (equivalent to ASM instructions bsr and bsf)

Post by bnemias »

Does your machine have the "clz" instruction?

if you are using GCC, then you should have __builtin_clz().

I have MIPS assembly routines for BSF and BSR using "clz" instruction if you need them. (Ported both stockfish and robbolito so they run on my MIPS and MIPS32v2 routers)
bnemias
Posts: 373
Joined: Thu Aug 14, 2008 3:21 am
Location: Albuquerque, NM

Re: Bit Scan (equivalent to ASM instructions bsr and bsf)

Post by bnemias »

Actually, here's the implementation I used. I did make UINT64 versions, but I found this to be as fast. And I ran into a stability problem writing the complete 64bit assembly routines because I don't understand the ATT syntax properly, I guess.

Code: Select all

typedef unsigned long long int uint64;
typedef unsigned int uint32;

#define BB_LOWORD&#40;x&#41; (&#40;uint32&#41;&#40;x&#41;)
#define BB_HIWORD&#40;x&#41; (&#40;uint64&#41;&#40;x&#41;>>32&#41;

static inline int ffs_mips32&#40;uint32 word&#41;
&#123;
    asm (
      // LS1B isolation&#58; &#40;x&#41; & (-x&#41;
      "negu $t0, %0\n"
      "and %0, $t0, %0\n"

      // now just fls
      "clz %0, %0\n"
      "li $t0, 31\n"
      "subu %0, $t0, %0\n"

      &#58; "=g" &#40;word&#41;
      &#58; "0" &#40;word&#41;
      &#58; "t0"
    );
    return &#40;int&#41;word;
&#125;

static inline int fls_mips32&#40;uint32 word&#41;
&#123;
  asm (
    "clz %0, %0\n"
    "li $t0, 31\n"
    "subu %0, $t0, %0\n"

    &#58; "=g" &#40;word&#41;
    &#58; "0" &#40;word&#41;
    &#58; "t0"
  );
  return &#40;int&#41;word;
&#125;

int BSR&#40;uint64 bb&#41;
&#123;
    uint32 w = BB_HIWORD&#40;bb&#41;;
    if &#40;w&#41;
        return 32 + fls_mips32&#40;w&#41;;
    return fls_mips32&#40;BB_LOWORD&#40;bb&#41;);
&#125;

int BSF&#40;uint64 bb&#41;
&#123;
    uint32 ls = BB_LOWORD&#40;bb&#41;;
    if &#40;ls&#41;
        return ffs_mips32&#40;ls&#41;;
    return 32 + ffs_mips32&#40;BB_HIWORD&#40;bb&#41;);
&#125;
pgeorges

Re: Bit Scan (equivalent to ASM instructions bsr and bsf)

Post by pgeorges »

Thank you very much for the hint, I was not aware of the clz instruction.

So I end up with this code for a BitScanReverse64 function, that should be the most optimized for X86, ARM and generic C :

Code: Select all

static inline int BSR32&#40;uint32 bb&#41; &#123;

#ifdef USE_ARM_ASM
  
  return ( 31 - __builtin_clz&#40;bb&#41; );
      
#else  
  int result = 0;
   
  if &#40;bb > 0xFFFF&#41; &#123;
      bb >>= 16;
      result += 16;
   &#125;
   if &#40;bb > 0xFF&#41; &#123;
      bb >>= 8;
      result += 8;
   &#125;

   return &#40;result + msb_256_table&#91;bb&#93;);
#endif
&#125;

static inline int BSR &#40;uint64 bb&#41; &#123;
#ifdef USE_X86_ASM
    int x1, x2;
asm ("bsr %1,%0\n" "jnz 1f\n" "bsr %0,%0\n" "subl $32,%0\n"
     "1&#58; addl $32,%0\n"&#58; "=&q" &#40;x1&#41;, "=&q" &#40;x2&#41;&#58;"1" (&#40;int&#41; &#40;bb >> 32&#41;),
     "0" (&#40;int&#41; bb&#41;);
  return x1;
#else
  const uint32 hb = bb >> 32;
  return hb ? 32 + BSR32&#40;&#40;uint32&#41;hb&#41; &#58; BSR32&#40;&#40;uint32&#41;bb&#41;;
#endif
&#125;
Pascal Georges
Gerd Isenberg
Posts: 2250
Joined: Wed Mar 08, 2006 8:47 pm
Location: Hattingen, Germany

Re: Bit Scan (equivalent to ASM instructions bsr and bsf)

Post by Gerd Isenberg »

pgeorges wrote:Thank you very much for the hint, I was not aware of the clz instruction.

So I end up with this code for a BitScanReverse64 function, that should be the most optimized for X86, ARM and generic C :

Code: Select all

static inline int BSR32&#40;uint32 bb&#41; &#123;

#ifdef USE_ARM_ASM
  
  return ( 31 - __builtin_clz&#40;bb&#41; );
      
#else  
  int result = 0;
   
  if &#40;bb > 0xFFFF&#41; &#123;
      bb >>= 16;
      result += 16;
   &#125;
   if &#40;bb > 0xFF&#41; &#123;
      bb >>= 8;
      result += 8;
   &#125;

   return &#40;result + msb_256_table&#91;bb&#93;);
#endif
&#125;

static inline int BSR &#40;uint64 bb&#41; &#123;
#ifdef USE_X86_ASM
    int x1, x2;
asm ("bsr %1,%0\n" "jnz 1f\n" "bsr %0,%0\n" "subl $32,%0\n"
     "1&#58; addl $32,%0\n"&#58; "=&q" &#40;x1&#41;, "=&q" &#40;x2&#41;&#58;"1" (&#40;int&#41; &#40;bb >> 32&#41;),
     "0" (&#40;int&#41; bb&#41;);
  return x1;
#else
  const uint32 hb = bb >> 32;
  return hb ? 32 + BSR32&#40;&#40;uint32&#41;hb&#41; &#58; BSR32&#40;&#40;uint32&#41;bb&#41;;
#endif
&#125;
Pascal Georges
Since ARM has no trailing zero count, you may even end up with following code for bsf32:

Code: Select all

static inline int BSF32&#40;uint32 bb&#41; &#123;

#ifdef USE_ARM_ASM
 
  return  __builtin_clz&#40;bb & -bb&#41;  ^ 31;
     
#else  
diep
Posts: 1822
Joined: Thu Mar 09, 2006 11:54 pm
Location: The Netherlands

Re: Bit Scan (equivalent to ASM instructions bsr and bsf)

Post by diep »

Small question: why use bitboards at a 32 bits ARM processor?

You know, you can try put car wheels underneath a bicycle, but the question is whether that makes any sense.

Vincent