In order to port an engine from x86 to ARM I had to get rid of assembly code. Various solutions are available, particularly here :
http://chessprogramming.wikispaces.com/BitScan
and on this forum.
The conversion of bsr to C code led to a 3% penalty so I considered various solutions and here are my results (for 1,000,000,000 iterations on a random dataset) :
------------------ BSR_asm ----------------
time BSR_asm 2.510000 sec
------------------ BSR_in_C ----------------
time BSR_in_C 6.730000 sec
------------------ BSR_double ----------------
time BSR_double 20.010000 sec
------------------ BSF_asm ----------------
time BSF_asm 2.500000 sec
------------------ BSF_folded ----------------
time BSF_folded 6.230000 sec
The code is attached below, hope this will save time in research for others
So here is my question : can you give a better solution for the two best C functions (BSR_in_C and BSF_folded) ?
Code: Select all
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define uint64 unsigned long long
#define uint32 unsigned int
const int lsb_64_table[64] = {
63, 30, 3, 32, 59, 14, 11, 33,
60, 24, 50, 9, 55, 19, 21, 34,
61, 29, 2, 53, 51, 23, 41, 18,
56, 28, 1, 43, 46, 27, 0, 35,
62, 31, 58, 4, 5, 49, 54, 6,
15, 52, 12, 40, 7, 42, 45, 16,
25, 57, 48, 13, 10, 39, 8, 44,
20, 47, 38, 22, 17, 37, 36, 26
};
/* ============================================ */
static inline int BSR_double (uint64 bb) {
union {
double d;
struct {
unsigned int mantissal : 32;
unsigned int mantissah : 20;
unsigned int exponent : 11;
unsigned int sign : 1;
};
} ud;
ud.d = (double)(bb & ~(bb >> 32));
return ud.exponent - 1023;
}
/* ============================================ */
const char msb_256_table[256] = {
0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4,4, 4, 4, 4,4, 4, 4, 4,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
};
static inline int BSR32(uint32 bb) {
int result = 0;
if (bb > 0xFFFF) {
bb >>= 16;
result += 16;
}
if (bb > 0xFF) {
bb >>= 8;
result += 8;
}
return (result + msb_256_table[bb]);
}
/* ============================================ */
static inline int BSR_in_C(uint64 bb) {
const uint32 hb = bb >> 32;
return hb ? 32 + BSR32((uint32)hb) : BSR32((uint32)bb);
}
/* ============================================ */
static inline int BSR_asm (uint64 w) {
int x1, x2;
asm ("bsr %1,%0\n" "jnz 1f\n" "bsr %0,%0\n" "subl $32,%0\n"
"1: addl $32,%0\n": "=&q" (x1), "=&q" (x2):"1" ((int) (w >> 32)),
"0" ((int) w));
return x1;
}
/* ============================================ */
static inline int BSF_folded (uint64 bb) {
unsigned int folded;
bb ^= bb - 1;
folded = (int) bb ^ (bb >> 32);
return lsb_64_table[folded * 0x78291ACF >> 26];
}
/* ============================================ */
static inline int BSF_asm (uint64 w) {
int x1, x2;
asm ("bsf %0,%0\n" "jnz 1f\n" "bsf %1,%0\n" "jz 1f\n" "addl $32,%0\n"
"1:": "=&q" (x1), "=&q" (x2):"1" ((int) (w >> 32)),
"0" ((int) w));
return x1;
}
#define TEST 1000
uint64 test[TEST];
int restab[TEST];
#define REPEAT 1000000
clock_t start, finish;
int i, j, res;
double duration;
void init_data() {
int i = 0;
srand(time(NULL));
for (i = 0 ; i< TEST; i++) {
uint64 x = rand();
uint64 y = ((uint64) rand()) << 32 ;
test[i] = x + y;
}
}
// ============================================================
void print_result(char *s) {
int i = 0;
finish = clock();
duration = (double)(finish - start) / CLOCKS_PER_SEC;
printf("------------------ %s ----------------\n", s);
printf("time %s %f sec\n", s, duration);
printf("\n");
}
// ============================================================
int main() {
init_data();
/* BSR */
start = clock();
for (j=0; j < REPEAT; j++)
for (i=0; i< TEST; i++)
restab[i] = BSR_asm(test[i]);
print_result("BSR_asm");
start = clock();
for (j=0; j < REPEAT; j++)
for (i=0; i< TEST; i++)
restab[i] = BSR_in_C(test[i]);
print_result("BSR_in_C");
start = clock();
for (j=0; j < REPEAT; j++)
for (i=0; i< TEST; i++)
restab[i] = BSR_double(test[i]);
print_result("BSR_double");
/* BSF */
start = clock();
for (j=0; j < REPEAT; j++)
for (i=0; i< TEST; i++)
restab[i] = BSF_asm(test[i]);
print_result("BSF_asm");
start = clock();
for (j=0; j < REPEAT; j++)
for (i=0; i< TEST; i++)
restab[i] = BSF_folded(test[i]);
print_result("BSF_folded");
return 0;
}