What CPU architectures do you support?

Discussion of chess software programming and technical issues.

Moderator: Ras

smatovic
Posts: 3226
Joined: Wed Mar 10, 2010 10:18 pm
Location: Hamburg, Germany
Full name: Srdja Matovic

What CPU architectures do you support?

Post by smatovic »

Pondering about CPU chess engine development and architectures, I did GPGPU via OpenCL on Intel, AMD and Nvidia devices (Apple dropped support).

ARM-Cortex with NEON seems affordable via Raspberry Pi, also RISC-V via developer boards, Apple M-series via Mac Mini already on another level, ARM Neoverse is available in the cloud.

Would be fun to tinker with IBM POWER with VMX/VSX, looks like Stockfish dropped POWER from their Makefile. Recent POWER10 and SPARC64 XII seem not to be available as workstations/developer systems but as servers only, maybe there is a chance to get hands on these via the cloud...

What CPU architectures do you support?

Maybe I will just stick with x86-64, and as an option emulate other systems via QEMU to ensure compatibility w/o tuning/profiling...

--
Srdja
smatovic
Posts: 3226
Joined: Wed Mar 10, 2010 10:18 pm
Location: Hamburg, Germany
Full name: Srdja Matovic

Re: What CPU architectures do you support?

Post by smatovic »

Ah, "ppc-64 ppc-32" is still in the Stockfish Makefile (and meanwhile riscv64), dunno about their SIMD code for NNUE inference.

--
Srdja
chrisw
Posts: 4624
Joined: Tue Apr 03, 2012 4:28 pm
Location: Midi-Pyrénées
Full name: Christopher Whittington

Re: What CPU architectures do you support?

Post by chrisw »

smatovic wrote: Sat Apr 15, 2023 9:00 am Ah, "ppc-64 ppc-32" is still in the Stockfish Makefile (and meanwhile riscv64), dunno about their SIMD code for NNUE inference.

--
Srdja
SF SIMD inference code:

#ifdef USE_AVX512
// AVX-512 branch: the 16-bit lane operations work on one 512-bit integer
// register (vec_t); the *_psqt macros further down in this branch use 256-bit
// registers (psqt_vec_t).
using vec_t = __m512i;
using psqt_vec_t = __m256i;
// Whole-register load/store through the aligned 512-bit intrinsics.
#define vec_load(a) _mm512_load_si512(a)
#define vec_store(a,b) _mm512_store_si512(a,b)
// Lane-wise arithmetic on packed 16-bit integers; vec_mul_16 keeps the low
// 16 bits of each product (mullo).
#define vec_add_16(a,b) _mm512_add_epi16(a,b)
#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
#define vec_mul_16(a,b) _mm512_mullo_epi16(a,b)
// All-zero register and broadcast of one 16-bit value to every lane.
#define vec_zero() _mm512_setzero_epi32()
#define vec_set_16(a) _mm512_set1_epi16(a)
// Lane-wise signed 16-bit maximum / minimum.
#define vec_max_16(a,b) _mm512_max_epi16(a,b)
#define vec_min_16(a,b) _mm512_min_epi16(a,b)
// Shifts every 16-bit lane of `a` and `b` right by 7 (logical shift), packs
// the two results to 8-bit lanes with signed saturation, and then reorders
// the 64-bit chunks. _mm512_packs_epi16 works per 128-bit lane, so its output
// alternates a-chunk / b-chunk; the permutation (0,2,4,6,1,3,5,7) moves the
// four chunks derived from `a` to the front and the four from `b` behind them.
inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
    const vec_t shifted_a = _mm512_srli_epi16(a, 7);
    const vec_t shifted_b = _mm512_srli_epi16(b, 7);
    const vec_t chunk_order = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
    const vec_t interleaved = _mm512_packs_epi16(shifted_a, shifted_b);
    return _mm512_permutexvar_epi64(chunk_order, interleaved);
}
// Operations on psqt_vec_t (256-bit registers in this branch): aligned
// load/store, lane-wise 32-bit add/subtract, and an all-zero register.
#define vec_load_psqt(a) _mm256_load_si256(a)
#define vec_store_psqt(a,b) _mm256_store_si256(a,b)
#define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
#define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
#define vec_zero_psqt() _mm256_setzero_si256()
// NOTE(review): presumably the number of SIMD registers available to the
// kernels and the vec_t width in bytes (64 bytes = 512 bits) — confirm
// against the code that consumes these constants.
#define NumRegistersSIMD 32
#define MaxChunkSize 64

#elif USE_AVX2
// AVX2 branch: both the 16-bit lane operations (vec_t) and the *_psqt macros
// (psqt_vec_t) work on 256-bit integer registers.
using vec_t = __m256i;
using psqt_vec_t = __m256i;
// Whole-register load/store through the aligned 256-bit intrinsics.
#define vec_load(a) _mm256_load_si256(a)
#define vec_store(a,b) _mm256_store_si256(a,b)
// Lane-wise arithmetic on packed 16-bit integers; vec_mul_16 keeps the low
// 16 bits of each product (mullo).
#define vec_add_16(a,b) _mm256_add_epi16(a,b)
#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
#define vec_mul_16(a,b) _mm256_mullo_epi16(a,b)
// All-zero register and broadcast of one 16-bit value to every lane.
#define vec_zero() _mm256_setzero_si256()
#define vec_set_16(a) _mm256_set1_epi16(a)
// Lane-wise signed 16-bit maximum / minimum.
#define vec_max_16(a,b) _mm256_max_epi16(a,b)
#define vec_min_16(a,b) _mm256_min_epi16(a,b)
// Shifts every 16-bit lane of `a` and `b` right by 7 (logical shift), packs
// the two results to 8-bit lanes with signed saturation, and then reorders
// the 64-bit quarters. _mm256_packs_epi16 works per 128-bit lane, giving the
// quarter order [a_lo, b_lo, a_hi, b_hi]; the permute restores
// [a_lo, a_hi, b_lo, b_hi].
inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
    const vec_t shifted_a = _mm256_srli_epi16(a, 7);
    const vec_t shifted_b = _mm256_srli_epi16(b, 7);
    const vec_t interleaved = _mm256_packs_epi16(shifted_a, shifted_b);
    // 0xD8 == 0b11011000: output quarters take source quarters 0, 2, 1, 3.
    return _mm256_permute4x64_epi64(interleaved, 0xD8);
}
// Operations on psqt_vec_t (256-bit registers in this branch): aligned
// load/store, lane-wise 32-bit add/subtract, and an all-zero register.
#define vec_load_psqt(a) _mm256_load_si256(a)
#define vec_store_psqt(a,b) _mm256_store_si256(a,b)
#define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
#define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
#define vec_zero_psqt() _mm256_setzero_si256()
// NOTE(review): presumably the number of SIMD registers available to the
// kernels and the vec_t width in bytes (32 bytes = 256 bits) — confirm
// against the code that consumes these constants.
#define NumRegistersSIMD 16
#define MaxChunkSize 32

#elif USE_SSE2
// SSE2 branch: both vec_t and psqt_vec_t are 128-bit integer registers.
using vec_t = __m128i;
using psqt_vec_t = __m128i;
// Load/store are plain pointer dereferences / assignments here rather than
// explicit load/store intrinsics.
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
// Lane-wise arithmetic on packed 16-bit integers; vec_mul_16 keeps the low
// 16 bits of each product (mullo).
#define vec_add_16(a,b) _mm_add_epi16(a,b)
#define vec_sub_16(a,b) _mm_sub_epi16(a,b)
#define vec_mul_16(a,b) _mm_mullo_epi16(a,b)
// All-zero register and broadcast of one 16-bit value to every lane.
#define vec_zero() _mm_setzero_si128()
#define vec_set_16(a) _mm_set1_epi16(a)
// Lane-wise signed 16-bit maximum / minimum.
#define vec_max_16(a,b) _mm_max_epi16(a,b)
#define vec_min_16(a,b) _mm_min_epi16(a,b)
// Shift each 16-bit lane right by 7 (logical) and pack both inputs to 8-bit
// lanes with signed saturation. A single 128-bit register needs no reordering
// afterwards, so this is a macro rather than a function.
#define vec_msb_pack_16(a,b) _mm_packs_epi16(_mm_srli_epi16(a,7),_mm_srli_epi16(b,7))
// Operations on psqt_vec_t: dereference load/store, lane-wise 32-bit
// add/subtract, and an all-zero register.
#define vec_load_psqt(a) (*(a))
#define vec_store_psqt(a,b) *(a)=(b)
#define vec_add_psqt_32(a,b) _mm_add_epi32(a,b)
#define vec_sub_psqt_32(a,b) _mm_sub_epi32(a,b)
#define vec_zero_psqt() _mm_setzero_si128()
// Is64Bit is defined outside this snippet.
// NOTE(review): presumably the register count differs between 64-bit and
// 32-bit builds, and MaxChunkSize is the vec_t width in bytes — confirm.
#define NumRegistersSIMD (Is64Bit ? 16 : 8)
#define MaxChunkSize 16

#elif USE_MMX
// MMX branch: both vec_t and psqt_vec_t are 64-bit MMX registers.
using vec_t = __m64;
using psqt_vec_t = __m64;
// Load/store are plain pointer dereferences / assignments here rather than
// explicit load/store intrinsics.
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
// Lane-wise arithmetic on packed 16-bit integers; vec_mul_16 keeps the low
// 16 bits of each product (mullo).
#define vec_add_16(a,b) _mm_add_pi16(a,b)
#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
#define vec_mul_16(a,b) _mm_mullo_pi16(a,b)
// All-zero register and broadcast of one 16-bit value to every lane.
#define vec_zero() _mm_setzero_si64()
#define vec_set_16(a) _mm_set1_pi16(a)
// Lane-wise signed 16-bit maximum, built from compare + bitwise select: the
// compare yields an all-ones mask in every lane where a > b, which picks `a`
// in those lanes and `b` in the others.
inline vec_t vec_max_16(vec_t a,vec_t b){
    const vec_t a_greater = _mm_cmpgt_pi16(a, b);
    const vec_t picked_from_a = _mm_and_si64(a_greater, a);
    const vec_t picked_from_b = _mm_andnot_si64(a_greater, b);
    return _mm_or_si64(picked_from_a, picked_from_b);
}
// Lane-wise signed 16-bit minimum, built from compare + bitwise select: the
// compare yields an all-ones mask in every lane where a > b, which picks `b`
// in those lanes and `a` in the others.
inline vec_t vec_min_16(vec_t a,vec_t b){
    const vec_t a_greater = _mm_cmpgt_pi16(a, b);
    const vec_t picked_from_b = _mm_and_si64(a_greater, b);
    const vec_t picked_from_a = _mm_andnot_si64(a_greater, a);
    return _mm_or_si64(picked_from_b, picked_from_a);
}
// Shift each 16-bit lane right by 7 (logical) and pack both inputs to 8-bit
// lanes with signed saturation.
#define vec_msb_pack_16(a,b) _mm_packs_pi16(_mm_srli_pi16(a,7),_mm_srli_pi16(b,7))
// Operations on psqt_vec_t: dereference load/store, lane-wise 32-bit
// add/subtract, and an all-zero register.
#define vec_load_psqt(a) (*(a))
#define vec_store_psqt(a,b) *(a)=(b)
#define vec_add_psqt_32(a,b) _mm_add_pi32(a,b)
#define vec_sub_psqt_32(a,b) _mm_sub_pi32(a,b)
#define vec_zero_psqt() _mm_setzero_si64()
// Only this branch defines vec_cleanup(): _mm_empty() resets the MMX state
// after MMX instructions have been used.
#define vec_cleanup() _mm_empty()
// NOTE(review): presumably the number of MMX registers and the vec_t width in
// bytes (8 bytes = 64 bits) — confirm against the code that consumes these.
#define NumRegistersSIMD 8
#define MaxChunkSize 8

#elif USE_NEON
// NEON branch: vec_t holds eight signed 16-bit lanes, psqt_vec_t four signed
// 32-bit lanes (both 128-bit registers).
using vec_t = int16x8_t;
using psqt_vec_t = int32x4_t;
// Load/store are plain pointer dereferences / assignments here rather than
// explicit load/store intrinsics.
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
// Lane-wise arithmetic on signed 16-bit lanes.
#define vec_add_16(a,b) vaddq_s16(a,b)
#define vec_sub_16(a,b) vsubq_s16(a,b)
#define vec_mul_16(a,b) vmulq_s16(a,b)
// Zero vector via brace initialisation, and broadcast of one 16-bit value.
#define vec_zero() vec_t{0}
#define vec_set_16(a) vdupq_n_s16(a)
// Lane-wise signed 16-bit maximum / minimum.
#define vec_max_16(a,b) vmaxq_s16(a,b)
#define vec_min_16(a,b) vminq_s16(a,b)
// Narrows two vectors of 16-bit lanes into one vector of sixteen 8-bit lanes:
// vshrn_n_s16 shifts every 16-bit lane right by 7 and narrows it to 8 bits,
// and vcombine_s8 places the lanes derived from `a` in the low half and those
// derived from `b` in the high half.
// The packed bytes are returned re-typed as vec_t (int16x8_t). The re-typing
// uses vreinterpretq_s16_s8, the intrinsic intended for this, instead of
// dereferencing a reinterpret_cast'ed pointer to the local; the bits returned
// are the same.
inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
    const int8x8_t shifta = vshrn_n_s16(a, 7);
    const int8x8_t shiftb = vshrn_n_s16(b, 7);
    const int8x16_t compacted = vcombine_s8(shifta,shiftb);
    return vreinterpretq_s16_s8(compacted);
}
// Operations on psqt_vec_t (int32x4_t): dereference load/store, lane-wise
// 32-bit add/subtract, and a zero vector via brace initialisation.
#define vec_load_psqt(a) (*(a))
#define vec_store_psqt(a,b) *(a)=(b)
#define vec_add_psqt_32(a,b) vaddq_s32(a,b)
#define vec_sub_psqt_32(a,b) vsubq_s32(a,b)
#define vec_zero_psqt() psqt_vec_t{0}
// NOTE(review): presumably the number of SIMD registers the kernels may use
// and the vec_t width in bytes (16 bytes = 128 bits) — confirm against the
// code that consumes these constants.
#define NumRegistersSIMD 16
#define MaxChunkSize 16
jdart
Posts: 4398
Joined: Fri Mar 10, 2006 5:23 am
Location: http://www.arasanchess.org

Re: What CPU architectures do you support?

Post by jdart »

I support pretty much everything, with SIMD optimizations for Intel and ARM (including M1). Arasan's code works on both little- and big-endian systems. At one point I used PolarHome (http://aix.polarhome.com/) for testing - they had a variety of hardware and software systems available for remote shell access, but that site is no longer operating.
smatovic
Posts: 3226
Joined: Wed Mar 10, 2010 10:18 pm
Location: Hamburg, Germany
Full name: Srdja Matovic

Re: What CPU architectures do you support?

Post by smatovic »

Haha, IBM POWER7 (VMX+VSX) and SPARC T4 (v9+VIS3) multisocket servers are available for 200 to 400 bucks on ebay :roll:

--
Srdja
jdart
Posts: 4398
Joined: Fri Mar 10, 2006 5:23 am
Location: http://www.arasanchess.org

Re: What CPU architectures do you support?

Post by jdart »

smatovic wrote: Sat Apr 15, 2023 10:44 pm IBM POWER7 (VMX+VSX) and SPARC T4 (v9+VIS3) multisocket servers are available for 200 to 400 bucks on ebay
True, but one of the problems with older hardware is that you also have older software. In particular, it can be hard to get a modern C++ compiler for these machines, unless you can build GCC from source (and have the disk/CPU to do that).
User avatar
flok
Posts: 558
Joined: Tue Jul 03, 2018 10:19 am
Full name: Folkert van Heusden

Re: What CPU architectures do you support?

Post by flok »

smatovic wrote: Sat Apr 15, 2023 8:29 am What CPU architectures do you support?
Anything exporting the posix api as well as the esp32 microcontroller.
smatovic
Posts: 3226
Joined: Wed Mar 10, 2010 10:18 pm
Location: Hamburg, Germany
Full name: Srdja Matovic

Re: What CPU architectures do you support?

Post by smatovic »

jdart wrote: Sun Apr 16, 2023 7:07 pm
smatovic wrote: Sat Apr 15, 2023 10:44 pm IBM POWER7 (VMX+VSX) and SPARC T4 (v9+VIS3) multisocket servers are available for 200 to 400 bucks on ebay
True, but one of the problems with older hardware is that you also have older software. In particular, it can be hard to get a modern C++ compiler for these machines, unless you can build GCC from source (and have the disk/CPU to do that).
True, outdated tool-chain on outdated OS...

--
Srdja
syzygy
Posts: 5694
Joined: Tue Feb 28, 2012 11:56 pm

Re: What CPU architectures do you support?

Post by syzygy »

jdart wrote: Sun Apr 16, 2023 7:07 pm
smatovic wrote: Sat Apr 15, 2023 10:44 pm IBM POWER7 (VMX+VSX) and SPARC T4 (v9+VIS3) multisocket servers are available for 200 to 400 bucks on ebay
True, but one of the problems with older hardware is that you also have older software. In particular, it can be hard to get a modern C++ compiler for these machines, unless you can build GCC from source (and have the disk/CPU to do that).
If the machine is not powerful enough to build GCC from source, then a cross-compiler can help.
You can even do profile-guided optimisation with qemu.