What CPU architectures do you support?

smatovic · Post by **smatovic** » Sat Apr 15, 2023 8:29 am

Pondering about CPU chess engine development and architectures, I did GPGPU via OpenCL on Intel, AMD and Nvidia devices (Apple dropped support).

ARM-Cortex with NEON seems affordable via Raspberry Pi, also RISC-V via developer boards, Apple M-series via Mac Mini already on another level, ARM Neoverse is available in the cloud.

Would be fun to tinker with IBM POWER with VMX/VSX, looks like Stockfish dropped POWER from their Makefile. Recent POWER10 and SPARC64 XII seem not to be available as workstations/developer systems but as servers only, maybe there is a chance to get my hands on these via the cloud...

What CPU architectures do you support?

Maybe I will just stick with x86-64, and as an option emulate other systems via QEMU to ensure compatibility w/o tuning/profiling...

--
Srdja

smatovic · Post by **smatovic** » Sat Apr 15, 2023 9:00 am

Ah, "ppc-64 ppc-32" is still in the Stockfish Makefile (and meanwhile riscv64), dunno about their SIMD code for NNUE inference.

--
Srdja

chrisw · Post by **chrisw** » Sat Apr 15, 2023 2:55 pm

smatovic wrote: ↑Sat Apr 15, 2023 9:00 am Ah, "ppc-64 ppc-32" is still in the Stockfish Makefile (and meanwhile riscv64), dunno about their SIMD code for NNUE inference.

--
Srdja

SF SIMD inference code:

#ifdef USE_AVX512
// AVX-512 build: vec_t is a 512-bit integer vector, psqt_vec_t a 256-bit one.
using vec_t = __m512i;
using psqt_vec_t = __m256i;
// Load/store go through the aligned 512-bit intrinsics.
#define vec_load(a) _mm512_load_si512(a)
#define vec_store(a,b) _mm512_store_si512(a,b)
// Lane-wise arithmetic on 16-bit elements; vec_mul_16 keeps the low 16 bits
// of each product.
#define vec_add_16(a,b) _mm512_add_epi16(a,b)
#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
#define vec_mul_16(a,b) _mm512_mullo_epi16(a,b)
// All-zero vector, and broadcast of one 16-bit value to every lane.
#define vec_zero() _mm512_setzero_epi32()
#define vec_set_16(a) _mm512_set1_epi16(a)
// Lane-wise signed 16-bit maximum / minimum.
#define vec_max_16(a,b) _mm512_max_epi16(a,b)
#define vec_min_16(a,b) _mm512_min_epi16(a,b)
// Shifts every 16-bit lane of a and b right (logically) by 7, narrows the
// results to 8 bits with signed saturation and returns them in one vector,
// with the bytes derived from a first and the bytes derived from b second.
inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
    const vec_t a_shifted = _mm512_srli_epi16(a, 7);
    const vec_t b_shifted = _mm512_srli_epi16(b, 7);
    // The pack works per 128-bit lane, so its output alternates 64-bit
    // groups from a and b.
    const vec_t interleaved = _mm512_packs_epi16(a_shifted, b_shifted);
    // Gather the even 64-bit groups (from a) first, then the odd ones (from b).
    const vec_t qword_order = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
    return _mm512_permutexvar_epi64(qword_order, interleaved);
}
// psqt helpers: aligned load/store, 32-bit lane add/sub and zero on the
// 256-bit psqt_vec_t.
#define vec_load_psqt(a) _mm256_load_si256(a)
#define vec_store_psqt(a,b) _mm256_store_si256(a,b)
#define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
#define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
#define vec_zero_psqt() _mm256_setzero_si256()
// NOTE(review): presumably the number of vector registers to budget for and
// the vec_t width in bytes (64 == sizeof(__m512i)); the consumers of these
// constants are not visible here, confirm against their use.
#define NumRegistersSIMD 32
#define MaxChunkSize 64

#elif USE_AVX2
// AVX2 build: both vec_t and psqt_vec_t are 256-bit integer vectors.
// Note that #elif evaluates USE_AVX2 as an expression, so it has to expand
// to a value when defined.
using vec_t = __m256i;
using psqt_vec_t = __m256i;
// Load/store go through the aligned 256-bit intrinsics.
#define vec_load(a) _mm256_load_si256(a)
#define vec_store(a,b) _mm256_store_si256(a,b)
// Lane-wise arithmetic on 16-bit elements; vec_mul_16 keeps the low 16 bits
// of each product.
#define vec_add_16(a,b) _mm256_add_epi16(a,b)
#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
#define vec_mul_16(a,b) _mm256_mullo_epi16(a,b)
// All-zero vector, and broadcast of one 16-bit value to every lane.
#define vec_zero() _mm256_setzero_si256()
#define vec_set_16(a) _mm256_set1_epi16(a)
// Lane-wise signed 16-bit maximum / minimum.
#define vec_max_16(a,b) _mm256_max_epi16(a,b)
#define vec_min_16(a,b) _mm256_min_epi16(a,b)
// Shifts every 16-bit lane of a and b right (logically) by 7, narrows the
// results to 8 bits with signed saturation and returns them in one vector,
// with the bytes derived from a first and the bytes derived from b second.
inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
    const vec_t a_shifted = _mm256_srli_epi16(a, 7);
    const vec_t b_shifted = _mm256_srli_epi16(b, 7);
    // The pack works per 128-bit lane, so the 64-bit groups come out in the
    // order a-low, b-low, a-high, b-high.
    const vec_t interleaved = _mm256_packs_epi16(a_shifted, b_shifted);
    // 0xD8 == 0b11011000 picks 64-bit groups 0, 2, 1, 3, i.e. it swaps the
    // two middle groups.
    return _mm256_permute4x64_epi64(interleaved, 0xD8);
}
// psqt helpers: aligned load/store, 32-bit lane add/sub and zero on the
// 256-bit psqt_vec_t.
#define vec_load_psqt(a) _mm256_load_si256(a)
#define vec_store_psqt(a,b) _mm256_store_si256(a,b)
#define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
#define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
#define vec_zero_psqt() _mm256_setzero_si256()
// NOTE(review): presumably the number of vector registers to budget for and
// the vec_t width in bytes (32 == sizeof(__m256i)); the consumers of these
// constants are not visible here, confirm against their use.
#define NumRegistersSIMD 16
#define MaxChunkSize 32

#elif USE_SSE2
// SSE2 build: both vec_t and psqt_vec_t are 128-bit integer vectors.
using vec_t = __m128i;
using psqt_vec_t = __m128i;
// Load/store are plain pointer dereferences here rather than intrinsics.
// vec_store expands to an unparenthesised assignment, so use it only as a
// complete statement.
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
// Lane-wise arithmetic on 16-bit elements; vec_mul_16 keeps the low 16 bits
// of each product.
#define vec_add_16(a,b) _mm_add_epi16(a,b)
#define vec_sub_16(a,b) _mm_sub_epi16(a,b)
#define vec_mul_16(a,b) _mm_mullo_epi16(a,b)
// All-zero vector, and broadcast of one 16-bit value to every lane.
#define vec_zero() _mm_setzero_si128()
#define vec_set_16(a) _mm_set1_epi16(a)
// Lane-wise signed 16-bit maximum / minimum.
#define vec_max_16(a,b) _mm_max_epi16(a,b)
#define vec_min_16(a,b) _mm_min_epi16(a,b)
// Shift each 16-bit lane right (logically) by 7, then pack a and b to 8-bit
// lanes with signed saturation. A 128-bit vector is a single pack lane, so
// no reordering step follows.
#define vec_msb_pack_16(a,b) _mm_packs_epi16(_mm_srli_epi16(a,7),_mm_srli_epi16(b,7))
// psqt helpers: dereference-based load/store, 32-bit lane add/sub and zero.
#define vec_load_psqt(a) (*(a))
#define vec_store_psqt(a,b) *(a)=(b)
#define vec_add_psqt_32(a,b) _mm_add_epi32(a,b)
#define vec_sub_psqt_32(a,b) _mm_sub_epi32(a,b)
#define vec_zero_psqt() _mm_setzero_si128()
// Is64Bit is declared outside this excerpt.
// NOTE(review): presumably the number of vector registers to budget for
// (16 in 64-bit mode, otherwise 8) and the vec_t width in bytes
// (16 == sizeof(__m128i)); confirm against the consumers of these constants.
#define NumRegistersSIMD (Is64Bit ? 16 : 8)
#define MaxChunkSize 16

#elif USE_MMX
// MMX build: both vec_t and psqt_vec_t are 64-bit MMX vectors.
using vec_t = __m64;
using psqt_vec_t = __m64;
// Load/store are plain pointer dereferences here rather than intrinsics.
// vec_store expands to an unparenthesised assignment, so use it only as a
// complete statement.
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
// Lane-wise arithmetic on 16-bit elements; vec_mul_16 keeps the low 16 bits
// of each product.
#define vec_add_16(a,b) _mm_add_pi16(a,b)
#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
#define vec_mul_16(a,b) _mm_mullo_pi16(a,b)
// All-zero vector, and broadcast of one 16-bit value to every lane.
#define vec_zero() _mm_setzero_si64()
#define vec_set_16(a) _mm_set1_pi16(a)
// Lane-wise signed 16-bit maximum built from a compare mask: lanes where
// a > b take their value from a, all other lanes take it from b.
inline vec_t vec_max_16(vec_t a,vec_t b){
    const vec_t a_is_greater = _mm_cmpgt_pi16(a, b);
    const vec_t taken_from_a = _mm_and_si64(a_is_greater, a);
    const vec_t taken_from_b = _mm_andnot_si64(a_is_greater, b);
    return _mm_or_si64(taken_from_a, taken_from_b);
}
// Lane-wise signed 16-bit minimum built from a compare mask: lanes where
// a > b take their value from b, all other lanes take it from a.
inline vec_t vec_min_16(vec_t a,vec_t b){
    const vec_t a_is_greater = _mm_cmpgt_pi16(a, b);
    const vec_t taken_from_b = _mm_and_si64(a_is_greater, b);
    const vec_t taken_from_a = _mm_andnot_si64(a_is_greater, a);
    return _mm_or_si64(taken_from_b, taken_from_a);
}
// Shift each 16-bit lane right (logically) by 7, then pack a and b to 8-bit
// lanes with signed saturation.
#define vec_msb_pack_16(a,b) _mm_packs_pi16(_mm_srli_pi16(a,7),_mm_srli_pi16(b,7))
// psqt helpers: dereference-based load/store, 32-bit lane add/sub and zero.
#define vec_load_psqt(a) (*(a))
#define vec_store_psqt(a,b) *(a)=(b)
#define vec_add_psqt_32(a,b) _mm_add_pi32(a,b)
#define vec_sub_psqt_32(a,b) _mm_sub_pi32(a,b)
#define vec_zero_psqt() _mm_setzero_si64()
// Wraps _mm_empty(), which resets the MMX state after MMX code has run.
// Among the branches visible in this excerpt only this one defines it.
#define vec_cleanup() _mm_empty()
// NOTE(review): presumably the number of vector registers to budget for and
// the vec_t width in bytes (8 == sizeof(__m64)); the consumers of these
// constants are not visible here, confirm against their use.
#define NumRegistersSIMD 8
#define MaxChunkSize 8

#elif USE_NEON
// NEON build: vec_t holds eight signed 16-bit lanes, psqt_vec_t four signed
// 32-bit lanes.
using vec_t = int16x8_t;
using psqt_vec_t = int32x4_t;
// Load/store are plain pointer dereferences here rather than intrinsics.
// vec_store expands to an unparenthesised assignment, so use it only as a
// complete statement.
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
// Lane-wise arithmetic on signed 16-bit elements.
#define vec_add_16(a,b) vaddq_s16(a,b)
#define vec_sub_16(a,b) vsubq_s16(a,b)
#define vec_mul_16(a,b) vmulq_s16(a,b)
// Zero vector written as a brace-initialised vec_t, and broadcast of one
// 16-bit value to every lane.
#define vec_zero() vec_t{0}
#define vec_set_16(a) vdupq_n_s16(a)
// Lane-wise signed 16-bit maximum / minimum.
#define vec_max_16(a,b) vmaxq_s16(a,b)
#define vec_min_16(a,b) vminq_s16(a,b)
// Shifts every 16-bit lane of a and b right by 7 while narrowing it to
// 8 bits, concatenates the two 8-lane halves (a first, then b) and returns
// the resulting 128 bits viewed as vec_t.
inline vec_t vec_msb_pack_16(vec_t a, vec_t b){
    const int8x8_t shifta = vshrn_n_s16(a, 7);
    const int8x8_t shiftb = vshrn_n_s16(b, 7);
    const int8x16_t compacted = vcombine_s8(shifta, shiftb);
    // Reinterpret the 16 bytes as eight 16-bit lanes with the dedicated
    // intrinsic instead of dereferencing a reinterpret_cast'ed pointer.
    return vreinterpretq_s16_s8(compacted);
}
// psqt helpers: dereference-based load/store, signed 32-bit lane add/sub and
// a brace-initialised zero vector.
#define vec_load_psqt(a) (*(a))
#define vec_store_psqt(a,b) *(a)=(b)
#define vec_add_psqt_32(a,b) vaddq_s32(a,b)
#define vec_sub_psqt_32(a,b) vsubq_s32(a,b)
#define vec_zero_psqt() psqt_vec_t{0}
// NOTE(review): presumably the number of vector registers to budget for and
// the vec_t width in bytes (16 == sizeof(int16x8_t)); the consumers of these
// constants are not visible here, confirm against their use.
#define NumRegistersSIMD 16
#define MaxChunkSize 16

jdart · Post by **jdart** » Sat Apr 15, 2023 5:34 pm

I support pretty much everything, with SIMD optimizations for Intel and ARM (including M1). Arasan's code works on both little- and big-endian systems. At one point I used PolarHome (http://aix.polarhome.com/) for testing - they had a variety of hardware and software systems available for remote shell access, but that site is no longer operating.

smatovic · Post by **smatovic** » Sat Apr 15, 2023 10:44 pm

Haha, IBM POWER7 (VMX+VSX) and SPARC T4 (v9+VIS3) multisocket servers are available for 200 to 400 bucks on ebay

--
Srdja

jdart · Post by **jdart** » Sun Apr 16, 2023 7:07 pm

smatovic wrote: ↑Sat Apr 15, 2023 10:44 pm IBM POWER7 (VMX+VSX) and SPARC T4 (v9+VIS3) multisocket servers are available for 200 to 400 bucks on ebay

True, but one of the problems with older hardware is that you also have older software. In particular, it can be hard to get a modern C++ compiler for these machines, unless you can build GCC from source (and have the disk/CPU to do that).

flok · Post by **flok** » Sun Apr 16, 2023 7:12 pm

smatovic wrote: ↑Sat Apr 15, 2023 8:29 am What CPU architectures do you support?

Anything exporting the posix api as well as the esp32 microcontroller.

smatovic · Post by **smatovic** » Sun Apr 16, 2023 7:16 pm

jdart wrote: ↑Sun Apr 16, 2023 7:07 pm
smatovic wrote: ↑Sat Apr 15, 2023 10:44 pm IBM POWER7 (VMX+VSX) and SPARC T4 (v9+VIS3) multisocket servers are available for 200 to 400 bucks on ebay
True, but one of the problems with older hardware is that you also have older software. In particular, it can be hard to get a modern C++ compiler for these machines, unless you can build GCC from source (and have the disk/CPU to do that).

True, outdated tool-chain on outdated OS...

--
Srdja

syzygy · Post by **syzygy** » Tue Apr 18, 2023 1:57 am

jdart wrote: ↑Sun Apr 16, 2023 7:07 pm
smatovic wrote: ↑Sat Apr 15, 2023 10:44 pm IBM POWER7 (VMX+VSX) and SPARC T4 (v9+VIS3) multisocket servers are available for 200 to 400 bucks on ebay
True, but one of the problems with older hardware is that you also have older software. In particular, it can be hard to get a modern C++ compiler for these machines, unless you can build GCC from source (and have the disk/CPU to do that).

If the machine is not powerful enough to build GCC from source, then a cross-compiler can help.
You can even do profile-guided optimisation with qemu.

What CPU architectures do you support?

What CPU architectures do you support?

Re: What CPU architectures do you support?

Re: What CPU architectures do you support?

Re: What CPU architectures do you support?

Re: What CPU architectures do you support?

Re: What CPU architectures do you support?

Re: What CPU architectures do you support?

Re: What CPU architectures do you support?

Re: What CPU architectures do you support?