Its because no cycle is wasted in "Am I at my max depth yet"? For all recursions below depth N. So that is really a high number of unnecessary stop recursion tests. You can see in the code that most tests go away in the code!
Also you have a specialisation template where you can decide if to do quiesce search to the maxdepth AND dont have any cpu instruction to test for "!Tactiacal Move below max depth"
I get 800M evals/s which is still way too slow IMO. Parallel versions of Alphabeta are needed (but the algorithm itself forces serialisation) so its a very hard problem to solve.
TLDR: 200% faster alphabeta
Code: Select all
#pragma once
#include <immintrin.h>
#include <stdint.h>
#include <bit>
#include <string>
#include <iostream>
#include <array>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
namespace Quiesc {
constexpr int calc_depth = 7;
constexpr int max_depth = 12;
int g_seed;
inline int eval() {
g_seed = (214013 * g_seed + 2531011);
return (g_seed >> 16) & 0x7FFF;
}
//Quiet move search extension here
inline bool StopExpand(int depth) {
//return depth >= calc_depth && !move.isTactical()) || (depth >= maxDepth)
return depth >= calc_depth;
}
int bestEval(int alpha, int depth, bool maxplayer) {
int score, beta = maxplayer ? -32000 : +32000;
int movelist = 22;
//while (getMove()) {
while (movelist--) {
if (StopExpand(depth))
score = eval();
else
score = bestEval(beta, depth + 1, !maxplayer);
if (maxplayer) {
if (score > beta) beta = score;
if (beta >= alpha && depth > 2) break;
}
else { // minplayer
if (score < beta) beta = score;
if (beta <= alpha && depth > 2) break;
}
}
return beta;
}
template<int depth, bool maxplayer>
int bestEval(int alpha) {
int score, beta = maxplayer ? -32000 : +32000;
int movelist = 22;
//while (getMove()) {
while (movelist--) {
score = bestEval<depth + 1, !maxplayer>(beta);
if constexpr (maxplayer) {
if (score > beta) beta = score;
if (beta >= alpha && depth > 2) break;
}
else { // minplayer
if (score < beta) beta = score;
if (beta <= alpha && depth > 2) break;
}
}
return beta;
}
template<>
inline int bestEval<calc_depth, true>(int alpha) {
int beta = -32000;
int movelist = 22;
while (movelist--) {
if ((beta = std::max(eval(), beta)) >= alpha) break;
}
return beta;
}
template<>
inline int bestEval<calc_depth, false>(int alpha) {
int beta = +32000;
int movelist = 22;
while (movelist--) {
if ((beta = std::min(eval(), beta)) <= alpha) break;
}
return beta;
}
double Quiesc() {
g_seed = 2168210766;
auto t0 = std::chrono::high_resolution_clock::now();
if (bestEval(32000, 1, true) != 29501) std::cout << "Programming Error";
auto t1 = std::chrono::high_resolution_clock::now();
return 1000.0 * 94136204.0 / std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
}
double Quiesc_Template() {
g_seed = 2168210766;
auto t0 = std::chrono::high_resolution_clock::now();
if (bestEval<1, true>(32000) != 29501) std::cout << "Programming Error";
auto t1 = std::chrono::high_resolution_clock::now();
return 1000.0 * 94136204.0 / std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
}
void QuiescensePerf() {
std::cout << "Quiesc: " << Quiesc() << "M Eval/s\n";
std::cout << "Quiesc_Template: " << Quiesc_Template() << "M Eval/s\n";
}
}
Instead of doing beta = eval(); if )beta <= alpha) break; One could have a eval() that evaluates four boards at once (with AVX2 if possible) and since the moves should be ordered anyways the likelyhood of having a break in the first 4 evals is high. AVX2 can do 4 comparisons in one clock cycle and the moveptr can be decremented by 4 at once (and not --, --, --, --)