Initial work on new merged kings network. Delete third-party factorizer code. Refactor Makefile to move networks out of the main repo. Move book to appropriate directory (bench 5472385)

2025-01-18 12:18:08 +01:00
parent cf6fe22513
commit a8af4cd70d
10 changed files with 41 additions and 255 deletions

.gitmodules vendored Normal file (+3)

@@ -0,0 +1,3 @@
[submodule "networks"]
	path = networks
	url = https://git.nocturn9x.space/heimdall-engine/networks

3rdparty/fcq.cpp vendored (-230)

@@ -1,230 +0,0 @@
/*
* MIT License
*
* Copyright (c) 2024 Ciekce
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#define _CRT_SECURE_NO_WARNINGS

#include <iostream>
#include <fstream>
#include <cstdint>
#include <array>
#include <memory>
#include <cstddef>
#include <cstring>
#include <cerrno>
#include <cassert>
#include <algorithm>
#include <cmath>
#include <limits>

namespace
{
    enum class QuantiseMode
    {
        Truncate,
        Round,
    };

    constexpr auto InFile = "raw.bin";
    constexpr auto OutFile = "factorised.bin";

    constexpr bool Factorised = true;
    constexpr bool PairwiseMul = false;
    constexpr bool TransposeOutputWeights = false;

    constexpr std::uint32_t InputSize = 768;
    constexpr std::uint32_t InputBuckets = 16;
    constexpr std::uint32_t L1 = 1280;
    constexpr std::uint32_t OutputBuckets = 8;

    constexpr float Clip = 1.98F;
    constexpr std::uint32_t L1Q = 255;
    constexpr std::uint32_t OutputQ = 64;
    constexpr QuantiseMode Mode = QuantiseMode::Round;
    constexpr std::size_t PaddingBlockSize = 64;

    // ========================================================================

    namespace internal
    {
        template <typename T, std::size_t N, std::size_t... Ns>
        struct MultiArrayImpl
        {
            using Type = std::array<typename MultiArrayImpl<T, Ns...>::Type, N>;
        };

        template <typename T, std::size_t N>
        struct MultiArrayImpl<T, N>
        {
            using Type = std::array<T, N>;
        };
    }

    template <typename T, std::size_t... Ns>
    using MultiArray = typename internal::MultiArrayImpl<T, Ns...>::Type;

    constexpr auto L1Weights = 2 * L1 / (1 + PairwiseMul);

    template <typename Param, std::uint32_t InputBuckets>
    struct Network
    {
        MultiArray<Param, InputBuckets, InputSize * L1> ftWeights;
        std::array<Param, L1> ftBiases;
        std::array<Param, L1Weights * OutputBuckets> l1Weights;
        std::array<Param, OutputBuckets> l1Biases;
    };

    // A factorised raw network carries one extra input bucket (bucket 0),
    // shared across all real buckets
    using RawNetwork = Network<float, InputBuckets + Factorised>;
    using QuantisedNetwork = Network<std::int16_t, InputBuckets>;
    using RawNetworkUnfactorised = Network<float, InputBuckets>;

    template <std::uint32_t Q>
    [[nodiscard]] inline auto quantise(float v)
    {
        v = std::clamp(v, -Clip, Clip);
        v *= static_cast<float>(Q);

        if constexpr (Mode == QuantiseMode::Round)
            v = std::round(v);

        assert(std::abs(v) <= static_cast<float>(std::numeric_limits<std::int16_t>::max()));
        return static_cast<std::int16_t>(v);
    }

    // Rounds v up to the next multiple of Block
    template <std::size_t Block>
    [[nodiscard]] inline auto pad(std::size_t v)
    {
        return ((v + Block - 1) / Block) * Block;
    }
}

auto main() -> int
{
    auto raw = std::make_unique<RawNetwork>();

    {
        std::ifstream in{InFile, std::ios::binary};

        if (!in)
        {
            std::cerr << "failed to open source network" << std::endl;
            std::cerr << std::strerror(errno) << std::endl;
            return 1;
        }

        if (!in.read(reinterpret_cast<char *>(raw.get()), sizeof(RawNetwork)))
        {
            std::cerr << "failed to load source network" << std::endl;

            if (in.eof())
            {
                std::cerr << "Source network too small";
                if (Factorised && in.gcount() >= sizeof(RawNetworkUnfactorised))
                    std::cerr << " - unfactorised network?";
                std::cerr << std::endl;
            }
            else std::cerr << std::strerror(errno) << std::endl;

            return 1;
        }
    }

    auto quantised = std::make_unique<QuantisedNetwork>();

    for (std::uint32_t bucket = 0; bucket < InputBuckets; ++bucket)
    {
        for (std::uint32_t weight = 0; weight < InputSize * L1; ++weight)
        {
            auto param = raw->ftWeights[bucket + Factorised][weight];

            // Fold the shared factoriser bucket into every real bucket
            if constexpr (Factorised)
                param += raw->ftWeights[0][weight];

            quantised->ftWeights[bucket][weight] = quantise<L1Q>(param);
        }
    }

    for (std::uint32_t bias = 0; bias < L1; ++bias)
    {
        quantised->ftBiases[bias] = quantise<L1Q>(raw->ftBiases[bias]);
    }

    if constexpr (TransposeOutputWeights)
    {
        for (std::uint32_t weight = 0; weight < L1Weights; ++weight)
        {
            for (std::uint32_t bucket = 0; bucket < OutputBuckets; ++bucket)
            {
                const auto src = weight * OutputBuckets + bucket;
                const auto dst = bucket * L1Weights + weight;

                quantised->l1Weights[dst] = quantise<OutputQ>(raw->l1Weights[src]);
            }
        }
    }
    else
    {
        for (std::uint32_t weight = 0; weight < L1Weights * OutputBuckets; ++weight)
        {
            quantised->l1Weights[weight] = quantise<OutputQ>(raw->l1Weights[weight]);
        }
    }

    for (std::uint32_t bias = 0; bias < OutputBuckets; ++bias)
    {
        // Output biases carry both scales
        quantised->l1Biases[bias] = quantise<L1Q * OutputQ>(raw->l1Biases[bias]);
    }

    {
        std::ofstream out{OutFile, std::ios::binary};

        if (!out.write(reinterpret_cast<const char *>(quantised.get()), sizeof(QuantisedNetwork)))
        {
            std::cerr << "failed to write transposed network" << std::endl;
            std::cerr << std::strerror(errno) << std::endl;
            return 1;
        }

        if constexpr (PaddingBlockSize > 1)
        {
            if (const auto padding = pad<PaddingBlockSize>(sizeof(QuantisedNetwork)) - sizeof(QuantisedNetwork);
                padding != 0)
            {
                static const std::array<std::byte, PaddingBlockSize> empty{};

                if (!out.write(reinterpret_cast<const char *>(empty.data()), static_cast<std::streamsize>(padding)))
                {
                    std::cerr << "failed to write padding" << std::endl;
                    std::cerr << std::strerror(errno) << std::endl;
                    return 1;
                }
            }
        }
    }

    return 0;
}
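For reference, the maths the deleted factoriser applied is simple: clip each parameter to ±1.98, scale by L1Q = 255 (feature transformer), OutputQ = 64 (output weights) or L1Q * OutputQ (output biases), then round. A minimal standalone Nim sketch of the same scheme; the function and the sample values are illustrative, not engine code:

import std/math

const Clip = 1.98'f32

func quantise(v: float32, q: int): int16 =
  ## Clamp to the clip range, scale by the quantisation
  ## factor and round to the nearest integer
  let scaled = clamp(v, -Clip, Clip) * q.float32
  int16(round(scaled))

assert quantise(0.5'f32, 255) == 128    # feature transformer weight
assert quantise(-2.5'f32, 255) == -505  # clipped to -1.98 before scaling
assert quantise(0.25'f32, 64) == 16     # output weight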

Makefile

@@ -4,30 +4,37 @@
 CC := clang
 EXE := bin/heimdall
-EVALFILE := ../hofud-v2.bin
+EVALFILE := ../networks/files/mistilteinn.bin
+NET_NAME := $(notdir $(EVALFILE))
 LD := ld
 SRCDIR := src
 LFLAGS := -flto -fuse-ld=$(LD)
-NFLAGS := --cc:$(CC) --mm:atomicArc -d:useMalloc -o:$(EXE) -d:evalFile=$(EVALFILE)
+NFLAGS := --panics:on --cc:$(CC) --mm:atomicArc -d:useMalloc -o:$(EXE) --passL:"$(LFLAGS)" -d:evalFile=$(EVALFILE)
+CFLAGS := -flto -static
-CFLAGS_MODERN := -flto -mtune=haswell -march=haswell -static
-NFLAGS_MODERN := $(NFLAGS) -d:danger --passC:"$(CFLAGS_MODERN)" --passL:"$(LFLAGS)" -d:simd -d:avx2
+CFLAGS_MODERN := $(CFLAGS) -mtune=haswell -march=haswell
+NFLAGS_MODERN := $(NFLAGS) -d:danger --passC:"$(CFLAGS_MODERN)" -d:simd -d:avx2
-CFLAGS_NATIVE:= -flto -mtune=native -march=native -static
-NFLAGS_NATIVE := $(NFLAGS) -d:danger --passC:"$(CFLAGS_MODERN)" --passL:"$(LFLAGS)" -d:simd -d:avx2
+CFLAGS_NATIVE:= $(CFLAGS) -mtune=native -march=native
+NFLAGS_NATIVE := $(NFLAGS) -d:danger --passC:"$(CFLAGS_NATIVE)" -d:simd -d:avx2
-CFLAGS_LEGACY := -flto -mtune=core2 -march=core2 -static
-NFLAGS_LEGACY := $(NFLAGS) -d:danger --passC:"$(CFLAGS_LEGACY)" --passL:"$(LFLAGS)" -u:simd -u:avx2
+CFLAGS_LEGACY := $(CFLAGS) -mtune=core2 -march=core2
+NFLAGS_LEGACY := $(NFLAGS) -d:danger --passC:"$(CFLAGS_LEGACY)" -u:simd -u:avx2
 deps:
 	nimble install -d
-modern: deps
+net:
+	git submodule update --init --recursive
+	cd networks && git fetch origin && git checkout FETCH_HEAD
+	git lfs fetch --include files/$(NET_NAME)
+modern: deps net
 	nim c $(NFLAGS_MODERN) $(SRCDIR)/heimdall.nim
-legacy: deps
+legacy: deps net
 	nim c $(NFLAGS_LEGACY) $(SRCDIR)/heimdall.nim
-native: deps
+native: deps net
 	nim c $(NFLAGS_NATIVE) $(SRCDIR)/heimdall.nim


@@ -24,6 +24,8 @@ architecture the compile was done on (and is what you want for releases/sharing
Or you can grab the latest version from the [releases](https://git.nocturn9x.space/nocturn9x/heimdall/releases) page.
**Note**: Unless you know what you're doing and how `nim.cfg` works, you probably don't want to build Heimdall using nimble: just use the Makefile.
**P.S.**: If you want to install Heimdall on your system, you can also run `nimble install` (making sure that nimble's
own binary directory is in your system's PATH), which will build the same executable that a bare `make` would (no
legacy/generic installation support as of now).

networks Submodule (+1)

Submodule networks added at 62058cd6f5


@@ -17,8 +17,9 @@
#-d:mimalloc
#-d:enableTuning
#-d:pinSearchThreads
-d:evalFile="../hofud-v2.bin"
-d:hlSize=1280
-d:evalFile="../mistilteinn.bin"
-d:hlSize=1536
-d:ftSize=704
-d:inputBuckets=16
-d:evalNormalizeFactor=298
--panics:on
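As a rough sanity check on these defines, and assuming the network file keeps the same layout as the Network struct in the deleted factoriser (per-bucket feature-transformer weights, feature-transformer biases, output weights for both perspectives, output biases) with 8 output buckets — an assumption, since the bucket count is not set in this file — the new network's size works out as:

const
  ftSize = 704        # merged-kings inputs per bucket
  hlSize = 1536
  inputBuckets = 16
  outputBuckets = 8   # assumed, not defined in this cfg

const params = inputBuckets * ftSize * hlSize + # feature transformer weights
               hlSize +                         # feature transformer biases
               2 * hlSize * outputBuckets +     # output weights, both perspectives
               outputBuckets                    # output biases

echo params, " int16 parameters, about ", params * 2 div (1024 * 1024), " MiB on disk"
# -> 17327624 int16 parameters, about 33 MiB on disk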


@@ -74,7 +74,13 @@ proc newEvalState*(networkPath: string = ""): EvalState =
 func feature(perspective: PieceColor, color: PieceColor, piece: PieceKind, square: Square): int =
   ## Constructs a feature from the given perspective for a piece
   ## of the given type and color on the given square
-  let colorIndex = if perspective == color: 0 else: 1
+  # We always use index 0 for the king because we do something called merged kings:
+  # due to the layout of our input buckets (i.e. they don't span more than 2x2 squares),
+  # it is impossible for two kings to be in the same bucket at any given time, so we can
+  # save a bunch of space (about 8%) by only accounting for one king per bucket, shrinking
+  # the size of the feature transformer from 768 inputs to 704
+  let colorIndex = if (perspective == color or piece == King): 0 else: 1
   let pieceIndex = piece.int
   let squareIndex = if perspective == White: int(square.flipRank()) else: int(square)
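To make the saving concrete, here is a hypothetical standalone sketch of the indexing trick with illustrative constants (this is not Heimdall's actual feature layout): since both kings share colour index 0, the enemy-king plane of 64 squares drops out, taking the per-bucket input count from 2 * 6 * 64 = 768 down to 704, i.e. the roughly 8% mentioned above.

const
  NumSquares = 64
  NumPieces = 6   # pawn..king; king is piece index 5 here

func mergedFeature(sameColor: bool, piece, square: int): int =
  ## Kings always land in the "own colour" half, so the
  ## enemy-king plane (indices 704..767) is never used
  let colorIndex = if sameColor or piece == 5: 0 else: 1
  colorIndex * NumPieces * NumSquares + piece * NumSquares + square

# Both kings map to the same feature slot...
assert mergedFeature(true, 5, 12) == mergedFeature(false, 5, 12)
# ...and every reachable index fits in 704 inputs
assert mergedFeature(false, 4, 63) == 703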


@@ -58,10 +58,13 @@ proc loadNet*(stream: Stream): Network =
   for i in 0..<HL_SIZE:
     result.ft.bias[i] = stream.readInt16().toLittleEndian()
-  for i in 0..<(HL_SIZE * 2):
-    for j in 0..<NUM_OUTPUT_BUCKETS:
-      # We transpose the output layer for faster CPU inference
-      result.l1.weight[j][i] = stream.readInt16().toLittleEndian()
+  for i in 0..<NUM_OUTPUT_BUCKETS:
+    for j in 0..<(HL_SIZE * 2):
+      # Note to self: bullet already transposes the weights for us
+      # so we don't need to do it manually (this is done because it
+      # allows for faster CPU inference). Just something to keep in
+      # mind!
+      result.l1.weight[i][j] = stream.readInt16().toLittleEndian()
   for i in 0..<NUM_OUTPUT_BUCKETS:
     result.l1.bias[i] = stream.readInt16().toLittleEndian()
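The bucket-major order matters because any given position is only ever evaluated against one output bucket; with weight[bucket][i] that bucket's weights form one contiguous row, so the output layer becomes a single linear scan. A rough sketch under assumed types (not the engine's actual inference code, which also applies an activation and dequantisation):

const
  HL_SIZE = 1536
  NUM_OUTPUT_BUCKETS = 8

type L1Layer = object
  weight: array[NUM_OUTPUT_BUCKETS, array[HL_SIZE * 2, int16]]
  bias: array[NUM_OUTPUT_BUCKETS, int16]

func evalBucket(l1: L1Layer, acc: array[HL_SIZE * 2, int16], bucket: int): int32 =
  ## With weight[bucket][i] the inner loop walks one contiguous row;
  ## the old weight[i][bucket] order strided through memory
  ## NUM_OUTPUT_BUCKETS elements at a time
  result = int32(l1.bias[bucket])
  for i in 0 ..< HL_SIZE * 2:
    result += int32(acc[i]) * int32(l1.weight[bucket][i])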

Binary file not shown.


@@ -574,9 +574,6 @@ proc log(self: SearchManager, depth, variation: int, line: array[256, Move], bes
 proc shouldStop*(self: var SearchManager, inTree=true): bool {.inline.} =
   ## Returns whether searching should
   ## stop
-  if self.expired:
-    # Search limit has expired before
-    return true
   if self.cancelled():
     # Search has been cancelled!
     return true
@@ -590,7 +587,6 @@ proc shouldStop*(self: var SearchManager, inTree=true): bool {.inline.} =
   self.expired = result
 
 proc getReduction(self: SearchManager, move: Move, depth, ply, moveNumber: int, isPV: static bool, improving, cutNode: bool): int {.inline.} =
   ## Returns the amount a search depth should be reduced to
   let moveCount = when isPV: self.parameters.lmrMoveNumber.pv else: self.parameters.lmrMoveNumber.nonpv