Add pairwise multiplication support (bench 5349640)

This commit is contained in:
2026-01-13 13:58:34 +01:00
parent 4870972dcb
commit 8cc9e200cc
4 changed files with 51 additions and 17 deletions

View File

@@ -30,6 +30,7 @@ MERGED_KINGS := 0
EVAL_NORMALIZE_FACTOR := 337
HORIZONTAL_MIRRORING := 1
VERBATIM_NET := 1
PAIRWISE_NET := 0
HL_SIZE := 1536
FT_SIZE := 768
ENABLE_TUNING := 0
@@ -70,6 +71,12 @@ else
CUSTOM_FLAGS += -d:verbatimNet=false
endif
ifeq ($(PAIRWISE_NET),1)
CUSTOM_FLAGS += -d:pairwiseNet=true
else
CUSTOM_FLAGS += -d:pairwiseNet=false
endif
ifeq ($(HORIZONTAL_MIRRORING),1)
CUSTOM_FLAGS += -d:horizontalMirroring=true
else

View File

@@ -61,13 +61,15 @@ set the `EvalFile` UCI option to the path of the network file.
If you _do_ intend to embed a different neural network than the one heimdall defaults with, there are a bunch of things to change. You can see
that the Makefile defines the following options:
```Makefile
EVALFILE := ../networks/files/mistilteinn-v3-verbatim.bin
EVALFILE := ../networks/files/laevateinn-v2-verbatim.bin
# [...]
INPUT_BUCKETS := 16
OUTPUT_BUCKETS := 8
MERGED_KINGS := 0
EVAL_NORMALIZE_FACTOR := 337
HORIZONTAL_MIRRORING := 1
VERBATIM_NET := 1
PAIRWISE_NET := 0
HL_SIZE := 1536
FT_SIZE := 768
```
@@ -85,6 +87,10 @@ it to build with a different one. Specifically:
Feel free to ask for help on how to do this. Not doing this will make Heimdall's normalized eval output completely unreliable, as it will be based
on the parameters for a different network.
- `HORIZONTAL_MIRRORING` enables horizontal mirroring
- `VERBATIM_NET` builds the network into the executable in a way that requires no post-processing at runtime. Generally used when embedding the output of the `dump` command
when running heimdall in mixed mode (doing this with normal nets *will* break things!)
- `PAIRWISE_NET` enables pairwise multiplication (see [here](https://cosmo.tardis.ac/files/2024-08-17-multilayer.html) for details). Requires a network
trained with pairwise multiplication to work
- `HL_SIZE` controls the size of the first hidden layer
- `FT_SIZE` controls the size of the feature transformer (aka input layer)

View File

@@ -352,12 +352,21 @@ proc evaluate*(position: Position, state: EvalState): Score {.inline.} =
var weightOffset = 0
for accumulator in [state.accumulators[position.sideToMove][state.current].data,
state.accumulators[position.sideToMove.opposite()][state.current].data]:
for i in 0..<HL_SIZE:
let input = accumulator[i]
let weight = network.l1.weight[outputBucket][i + weightOffset]
let clipped = clamp(input, 0, QA).int32
sum += int16(clipped * weight) * clipped
weightOffset += HL_SIZE
for i in 0..<HL_SIZE div (when PAIRWISE_NET: 2 else: 1):
when PAIRWISE_NET:
let input1 = accumulator[i]
let input2 = accumulator[i + HL_SIZE div 2]
let weight = network.l1.weight[outputBucket][i + weightOffset]
let clipped1 = clamp(input1, 0, QA).int32
let clipped2 = clamp(input2, 0, QA).int32
sum += int16(clipped1 * weight) * clipped2
else:
let input = accumulator[i]
let weight = network.l1.weight[outputBucket][i + weightOffset]
let clipped = clamp(input, 0, QA).int32
sum += int16(clipped * weight) * clipped
weightOffset += HL_SIZE div (when PAIRWISE_NET: 2 else: 1)
# Profit! Now we just need to scale the result
return ((sum div QA + network.l1.bias[outputBucket]) * EVAL_SCALE) div (QA * QB)
else:
@@ -368,17 +377,28 @@ proc evaluate*(position: Position, state: EvalState): Score {.inline.} =
for accumulator in [state.accumulators[position.sideToMove][state.current].data,
state.accumulators[position.sideToMove.opposite()][state.current].data]:
var i = 0
while i < HL_SIZE:
var input = vecLoad(addr accumulator[i])
var weight = vecLoad(addr network.l1.weight[outputBucket][i + weightOffset])
var clipped = vecMin16(vecMax16(input, vecZero16()), vecSetOne16(QA))
while i < HL_SIZE div (when PAIRWISE_NET: 2 else: 1):
# Pairwise multiplication: instead of squaring a single clipped input
# (clamp(n, 0, QA)^2, i.e. SCReLU), we multiply two different clipped inputs
# (clamp(n1, 0, QA) * clamp(n2, 0, QA), with n1 != n2): this dimensionality
# reduction technique helps speed up inference for large L1s. More details:
# https://cosmo.tardis.ac/files/2024-08-17-multilayer.html
# (see "Pairwise Multiplication")
when PAIRWISE_NET:
var input1 = vecLoad(addr accumulator[i])
var input2 = vecLoad(addr accumulator[i + HL_SIZE div 2])
var weight = vecLoad(addr network.l1.weight[outputBucket][i + weightOffset])
var clipped1 = vecMin16(vecMax16(input1, vecZero16()), vecSetOne16(QA))
var clipped2 = vecMin16(vecMax16(input2, vecZero16()), vecSetOne16(QA))
var product = vecMadd16(vecMullo16(clipped1, weight), clipped2)
else:
var input = vecLoad(addr accumulator[i])
var weight = vecLoad(addr network.l1.weight[outputBucket][i + weightOffset])
var clipped = vecMin16(vecMax16(input, vecZero16()), vecSetOne16(QA))
var product = vecMadd16(vecMullo16(clipped, weight), clipped)
var product = vecMadd16(vecMullo16(clipped, weight), clipped)
sum = vecAdd32(sum, product)
i += CHUNK_SIZE
weightOffset += HL_SIZE
weightOffset += HL_SIZE div (when PAIRWISE_NET: 2 else: 1)
return (vecReduceAdd32(sum) div QA + network.l1.bias[outputBucket]) * EVAL_SCALE div (QA * QB)

View File

@@ -42,6 +42,7 @@ const
MERGED_KINGS* {.booldefine: "mergedKings".} = true
MIRRORED* {.booldefine: "horizontalMirroring".} = true
VERBATIM_NET* {.booldefine: "verbatimNet".} = true
PAIRWISE_NET* {.booldefine: "pairwiseNet".} = false
NET_ID* {.define: "netID".} = ""
# LUT mapping king square to buckets (it's mirrored
# because we do HM)
@@ -92,7 +93,7 @@ type
Network* = object
ft*: IntLayer[FT_SIZE * NUM_INPUT_BUCKETS, HL_SIZE]
l1*: TransposedIntLayer[HL_SIZE * 2, NUM_OUTPUT_BUCKETS]
l1*: TransposedIntLayer[HL_SIZE * (when PAIRWISE_NET: 1 else: 2), NUM_OUTPUT_BUCKETS]
func toLittleEndian[T: int16 or uint16](x: T): T {.inline.} =
@@ -111,7 +112,7 @@ proc dumpNet*(net: Network, path: string) =
for i in 0..<HL_SIZE:
file.writeData(addr net.ft.bias[i], 2)
for i in 0..<(HL_SIZE * 2):
for i in 0..<HL_SIZE * (when PAIRWISE_NET: 1 else: 2):
for j in 0..<NUM_OUTPUT_BUCKETS:
file.writeData(addr net.l1.weight[j][i], 2)
@@ -128,7 +129,7 @@ proc loadNet*(stream: Stream): Network =
result.ft.bias[i] = stream.readInt16().toLittleEndian()
for i in 0..<NUM_OUTPUT_BUCKETS:
for j in 0..<(HL_SIZE * 2):
for j in 0..<HL_SIZE * (when PAIRWISE_NET: 1 else: 2):
# Note to self: bullet already transposes the weights for us
# so we don't need to do it manually (this is done because it
# allows for faster CPU inference). Just something to keep in