Add pairwise multiplication support (bench 5349640)
This commit is contained in:
7
Makefile
7
Makefile
@@ -30,6 +30,7 @@ MERGED_KINGS := 0
|
||||
EVAL_NORMALIZE_FACTOR := 337
|
||||
HORIZONTAL_MIRRORING := 1
|
||||
VERBATIM_NET := 1
|
||||
PAIRWISE_NET := 0
|
||||
HL_SIZE := 1536
|
||||
FT_SIZE := 768
|
||||
ENABLE_TUNING := 0
|
||||
@@ -70,6 +71,12 @@ else
|
||||
CUSTOM_FLAGS += -d:verbatimNet=false
|
||||
endif
|
||||
|
||||
ifeq ($(PAIRWISE_NET),1)
|
||||
CUSTOM_FLAGS += -d:pairwiseNet=true
|
||||
else
|
||||
CUSTOM_FLAGS += -d:pairwiseNet=false
|
||||
endif
|
||||
|
||||
ifeq ($(HORIZONTAL_MIRRORING),1)
|
||||
CUSTOM_FLAGS += -d:horizontalMirroring=true
|
||||
else
|
||||
|
||||
@@ -61,13 +61,15 @@ set the `EvalFile` UCI option to the path of the network file.
|
||||
If you _do_ intend to embed a different neural network than the one heimdall defaults with, there are a bunch of things to change. You can see
|
||||
that the Makefile defines the following options:
|
||||
```Makefile
|
||||
EVALFILE := ../networks/files/mistilteinn-v3-verbatim.bin
|
||||
EVALFILE := ../networks/files/laevateinn-v2-verbatim.bin
|
||||
# [...]
|
||||
INPUT_BUCKETS := 16
|
||||
OUTPUT_BUCKETS := 8
|
||||
MERGED_KINGS := 0
|
||||
EVAL_NORMALIZE_FACTOR := 337
|
||||
HORIZONTAL_MIRRORING := 1
|
||||
VERBATIM_NET := 1
|
||||
PAIRWISE_NET := 0
|
||||
HL_SIZE := 1536
|
||||
FT_SIZE := 768
|
||||
```
|
||||
@@ -85,6 +87,10 @@ it to build with a different one. Specifically:
|
||||
Feel free to ask for help on how to do this. Not doing this will make Heimdall's normalized eval output completely unreliable, as it will be based
|
||||
on the parameters for a different network.
|
||||
- `HORIZONTAL_MIRRORING` enables horizontal mirroring
|
||||
- `VERBATIM_NET` builds the network into the executable in a way that requires no post-processing at runtime. Generally used when embedding the output of the `dump` command
|
||||
when running Heimdall in mixed mode (doing this with normal nets *will* break things!)
|
||||
- `PAIRWISE_NET` enables pairwise multiplication (see [here](https://cosmo.tardis.ac/files/2024-08-17-multilayer.html) for details). Requires a network trained with pairwise
|
||||
activation to work
|
||||
- `HL_SIZE` controls the size of the first hidden layer
|
||||
- `FT_SIZE` controls the size of the feature transformer (aka input layer)
|
||||
|
||||
|
||||
@@ -352,12 +352,21 @@ proc evaluate*(position: Position, state: EvalState): Score {.inline.} =
|
||||
var weightOffset = 0
|
||||
for accumulator in [state.accumulators[position.sideToMove][state.current].data,
|
||||
state.accumulators[position.sideToMove.opposite()][state.current].data]:
|
||||
for i in 0..<HL_SIZE:
|
||||
let input = accumulator[i]
|
||||
let weight = network.l1.weight[outputBucket][i + weightOffset]
|
||||
let clipped = clamp(input, 0, QA).int32
|
||||
sum += int16(clipped * weight) * clipped
|
||||
weightOffset += HL_SIZE
|
||||
for i in 0..<HL_SIZE div (when PAIRWISE_NET: 2 else: 1):
|
||||
when PAIRWISE_NET:
|
||||
let input1 = accumulator[i]
|
||||
let input2 = accumulator[i + HL_SIZE div 2]
|
||||
let weight = network.l1.weight[outputBucket][i + weightOffset]
|
||||
let clipped1 = clamp(input1, 0, QA).int32
|
||||
let clipped2 = clamp(input2, 0, QA).int32
|
||||
sum += int16(clipped1 * weight) * clipped2
|
||||
else:
|
||||
let input = accumulator[i]
|
||||
let weight = network.l1.weight[outputBucket][i + weightOffset]
|
||||
let clipped = clamp(input, 0, QA).int32
|
||||
sum += int16(clipped * weight) * clipped
|
||||
|
||||
weightOffset += HL_SIZE div (when PAIRWISE_NET: 2 else: 1)
|
||||
# Profit! Now we just need to scale the result
|
||||
return ((sum div QA + network.l1.bias[outputBucket]) * EVAL_SCALE) div (QA * QB)
|
||||
else:
|
||||
@@ -368,17 +377,28 @@ proc evaluate*(position: Position, state: EvalState): Score {.inline.} =
|
||||
for accumulator in [state.accumulators[position.sideToMove][state.current].data,
|
||||
state.accumulators[position.sideToMove.opposite()][state.current].data]:
|
||||
var i = 0
|
||||
while i < HL_SIZE:
|
||||
var input = vecLoad(addr accumulator[i])
|
||||
var weight = vecLoad(addr network.l1.weight[outputBucket][i + weightOffset])
|
||||
var clipped = vecMin16(vecMax16(input, vecZero16()), vecSetOne16(QA))
|
||||
while i < HL_SIZE div (when PAIRWISE_NET: 2 else: 1):
|
||||
# Pairwise Multiplication: instead of doing clip(relu(n*n)) we do clip(relu(n1*n2)),
|
||||
# with n1!=n2: this dimensionality reduction technique helps speed up inference for
|
||||
# large L1s. More details: https://cosmo.tardis.ac/files/2024-08-17-multilayer.html
|
||||
# (see "Pairwise Multiplication")
|
||||
when PAIRWISE_NET:
|
||||
var input1 = vecLoad(addr accumulator[i])
|
||||
var input2 = vecLoad(addr accumulator[i + HL_SIZE div 2])
|
||||
var weight = vecLoad(addr network.l1.weight[outputBucket][i + weightOffset])
|
||||
var clipped1 = vecMin16(vecMax16(input1, vecZero16()), vecSetOne16(QA))
|
||||
var clipped2 = vecMin16(vecMax16(input2, vecZero16()), vecSetOne16(QA))
|
||||
var product = vecMadd16(vecMullo16(clipped1, weight), clipped2)
|
||||
else:
|
||||
var input = vecLoad(addr accumulator[i])
|
||||
var weight = vecLoad(addr network.l1.weight[outputBucket][i + weightOffset])
|
||||
var clipped = vecMin16(vecMax16(input, vecZero16()), vecSetOne16(QA))
|
||||
var product = vecMadd16(vecMullo16(clipped, weight), clipped)
|
||||
|
||||
var product = vecMadd16(vecMullo16(clipped, weight), clipped)
|
||||
sum = vecAdd32(sum, product)
|
||||
|
||||
i += CHUNK_SIZE
|
||||
|
||||
weightOffset += HL_SIZE
|
||||
weightOffset += HL_SIZE div (when PAIRWISE_NET: 2 else: 1)
|
||||
return (vecReduceAdd32(sum) div QA + network.l1.bias[outputBucket]) * EVAL_SCALE div (QA * QB)
|
||||
|
||||
|
||||
|
||||
@@ -42,6 +42,7 @@ const
|
||||
MERGED_KINGS* {.booldefine: "mergedKings".} = true
|
||||
MIRRORED* {.booldefine: "horizontalMirroring".} = true
|
||||
VERBATIM_NET* {.booldefine: "verbatimNet".} = true
|
||||
PAIRWISE_NET* {.booldefine: "pairwiseNet".} = false
|
||||
NET_ID* {.define: "netID".} = ""
|
||||
# LUT mapping king square to buckets (it's mirrored
|
||||
# because we do HM)
|
||||
@@ -92,7 +93,7 @@ type
|
||||
|
||||
Network* = object
|
||||
ft*: IntLayer[FT_SIZE * NUM_INPUT_BUCKETS, HL_SIZE]
|
||||
l1*: TransposedIntLayer[HL_SIZE * 2, NUM_OUTPUT_BUCKETS]
|
||||
l1*: TransposedIntLayer[HL_SIZE * (when PAIRWISE_NET: 1 else: 2), NUM_OUTPUT_BUCKETS]
|
||||
|
||||
|
||||
func toLittleEndian[T: int16 or uint16](x: T): T {.inline.} =
|
||||
@@ -111,7 +112,7 @@ proc dumpNet*(net: Network, path: string) =
|
||||
for i in 0..<HL_SIZE:
|
||||
file.writeData(addr net.ft.bias[i], 2)
|
||||
|
||||
for i in 0..<(HL_SIZE * 2):
|
||||
for i in 0..<HL_SIZE * (when PAIRWISE_NET: 1 else: 2):
|
||||
for j in 0..<NUM_OUTPUT_BUCKETS:
|
||||
file.writeData(addr net.l1.weight[j][i], 2)
|
||||
|
||||
@@ -128,7 +129,7 @@ proc loadNet*(stream: Stream): Network =
|
||||
result.ft.bias[i] = stream.readInt16().toLittleEndian()
|
||||
|
||||
for i in 0..<NUM_OUTPUT_BUCKETS:
|
||||
for j in 0..<(HL_SIZE * 2):
|
||||
for j in 0..<HL_SIZE * (when PAIRWISE_NET: 1 else: 2):
|
||||
# Note to self: bullet already transposes the weights for us
|
||||
# so we don't need to do it manually (this is done because it
|
||||
# allows for faster CPU inference). Just something to keep in
|
||||
|
||||
Reference in New Issue
Block a user