From 5b3c244206a8652fe0d60c6966d28fb665356b68 Mon Sep 17 00:00:00 2001 From: Mattia Giambirtone Date: Tue, 14 May 2024 11:20:03 +0200 Subject: [PATCH] Fix issues with joinThread and fix nps scaling issues by not using atomic node counters --- Chess/nimfish/nimfishpkg/search.nim | 157 ++++++++++++++++------------ Chess/nimfish/nimfishpkg/uci.nim | 4 +- 2 files changed, 94 insertions(+), 67 deletions(-) diff --git a/Chess/nimfish/nimfishpkg/search.nim b/Chess/nimfish/nimfishpkg/search.nim index 961dafe..8022a4c 100644 --- a/Chess/nimfish/nimfishpkg/search.nim +++ b/Chess/nimfish/nimfishpkg/search.nim @@ -115,8 +115,8 @@ type searchStart: MonoTime hardLimit: MonoTime softLimit: MonoTime - nodeCount: ptr Atomic[uint64] - maxNodes: ptr Atomic[uint64] + nodeCount: uint64 + maxNodes: uint64 searchMoves: seq[Move] transpositionTable: ptr TTable history: ptr HistoryTable @@ -125,8 +125,14 @@ type # We keep one extra entry so we don't need any special casing # inside the search function when constructing pv lines pvMoves: array[MAX_DEPTH + 1, array[MAX_DEPTH + 1, Move]] + # The highest depth we explored to, including extensions selectiveDepth: int + # Are we the main worker? isMainWorker: bool + # We keep track of all the worker + # threads' respective search states + # to collct statistics efficiently + children: seq[ptr SearchManager] proc newSearchManager*(position: Position, positions: seq[Position], transpositions: ptr TTable, @@ -138,19 +144,15 @@ proc newSearchManager*(position: Position, positions: seq[Position], transpositi searchFlag: ptr Atomic[bool] stopFlag: ptr Atomic[bool] ponderFlag: ptr Atomic[bool] - nodeCounter: ptr Atomic[uint64] - maxNodes: ptr Atomic[uint64] if mainWorker: searchFlag = create(Atomic[bool], sizeof(Atomic[bool])) stopFlag = create(Atomic[bool], sizeof(Atomic[bool])) ponderFlag = create(Atomic[bool], sizeof(Atomic[bool])) - nodeCounter = create(Atomic[uint64], sizeof(Atomic[uint64])) - maxNodes = create(Atomic[uint64], sizeof(Atomic[uint64])) # If we're not the main worker, we expect the shared atomic metadata to be filled in by the # main worker result = SearchManager(board: newChessboard(), transpositionTable: transpositions, stop: stopFlag, - searching: searchFlag, pondering: ponderFlag, history: history, nodeCount: nodeCounter, - maxNodes: maxNodes, killers: killers, isMainWorker: mainWorker) + searching: searchFlag, pondering: ponderFlag, history: history, + killers: killers, isMainWorker: mainWorker) result.board.position = position result.board.positions = positions for i in 0..MAX_DEPTH: @@ -168,8 +170,6 @@ proc `destroy=`*(self: var SearchManager) = dealloc(self.stop) dealloc(self.searching) dealloc(self.pondering) - dealloc(self.maxNodes) - dealloc(self.nodeCount) else: # This state is thread-local and is fine to # destroy *unless* we're the main worker. This @@ -307,16 +307,23 @@ proc stopPondering*(self: var SearchManager) = proc log(self: var SearchManager, depth: int) = if not self.isMainWorker: - # We restrict logging to the main worker. Since - # all important state is shared across threads using - # atomics, the statistics will still be correct (maybe - # out of date, but correct) + # We restrict logging to the main worker to reduce + # noise return + # Using an atomic for such frequently updated counters kills + # performance and cripples nps scaling, so instead we let each + # thread have its own local counters and then aggregate the results + # here + var + nodeCount = self.nodeCount + selDepth = self.selectiveDepth + for child in self.children: + nodeCount += child.nodeCount + selDepth = max(selDepth, child.selectiveDepth) let - nodeCount = self.nodeCount[].load() elapsedMsec = self.elapsedTime().uint64 nps = 1000 * (nodeCount div max(elapsedMsec, 1)) - var logMsg = &"info depth {depth} seldepth {self.selectiveDepth} time {elapsedMsec} nodes {nodeCount} nps {nps}" + var logMsg = &"info depth {depth} seldepth {selDepth} time {elapsedMsec} nodes {nodeCount} nps {nps}" logMsg &= &" hashfull {self.transpositionTable[].getFillEstimate()}" if abs(self.bestRootScore) >= mateScore() - MAX_DEPTH: if self.bestRootScore > 0: @@ -343,10 +350,7 @@ proc shouldStop(self: var SearchManager): bool = if self.timedOut() and not self.isPondering(): # We ran out of time! return true - let - nodeCount = self.nodeCount[].load() - maxNodes = self.maxNodes[].load() - if maxNodes > 0 and nodeCount >= maxNodes: + if self.maxNodes > 0 and self.nodeCount >= self.maxNodes: # Ran out of nodes return true @@ -407,7 +411,7 @@ proc qsearch(self: var SearchManager, ply: int, alpha, beta: Score): Score = if self.board.position.see(move) < 0: continue self.board.doMove(move) - self.nodeCount[].atomicInc() + inc(self.nodeCount) let score = -self.qsearch(ply + 1, -beta, -alpha) self.board.unmakeMove() bestScore = max(score, bestScore) @@ -565,7 +569,7 @@ proc search(self: var SearchManager, depth, ply: int, alpha, beta: Score, isPV: let extension = self.getSearchExtension(move) reduction = self.getReduction(move, depth, ply, i, isPV) - self.nodeCount[].atomicInc() + inc(self.nodeCount) # Find the best move for us (worst move # for our opponent, hence the negative sign) var score: Score @@ -695,25 +699,9 @@ proc aspirationWindow(self: var SearchManager, score: Score, depth: int): Score delta = highestEval() -proc findBestLine*(self: var SearchManager, timeRemaining, increment: int64, maxDepth: int, maxNodes: uint64, searchMoves: seq[Move], +proc findBestLine(self: var SearchManager, timeRemaining, increment: int64, maxDepth: int, maxNodes: uint64, searchMoves: seq[Move], timePerMove=false, ponder=false): seq[Move] = - ## Finds the principal variation in the current position - ## and returns it, limiting search time according the - ## the remaining time and increment values provided (in - ## milliseconds) and only up to maxDepth ply (if maxDepth - ## is -1, a reasonable limit is picked). If maxNodes is supplied - ## and is nonzero, search will stop once it has analyzed maxNodes - ## nodes. If searchMoves is provided and is not empty, search will - ## be restricted to the moves in the list. Note that regardless of - ## any time limitations or explicit cancellations, the search will - ## not stop until it has at least cleared depth one. Search depth - ## is always constrained to at most MAX_DEPTH ply from the root. If - ## timePerMove is true, the increment is assumed to be zero and the - ## remaining time is considered the time limit for the entire search - ## (note that soft time management is disabled in that case). If ponder - ## is true, the search is performed in pondering mode (i.e. no explicit - ## time limit) and can be switched to a regular search by calling the - ## stopPondering() procedure + ## Internal, singl-threaded search for the principal variation # Apparently negative remaining time is a thing. Welp self.maxSearchTime = if not timePerMove: max(1, (timeRemaining div 10) + ((increment div 3) * 2)) else: timeRemaining @@ -721,8 +709,8 @@ proc findBestLine*(self: var SearchManager, timeRemaining, increment: int64, max result = @[] var pv: array[256, Move] if self.isMainWorker: - self.maxNodes[].store(maxNodes) self.pondering[].store(ponder) + self.maxNodes = maxNodes self.searchMoves = searchMoves self.searchStart = getMonoTime() self.hardLimit = self.searchStart + initDuration(milliseconds=self.maxSearchTime) @@ -730,8 +718,6 @@ proc findBestLine*(self: var SearchManager, timeRemaining, increment: int64, max var maxDepth = maxDepth if maxDepth == -1: maxDepth = 60 - if self.isMainWorker: - self.searching[].store(true) # Iterative deepening loop var score = Score(0) for depth in 1..min(MAX_DEPTH, maxDepth): @@ -750,53 +736,94 @@ proc findBestLine*(self: var SearchManager, timeRemaining, increment: int64, max # anyway if getMonoTime() >= self.softLimit and not self.isPondering(): break - if self.isMainWorker: - self.searching[].store(false) - self.stop[].store(false) for move in pv: if move == nullMove(): break result.add(move) -proc workerFunc(args: tuple[self: SearchManager, timeRemaining, increment: int64, maxDepth: int, maxNodes: uint64, searchMoves: seq[Move], +proc workerFunc(args: tuple[self: ptr SearchManager, timeRemaining, increment: int64, maxDepth: int, maxNodes: uint64, searchMoves: seq[Move], timePerMove, ponder: bool]) {.thread.} = ## Worker that calls findBestLine in a new thread # Gotta lie to nim's thread analyzer lest it shout at us that we're not # GC safe! {.cast(gcsafe).}: - var self = args.self - discard self.findBestLine(args.timeRemaining, args.increment, args.maxDepth, args.maxNodes, args.searchMoves, args.timePerMove, args.ponder) + discard args.self[].findBestLine(args.timeRemaining, args.increment, args.maxDepth, args.maxNodes, args.searchMoves, args.timePerMove, args.ponder) # Creating threads is expensive, so there's no need to make new ones for every call # to our parallel search. Also, nim leaks thread vars: this keeps the resource leaks # to a minimum -var workers: seq[ref Thread[tuple[self: SearchManager, timeRemaining, increment: int64, maxDepth: int, maxNodes: uint64, searchMoves: seq[Move], +var workers: seq[ref Thread[tuple[self: ptr SearchManager, timeRemaining, increment: int64, maxDepth: int, maxNodes: uint64, searchMoves: seq[Move], timePerMove, ponder: bool]]] = @[] -proc parallelSearch*(self: var SearchManager, timeRemaining, increment: int64, maxDepth: int, maxNodes: uint64, searchMoves: seq[Move], +proc search*(self: var SearchManager, timeRemaining, increment: int64, maxDepth: int, maxNodes: uint64, searchMoves: seq[Move], timePerMove=false, ponder=false, numWorkers: int): seq[Move] = - ## Parallel version of findBestLine(): the search is performed - ## using the provided number of worker threads using a shared - ## transposition table. + ## Finds the principal variation in the current position + ## and returns it, limiting search time according the + ## the remaining time and increment values provided (in + ## milliseconds) and only up to maxDepth ply (if maxDepth + ## is -1, a reasonable limit is picked). If maxNodes is supplied + ## and is nonzero, search will stop once it has analyzed maxNodes + ## nodes. If searchMoves is provided and is not empty, search will + ## be restricted to the moves in the list. Note that regardless of + ## any time limitations or explicit cancellations, the search will + ## not stop until it has at least cleared depth one. Search depth + ## is always constrained to at most MAX_DEPTH ply from the root. If + ## timePerMove is true, the increment is assumed to be zero and the + ## remaining time is considered the time limit for the entire search + ## (note that soft time management is disabled in that case). If ponder + ## is true, the search is performed in pondering mode (i.e. no explicit + ## time limit) and can be switched to a regular search by calling the + ## stopPondering() procedure. If numWorkers is > 1, the search is performed + ## in parallel using numWorkers threads while workers.len() + 1 < numWorkers: # We create n - 1 workers because we'll also be searching # ourselves. We use the lazy SMP approach, so we'll exploit the # other threads just to fill up our transposition table and # not much else (for now) - workers.add(new Thread[tuple[self: SearchManager, timeRemaining, increment: int64, maxDepth: int, maxNodes: uint64, searchMoves: seq[Move], + workers.add(new Thread[tuple[self: ptr SearchManager, timeRemaining, increment: int64, maxDepth: int, maxNodes: uint64, searchMoves: seq[Move], timePerMove, ponder: bool]]) + self.searching[].store(true) for i in 0..