Skip to content

Commit 49d9f12

Browse files
authored
Make MemCache / Wrapper backends thread-safe (#2112)
* Atomic vector * Lockless MemCache * Lockless threadsafe wrapper. * Bugfix * Build fix. * Fixing elo-gaining bug.
1 parent dfb8fbe commit 49d9f12

File tree

3 files changed

+134
-36
lines changed

3 files changed

+134
-36
lines changed

src/neural/memcache.cc

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "neural/memcache.h"
2929

3030
#include "neural/cache.h"
31+
#include "utils/atomic_vector.h"
3132
#include "utils/smallarray.h"
3233

3334
namespace lczero {
@@ -79,11 +80,8 @@ class MemCacheComputation : public BackendComputation {
7980
MemCacheComputation(std::unique_ptr<BackendComputation> wrapped_computation,
8081
MemCache* memcache)
8182
: wrapped_computation_(std::move(wrapped_computation)),
82-
memcache_(memcache) {
83-
keys_.reserve(memcache_->max_batch_size_);
84-
values_.reserve(memcache_->max_batch_size_);
85-
result_ptrs_.reserve(memcache_->max_batch_size_);
86-
}
83+
memcache_(memcache),
84+
entries_(memcache->max_batch_size_) {}
8785

8886
private:
8987
size_t UsedBatchSize() const override {
@@ -99,10 +97,11 @@ class MemCacheComputation : public BackendComputation {
9997
return AddInputResult::FETCHED_IMMEDIATELY;
10098
}
10199
}
102-
keys_.push_back(hash);
103-
auto value = std::make_unique<CachedValue>();
104-
value->p.reset(new float[result.p.size()]);
105-
result_ptrs_.push_back(result);
100+
size_t entry_idx = entries_.emplace_back(
101+
Entry{hash, std::make_unique<CachedValue>(), result});
102+
auto& value = entries_[entry_idx].value;
103+
value->p.reset(pos.legal_moves.empty() ? nullptr
104+
: new float[pos.legal_moves.size()]);
106105
return wrapped_computation_->AddInput(
107106
pos, EvalResultPtr{&value->q,
108107
&value->d,
@@ -112,17 +111,21 @@ class MemCacheComputation : public BackendComputation {
112111

113112
virtual void ComputeBlocking() override {
114113
wrapped_computation_->ComputeBlocking();
115-
for (size_t i = 0; i < keys_.size(); ++i) {
116-
CachedValueToEvalResult(*values_[i], result_ptrs_[i]);
117-
memcache_->cache_.Insert(keys_[i], std::move(values_[i]));
114+
for (auto& entry : entries_) {
115+
CachedValueToEvalResult(*entry.value, entry.result_ptr);
116+
memcache_->cache_.Insert(entry.key, std::move(entry.value));
118117
}
119118
}
120119

120+
struct Entry {
121+
uint64_t key;
122+
std::unique_ptr<CachedValue> value;
123+
EvalResultPtr result_ptr;
124+
};
125+
121126
std::unique_ptr<BackendComputation> wrapped_computation_;
122-
std::vector<uint64_t> keys_;
123-
std::vector<std::unique_ptr<CachedValue>> values_;
124-
std::vector<EvalResultPtr> result_ptrs_;
125127
MemCache* memcache_;
128+
AtomicVector<Entry> entries_;
126129
};
127130

128131
std::unique_ptr<BackendComputation> MemCache::CreateComputation() {
@@ -138,8 +141,11 @@ std::optional<EvalResult> MemCache::GetCachedEvaluation(
138141
result.d = lock->d;
139142
result.q = lock->q;
140143
result.m = lock->m;
141-
std::copy(lock->p.get(), lock->p.get() + pos.legal_moves.size(),
142-
result.p.begin());
144+
if (lock->p) {
145+
result.p.reserve(pos.legal_moves.size());
146+
std::copy(lock->p.get(), lock->p.get() + pos.legal_moves.size(),
147+
std::back_inserter(result.p));
148+
}
143149
return result;
144150
}
145151

src/neural/wrapper.cc

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232

3333
#include "neural/encoder.h"
3434
#include "neural/shared_params.h"
35+
#include "utils/atomic_vector.h"
3536
#include "utils/fastmath.h"
3637

3738
namespace lczero {
@@ -77,30 +78,30 @@ class NetworkAsBackend : public Backend {
7778
class NetworkAsBackendComputation : public BackendComputation {
7879
public:
7980
NetworkAsBackendComputation(NetworkAsBackend* backend)
80-
: backend_(backend), computation_(backend_->network_->NewComputation()) {
81-
results_.reserve(backend_->attrs_.maximum_batch_size);
82-
moves_.reserve(backend_->attrs_.maximum_batch_size);
83-
transforms_.reserve(backend_->attrs_.maximum_batch_size);
84-
}
81+
: backend_(backend),
82+
computation_(backend_->network_->NewComputation()),
83+
entries_(backend_->attrs_.maximum_batch_size) {}
8584

86-
size_t UsedBatchSize() const override { return computation_->GetBatchSize(); }
85+
size_t UsedBatchSize() const override { return entries_.size(); }
8786

8887
AddInputResult AddInput(const EvalPosition& pos,
8988
EvalResultPtr result) override {
9089
int transform;
91-
computation_->AddInput(EncodePositionForNN(backend_->input_format_, pos.pos,
92-
8, backend_->fill_empty_history_,
93-
&transform));
94-
results_.push_back(result);
95-
moves_.emplace_back(pos.legal_moves.begin(), pos.legal_moves.end());
96-
transforms_.push_back(transform);
90+
const size_t idx = entries_.emplace_back(Entry{
91+
.input = EncodePositionForNN(backend_->input_format_, pos.pos, 8,
92+
backend_->fill_empty_history_, &transform),
93+
.legal_moves = MoveList(pos.legal_moves.begin(), pos.legal_moves.end()),
94+
.result = result,
95+
.transform = 0});
96+
entries_[idx].transform = transform;
9797
return ENQUEUED_FOR_EVAL;
9898
}
9999

100100
void ComputeBlocking() override {
101+
for (auto& entry : entries_) computation_->AddInput(std::move(entry.input));
101102
computation_->ComputeBlocking();
102-
for (size_t i = 0; i < results_.size(); ++i) {
103-
const EvalResultPtr& result = results_[i];
103+
for (size_t i = 0; i < entries_.size(); ++i) {
104+
const EvalResultPtr& result = entries_[i].result;
104105
if (result.q) *result.q = computation_->GetQVal(i);
105106
if (result.d) *result.d = computation_->GetDVal(i);
106107
if (result.m) *result.m = computation_->GetMVal(i);
@@ -110,8 +111,8 @@ class NetworkAsBackendComputation : public BackendComputation {
110111

111112
void SoftmaxPolicy(std::span<float> dst,
112113
const NetworkComputation* computation, int idx) {
113-
const std::vector<Move>& moves = moves_[idx];
114-
const int transform = transforms_[idx];
114+
const std::vector<Move>& moves = entries_[idx].legal_moves;
115+
const int transform = entries_[idx].transform;
115116
// Copy the values to the destination array and compute the maximum.
116117
const float max_p = std::accumulate(
117118
moves.begin(), moves.end(), -std::numeric_limits<float>::infinity(),
@@ -131,11 +132,16 @@ class NetworkAsBackendComputation : public BackendComputation {
131132
}
132133

133134
private:
135+
struct Entry {
136+
InputPlanes input;
137+
MoveList legal_moves;
138+
EvalResultPtr result;
139+
int transform;
140+
};
141+
134142
NetworkAsBackend* backend_;
135143
std::unique_ptr<NetworkComputation> computation_;
136-
std::vector<std::vector<Move>> moves_;
137-
std::vector<EvalResultPtr> results_;
138-
std::vector<int> transforms_;
144+
AtomicVector<Entry> entries_;
139145
};
140146

141147
std::unique_ptr<BackendComputation> NetworkAsBackend::CreateComputation() {

src/utils/atomic_vector.h

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/*
  This file is part of Leela Chess Zero.
  Copyright (C) 2024 The LCZero Authors

  Leela Chess is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  Leela Chess is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.

  Additional permission under GNU GPL version 3 section 7

  If you modify this Program, or any covered work, by linking or
  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
  modified version of those libraries), containing parts covered by the
  terms of the respective license agreement, the licensors of this
  Program grant you additional permission to convey the resulting work.
*/

#pragma once

#include <atomic>
#include <cassert>
#include <cstddef>
#include <memory>
#include <new>
#include <type_traits>
#include <utility>

namespace lczero {

// A fixed-capacity vector whose emplace_back() is thread-safe and lock-free:
// each writer claims a distinct slot with an atomic fetch_add and constructs
// the element in place. clear() and destruction are NOT thread safe, and a
// reader must not access a slot concurrently with the writer constructing it
// (callers are expected to separate the fill phase from the read phase).
template <typename T>
class AtomicVector {
 public:
  // Allocates uninitialized, correctly aligned storage for `capacity`
  // elements. No T objects are constructed until emplace_back().
  explicit AtomicVector(size_t capacity)
      : capacity_(capacity),
        size_(0),
        data_(std::make_unique<Storage[]>(capacity)) {}

  // The vector owns raw storage; an implicit copy would alias (and
  // double-destroy) it, so copying is disabled. (The unique_ptr member
  // already suppresses the defaults; spelling it out documents intent.)
  AtomicVector(const AtomicVector&) = delete;
  AtomicVector& operator=(const AtomicVector&) = delete;

  // Destroys all constructed elements; the buffer itself is released by
  // the unique_ptr member.
  ~AtomicVector() { clear(); }

  // Thread safe. Constructs T from `args` in the next free slot and returns
  // the index of the inserted element. Exceeding capacity_ is a programming
  // error: it is only caught by the assert (no growth, no bounds check in
  // release builds).
  template <typename... Args>
  size_t emplace_back(Args&&... args) {
    const size_t idx = size_.fetch_add(1, std::memory_order_relaxed);
    assert(idx < capacity_);
    new (&data_[idx]) T(std::forward<Args>(args)...);
    return idx;
  }

  T& operator[](size_t i) {
    assert(i < size());
    return *reinterpret_cast<T*>(&data_[i]);
  }

  const T& operator[](size_t i) const {
    assert(i < size());
    return *reinterpret_cast<const T*>(&data_[i]);
  }

  size_t size() const { return size_.load(std::memory_order_relaxed); }
  size_t capacity() const { return capacity_; }

  // Not thread safe. Destroys elements in reverse order of construction and
  // resets the size to zero; capacity (and the allocation) is kept.
  void clear() {
    for (size_t i = size_.load(std::memory_order_relaxed); i-- > 0;) {
      reinterpret_cast<T*>(&data_[i])->~T();
    }
    size_.store(0, std::memory_order_relaxed);
  }

  // Iteration covers only the constructed prefix [0, size()).
  T* begin() { return reinterpret_cast<T*>(data_.get()); }
  T* end() { return begin() + size(); }
  const T* begin() const { return reinterpret_cast<const T*>(data_.get()); }
  const T* end() const { return begin() + size(); }

 private:
  // One slot of uninitialized storage with the size/alignment of T.
  using Storage = typename std::aligned_storage<sizeof(T), alignof(T)>::type;

  const size_t capacity_;
  std::atomic<size_t> size_;  // Number of slots claimed so far.
  std::unique_ptr<Storage[]> data_;
};

}  // namespace lczero

0 commit comments

Comments
 (0)