20x speedups

OpenBLAS, dataloader rewriting, complete overhaul of the threading system

Guido. It's time.
This commit is contained in:
Quinn
2025-10-27 22:52:46 -05:00
parent bf75200d4c
commit ebc4a53753
14 changed files with 166 additions and 218 deletions

5
.gitignore vendored
View File

@@ -7,4 +7,7 @@
# Project build
*.o
*.d
Ember*
Ember*
# Profiling
perf.*

View File

@@ -11,7 +11,7 @@ endif
# Compiler and flags
CXX := clang++
CXXFLAGS := -O3 -std=c++20 -flto -fopenmp -funroll-loops -DNDEBUG
CXXFLAGS := -O3 -std=c++20 -flto -funroll-loops -DNDEBUG
ifeq ($(OS),Windows_NT)
ARCH := $(PROCESSOR_ARCHITECTURE)
@@ -22,7 +22,7 @@ endif
IS_ARM := $(filter ARM arm64 aarch64 arm%,$(ARCH))
ifeq ($(IS_ARM),)
LINKFLAGS := -fuse-ld=lld -pthread
LINKFLAGS := -fuse-ld=lld -pthread -lopenblas -fopenmp
ARCHFLAGS := -march=native
else
LINKFLAGS :=
@@ -59,12 +59,12 @@ endif
# Debug build
.PHONY: debug
debug: CXXFLAGS = -O3 -std=c++20 -flto -fopenmp -fsanitize=address,undefined -fno-omit-frame-pointer -D_GLIBCXX_DEBUG -D_GLIBCXX_DEBUG_PEDANTIC -Wall -Wextra
debug: CXXFLAGS = -O3 -std=c++20 -flto -fsanitize=address,undefined -fno-omit-frame-pointer -D_GLIBCXX_DEBUG -D_GLIBCXX_DEBUG_PEDANTIC -Wall -Wextra
debug: all
# Profile build
.PHONY: profile
profile: CXXFLAGS = -O3 -std=c++20 -flto -fopenmp -funroll-loops -ggdb -fno-omit-frame-pointer -DNDEBUG
profile: CXXFLAGS = -O3 -std=c++20 -flto -funroll-loops -ggdb -fno-omit-frame-pointer -DNDEBUG
profile: all
# Force rebuild

View File

@@ -3,7 +3,7 @@
int main() {
Ember::Network net(
Ember::layers::Input(28 * 28),
Ember::layers::Linear(64),
Ember::layers::Linear(512),
Ember::activations::ReLU(),
Ember::layers::Linear(10),
Ember::activations::Softmax()
@@ -22,5 +22,5 @@ int main() {
Ember::callbacks::AutosaveBest("../net.bin")
);
learner.learn(0.01, 20, 1);
learner.learn(0.01, 20, 2);
}

View File

@@ -6,6 +6,7 @@
#include "../external/stb_image.h"
#include <filesystem>
#include <omp.h>
std::vector<float> loadGreyscaleImage(const std::string& path, const Ember::usize w, const Ember::usize h) {
int width, height, channels;
@@ -57,10 +58,12 @@ namespace Ember::dataloaders {
samplesPerType.resize(types.size());
allImages.resize(types.size());
numSamples = 0;
for (usize typeIdx = 0; typeIdx < types.size(); typeIdx++) {
for (const auto &entry: std::filesystem::directory_iterator(types[typeIdx])) {
if (entry.is_regular_file()) {
allImages[typeIdx].push_back(entry.path().string());
numSamples++;
samplesPerType[typeIdx]++;
}
@@ -77,37 +80,34 @@ namespace Ember::dataloaders {
if (types.empty())
exitWithMsg(fmt::format("No types found in '{}'", dataDir), 1);
std::mutex dataMut;
std::vector<std::vector<internal::DataPoint>> localData(threads);
#pragma omp parallel for num_threads(threads)
for (usize i = 0; i < batchSize; i++) {
std::mt19937 rng{ std::random_device{}() + omp_get_thread_num()};
auto& threadData = localData[omp_get_thread_num()];
threadData.reserve(batchSize / threads + 1);
// Randomly pick a type
std::uniform_int_distribution<usize> typeDist(0, types.size() - 1);
const usize typeIdx = typeDist(rng);
const std::string& typeDir = types[typeIdx];
// Gather image files in that directory
std::vector<std::filesystem::path> imgs;
for (const auto& entry : std::filesystem::directory_iterator(typeDir)) {
if (entry.is_regular_file())
imgs.push_back(entry.path());
}
if (imgs.empty())
exitWithMsg(fmt::format("No images found in '{}'", typeDir), 1);
// Randomly pick an image
std::uniform_int_distribution<usize> imgDist(0, imgs.size() * trainSplit - 1);
std::uniform_int_distribution<usize> imgDist(0, samplesPerType[typeIdx] * trainSplit - 1);
const usize imgIdx = imgDist(rng);
std::vector<float> input = loadGreyscaleImage(imgs[imgIdx].string(), width, height);
std::vector<float> input = loadGreyscaleImage(allImages[typeIdx][imgIdx], width, height);
std::vector<float> target(types.size(), 0);
target[typeIdx] = 1;
dataMut.lock();
data[batchIdx].emplace_back(input, target);
dataMut.unlock();
threadData.emplace_back(std::move(input), std::move(target));
}
for (auto& threadData : localData)
data[batchIdx].insert(data[batchIdx].end(),
std::make_move_iterator(threadData.begin()),
std::make_move_iterator(threadData.end()));
}
void ImageDataLoader::loadTestSet() {
@@ -129,7 +129,7 @@ namespace Ember::dataloaders {
std::vector<float> target(types.size());
target[typeIdx] = 1;
data[currBatch].emplace_back(input, target);
data[currBatch].emplace_back(std::move(input), std::move(target));
}
}
}

View File

@@ -13,7 +13,9 @@ namespace Ember {
std::vector<float> input;
std::vector<float> target;
DataPoint(const std::vector<float>& input, const std::vector<float>& target) : input(input), target(target) {}
DataPoint() = default;
DataPoint(std::vector<float>&& input, std::vector<float>&& target)
: input(std::move(input)), target(std::move(target)) {}
};
struct DataLoader {
@@ -86,7 +88,7 @@ namespace Ember {
std::string dataDir;
std::vector<std::string> types;
std::vector<u64> samplesPerType;
std::mt19937 rng{ std::random_device{}() };
std::vector<std::vector<std::string>> allImages;
usize width;
usize height;

View File

@@ -3,6 +3,7 @@
#include "tensor.h"
#include <utility>
#include <cblas.h>
#include <string>
#include <thread>
@@ -33,30 +34,20 @@ namespace Ember {
};
struct ComputeLayer : Layer {
Tensor<1> weights; // Indexed [previous][current], flattened to prev * size + curr
BlasMatrix weights; // size (output) rows and previousSize (input) cols, row-major
Tensor<1> biases;
usize threadCount;
ComputeLayer() = delete;
ComputeLayer(const usize size) : Layer(size) {
threadCount = std::max<usize>(1, std::thread::hardware_concurrency());
threadCount = std::min<usize>(threadCount, size / 2);
explicit ComputeLayer(const usize size) : Layer(size) {
this->biases.resize(size);
}
void setThreadCount(const usize threadCount) {
this->threadCount = std::max<usize>(1, threadCount);
this->threadCount = std::min<usize>(threadCount, size / 2);
}
void init(const usize previousSize) {
this->weights.resize(previousSize * size);
this->weights.resize(size, previousSize);
}
virtual std::tuple<Tensor<1>, Tensor<1>, Tensor<1>> backward(const Layer& previous, const Tensor<1>& gradOutput) const = 0;
virtual std::tuple<Tensor<1>, BlasMatrix, Tensor<1>> backward(const Layer& previous, const Tensor<1>& gradOutput) const = 0;
};
struct ActivationLayer : Layer {
@@ -89,51 +80,44 @@ namespace Ember {
const usize inputSize = previous.size;
const usize outputSize = size;
std::vector<std::thread> threads;
// Copy biases to output first
std::memcpy(values.ptr(), biases.ptr(), outputSize * sizeof(float));
threadCount = 1;
const auto worker = [&](const usize threadId) {
// Divide the range across threads
const usize start = (outputSize * threadId) / threadCount;
const usize end = std::min((outputSize * (threadId + 1)) / threadCount, outputSize);
for (usize curr = start; curr < end; curr++) {
float sum = biases[curr];
for (usize prev = 0; prev < inputSize; prev++)
sum += previous.values[prev] * weights[prev * size + curr];
values[curr] = sum;
}
};
// Launch worker threads
for (usize t = 1; t < threadCount; t++)
threads.emplace_back(worker, t);
// Run thread 0 on the main thread
worker(0);
// Join all threads
for (std::thread& t : threads)
if (t.joinable())
t.join();
// Perform y = W * x + y (in-place); W is not transposed (CblasNoTrans)
// dimensions:
// W: outputSize x inputSize
// x: inputSize
// y: outputSize
cblas_sgemv(
CblasRowMajor, // Memory layout
CblasNoTrans, // Don't transpose W to keep outputSize x inputSize
outputSize, // rows of W
inputSize, // cols of W
1.0f, // alpha
weights.data.data(), // W data
inputSize, // lda (leading dimension, number of cols)
previous.values.ptr(), // x vector
1, // incx
1.0f, // beta (since y already holds biases)
values.ptr(), // y vector (output)
1 // incy
);
}
std::tuple<Tensor<1>, Tensor<1>, Tensor<1>> backward(const Layer& previous, const Tensor<1>& gradOutput) const override {
std::tuple<Tensor<1>, BlasMatrix, Tensor<1>> backward(const Layer& previous, const Tensor<1>& gradOutput) const override {
const usize inputSize = previous.size;
const usize outputSize = size;
Tensor<1> gradInput(inputSize, 0.0f);
Tensor<1> weightGrad(weights.size(), 0.0f);
BlasMatrix weightGrad(weights.rows, weights.cols);
Tensor<1> biasGrad(size, 0.0f);
// Compute gradients
for (usize curr = 0; curr < outputSize; curr++) {
biasGrad[curr] = gradOutput[curr];
for (usize prev = 0; prev < inputSize; prev++) {
const usize wIndex = prev * outputSize + curr;
gradInput[prev] += weights[wIndex] * gradOutput[curr];
weightGrad[wIndex] += previous.values[prev] * gradOutput[curr];
gradInput[prev] += weights(curr, prev) * gradOutput[curr];
weightGrad(curr, prev) += previous.values[prev] * gradOutput[curr];
}
}
@@ -145,7 +129,7 @@ namespace Ember {
}
std::string str() const override {
return fmt::format("Linear - {} input features and {} output features", weights.size() / size, size);
return fmt::format("Linear - {} input features and {} output features", weights.cols, size);
}
};
}

View File

@@ -1,6 +1,5 @@
#include "learner.h"
#include "progbar.h"
#include "omp.h"
#include <algorithm>
@@ -26,23 +25,28 @@ namespace Ember {
return gradients;
}
void Learner::applyGradients(const usize batchSize, const std::vector<Tensor<1>>& weightGradAccum, const std::vector<Tensor<1>>& biasGradAccum) {
void Learner::applyGradients(const usize batchSize, const std::vector<BlasMatrix>& weightGradAccum, const std::vector<Tensor<1>>& biasGradAccum) {
const float batchScalar = 1.0f / batchSize;
// Apply gradients to weights and biases
// Apply gradients to the optimizer
for (usize l = net.layers.size() - 1; l > 0; l--) {
if (const auto& currLayer = dynamic_cast<internal::ComputeLayer*>(net.layers[l].get())) {
assert(optimizer.weightGradients[l].size() == currLayer->weights.size());
if (const auto* currLayer = dynamic_cast<internal::ComputeLayer*>(net.layers[l].get())) {
assert(optimizer.weightGradients[l].data.size() == currLayer->weights.data.size());
assert(optimizer.biasGradients[l].size() == currLayer->biases.size());
for (usize i = 0; i < optimizer.weightGradients[l].size(); i++)
optimizer.weightGradients[l][i] += weightGradAccum[l][i] * batchScalar;
for (usize i = 0; i < currLayer->size; i++)
optimizer.biasGradients[l][i] += biasGradAccum[l][i] * batchScalar;
// Weights
cblas_saxpy(optimizer.weightGradients[l].data.size(), batchScalar,
weightGradAccum[l].ptr(), 1,
optimizer.weightGradients[l].ptr(), 1);
// Biases
cblas_saxpy(optimizer.biasGradients[l].size(), batchScalar,
biasGradAccum[l].ptr(), 1,
optimizer.biasGradients[l].ptr(), 1);
}
}
}
void Learner::learn(const float initialLr, const usize epochs, usize threads) {
void Learner::learn(const float initialLr, const usize epochs, const usize threads) {
// Initialize the learner's callback shared states
lr = initialLr;
testLoss = std::numeric_limits<float>::infinity();
@@ -56,17 +60,12 @@ namespace Ember {
std::pair<float, float> test{};
// Accumulators
std::vector<std::vector<Tensor<1>>> threadWeightGradAccum(threads);
std::vector<std::vector<Tensor<1>>> threadBiasGradAccum(threads);
std::vector<Tensor<1>> weightGradAccum(net.layers.size());
std::vector<BlasMatrix> weightGradAccum(net.layers.size());
std::vector<Tensor<1>> biasGradAccum(net.layers.size());
const u64 batchSize = dataLoader.batchSize;
const u64 batchesPerEpoch = dataLoader.numSamples / batchSize;
std::vector<Network> networks;
double trainLoss{};
ProgressBar progressBar{};
@@ -79,7 +78,7 @@ namespace Ember {
const usize testSize = dataLoader.testSetSize();
while (dataLoader.hasNext()) {
internal::DataPoint data = dataLoader.next();
net.forward(data.input);
net.forward(data.input, threads);
loss += lossFunc->forward(net.layers.back()->values, data.target);
usize guess = 0;
usize goal = 0;
@@ -94,6 +93,10 @@ namespace Ember {
return std::pair<float, float>{ loss / (testSize ? testSize : 1), numCorrect / static_cast<float>(testSize ? testSize : 1) };
};
// Store the compute layers so RTTI isn't done on-the-fly
std::vector<internal::ComputeLayer*> computeLayers;
std::vector<usize> computeLayerIndexes;
Stopwatch<std::chrono::milliseconds> stopwatch;
for (const auto& c : callbacks)
@@ -108,48 +111,25 @@ namespace Ember {
goto afterFit;
}
// Get number of threads to use
if (threads == 0)
threads = std::thread::hardware_concurrency();
if (threads == 0) {
std::cerr << "Failed to detect number of threads" << std::endl;
threads = 1;
}
fmt::println("Using {} threads", threads);
fmt::println("Training for {} batches with {} batches per epoch", batchesPerEpoch * epochs, batchesPerEpoch);
std::cout << "Epoch Train loss Test loss Test accuracy Time\n\n" << std::endl;
for (auto& accum : threadWeightGradAccum)
accum.resize(net.layers.size());
for (auto& accum : threadBiasGradAccum)
accum.resize(net.layers.size());
for (usize i = 1; i < net.layers.size(); i++) {
if (const auto* compLayer = dynamic_cast<internal::ComputeLayer*>(net.layers[i].get())) {
weightGradAccum[i].resize(compLayer->weights.size());
if (auto* compLayer = dynamic_cast<internal::ComputeLayer*>(net.layers[i].get())) {
weightGradAccum[i].resize(compLayer->weights.rows, compLayer->weights.cols);
biasGradAccum[i].resize(compLayer->biases.size());
for (auto& accum : threadWeightGradAccum)
accum[i].resize(compLayer->weights.size());
for (auto& accum : threadBiasGradAccum)
accum[i].resize(compLayer->biases.size());
computeLayers.push_back(compLayer);
computeLayerIndexes.push_back(i);
}
}
networks.reserve(threads);
for (usize t = 0; t < threads; t++)
networks.push_back(net);
// Preload first batch
dataLoader.asyncPreloadBatch();
stopwatch.reset();
// Set the network to only use 1 thread on the forward pass
net.setMode(NetworkMode::TRAIN);
// Main loop
for (epoch = 0; epoch < epochs; epoch++) {
try {
@@ -186,15 +166,6 @@ namespace Ember {
t.fill(0);
for (auto& t : biasGradAccum)
t.fill(0);
for (auto& accum : threadWeightGradAccum)
for (auto& t : accum)
t.fill(0);
for (auto& accum : threadBiasGradAccum)
for (auto& t : accum)
t.fill(0);
for (auto& n : networks)
n = net;
dataLoader.waitForBatch();
dataLoader.swapBuffers();
@@ -202,63 +173,29 @@ namespace Ember {
// Instantly start loading next batch
dataLoader.asyncPreloadBatch();
#pragma omp parallel for num_threads(threads) reduction(+:trainLoss)
for (u64 sample = 0; sample < batchSize; sample++) {
const usize tID = omp_get_thread_num();
Network& thisNet = networks[tID];
const internal::DataPoint& data = dataLoader.batchData(sample);
thisNet.forward(data.input);
net.forward(data.input, threads);
// Accumulate training loss
trainLoss += lossFunc->forward(thisNet.output(), data.target);
trainLoss += lossFunc->forward(net.output(), data.target);
const auto gradients = backward(thisNet, data.target);
const auto gradients = backward(net, data.target);
// Accumulate gradients
for (usize l = 1; l < thisNet.layers.size(); l++) {
const auto& prevLayer = thisNet.layers[l - 1];
if (const auto* compLayer = dynamic_cast<internal::ComputeLayer*>(thisNet.layers[l].get())) {
for (usize i = 0; i < compLayer->size; i++) {
for (usize j = 0; j < prevLayer->size; j++) {
const usize idx = j * compLayer->size + i;
assert(l < weightGradAccum.size());
assert(idx < weightGradAccum[l].size());
assert(idx < gradients[l].weightGrad.size());
for (usize i = 0; i < computeLayers.size(); i++) {
const usize l = computeLayerIndexes[i];
const auto* layer = computeLayers[i];
cblas_saxpy(layer->weights.data.size(), 1.0f,
gradients[l].weightGrad.ptr(), 1,
weightGradAccum[l].ptr(), 1);
threadWeightGradAccum[tID][l][idx] += gradients[l].weightGrad[idx];
}
assert(l < biasGradAccum.size());
assert(i < biasGradAccum[l].size());
assert(i < gradients[l].biasGrad.size());
threadBiasGradAccum[tID][l][i] += gradients[l].biasGrad[i];
}
}
cblas_saxpy(layer->biases.size(), 1.0f,
gradients[l].biasGrad.ptr(), 1,
biasGradAccum[l].ptr(), 1);
}
}
// Reduce across threads
for (usize t = 0; t < threads; t++) {
for (usize l = 1; l < net.layers.size(); l++) {
const auto& prevLayer = net.layers[l - 1];
if (const auto* compLayer = dynamic_cast<internal::ComputeLayer*>(net.layers[l].get())) {
for (usize i = 0; i < compLayer->size; i++) {
for (usize j = 0; j < prevLayer->size; j++) {
const usize idx = j * compLayer->size + i;
weightGradAccum[l][idx] += threadWeightGradAccum[t][l][idx];
}
biasGradAccum[l][i] += threadBiasGradAccum[t][l][i];
}
}
}
}
applyGradients(batchSize, weightGradAccum, biasGradAccum);
optimizer.clipGrad(1);
optimizer.step(lr);
@@ -302,7 +239,5 @@ namespace Ember {
afterFit:
for (const auto& c : callbacks)
c->run(internal::AFTER_FIT);
net.setMode(NetworkMode::EVAL);
}
}

View File

@@ -9,11 +9,11 @@
namespace Ember {
namespace internal {
struct Gradient {
Tensor<1> weightGrad;
BlasMatrix weightGrad;
Tensor<1> biasGrad;
Gradient() = default;
Gradient(const Tensor<1>& weightGrad, const Tensor<1>& biasGrad) : weightGrad(weightGrad), biasGrad(biasGrad) {}
Gradient(const BlasMatrix& weightGrad, const Tensor<1>& biasGrad) : weightGrad(weightGrad), biasGrad(biasGrad) {}
};
}
@@ -56,10 +56,10 @@ namespace Ember {
std::vector<internal::Gradient> backward(const Network& net, const std::vector<float>& target) const;
// Apply a gradient to the optimizer
void applyGradients(const usize batchSize, const std::vector<Tensor<1>>& weightGradAccum, const std::vector<Tensor<1>>& biasGradAccum);
void applyGradients(const usize batchSize, const std::vector<BlasMatrix>& weightGradAccum, const std::vector<Tensor<1>>& biasGradAccum);
// Main trainer functionality is through this function
// Trains a neural network
void learn(const float lr, const usize epochs, usize threads = 0);
void learn(const float lr, const usize epochs, usize threads);
};
}

View File

@@ -1,13 +1,8 @@
#include "network.h"
namespace Ember {
void Network::setMode(const NetworkMode mode) {
for (usize i = 1; i < layers.size(); i++)
if (auto* layer = dynamic_cast<internal::ComputeLayer*>(layers[i].get()); layer != nullptr)
layer->setThreadCount(mode == NetworkMode::EVAL ? std::thread::hardware_concurrency() : 1);
}
void Network::forward(const Tensor<1>& input) {
void Network::forward(const Tensor<1>& input, const usize threads) {
openblas_set_num_threads(threads);
layers[0]->values = input;
for (usize i = 1; i < layers.size(); i++)

View File

@@ -44,13 +44,13 @@ namespace Ember {
if (useXavierInit) {
const float limit = std::sqrt(6.0f / (fanIn + fanOut));
std::uniform_real_distribution<float> dist(-limit, limit);
for (auto& w : layer->weights)
for (auto& w : layer->weights.data)
w = dist(gen);
}
else {
const float stddev = std::sqrt(2.0f / fanIn);
std::normal_distribution<float> dist(0.0f, stddev);
for (auto& w : layer->weights)
for (auto& w : layer->weights.data)
w = dist(gen);
}
@@ -69,9 +69,7 @@ namespace Ember {
_init(true, std::forward<Args>(args)...);
}
void setMode(const NetworkMode mode);
void forward(const Tensor<1>& input);
void forward(const Tensor<1>& input, const usize threads);
const Tensor<1>& output() const;
Network& operator=(const Network& other) {

View File

@@ -11,13 +11,13 @@ namespace Ember {
if (!layer)
continue;
weightGradients[i].resize(layer->weights.size());
weightGradients[i].resize(layer->weights.rows, layer->weights.cols);
biasGradients[i].resize(layer->biases.size());
}
}
void Optimizer::zeroGrad() {
for (Tensor<1>& grad : weightGradients)
for (BlasMatrix& grad : weightGradients)
grad.fill(0);
for (Tensor<1>& grad : biasGradients)
@@ -29,7 +29,7 @@ namespace Ember {
double totalNormSq = 0.0;
// Weights gradients
for (const auto& layerGradients : weightGradients)
for (const float wg : layerGradients)
for (const float wg : layerGradients.data)
totalNormSq += wg * wg;
// Bias gradients
@@ -45,7 +45,7 @@ namespace Ember {
// Weights gradients
for (auto& layerGradients : weightGradients)
for (float& wg : layerGradients)
for (float& wg : layerGradients.data)
wg *= scale;
// Bias gradients
@@ -67,7 +67,7 @@ namespace Ember {
if (!layer)
continue;
weightVelocities[i].resize(layer->weights.size());
weightVelocities[i].resize(layer->weights.rows, layer->weights.cols);
biasVelocities[i].resize(layer->biases.size());
}
}
@@ -79,15 +79,15 @@ namespace Ember {
if (!layer)
continue;
assert(weightVelocities[lIdx].size() == layer->weights.size());
assert(weightVelocities[lIdx].data.size() == layer->weights.data.size());
assert(biasVelocities[lIdx].size() == layer->biases.size());
assert(weightGradients[lIdx].size() == layer->weights.size());
assert(weightGradients[lIdx].data.size() == layer->weights.data.size());
assert(biasGradients[lIdx].size() == layer->biases.size());
// Update weights with momentum
for (usize i = 0; i < layer->weights.size(); i++) {
weightVelocities[lIdx][i] = momentum * weightVelocities[lIdx][i] - lr * weightGradients[lIdx][i];
layer->weights[i] += weightVelocities[lIdx][i];
for (usize i = 0; i < layer->weights.data.size(); i++) {
weightVelocities[lIdx].data[i] = momentum * weightVelocities[lIdx].data[i] - lr * weightGradients[lIdx].data[i];
layer->weights.data[i] += weightVelocities[lIdx].data[i];
}
// Update biases with momentum
@@ -118,9 +118,9 @@ namespace Ember {
if (!layer)
continue;
weightVelocities[i].resize(layer->weights.size());
weightVelocities[i].resize(layer->weights.rows, layer->weights.cols);
biasVelocities[i].resize(layer->biases.size());
weightMomentum[i].resize(layer->weights.size());
weightMomentum[i].resize(layer->weights.rows, layer->weights.cols);
biasMomentum[i].resize(layer->biases.size());
}
}
@@ -136,23 +136,23 @@ namespace Ember {
if (!layer)
continue;
assert(weightVelocities[lIdx].size() == layer->weights.size());
assert(weightVelocities[lIdx].data.size() == layer->weights.data.size());
assert(biasVelocities[lIdx].size() == layer->biases.size());
assert(weightGradients[lIdx].size() == layer->weights.size());
assert(weightGradients[lIdx].data.size() == layer->weights.data.size());
assert(biasGradients[lIdx].size() == layer->biases.size());
// Update weights
for (usize i = 0; i < layer->weights.size(); i++) {
layer->weights[i] *= 1.0f - lr * decay;
for (usize i = 0; i < layer->weights.data.size(); i++) {
layer->weights.data[i] *= 1.0f - lr * decay;
weightMomentum[lIdx][i] = beta1 * weightMomentum[lIdx][i] + (1.0f - beta1) * weightGradients[lIdx][i];
weightVelocities[lIdx][i] = beta2 * weightVelocities[lIdx][i] + (1.0f - beta2) * weightGradients[lIdx][i] * weightGradients[lIdx][i];
weightMomentum[lIdx].data[i] = beta1 * weightMomentum[lIdx].data[i] + (1.0f - beta1) * weightGradients[lIdx].data[i];
weightVelocities[lIdx].data[i] = beta2 * weightVelocities[lIdx].data[i] + (1.0f - beta2) * weightGradients[lIdx].data[i] * weightGradients[lIdx].data[i];
// Bias correction
const float mHat = weightMomentum[lIdx][i] / biasCorr1;
const float vHat = weightVelocities[lIdx][i] / biasCorr2;
const float mHat = weightMomentum[lIdx].data[i] / biasCorr1;
const float vHat = weightVelocities[lIdx].data[i] / biasCorr2;
layer->weights[i] -= lr * mHat / (std::sqrt(vHat) + epsilon);
layer->weights.data[i] -= lr * mHat / (std::sqrt(vHat) + epsilon);
}
// Update biases

View File

@@ -9,7 +9,7 @@ namespace Ember {
struct Optimizer {
Network& net;
std::vector<Tensor<1>> weightGradients;
std::vector<BlasMatrix> weightGradients;
std::vector<Tensor<1>> biasGradients;
explicit Optimizer(Network& net);
@@ -29,7 +29,7 @@ namespace Ember {
namespace optimizers {
struct SGD : internal::Optimizer {
std::vector<Tensor<1>> weightVelocities;
std::vector<BlasMatrix> weightVelocities;
std::vector<Tensor<1>> biasVelocities;
float momentum;
@@ -49,9 +49,9 @@ namespace Ember {
float decay;
usize iteration = 0;
std::vector<Tensor<1>> weightVelocities;
std::vector<BlasMatrix> weightVelocities;
std::vector<Tensor<1>> biasVelocities;
std::vector<Tensor<1>> weightMomentum;
std::vector<BlasMatrix> weightMomentum;
std::vector<Tensor<1>> biasMomentum;
explicit Adam(Network& net, const float beta1 = 0.9f, const float beta2 = 0.999f, const float epsilon = 1e-08, const float decay = 0.01f);

View File

@@ -14,7 +14,7 @@ namespace Ember {
if (!layer)
continue;
for (const float weight : layer->weights)
for (const float weight : layer->weights.data)
write(weight);
for (const float bias : layer->biases)
@@ -34,7 +34,7 @@ namespace Ember {
if (!layer)
continue;
for (float& weight : layer->weights)
for (float& weight : layer->weights.data)
read(weight);
for (float& bias : layer->biases)

View File

@@ -8,6 +8,7 @@
namespace Ember {
// Tensor recursive case
// NOTE: Tensor stores nested vectors rather than one flattened contiguous buffer, which is cache-unfriendly
template<usize dimensionality>
struct Tensor {
static_assert(dimensionality > 1, "dimensionality must be >= 1");
@@ -72,6 +73,9 @@ namespace Ember {
auto begin() const { return data.begin(); }
auto end() const { return data.end(); }
auto ptr() { return data.data(); }
auto ptr() const { return data.data(); }
float& operator[](const usize idx) { return data[idx]; }
const float& operator[](const usize idx) const { return data[idx]; }
@@ -88,4 +92,31 @@ namespace Ember {
return os;
}
};
// Dense row-major float matrix shaped for CBLAS calls (leading dimension == cols).
struct BlasMatrix {
usize rows{};
usize cols{};
std::vector<float> data; // rows * cols elements, row-major
BlasMatrix() = default;
// Allocates a zero-initialized rows x cols matrix.
BlasMatrix(const usize rows, const usize cols) : rows(rows), cols(cols), data(rows * cols) {}
// Reshapes to rows x cols. Element values are not meaningfully preserved
// across a shape change (only the flat prefix survives the vector resize).
void resize(const usize rows, const usize cols) {
this->rows = rows;
this->cols = cols;
data.resize(rows * cols);
}
// Raw pointers for BLAS interop.
float* ptr() { return data.data(); }
const float* ptr() const { return data.data(); }
// Sets every element to `value`.
void fill(const float value) {
data.assign(data.size(), value);
}
// (i, j) access (row i, column j)
float& operator()(const usize i, const usize j) { return data[i * cols + j]; }
const float& operator()(const usize i, const usize j) const { return data[i * cols + j]; }
};
}