20x speedups
OpenBLAS, dataloader rewriting, complete overhaul of the threading system Guido. It's time.
This commit is contained in:
5
.gitignore
vendored
5
.gitignore
vendored
@@ -7,4 +7,7 @@
|
||||
# Project build
|
||||
*.o
|
||||
*.d
|
||||
Ember*
|
||||
Ember*
|
||||
|
||||
# Profiling
|
||||
perf.*
|
||||
8
makefile
8
makefile
@@ -11,7 +11,7 @@ endif
|
||||
|
||||
# Compiler and flags
|
||||
CXX := clang++
|
||||
CXXFLAGS := -O3 -std=c++20 -flto -fopenmp -funroll-loops -DNDEBUG
|
||||
CXXFLAGS := -O3 -std=c++20 -flto -funroll-loops -DNDEBUG
|
||||
|
||||
ifeq ($(OS),Windows_NT)
|
||||
ARCH := $(PROCESSOR_ARCHITECTURE)
|
||||
@@ -22,7 +22,7 @@ endif
|
||||
IS_ARM := $(filter ARM arm64 aarch64 arm%,$(ARCH))
|
||||
|
||||
ifeq ($(IS_ARM),)
|
||||
LINKFLAGS := -fuse-ld=lld -pthread
|
||||
LINKFLAGS := -fuse-ld=lld -pthread -lopenblas -fopenmp
|
||||
ARCHFLAGS := -march=native
|
||||
else
|
||||
LINKFLAGS :=
|
||||
@@ -59,12 +59,12 @@ endif
|
||||
|
||||
# Debug build
|
||||
.PHONY: debug
|
||||
debug: CXXFLAGS = -O3 -std=c++20 -flto -fopenmp -fsanitize=address,undefined -fno-omit-frame-pointer -D_GLIBCXX_DEBUG -D_GLIBCXX_DEBUG_PEDANTIC -Wall -Wextra
|
||||
debug: CXXFLAGS = -O3 -std=c++20 -flto -fsanitize=address,undefined -fno-omit-frame-pointer -D_GLIBCXX_DEBUG -D_GLIBCXX_DEBUG_PEDANTIC -Wall -Wextra
|
||||
debug: all
|
||||
|
||||
# Profile build
|
||||
.PHONY: profile
|
||||
profile: CXXFLAGS = -O3 -std=c++20 -flto -fopenmp -funroll-loops -ggdb -fno-omit-frame-pointer -DNDEBUG
|
||||
profile: CXXFLAGS = -O3 -std=c++20 -flto -funroll-loops -ggdb -fno-omit-frame-pointer -DNDEBUG
|
||||
profile: all
|
||||
|
||||
# Force rebuild
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
int main() {
|
||||
Ember::Network net(
|
||||
Ember::layers::Input(28 * 28),
|
||||
Ember::layers::Linear(64),
|
||||
Ember::layers::Linear(512),
|
||||
Ember::activations::ReLU(),
|
||||
Ember::layers::Linear(10),
|
||||
Ember::activations::Softmax()
|
||||
@@ -22,5 +22,5 @@ int main() {
|
||||
Ember::callbacks::AutosaveBest("../net.bin")
|
||||
);
|
||||
|
||||
learner.learn(0.01, 20, 1);
|
||||
learner.learn(0.01, 20, 2);
|
||||
}
|
||||
@@ -6,6 +6,7 @@
|
||||
#include "../external/stb_image.h"
|
||||
|
||||
#include <filesystem>
|
||||
#include <omp.h>
|
||||
|
||||
std::vector<float> loadGreyscaleImage(const std::string& path, const Ember::usize w, const Ember::usize h) {
|
||||
int width, height, channels;
|
||||
@@ -57,10 +58,12 @@ namespace Ember::dataloaders {
|
||||
|
||||
samplesPerType.resize(types.size());
|
||||
|
||||
allImages.resize(types.size());
|
||||
numSamples = 0;
|
||||
for (usize typeIdx = 0; typeIdx < types.size(); typeIdx++) {
|
||||
for (const auto &entry: std::filesystem::directory_iterator(types[typeIdx])) {
|
||||
if (entry.is_regular_file()) {
|
||||
allImages[typeIdx].push_back(entry.path().string());
|
||||
numSamples++;
|
||||
samplesPerType[typeIdx]++;
|
||||
}
|
||||
@@ -77,37 +80,34 @@ namespace Ember::dataloaders {
|
||||
if (types.empty())
|
||||
exitWithMsg(fmt::format("No types found in '{}'", dataDir), 1);
|
||||
|
||||
std::mutex dataMut;
|
||||
std::vector<std::vector<internal::DataPoint>> localData(threads);
|
||||
|
||||
#pragma omp parallel for num_threads(threads)
|
||||
for (usize i = 0; i < batchSize; i++) {
|
||||
std::mt19937 rng{ std::random_device{}() + omp_get_thread_num()};
|
||||
|
||||
auto& threadData = localData[omp_get_thread_num()];
|
||||
threadData.reserve(batchSize / threads + 1);
|
||||
|
||||
// Randomly pick a type
|
||||
std::uniform_int_distribution<usize> typeDist(0, types.size() - 1);
|
||||
const usize typeIdx = typeDist(rng);
|
||||
const std::string& typeDir = types[typeIdx];
|
||||
|
||||
// Gather image files in that directory
|
||||
std::vector<std::filesystem::path> imgs;
|
||||
for (const auto& entry : std::filesystem::directory_iterator(typeDir)) {
|
||||
if (entry.is_regular_file())
|
||||
imgs.push_back(entry.path());
|
||||
}
|
||||
|
||||
if (imgs.empty())
|
||||
exitWithMsg(fmt::format("No images found in '{}'", typeDir), 1);
|
||||
|
||||
// Randomly pick an image
|
||||
std::uniform_int_distribution<usize> imgDist(0, imgs.size() * trainSplit - 1);
|
||||
std::uniform_int_distribution<usize> imgDist(0, samplesPerType[typeIdx] * trainSplit - 1);
|
||||
const usize imgIdx = imgDist(rng);
|
||||
|
||||
std::vector<float> input = loadGreyscaleImage(imgs[imgIdx].string(), width, height);
|
||||
std::vector<float> input = loadGreyscaleImage(allImages[typeIdx][imgIdx], width, height);
|
||||
std::vector<float> target(types.size(), 0);
|
||||
target[typeIdx] = 1;
|
||||
|
||||
dataMut.lock();
|
||||
data[batchIdx].emplace_back(input, target);
|
||||
dataMut.unlock();
|
||||
threadData.emplace_back(std::move(input), std::move(target));
|
||||
}
|
||||
|
||||
for (auto& threadData : localData)
|
||||
data[batchIdx].insert(data[batchIdx].end(),
|
||||
std::make_move_iterator(threadData.begin()),
|
||||
std::make_move_iterator(threadData.end()));
|
||||
}
|
||||
|
||||
void ImageDataLoader::loadTestSet() {
|
||||
@@ -129,7 +129,7 @@ namespace Ember::dataloaders {
|
||||
std::vector<float> target(types.size());
|
||||
target[typeIdx] = 1;
|
||||
|
||||
data[currBatch].emplace_back(input, target);
|
||||
data[currBatch].emplace_back(std::move(input), std::move(target));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,7 +13,9 @@ namespace Ember {
|
||||
std::vector<float> input;
|
||||
std::vector<float> target;
|
||||
|
||||
DataPoint(const std::vector<float>& input, const std::vector<float>& target) : input(input), target(target) {}
|
||||
DataPoint() = default;
|
||||
DataPoint(std::vector<float>&& input, std::vector<float>&& target)
|
||||
: input(std::move(input)), target(std::move(target)) {}
|
||||
};
|
||||
|
||||
struct DataLoader {
|
||||
@@ -86,7 +88,7 @@ namespace Ember {
|
||||
std::string dataDir;
|
||||
std::vector<std::string> types;
|
||||
std::vector<u64> samplesPerType;
|
||||
std::mt19937 rng{ std::random_device{}() };
|
||||
std::vector<std::vector<std::string>> allImages;
|
||||
|
||||
usize width;
|
||||
usize height;
|
||||
|
||||
78
src/layer.h
78
src/layer.h
@@ -3,6 +3,7 @@
|
||||
#include "tensor.h"
|
||||
|
||||
#include <utility>
|
||||
#include <cblas.h>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
|
||||
@@ -33,30 +34,20 @@ namespace Ember {
|
||||
};
|
||||
|
||||
struct ComputeLayer : Layer {
|
||||
Tensor<1> weights; // Indexed [previous][current], flattened to prev * size + curr
|
||||
BlasMatrix weights; // previousSize rows and size cols
|
||||
Tensor<1> biases;
|
||||
|
||||
usize threadCount;
|
||||
|
||||
ComputeLayer() = delete;
|
||||
|
||||
ComputeLayer(const usize size) : Layer(size) {
|
||||
threadCount = std::max<usize>(1, std::thread::hardware_concurrency());
|
||||
threadCount = std::min<usize>(threadCount, size / 2);
|
||||
|
||||
explicit ComputeLayer(const usize size) : Layer(size) {
|
||||
this->biases.resize(size);
|
||||
}
|
||||
|
||||
void setThreadCount(const usize threadCount) {
|
||||
this->threadCount = std::max<usize>(1, threadCount);
|
||||
this->threadCount = std::min<usize>(threadCount, size / 2);
|
||||
}
|
||||
|
||||
void init(const usize previousSize) {
|
||||
this->weights.resize(previousSize * size);
|
||||
this->weights.resize(size, previousSize);
|
||||
}
|
||||
|
||||
virtual std::tuple<Tensor<1>, Tensor<1>, Tensor<1>> backward(const Layer& previous, const Tensor<1>& gradOutput) const = 0;
|
||||
virtual std::tuple<Tensor<1>, BlasMatrix, Tensor<1>> backward(const Layer& previous, const Tensor<1>& gradOutput) const = 0;
|
||||
};
|
||||
|
||||
struct ActivationLayer : Layer {
|
||||
@@ -89,51 +80,44 @@ namespace Ember {
|
||||
const usize inputSize = previous.size;
|
||||
const usize outputSize = size;
|
||||
|
||||
std::vector<std::thread> threads;
|
||||
// Copy biases to output first
|
||||
std::memcpy(values.ptr(), biases.ptr(), outputSize * sizeof(float));
|
||||
|
||||
threadCount = 1;
|
||||
|
||||
const auto worker = [&](const usize threadId) {
|
||||
// Divide the range across threads
|
||||
const usize start = (outputSize * threadId) / threadCount;
|
||||
const usize end = std::min((outputSize * (threadId + 1)) / threadCount, outputSize);
|
||||
|
||||
for (usize curr = start; curr < end; curr++) {
|
||||
float sum = biases[curr];
|
||||
for (usize prev = 0; prev < inputSize; prev++)
|
||||
sum += previous.values[prev] * weights[prev * size + curr];
|
||||
values[curr] = sum;
|
||||
}
|
||||
};
|
||||
|
||||
// Launch worker threads
|
||||
for (usize t = 1; t < threadCount; t++)
|
||||
threads.emplace_back(worker, t);
|
||||
|
||||
// Run thread 0 on the main thread
|
||||
worker(0);
|
||||
|
||||
// Join all threads
|
||||
for (std::thread& t : threads)
|
||||
if (t.joinable())
|
||||
t.join();
|
||||
// Perform y = W * x + y (in-place; W is used untransposed)
|
||||
// dimensions:
|
||||
// W: outputSize x inputSize
|
||||
// x: inputSize
|
||||
// y: outputSize
|
||||
cblas_sgemv(
|
||||
CblasRowMajor, // Memory layout
|
||||
CblasNoTrans, // Don't transpose W to keep outputSize x inputSize
|
||||
outputSize, // rows of W
|
||||
inputSize, // cols of W
|
||||
1.0f, // alpha
|
||||
weights.data.data(), // W data
|
||||
inputSize, // lda (leading dimension, number of cols)
|
||||
previous.values.ptr(), // x vector
|
||||
1, // incx
|
||||
1.0f, // beta (since y already holds biases)
|
||||
values.ptr(), // y vector (output)
|
||||
1 // incy
|
||||
);
|
||||
}
|
||||
|
||||
std::tuple<Tensor<1>, Tensor<1>, Tensor<1>> backward(const Layer& previous, const Tensor<1>& gradOutput) const override {
|
||||
std::tuple<Tensor<1>, BlasMatrix, Tensor<1>> backward(const Layer& previous, const Tensor<1>& gradOutput) const override {
|
||||
const usize inputSize = previous.size;
|
||||
const usize outputSize = size;
|
||||
|
||||
Tensor<1> gradInput(inputSize, 0.0f);
|
||||
Tensor<1> weightGrad(weights.size(), 0.0f);
|
||||
BlasMatrix weightGrad(weights.rows, weights.cols);
|
||||
Tensor<1> biasGrad(size, 0.0f);
|
||||
|
||||
// Compute gradients
|
||||
for (usize curr = 0; curr < outputSize; curr++) {
|
||||
biasGrad[curr] = gradOutput[curr];
|
||||
for (usize prev = 0; prev < inputSize; prev++) {
|
||||
const usize wIndex = prev * outputSize + curr;
|
||||
gradInput[prev] += weights[wIndex] * gradOutput[curr];
|
||||
weightGrad[wIndex] += previous.values[prev] * gradOutput[curr];
|
||||
gradInput[prev] += weights(curr, prev) * gradOutput[curr];
|
||||
weightGrad(curr, prev) += previous.values[prev] * gradOutput[curr];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -145,7 +129,7 @@ namespace Ember {
|
||||
}
|
||||
|
||||
std::string str() const override {
|
||||
return fmt::format("Linear - {} input features and {} output features", weights.size() / size, size);
|
||||
return fmt::format("Linear - {} input features and {} output features", weights.cols, size);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
137
src/learner.cpp
137
src/learner.cpp
@@ -1,6 +1,5 @@
|
||||
#include "learner.h"
|
||||
#include "progbar.h"
|
||||
#include "omp.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
@@ -26,23 +25,28 @@ namespace Ember {
|
||||
return gradients;
|
||||
}
|
||||
|
||||
void Learner::applyGradients(const usize batchSize, const std::vector<Tensor<1>>& weightGradAccum, const std::vector<Tensor<1>>& biasGradAccum) {
|
||||
void Learner::applyGradients(const usize batchSize, const std::vector<BlasMatrix>& weightGradAccum, const std::vector<Tensor<1>>& biasGradAccum) {
|
||||
const float batchScalar = 1.0f / batchSize;
|
||||
// Apply gradients to weights and biases
|
||||
// Apply gradients to the optimizer
|
||||
for (usize l = net.layers.size() - 1; l > 0; l--) {
|
||||
if (const auto& currLayer = dynamic_cast<internal::ComputeLayer*>(net.layers[l].get())) {
|
||||
assert(optimizer.weightGradients[l].size() == currLayer->weights.size());
|
||||
if (const auto* currLayer = dynamic_cast<internal::ComputeLayer*>(net.layers[l].get())) {
|
||||
assert(optimizer.weightGradients[l].data.size() == currLayer->weights.data.size());
|
||||
assert(optimizer.biasGradients[l].size() == currLayer->biases.size());
|
||||
|
||||
for (usize i = 0; i < optimizer.weightGradients[l].size(); i++)
|
||||
optimizer.weightGradients[l][i] += weightGradAccum[l][i] * batchScalar;
|
||||
for (usize i = 0; i < currLayer->size; i++)
|
||||
optimizer.biasGradients[l][i] += biasGradAccum[l][i] * batchScalar;
|
||||
// Weights
|
||||
cblas_saxpy(optimizer.weightGradients[l].data.size(), batchScalar,
|
||||
weightGradAccum[l].ptr(), 1,
|
||||
optimizer.weightGradients[l].ptr(), 1);
|
||||
|
||||
// Biases
|
||||
cblas_saxpy(optimizer.biasGradients[l].size(), batchScalar,
|
||||
biasGradAccum[l].ptr(), 1,
|
||||
optimizer.biasGradients[l].ptr(), 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Learner::learn(const float initialLr, const usize epochs, usize threads) {
|
||||
void Learner::learn(const float initialLr, const usize epochs, const usize threads) {
|
||||
// Initialize the learner's callback shared states
|
||||
lr = initialLr;
|
||||
testLoss = std::numeric_limits<float>::infinity();
|
||||
@@ -56,17 +60,12 @@ namespace Ember {
|
||||
std::pair<float, float> test{};
|
||||
|
||||
// Accumulators
|
||||
std::vector<std::vector<Tensor<1>>> threadWeightGradAccum(threads);
|
||||
std::vector<std::vector<Tensor<1>>> threadBiasGradAccum(threads);
|
||||
|
||||
std::vector<Tensor<1>> weightGradAccum(net.layers.size());
|
||||
std::vector<BlasMatrix> weightGradAccum(net.layers.size());
|
||||
std::vector<Tensor<1>> biasGradAccum(net.layers.size());
|
||||
|
||||
const u64 batchSize = dataLoader.batchSize;
|
||||
const u64 batchesPerEpoch = dataLoader.numSamples / batchSize;
|
||||
|
||||
std::vector<Network> networks;
|
||||
|
||||
double trainLoss{};
|
||||
|
||||
ProgressBar progressBar{};
|
||||
@@ -79,7 +78,7 @@ namespace Ember {
|
||||
const usize testSize = dataLoader.testSetSize();
|
||||
while (dataLoader.hasNext()) {
|
||||
internal::DataPoint data = dataLoader.next();
|
||||
net.forward(data.input);
|
||||
net.forward(data.input, threads);
|
||||
loss += lossFunc->forward(net.layers.back()->values, data.target);
|
||||
usize guess = 0;
|
||||
usize goal = 0;
|
||||
@@ -94,6 +93,10 @@ namespace Ember {
|
||||
return std::pair<float, float>{ loss / (testSize ? testSize : 1), numCorrect / static_cast<float>(testSize ? testSize : 1) };
|
||||
};
|
||||
|
||||
// Store the compute layers so RTTI isn't done on-the-fly
|
||||
std::vector<internal::ComputeLayer*> computeLayers;
|
||||
std::vector<usize> computeLayerIndexes;
|
||||
|
||||
Stopwatch<std::chrono::milliseconds> stopwatch;
|
||||
|
||||
for (const auto& c : callbacks)
|
||||
@@ -108,48 +111,25 @@ namespace Ember {
|
||||
goto afterFit;
|
||||
}
|
||||
|
||||
// Get number of threads to use
|
||||
if (threads == 0)
|
||||
threads = std::thread::hardware_concurrency();
|
||||
if (threads == 0) {
|
||||
std::cerr << "Failed to detect number of threads" << std::endl;
|
||||
threads = 1;
|
||||
}
|
||||
fmt::println("Using {} threads", threads);
|
||||
|
||||
fmt::println("Training for {} batches with {} batches per epoch", batchesPerEpoch * epochs, batchesPerEpoch);
|
||||
|
||||
std::cout << "Epoch Train loss Test loss Test accuracy Time\n\n" << std::endl;
|
||||
|
||||
for (auto& accum : threadWeightGradAccum)
|
||||
accum.resize(net.layers.size());
|
||||
for (auto& accum : threadBiasGradAccum)
|
||||
accum.resize(net.layers.size());
|
||||
|
||||
for (usize i = 1; i < net.layers.size(); i++) {
|
||||
if (const auto* compLayer = dynamic_cast<internal::ComputeLayer*>(net.layers[i].get())) {
|
||||
weightGradAccum[i].resize(compLayer->weights.size());
|
||||
if (auto* compLayer = dynamic_cast<internal::ComputeLayer*>(net.layers[i].get())) {
|
||||
weightGradAccum[i].resize(compLayer->weights.rows, compLayer->weights.cols);
|
||||
biasGradAccum[i].resize(compLayer->biases.size());
|
||||
|
||||
for (auto& accum : threadWeightGradAccum)
|
||||
accum[i].resize(compLayer->weights.size());
|
||||
for (auto& accum : threadBiasGradAccum)
|
||||
accum[i].resize(compLayer->biases.size());
|
||||
computeLayers.push_back(compLayer);
|
||||
computeLayerIndexes.push_back(i);
|
||||
}
|
||||
}
|
||||
|
||||
networks.reserve(threads);
|
||||
for (usize t = 0; t < threads; t++)
|
||||
networks.push_back(net);
|
||||
|
||||
// Preload first batch
|
||||
dataLoader.asyncPreloadBatch();
|
||||
|
||||
stopwatch.reset();
|
||||
|
||||
// Set the network to only use 1 thread on the forward pass
|
||||
net.setMode(NetworkMode::TRAIN);
|
||||
|
||||
// Main loop
|
||||
for (epoch = 0; epoch < epochs; epoch++) {
|
||||
try {
|
||||
@@ -186,15 +166,6 @@ namespace Ember {
|
||||
t.fill(0);
|
||||
for (auto& t : biasGradAccum)
|
||||
t.fill(0);
|
||||
for (auto& accum : threadWeightGradAccum)
|
||||
for (auto& t : accum)
|
||||
t.fill(0);
|
||||
for (auto& accum : threadBiasGradAccum)
|
||||
for (auto& t : accum)
|
||||
t.fill(0);
|
||||
|
||||
for (auto& n : networks)
|
||||
n = net;
|
||||
|
||||
dataLoader.waitForBatch();
|
||||
dataLoader.swapBuffers();
|
||||
@@ -202,63 +173,29 @@ namespace Ember {
|
||||
// Instantly start loading next batch
|
||||
dataLoader.asyncPreloadBatch();
|
||||
|
||||
#pragma omp parallel for num_threads(threads) reduction(+:trainLoss)
|
||||
for (u64 sample = 0; sample < batchSize; sample++) {
|
||||
const usize tID = omp_get_thread_num();
|
||||
|
||||
Network& thisNet = networks[tID];
|
||||
|
||||
const internal::DataPoint& data = dataLoader.batchData(sample);
|
||||
|
||||
thisNet.forward(data.input);
|
||||
net.forward(data.input, threads);
|
||||
|
||||
// Accumulate training loss
|
||||
trainLoss += lossFunc->forward(thisNet.output(), data.target);
|
||||
trainLoss += lossFunc->forward(net.output(), data.target);
|
||||
|
||||
const auto gradients = backward(thisNet, data.target);
|
||||
const auto gradients = backward(net, data.target);
|
||||
|
||||
// Accumulate gradients
|
||||
for (usize l = 1; l < thisNet.layers.size(); l++) {
|
||||
const auto& prevLayer = thisNet.layers[l - 1];
|
||||
if (const auto* compLayer = dynamic_cast<internal::ComputeLayer*>(thisNet.layers[l].get())) {
|
||||
for (usize i = 0; i < compLayer->size; i++) {
|
||||
for (usize j = 0; j < prevLayer->size; j++) {
|
||||
const usize idx = j * compLayer->size + i;
|
||||
assert(l < weightGradAccum.size());
|
||||
assert(idx < weightGradAccum[l].size());
|
||||
assert(idx < gradients[l].weightGrad.size());
|
||||
for (usize i = 0; i < computeLayers.size(); i++) {
|
||||
const usize l = computeLayerIndexes[i];
|
||||
const auto* layer = computeLayers[i];
|
||||
cblas_saxpy(layer->weights.data.size(), 1.0f,
|
||||
gradients[l].weightGrad.ptr(), 1,
|
||||
weightGradAccum[l].ptr(), 1);
|
||||
|
||||
threadWeightGradAccum[tID][l][idx] += gradients[l].weightGrad[idx];
|
||||
}
|
||||
|
||||
assert(l < biasGradAccum.size());
|
||||
assert(i < biasGradAccum[l].size());
|
||||
assert(i < gradients[l].biasGrad.size());
|
||||
|
||||
threadBiasGradAccum[tID][l][i] += gradients[l].biasGrad[i];
|
||||
}
|
||||
}
|
||||
cblas_saxpy(layer->biases.size(), 1.0f,
|
||||
gradients[l].biasGrad.ptr(), 1,
|
||||
biasGradAccum[l].ptr(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
// Reduce across threads
|
||||
for (usize t = 0; t < threads; t++) {
|
||||
for (usize l = 1; l < net.layers.size(); l++) {
|
||||
const auto& prevLayer = net.layers[l - 1];
|
||||
if (const auto* compLayer = dynamic_cast<internal::ComputeLayer*>(net.layers[l].get())) {
|
||||
for (usize i = 0; i < compLayer->size; i++) {
|
||||
for (usize j = 0; j < prevLayer->size; j++) {
|
||||
const usize idx = j * compLayer->size + i;
|
||||
|
||||
weightGradAccum[l][idx] += threadWeightGradAccum[t][l][idx];
|
||||
}
|
||||
|
||||
biasGradAccum[l][i] += threadBiasGradAccum[t][l][i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
applyGradients(batchSize, weightGradAccum, biasGradAccum);
|
||||
optimizer.clipGrad(1);
|
||||
optimizer.step(lr);
|
||||
@@ -302,7 +239,5 @@ namespace Ember {
|
||||
afterFit:
|
||||
for (const auto& c : callbacks)
|
||||
c->run(internal::AFTER_FIT);
|
||||
|
||||
net.setMode(NetworkMode::EVAL);
|
||||
}
|
||||
}
|
||||
@@ -9,11 +9,11 @@
|
||||
namespace Ember {
|
||||
namespace internal {
|
||||
struct Gradient {
|
||||
Tensor<1> weightGrad;
|
||||
BlasMatrix weightGrad;
|
||||
Tensor<1> biasGrad;
|
||||
|
||||
Gradient() = default;
|
||||
Gradient(const Tensor<1>& weightGrad, const Tensor<1>& biasGrad) : weightGrad(weightGrad), biasGrad(biasGrad) {}
|
||||
Gradient(const BlasMatrix& weightGrad, const Tensor<1>& biasGrad) : weightGrad(weightGrad), biasGrad(biasGrad) {}
|
||||
};
|
||||
}
|
||||
|
||||
@@ -56,10 +56,10 @@ namespace Ember {
|
||||
std::vector<internal::Gradient> backward(const Network& net, const std::vector<float>& target) const;
|
||||
|
||||
// Apply a gradient to the optimizer
|
||||
void applyGradients(const usize batchSize, const std::vector<Tensor<1>>& weightGradAccum, const std::vector<Tensor<1>>& biasGradAccum);
|
||||
void applyGradients(const usize batchSize, const std::vector<BlasMatrix>& weightGradAccum, const std::vector<Tensor<1>>& biasGradAccum);
|
||||
|
||||
// Main trainer functionality is through this function
|
||||
// Trains a neural network
|
||||
void learn(const float lr, const usize epochs, usize threads = 0);
|
||||
void learn(const float lr, const usize epochs, usize threads);
|
||||
};
|
||||
}
|
||||
@@ -1,13 +1,8 @@
|
||||
#include "network.h"
|
||||
|
||||
namespace Ember {
|
||||
void Network::setMode(const NetworkMode mode) {
|
||||
for (usize i = 1; i < layers.size(); i++)
|
||||
if (auto* layer = dynamic_cast<internal::ComputeLayer*>(layers[i].get()); layer != nullptr)
|
||||
layer->setThreadCount(mode == NetworkMode::EVAL ? std::thread::hardware_concurrency() : 1);
|
||||
}
|
||||
|
||||
void Network::forward(const Tensor<1>& input) {
|
||||
void Network::forward(const Tensor<1>& input, const usize threads) {
|
||||
openblas_set_num_threads(threads);
|
||||
layers[0]->values = input;
|
||||
|
||||
for (usize i = 1; i < layers.size(); i++)
|
||||
|
||||
@@ -44,13 +44,13 @@ namespace Ember {
|
||||
if (useXavierInit) {
|
||||
const float limit = std::sqrt(6.0f / (fanIn + fanOut));
|
||||
std::uniform_real_distribution<float> dist(-limit, limit);
|
||||
for (auto& w : layer->weights)
|
||||
for (auto& w : layer->weights.data)
|
||||
w = dist(gen);
|
||||
}
|
||||
else {
|
||||
const float stddev = std::sqrt(2.0f / fanIn);
|
||||
std::normal_distribution<float> dist(0.0f, stddev);
|
||||
for (auto& w : layer->weights)
|
||||
for (auto& w : layer->weights.data)
|
||||
w = dist(gen);
|
||||
}
|
||||
|
||||
@@ -69,9 +69,7 @@ namespace Ember {
|
||||
_init(true, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
void setMode(const NetworkMode mode);
|
||||
|
||||
void forward(const Tensor<1>& input);
|
||||
void forward(const Tensor<1>& input, const usize threads);
|
||||
const Tensor<1>& output() const;
|
||||
|
||||
Network& operator=(const Network& other) {
|
||||
|
||||
@@ -11,13 +11,13 @@ namespace Ember {
|
||||
if (!layer)
|
||||
continue;
|
||||
|
||||
weightGradients[i].resize(layer->weights.size());
|
||||
weightGradients[i].resize(layer->weights.rows, layer->weights.cols);
|
||||
biasGradients[i].resize(layer->biases.size());
|
||||
}
|
||||
}
|
||||
|
||||
void Optimizer::zeroGrad() {
|
||||
for (Tensor<1>& grad : weightGradients)
|
||||
for (BlasMatrix& grad : weightGradients)
|
||||
grad.fill(0);
|
||||
|
||||
for (Tensor<1>& grad : biasGradients)
|
||||
@@ -29,7 +29,7 @@ namespace Ember {
|
||||
double totalNormSq = 0.0;
|
||||
// Weights gradients
|
||||
for (const auto& layerGradients : weightGradients)
|
||||
for (const float wg : layerGradients)
|
||||
for (const float wg : layerGradients.data)
|
||||
totalNormSq += wg * wg;
|
||||
|
||||
// Bias gradients
|
||||
@@ -45,7 +45,7 @@ namespace Ember {
|
||||
|
||||
// Weights gradients
|
||||
for (auto& layerGradients : weightGradients)
|
||||
for (float& wg : layerGradients)
|
||||
for (float& wg : layerGradients.data)
|
||||
wg *= scale;
|
||||
|
||||
// Bias gradients
|
||||
@@ -67,7 +67,7 @@ namespace Ember {
|
||||
if (!layer)
|
||||
continue;
|
||||
|
||||
weightVelocities[i].resize(layer->weights.size());
|
||||
weightVelocities[i].resize(layer->weights.rows, layer->weights.cols);
|
||||
biasVelocities[i].resize(layer->biases.size());
|
||||
}
|
||||
}
|
||||
@@ -79,15 +79,15 @@ namespace Ember {
|
||||
if (!layer)
|
||||
continue;
|
||||
|
||||
assert(weightVelocities[lIdx].size() == layer->weights.size());
|
||||
assert(weightVelocities[lIdx].data.size() == layer->weights.data.size());
|
||||
assert(biasVelocities[lIdx].size() == layer->biases.size());
|
||||
assert(weightGradients[lIdx].size() == layer->weights.size());
|
||||
assert(weightGradients[lIdx].data.size() == layer->weights.data.size());
|
||||
assert(biasGradients[lIdx].size() == layer->biases.size());
|
||||
|
||||
// Update weights with momentum
|
||||
for (usize i = 0; i < layer->weights.size(); i++) {
|
||||
weightVelocities[lIdx][i] = momentum * weightVelocities[lIdx][i] - lr * weightGradients[lIdx][i];
|
||||
layer->weights[i] += weightVelocities[lIdx][i];
|
||||
for (usize i = 0; i < layer->weights.data.size(); i++) {
|
||||
weightVelocities[lIdx].data[i] = momentum * weightVelocities[lIdx].data[i] - lr * weightGradients[lIdx].data[i];
|
||||
layer->weights.data[i] += weightVelocities[lIdx].data[i];
|
||||
}
|
||||
|
||||
// Update biases with momentum
|
||||
@@ -118,9 +118,9 @@ namespace Ember {
|
||||
if (!layer)
|
||||
continue;
|
||||
|
||||
weightVelocities[i].resize(layer->weights.size());
|
||||
weightVelocities[i].resize(layer->weights.rows, layer->weights.cols);
|
||||
biasVelocities[i].resize(layer->biases.size());
|
||||
weightMomentum[i].resize(layer->weights.size());
|
||||
weightMomentum[i].resize(layer->weights.rows, layer->weights.cols);
|
||||
biasMomentum[i].resize(layer->biases.size());
|
||||
}
|
||||
}
|
||||
@@ -136,23 +136,23 @@ namespace Ember {
|
||||
if (!layer)
|
||||
continue;
|
||||
|
||||
assert(weightVelocities[lIdx].size() == layer->weights.size());
|
||||
assert(weightVelocities[lIdx].data.size() == layer->weights.data.size());
|
||||
assert(biasVelocities[lIdx].size() == layer->biases.size());
|
||||
assert(weightGradients[lIdx].size() == layer->weights.size());
|
||||
assert(weightGradients[lIdx].data.size() == layer->weights.data.size());
|
||||
assert(biasGradients[lIdx].size() == layer->biases.size());
|
||||
|
||||
// Update weights
|
||||
for (usize i = 0; i < layer->weights.size(); i++) {
|
||||
layer->weights[i] *= 1.0f - lr * decay;
|
||||
for (usize i = 0; i < layer->weights.data.size(); i++) {
|
||||
layer->weights.data[i] *= 1.0f - lr * decay;
|
||||
|
||||
weightMomentum[lIdx][i] = beta1 * weightMomentum[lIdx][i] + (1.0f - beta1) * weightGradients[lIdx][i];
|
||||
weightVelocities[lIdx][i] = beta2 * weightVelocities[lIdx][i] + (1.0f - beta2) * weightGradients[lIdx][i] * weightGradients[lIdx][i];
|
||||
weightMomentum[lIdx].data[i] = beta1 * weightMomentum[lIdx].data[i] + (1.0f - beta1) * weightGradients[lIdx].data[i];
|
||||
weightVelocities[lIdx].data[i] = beta2 * weightVelocities[lIdx].data[i] + (1.0f - beta2) * weightGradients[lIdx].data[i] * weightGradients[lIdx].data[i];
|
||||
|
||||
// Bias correction
|
||||
const float mHat = weightMomentum[lIdx][i] / biasCorr1;
|
||||
const float vHat = weightVelocities[lIdx][i] / biasCorr2;
|
||||
const float mHat = weightMomentum[lIdx].data[i] / biasCorr1;
|
||||
const float vHat = weightVelocities[lIdx].data[i] / biasCorr2;
|
||||
|
||||
layer->weights[i] -= lr * mHat / (std::sqrt(vHat) + epsilon);
|
||||
layer->weights.data[i] -= lr * mHat / (std::sqrt(vHat) + epsilon);
|
||||
}
|
||||
|
||||
// Update biases
|
||||
|
||||
@@ -9,7 +9,7 @@ namespace Ember {
|
||||
struct Optimizer {
|
||||
Network& net;
|
||||
|
||||
std::vector<Tensor<1>> weightGradients;
|
||||
std::vector<BlasMatrix> weightGradients;
|
||||
std::vector<Tensor<1>> biasGradients;
|
||||
|
||||
explicit Optimizer(Network& net);
|
||||
@@ -29,7 +29,7 @@ namespace Ember {
|
||||
|
||||
namespace optimizers {
|
||||
struct SGD : internal::Optimizer {
|
||||
std::vector<Tensor<1>> weightVelocities;
|
||||
std::vector<BlasMatrix> weightVelocities;
|
||||
std::vector<Tensor<1>> biasVelocities;
|
||||
|
||||
float momentum;
|
||||
@@ -49,9 +49,9 @@ namespace Ember {
|
||||
float decay;
|
||||
usize iteration = 0;
|
||||
|
||||
std::vector<Tensor<1>> weightVelocities;
|
||||
std::vector<BlasMatrix> weightVelocities;
|
||||
std::vector<Tensor<1>> biasVelocities;
|
||||
std::vector<Tensor<1>> weightMomentum;
|
||||
std::vector<BlasMatrix> weightMomentum;
|
||||
std::vector<Tensor<1>> biasMomentum;
|
||||
|
||||
explicit Adam(Network& net, const float beta1 = 0.9f, const float beta2 = 0.999f, const float epsilon = 1e-08, const float decay = 0.01f);
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace Ember {
|
||||
if (!layer)
|
||||
continue;
|
||||
|
||||
for (const float weight : layer->weights)
|
||||
for (const float weight : layer->weights.data)
|
||||
write(weight);
|
||||
|
||||
for (const float bias : layer->biases)
|
||||
@@ -34,7 +34,7 @@ namespace Ember {
|
||||
if (!layer)
|
||||
continue;
|
||||
|
||||
for (float& weight : layer->weights)
|
||||
for (float& weight : layer->weights.data)
|
||||
read(weight);
|
||||
|
||||
for (float& bias : layer->biases)
|
||||
|
||||
31
src/tensor.h
31
src/tensor.h
@@ -8,6 +8,7 @@
|
||||
|
||||
namespace Ember {
|
||||
// Tensor recursive case
|
||||
// Tensor currently has a rather foolish implementation since it doesn't flatten the memory
|
||||
template<usize dimensionality>
|
||||
struct Tensor {
|
||||
static_assert(dimensionality > 1, "dimensionality must be >= 1");
|
||||
@@ -72,6 +73,9 @@ namespace Ember {
|
||||
auto begin() const { return data.begin(); }
|
||||
auto end() const { return data.end(); }
|
||||
|
||||
auto ptr() { return data.data(); }
|
||||
auto ptr() const { return data.data(); }
|
||||
|
||||
float& operator[](const usize idx) { return data[idx]; }
|
||||
const float& operator[](const usize idx) const { return data[idx]; }
|
||||
|
||||
@@ -88,4 +92,31 @@ namespace Ember {
|
||||
return os;
|
||||
}
|
||||
};
|
||||
|
||||
struct BlasMatrix {
|
||||
usize rows{};
|
||||
usize cols{};
|
||||
std::vector<float> data;
|
||||
|
||||
BlasMatrix() = default;
|
||||
BlasMatrix(const usize rows, const usize cols) : rows(rows), cols(cols), data(rows * cols) {}
|
||||
|
||||
void resize(const usize rows, const usize cols) {
|
||||
this->rows = rows;
|
||||
this->cols = cols;
|
||||
data.resize(rows * cols);
|
||||
}
|
||||
|
||||
float* ptr() { return data.data(); }
|
||||
const float* ptr() const { return data.data(); }
|
||||
|
||||
void fill(const float value) {
|
||||
for (float& f : data)
|
||||
f = value;
|
||||
}
|
||||
|
||||
// (i, j) access (row i, column j)
|
||||
float& operator()(const usize i, const usize j) { return data[i * cols + j]; }
|
||||
const float& operator()(const usize i, const usize j) const { return data[i * cols + j]; }
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user