Working learner, dataloader, and optimizer
This commit is contained in:
7988
external/stb_image.h
vendored
Normal file
7988
external/stb_image.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
22
src/Ember.cpp
Normal file
22
src/Ember.cpp
Normal file
@@ -0,0 +1,22 @@
|
||||
#include "learner.h"
|
||||
|
||||
int main() {
|
||||
Ember::Network net(
|
||||
Ember::layers::Input(28 * 28),
|
||||
Ember::layers::Linear(28 * 28, 64),
|
||||
Ember::activations::ReLU(),
|
||||
Ember::layers::Linear(64, 10),
|
||||
Ember::activations::Softmax()
|
||||
);
|
||||
|
||||
Ember::dataloaders::ImageDataLoader dataloader("../datasets/MNIST/", 128, 0.9, 4, 28, 28);
|
||||
Ember::optimizers::SGD optimizer(net, 0.9);
|
||||
|
||||
Ember::Learner learner(net, dataloader, optimizer, Ember::loss::MeanSquaredError());
|
||||
|
||||
net.setMode(Ember::NetworkMode::TRAIN);
|
||||
|
||||
std::cout << net << std::endl;
|
||||
|
||||
learner.learn(0.05, 2);
|
||||
}
|
||||
@@ -5,12 +5,59 @@ namespace Ember {
|
||||
// Rectified linear unit: clamps negative inputs to zero.
float ReLU(const float x) {
    return std::max(0.0f, x);
}

namespace derivatives {
    // d/dx ReLU(x): 1 for positive inputs, 0 otherwise (derivative at 0 taken as 0).
    float ReLU(const float x) {
        return x > 0.0f ? 1.0f : 0.0f;
    }
}
|
||||
}
|
||||
|
||||
namespace activations {
|
||||
void ReLU::forward(const Layer& previous) {
|
||||
for (usize prev = 0; prev < previous.size; prev++)
|
||||
values[prev] = internal::activations::ReLU(previous.getOutputs()[prev]);
|
||||
values[prev] = internal::activations::ReLU(previous.values[prev]);
|
||||
}
|
||||
// Chain rule through ReLU: gradients pass only where the input was positive.
Tensor<1> ReLU::backward(const Layer& previous, const Tensor<1>& gradOutput) const {
    Tensor<1> gradInput(gradOutput.size());

    for (usize idx = 0; idx < gradOutput.size(); idx++)
        gradInput[idx] = gradOutput[idx] * internal::activations::derivatives::ReLU(previous.values[idx]);

    return gradInput;
}
|
||||
|
||||
|
||||
void Softmax::forward(const Layer& previous) {
|
||||
values.resize(previous.size);
|
||||
float maxIn = previous.values[0];
|
||||
for (usize i = 1; i < previous.size; i++)
|
||||
maxIn = std::max(maxIn, previous.values[i]);
|
||||
|
||||
float sum = 0.0f;
|
||||
for (usize i = 0; i < previous.size; i++) {
|
||||
values[i] = std::exp(previous.values[i] - maxIn);
|
||||
sum += values[i];
|
||||
}
|
||||
|
||||
if (sum == 0.0f)
|
||||
for (auto& v : values) v = 1.0f / previous.size;
|
||||
else
|
||||
for (auto& v : values) v /= sum;
|
||||
}
|
||||
// Jacobian-vector product of softmax: grad_i = s_i * (g_i - sum_j s_j * g_j),
// where s is the cached softmax output (values) and g is gradOutput.
Tensor<1> Softmax::backward(const Layer& previous, const Tensor<1>& gradOutput) const {
    const usize n = gradOutput.size();
    Tensor<1> gradInput(n);

    // Weighted sum of the incoming gradient by the softmax outputs.
    float weightedSum = 0.0f;
    for (usize i = 0; i < n; ++i)
        weightedSum += values[i] * gradOutput[i];

    for (usize i = 0; i < n; ++i)
        gradInput[i] = values[i] * (gradOutput[i] - weightedSum);

    return gradInput;
}
|
||||
}
|
||||
}
|
||||
@@ -11,9 +11,21 @@ namespace Ember {
|
||||
struct ReLU : internal::ActivationLayer {
|
||||
void forward(const Layer& previous) override;
|
||||
|
||||
Tensor<1> backward(const Layer& previous, const Tensor<1>& gradOutput) const override;
|
||||
|
||||
std::string str() const override {
|
||||
return fmt::format("ReLU - applied to {} features", size);
|
||||
}
|
||||
};
|
||||
|
||||
struct Softmax : internal::ActivationLayer {
|
||||
void forward(const Layer& previous) override;
|
||||
|
||||
Tensor<1> backward(const Layer& previous, const Tensor<1>& gradOutput) const override;
|
||||
|
||||
std::string str() const override {
|
||||
return fmt::format("Softmax - applied to {} features", size);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
137
src/dataloader.cpp
Normal file
137
src/dataloader.cpp
Normal file
@@ -0,0 +1,137 @@
|
||||
#include "dataloader.h"
|
||||
|
||||
#include "../external/fmt/format.h"
|
||||
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "../external/stb_image.h"
|
||||
|
||||
#include <filesystem>
|
||||
|
||||
// Loads an image as greyscale floats normalized to [0, 1].
// If w/h are non-zero and differ from the file's dimensions the image is
// resampled to w x h with nearest-neighbor interpolation.
// Throws std::runtime_error if the file cannot be decoded.
std::vector<float> loadGreyscaleImage(const std::string& path, const Ember::usize w, const Ember::usize h) {
    int width, height, channels;
    unsigned char* data = stbi_load(path.data(), &width, &height, &channels, 1);
    if (!data)
        throw std::runtime_error("Failed to load image: " + path);

    const Ember::usize srcW = static_cast<Ember::usize>(width);
    const Ember::usize srcH = static_cast<Ember::usize>(height);

    std::vector<float> vec;

    if ((w == srcW || w == 0) && (h == srcH || h == 0)) {
        // Dimensions match (or are unspecified): straight copy.
        vec.resize(srcW * srcH);
        for (Ember::usize i = 0; i < srcW * srcH; i++)
            vec[i] = data[i] / 255.0f;
    }
    else {
        // Simple nearest-neighbor resize.
        // The buffer must be sized for the TARGET dimensions; sizing it for the
        // source dimensions would write out of bounds when upscaling.
        vec.resize(w * h);
        for (Ember::usize y = 0; y < h; ++y) {
            for (Ember::usize x = 0; x < w; ++x) {
                const Ember::usize sourceX = x * srcW / w;
                const Ember::usize sourceY = y * srcH / h;
                vec[y * w + x] = data[sourceY * srcW + sourceX] / 255.0f;
            }
        }
    }

    stbi_image_free(data);
    return vec;
}
|
||||
|
||||
namespace Ember::dataloaders {
|
||||
// Indexes a dataset laid out as one subdirectory ("type") per class and
// counts the available samples per class.
ImageDataLoader::ImageDataLoader(const std::string& dataDir, const u64 batchSize, const float trainSplit, const u64 threads, const usize width, const usize height)
    : DataLoader(batchSize, trainSplit, threads) {
    this->width = width;
    this->height = height;

    fmt::println("Attempting to open data dir '{}'", dataDir);
    if (!std::filesystem::exists(dataDir) || !std::filesystem::is_directory(dataDir))
        exitWithMsg("Data directory does not exist or is not a directory: " + dataDir, 1);

    this->dataDir = dataDir;

    // Every subdirectory is one class.
    for (const auto& entry : std::filesystem::directory_iterator(this->dataDir))
        if (entry.is_directory())
            types.push_back(entry.path().string());

    fmt::println("Found {} types", types.size());

    samplesPerType.resize(types.size());

    // Count the regular files in each class directory.
    numSamples = 0;
    for (usize typeIdx = 0; typeIdx < types.size(); typeIdx++)
        for (const auto& entry : std::filesystem::directory_iterator(types[typeIdx]))
            if (entry.is_regular_file()) {
                numSamples++;
                samplesPerType[typeIdx]++;
            }

    fmt::println("Using train to test ratio of {:.2f} with approximately {:.0f} train samples and {:.0f} test samples", trainSplit / (1 - trainSplit), numSamples * trainSplit, numSamples * (1 - trainSplit));
}
|
||||
|
||||
void ImageDataLoader::loadBatch(const usize batchIdx) {
|
||||
data[batchIdx].clear();
|
||||
data[batchIdx].reserve(batchSize);
|
||||
|
||||
if (types.empty())
|
||||
exitWithMsg(fmt::format("No types found in '{}'", dataDir), 1);
|
||||
|
||||
std::mutex dataMut;
|
||||
|
||||
#pragma omp parallel for num_threads(threads)
|
||||
for (usize i = 0; i < batchSize; i++) {
|
||||
// Randomly pick a type
|
||||
std::uniform_int_distribution<usize> typeDist(0, types.size() - 1);
|
||||
const usize typeIdx = typeDist(rng);
|
||||
const std::string& typeDir = types[typeIdx];
|
||||
|
||||
// Gather image files in that directory
|
||||
std::vector<std::filesystem::path> imgs;
|
||||
for (const auto& entry : std::filesystem::directory_iterator(typeDir)) {
|
||||
if (entry.is_regular_file())
|
||||
imgs.push_back(entry.path());
|
||||
}
|
||||
|
||||
if (imgs.empty())
|
||||
exitWithMsg(fmt::format("No images found in '{}'", typeDir), 1);
|
||||
|
||||
// Randomly pick an image
|
||||
std::uniform_int_distribution<usize> imgDist(0, imgs.size() * trainSplit - 1);
|
||||
const usize imgIdx = imgDist(rng);
|
||||
|
||||
std::vector<float> input = loadGreyscaleImage(imgs[imgIdx].string(), width, height);
|
||||
std::vector<float> target(types.size(), 0);
|
||||
target[typeIdx] = 1;
|
||||
|
||||
dataMut.lock();
|
||||
data[batchIdx].emplace_back(input, target);
|
||||
dataMut.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
void ImageDataLoader::loadTestSet() {
|
||||
data[currBatch].clear();
|
||||
|
||||
if (types.empty())
|
||||
exitWithMsg(fmt::format("No types found in '{}'", dataDir), 1);
|
||||
|
||||
for (usize typeIdx = 0; typeIdx < types.size(); typeIdx++) {
|
||||
u64 currIdx = 0;
|
||||
for (const auto& entry : std::filesystem::directory_iterator(types[typeIdx])) {
|
||||
if (entry.is_regular_file()) {
|
||||
if (currIdx < samplesPerType[typeIdx] * trainSplit - 1) {
|
||||
currIdx++;
|
||||
continue;
|
||||
}
|
||||
|
||||
std::vector<float> input = loadGreyscaleImage(entry.path().string(), width, height);
|
||||
std::vector<float> target(types.size());
|
||||
target[typeIdx] = 1;
|
||||
|
||||
data[currBatch].emplace_back(input, target);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
96
src/dataloader.h
Normal file
96
src/dataloader.h
Normal file
@@ -0,0 +1,96 @@
|
||||
#pragma once
|
||||
|
||||
#include "types.h"
|
||||
|
||||
#include <vector>
|
||||
#include <future>
|
||||
#include <random>
|
||||
#include <array>
|
||||
|
||||
namespace Ember {
|
||||
namespace internal {
|
||||
// One supervised training example: a flattened input vector and its
// target vector (one-hot for classification).
struct DataPoint {
    std::vector<float> input;
    std::vector<float> target;

    DataPoint(const std::vector<float>& input, const std::vector<float>& target)
        : input(input), target(target) {}
};
|
||||
|
||||
struct DataLoader {
|
||||
u64 threads;
|
||||
u64 batchSize;
|
||||
float trainSplit;
|
||||
|
||||
u64 numSamples;
|
||||
|
||||
usize currBatch;
|
||||
std::future<void> dataFuture;
|
||||
std::array<std::vector<DataPoint>, 2> data;
|
||||
|
||||
DataLoader(const u64 batchSize, const float trainSplit, const u64 threads) {
|
||||
this->threads = threads;
|
||||
this->batchSize = batchSize;
|
||||
this->trainSplit = trainSplit;
|
||||
|
||||
this->numSamples = 0;
|
||||
|
||||
this->currBatch = 0;
|
||||
|
||||
data[0].reserve(batchSize);
|
||||
data[1].reserve(batchSize);
|
||||
}
|
||||
|
||||
// Loads batch into other buffer
|
||||
virtual void loadBatch(const usize batchIdx) = 0;
|
||||
virtual void loadTestSet() = 0;
|
||||
|
||||
bool hasNext() const {
|
||||
return data[currBatch].size() > 0;
|
||||
}
|
||||
|
||||
DataPoint next() {
|
||||
assert(hasNext());
|
||||
const DataPoint dataPoint = data[currBatch].back();
|
||||
data[currBatch].pop_back();
|
||||
return dataPoint;
|
||||
}
|
||||
|
||||
// Attempts to load data asynchronously if threads > 0
|
||||
void asyncPreloadBatch() {
|
||||
dataFuture = std::async(threads > 0 ? std::launch::async : std::launch::deferred, [this]() { loadBatch(currBatch ^ 1); });
|
||||
}
|
||||
|
||||
void waitForBatch() {
|
||||
if (dataFuture.valid())
|
||||
dataFuture.get();
|
||||
}
|
||||
|
||||
const std::vector<DataPoint>& batchData() const {
|
||||
return data[currBatch];
|
||||
}
|
||||
|
||||
void swapBuffers() {
|
||||
currBatch ^= 1;
|
||||
}
|
||||
|
||||
virtual ~DataLoader() = default;
|
||||
};
|
||||
}
|
||||
|
||||
namespace dataloaders {
|
||||
struct ImageDataLoader : internal::DataLoader {
|
||||
std::string dataDir;
|
||||
std::vector<std::string> types;
|
||||
std::vector<u64> samplesPerType;
|
||||
std::mt19937 rng{ std::random_device{}() };
|
||||
|
||||
usize width;
|
||||
usize height;
|
||||
|
||||
ImageDataLoader(const std::string& dataDir, const u64 batchSize, const float trainSplit, const u64 threads = 0, const usize width = 0, const usize height = 0);
|
||||
|
||||
void loadBatch(const usize batchIdx) override;
|
||||
void loadTestSet() override;
|
||||
};
|
||||
}
|
||||
}
|
||||
76
src/layer.h
76
src/layer.h
@@ -2,6 +2,10 @@
|
||||
|
||||
#include "tensor.h"
|
||||
|
||||
#include <utility>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
|
||||
namespace Ember {
|
||||
namespace internal {
|
||||
struct Layer {
|
||||
@@ -21,8 +25,6 @@ namespace Ember {
|
||||
}
|
||||
|
||||
virtual void forward(const Layer& previous) = 0;
|
||||
virtual Tensor<1>& getOutputs() { return values; };
|
||||
virtual const Tensor<1>& getOutputs() const { return values; };
|
||||
|
||||
virtual std::string str() const = 0;
|
||||
virtual ~Layer() = default;
|
||||
@@ -32,15 +34,29 @@ namespace Ember {
|
||||
Tensor<1> weights; // Indexed [previous][current], flattened to prev * size + curr
|
||||
Tensor<1> biases;
|
||||
|
||||
usize threadCount;
|
||||
|
||||
ComputeLayer() = delete;
|
||||
|
||||
ComputeLayer(const usize previousSize, const usize size) : Layer(size) {
|
||||
threadCount = std::max<usize>(1, std::thread::hardware_concurrency());
|
||||
threadCount = std::min<usize>(threadCount, size / 2);
|
||||
|
||||
this->weights.resize(previousSize * size);
|
||||
this->biases.resize(size);
|
||||
}
|
||||
|
||||
void setThreadCount(const usize threadCount) {
|
||||
this->threadCount = std::max<usize>(1, threadCount);
|
||||
this->threadCount = std::min<usize>(threadCount, size / 2);
|
||||
}
|
||||
|
||||
virtual std::tuple<Tensor<1>, Tensor<1>, Tensor<1>> backward(const Layer& previous, const Tensor<1>& gradOutput) const = 0;
|
||||
};
|
||||
|
||||
struct ActivationLayer : Layer {};
|
||||
struct ActivationLayer : Layer {
|
||||
virtual Tensor<1> backward(const Layer& previous, const Tensor<1>& gradOutput) const = 0;
|
||||
};
|
||||
}
|
||||
|
||||
namespace layers {
|
||||
@@ -64,15 +80,55 @@ namespace Ember {
|
||||
const usize inputSize = previous.size;
|
||||
const usize outputSize = size;
|
||||
|
||||
// Move biases into the target vector
|
||||
values = biases;
|
||||
std::vector<std::thread> threads;
|
||||
|
||||
// This instruction tells the compiler to run across all threads
|
||||
#pragma omp parallel for schedule(auto)
|
||||
for (usize prev = 0; prev < inputSize; prev++) {
|
||||
for (usize curr = 0; curr < outputSize; curr++)
|
||||
values[curr] += previous.getOutputs()[prev] * weights[prev * size + curr];
|
||||
threadCount = 1;
|
||||
|
||||
const auto worker = [&](const usize threadId) {
|
||||
// Divide the range across threads
|
||||
const usize start = (outputSize * threadId) / threadCount;
|
||||
const usize end = std::min((outputSize * (threadId + 1)) / threadCount, outputSize);
|
||||
|
||||
for (usize curr = start; curr < end; curr++) {
|
||||
float sum = biases[curr];
|
||||
for (usize prev = 0; prev < inputSize; prev++)
|
||||
sum += previous.values[prev] * weights[prev * size + curr];
|
||||
values[curr] = sum;
|
||||
}
|
||||
};
|
||||
|
||||
// Launch worker threads
|
||||
for (usize t = 1; t < threadCount; t++)
|
||||
threads.emplace_back(worker, t);
|
||||
|
||||
// Run thread 0 on the main thread
|
||||
worker(0);
|
||||
|
||||
// Join all threads
|
||||
for (std::thread& t : threads)
|
||||
if (t.joinable())
|
||||
t.join();
|
||||
}
|
||||
|
||||
// Given the gradient w.r.t. this layer's outputs, computes the gradients
// w.r.t. its inputs, weights, and biases (returned in that order).
std::tuple<Tensor<1>, Tensor<1>, Tensor<1>> backward(const Layer& previous, const Tensor<1>& gradOutput) const override {
    const usize inputSize = previous.size;
    const usize outputSize = size;

    Tensor<1> gradInput(inputSize, 0.0f);
    Tensor<1> weightGrad(weights.size(), 0.0f);
    Tensor<1> biasGrad(size, 0.0f);

    for (usize out = 0; out < outputSize; out++) {
        // dL/db is the output gradient itself.
        biasGrad[out] = gradOutput[out];

        for (usize in = 0; in < inputSize; in++) {
            const usize wIndex = in * outputSize + out;
            // dL/dx accumulates contributions from every output this input feeds.
            gradInput[in] += weights[wIndex] * gradOutput[out];
            // dL/dw = input activation * output gradient.
            weightGrad[wIndex] += previous.values[in] * gradOutput[out];
        }
    }

    return { gradInput, weightGrad, biasGrad };
}
|
||||
|
||||
std::string str() const override {
|
||||
|
||||
169
src/learner.cpp
Normal file
169
src/learner.cpp
Normal file
@@ -0,0 +1,169 @@
|
||||
#include "learner.h"
|
||||
#include "progbar.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
|
||||
namespace Ember {
|
||||
std::vector<internal::Gradient> Learner::backward(const std::vector<float> &target) const {
|
||||
std::vector<internal::Gradient> gradients(net.layers.size());
|
||||
|
||||
Tensor<1> error = lossFunc->backward(net.output(), target);
|
||||
|
||||
for (usize idx = net.layers.size() - 1; idx > 0; idx--) {
|
||||
auto* layer = net.layers[idx].get();
|
||||
|
||||
if (const auto* actLayer = dynamic_cast<internal::ActivationLayer*>(layer)) {
|
||||
error = actLayer->backward(*net.layers[idx - 1], error);
|
||||
}
|
||||
else if (const auto* compLayer = dynamic_cast<internal::ComputeLayer*>(layer)) {
|
||||
auto [gradInput, weightGrad, biasGrad] = compLayer->backward(*net.layers[idx - 1], error);
|
||||
gradients[idx] = internal::Gradient(weightGrad, biasGrad);
|
||||
error = gradInput;
|
||||
}
|
||||
}
|
||||
|
||||
return gradients;
|
||||
}
|
||||
|
||||
void Learner::applyGradients(const usize batchSize, const std::vector<Tensor<1>>& weightGradAccum, const std::vector<Tensor<1>>& biasGradAccum) {
|
||||
const float batchScalar = 1.0f / batchSize;
|
||||
// Apply gradients to weights and biases
|
||||
for (usize l = net.layers.size() - 1; l > 0; l--) {
|
||||
if (const auto& currLayer = dynamic_cast<internal::ComputeLayer*>(net.layers[l].get())) {
|
||||
assert(optimizer.weightGradients[l].size() == currLayer->weights.size());
|
||||
assert(optimizer.biasGradients[l].size() == currLayer->biases.size());
|
||||
|
||||
for (usize i = 0; i < optimizer.weightGradients[l].size(); i++)
|
||||
optimizer.weightGradients[l][i] += weightGradAccum[l][i] * batchScalar;
|
||||
for (usize i = 0; i < currLayer->size; i++)
|
||||
optimizer.biasGradients[l][i] += biasGradAccum[l][i] * batchScalar;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Learner::learn(const float lr, const usize epochs, usize threads) {
|
||||
if (threads == 0)
|
||||
threads = std::thread::hardware_concurrency();
|
||||
if (threads == 0) {
|
||||
std::cerr << "Failed to detect number of threads. Defaulting to 1" << std::endl;
|
||||
threads = 1;
|
||||
}
|
||||
|
||||
const u64 batchSize = dataLoader.batchSize;
|
||||
const u64 batchesPerEpoch = dataLoader.numSamples / batchSize;
|
||||
|
||||
fmt::println("Training for {} batches with {} batches per epoch", batchesPerEpoch * epochs, batchesPerEpoch);
|
||||
|
||||
std::cout << "Epoch Train loss Test loss Test accuracy\n\n" << std::endl;
|
||||
|
||||
// Returns { test loss, test accuracy }
|
||||
const auto getTestLossAcc = [&]() {
|
||||
float loss = 0;
|
||||
usize numCorrect = 0;
|
||||
dataLoader.loadTestSet();
|
||||
const usize testSize = dataLoader.batchData().size();
|
||||
while (dataLoader.hasNext()) {
|
||||
internal::DataPoint data = dataLoader.next();
|
||||
net.forward(data.input);
|
||||
loss += lossFunc->forward(net.layers.back()->values, data.target);
|
||||
usize guess = 0;
|
||||
usize goal = 0;
|
||||
for (usize i = 0; i < data.target.size(); i++) {
|
||||
if (net.layers.back()->values[i] > net.layers.back()->values[guess])
|
||||
guess = i;
|
||||
if (data.target[i] > data.target[goal])
|
||||
goal = i;
|
||||
}
|
||||
numCorrect += (guess == goal);
|
||||
}
|
||||
return std::pair<float, float>{ loss / (testSize ? testSize : 1), numCorrect / static_cast<float>(testSize ? testSize : 1) };
|
||||
};
|
||||
|
||||
// Initialize accumulators
|
||||
std::vector<Tensor<1>> weightGradAccum(net.layers.size());
|
||||
std::vector<Tensor<1>> biasGradAccum(net.layers.size());
|
||||
|
||||
for (usize i = 1; i < net.layers.size(); i++) {
|
||||
if (const auto* compLayer = dynamic_cast<internal::ComputeLayer*>(net.layers[i].get())) {
|
||||
weightGradAccum[i].resize(compLayer->weights.size());
|
||||
biasGradAccum[i].resize(compLayer->biases.size());
|
||||
}
|
||||
}
|
||||
|
||||
// Preload first batch
|
||||
dataLoader.asyncPreloadBatch();
|
||||
|
||||
// Main loop
|
||||
for (usize epoch = 0; epoch < epochs; epoch++) {
|
||||
double trainLoss = 0;
|
||||
|
||||
ProgressBar progressBar{};
|
||||
|
||||
for (u64 batchIdx = 0; batchIdx < batchesPerEpoch; batchIdx++) {
|
||||
// Reset accumulators per mini-batch
|
||||
for (auto& t : weightGradAccum)
|
||||
t.fill(0);
|
||||
for (auto& t : biasGradAccum)
|
||||
t.fill(0);
|
||||
|
||||
dataLoader.waitForBatch();
|
||||
dataLoader.swapBuffers();
|
||||
|
||||
// Instantly start loading next batch
|
||||
dataLoader.asyncPreloadBatch();
|
||||
|
||||
for (u64 sample = 0; sample < batchSize; sample++) {
|
||||
const internal::DataPoint& data = dataLoader.next();
|
||||
|
||||
net.forward(data.input);
|
||||
|
||||
// Accumulate training loss
|
||||
trainLoss += lossFunc->forward(net.layers.back()->values, data.target);
|
||||
|
||||
const auto gradients = backward(data.target);
|
||||
|
||||
// Accumulate gradients
|
||||
for (usize l = 1; l < net.layers.size(); l++) {
|
||||
const auto& prevLayer = net.layers[l - 1];
|
||||
if (const auto* compLayer = dynamic_cast<internal::ComputeLayer*>(net.layers[l].get())) {
|
||||
for (usize i = 0; i < compLayer->size; i++) {
|
||||
for (usize j = 0; j < prevLayer->size; j++) {
|
||||
const usize idx = j * compLayer->size + i;
|
||||
assert(l < weightGradAccum.size());
|
||||
assert(idx < weightGradAccum[l].size());
|
||||
assert(idx < gradients[l].weightGrad.size());
|
||||
|
||||
weightGradAccum[l][idx] += gradients[l].weightGrad[idx];
|
||||
}
|
||||
|
||||
assert(l < biasGradAccum.size());
|
||||
assert(i < biasGradAccum[l].size());
|
||||
assert(i < gradients[l].biasGrad.size());
|
||||
|
||||
biasGradAccum[l][i] += gradients[l].biasGrad[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
applyGradients(batchSize, weightGradAccum, biasGradAccum);
|
||||
optimizer.clipGrad(1);
|
||||
optimizer.step(lr);
|
||||
optimizer.zeroGrad();
|
||||
|
||||
internal::cursor::up();
|
||||
internal::cursor::up();
|
||||
internal::cursor::begin();
|
||||
fmt::println("{:>5L}{:>14.5f}{:>13}{:>17}", epoch, trainLoss / batchIdx / batchSize, "Pending", "Pending");
|
||||
std::cout << progressBar.report(batchIdx, batchesPerEpoch, 63) << " " << std::endl;
|
||||
}
|
||||
const auto [testLoss, testAccuracy] = getTestLossAcc();
|
||||
|
||||
internal::cursor::up();
|
||||
internal::cursor::clear();
|
||||
internal::cursor::up();
|
||||
fmt::println("{:>5L}{:>14.5f}{:>13.5f}{:>17.2f}%\n\n", epoch, trainLoss / batchesPerEpoch / batchSize, testLoss, testAccuracy * 100);
|
||||
}
|
||||
}
|
||||
}
|
||||
41
src/learner.h
Normal file
41
src/learner.h
Normal file
@@ -0,0 +1,41 @@
|
||||
#pragma once
|
||||
|
||||
#include "activation.h"
|
||||
#include "dataloader.h"
|
||||
#include "optimizer.h"
|
||||
#include "loss.h"
|
||||
|
||||
namespace Ember {
|
||||
namespace internal {
|
||||
struct Gradient {
|
||||
Tensor<1> weightGrad;
|
||||
Tensor<1> biasGrad;
|
||||
|
||||
Gradient() = default;
|
||||
Gradient(const Tensor<1>& weightGrad, const Tensor<1>& biasGrad) : weightGrad(weightGrad), biasGrad(biasGrad) {}
|
||||
};
|
||||
}
|
||||
|
||||
struct Learner {
|
||||
Network& net;
|
||||
internal::DataLoader& dataLoader;
|
||||
internal::Optimizer& optimizer;
|
||||
std::unique_ptr<internal::LossFunction> lossFunc;
|
||||
|
||||
template<typename LossFunction>
|
||||
Learner(Network& net, internal::DataLoader& dataLoader, internal::Optimizer& optimizer, const LossFunction&& lossFunc) : net(net), dataLoader(dataLoader), optimizer(optimizer) {
|
||||
this->lossFunc = std::make_unique<std::decay_t<LossFunction>>(lossFunc);
|
||||
}
|
||||
|
||||
// Returns a vector of gradients
|
||||
// RETURNS VALUES ORDERED FROM LAST TO FIRST LAYER
|
||||
std::vector<internal::Gradient> backward(const std::vector<float>& target) const;
|
||||
|
||||
// Apply a gradient to the optimizer
|
||||
void applyGradients(const usize batchSize, const std::vector<Tensor<1>>& weightGradAccum, const std::vector<Tensor<1>>& biasGradAccum);
|
||||
|
||||
// Main trainer functionality is through this function
|
||||
// Trains a neural network
|
||||
void learn(const float lr, const usize epochs, usize threads = 0);
|
||||
};
|
||||
}
|
||||
23
src/loss.cpp
Normal file
23
src/loss.cpp
Normal file
@@ -0,0 +1,23 @@
|
||||
#include "loss.h"
|
||||
|
||||
namespace Ember::loss {
|
||||
float MeanSquaredError::forward(const Tensor<1>& output, const std::vector<float>& target) {
|
||||
assert(output.size() == target.size());
|
||||
|
||||
float loss = 0;
|
||||
for (usize i = 0; i < output.size(); i++)
|
||||
loss += std::pow(output[i] - target[i], 2);
|
||||
return loss / output.size();
|
||||
}
|
||||
|
||||
// Gradient of MSE w.r.t. each output component: 2 * (output - target) / n.
// Returns an empty gradient for an empty output (avoids 2 / 0 == inf).
Tensor<1> MeanSquaredError::backward(const Tensor<1>& output, const std::vector<float>& target) {
    Tensor<1> gradient;
    gradient.resize(output.size());

    if (output.size() == 0)
        return gradient;

    const float scalar = 2.0f / output.size();

    for (usize i = 0; i < output.size(); i++)
        gradient[i] = (output[i] - target[i]) * scalar;
    return gradient;
}
|
||||
}
|
||||
22
src/loss.h
Normal file
22
src/loss.h
Normal file
@@ -0,0 +1,22 @@
|
||||
#pragma once
|
||||
#include <vector>
|
||||
|
||||
#include "layer.h"
|
||||
|
||||
namespace Ember {
|
||||
namespace internal {
|
||||
struct LossFunction {
|
||||
virtual float forward(const Tensor<1>& output, const std::vector<float>& target) = 0;
|
||||
virtual Tensor<1> backward(const Tensor<1>& output, const std::vector<float>& target) = 0;
|
||||
|
||||
virtual ~LossFunction() = default;
|
||||
};
|
||||
}
|
||||
|
||||
namespace loss {
|
||||
struct MeanSquaredError : internal::LossFunction {
|
||||
float forward(const Tensor<1>& output, const std::vector<float> &target) override;
|
||||
Tensor<1> backward(const Tensor<1>& output, const std::vector<float> &target) override;
|
||||
};
|
||||
}
|
||||
}
|
||||
@@ -1,15 +1,21 @@
|
||||
#include "network.h"
|
||||
|
||||
namespace Ember {
|
||||
void Network::setMode(const NetworkMode mode) {
|
||||
for (usize i = 1; i < layers.size(); i++)
|
||||
if (auto* layer = dynamic_cast<internal::ComputeLayer*>(layers[i].get()); layer != nullptr)
|
||||
layer->setThreadCount(mode == NetworkMode::EVAL ? std::thread::hardware_concurrency() : 1);
|
||||
}
|
||||
|
||||
void Network::forward(const Tensor<1>& input) {
|
||||
layers[0]->getOutputs() = input;
|
||||
layers[0]->values = input;
|
||||
|
||||
for (usize i = 1; i < layers.size(); i++)
|
||||
layers[i]->forward(*layers[i - 1]);
|
||||
}
|
||||
|
||||
// The final layer's values after the most recent forward() call.
const Tensor<1>& Network::output() const {
    return layers.back()->values;
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const Network& net) {
|
||||
|
||||
@@ -6,6 +6,11 @@
|
||||
#include <memory>
|
||||
|
||||
namespace Ember {
|
||||
// Controls per-layer threading (see Network::setMode): EVAL parallelizes
// each forward pass, TRAIN keeps compute layers single-threaded.
enum class NetworkMode {
    EVAL,
    TRAIN
};
|
||||
|
||||
struct Network {
|
||||
std::vector<std::unique_ptr<internal::Layer>> layers;
|
||||
|
||||
@@ -53,6 +58,8 @@ namespace Ember {
|
||||
_init(true, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
void setMode(const NetworkMode mode);
|
||||
|
||||
void forward(const Tensor<1>& input);
|
||||
const Tensor<1>& output() const;
|
||||
|
||||
|
||||
111
src/optimizer.cpp
Normal file
111
src/optimizer.cpp
Normal file
@@ -0,0 +1,111 @@
|
||||
#include "optimizer.h"
|
||||
|
||||
namespace Ember {
|
||||
namespace internal {
|
||||
// Allocates gradient buffers matching every compute layer's weight and
// bias shapes; slots for non-compute layers stay empty.
Optimizer::Optimizer(Network& net) : net(net) {
    weightGradients.resize(net.layers.size());
    biasGradients.resize(net.layers.size());

    for (usize i = 1; i < net.layers.size(); i++) {
        const auto* layer = dynamic_cast<ComputeLayer*>(net.layers[i].get());
        if (layer == nullptr)
            continue;

        weightGradients[i].resize(layer->weights.size());
        biasGradients[i].resize(layer->biases.size());
    }
}
|
||||
|
||||
void Optimizer::zeroGrad() {
|
||||
for (Tensor<1>& grad : weightGradients)
|
||||
grad.fill(0);
|
||||
|
||||
for (Tensor<1>& grad : biasGradients)
|
||||
grad.fill(0);
|
||||
}
|
||||
|
||||
void Optimizer::clipGrad(const float maxNorm) {
|
||||
// Compute total norm of all gradients (weights and biases) across all layers
|
||||
double totalNormSq = 0.0;
|
||||
// Weights gradients
|
||||
for (const auto& layerGradients : weightGradients)
|
||||
for (const float wg : layerGradients)
|
||||
totalNormSq += wg * wg;
|
||||
|
||||
// Bias gradients
|
||||
for (const auto& layerGradients : biasGradients)
|
||||
for (const float bg : layerGradients)
|
||||
totalNormSq += bg * bg;
|
||||
|
||||
const float totalNorm = std::sqrt(totalNormSq);
|
||||
|
||||
// Scale all gradients if needed
|
||||
if (totalNorm > maxNorm && totalNorm > 0.0f) {
|
||||
const float scale = maxNorm / totalNorm;
|
||||
|
||||
// Weights gradients
|
||||
for (auto& layerGradients : weightGradients)
|
||||
for (float& wg : layerGradients)
|
||||
wg *= scale;
|
||||
|
||||
// Bias gradients
|
||||
for (auto& layerGradients : biasGradients)
|
||||
for (float& bg : layerGradients)
|
||||
bg *= scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace optimizers {
|
||||
// Sets up per-layer velocity buffers mirroring the weight/bias shapes of
// every compute layer; non-compute slots stay empty.
SGD::SGD(Network& net, const float momentum) : Optimizer(net), momentum(momentum) {
    weightVelocities.resize(net.layers.size());
    biasVelocities.resize(net.layers.size());

    for (usize i = 1; i < net.layers.size(); i++) {
        const auto* layer = dynamic_cast<internal::ComputeLayer*>(net.layers[i].get());
        if (layer == nullptr)
            continue;

        weightVelocities[i].resize(layer->weights.size());
        biasVelocities[i].resize(layer->biases.size());
    }
}
|
||||
|
||||
// Copies the full optimizer state — gradient buffers via the base class,
// velocities and momentum here. The referenced network is shared, not copied.
SGD::SGD(const SGD& other)
    : Optimizer(other),
      weightVelocities(other.weightVelocities),
      biasVelocities(other.biasVelocities),
      momentum(other.momentum) {}
|
||||
|
||||
void SGD::step(const float lr) {
|
||||
for (usize lIdx = 1; lIdx < net.layers.size(); lIdx++) {
|
||||
std::unique_ptr<internal::Layer>& l = net.layers[lIdx];
|
||||
auto* layer = dynamic_cast<internal::ComputeLayer*>(l.get());
|
||||
if (!layer)
|
||||
continue;
|
||||
|
||||
assert(weightVelocities[lIdx].size() == layer->weights.size());
|
||||
assert(biasVelocities[lIdx].size() == layer->biases.size());
|
||||
assert(weightGradients[lIdx].size() == layer->weights.size());
|
||||
assert(biasGradients[lIdx].size() == layer->biases.size());
|
||||
|
||||
// Update weights with momentum
|
||||
for (usize i = 0; i < layer->weights.size(); i++) {
|
||||
weightVelocities[lIdx][i] = momentum * weightVelocities[lIdx][i] - lr * weightGradients[lIdx][i];
|
||||
layer->weights[i] += weightVelocities[lIdx][i];
|
||||
}
|
||||
|
||||
// Update biases with momentum
|
||||
for (usize i = 0; i < layer->biases.size(); i++) {
|
||||
biasVelocities[lIdx][i] = momentum * biasVelocities[lIdx][i] - lr * biasGradients[lIdx][i];
|
||||
layer->biases[i] += biasVelocities[lIdx][i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<internal::Optimizer> SGD::clone() const {
|
||||
return std::make_unique<SGD>(*this);
|
||||
}
|
||||
}
|
||||
}
|
||||
46
src/optimizer.h
Normal file
46
src/optimizer.h
Normal file
@@ -0,0 +1,46 @@
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "network.h"
|
||||
|
||||
namespace Ember {
|
||||
namespace internal {
|
||||
struct Optimizer {
|
||||
Network& net;
|
||||
|
||||
std::vector<Tensor<1>> weightGradients;
|
||||
std::vector<Tensor<1>> biasGradients;
|
||||
|
||||
explicit Optimizer(Network& net);
|
||||
|
||||
Optimizer(const Optimizer& other) : net(other.net), weightGradients(other.weightGradients), biasGradients(other.biasGradients) {}
|
||||
|
||||
void zeroGrad();
|
||||
|
||||
void clipGrad(const float maxNorm);
|
||||
|
||||
virtual void step(float lr) = 0;
|
||||
virtual std::unique_ptr<Optimizer> clone() const = 0;
|
||||
|
||||
virtual ~Optimizer() = default;
|
||||
};
|
||||
}
|
||||
|
||||
namespace optimizers {
|
||||
// Stochastic gradient descent with classical momentum:
//   v = momentum * v - lr * grad;  param += v
// (see SGD::step in optimizer.cpp). Velocities persist across steps.
struct SGD : internal::Optimizer {
    // Momentum velocity buffers, parallel to the base class's gradient buffers.
    std::vector<Tensor<1>> weightVelocities;
    std::vector<Tensor<1>> biasVelocities;

    // Velocity decay factor in [0, 1); 0 reduces to plain SGD.
    float momentum;

    // NOTE(review): not `explicit`, and `momentum` is defaulted, so a Network
    // implicitly converts to SGD — consider marking explicit.
    SGD(Network& net, const float momentum = 0.9f);

    SGD(const SGD& other);

    // One momentum update over all layer weights and biases.
    void step(const float lr) override;

    std::unique_ptr<Optimizer> clone() const override;
};
|
||||
}
|
||||
}
|
||||
39
src/progbar.h
Normal file
39
src/progbar.h
Normal file
@@ -0,0 +1,39 @@
|
||||
#pragma once
|
||||
|
||||
#include "stopwatch.h"
|
||||
#include "../external/fmt/format.h"
|
||||
|
||||
#include <sstream>
|
||||
|
||||
namespace Ember {
|
||||
|
||||
|
||||
struct ProgressBar {
|
||||
Stopwatch<std::chrono::milliseconds> start;
|
||||
|
||||
ProgressBar() {
|
||||
start.start();
|
||||
}
|
||||
|
||||
std::string report(const u64 progress, const u64 total, const u64 barWidth) {
|
||||
std::ostringstream out;
|
||||
|
||||
out << fmt::format("{:>4.0f}% ", static_cast<float>(progress * 100) / total);
|
||||
|
||||
const u64 pos = barWidth * progress / total;
|
||||
out << "\u2595";
|
||||
for (u64 i = 0; i < barWidth - 1; ++i) {
|
||||
if (i < pos) out << "\u2588";
|
||||
else out << " ";
|
||||
}
|
||||
out << "\u258F";
|
||||
|
||||
const u64 elapsed = std::max<u64>(start.elapsed(), 1);
|
||||
const u64 msRemaining = (total - progress) * elapsed / std::max<u64>(progress, 1);
|
||||
|
||||
out << fmt::format(" {}/{} at {:.2f} per sec with {} remaining", progress, total, static_cast<float>(progress) / elapsed * 1000, formatTime(msRemaining));
|
||||
|
||||
return out.str();
|
||||
}
|
||||
};
|
||||
}
|
||||
64
src/stopwatch.h
Normal file
64
src/stopwatch.h
Normal file
@@ -0,0 +1,64 @@
|
||||
#pragma once
|
||||
|
||||
#include <chrono>
|
||||
|
||||
#include "types.h"
|
||||
|
||||
namespace Ember {
|
||||
// Format a millisecond duration as "Xh Ym Zs", omitting leading zero units
// (e.g. 90500 -> "1m 30s"); durations under one second fall back to "Nms".
inline std::string formatTime(const u64 timeInMS) {
    long long seconds = timeInMS / 1000;
    const long long hours = seconds / 3600;
    seconds %= 3600;
    const long long minutes = seconds / 60;
    seconds %= 60;

    std::string result;

    if (hours > 0)
        result += std::to_string(hours) + "h ";
    // Once a larger unit has printed, always print the smaller ones ("1h 0m 0s").
    if (minutes > 0 || hours > 0)
        result += std::to_string(minutes) + "m ";
    if (seconds > 0 || minutes > 0 || hours > 0)
        result += std::to_string(seconds) + "s";

    // Nothing printed at all means the duration is sub-second.
    if (result.empty())
        return std::to_string(timeInMS) + "ms";
    return result;
}
|
||||
|
||||
template<typename Precision>
|
||||
class Stopwatch {
|
||||
std::chrono::high_resolution_clock::time_point startTime;
|
||||
std::chrono::high_resolution_clock::time_point pauseTime;
|
||||
|
||||
bool paused;
|
||||
|
||||
u64 pausedTime;
|
||||
|
||||
public:
|
||||
Stopwatch() { start(); }
|
||||
|
||||
void start() {
|
||||
startTime = std::chrono::high_resolution_clock::now();
|
||||
pausedTime = 0;
|
||||
paused = false;
|
||||
}
|
||||
|
||||
void reset() { start(); }
|
||||
|
||||
u64 elapsed() {
|
||||
u64 pausedTime = this->pausedTime;
|
||||
if (paused)
|
||||
pausedTime += std::chrono::duration_cast<Precision>(std::chrono::high_resolution_clock::now() - pauseTime).count();
|
||||
return std::chrono::duration_cast<Precision>(std::chrono::high_resolution_clock::now() - startTime).count() - pausedTime;
|
||||
}
|
||||
|
||||
void pause() {
|
||||
paused = true;
|
||||
pauseTime = std::chrono::high_resolution_clock::now();
|
||||
}
|
||||
void resume() {
|
||||
paused = false;
|
||||
pausedTime += std::chrono::duration_cast<Precision>(std::chrono::high_resolution_clock::now() - pauseTime).count();
|
||||
}
|
||||
};
|
||||
}
|
||||
13
src/tensor.h
13
src/tensor.h
@@ -4,6 +4,8 @@
|
||||
|
||||
#include "../external/fmt/format.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace Ember {
|
||||
// Tensor recursive case
|
||||
template<usize dimensionality>
|
||||
@@ -25,6 +27,11 @@ namespace Ember {
|
||||
subTensor.resize(restDims...);
|
||||
}
|
||||
|
||||
// Set every element of this tensor to `value` by recursing into each
// sub-tensor along the first dimension (base case is the 1-D overload).
void fill(const float value) {
    for (auto& subTensor : data)
        subTensor.fill(value);
}
|
||||
|
||||
usize size() const { return data.size(); }
|
||||
|
||||
auto begin() { return data.begin(); }
|
||||
@@ -48,12 +55,16 @@ namespace Ember {
|
||||
|
||||
Tensor() = default;
|
||||
Tensor(const std::vector<float> &data) : data(data) {}
|
||||
explicit Tensor(const usize size) : data(size) {}
|
||||
explicit Tensor(const usize size, const float def = 0.0f) : data(size, def) {}
|
||||
|
||||
// Resize the flat storage to `size` elements; new elements are
// value-initialized to 0.0f by std::vector.
void resize(const usize size) {
    data.resize(size);
}
|
||||
|
||||
// Base case of Tensor::fill: overwrite every scalar in the flat storage.
void fill(const float value) {
    std::fill(data.begin(), data.end(), value);
}
|
||||
|
||||
usize size() const { return data.size(); }
|
||||
|
||||
auto begin() { return data.begin(); }
|
||||
|
||||
35
src/types.h
35
src/types.h
@@ -1,10 +1,15 @@
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
#ifdef _WIN32
|
||||
#define NOMINMAX
|
||||
#include <windows.h>
|
||||
#include <fcntl.h>
|
||||
#include <io.h>
|
||||
#endif
|
||||
|
||||
namespace Ember {
|
||||
#define exitWithMsg(msg, code) \
|
||||
@@ -24,4 +29,30 @@ std::exit(code); \
|
||||
using i8 = int8_t;
|
||||
|
||||
using usize = size_t;
|
||||
|
||||
namespace internal {
|
||||
// ANSI/VT100 escape-sequence helpers for terminal cursor control.
// Each writes its sequence to `out` (std::cout by default) without flushing.
namespace cursor {
    // Clear the entire screen, then move the cursor to the top-left corner.
    [[maybe_unused]] inline void clearAll(std::ostream& out = std::cout) { out << "\033[2J\033[H"; }
    // Erase the current line and return the cursor to its start.
    [[maybe_unused]] inline void clear(std::ostream& out = std::cout) { out << "\033[2K\r"; }
    // Erase from the cursor position to the end of the screen.
    [[maybe_unused]] inline void clearDown(std::ostream& out = std::cout) { out << "\x1b[J"; }
    // Move the cursor to the top-left corner without clearing anything.
    [[maybe_unused]] inline void home(std::ostream& out = std::cout) { out << "\033[H"; }
    // Move the cursor one line up / down.
    [[maybe_unused]] inline void up(std::ostream& out = std::cout) { out << "\033[A"; }
    [[maybe_unused]] inline void down(std::ostream& out = std::cout) { out << "\033[B"; }
    // Move the cursor to column 1 of the current line.
    [[maybe_unused]] inline void begin(std::ostream& out = std::cout) { out << "\033[1G"; }
    // Move the cursor to 1-based (column x, row y).
    [[maybe_unused]] inline void goTo(const usize x, const usize y, std::ostream& out = std::cout) { out << "\033[" << y << ";" << x << "H"; }

    // Hide / show the terminal cursor (DECTCEM).
    [[maybe_unused]] inline void hide(std::ostream& out = std::cout) { out << "\033[?25l"; }
    [[maybe_unused]] inline void show(std::ostream& out = std::cout) { out << "\033[?25h"; }
}
|
||||
|
||||
// On Windows, switch the console output code page to UTF-8 so the Unicode
// progress-bar glyphs render correctly; a no-op on other platforms.
struct UnicodeTerminalInitializer {
    UnicodeTerminalInitializer() {
#ifdef _WIN32
        SetConsoleOutputCP(CP_UTF8);
#endif
    }
};

// Runs the initializer before main(). NOTE(review): `static inline` gives this
// internal linkage, so every translation unit including this header gets its
// own instance — presumably intentional (the call is idempotent); confirm.
static inline UnicodeTerminalInitializer unicodeTerminalInitializer;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user