From f728261354fcf47be177dde12632bbd0601628cd Mon Sep 17 00:00:00 2001
From: StepanovPlaton <mail2platon@yandex.ru>
Date: Thu, 30 Oct 2025 23:26:53 +0400
Subject: [PATCH] First NN forward

---
 src/Makefile                           |  14 +-
 src/benchmark.cpp                      | 136 +++++++++++++++++++
 src/main.cpp                           | 179 ++++++++-----------------
 src/math/matrix/gpu/matrix.cpp         |  28 +++-
 src/math/matrix/gpu/matrix.hpp         |   9 +-
 src/math/matrix/gpu/mutable_matrix.cpp |   7 +
 src/math/matrix/gpu/mutable_matrix.hpp |   6 +
 src/math/matrix/matrix.hpp             |   1 +
 src/math/matrix/mutable_matrix.hpp     |   2 +-
 src/math/opencl/opencl.hpp             |   2 +-
 10 files changed, 254 insertions(+), 130 deletions(-)
 create mode 100644 src/benchmark.cpp
diff --git a/src/Makefile b/src/Makefile
index 3c13673..8d6b1b3 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -2,13 +2,18 @@ CXX = g++
 CXXFLAGS = -Wall -Wextra -O2 -std=c++23
 LIBS = -lOpenCL
 TARGET = main
-SRC = main.cpp ./math/opencl/opencl.cpp ./math/matrix/cpu/matrix.cpp ./math/matrix/cpu/mutable_matrix.cpp ./math/matrix/gpu/matrix.cpp ./math/matrix/gpu/mutable_matrix.cpp
+COMMON_SRC = ./math/opencl/opencl.cpp ./math/matrix/cpu/matrix.cpp ./math/matrix/cpu/mutable_matrix.cpp ./math/matrix/gpu/matrix.cpp ./math/matrix/gpu/mutable_matrix.cpp
+MAIN_SRC = main.cpp $(COMMON_SRC)
+BENCHMARK_SRC = benchmark.cpp $(COMMON_SRC)
 
 INCLUDES = -I"A:/Programs/OpenCL/include" 
 LIB_PATH = -L"A:/Programs/OpenCL/lib"
 
-$(TARGET): $(SRC)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) $(LIB_PATH) -o $(TARGET) $(SRC) $(LIBS)
+$(TARGET): $(MAIN_SRC)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) $(LIB_PATH) -o $(TARGET) $(MAIN_SRC) $(LIBS)
+
+benchmark: $(BENCHMARK_SRC) # NOTE(review): the recipe links into $(TARGET), so it overwrites the main binary and never creates a file named "benchmark"
+	$(CXX) $(CXXFLAGS) $(INCLUDES) $(LIB_PATH) -o $(TARGET) $(BENCHMARK_SRC) $(LIBS)
 
 clean:
 	rm -f $(TARGET)
@@ -16,4 +21,7 @@ clean:
 run: $(TARGET)
 	./$(TARGET)
 
+run_benchmark: benchmark # executes $(TARGET) because the benchmark rule writes its output to that name
+	./$(TARGET)
+
 .PHONY: clean run
\ No newline at end of file
diff --git a/src/benchmark.cpp b/src/benchmark.cpp
new file mode 100644
index 0000000..d3e1492
--- /dev/null
+++ b/src/benchmark.cpp
@@ -0,0 +1,136 @@
+#include <chrono>
+#include <iostream>
+#include <random>
+#include <stdexcept>
+#include <vector>
+
+#include "./math/math.hpp"
+
+typedef Matrices::CPU Matrix;
+typedef MutableMatrices::CPU MutableMatrix;
+
+OpenCL openCL;
+
+// Returns rows * cols floats drawn uniformly from [-1, 1) as a flat vector.
+std::vector<float> generateRandomMatrix(int rows, int cols) {
+  std::random_device seedSource;
+  std::mt19937 engine(seedSource());
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+  std::vector<float> values(rows * cols);
+  // Fill front to back so the engine is consumed in storage order.
+  for (float &cell : values)
+    cell = uniform(engine);
+  return values;
+}
+// Builds a size x size identity matrix stored as a flat vector.
+std::vector<float> generateIdentityMatrix(int size) {
+  std::vector<float> identity(size * size, 0.0f);
+  for (int row = 0; row < size; ++row)
+    identity[row * (size + 1)] = 1.0f; // flat index of cell (row, row)
+  return identity;
+}
+
+int main() { // Times 10 chained mult() calls on the CPU, then on the GPU.
+  const int SIZE = 1024;
+
+  std::cout << "Testing with " << SIZE << "x" << SIZE << " matrices..."
+            << std::endl;
+
+  std::vector<float> matrixA = generateRandomMatrix(SIZE, SIZE);
+  std::vector<float> matrixB = generateRandomMatrix(SIZE, SIZE);
+  std::vector<float> matrixC = generateRandomMatrix(SIZE, SIZE);
+
+  // std::vector<float> matrixA = generateIdentityMatrix(SIZE);
+  // std::vector<float> matrixB = generateIdentityMatrix(SIZE);
+  // std::vector<float> matrixC = generateIdentityMatrix(SIZE);
+
+  // CPU benchmark
+  {
+    std::cout << "\n=== CPU Version ===" << std::endl;
+
+    auto start = std::chrono::high_resolution_clock::now();
+
+    MutableMatrices::CPU a(SIZE, SIZE, matrixA);
+    Matrices::CPU b(SIZE, SIZE, matrixB);
+    Matrices::CPU c(SIZE, SIZE, matrixC); // constructed but not used below
+
+    auto gen_end = std::chrono::high_resolution_clock::now();
+
+    auto op_start = std::chrono::high_resolution_clock::now();
+
+    for (int i = 0; i < 10; i++) {
+      a.mult(b, 0.2f, MutableMatrices::CPU::Activate::SIGMOID);
+    }
+
+    auto op_end = std::chrono::high_resolution_clock::now();
+
+    std::vector<float> v = a.toVector();
+
+    auto total_end = std::chrono::high_resolution_clock::now();
+
+    auto gen_duration =
+        std::chrono::duration_cast<std::chrono::milliseconds>(gen_end - start);
+    auto op_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+        op_end - op_start);
+    auto total_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+        total_end - start);
+
+    std::cout << "Matrix generation time: " << gen_duration.count() << " ms"
+              << std::endl;
+    std::cout << "Operations time: " << op_duration.count() << " ms"
+              << std::endl;
+    std::cout << "Total time: " << total_duration.count() << " ms" << std::endl;
+
+    std::cout << "First few elements: ";
+    for (size_t i = 0; i < 5 && i < v.size(); ++i) {
+      std::cout << v[i] << " ";
+    }
+    std::cout << std::endl;
+  }
+
+  // GPU benchmark
+  {
+    std::cout << "\n=== GPU Version ===" << std::endl;
+
+    auto start = std::chrono::high_resolution_clock::now();
+
+    MutableMatrices::GPU a(SIZE, SIZE, matrixA);
+    Matrices::GPU b(SIZE, SIZE, matrixB);
+    Matrices::GPU c(SIZE, SIZE, matrixC); // constructed but not used below
+
+    auto gen_end = std::chrono::high_resolution_clock::now();
+
+    auto op_start = std::chrono::high_resolution_clock::now();
+
+    for (int i = 0; i < 10; i++) {
+      a.mult(b, 0.2f, MutableMatrices::GPU::Activate::SIGMOID, 0.0f);
+    }
+
+    auto op_end = std::chrono::high_resolution_clock::now();
+
+    std::vector<float> v = a.toVector();
+
+    auto total_end = std::chrono::high_resolution_clock::now();
+
+    auto gen_duration =
+        std::chrono::duration_cast<std::chrono::milliseconds>(gen_end - start);
+    auto op_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+        op_end - op_start);
+    auto total_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+        total_end - start);
+
+    std::cout << "Matrix generation time: " << gen_duration.count() << " ms"
+              << std::endl;
+    std::cout << "Operations time: " << op_duration.count() << " ms"
+              << std::endl;
+    std::cout << "Total time: " << total_duration.count() << " ms" << std::endl;
+
+    std::cout << "First few elements: ";
+    for (size_t i = 0; i < 5 && i < v.size(); ++i) {
+      std::cout << v[i] << " ";
+    }
+    std::cout << std::endl;
+  }
+
+  return 0;
+}
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index d3e1492..cb9731c 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,133 +1,72 @@
-#include <chrono>
-#include <iostream>
-#include <random>
-#include <stdexcept>
-#include <vector>
-
 #include "./math/math.hpp"
 
-typedef Matrices::CPU Matrix;
-typedef MutableMatrices::CPU MutableMatrix;
+#include <chrono>
+#include <thread>
+
+typedef Matrices::GPU M;
+typedef MutableMatrices::GPU MM;
+
+class Layer { // Per-layer settings read by NeuralNetwork; holds no weights.
+protected:
+  int features;          // layer width; used as a weight-matrix dimension
+  float bias;            // passed to MM::mult by NeuralNetwork::predict
+  MM::Activate activate; // activation kind passed to MM::mult
+  float alpha;           // activation parameter passed to MM::mult
+public:
+  // Initializers follow member declaration order (avoids -Wreorder).
+  Layer(int features, MM::Activate activate = MM::Activate::LINEAR,
+        float bias = 0.0f, float alpha = 0.0f)
+      : features(features), bias(bias), activate(activate), alpha(alpha) {}
+
+  int getFeatures() const { return features; }
+  float getBias() const { return bias; }
+  MM::Activate getActivate() const { return activate; }
+  float getAlpha() const { return alpha; }
+};
+
+class NeuralNetwork { // Feed-forward network evaluated by chaining MM::mult.
+private:
+  std::vector<Layer> layers; // settings for each layer, in forward order
+  std::vector<MM> weights;   // weights[k]: (inputs of layer k) x (its features)
+public:
+  // n is the input width; each matrix takes the previous width as its rows.
+  NeuralNetwork(int n, std::initializer_list<Layer> l) : layers(l) {
+    for (const Layer &layer : layers) {
+      weights.emplace_back(n, layer.getFeatures());
+      n = layer.getFeatures(); // becomes the row count of the next matrix
+    }
+  }
+  // Forward pass; throws std::invalid_argument if the input cannot be used.
+  std::vector<float> predict(std::vector<float> i) {
+    if (weights.empty() || i.size() != (size_t)weights[0].getRows())
+      throw std::invalid_argument("Invalid input size");
+    MM input(1, (int)i.size(), i);
+    for (size_t k = 0; k < weights.size(); k++) // weights[k] pairs with layers[k]
+      input.mult(weights[k], layers[k].getBias(), layers[k].getActivate(),
+                 layers[k].getAlpha());
+    return input.toVector();
+  }
+};
 
 OpenCL openCL;
 
-std::vector<float> generateRandomMatrix(int rows, int cols) {
-  std::random_device rd;
-  std::mt19937 gen(rd());
-  std::uniform_real_distribution<float> dis(-1.0f, 1.0f);
-
-  std::vector<float> matrix(rows * cols);
-  for (int i = 0; i < rows * cols; ++i) {
-    matrix[i] = dis(gen);
-  }
-  return matrix;
-}
-std::vector<float> generateIdentityMatrix(int size) {
-  std::vector<float> matrix(size * size, 0.0f);
-  for (int i = 0; i < size; ++i) {
-    matrix[i * size + i] = 1.0f;
-  }
-  return matrix;
-}
-
 int main() {
-  const int SIZE = 1024;
+  NeuralNetwork nn(
+      2, {Layer(3, MM::Activate::RELU), Layer(1, MM::Activate::RELU)});
 
-  std::cout << "Testing with " << SIZE << "x" << SIZE << " matrices..."
-            << std::endl;
+  for (int i = 0; i < 10; i++) {
+    int v1 = (i / 2) % 2;
+    int v2 = i % 2;
 
-  std::vector<float> matrixA = generateRandomMatrix(SIZE, SIZE);
-  std::vector<float> matrixB = generateRandomMatrix(SIZE, SIZE);
-  std::vector<float> matrixC = generateRandomMatrix(SIZE, SIZE);
+    std::vector<float> v = {static_cast<float>(v1), static_cast<float>(v2)};
 
-  // std::vector<float> matrixA = generateIdentityMatrix(SIZE);
-  // std::vector<float> matrixB = generateIdentityMatrix(SIZE);
-  // std::vector<float> matrixC = generateIdentityMatrix(SIZE);
+    std::vector<float> r = nn.predict(v);
+    float expected = static_cast<float>(v1 ^ v2);
 
-  // Тестирование на CPU
-  {
-    std::cout << "\n=== CPU Version ===" << std::endl;
-
-    auto start = std::chrono::high_resolution_clock::now();
-
-    MutableMatrices::CPU a(SIZE, SIZE, matrixA);
-    Matrices::CPU b(SIZE, SIZE, matrixB);
-    Matrices::CPU c(SIZE, SIZE, matrixC);
-
-    auto gen_end = std::chrono::high_resolution_clock::now();
-
-    auto op_start = std::chrono::high_resolution_clock::now();
-
-    for (int i = 0; i < 10; i++) {
-      a.mult(b, 0.2f, MutableMatrices::CPU::Activate::SIGMOID);
-    }
-
-    auto op_end = std::chrono::high_resolution_clock::now();
-
-    std::vector<float> v = a.toVector();
-
-    auto total_end = std::chrono::high_resolution_clock::now();
-
-    auto gen_duration =
-        std::chrono::duration_cast<std::chrono::milliseconds>(gen_end - start);
-    auto op_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
-        op_end - op_start);
-    auto total_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
-        total_end - start);
-
-    std::cout << "Matrix generation time: " << gen_duration.count() << " ms"
-              << std::endl;
-    std::cout << "Operations time: " << op_duration.count() << " ms"
-              << std::endl;
-    std::cout << "Total time: " << total_duration.count() << " ms" << std::endl;
-
-    std::cout << "First few elements: ";
-    for (size_t i = 0; i < 5 && i < v.size(); ++i) {
-      std::cout << v[i] << " ";
-    }
-    std::cout << std::endl;
-  }
-
-  // Тестирование на GPU
-  {
-    std::cout << "\n=== GPU Version ===" << std::endl;
-
-    auto start = std::chrono::high_resolution_clock::now();
-
-    MutableMatrices::GPU a(SIZE, SIZE, matrixA);
-    Matrices::GPU b(SIZE, SIZE, matrixB);
-    Matrices::GPU c(SIZE, SIZE, matrixC);
-
-    auto gen_end = std::chrono::high_resolution_clock::now();
-
-    auto op_start = std::chrono::high_resolution_clock::now();
-
-    for (int i = 0; i < 10; i++) {
-      a.mult(b, 0.2f, MutableMatrices::GPU::Activate::SIGMOID, 0.0f);
-    }
-
-    auto op_end = std::chrono::high_resolution_clock::now();
-
-    std::vector<float> v = a.toVector();
-
-    auto total_end = std::chrono::high_resolution_clock::now();
-
-    auto gen_duration =
-        std::chrono::duration_cast<std::chrono::milliseconds>(gen_end - start);
-    auto op_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
-        op_end - op_start);
-    auto total_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
-        total_end - start);
-
-    std::cout << "Matrix generation time: " << gen_duration.count() << " ms"
-              << std::endl;
-    std::cout << "Operations time: " << op_duration.count() << " ms"
-              << std::endl;
-    std::cout << "Total time: " << total_duration.count() << " ms" << std::endl;
-
-    std::cout << "First few elements: ";
-    for (size_t i = 0; i < 5 && i < v.size(); ++i) {
-      std::cout << v[i] << " ";
+    std::cout << "XOR(" << v1 << ", " << v2 << ") = " << expected;
+    std::cout << " | Network: ";
+    for (size_t j = 0; j < r.size(); ++j) {
+      std::cout << r[j] << " ";
     }
     std::cout << std::endl;
   }
diff --git a/src/math/matrix/gpu/matrix.cpp b/src/math/matrix/gpu/matrix.cpp
index 3b46ac3..116b94f 100644
--- a/src/math/matrix/gpu/matrix.cpp
+++ b/src/math/matrix/gpu/matrix.cpp
@@ -1,15 +1,35 @@
+#include <random>
+
 #include "matrix.hpp"
 
+static std::random_device rd;  // file-local; used only to seed gen
+static std::mt19937 gen(rd()); // file-local engine for random matrix contents
+
+Matrices::GPU::GPU(int rows, int cols) // random matrix, values from gen
+    : IMatrix(rows, cols), queue(openCL.getContext(), openCL.getDevice()) {
+  validateDimensions(rows, cols);
+  const size_t count = (size_t)rows * (size_t)cols; // widen before multiplying
+  std::vector<float> matrix(count);
+  for (float &value : matrix)
+    value = std::generate_canonical<float, 32>(gen);
+  buffer = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
+                          count * sizeof(float));
+  queue.enqueueWriteBuffer(*buffer, CL_TRUE, 0, count * sizeof(float),
+                           matrix.data()); // CL_TRUE requests a blocking write
+  queue.finish();
+}
+
 Matrices::GPU::GPU(int rows, int cols, const std::vector<float> &matrix)
     : IMatrix(rows, cols), queue(openCL.getContext(), openCL.getDevice()) {
   validateDimensions(rows, cols);
   if (matrix.size() != static_cast<size_t>(rows * cols)) {
     throw std::invalid_argument("Matrix data size doesn't match dimensions");
   }
-
-  buffer = new cl::Buffer(
-      openCL.getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
-      rows * cols * sizeof(float), const_cast<float *>(matrix.data()));
+  buffer = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
+                          rows * cols * sizeof(float));
+  queue.enqueueWriteBuffer(*buffer, CL_TRUE, 0, rows * cols * sizeof(float),
+                           matrix.data()); // CL_TRUE requests a blocking write
+  queue.finish();
 }
 
 const std::vector<float> Matrices::GPU::toVector() const {
diff --git a/src/math/matrix/gpu/matrix.hpp b/src/math/matrix/gpu/matrix.hpp
index c6b7fae..1c4b244 100644
--- a/src/math/matrix/gpu/matrix.hpp
+++ b/src/math/matrix/gpu/matrix.hpp
@@ -11,12 +11,35 @@ protected:
   cl::CommandQueue queue;
 
 public:
+  GPU(int rows, int cols);
   GPU(int rows, int cols, const std::vector<float> &matrix);
   ~GPU() { delete buffer; }
 
   GPU(const GPU &) = delete;
   GPU &operator=(const GPU &) = delete;
-  GPU(GPU &&other) = default;
+  // Move construction: take over other's buffer and null it out in other so
+  // that other's destructor (`delete buffer`) releases nothing.
+  GPU(GPU &&other)
+      : IMatrix(other.rows, other.cols), buffer(other.buffer),
+        queue(std::move(other.queue)) {
+    other.buffer = nullptr;
+    other.rows = 0;
+    other.cols = 0;
+  }
-  GPU &operator=(GPU &&other) = default;
+  // Move assignment: a defaulted version would copy the raw `buffer` pointer,
+  // leaking ours and letting both destructors delete the same cl::Buffer.
+  GPU &operator=(GPU &&other) {
+    if (this != &other) {
+      delete buffer;
+      buffer = other.buffer;
+      queue = std::move(other.queue);
+      rows = other.rows;
+      cols = other.cols;
+      other.buffer = nullptr;
+      other.rows = 0;
+      other.cols = 0;
+    }
+    return *this;
+  }
 
   int getRows() const override { return rows; }
diff --git a/src/math/matrix/gpu/mutable_matrix.cpp b/src/math/matrix/gpu/mutable_matrix.cpp
index 1e09e0a..ebe2e03 100644
--- a/src/math/matrix/gpu/mutable_matrix.cpp
+++ b/src/math/matrix/gpu/mutable_matrix.cpp
@@ -1,5 +1,12 @@
 #include "mutable_matrix.hpp"
 
+MutableMatrices::GPU::GPU(int rows, int cols) : Matrices::GPU(rows, cols) {
+  // Create one cl::Kernel per kernelsNames entry from the MATRIX program.
+  for (const auto &[id, name] : kernelsNames)
+    kernels[id] =
+        cl::Kernel(openCL.getProgram(OpenCL::Program::MATRIX), name);
+}
+
 MutableMatrices::GPU::GPU(int rows, int cols, const std::vector<float> &matrix)
     : Matrices::GPU(rows, cols, matrix) {
   for (const auto &entry : kernelsNames) {
diff --git a/src/math/matrix/gpu/mutable_matrix.hpp b/src/math/matrix/gpu/mutable_matrix.hpp
index 02df50f..8cd6f9d 100644
--- a/src/math/matrix/gpu/mutable_matrix.hpp
+++ b/src/math/matrix/gpu/mutable_matrix.hpp
@@ -27,8 +27,14 @@ private:
   }
 
 public:
+  GPU(int rows, int cols);
   GPU(int rows, int cols, const std::vector<float> &matrix);
 
+  GPU(const GPU &) = delete;
+  GPU &operator=(const GPU &) = delete;
+  GPU(GPU &&other) = default;
+  GPU &operator=(GPU &&other) = default;
+
   void mult(Matrices::GPU &m, float bias = 0.0f,
             Activate type = Activate::LINEAR, float alpha = 0.01f);
   void mult(float scalar);
diff --git a/src/math/matrix/matrix.hpp b/src/math/matrix/matrix.hpp
index fa8748b..339509a 100644
--- a/src/math/matrix/matrix.hpp
+++ b/src/math/matrix/matrix.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <random>
 #include <stdexcept>
 #include <vector>
 
diff --git a/src/math/matrix/mutable_matrix.hpp b/src/math/matrix/mutable_matrix.hpp
index 5f84236..72e3867 100644
--- a/src/math/matrix/mutable_matrix.hpp
+++ b/src/math/matrix/mutable_matrix.hpp
@@ -16,7 +16,7 @@ public:
   virtual void activate(Activate type, float alpha = 0.01f) = 0;
 
   void validateMultDimensions(T &a, T &b) const {
-    if (a.getRows() != b.getCols()) {
+    if (a.getCols() != b.getRows()) {
       throw std::invalid_argument(
           "Invalid matrix dimensions for multiplication");
     }
diff --git a/src/math/opencl/opencl.hpp b/src/math/opencl/opencl.hpp
index 8b32101..5623655 100644
--- a/src/math/opencl/opencl.hpp
+++ b/src/math/opencl/opencl.hpp
@@ -13,7 +13,7 @@
 
 class OpenCL {
 public:
-  enum class Program { MATRIX, MATH, IMAGE_PROCESSING };
+  enum class Program { MATRIX };
 
 private:
   cl::Device device;