From c548c3089b0a33e16d22cae70d5c19daba86867f Mon Sep 17 00:00:00 2001 From: StepanovPlaton Date: Sat, 1 Nov 2025 14:22:09 +0400 Subject: [PATCH] Forward with new tensors math --- src/Makefile | 2 +- src/kernels/matrix.cl | 8 +-- src/main.cpp | 75 +++++++++++++++++++++---- src/math/tensor/cpu/math.hpp | 11 +++- src/math/tensor/gpu/math.hpp | 32 ++++++----- src/math/tensor/gpu/tensor.hpp | 100 ++++++++++++++++++++++++++------- src/math/tensor/math.hpp | 12 +++- src/math/tensor/tensor.hpp | 20 +++---- 8 files changed, 192 insertions(+), 68 deletions(-) diff --git a/src/Makefile b/src/Makefile index dd89464..c4a81b0 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,5 @@ CXX = g++ -CXXFLAGS = -Wall -Wextra -O2 -std=c++23 +CXXFLAGS = -Wall -Wextra -O1 -g -std=c++23 -fno-omit-frame-pointer LIBS = -lOpenCL TARGET = main COMMON_SRC = ./math/opencl/opencl.cpp diff --git a/src/kernels/matrix.cl b/src/kernels/matrix.cl index 95810b9..d8ca0ad 100644 --- a/src/kernels/matrix.cl +++ b/src/kernels/matrix.cl @@ -27,7 +27,7 @@ __kernel void activate(__global float *input, __global float *output, } __kernel void mult_small(__global float *A, __global float *B, - __global float *C, const float bias, + __global float *C, __global float *bias, const int activation_type, const float alpha, const int M, const int N, const int K, const int transpose_B) { @@ -49,7 +49,7 @@ __kernel void mult_small(__global float *A, __global float *B, sum += a_val * b_val; } - float result = sum + bias; + float result = sum + bias[col]; if (activation_type != 0) { result = activate_x(result, activation_type, alpha); } @@ -58,7 +58,7 @@ __kernel void mult_small(__global float *A, __global float *B, } __kernel void mult(__global float *A, __global float *B, __global float *C, - const float bias, const int activation_type, + __global float *bias, const int activation_type, const float alpha, const int M, const int N, const int K, const int transpose_B) { const int tile_size = 16; @@ -122,7 +122,7 @@ 
__kernel void mult(__global float *A, __global float *B, __global float *C, } if (global_i < M && global_j < N) { - float result = sum + bias; + float result = sum + bias[global_j]; if (activation_type != 0) { result = activate_x(result, activation_type, alpha); } diff --git a/src/main.cpp b/src/main.cpp index f3fa5ec..4b9ddff 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -5,27 +5,78 @@ using namespace GPU; +class Layer { +protected: + int inputFeatures; + int outputFeatures; + Vector bias; + Activation activation; + float alpha; + Matrix weights; + +public: + Layer(int inputFeatures, int outputFeatures, Activation activation, + Vector bias, float alpha = 0.0f) + : inputFeatures(inputFeatures), outputFeatures(outputFeatures), + bias(bias), activation(activation), alpha(alpha), + weights(outputFeatures, inputFeatures) {} + + int getInputFeatures() const { return inputFeatures; } + int getOutputFeatures() const { return outputFeatures; } + Activation getActivation() const { return activation; } + float getAlpha() const { return alpha; } + + const Vector &getBias() const { return bias; } + const Matrix &getWeights() const { return weights; } +}; + +class NeuralNetwork { +private: + std::vector layers; + +public: + NeuralNetwork(std::vector l) : layers(l) {} + + Matrix predict(Matrix inputs) { + MatrixMath mm; + std::vector steps; + steps.push_back(inputs); + for (size_t i = 0; i < layers.size(); i++) + steps.push_back(mm.mult(steps[steps.size() - 1], layers[i].getWeights(), + true, &layers[i].getBias(), + layers[i].getActivation(), layers[i].getAlpha())); + mm.await(); + return steps[steps.size() - 1]; + } + + const Layer &getLayer(int i) const { return layers[i]; } +}; + OpenCL openCL; int main() { - MatrixMath mm; + NeuralNetwork nn( + {Layer(2, 1, Activation::SIGMOID, Vector(std::vector{1.0f}))}); - Matrix a(2, 2); - Matrix b(2, 2); + for (int i = 0; i < 10; i++) { + int v1 = (i / 2) % 2; + int v2 = i % 2; - CPU::Matrix a_(2, 2, a.toVector()); CPU::Matrix b_(2, 
2, b.toVector()); + Matrix input(1, 2, {static_cast(v1), static_cast(v2)}); - a_.print(); - b_.print(); + Matrix r = nn.predict(input); + std::vector rv = r.toVector(); - Matrix c = mm.add(a, b); + std::cout << "Network: "; + for (size_t j = 0; j < rv.size(); ++j) { + printf("%f\t", rv[j]); + } - CPU::Matrix c_(2, 2, c.toVector(&mm.getQueue())); + float expected = static_cast(v1 ^ v2); + std::cout << " | XOR(" << v1 << ", " << v2 << ") = " << expected; - mm.await(); - - c_.print(); + std::cout << std::endl; + } return 0; } \ No newline at end of file diff --git a/src/math/tensor/cpu/math.hpp b/src/math/tensor/cpu/math.hpp index d2c1b3f..745c24a 100644 --- a/src/math/tensor/cpu/math.hpp +++ b/src/math/tensor/cpu/math.hpp @@ -75,19 +75,24 @@ class Tensor0Math : public TensorMath, public ITensor0Math {}; class Tensor1Math : public TensorMath, public ITensor1Math {}; -class Tensor2Math : public TensorMath, public ITensor2Math { +class Tensor2Math : public TensorMath, + public ITensor2Math { public: Tensor2 mult(const Tensor2 &a, const Tensor2 &b, bool transpose = false, - float bias = 0.0f, Activation type = Activation::LINEAR, + const Vector *bias = nullptr, + Activation type = Activation::LINEAR, float alpha = 0.01f) override { validateMultDimensions(a, b, transpose); + if (bias != nullptr) + validateBiasDimensions(b, *bias, transpose); Tensor2 result(a.getRows(), b.getCols(), 0.0f); for (int i = 0; i < result.getRows(); ++i) { for (int j = 0; j < result.getCols(); ++j) { float sum = 0.0f; for (int k = 0; k < a.getCols(); ++k) sum += a(i, k) * (transpose ? b(j, k) : b(k, j)); - result(i, j) = activate_x(sum + bias, type, alpha); + result(i, j) = activate_x(sum + (bias == nullptr ? 
0.0f : (*bias)(j)), + type, alpha); } } return result; diff --git a/src/math/tensor/gpu/math.hpp b/src/math/tensor/gpu/math.hpp index d783d3a..4d06f37 100644 --- a/src/math/tensor/gpu/math.hpp +++ b/src/math/tensor/gpu/math.hpp @@ -93,12 +93,11 @@ class Tensor0Math : public TensorMath, public ITensor0Math {}; class Tensor1Math : public TensorMath, public ITensor1Math {}; -class Tensor2Math : public TensorMath, public ITensor2Math { +class Tensor2Math : public TensorMath, + public ITensor2Math { private: - Tensor2 mult_tiled(const Tensor2 &a, const Tensor2 &b, bool transpose = false, - float bias = 0.0f, Activation type = Activation::LINEAR, - float alpha = 0.01f) { - validateMultDimensions(a, b, transpose); + Tensor2 mult_tiled(const Tensor2 &a, const Tensor2 &b, bool transpose, + const Vector &bias, Activation type, float alpha) { Tensor2 result(a.getRows(), transpose ? b.getRows() : b.getCols(), false, &queue); @@ -111,7 +110,7 @@ private: kernels[Method::MULT].setArg(0, *a.getBuffer()); kernels[Method::MULT].setArg(1, *b.getBuffer()); kernels[Method::MULT].setArg(2, *result.getBuffer()); - kernels[Method::MULT].setArg(3, bias); + kernels[Method::MULT].setArg(3, *bias.getBuffer()); kernels[Method::MULT].setArg(4, static_cast(type)); kernels[Method::MULT].setArg(5, alpha); kernels[Method::MULT].setArg(6, result.getRows()); @@ -122,16 +121,14 @@ private: global_size, local_size); return result; } - Tensor2 mult_small(const Tensor2 &a, const Tensor2 &b, bool transpose = false, - float bias = 0.0f, Activation type = Activation::LINEAR, - float alpha = 0.01f) { - validateMultDimensions(a, b, transpose); + Tensor2 mult_small(const Tensor2 &a, const Tensor2 &b, bool transpose, + const Vector &bias, Activation type, float alpha) { Tensor2 result(a.getRows(), transpose ? 
b.getRows() : b.getCols(), false, &queue); kernels[Method::MULT_SMALL].setArg(0, *a.getBuffer()); kernels[Method::MULT_SMALL].setArg(1, *b.getBuffer()); kernels[Method::MULT_SMALL].setArg(2, *result.getBuffer()); - kernels[Method::MULT_SMALL].setArg(3, bias); + kernels[Method::MULT_SMALL].setArg(3, *bias.getBuffer()); kernels[Method::MULT_SMALL].setArg(4, static_cast(type)); kernels[Method::MULT_SMALL].setArg(5, alpha); kernels[Method::MULT_SMALL].setArg(6, result.getRows()); @@ -145,13 +142,20 @@ private: public: Tensor2 mult(const Tensor2 &a, const Tensor2 &b, bool transpose = false, - float bias = 0.0f, Activation type = Activation::LINEAR, + const Vector *bias = nullptr, + Activation type = Activation::LINEAR, float alpha = 0.01f) override { + validateMultDimensions(a, b, transpose); + const Vector defaultBias(transpose ? b.getRows() : b.getCols(), 0.0f, &queue); + if (bias != nullptr) + validateBiasDimensions(b, *bias, transpose); if (a.getRows() > 64 || a.getCols() > 64 || b.getRows() > 64 || b.getCols() > 64) - return mult_tiled(a, b, transpose, bias, type, alpha); + return mult_tiled(a, b, transpose, bias == nullptr ? defaultBias : *bias, + type, alpha); else - return mult_small(a, b, transpose, bias, type, alpha); + return mult_small(a, b, transpose, bias == nullptr ? defaultBias : *bias, + type, alpha); } }; diff --git a/src/math/tensor/gpu/tensor.hpp b/src/math/tensor/gpu/tensor.hpp index 58fe563..df628ee 100644 --- a/src/math/tensor/gpu/tensor.hpp +++ b/src/math/tensor/gpu/tensor.hpp @@ -69,12 +69,41 @@ public: if (fill) createBuf(getShapeSize(shape), 0.0f, queue); } - Tensor(const Tensor &) = delete; - Tensor &operator=(const Tensor &) = delete; - Tensor(Tensor &&other) : ITensor(other.shape), buffer(other.buffer) { + + Tensor(const Tensor &other, const cl::CommandQueue *queue = nullptr) + : ITensor(other) { cl::CommandQueue q = queue == nullptr ? 
openCL.getDefaultQueue() : *queue; + createBuf(other.getSize(), &q); + q.enqueueCopyBuffer(*other.buffer, *buffer, 0, 0, + other.getSize() * sizeof(float)); + }; + Tensor &operator=(const Tensor &other) { + if (buffer != nullptr) + delete buffer; + ITensor::operator=(other); + createBuf(other.getSize(), &openCL.getDefaultQueue()); + openCL.getDefaultQueue().enqueueCopyBuffer(*other.buffer, *buffer, 0, 0, + other.getSize() * sizeof(float)); + return *this; + }; + Tensor(Tensor &&other) : ITensor(other), buffer(other.buffer) { other.buffer = nullptr; }; - Tensor &operator=(Tensor &&other) = delete; + Tensor &operator=(Tensor &&other) { + if (this != &other) { + if (buffer != nullptr) + delete buffer; + ITensor::operator=(std::move(other)); + buffer = other.buffer; + other.buffer = nullptr; + } + return *this; + }; + + ~Tensor() { + if (buffer != nullptr) + delete buffer; + } std::vector toVector(const cl::CommandQueue *queue = nullptr) { size_t size = getShapeSize(shape); @@ -144,17 +173,25 @@ public: if (shape.size() != 0) throw std::invalid_argument("Tensor0 dimension must be 0"); } - Tensor0(const cl::CommandQueue *queue = nullptr) : Tensor({}, queue) { + Tensor0(const cl::CommandQueue *queue = nullptr) + : Tensor(std::vector{}, queue) { createBuf(1, queue); } Tensor0(float value, const cl::CommandQueue *queue = nullptr) - : Tensor({}, queue) { + : Tensor(std::vector{}, queue) { createBuf(1, value, queue); } - Tensor0(const Tensor0 &) = delete; - Tensor0 &operator=(const Tensor0 &) = delete; + Tensor0(const Tensor0 &other, const cl::CommandQueue *queue = nullptr) + : Tensor(other, queue) {}; + Tensor0 &operator=(const Tensor0 &other) { + Tensor::operator=(other); + return *this; + }; Tensor0(Tensor0 &&other) : Tensor(std::move(other)) {}; - Tensor0 &operator=(Tensor0 &&other) = delete; + Tensor0 &operator=(Tensor0 &&other) { + Tensor::operator=(std::move(other)); + return *this; + }; }; class Tensor1 : public Tensor, public ITensor1 { @@ -180,10 +217,17 @@ public: 
: Tensor({(int)values.size()}, false, queue) { fillBuf(values, queue); } - Tensor1(const Tensor1 &) = delete; - Tensor1 &operator=(const Tensor1 &) = delete; - Tensor1(Tensor1 &&other) : Tensor(std::move(other)) {} - Tensor1 &operator=(Tensor1 &&other) = delete; + Tensor1(const Tensor1 &other, const cl::CommandQueue *queue = nullptr) + : Tensor(other, queue) {}; + Tensor1 &operator=(const Tensor1 &other) { + Tensor::operator=(other); + return *this; + }; + Tensor1(Tensor1 &&other) : Tensor(std::move(other)) {}; + Tensor1 &operator=(Tensor1 &&other) { + Tensor::operator=(std::move(other)); + return *this; + }; int getSize() const override { return shape[0]; } }; @@ -223,10 +267,17 @@ public: fillBuf(v, queue); } - Tensor2(const Tensor2 &) = delete; - Tensor2 &operator=(const Tensor2 &) = delete; - Tensor2(Tensor2 &&other) : Tensor(std::move(other)) {} - Tensor2 &operator=(Tensor2 &&other) = delete; + Tensor2(const Tensor2 &other, const cl::CommandQueue *queue = nullptr) + : Tensor(other, queue) {}; + Tensor2 &operator=(const Tensor2 &other) { + Tensor::operator=(other); + return *this; + }; + Tensor2(Tensor2 &&other) : Tensor(std::move(other)) {}; + Tensor2 &operator=(Tensor2 &&other) { + Tensor::operator=(std::move(other)); + return *this; + }; int getRows() const override { return shape[0]; } int getCols() const override { return shape[1]; } @@ -269,10 +320,17 @@ public: } fillBuf(v, queue); } - Tensor3(const Tensor3 &) = delete; - Tensor3 &operator=(const Tensor3 &) = delete; - Tensor3(Tensor3 &&other) : Tensor(std::move(other)) {} - Tensor3 &operator=(Tensor3 &&other) = delete; + Tensor3(const Tensor3 &other, const cl::CommandQueue *queue = nullptr) + : Tensor(other, queue) {}; + Tensor3 &operator=(const Tensor3 &other) { + Tensor::operator=(other); + return *this; + }; + Tensor3(Tensor3 &&other) : Tensor(std::move(other)) {}; + Tensor3 &operator=(Tensor3 &&other) { + Tensor::operator=(std::move(other)); + return *this; + }; }; typedef Tensor0 Scalar; diff --git 
a/src/math/tensor/math.hpp b/src/math/tensor/math.hpp index 36794d9..40b768f 100644 --- a/src/math/tensor/math.hpp +++ b/src/math/tensor/math.hpp @@ -41,18 +41,24 @@ template class ITensor0Math {}; template class ITensor1Math {}; -template class ITensor2Math { +template class ITensor2Math { public: - virtual T mult(const T &a, const T &b, bool transpose, float bias, + virtual M mult(const M &a, const M &b, bool transpose, const V *bias, Activation type, float alpha) = 0; - void validateMultDimensions(const T &a, const T &b, bool transpose) const { + void validateMultDimensions(const M &a, const M &b, bool transpose) const { if ((!transpose && a.getCols() != b.getRows()) || (transpose && a.getCols() != b.getCols())) { throw std::invalid_argument( "Invalid matrix dimensions for multiplication"); } }; + void validateBiasDimensions(const M &a, const V &b, bool transpose) const { + if ((!transpose && a.getCols() != b.getSize()) || + (transpose && a.getRows() != b.getSize())) { + throw std::invalid_argument("Invalid matrix bias"); + } + }; }; template class ITensor3Math {}; \ No newline at end of file diff --git a/src/math/tensor/tensor.hpp b/src/math/tensor/tensor.hpp index 0ccc2dd..d200153 100644 --- a/src/math/tensor/tensor.hpp +++ b/src/math/tensor/tensor.hpp @@ -23,10 +23,16 @@ protected: public: ITensor(const std::vector &shape) : shape(shape) {} - ITensor(const ITensor &) = default; - ITensor &operator=(const ITensor &) = default; - ITensor(ITensor &&other) = default; - ITensor &operator=(ITensor &&other) = default; + ITensor(const ITensor &other) : shape(other.shape) {}; + ITensor &operator=(const ITensor &other) { + shape = other.shape; + return *this; + }; + ITensor(ITensor &&other) : shape(other.shape) {}; + ITensor &operator=(ITensor &&other) { + shape = other.shape; + return *this; + }; const std::vector &getShape() const { return shape; } int getDim() const { return static_cast(shape.size()); } @@ -50,12 +56,6 @@ public: class ITensor2 { public: - 
ITensor2() = default; - ITensor2(const ITensor2 &) = default; - ITensor2 &operator=(const ITensor2 &) = default; - ITensor2(ITensor2 &&other) = default; - ITensor2 &operator=(ITensor2 &&other) = default; - virtual int getRows() const = 0; virtual int getCols() const = 0; };