mirror of
https://github.com/StepanovPlaton/NeuralNetwork.git
synced 2026-04-03 20:30:39 +04:00
Complete tensors math
This commit is contained in:
@@ -2,7 +2,7 @@ CXX = g++
|
||||
CXXFLAGS = -Wall -Wextra -O2 -std=c++23
|
||||
LIBS = -lOpenCL
|
||||
TARGET = main
|
||||
COMMON_SRC = ./math/opencl/opencl.cpp ./math/matrix/cpu/matrix.cpp ./math/matrix/cpu/mutable_matrix.cpp ./math/matrix/gpu/matrix.cpp ./math/matrix/gpu/mutable_matrix.cpp
|
||||
COMMON_SRC = ./math/opencl/opencl.cpp
|
||||
MAIN_SRC = main.cpp $(COMMON_SRC)
|
||||
BENCHMARK_SRC = benchmark.cpp $(COMMON_SRC)
|
||||
|
||||
|
||||
@@ -6,8 +6,7 @@
|
||||
|
||||
#include "./math/math.hpp"
|
||||
|
||||
typedef Matrices::CPU Matrix;
|
||||
typedef MutableMatrices::CPU MutableMatrix;
|
||||
using namespace GPU;
|
||||
|
||||
OpenCL openCL;
|
||||
|
||||
@@ -31,40 +30,37 @@ std::vector<float> generateIdentityMatrix(int size) {
|
||||
}
|
||||
|
||||
int main() {
|
||||
const int SIZE = 1024;
|
||||
const int SIZE = 48;
|
||||
|
||||
std::cout << "Testing with " << SIZE << "x" << SIZE << " matrices..."
|
||||
<< std::endl;
|
||||
|
||||
std::vector<float> matrixA = generateRandomMatrix(SIZE, SIZE);
|
||||
std::vector<float> matrixB = generateRandomMatrix(SIZE, SIZE);
|
||||
std::vector<float> matrixC = generateRandomMatrix(SIZE, SIZE);
|
||||
// std::vector<float> matrixA = generateRandomMatrix(SIZE, SIZE);
|
||||
// std::vector<float> matrixB = generateRandomMatrix(SIZE, SIZE);
|
||||
// std::vector<float> matrixC = generateRandomMatrix(SIZE, SIZE);
|
||||
|
||||
// std::vector<float> matrixA = generateIdentityMatrix(SIZE);
|
||||
// std::vector<float> matrixB = generateIdentityMatrix(SIZE);
|
||||
// std::vector<float> matrixC = generateIdentityMatrix(SIZE);
|
||||
std::vector<float> matrixA = generateIdentityMatrix(SIZE);
|
||||
std::vector<float> matrixB = generateIdentityMatrix(SIZE);
|
||||
std::vector<float> matrixC = generateIdentityMatrix(SIZE);
|
||||
|
||||
// Тестирование на CPU
|
||||
// Тестирование на GPU
|
||||
{
|
||||
std::cout << "\n=== CPU Version ===" << std::endl;
|
||||
std::cout << "\n=== GPU Version ===" << std::endl;
|
||||
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
MutableMatrices::CPU a(SIZE, SIZE, matrixA);
|
||||
Matrices::CPU b(SIZE, SIZE, matrixB);
|
||||
Matrices::CPU c(SIZE, SIZE, matrixC);
|
||||
MatrixMath mm;
|
||||
Matrix a(SIZE, SIZE, matrixA);
|
||||
Matrix b(SIZE, SIZE, matrixB);
|
||||
|
||||
auto gen_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto op_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
a.mult(b, 0.2f, MutableMatrices::CPU::Activate::SIGMOID);
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
Matrix x = mm.mult(a, b);
|
||||
}
|
||||
|
||||
auto op_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
std::vector<float> v = a.toVector();
|
||||
std::vector<float> v = a.toVector(&mm.getQueue());
|
||||
|
||||
auto total_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
@@ -88,24 +84,22 @@ int main() {
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
// Тестирование на GPU
|
||||
// Тестирование на CPU
|
||||
{
|
||||
std::cout << "\n=== GPU Version ===" << std::endl;
|
||||
std::cout << "\n=== CPU Version ===" << std::endl;
|
||||
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
MutableMatrices::GPU a(SIZE, SIZE, matrixA);
|
||||
Matrices::GPU b(SIZE, SIZE, matrixB);
|
||||
Matrices::GPU c(SIZE, SIZE, matrixC);
|
||||
CPU::MatrixMath mm;
|
||||
CPU::Matrix a(SIZE, SIZE, matrixA);
|
||||
CPU::Matrix b(SIZE, SIZE, matrixB);
|
||||
|
||||
auto gen_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto op_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
a.mult(b, 0.2f, MutableMatrices::GPU::Activate::SIGMOID, 0.0f);
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
CPU::Matrix x = mm.mult(a, b);
|
||||
}
|
||||
|
||||
auto op_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
std::vector<float> v = a.toVector();
|
||||
|
||||
@@ -13,40 +13,54 @@ float activate_x(float x, const int activation_type, const float alpha) {
|
||||
case 5: // ELU
|
||||
return (x > 0.0f) ? x : alpha * (exp(x) - 1.0f);
|
||||
case 6: // GELU
|
||||
return 0.5f * x * (1.0f + tanh(sqrt(2.0f / M_PI_F) * (x + 0.044715f * x * x * x)));
|
||||
return 0.5f * x *
|
||||
(1.0f + tanh(sqrt(2.0f / M_PI_F) * (x + 0.044715f * x * x * x)));
|
||||
default:
|
||||
return x;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void activate(
|
||||
__global float* input,
|
||||
__global float* output,
|
||||
const int activation_type,
|
||||
const float alpha,
|
||||
const int rows,
|
||||
const int cols)
|
||||
{
|
||||
int row = get_global_id(0);
|
||||
int col = get_global_id(1);
|
||||
__kernel void activate(__global float *input, __global float *output,
|
||||
const int activation_type, const float alpha) {
|
||||
int i = get_global_id(0);
|
||||
output[i] = activate_x(input[i], activation_type, alpha);
|
||||
}
|
||||
|
||||
if (row < rows && col < cols) {
|
||||
int idx = row * cols + col;
|
||||
output[idx] = activate_x(input[idx], activation_type, alpha);
|
||||
__kernel void mult_small(__global float *A, __global float *B,
|
||||
__global float *C, const float bias,
|
||||
const int activation_type, const float alpha,
|
||||
const int M, const int N, const int K,
|
||||
const int transpose_B) {
|
||||
const int row = get_global_id(0);
|
||||
const int col = get_global_id(1);
|
||||
|
||||
if (row < M && col < N) {
|
||||
float sum = 0.0f;
|
||||
for (int k = 0; k < K; k++) {
|
||||
float a_val = A[row * K + k];
|
||||
|
||||
float b_val;
|
||||
if (transpose_B) {
|
||||
b_val = B[col * K + k];
|
||||
} else {
|
||||
b_val = B[k * N + col];
|
||||
}
|
||||
|
||||
sum += a_val * b_val;
|
||||
}
|
||||
|
||||
float result = sum + bias;
|
||||
if (activation_type != 0) {
|
||||
result = activate_x(result, activation_type, alpha);
|
||||
}
|
||||
C[row * N + col] = result;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void mult(
|
||||
__global float* A,
|
||||
__global float* B,
|
||||
__global float* C,
|
||||
const float bias,
|
||||
const int activation_type,
|
||||
const float alpha,
|
||||
const int M,
|
||||
const int N,
|
||||
const int K)
|
||||
{
|
||||
__kernel void mult(__global float *A, __global float *B, __global float *C,
|
||||
const float bias, const int activation_type,
|
||||
const float alpha, const int M, const int N, const int K,
|
||||
const int transpose_B) {
|
||||
const int tile_size = 16;
|
||||
|
||||
int local_i = get_local_id(0);
|
||||
@@ -67,6 +81,7 @@ __kernel void mult(
|
||||
for (int tile = 0; tile < num_tiles; tile++) {
|
||||
int tile_offset = tile * tile_size;
|
||||
|
||||
// Загрузка tile_A (без изменений)
|
||||
int load_i_A = tile_offset + local_i;
|
||||
int load_j_A = tile_offset + local_j;
|
||||
|
||||
@@ -76,19 +91,30 @@ __kernel void mult(
|
||||
tile_A[local_j][local_i] = 0.0f;
|
||||
}
|
||||
|
||||
// Загрузка tile_B с учетом транспонирования
|
||||
int load_i_B = tile_offset + local_i;
|
||||
int load_j_B = tile_offset + local_j;
|
||||
|
||||
if (transpose_B) {
|
||||
// B транспонирована: обращаем индексы
|
||||
if (load_i_B < N && global_j < K) {
|
||||
tile_B[local_j][local_i] = B[global_j * N + load_i_B];
|
||||
} else {
|
||||
tile_B[local_j][local_i] = 0.0f;
|
||||
}
|
||||
} else {
|
||||
// B не транспонирована (оригинальная логика)
|
||||
if (load_i_B < K && global_j < N) {
|
||||
tile_B[local_j][local_i] = B[load_i_B * N + global_j];
|
||||
} else {
|
||||
tile_B[local_j][local_i] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < tile_size; k++) {
|
||||
for (int k = 0; k < tile_size; ++k) {
|
||||
sum += tile_A[k][local_i] * tile_B[local_j][k];
|
||||
}
|
||||
|
||||
@@ -104,21 +130,18 @@ __kernel void mult(
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void mult_sc(__global float* A, __global float* B, float scalar, int M, int N) {
|
||||
__kernel void mult_sc(__global float *A, __global float *B, float scalar) {
|
||||
int i = get_global_id(0);
|
||||
int j = get_global_id(1);
|
||||
B[i * N + j] = A[i * N + j] * scalar;
|
||||
B[i] = A[i] * scalar;
|
||||
}
|
||||
|
||||
__kernel void add(__global float* A, __global float* B, __global float* C, float a, float b, int M, int N) {
|
||||
__kernel void add(__global float *A, __global float *B, __global float *C,
|
||||
float x) {
|
||||
int i = get_global_id(0);
|
||||
int j = get_global_id(1);
|
||||
C[i * N + j] = (A[i * N + j] * a) + (B[i * N + j] * b);
|
||||
C[i] = A[i] + (B[i] * x);
|
||||
}
|
||||
|
||||
__kernel void add_sc(__global float* A, __global float* B, float scalar, int M, int N) {
|
||||
__kernel void add_sc(__global float *A, __global float *B, float scalar) {
|
||||
int i = get_global_id(0);
|
||||
int j = get_global_id(1);
|
||||
B[i * N + j] = A[i * N + j] + scalar;
|
||||
B[i] = A[i] + scalar;
|
||||
}
|
||||
|
||||
|
||||
74
src/main.cpp
74
src/main.cpp
@@ -3,73 +3,29 @@
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
|
||||
typedef Matrices::GPU M;
|
||||
typedef MutableMatrices::GPU MM;
|
||||
|
||||
class Layer {
|
||||
protected:
|
||||
int features;
|
||||
float bias;
|
||||
MM::Activate activate;
|
||||
float alpha;
|
||||
|
||||
public:
|
||||
Layer(int features, MM::Activate activate = MM::Activate::LINEAR,
|
||||
float bias = 0.0f, float alpha = 0.0f)
|
||||
: features(features), activate(activate), bias(bias), alpha(alpha) {}
|
||||
|
||||
int getFeatures() const { return features; }
|
||||
float getBias() const { return bias; }
|
||||
MM::Activate getActivate() const { return activate; }
|
||||
float getAlpha() const { return alpha; }
|
||||
};
|
||||
|
||||
class NeuralNetwork {
|
||||
private:
|
||||
std::vector<Layer> layers;
|
||||
std::vector<MM> weights;
|
||||
|
||||
public:
|
||||
NeuralNetwork(int n, std::initializer_list<Layer> l) : layers(l) {
|
||||
weights.emplace_back(n, layers[0].getFeatures());
|
||||
for (int i = 0; i < layers.size() - 1; i++)
|
||||
weights.emplace_back(layers[i].getFeatures(),
|
||||
layers[i + 1].getFeatures());
|
||||
}
|
||||
|
||||
std::vector<float> predict(std::vector<float> i) {
|
||||
if (i.size() != weights[0].getRows())
|
||||
std::invalid_argument("Invalid input size");
|
||||
MM input(1, (int)i.size(), i);
|
||||
for (size_t i = 0; i < weights.size(); i++)
|
||||
input.mult(weights[i], layers[i + 1].getBias(),
|
||||
layers[i + 1].getActivate(), layers[i + 1].getAlpha());
|
||||
return input.toVector();
|
||||
}
|
||||
};
|
||||
using namespace GPU;
|
||||
|
||||
OpenCL openCL;
|
||||
|
||||
int main() {
|
||||
NeuralNetwork nn(
|
||||
2, {Layer(3, MM::Activate::RELU), Layer(1, MM::Activate::RELU)});
|
||||
MatrixMath mm;
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
int v1 = (i / 2) % 2;
|
||||
int v2 = i % 2;
|
||||
Matrix a(2, 2);
|
||||
Matrix b(2, 2);
|
||||
|
||||
std::vector<float> v = {static_cast<float>(v1), static_cast<float>(v2)};
|
||||
CPU::Matrix a_(2, 2, a.toVector());
|
||||
CPU::Matrix b_(2, 2, b.toVector());
|
||||
|
||||
std::vector<float> r = nn.predict(v);
|
||||
float expected = static_cast<float>(v1 ^ v2);
|
||||
a_.print();
|
||||
b_.print();
|
||||
|
||||
std::cout << "XOR(" << v1 << ", " << v2 << ") = " << expected;
|
||||
std::cout << " | Network: ";
|
||||
for (size_t j = 0; j < r.size(); ++j) {
|
||||
std::cout << r[j] << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
Matrix c = mm.add(a, b);
|
||||
|
||||
CPU::Matrix c_(2, 2, c.toVector(&mm.getQueue()));
|
||||
|
||||
mm.await();
|
||||
|
||||
c_.print();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -2,8 +2,6 @@
|
||||
|
||||
#include "opencl/opencl.hpp"
|
||||
|
||||
#include "matrix/cpu/matrix.hpp"
|
||||
#include "matrix/cpu/mutable_matrix.hpp"
|
||||
#include "tensor/cpu/math.hpp"
|
||||
|
||||
#include "matrix/gpu/matrix.hpp"
|
||||
#include "matrix/gpu/mutable_matrix.hpp"
|
||||
#include "tensor/gpu/math.hpp"
|
||||
|
||||
@@ -1,24 +0,0 @@
|
||||
#include "matrix.hpp"
|
||||
|
||||
Matrices::CPU::CPU(int rows, int cols, float value)
|
||||
: IMatrix(rows, cols), data(rows * cols, value) {
|
||||
validateDimensions(rows, cols);
|
||||
}
|
||||
|
||||
Matrices::CPU::CPU(int rows, int cols, const std::vector<float> &matrix)
|
||||
: IMatrix(rows, cols), data(matrix) {
|
||||
validateDimensions(rows, cols);
|
||||
if (matrix.size() != static_cast<size_t>(rows * cols)) {
|
||||
throw std::invalid_argument("Data size doesn't match matrix dimensions");
|
||||
}
|
||||
}
|
||||
|
||||
float &Matrices::CPU::operator()(int row, int col) {
|
||||
checkIndices(row, col);
|
||||
return data[row * cols + col];
|
||||
}
|
||||
|
||||
const float &Matrices::CPU::operator()(int row, int col) const {
|
||||
checkIndices(row, col);
|
||||
return data[row * cols + col];
|
||||
}
|
||||
@@ -1,38 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
|
||||
#include "../matrix.hpp"
|
||||
|
||||
namespace Matrices {
|
||||
|
||||
class CPU : public IMatrix {
|
||||
protected:
|
||||
std::vector<float> data;
|
||||
|
||||
public:
|
||||
CPU(int rows, int cols, float value = 0.0f);
|
||||
CPU(int rows, int cols, const std::vector<float> &matrix);
|
||||
|
||||
CPU(const CPU &) = default;
|
||||
CPU &operator=(const CPU &) = default;
|
||||
CPU(CPU &&) = default;
|
||||
CPU &operator=(CPU &&) = default;
|
||||
~CPU() override = default;
|
||||
|
||||
float &operator()(int row, int col);
|
||||
const float &operator()(int row, int col) const;
|
||||
|
||||
const std::vector<float> toVector() const { return data; }
|
||||
|
||||
int getRows() const override { return rows; }
|
||||
int getCols() const override { return cols; }
|
||||
size_t getSize() const { return data.size(); }
|
||||
|
||||
// GPU toGPU(OpenCL &openCL) const { return GPU(rows, cols, data); }
|
||||
};
|
||||
|
||||
} // namespace Matrices
|
||||
@@ -1,76 +0,0 @@
|
||||
#include "mutable_matrix.hpp"
|
||||
|
||||
float MutableMatrices::CPU::activate_x(float x, Activate type, float alpha) {
|
||||
switch (type) {
|
||||
case Activate::LINEAR:
|
||||
return x;
|
||||
case Activate::SIGMOID:
|
||||
return 1.0f / (1.0f + std::exp(-x));
|
||||
case Activate::TANH:
|
||||
return std::tanh(x);
|
||||
case Activate::RELU:
|
||||
return std::max(0.0f, x);
|
||||
case Activate::LEAKY_RELU:
|
||||
return (x > 0.0f) ? x : alpha * x;
|
||||
case Activate::ELU:
|
||||
return (x > 0.0f) ? x : alpha * (std::exp(x) - 1.0f);
|
||||
case Activate::GELU:
|
||||
return 0.5f * x *
|
||||
(1.0f +
|
||||
std::tanh(std::sqrt(2.0f / M_PI) * (x + 0.044715f * x * x * x)));
|
||||
default:
|
||||
throw std::invalid_argument("Unknown activation type");
|
||||
}
|
||||
}
|
||||
void MutableMatrices::CPU::mult(Matrices::CPU &m, float bias, Activate type,
|
||||
float alpha) {
|
||||
validateMultDimensions(*this, m);
|
||||
|
||||
std::vector<float> result(rows * m.getCols(), 0.0f);
|
||||
for (int i = 0; i < rows; i++) {
|
||||
for (int j = 0; j < m.getCols(); j++) {
|
||||
float sum = 0.0f;
|
||||
for (int k = 0; k < cols; k++) {
|
||||
sum += (*this)(i, k) * m(k, j);
|
||||
}
|
||||
result[i * m.getCols() + j] = activate_x(sum + bias, type, alpha);
|
||||
}
|
||||
}
|
||||
data = std::move(result);
|
||||
cols = m.getCols();
|
||||
}
|
||||
|
||||
void MutableMatrices::CPU::mult(float scalar) {
|
||||
for (int i = 0; i < rows; i++) {
|
||||
for (int j = 0; j < cols; j++) {
|
||||
data[i * cols + j] *= scalar;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MutableMatrices::CPU::add(Matrices::CPU &m, float a, float b) {
|
||||
validateSameDimensions(*this, m);
|
||||
|
||||
std::vector<float> result(rows * cols, 0.0f);
|
||||
for (int i = 0; i < rows; i++) {
|
||||
for (int j = 0; j < cols; j++) {
|
||||
result[i * cols + j] = ((*this)(i, j) * a) + (m(i, j) * b);
|
||||
}
|
||||
}
|
||||
data = std::move(result);
|
||||
}
|
||||
|
||||
void MutableMatrices::CPU::add(float scalar) {
|
||||
for (int i = 0; i < rows; i++) {
|
||||
for (int j = 0; j < cols; j++) {
|
||||
data[i * cols + j] += scalar;
|
||||
}
|
||||
}
|
||||
}
|
||||
void MutableMatrices::CPU::activate(Activate type, float alpha) {
|
||||
for (int i = 0; i < rows; i++) {
|
||||
for (int j = 0; j < cols; j++) {
|
||||
data[i * cols + j] = activate_x(data[i * cols + j], type, alpha);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,28 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "matrix.hpp"
|
||||
|
||||
#include "../mutable_matrix.hpp"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#define M_PI 3.14159265358979323846
|
||||
|
||||
namespace MutableMatrices {
|
||||
|
||||
class CPU : public Matrices::CPU, public IMutableMatrix<Matrices::CPU> {
|
||||
private:
|
||||
static float activate_x(float x, Activate type, float alpha = 0.01f);
|
||||
|
||||
public:
|
||||
CPU(int rows, int cols, const std::vector<float> &matrix)
|
||||
: Matrices::CPU(rows, cols, matrix) {}
|
||||
|
||||
void mult(Matrices::CPU &m, float bias = 0.0f,
|
||||
Activate type = Activate::LINEAR, float alpha = 0.01f);
|
||||
void mult(float scalar);
|
||||
void add(Matrices::CPU &m, float a = 1.0f, float b = 1.0f);
|
||||
void add(float scalar);
|
||||
void activate(Activate type, float alpha = 0.01f);
|
||||
};
|
||||
}; // namespace MutableMatrices
|
||||
@@ -1,41 +0,0 @@
|
||||
#include <random>
|
||||
|
||||
#include "matrix.hpp"
|
||||
|
||||
std::random_device rd;
|
||||
std::mt19937 gen(rd());
|
||||
|
||||
Matrices::GPU::GPU(int rows, int cols)
|
||||
: IMatrix(rows, cols), queue(openCL.getContext(), openCL.getDevice()) {
|
||||
validateDimensions(rows, cols);
|
||||
std::vector<float> matrix;
|
||||
matrix.reserve(rows * cols);
|
||||
for (size_t i = 0; i < (size_t)rows * (size_t)cols; ++i)
|
||||
matrix.push_back(std::generate_canonical<float, 32>(gen));
|
||||
buffer = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
|
||||
rows * cols * sizeof(float));
|
||||
queue.enqueueWriteBuffer(*buffer, CL_TRUE, 0, rows * cols * sizeof(float),
|
||||
matrix.data());
|
||||
queue.finish();
|
||||
}
|
||||
|
||||
Matrices::GPU::GPU(int rows, int cols, const std::vector<float> &matrix)
|
||||
: IMatrix(rows, cols), queue(openCL.getContext(), openCL.getDevice()) {
|
||||
validateDimensions(rows, cols);
|
||||
if (matrix.size() != static_cast<size_t>(rows * cols)) {
|
||||
throw std::invalid_argument("Matrix data size doesn't match dimensions");
|
||||
}
|
||||
buffer = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
|
||||
rows * cols * sizeof(float));
|
||||
queue.enqueueWriteBuffer(*buffer, CL_TRUE, 0, rows * cols * sizeof(float),
|
||||
matrix.data());
|
||||
queue.finish();
|
||||
}
|
||||
|
||||
const std::vector<float> Matrices::GPU::toVector() const {
|
||||
std::vector<float> result(rows * cols);
|
||||
queue.enqueueReadBuffer(*buffer, CL_TRUE, 0, rows * cols * sizeof(float),
|
||||
result.data());
|
||||
queue.finish();
|
||||
return result;
|
||||
}
|
||||
@@ -1,40 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "../../opencl/opencl.hpp"
|
||||
|
||||
#include "../matrix.hpp"
|
||||
|
||||
namespace Matrices {
|
||||
class GPU : public IMatrix {
|
||||
protected:
|
||||
cl::Buffer *buffer;
|
||||
cl::CommandQueue queue;
|
||||
|
||||
public:
|
||||
GPU(int rows, int cols);
|
||||
GPU(int rows, int cols, const std::vector<float> &matrix);
|
||||
~GPU() { delete buffer; }
|
||||
|
||||
GPU(const GPU &) = delete;
|
||||
GPU &operator=(const GPU &) = delete;
|
||||
GPU(GPU &&other)
|
||||
: IMatrix(other.rows, other.cols), buffer(other.buffer),
|
||||
queue(std::move(other.queue)) {
|
||||
other.buffer = nullptr;
|
||||
other.rows = 0;
|
||||
other.cols = 0;
|
||||
}
|
||||
GPU &operator=(GPU &&other) = default;
|
||||
|
||||
int getRows() const override { return rows; }
|
||||
int getCols() const override { return cols; }
|
||||
size_t getSize() const { return rows * cols; }
|
||||
|
||||
const cl::Buffer *getBuffer() const { return buffer; }
|
||||
|
||||
const std::vector<float> toVector() const;
|
||||
|
||||
// CPU toCPU() const { return CPU(rows, cols, toVector()); };
|
||||
};
|
||||
|
||||
} // namespace Matrices
|
||||
@@ -1,120 +0,0 @@
|
||||
#include "mutable_matrix.hpp"
|
||||
|
||||
MutableMatrices::GPU::GPU(int rows, int cols) : Matrices::GPU(rows, cols) {
|
||||
for (const auto &entry : kernelsNames) {
|
||||
kernels[entry.first] =
|
||||
cl::Kernel(openCL.getProgram(OpenCL::Program::MATRIX), entry.second);
|
||||
}
|
||||
}
|
||||
|
||||
MutableMatrices::GPU::GPU(int rows, int cols, const std::vector<float> &matrix)
|
||||
: Matrices::GPU(rows, cols, matrix) {
|
||||
for (const auto &entry : kernelsNames) {
|
||||
kernels[entry.first] =
|
||||
cl::Kernel(openCL.getProgram(OpenCL::Program::MATRIX), entry.second);
|
||||
}
|
||||
}
|
||||
|
||||
void MutableMatrices::GPU::mult(Matrices::GPU &m, float bias, Activate type,
|
||||
float alpha) {
|
||||
validateMultDimensions(*this, m);
|
||||
|
||||
cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
|
||||
rows * m.getCols() * sizeof(float));
|
||||
|
||||
const int tile_size = 16;
|
||||
cl::NDRange local_size(tile_size, tile_size);
|
||||
cl::NDRange global_size(((rows + tile_size - 1) / tile_size) * tile_size,
|
||||
((m.getCols() + tile_size - 1) / tile_size) *
|
||||
tile_size);
|
||||
|
||||
kernels[Method::MULT].setArg(0, *buffer);
|
||||
kernels[Method::MULT].setArg(1, *m.getBuffer());
|
||||
kernels[Method::MULT].setArg(2, *b);
|
||||
kernels[Method::MULT].setArg(3, bias);
|
||||
kernels[Method::MULT].setArg(4, static_cast<int>(type));
|
||||
kernels[Method::MULT].setArg(5, alpha);
|
||||
kernels[Method::MULT].setArg(6, rows);
|
||||
kernels[Method::MULT].setArg(7, m.getCols());
|
||||
kernels[Method::MULT].setArg(8, cols);
|
||||
cl::Event event;
|
||||
queue.enqueueNDRangeKernel(kernels[Method::MULT], cl::NullRange, global_size,
|
||||
local_size, nullptr, &event);
|
||||
|
||||
event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
|
||||
buffer = b;
|
||||
cols = m.getCols();
|
||||
}
|
||||
|
||||
void MutableMatrices::GPU::mult(float scalar) {
|
||||
cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
|
||||
rows * cols * sizeof(float));
|
||||
kernels[Method::SCALAR_MULT].setArg(0, *buffer);
|
||||
kernels[Method::SCALAR_MULT].setArg(1, *b);
|
||||
kernels[Method::SCALAR_MULT].setArg(2, scalar);
|
||||
kernels[Method::SCALAR_MULT].setArg(3, rows);
|
||||
kernels[Method::SCALAR_MULT].setArg(4, cols);
|
||||
cl::Event event;
|
||||
queue.enqueueNDRangeKernel(kernels[Method::SCALAR_MULT], cl::NullRange,
|
||||
cl::NDRange(rows, cols), cl::NullRange, nullptr,
|
||||
&event);
|
||||
|
||||
event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
|
||||
buffer = b;
|
||||
}
|
||||
|
||||
void MutableMatrices::GPU::add(Matrices::GPU &m, float a, float b) {
|
||||
validateSameDimensions(*this, m);
|
||||
|
||||
cl::Buffer *buf = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
|
||||
rows * cols * sizeof(float));
|
||||
kernels[Method::ADD].setArg(0, *buffer);
|
||||
kernels[Method::ADD].setArg(1, *m.getBuffer());
|
||||
kernels[Method::ADD].setArg(2, *buf);
|
||||
kernels[Method::ADD].setArg(3, a);
|
||||
kernels[Method::ADD].setArg(4, b);
|
||||
kernels[Method::ADD].setArg(5, rows);
|
||||
kernels[Method::ADD].setArg(6, cols);
|
||||
cl::Event event;
|
||||
queue.enqueueNDRangeKernel(kernels[Method::ADD], cl::NullRange,
|
||||
cl::NDRange(rows, cols), cl::NullRange, nullptr,
|
||||
&event);
|
||||
|
||||
event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
|
||||
buffer = buf;
|
||||
}
|
||||
|
||||
void MutableMatrices::GPU::add(float scalar) {
|
||||
cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
|
||||
rows * cols * sizeof(float));
|
||||
kernels[Method::SCALAR_ADD].setArg(0, *buffer);
|
||||
kernels[Method::SCALAR_ADD].setArg(1, *b);
|
||||
kernels[Method::SCALAR_ADD].setArg(2, scalar);
|
||||
kernels[Method::SCALAR_ADD].setArg(3, rows);
|
||||
kernels[Method::SCALAR_ADD].setArg(4, cols);
|
||||
cl::Event event;
|
||||
queue.enqueueNDRangeKernel(kernels[Method::SCALAR_ADD], cl::NullRange,
|
||||
cl::NDRange(rows, cols), cl::NullRange, nullptr,
|
||||
&event);
|
||||
|
||||
event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
|
||||
buffer = b;
|
||||
}
|
||||
|
||||
void MutableMatrices::GPU::activate(Activate type, float alpha) {
|
||||
cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
|
||||
rows * cols * sizeof(float));
|
||||
kernels[Method::ACTIVATE].setArg(0, *buffer);
|
||||
kernels[Method::ACTIVATE].setArg(1, *b);
|
||||
kernels[Method::ACTIVATE].setArg(2, static_cast<int>(type));
|
||||
kernels[Method::ACTIVATE].setArg(3, alpha);
|
||||
kernels[Method::ACTIVATE].setArg(4, rows);
|
||||
kernels[Method::ACTIVATE].setArg(5, cols);
|
||||
cl::Event event;
|
||||
queue.enqueueNDRangeKernel(kernels[Method::ACTIVATE], cl::NullRange,
|
||||
cl::NDRange(rows, cols), cl::NullRange, nullptr,
|
||||
&event);
|
||||
|
||||
event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
|
||||
buffer = b;
|
||||
}
|
||||
@@ -1,46 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "../../opencl/opencl.hpp"
|
||||
|
||||
#include "matrix.hpp"
|
||||
|
||||
#include "../mutable_matrix.hpp"
|
||||
|
||||
namespace MutableMatrices {
|
||||
|
||||
class GPU : public Matrices::GPU, public IMutableMatrix<Matrices::GPU> {
|
||||
private:
|
||||
enum class Method { MULT, SCALAR_MULT, ADD, SCALAR_ADD, ACTIVATE };
|
||||
std::unordered_map<Method, cl::Kernel> kernels;
|
||||
std::unordered_map<Method, std::string> kernelsNames = {
|
||||
{Method::MULT, "mult"},
|
||||
{Method::SCALAR_MULT, "mult_sc"},
|
||||
{Method::ADD, "add"},
|
||||
{Method::SCALAR_ADD, "add_sc"},
|
||||
{Method::ACTIVATE, "activate"}};
|
||||
|
||||
static void CL_CALLBACK releaseBuffer(cl_event, cl_int status, void *buf) {
|
||||
if (status == CL_COMPLETE) {
|
||||
// std::cout << "Kernel complete!" << std::endl;
|
||||
delete (cl::Buffer *)buf;
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
GPU(int rows, int cols);
|
||||
GPU(int rows, int cols, const std::vector<float> &matrix);
|
||||
|
||||
GPU(const GPU &) = delete;
|
||||
GPU &operator=(const GPU &) = delete;
|
||||
GPU(GPU &&other) = default;
|
||||
GPU &operator=(GPU &&other) = default;
|
||||
|
||||
void mult(Matrices::GPU &m, float bias = 0.0f,
|
||||
Activate type = Activate::LINEAR, float alpha = 0.01f);
|
||||
void mult(float scalar);
|
||||
void add(Matrices::GPU &m, float a = 1.0f, float b = 1.0f);
|
||||
void add(float scalar);
|
||||
void activate(Activate type, float alpha = 0.01f);
|
||||
};
|
||||
|
||||
}; // namespace MutableMatrices
|
||||
@@ -1,29 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <random>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
|
||||
class IMatrix {
|
||||
protected:
|
||||
int rows;
|
||||
int cols;
|
||||
|
||||
void validateDimensions(int rows, int cols) const {
|
||||
if (rows <= 0 || cols <= 0) {
|
||||
throw std::invalid_argument("Matrix dimensions must be positive");
|
||||
}
|
||||
};
|
||||
void checkIndices(int row, int col) const {
|
||||
if (row < 0 || row >= rows || col < 0 || col >= cols) {
|
||||
throw std::out_of_range("Matrix indices out of range");
|
||||
}
|
||||
};
|
||||
|
||||
public:
|
||||
IMatrix(int rows, int cols) : rows(rows), cols(cols) {}
|
||||
virtual ~IMatrix() = default;
|
||||
virtual int getRows() const = 0;
|
||||
virtual int getCols() const = 0;
|
||||
virtual const std::vector<float> toVector() const = 0;
|
||||
};
|
||||
@@ -1,29 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "matrix.hpp"
|
||||
|
||||
template <typename T> class IMutableMatrix {
|
||||
static_assert(std::is_base_of<IMatrix, T>::value,
|
||||
"T must be derived from IMatrix");
|
||||
|
||||
public:
|
||||
enum class Activate { LINEAR, SIGMOID, TANH, RELU, LEAKY_RELU, ELU, GELU };
|
||||
|
||||
virtual void mult(T &m, float bias, Activate type, float alpha) = 0;
|
||||
virtual void mult(float s) = 0;
|
||||
virtual void add(T &m, float a, float b) = 0;
|
||||
virtual void add(float a) = 0;
|
||||
virtual void activate(Activate type, float alpha = 0.01f) = 0;
|
||||
|
||||
void validateMultDimensions(T &a, T &b) const {
|
||||
if (a.getCols() != b.getRows()) {
|
||||
throw std::invalid_argument(
|
||||
"Invalid matrix dimensions for multiplication");
|
||||
}
|
||||
};
|
||||
void validateSameDimensions(T &a, T &b) const {
|
||||
if (a.getRows() != b.getRows() || a.getCols() != b.getCols()) {
|
||||
throw std::invalid_argument("Invalid matrix dimensions for addition");
|
||||
}
|
||||
};
|
||||
};
|
||||
@@ -40,7 +40,7 @@ public:
|
||||
|
||||
cl::Device &getDevice() { return device; }
|
||||
cl::Context &getContext() { return context; }
|
||||
cl::CommandQueue &getDefaultQueue() { return defaultQueue; }
|
||||
const cl::CommandQueue &getDefaultQueue() { return defaultQueue; }
|
||||
|
||||
cl::Program &getProgram(Program program);
|
||||
void printDeviceInfo() const;
|
||||
|
||||
1
src/math/tensor/cpu/math.cpp
Normal file
1
src/math/tensor/cpu/math.cpp
Normal file
@@ -0,0 +1 @@
|
||||
#include "math.hpp"
|
||||
103
src/math/tensor/cpu/math.hpp
Normal file
103
src/math/tensor/cpu/math.hpp
Normal file
@@ -0,0 +1,103 @@
|
||||
#pragma once
|
||||
|
||||
#include "tensor.hpp"
|
||||
|
||||
#include "../math.hpp"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#define M_PI 3.14159265358979323846
|
||||
|
||||
namespace CPU {
|
||||
template <ITensorType T> class TensorMath;
|
||||
class Tensor0Math;
|
||||
class Tensor1Math;
|
||||
class Tensor2Math;
|
||||
class Tensor3Math;
|
||||
|
||||
template <ITensorType T> class TensorMath : public ITensorMath<T> {
|
||||
protected:
|
||||
float activate_x(float x, Activation type, float alpha = 0.01f) {
|
||||
switch (type) {
|
||||
case Activation::LINEAR:
|
||||
return x;
|
||||
case Activation::SIGMOID:
|
||||
return 1.0f / (1.0f + std::exp(-x));
|
||||
case Activation::TANH:
|
||||
return std::tanh(x);
|
||||
case Activation::RELU:
|
||||
return std::max(0.0f, x);
|
||||
case Activation::LEAKY_RELU:
|
||||
return (x > 0.0f) ? x : alpha * x;
|
||||
case Activation::ELU:
|
||||
return (x > 0.0f) ? x : alpha * (std::exp(x) - 1.0f);
|
||||
case Activation::GELU:
|
||||
return 0.5f * x *
|
||||
(1.0f +
|
||||
std::tanh(std::sqrt(2.0f / M_PI) * (x + 0.044715f * x * x * x)));
|
||||
default:
|
||||
throw std::invalid_argument("Unknown activation type");
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
T activate(const T &t, Activation type = Activation::LINEAR,
|
||||
float alpha = 0.0f) override {
|
||||
T result(t.getShape(), false);
|
||||
for (size_t i = 0; i < t.getSize(); ++i) {
|
||||
result[i] = activate_x(t[i], type, alpha);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
T mult(const T &t, float x) override {
|
||||
T result(t.getShape(), false);
|
||||
for (size_t i = 0; i < t.getSize(); ++i)
|
||||
result[i] = t[i] * x;
|
||||
return result;
|
||||
}
|
||||
T add(const T &a, const T &b, float x = 1.0f) override {
|
||||
this->validateSameDimensions(a, b);
|
||||
T result(a.getShape(), false);
|
||||
for (size_t i = 0; i < a.getSize(); ++i)
|
||||
result[i] = a[i] + (b[i] * x);
|
||||
return result;
|
||||
}
|
||||
T add(const T &t, float x) override {
|
||||
T result(t.getShape(), false);
|
||||
for (size_t i = 0; i < t.getSize(); ++i)
|
||||
result[i] = t[i] + x;
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
class Tensor0Math : public TensorMath<Tensor0>, public ITensor0Math<Tensor0> {};
|
||||
|
||||
class Tensor1Math : public TensorMath<Tensor1>, public ITensor1Math<Tensor1> {};
|
||||
|
||||
// Matrix (rank-2) math on the CPU: adds fused matrix multiplication on top of
// the element-wise operations inherited from TensorMath.
class Tensor2Math : public TensorMath<Tensor2>, public ITensor2Math<Tensor2> {
public:
  // Computes activate(a * b + bias) (or a * b^T when `transpose` is set).
  // `type`/`alpha` select and parameterise the activation applied per element.
  Tensor2 mult(const Tensor2 &a, const Tensor2 &b, bool transpose = false,
               float bias = 0.0f, Activation type = Activation::LINEAR,
               float alpha = 0.01f) override {
    validateMultDimensions(a, b, transpose);
    // BUG FIX: when multiplying by b^T the result has b.getRows() columns,
    // not b.getCols(). The old code used b.getCols() unconditionally, which
    // read b(j, k) out of bounds for non-square b (the GPU implementation
    // already sizes the result this way).
    Tensor2 result(a.getRows(), transpose ? b.getRows() : b.getCols(), 0.0f);
    for (int i = 0; i < result.getRows(); ++i) {
      for (int j = 0; j < result.getCols(); ++j) {
        float sum = 0.0f;
        for (int k = 0; k < a.getCols(); ++k)
          sum += a(i, k) * (transpose ? b(j, k) : b(k, j));
        result(i, j) = activate_x(sum + bias, type, alpha);
      }
    }
    return result;
  }
};
|
||||
|
||||
// Rank-3 math: inherits all element-wise operations from TensorMath.
class Tensor3Math : public TensorMath<Tensor3>, public ITensor3Math<Tensor3> {};
|
||||
|
||||
typedef Tensor0Math ScalarMath;
|
||||
typedef Tensor1Math VectorMath;
|
||||
typedef Tensor2Math MatrixMath;
|
||||
|
||||
} // namespace CPU
|
||||
1
src/math/tensor/cpu/tensor.cpp
Normal file
1
src/math/tensor/cpu/tensor.cpp
Normal file
@@ -0,0 +1 @@
|
||||
#include "tensor.hpp"
|
||||
296
src/math/tensor/cpu/tensor.hpp
Normal file
296
src/math/tensor/cpu/tensor.hpp
Normal file
@@ -0,0 +1,296 @@
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
#include <vector>
|
||||
|
||||
#include "../tensor.hpp"
|
||||
|
||||
extern std::mt19937 gen;
|
||||
|
||||
namespace CPU {
|
||||
class Tensor;
|
||||
class Tensor0;
|
||||
class Tensor1;
|
||||
class Tensor2;
|
||||
class Tensor3;
|
||||
|
||||
// Dense CPU tensor: flat row-major float storage plus the shape metadata
// inherited from ITensor. Rank-specific subclasses (Tensor0..Tensor3) add
// shape checks and indexed accessors on top of this storage.
class Tensor : public ITensor {
protected:
  std::vector<float> data; // flat row-major element storage

  // Resize the flat storage to `size` elements (new slots value-initialise
  // to 0.0f, per std::vector::resize).
  void resize(size_t size) { data.resize(size); }
  // Resize to the element count implied by `shape` (product of dimensions;
  // an empty shape yields 1 element, i.e. a scalar).
  void resize(const std::vector<int> &shape) {
    size_t size = 1;
    for (int dim : shape)
      size *= dim;
    resize(size);
  }

public:
  // Random-initialised tensor: fills every element from the process-wide
  // RNG `gen` with generate_canonical (values in [0, 1)).
  Tensor(const std::vector<int> &shape) : ITensor(shape) {
    resize(shape);
    std::generate(data.begin(), data.end(),
                  []() { return std::generate_canonical<float, 10>(gen); });
  }
  // Constant-initialised tensor: every element set to `value`.
  Tensor(const std::vector<int> &shape, float value) : ITensor(shape) {
    resize(shape);
    std::fill(data.begin(), data.end(), value);
  }
  // `fill == true` zero-fills explicitly; `fill == false` skips the fill
  // (resize already value-initialises new elements to zero either way).
  Tensor(const std::vector<int> &shape, bool fill) : ITensor(shape) {
    resize(shape);
    if (fill)
      std::fill(data.begin(), data.end(), 0.0f);
  }
  Tensor(const Tensor &) = default;
  Tensor &operator=(const Tensor &) = default;
  Tensor(Tensor &&other) = default;
  Tensor &operator=(Tensor &&other) = default;

  // Unchecked flat (row-major) element access.
  float &operator[](int index) { return data[index]; }
  const float &operator[](int index) const { return data[index]; }

  // Prints the leading elements (up to index 16), then an ellipsis for
  // larger tensors.
  virtual void print() const {
    std::cout << "Tensor(" << getDim() << "): [";
    for (size_t i = 0; i < data.size(); ++i) {
      std::cout << data[i];
      if (i > 15) {
        std::cout << "... ";
        break;
      }
      if (i != data.size() - 1)
        std::cout << ", ";
    }
    std::cout << "]" << std::endl;
  }

  // Copies the flat storage out (row-major order).
  std::vector<float> toVector() const { return data; }

  // Checked downcasts keyed on the rank-derived Type tag; return nullptr on
  // rank mismatch. NOTE(review): these use reinterpret_cast, which is only
  // correct when the Tensor subobject sits at the very start of the derived
  // object — Tensor2 currently lists ITensor2 (a polymorphic base) first, so
  // asMatrix's result may be misadjusted; confirm base ordering.
  static Tensor0 *asScalar(Tensor *tensor) {
    return tensor->getType() == Type::SCALAR
               ? reinterpret_cast<Tensor0 *>(tensor)
               : nullptr;
  }
  static const Tensor0 *asScalar(const Tensor *tensor) {
    return tensor->getType() == Type::SCALAR
               ? reinterpret_cast<const Tensor0 *>(tensor)
               : nullptr;
  }
  static Tensor1 *asVector(Tensor *tensor) {
    return tensor->getType() == Type::VECTOR
               ? reinterpret_cast<Tensor1 *>(tensor)
               : nullptr;
  }
  static const Tensor1 *asVector(const Tensor *tensor) {
    return tensor->getType() == Type::VECTOR
               ? reinterpret_cast<const Tensor1 *>(tensor)
               : nullptr;
  }
  static Tensor2 *asMatrix(Tensor *tensor) {
    return tensor->getType() == Type::MATRIX
               ? reinterpret_cast<Tensor2 *>(tensor)
               : nullptr;
  }
  static const Tensor2 *asMatrix(const Tensor *tensor) {
    return tensor->getType() == Type::MATRIX
               ? reinterpret_cast<const Tensor2 *>(tensor)
               : nullptr;
  }
  static Tensor3 *asTensor3(Tensor *tensor) {
    return tensor->getType() == Type::TENSOR3
               ? reinterpret_cast<Tensor3 *>(tensor)
               : nullptr;
  }
  static const Tensor3 *asTensor3(const Tensor *tensor) {
    return tensor->getType() == Type::TENSOR3
               ? reinterpret_cast<const Tensor3 *>(tensor)
               : nullptr;
  }
};
|
||||
|
||||
// Scalar (rank-0) tensor on the CPU: one float, empty shape.
class Tensor0 : public Tensor, public ITensor0 {
public:
  // Shape-taking constructors validate that the shape is empty.
  Tensor0(const std::vector<int> &shape) : Tensor(shape) {
    if (shape.size() != 0)
      throw std::invalid_argument("Tensor0 dimension must be 0");
  }
  Tensor0(const std::vector<int> &shape, float value) : Tensor(shape, value) {
    if (shape.size() != 0)
      throw std::invalid_argument("Tensor0 dimension must be 0");
  }
  // Random scalar. NOTE(review): Tensor({}) already sizes the storage to one
  // element and draws one random value; the body redraws, so two values are
  // consumed from `gen` — harmless but wasteful.
  Tensor0() : Tensor({}) {
    resize(1);
    data[0] = std::generate_canonical<float, 10>(gen);
  }
  // Constant scalar. NOTE(review): the delegated Tensor({}) draw is
  // immediately overwritten by `value` (still advances `gen` once).
  Tensor0(float value) : Tensor({}) {
    resize(1);
    data[0] = value;
  }
  Tensor0(const Tensor0 &) = default;
  Tensor0 &operator=(const Tensor0 &) = default;
  Tensor0(Tensor0 &&other) = default;
  Tensor0 &operator=(Tensor0 &&other) = default;

  void print() const override {
    std::cout << "Scalar: " << data[0] << std::endl;
  }

  // Direct access to the single stored value.
  float &value() { return data[0]; }
  const float &value() const { return data[0]; }
};
|
||||
|
||||
// Vector (rank-1) tensor on the CPU.
class Tensor1 : public Tensor, public ITensor1 {
public:
  // Shape-taking constructors validate rank 1.
  Tensor1(const std::vector<int> &shape) : Tensor(shape) {
    if (shape.size() != 1)
      throw std::invalid_argument("Tensor1 dimension must be 1");
  }
  Tensor1(const std::vector<int> &shape, float value) : Tensor(shape, value) {
    if (shape.size() != 1)
      throw std::invalid_argument("Tensor1 dimension must be 1");
  }
  // Random / constant vectors of the given length.
  Tensor1(int size) : Tensor({size}) {}
  Tensor1(int size, float value) : Tensor({size}, value) {}
  // Copies `values` in. NOTE(review): the delegated Tensor ctor first fills
  // the storage with random draws that are immediately overwritten, so this
  // advances `gen` by values.size() steps.
  Tensor1(const std::vector<float> &values) : Tensor({(int)values.size()}) {
    data = values;
  }
  Tensor1(const Tensor1 &) = default;
  Tensor1 &operator=(const Tensor1 &) = default;
  Tensor1(Tensor1 &&other) = default;
  Tensor1 &operator=(Tensor1 &&other) = default;

  // Prints every element (no truncation, unlike Tensor::print).
  void print() const override {
    std::cout << "Vector(" << shape[0] << "): [";
    for (size_t i = 0; i < data.size(); ++i) {
      std::cout << data[i];
      if (i != data.size() - 1)
        std::cout << ", ";
    }
    std::cout << "]" << std::endl;
  }

  // Unchecked indexed element access.
  float &operator()(int i) { return data[i]; }
  const float &operator()(int i) const { return data[i]; }

  // Vector length (overrides ITensor1; hides ITensor::getSize).
  int getSize() const override { return shape[0]; }
};
|
||||
|
||||
class Tensor2 : public ITensor2, public Tensor {
|
||||
public:
|
||||
Tensor2(const std::vector<int> &shape) : Tensor(shape) {
|
||||
if (shape.size() != 2)
|
||||
throw std::invalid_argument("Tensor2 dimension must be 2");
|
||||
}
|
||||
Tensor2(const std::vector<int> &shape, float value) : Tensor(shape, value) {
|
||||
if (shape.size() != 2)
|
||||
throw std::invalid_argument("Tensor2 dimension must be 2");
|
||||
}
|
||||
Tensor2(int rows, int cols) : ITensor2(), Tensor({rows, cols}) {}
|
||||
Tensor2(int rows, int cols, float value)
|
||||
: ITensor2(), Tensor({rows, cols}, value) {}
|
||||
Tensor2(int rows, int cols, const std::vector<float> &values)
|
||||
: Tensor({rows, cols}, false) {
|
||||
for (int i = 0; i < shape[0]; ++i) {
|
||||
for (int j = 0; j < shape[1]; ++j) {
|
||||
data[i * shape[1] + j] = values[i * shape[1] + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
Tensor2(const std::vector<std::vector<float>> &values)
|
||||
: Tensor({(int)values.size(), (int)values[0].size()}) {
|
||||
for (int i = 0; i < shape[0]; ++i) {
|
||||
for (int j = 0; j < shape[1]; ++j) {
|
||||
data[i * shape[1] + j] = values[i][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
Tensor2(const Tensor2 &) = default;
|
||||
Tensor2 &operator=(const Tensor2 &) = default;
|
||||
Tensor2(Tensor2 &&other) = default;
|
||||
Tensor2 &operator=(Tensor2 &&other) = default;
|
||||
|
||||
void print() const override {
|
||||
std::cout << "Matrix(" << shape[0] << "x" << shape[1] << "):\n";
|
||||
for (int i = 0; i < shape[0]; ++i) {
|
||||
for (int j = 0; j < shape[1]; ++j) {
|
||||
std::cout << data[i * shape[1] + j] << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
float &operator()(int i, int j) { return data[i * shape[1] + j]; }
|
||||
const float &operator()(int i, int j) const { return data[i * shape[1] + j]; }
|
||||
|
||||
int getRows() const override { return shape[0]; }
|
||||
int getCols() const override { return shape[1]; }
|
||||
};
|
||||
|
||||
// Rank-3 tensor on the CPU, stored flat with strides
// (shape[1]*shape[2], shape[2], 1) — i.e. slice-major, then row-major.
class Tensor3 : public Tensor, public ITensor3 {
public:
  // Shape-taking constructors validate rank 3.
  Tensor3(const std::vector<int> &shape) : Tensor(shape) {
    if (shape.size() != 3)
      throw std::invalid_argument("Tensor3 dimension must be 3");
  }
  Tensor3(const std::vector<int> &shape, float value) : Tensor(shape, value) {
    if (shape.size() != 3)
      throw std::invalid_argument("Tensor3 dimension must be 3");
  }
  // Random / constant tensors of the given extents.
  Tensor3(int d1, int d2, int d3) : Tensor({d1, d2, d3}) {}
  Tensor3(int d1, int d2, int d3, float value) : Tensor({d1, d2, d3}, value) {}
  // Copies a flat buffer in (same layout as the internal storage).
  Tensor3(int d1, int d2, int d3, const std::vector<float> &values)
      : Tensor({d1, d2, d3}, false) {
    for (int i = 0; i < shape[0]; ++i) {
      for (int j = 0; j < shape[1]; ++j) {
        for (int k = 0; k < shape[2]; ++k) {
          data[i * shape[1] * shape[2] + j * shape[2] + k] =
              values[i * shape[1] * shape[2] + j * shape[2] + k];
        }
      }
    }
  }
  // Copies nested vectors in; extents taken from `values`.
  Tensor3(const std::vector<std::vector<std::vector<float>>> &values)
      : Tensor({(int)values.size(), (int)values[0].size(),
                (int)values[0][0].size()}) {
    for (int i = 0; i < shape[0]; ++i) {
      for (int j = 0; j < shape[1]; ++j) {
        for (int k = 0; k < shape[2]; ++k) {
          data[i * shape[1] * shape[2] + j * shape[2] + k] = values[i][j][k];
        }
      }
    }
  }
  Tensor3(const Tensor3 &) = default;
  Tensor3 &operator=(const Tensor3 &) = default;
  Tensor3(Tensor3 &&other) = default;
  Tensor3 &operator=(Tensor3 &&other) = default;

  // Prints one matrix "slice" per outermost index.
  void print() const override {
    std::cout << "Tensor3(" << shape[0] << "x" << shape[1] << "x" << shape[2]
              << "):\n";
    for (int i = 0; i < shape[0]; ++i) {
      std::cout << "Slice " << i << ":\n";
      for (int j = 0; j < shape[1]; ++j) {
        for (int k = 0; k < shape[2]; ++k) {
          std::cout << data[i * shape[1] * shape[2] + j * shape[2] + k] << " ";
        }
        std::cout << std::endl;
      }
      std::cout << std::endl;
    }
  }

  // Unchecked (slice, row, col) element access.
  float &operator()(int i, int j, int k) {
    return data[i * shape[1] * shape[2] + j * shape[2] + k];
  }
  const float &operator()(int i, int j, int k) const {
    return data[i * shape[1] * shape[2] + j * shape[2] + k];
  }
};
|
||||
|
||||
typedef Tensor0 Scalar;
|
||||
typedef Tensor1 Vector;
|
||||
typedef Tensor2 Matrix;
|
||||
|
||||
} // namespace CPU
|
||||
1
src/math/tensor/gpu/math.cpp
Normal file
1
src/math/tensor/gpu/math.cpp
Normal file
@@ -0,0 +1 @@
|
||||
#include "math.hpp"
|
||||
164
src/math/tensor/gpu/math.hpp
Normal file
164
src/math/tensor/gpu/math.hpp
Normal file
@@ -0,0 +1,164 @@
|
||||
#pragma once
|
||||
|
||||
#include "../../opencl/opencl.hpp"
|
||||
|
||||
#include "tensor.hpp"
|
||||
|
||||
#include "../math.hpp"
|
||||
|
||||
namespace GPU {
|
||||
template <ITensorType T> class TensorMath;
|
||||
class Tensor0Math;
|
||||
class Tensor1Math;
|
||||
class Tensor2Math;
|
||||
class Tensor3Math;
|
||||
|
||||
// Element-wise tensor math executed on the GPU through OpenCL kernels.
// Each instance owns its own command queue; operations are enqueued
// asynchronously — call await() (or read a result back) before relying on
// device contents.
template <ITensorType T> class TensorMath : public ITensorMath<T> {
protected:
  // Kernel roles managed by this class (MULT/MULT_SMALL are only used by the
  // Tensor2Math specialisation).
  enum class Method {
    MULT,
    MULT_SMALL,
    SCALAR_MULT,
    ADD,
    SCALAR_ADD,
    ACTIVATE
  };
  std::unordered_map<Method, cl::Kernel> kernels; // compiled kernel handles
  // Maps each role to its entry-point name inside the MATRIX program.
  std::unordered_map<Method, std::string> kernelsNames = {
      {Method::MULT, "mult"}, {Method::MULT_SMALL, "mult_small"},
      {Method::SCALAR_MULT, "mult_sc"}, {Method::ADD, "add"},
      {Method::SCALAR_ADD, "add_sc"}, {Method::ACTIVATE, "activate"}};

  cl::CommandQueue queue; // all work from this instance is enqueued here

public:
  // Creates the command queue and one kernel object per method.
  // NOTE(review): setArg/enqueue below mutate these shared kernel objects,
  // so a single TensorMath instance is not safe to use from multiple
  // threads concurrently — confirm single-threaded usage.
  TensorMath() {
    queue = cl::CommandQueue(openCL.getContext(), openCL.getDevice());
    for (const auto &entry : kernelsNames) {
      kernels[entry.first] =
          cl::Kernel(openCL.getProgram(OpenCL::Program::MATRIX), entry.second);
    }
  }

  const cl::CommandQueue &getQueue() const { return queue; }

  // Blocks until all previously enqueued work on this queue has finished.
  void await() const { queue.finish(); }

  // Element-wise activation on the device.
  // NOTE(review): results are built with T(shape, false, &queue); confirm
  // that constructor allocates the device buffer, otherwise getBuffer()
  // below dereferences null.
  T activate(const T &t, Activation type = Activation::LINEAR,
             float alpha = 0.0f) override {
    T result(t.getShape(), false, &queue);
    kernels[Method::ACTIVATE].setArg(0, *t.getBuffer());
    kernels[Method::ACTIVATE].setArg(1, *result.getBuffer());
    kernels[Method::ACTIVATE].setArg(2, static_cast<int>(type));
    kernels[Method::ACTIVATE].setArg(3, alpha);
    queue.enqueueNDRangeKernel(kernels[Method::ACTIVATE], cl::NullRange,
                               cl::NDRange(t.getSize()));
    return result;
  }

  // Element-wise scalar multiplication (t * x) on the device.
  T mult(const T &t, float x) override {
    T result(t.getShape(), false, &queue);
    kernels[Method::SCALAR_MULT].setArg(0, *t.getBuffer());
    kernels[Method::SCALAR_MULT].setArg(1, *result.getBuffer());
    kernels[Method::SCALAR_MULT].setArg(2, x);
    queue.enqueueNDRangeKernel(kernels[Method::SCALAR_MULT], cl::NullRange,
                               cl::NDRange(t.getSize()));
    return result;
  }

  // Element-wise a + x*b on the device; shapes must match.
  T add(const T &a, const T &b, float x = 1.0f) override {
    this->validateSameDimensions(a, b);
    T result(a.getShape(), false, &queue);
    kernels[Method::ADD].setArg(0, *a.getBuffer());
    kernels[Method::ADD].setArg(1, *b.getBuffer());
    kernels[Method::ADD].setArg(2, *result.getBuffer());
    kernels[Method::ADD].setArg(3, x);
    queue.enqueueNDRangeKernel(kernels[Method::ADD], cl::NullRange,
                               cl::NDRange(a.getSize()));
    return result;
  }

  // Element-wise scalar addition (t + x) on the device.
  T add(const T &t, float x) override {
    T result(t.getShape(), false, &queue);
    kernels[Method::SCALAR_ADD].setArg(0, *t.getBuffer());
    kernels[Method::SCALAR_ADD].setArg(1, *result.getBuffer());
    kernels[Method::SCALAR_ADD].setArg(2, x);
    queue.enqueueNDRangeKernel(kernels[Method::SCALAR_ADD], cl::NullRange,
                               cl::NDRange(t.getSize()));
    return result;
  }
};
|
||||
|
||||
// Scalar (rank-0) GPU math: inherits all element-wise operations.
class Tensor0Math : public TensorMath<Tensor0>, public ITensor0Math<Tensor0> {};
|
||||
|
||||
// Vector (rank-1) GPU math: inherits all element-wise operations.
class Tensor1Math : public TensorMath<Tensor1>, public ITensor1Math<Tensor1> {};
|
||||
|
||||
// Matrix (rank-2) GPU math: fused multiply + bias + activation, dispatching
// between a tiled kernel (large matrices) and a naive one (small matrices).
class Tensor2Math : public TensorMath<Tensor2>, public ITensor2Math<Tensor2> {
private:
  // Tiled multiply for large matrices: 16x16 work-groups, global size
  // rounded up to a multiple of the tile so every tile is fully populated.
  Tensor2 mult_tiled(const Tensor2 &a, const Tensor2 &b, bool transpose = false,
                     float bias = 0.0f, Activation type = Activation::LINEAR,
                     float alpha = 0.01f) {
    validateMultDimensions(a, b, transpose);
    Tensor2 result(a.getRows(), transpose ? b.getRows() : b.getCols(), false,
                   &queue);

    const int tile_size = 16;
    cl::NDRange local_size(tile_size, tile_size);
    cl::NDRange global_size(
        ((result.getRows() + tile_size - 1) / tile_size) * tile_size,
        ((result.getCols() + tile_size - 1) / tile_size) * tile_size);

    kernels[Method::MULT].setArg(0, *a.getBuffer());
    kernels[Method::MULT].setArg(1, *b.getBuffer());
    kernels[Method::MULT].setArg(2, *result.getBuffer());
    kernels[Method::MULT].setArg(3, bias);
    kernels[Method::MULT].setArg(4, static_cast<int>(type));
    kernels[Method::MULT].setArg(5, alpha);
    kernels[Method::MULT].setArg(6, result.getRows());
    kernels[Method::MULT].setArg(7, result.getCols());
    kernels[Method::MULT].setArg(8, a.getCols());
    kernels[Method::MULT].setArg(9, transpose ? 1 : 0);
    queue.enqueueNDRangeKernel(kernels[Method::MULT], cl::NullRange,
                               global_size, local_size);
    return result;
  }
  // Naive multiply for small matrices: one work-item per output element,
  // no local tiling (launch overhead beats tiling at this size).
  Tensor2 mult_small(const Tensor2 &a, const Tensor2 &b, bool transpose = false,
                     float bias = 0.0f, Activation type = Activation::LINEAR,
                     float alpha = 0.01f) {
    validateMultDimensions(a, b, transpose);
    Tensor2 result(a.getRows(), transpose ? b.getRows() : b.getCols(), false,
                   &queue);
    kernels[Method::MULT_SMALL].setArg(0, *a.getBuffer());
    kernels[Method::MULT_SMALL].setArg(1, *b.getBuffer());
    kernels[Method::MULT_SMALL].setArg(2, *result.getBuffer());
    kernels[Method::MULT_SMALL].setArg(3, bias);
    kernels[Method::MULT_SMALL].setArg(4, static_cast<int>(type));
    kernels[Method::MULT_SMALL].setArg(5, alpha);
    kernels[Method::MULT_SMALL].setArg(6, result.getRows());
    kernels[Method::MULT_SMALL].setArg(7, result.getCols());
    kernels[Method::MULT_SMALL].setArg(8, a.getCols());
    kernels[Method::MULT_SMALL].setArg(9, transpose ? 1 : 0);
    queue.enqueueNDRangeKernel(kernels[Method::MULT_SMALL], cl::NullRange,
                               cl::NDRange(result.getRows(), result.getCols()));
    return result;
  }

public:
  // Computes activate(a * b + bias) (or a * b^T when `transpose`), picking
  // the tiled kernel once any dimension exceeds 64.
  Tensor2 mult(const Tensor2 &a, const Tensor2 &b, bool transpose = false,
               float bias = 0.0f, Activation type = Activation::LINEAR,
               float alpha = 0.01f) override {
    if (a.getRows() > 64 || a.getCols() > 64 || b.getRows() > 64 ||
        b.getCols() > 64)
      return mult_tiled(a, b, transpose, bias, type, alpha);
    else
      return mult_small(a, b, transpose, bias, type, alpha);
  }
};
|
||||
|
||||
// Rank-3 GPU math: inherits all element-wise operations.
class Tensor3Math : public TensorMath<Tensor3>, public ITensor3Math<Tensor3> {};
|
||||
|
||||
typedef Tensor0Math ScalarMath;
|
||||
typedef Tensor1Math VectorMath;
|
||||
typedef Tensor2Math MatrixMath;
|
||||
|
||||
} // namespace GPU
|
||||
1
src/math/tensor/gpu/tensor.cpp
Normal file
1
src/math/tensor/gpu/tensor.cpp
Normal file
@@ -0,0 +1 @@
|
||||
#include "tensor.hpp"
|
||||
282
src/math/tensor/gpu/tensor.hpp
Normal file
282
src/math/tensor/gpu/tensor.hpp
Normal file
@@ -0,0 +1,282 @@
|
||||
#pragma once
|
||||
|
||||
#include "../../opencl/opencl.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
#include <vector>
|
||||
|
||||
#include "../tensor.hpp"
|
||||
#include "math.hpp"
|
||||
|
||||
extern std::mt19937 gen;
|
||||
|
||||
namespace GPU {
|
||||
class Tensor;
|
||||
class Tensor0;
|
||||
class Tensor1;
|
||||
class Tensor2;
|
||||
class Tensor3;
|
||||
|
||||
class Tensor : public ITensor {
|
||||
protected:
|
||||
cl::Buffer *buffer = nullptr;
|
||||
|
||||
size_t getShapeSize(const std::vector<int> &shape) {
|
||||
size_t size = 1;
|
||||
for (int dim : shape)
|
||||
size *= dim;
|
||||
return size;
|
||||
}
|
||||
void fillBuf(const std::vector<float> &v,
|
||||
const cl::CommandQueue *queue = nullptr) {
|
||||
if (buffer != nullptr)
|
||||
throw std::runtime_error("Tensor buffer already exists");
|
||||
buffer = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
|
||||
v.size() * sizeof(float));
|
||||
cl::CommandQueue q = queue == nullptr ? openCL.getDefaultQueue() : *queue;
|
||||
q.enqueueWriteBuffer(*buffer, CL_TRUE, 0, v.size() * sizeof(float),
|
||||
v.data());
|
||||
q.finish();
|
||||
}
|
||||
void createBuf(size_t size, const cl::CommandQueue *queue = nullptr) {
|
||||
std::vector<float> v(size);
|
||||
std::generate(v.begin(), v.end(),
|
||||
[]() { return std::generate_canonical<float, 10>(gen); });
|
||||
fillBuf(v, queue);
|
||||
}
|
||||
void createBuf(size_t size, float value,
|
||||
const cl::CommandQueue *queue = nullptr) {
|
||||
std::vector<float> v(size);
|
||||
std::fill(v.begin(), v.end(), value);
|
||||
fillBuf(v, queue);
|
||||
}
|
||||
|
||||
public:
|
||||
Tensor(const std::vector<int> &shape, const cl::CommandQueue *queue = nullptr)
|
||||
: ITensor(shape) {
|
||||
createBuf(getShapeSize(shape), queue);
|
||||
}
|
||||
Tensor(const std::vector<int> &shape, float value,
|
||||
const cl::CommandQueue *queue = nullptr)
|
||||
: ITensor(shape) {
|
||||
createBuf(getShapeSize(shape), value, queue);
|
||||
}
|
||||
Tensor(const std::vector<int> &shape, bool fill,
|
||||
const cl::CommandQueue *queue = nullptr)
|
||||
: ITensor(shape) {
|
||||
if (fill)
|
||||
createBuf(getShapeSize(shape), 0.0f, queue);
|
||||
}
|
||||
Tensor(const Tensor &) = delete;
|
||||
Tensor &operator=(const Tensor &) = delete;
|
||||
Tensor(Tensor &&other) : ITensor(other.shape), buffer(other.buffer) {
|
||||
other.buffer = nullptr;
|
||||
};
|
||||
Tensor &operator=(Tensor &&other) = delete;
|
||||
|
||||
std::vector<float> toVector(const cl::CommandQueue *queue = nullptr) {
|
||||
size_t size = getShapeSize(shape);
|
||||
std::vector<float> result(size);
|
||||
cl::CommandQueue q = queue == nullptr ? openCL.getDefaultQueue() : *queue;
|
||||
q.enqueueReadBuffer(*buffer, CL_TRUE, 0, size * sizeof(float),
|
||||
result.data());
|
||||
q.finish();
|
||||
return result;
|
||||
}
|
||||
|
||||
const cl::Buffer *getBuffer() const { return buffer; }
|
||||
|
||||
static Tensor0 *asScalar(Tensor *tensor) {
|
||||
return tensor->getType() == Type::SCALAR
|
||||
? reinterpret_cast<Tensor0 *>(tensor)
|
||||
: nullptr;
|
||||
}
|
||||
static const Tensor0 *asScalar(const Tensor *tensor) {
|
||||
return tensor->getType() == Type::SCALAR
|
||||
? reinterpret_cast<const Tensor0 *>(tensor)
|
||||
: nullptr;
|
||||
}
|
||||
static Tensor1 *asVector(Tensor *tensor) {
|
||||
return tensor->getType() == Type::VECTOR
|
||||
? reinterpret_cast<Tensor1 *>(tensor)
|
||||
: nullptr;
|
||||
}
|
||||
static const Tensor1 *asVector(const Tensor *tensor) {
|
||||
return tensor->getType() == Type::VECTOR
|
||||
? reinterpret_cast<const Tensor1 *>(tensor)
|
||||
: nullptr;
|
||||
}
|
||||
static Tensor2 *asMatrix(Tensor *tensor) {
|
||||
return tensor->getType() == Type::MATRIX
|
||||
? reinterpret_cast<Tensor2 *>(tensor)
|
||||
: nullptr;
|
||||
}
|
||||
static const Tensor2 *asMatrix(const Tensor *tensor) {
|
||||
return tensor->getType() == Type::MATRIX
|
||||
? reinterpret_cast<const Tensor2 *>(tensor)
|
||||
: nullptr;
|
||||
}
|
||||
static Tensor3 *asTensor3(Tensor *tensor) {
|
||||
return tensor->getType() == Type::TENSOR3
|
||||
? reinterpret_cast<Tensor3 *>(tensor)
|
||||
: nullptr;
|
||||
}
|
||||
static const Tensor3 *asTensor3(const Tensor *tensor) {
|
||||
return tensor->getType() == Type::TENSOR3
|
||||
? reinterpret_cast<const Tensor3 *>(tensor)
|
||||
: nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
class Tensor0 : public Tensor, public ITensor0 {
|
||||
public:
|
||||
Tensor0(const std::vector<int> &shape,
|
||||
const cl::CommandQueue *queue = nullptr)
|
||||
: Tensor(shape, queue) {
|
||||
if (shape.size() != 0)
|
||||
throw std::invalid_argument("Tensor0 dimension must be 0");
|
||||
}
|
||||
Tensor0(const std::vector<int> &shape, float value,
|
||||
const cl::CommandQueue *queue = nullptr)
|
||||
: Tensor(shape, value, queue) {
|
||||
if (shape.size() != 0)
|
||||
throw std::invalid_argument("Tensor0 dimension must be 0");
|
||||
}
|
||||
Tensor0(const cl::CommandQueue *queue = nullptr) : Tensor({}, queue) {
|
||||
createBuf(1, queue);
|
||||
}
|
||||
Tensor0(float value, const cl::CommandQueue *queue = nullptr)
|
||||
: Tensor({}, queue) {
|
||||
createBuf(1, value, queue);
|
||||
}
|
||||
Tensor0(const Tensor0 &) = delete;
|
||||
Tensor0 &operator=(const Tensor0 &) = delete;
|
||||
Tensor0(Tensor0 &&other) : Tensor(std::move(other)) {};
|
||||
Tensor0 &operator=(Tensor0 &&other) = delete;
|
||||
};
|
||||
|
||||
// Vector (rank-1) tensor on the GPU.
class Tensor1 : public Tensor, public ITensor1 {
public:
  // Shape-taking constructors validate rank 1.
  Tensor1(const std::vector<int> &shape,
          const cl::CommandQueue *queue = nullptr)
      : Tensor(shape, queue) {
    if (shape.size() != 1)
      throw std::invalid_argument("Tensor1 dimension must be 1");
  }
  Tensor1(const std::vector<int> &shape, float value,
          const cl::CommandQueue *queue = nullptr)
      : Tensor(shape, value, queue) {
    if (shape.size() != 1)
      throw std::invalid_argument("Tensor1 dimension must be 1");
  }
  // Random / constant vectors of the given length.
  Tensor1(int size, const cl::CommandQueue *queue = nullptr)
      : Tensor({size}, queue) {}
  Tensor1(int size, float value, const cl::CommandQueue *queue = nullptr)
      : Tensor({size}, value, queue) {}
  // Uploads `values`: the base ctor is called with fill == false so no data
  // is written twice; fillBuf then performs the single upload.
  Tensor1(const std::vector<float> &values,
          const cl::CommandQueue *queue = nullptr)
      : Tensor({(int)values.size()}, false, queue) {
    fillBuf(values, queue);
  }
  Tensor1(const Tensor1 &) = delete;
  Tensor1 &operator=(const Tensor1 &) = delete;
  Tensor1(Tensor1 &&other) : Tensor(std::move(other)) {}
  Tensor1 &operator=(Tensor1 &&other) = delete;

  // Vector length (overrides ITensor1; hides ITensor::getSize).
  int getSize() const override { return shape[0]; }
};
|
||||
|
||||
// Matrix (rank-2) tensor on the GPU with row-major device storage.
// NOTE(review): ITensor2 (a polymorphic class) is the first base here, so
// Tensor::asMatrix's reinterpret_cast from Tensor* may yield a misadjusted
// pointer — confirm base ordering before using asMatrix on GPU tensors.
class Tensor2 : public ITensor2, public Tensor {
public:
  // Shape-taking constructors validate rank 2.
  Tensor2(const std::vector<int> &shape,
          const cl::CommandQueue *queue = nullptr)
      : Tensor(shape, queue) {
    if (shape.size() != 2)
      throw std::invalid_argument("Tensor2 dimension must be 2");
  }
  Tensor2(const std::vector<int> &shape, float value,
          const cl::CommandQueue *queue = nullptr)
      : Tensor(shape, value, queue) {
    if (shape.size() != 2)
      throw std::invalid_argument("Tensor2 dimension must be 2");
  }
  // Random / constant matrices of the given extents.
  Tensor2(int rows, int cols, const cl::CommandQueue *queue = nullptr)
      : ITensor2(), Tensor({rows, cols}, queue) {}
  Tensor2(int rows, int cols, float value,
          const cl::CommandQueue *queue = nullptr)
      : ITensor2(), Tensor({rows, cols}, value, queue) {}
  // Uploads a flat row-major buffer in a single fillBuf call.
  Tensor2(int rows, int cols, const std::vector<float> &values,
          const cl::CommandQueue *queue = nullptr)
      : Tensor({rows, cols}, false, queue) {
    fillBuf(values, queue);
  }
  // Flattens nested vectors row-major and uploads once. NOTE(review): the
  // base ctor here is not given `queue` (defaults to nullptr); only fillBuf
  // below receives it — confirm that is intentional.
  Tensor2(const std::vector<std::vector<float>> &values,
          const cl::CommandQueue *queue = nullptr)
      : Tensor({(int)values.size(), (int)values[0].size()}, false) {
    std::vector<float> v(values.size() * values[0].size());
    for (size_t i = 0; i < values.size(); ++i) {
      for (size_t j = 0; j < values[i].size(); ++j)
        v[i * values[0].size() + j] = values[i][j];
    }
    fillBuf(v, queue);
  }

  Tensor2(const Tensor2 &) = delete;
  Tensor2 &operator=(const Tensor2 &) = delete;
  Tensor2(Tensor2 &&other) : Tensor(std::move(other)) {}
  Tensor2 &operator=(Tensor2 &&other) = delete;

  int getRows() const override { return shape[0]; }
  int getCols() const override { return shape[1]; }
};
|
||||
|
||||
class Tensor3 : public Tensor, public ITensor3 {
|
||||
public:
|
||||
Tensor3(const std::vector<int> &shape,
|
||||
const cl::CommandQueue *queue = nullptr)
|
||||
: Tensor(shape, queue) {
|
||||
if (shape.size() != 3)
|
||||
throw std::invalid_argument("Tensor3 dimension must be 3");
|
||||
}
|
||||
Tensor3(const std::vector<int> &shape, float value,
|
||||
const cl::CommandQueue *queue = nullptr)
|
||||
: Tensor(shape, value, queue) {
|
||||
if (shape.size() != 3)
|
||||
throw std::invalid_argument("Tensor3 dimension must be 3");
|
||||
}
|
||||
Tensor3(int d1, int d2, int d3, const cl::CommandQueue *queue = nullptr)
|
||||
: Tensor({d1, d2, d3}, queue) {}
|
||||
Tensor3(int d1, int d2, int d3, float value,
|
||||
const cl::CommandQueue *queue = nullptr)
|
||||
: Tensor({d1, d2, d3}, value, queue) {}
|
||||
Tensor3(int d1, int d2, int d3, const std::vector<float> &values,
|
||||
const cl::CommandQueue *queue = nullptr)
|
||||
: Tensor({d1, d2, d3}, false, queue) {
|
||||
fillBuf(values, queue);
|
||||
}
|
||||
Tensor3(const std::vector<std::vector<std::vector<float>>> &values,
|
||||
const cl::CommandQueue *queue = nullptr)
|
||||
: Tensor({(int)values.size(), (int)values[0].size(),
|
||||
(int)values[0][0].size()},
|
||||
false, queue) {
|
||||
std::vector<float> v(shape[0] * shape[1] * shape[2]);
|
||||
for (int i = 0; i < shape[0]; ++i) {
|
||||
for (int j = 0; j < shape[1]; ++j)
|
||||
for (int k = 0; k < shape[2]; ++k)
|
||||
v[i * shape[1] * shape[2] + j * shape[1] + k] = values[i][j][k];
|
||||
}
|
||||
fillBuf(v, queue);
|
||||
}
|
||||
Tensor3(const Tensor3 &) = delete;
|
||||
Tensor3 &operator=(const Tensor3 &) = delete;
|
||||
Tensor3(Tensor3 &&other) : Tensor(std::move(other)) {}
|
||||
Tensor3 &operator=(Tensor3 &&other) = delete;
|
||||
};
|
||||
|
||||
typedef Tensor0 Scalar;
|
||||
typedef Tensor1 Vector;
|
||||
typedef Tensor2 Matrix;
|
||||
|
||||
} // namespace GPU
|
||||
58
src/math/tensor/math.hpp
Normal file
58
src/math/tensor/math.hpp
Normal file
@@ -0,0 +1,58 @@
|
||||
#pragma once
|
||||
|
||||
#include "tensor.hpp"
|
||||
|
||||
// Activation functions a math backend can apply element-wise; LEAKY_RELU and
// ELU take an additional `alpha` parameter.
enum class Activation { LINEAR, SIGMOID, TANH, RELU, LEAKY_RELU, ELU, GELU };
|
||||
|
||||
// Constrains the math templates to types deriving from the tensor interfaces.
template <typename T>
concept ITensorType = std::is_base_of_v<ITensor, T>;

// Rank-specific refinements used by the rank-specific math interfaces.
template <typename T>
concept ITensor0Type = std::is_base_of_v<ITensor0, T>;
template <typename T>
concept ITensor1Type = std::is_base_of_v<ITensor1, T>;
template <typename T>
concept ITensor2Type = std::is_base_of_v<ITensor2, T>;
template <typename T>
concept ITensor3Type = std::is_base_of_v<ITensor3, T>;
|
||||
|
||||
template <ITensorType T> class ITensorMath {
|
||||
protected:
|
||||
void validateSameDimensions(const T &a, const T &b) const {
|
||||
if (a.getDim() != b.getDim())
|
||||
throw std::invalid_argument("Tensors must have the same dimension");
|
||||
if (a.getSize() != b.getSize())
|
||||
throw std::invalid_argument("Tensors must have the same size");
|
||||
for (int i = 0; i < a.getDim(); ++i) {
|
||||
if (a.getShape()[i] != b.getShape()[i])
|
||||
throw std::invalid_argument("Tensors must have the same shape");
|
||||
}
|
||||
};
|
||||
|
||||
public:
|
||||
virtual T activate(const T &m, Activation type, float alpha) = 0;
|
||||
|
||||
virtual T mult(const T &m, float x) = 0;
|
||||
virtual T add(const T &a, const T &b, float x) = 0;
|
||||
virtual T add(const T &m, float x) = 0;
|
||||
};
|
||||
|
||||
// Scalar-specific math interface (no extra operations yet).
template <ITensor0Type T> class ITensor0Math {};
|
||||
|
||||
// Vector-specific math interface (no extra operations yet).
template <ITensor1Type T> class ITensor1Math {};
|
||||
|
||||
template <ITensor2Type T> class ITensor2Math {
|
||||
public:
|
||||
virtual T mult(const T &a, const T &b, bool transpose, float bias,
|
||||
Activation type, float alpha) = 0;
|
||||
|
||||
void validateMultDimensions(const T &a, const T &b, bool transpose) const {
|
||||
if ((!transpose && a.getCols() != b.getRows()) ||
|
||||
(transpose && a.getCols() != b.getCols())) {
|
||||
throw std::invalid_argument(
|
||||
"Invalid matrix dimensions for multiplication");
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
// Rank-3-specific math interface (no extra operations yet).
template <ITensor3Type T> class ITensor3Math {};
|
||||
67
src/math/tensor/tensor.hpp
Normal file
67
src/math/tensor/tensor.hpp
Normal file
@@ -0,0 +1,67 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
#include <random>
#include <stdexcept>
#include <vector>
|
||||
|
||||
// Single process-wide RNG shared by all tensor initialisers.
// BUG FIX: these are defined in a header included from multiple translation
// units; without `inline` (C++17 inline variables) each TU gets its own
// definition and linking fails with duplicate symbols. The `extern
// std::mt19937 gen;` redeclarations in the backend headers remain valid.
inline std::random_device rd;
inline std::mt19937 gen(rd());
|
||||
|
||||
// Shape-only tensor base: stores the dimension list shared by the CPU and
// GPU element-storage implementations.
class ITensor {
protected:
  std::vector<int> shape; // extent of each dimension, outermost first

  // Rejects empty or non-positive shapes. NOTE(review): this helper is never
  // invoked by the constructor, and it would reject the empty shape used for
  // scalars — confirm intent before wiring it in.
  void validateDimensions(const std::vector<int> &shape) const {
    if (shape.empty())
      throw std::invalid_argument("Tensor shape cannot be empty");
    for (size_t i = 0; i < shape.size(); ++i) {
      if (shape[i] <= 0)
        throw std::invalid_argument(
            "All tensor dimensions must be positive, but dimension " +
            std::to_string(i) + " is " + std::to_string(shape[i]));
    }
  };

public:
  ITensor(const std::vector<int> &shape) : shape(shape) {}
  ITensor(const ITensor &) = default;
  ITensor &operator=(const ITensor &) = default;
  ITensor(ITensor &&other) = default;
  ITensor &operator=(ITensor &&other) = default;

  const std::vector<int> &getShape() const { return shape; }
  // Tensor rank (number of dimensions; 0 for a scalar).
  int getDim() const { return static_cast<int>(shape.size()); }
  // Total element count: product of all extents (empty product -> 1).
  size_t getSize() const {
    size_t size = 1;
    for (int dim : shape)
      size *= dim;
    return size;
  };

  // Rank encoded as a category tag; ordinal values match shape.size().
  enum class Type { SCALAR, VECTOR, MATRIX, TENSOR3 };
  // Maps rank directly to Type; assumes rank <= 3 (a higher rank would
  // produce an out-of-range enum value).
  Type getType() const { return static_cast<Type>(shape.size()); };
};
|
||||
|
||||
// Scalar (rank-0) tensor interface marker (no extra accessors).
class ITensor0 {};
|
||||
|
||||
// Vector (rank-1) tensor interface.
class ITensor1 {
public:
  // BUG FIX: polymorphic interface (pure virtual getSize) needs a virtual
  // destructor so deleting a derived vector through ITensor1* is
  // well-defined.
  virtual ~ITensor1() = default;

  // Number of elements in the vector.
  virtual int getSize() const = 0;
};
|
||||
|
||||
// Matrix (rank-2) tensor interface.
class ITensor2 {
public:
  ITensor2() = default;
  ITensor2(const ITensor2 &) = default;
  ITensor2 &operator=(const ITensor2 &) = default;
  ITensor2(ITensor2 &&other) = default;
  ITensor2 &operator=(ITensor2 &&other) = default;
  // BUG FIX: polymorphic interface (pure virtual getRows/getCols) needs a
  // virtual destructor so deletion through ITensor2* is well-defined. The
  // copy/move members above stay explicitly defaulted, so declaring the
  // destructor does not suppress them.
  virtual ~ITensor2() = default;

  // Matrix extents: number of rows and columns.
  virtual int getRows() const = 0;
  virtual int getCols() const = 0;
};
|
||||
|
||||
// Rank-3 tensor interface marker (no extra accessors).
class ITensor3 {};

// Domain-flavoured aliases for the rank-specific interfaces.
typedef ITensor0 IScalar;
typedef ITensor1 IVector;
typedef ITensor2 IMatrix;
|
||||
Reference in New Issue
Block a user