mirror of
https://github.com/StepanovPlaton/NeuralNetwork.git
synced 2026-04-03 20:30:39 +04:00
Complete refactor
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
.vscode
|
||||
*.exe
|
||||
@@ -1,8 +1,8 @@
|
||||
CXX = g++
|
||||
CXXFLAGS = -Wall -O2 -std=c++11
|
||||
CXXFLAGS = -Wall -Wextra -O2 -std=c++11
|
||||
LIBS = -lOpenCL
|
||||
TARGET = main
|
||||
SRC = main.cpp
|
||||
SRC = main.cpp ./math/opencl/opencl.cpp ./math/matrix/cpu/matrix.cpp ./math/matrix/cpu/mutable_matrix.cpp ./math/matrix/gpu/matrix.cpp ./math/matrix/gpu/mutable_matrix.cpp
|
||||
|
||||
INCLUDES = -I"A:/Programs/OpenCL/include"
|
||||
LIB_PATH = -L"A:/Programs/OpenCL/lib"
|
||||
|
||||
@@ -1,5 +1,52 @@
|
||||
__kernel void mult(__global float* A, __global float* B, __global float* C,
|
||||
int M, int N, int K) {
|
||||
// Scalar activation helper shared by the matrix kernels.
// `activation_type` selects the function and must match the ordering of the
// host-side Activate enum (LINEAR..GELU); `alpha` parameterises LEAKY_RELU
// and ELU. Unknown types fall back to identity here (the host CPU path
// throws instead).
float activate_x(float x, const int activation_type, const float alpha) {
    switch(activation_type) {
        case 0: // LINEAR
            return x;
        case 1: // SIGMOID
            return 1.0f / (1.0f + exp(-x));
        case 2: // TANH
            return tanh(x);
        case 3: // RELU
            return fmax(0.0f, x);
        case 4: // LEAKY_RELU
            return (x > 0.0f) ? x : alpha * x;
        case 5: // ELU
            return (x > 0.0f) ? x : alpha * (exp(x) - 1.0f);
        case 6: // GELU (tanh approximation)
            return 0.5f * x * (1.0f + tanh(sqrt(2.0f / M_PI_F) * (x + 0.044715f * x * x * x)));
        default: // unrecognised type: identity
            return x;
    }
}
|
||||
|
||||
// Element-wise activation kernel: output[idx] = f(input[idx]).
// Launch with a 2-D NDRange covering at least (rows, cols); work-items
// beyond the matrix bounds (from rounded-up global sizes) are masked out.
__kernel void activate(
    __global float* input,
    __global float* output,
    const int activation_type,
    const float alpha,
    const int rows,
    const int cols)
{
    int row = get_global_id(0);
    int col = get_global_id(1);

    if (row < rows && col < cols) {
        int idx = row * cols + col;  // row-major layout
        output[idx] = activate_x(input[idx], activation_type, alpha);
    }
}
|
||||
|
||||
__kernel void mult(
|
||||
__global float* A,
|
||||
__global float* B,
|
||||
__global float* C,
|
||||
const float bias,
|
||||
const int activation_type,
|
||||
const float alpha,
|
||||
const int M,
|
||||
const int N,
|
||||
const int K)
|
||||
{
|
||||
const int tile_size = 16;
|
||||
|
||||
int local_i = get_local_id(0);
|
||||
@@ -49,7 +96,11 @@ __kernel void mult(__global float* A, __global float* B, __global float* C,
|
||||
}
|
||||
|
||||
if (global_i < M && global_j < N) {
|
||||
C[global_i * N + global_j] = sum;
|
||||
float result = sum + bias;
|
||||
if (activation_type != 0) {
|
||||
result = activate_x(result, activation_type, alpha);
|
||||
}
|
||||
C[global_i * N + global_j] = result;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,3 +121,4 @@ __kernel void add_sc(__global float* A, __global float* B, float scalar, int M,
|
||||
int j = get_global_id(1);
|
||||
B[i * N + j] = A[i * N + j] + scalar;
|
||||
}
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ int main() {
|
||||
auto op_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
a.mult(b);
|
||||
a.mult(b, 0.2f, MutableMatrices::CPU::Activate::SIGMOID);
|
||||
}
|
||||
|
||||
auto op_end = std::chrono::high_resolution_clock::now();
|
||||
@@ -82,7 +82,7 @@ int main() {
|
||||
std::cout << "Total time: " << total_duration.count() << " ms" << std::endl;
|
||||
|
||||
std::cout << "First few elements: ";
|
||||
for (int i = 0; i < 5 && i < v.size(); ++i) {
|
||||
for (size_t i = 0; i < 5 && i < v.size(); ++i) {
|
||||
std::cout << v[i] << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
@@ -103,7 +103,7 @@ int main() {
|
||||
auto op_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
a.mult(b);
|
||||
a.mult(b, 0.2f, MutableMatrices::GPU::Activate::SIGMOID, 0.0f);
|
||||
}
|
||||
|
||||
auto op_end = std::chrono::high_resolution_clock::now();
|
||||
@@ -126,7 +126,7 @@ int main() {
|
||||
std::cout << "Total time: " << total_duration.count() << " ms" << std::endl;
|
||||
|
||||
std::cout << "First few elements: ";
|
||||
for (int i = 0; i < 5 && i < v.size(); ++i) {
|
||||
for (size_t i = 0; i < 5 && i < v.size(); ++i) {
|
||||
std::cout << v[i] << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
#ifndef MATH_H
|
||||
#define MATH_H
|
||||
#pragma once
|
||||
|
||||
#define __CL_ENABLE_EXCEPTIONS
|
||||
#include <CL/opencl.hpp>
|
||||
|
||||
#include "matrix.hpp"
|
||||
#include "mutable_matrix.hpp"
|
||||
#include "opencl/opencl.hpp"
|
||||
|
||||
#endif
|
||||
#include "matrix/cpu/matrix.hpp"
|
||||
#include "matrix/cpu/mutable_matrix.hpp"
|
||||
|
||||
#include "matrix/gpu/matrix.hpp"
|
||||
#include "matrix/gpu/mutable_matrix.hpp"
|
||||
|
||||
@@ -1,126 +0,0 @@
|
||||
#ifndef MATRIX_H
|
||||
#define MATRIX_H
|
||||
|
||||
#include "./opencl/opencl.hpp"
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
|
||||
class IMatrix {
|
||||
protected:
|
||||
int rows;
|
||||
int cols;
|
||||
|
||||
void validateDimensions(int rows, int cols) {
|
||||
if (rows <= 0 || cols <= 0) {
|
||||
throw std::invalid_argument("Matrix dimensions must be positive");
|
||||
}
|
||||
}
|
||||
|
||||
void checkIndices(int row, int col) const {
|
||||
if (row < 0 || row >= rows || col < 0 || col >= cols) {
|
||||
throw std::out_of_range("Matrix indices out of range");
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
IMatrix(int rows, int cols) : rows(rows), cols(cols) {}
|
||||
virtual ~IMatrix() = default;
|
||||
virtual int getRows() const = 0;
|
||||
virtual int getCols() const = 0;
|
||||
virtual const std::vector<float> toVector() const = 0;
|
||||
};
|
||||
|
||||
namespace Matrices {
|
||||
class CPU;
|
||||
|
||||
class GPU : public IMatrix {
|
||||
protected:
|
||||
cl::Buffer *buffer;
|
||||
cl::CommandQueue queue;
|
||||
|
||||
public:
|
||||
GPU(int rows, int cols, const std::vector<float> &matrix)
|
||||
: IMatrix(rows, cols), queue(openCL.getContext(), openCL.getDevice()) {
|
||||
validateDimensions(rows, cols);
|
||||
if (matrix.size() != static_cast<size_t>(rows * cols)) {
|
||||
throw std::invalid_argument("Matrix data size doesn't match dimensions");
|
||||
}
|
||||
|
||||
buffer = new cl::Buffer(
|
||||
openCL.getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
||||
rows * cols * sizeof(float), const_cast<float *>(matrix.data()));
|
||||
}
|
||||
~GPU() { delete buffer; }
|
||||
|
||||
GPU(const GPU &) = delete;
|
||||
GPU &operator=(const GPU &) = delete;
|
||||
GPU(GPU &&other) = default;
|
||||
GPU &operator=(GPU &&other) = default;
|
||||
|
||||
int getRows() const override { return rows; }
|
||||
int getCols() const override { return cols; }
|
||||
size_t getSize() const { return rows * cols; }
|
||||
|
||||
const cl::Buffer *getBuffer() const { return buffer; }
|
||||
|
||||
const std::vector<float> toVector() const {
|
||||
std::vector<float> result(rows * cols);
|
||||
queue.enqueueReadBuffer(*buffer, CL_TRUE, 0, rows * cols * sizeof(float),
|
||||
result.data());
|
||||
queue.finish();
|
||||
return result;
|
||||
}
|
||||
|
||||
CPU toCPU() const;
|
||||
};
|
||||
|
||||
class CPU : public IMatrix {
|
||||
protected:
|
||||
std::vector<float> data;
|
||||
|
||||
public:
|
||||
CPU(int rows, int cols, float value = 0.0f)
|
||||
: IMatrix(rows, cols), data(rows * cols, value) {
|
||||
validateDimensions(rows, cols);
|
||||
}
|
||||
|
||||
CPU(int rows, int cols, const std::vector<float> &matrix)
|
||||
: IMatrix(rows, cols), data(matrix) {
|
||||
validateDimensions(rows, cols);
|
||||
if (matrix.size() != static_cast<size_t>(rows * cols)) {
|
||||
throw std::invalid_argument("Data size doesn't match matrix dimensions");
|
||||
}
|
||||
}
|
||||
|
||||
CPU(const CPU &) = default;
|
||||
CPU &operator=(const CPU &) = default;
|
||||
CPU(CPU &&) = default;
|
||||
CPU &operator=(CPU &&) = default;
|
||||
~CPU() override = default;
|
||||
|
||||
float &operator()(int row, int col) {
|
||||
checkIndices(row, col);
|
||||
return data[row * cols + col];
|
||||
}
|
||||
|
||||
const float &operator()(int row, int col) const {
|
||||
checkIndices(row, col);
|
||||
return data[row * cols + col];
|
||||
}
|
||||
|
||||
const std::vector<float> toVector() const { return data; }
|
||||
|
||||
int getRows() const override { return rows; }
|
||||
int getCols() const override { return cols; }
|
||||
size_t getSize() const { return data.size(); }
|
||||
|
||||
GPU toGPU(OpenCL &openCL) const { return GPU(rows, cols, data); }
|
||||
};
|
||||
|
||||
CPU GPU::toCPU() const { return CPU(rows, cols, toVector()); }
|
||||
|
||||
} // namespace Matrices
|
||||
|
||||
#endif
|
||||
24
src/math/matrix/cpu/matrix.cpp
Normal file
24
src/math/matrix/cpu/matrix.cpp
Normal file
@@ -0,0 +1,24 @@
|
||||
#include "matrix.hpp"

// Fill-constructor: rows x cols matrix with every element set to `value`.
// Throws std::invalid_argument for non-positive dimensions.
// NOTE(review): data(rows * cols, ...) is evaluated before
// validateDimensions runs, so a negative rows * cols reaches std::vector
// first — confirm this is acceptable.
Matrices::CPU::CPU(int rows, int cols, float value)
    : IMatrix(rows, cols), data(rows * cols, value) {
  validateDimensions(rows, cols);
}

// Data-constructor: copies `matrix` (row-major) into the new object.
// Throws std::invalid_argument when dimensions are non-positive or the
// vector size does not equal rows * cols.
Matrices::CPU::CPU(int rows, int cols, const std::vector<float> &matrix)
    : IMatrix(rows, cols), data(matrix) {
  validateDimensions(rows, cols);
  if (matrix.size() != static_cast<size_t>(rows * cols)) {
    throw std::invalid_argument("Data size doesn't match matrix dimensions");
  }
}

// Bounds-checked element access; throws std::out_of_range on bad indices.
float &Matrices::CPU::operator()(int row, int col) {
  checkIndices(row, col);
  return data[row * cols + col];
}

// Const overload of the bounds-checked element access.
const float &Matrices::CPU::operator()(int row, int col) const {
  checkIndices(row, col);
  return data[row * cols + col];
}
|
||||
38
src/math/matrix/cpu/matrix.hpp
Normal file
38
src/math/matrix/cpu/matrix.hpp
Normal file
@@ -0,0 +1,38 @@
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
|
||||
#include "../matrix.hpp"
|
||||
|
||||
namespace Matrices {

// Dense row-major CPU matrix; storage is a std::vector<float> of
// rows * cols elements. Copyable and movable — the vector owns the
// storage, so the defaulted special members are correct.
class CPU : public IMatrix {
protected:
  std::vector<float> data;  // row-major: element (r, c) at data[r * cols + c]

public:
  // Fill constructor / data constructor; both validate the dimensions
  // (definitions in matrix.cpp).
  CPU(int rows, int cols, float value = 0.0f);
  CPU(int rows, int cols, const std::vector<float> &matrix);

  CPU(const CPU &) = default;
  CPU &operator=(const CPU &) = default;
  CPU(CPU &&) = default;
  CPU &operator=(CPU &&) = default;
  ~CPU() override = default;

  // Bounds-checked element access (std::out_of_range on bad indices).
  float &operator()(int row, int col);
  const float &operator()(int row, int col) const;

  // Returns a copy of the underlying row-major storage.
  const std::vector<float> toVector() const { return data; }

  int getRows() const override { return rows; }
  int getCols() const override { return cols; }
  size_t getSize() const { return data.size(); }

  // GPU toGPU(OpenCL &openCL) const { return GPU(rows, cols, data); }
};

} // namespace Matrices
|
||||
76
src/math/matrix/cpu/mutable_matrix.cpp
Normal file
76
src/math/matrix/cpu/mutable_matrix.cpp
Normal file
@@ -0,0 +1,76 @@
|
||||
#include "mutable_matrix.hpp"
|
||||
|
||||
// Applies the activation function selected by `type` to the scalar `x`.
// `alpha` is the slope/scale used by LEAKY_RELU and ELU.
// Throws std::invalid_argument for an unrecognised activation type.
float MutableMatrices::CPU::activate_x(float x, Activate type, float alpha) {
  if (type == Activate::LINEAR)
    return x;
  if (type == Activate::SIGMOID)
    return 1.0f / (1.0f + std::exp(-x));
  if (type == Activate::TANH)
    return std::tanh(x);
  if (type == Activate::RELU)
    return std::max(0.0f, x);
  if (type == Activate::LEAKY_RELU)
    return (x > 0.0f) ? x : alpha * x;
  if (type == Activate::ELU)
    return (x > 0.0f) ? x : alpha * (std::exp(x) - 1.0f);
  if (type == Activate::GELU) {
    // tanh-based GELU approximation.
    return 0.5f * x *
           (1.0f +
            std::tanh(std::sqrt(2.0f / M_PI) * (x + 0.044715f * x * x * x)));
  }
  throw std::invalid_argument("Unknown activation type");
}
|
||||
// In-place matrix product: *this = activate((*this) * m + bias, type, alpha).
// The product replaces this matrix's data; cols becomes m.getCols().
void MutableMatrices::CPU::mult(Matrices::CPU &m, float bias, Activate type,
                                float alpha) {
  validateMultDimensions(*this, m);

  const int out_cols = m.getCols();
  std::vector<float> product(rows * out_cols, 0.0f);
  for (int r = 0; r < rows; r++) {
    for (int c = 0; c < out_cols; c++) {
      // Dot product of row r of *this with column c of m.
      float acc = 0.0f;
      for (int k = 0; k < cols; k++) {
        acc += (*this)(r, k) * m(k, c);
      }
      product[r * out_cols + c] = activate_x(acc + bias, type, alpha);
    }
  }
  data = std::move(product);
  cols = out_cols;
}
|
||||
|
||||
void MutableMatrices::CPU::mult(float scalar) {
|
||||
for (int i = 0; i < rows; i++) {
|
||||
for (int j = 0; j < cols; j++) {
|
||||
data[i * cols + j] *= scalar;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// In-place element-wise weighted sum: *this = a * (*this) + b * m.
void MutableMatrices::CPU::add(Matrices::CPU &m, float a, float b) {
  validateSameDimensions(*this, m);

  for (int r = 0; r < rows; r++) {
    for (int c = 0; c < cols; c++) {
      // Each element is read once before being overwritten, so the update
      // can be done in place without a scratch vector.
      data[r * cols + c] = ((*this)(r, c) * a) + (m(r, c) * b);
    }
  }
}
|
||||
|
||||
void MutableMatrices::CPU::add(float scalar) {
|
||||
for (int i = 0; i < rows; i++) {
|
||||
for (int j = 0; j < cols; j++) {
|
||||
data[i * cols + j] += scalar;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Applies the selected activation to every element in place.
void MutableMatrices::CPU::activate(Activate type, float alpha) {
  for (float &value : data) {
    value = activate_x(value, type, alpha);
  }
}
|
||||
28
src/math/matrix/cpu/mutable_matrix.hpp
Normal file
28
src/math/matrix/cpu/mutable_matrix.hpp
Normal file
@@ -0,0 +1,28 @@
|
||||
#pragma once

#include "matrix.hpp"

#include "../mutable_matrix.hpp"

#include <cmath>

// M_PI is a POSIX extension, not guaranteed by standard <cmath>.
// BUG FIX: the previous unconditional `#define M_PI ...` triggers a
// macro-redefinition warning/error on toolchains whose <cmath> already
// defines it; define it only when missing.
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

namespace MutableMatrices {

// Mutable CPU matrix: Matrices::CPU storage plus the in-place
// mult/add/activate operations declared by IMutableMatrix.
class CPU : public Matrices::CPU, public IMutableMatrix<Matrices::CPU> {
private:
  // Scalar activation used by mult() and activate();
  // throws std::invalid_argument for an unknown type.
  static float activate_x(float x, Activate type, float alpha = 0.01f);

public:
  CPU(int rows, int cols, const std::vector<float> &matrix)
      : Matrices::CPU(rows, cols, matrix) {}

  // *this = activate((*this) * m + bias); cols becomes m.getCols().
  void mult(Matrices::CPU &m, float bias = 0.0f,
            Activate type = Activate::LINEAR, float alpha = 0.01f);
  // Element-wise in-place scalar multiply.
  void mult(float scalar);
  // *this = a * (*this) + b * m, element-wise.
  void add(Matrices::CPU &m, float a = 1.0f, float b = 1.0f);
  // Element-wise in-place scalar add.
  void add(float scalar);
  // Element-wise in-place activation.
  void activate(Activate type, float alpha = 0.01f);
};
}; // namespace MutableMatrices
|
||||
21
src/math/matrix/gpu/matrix.cpp
Normal file
21
src/math/matrix/gpu/matrix.cpp
Normal file
@@ -0,0 +1,21 @@
|
||||
#include "matrix.hpp"

// Uploads `matrix` (row-major) into a new device buffer on the global
// OpenCL context and creates the command queue used for readback.
// Throws std::invalid_argument on bad dimensions or a size mismatch.
Matrices::GPU::GPU(int rows, int cols, const std::vector<float> &matrix)
    : IMatrix(rows, cols), queue(openCL.getContext(), openCL.getDevice()) {
  validateDimensions(rows, cols);
  if (matrix.size() != static_cast<size_t>(rows * cols)) {
    throw std::invalid_argument("Matrix data size doesn't match dimensions");
  }

  // CL_MEM_COPY_HOST_PTR copies the host data during buffer creation, so
  // `matrix` need not outlive this constructor. The const_cast is required
  // by the cl::Buffer host-pointer API; the data is only read.
  buffer = new cl::Buffer(
      openCL.getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
      rows * cols * sizeof(float), const_cast<float *>(matrix.data()));
}

// Blocking device-to-host copy of the whole buffer.
const std::vector<float> Matrices::GPU::toVector() const {
  std::vector<float> result(rows * cols);
  // CL_TRUE makes the read blocking; finish() additionally drains the queue.
  queue.enqueueReadBuffer(*buffer, CL_TRUE, 0, rows * cols * sizeof(float),
                          result.data());
  queue.finish();
  return result;
}
|
||||
33
src/math/matrix/gpu/matrix.hpp
Normal file
33
src/math/matrix/gpu/matrix.hpp
Normal file
@@ -0,0 +1,33 @@
|
||||
#pragma once

#include "../../opencl/opencl.hpp"

#include "../matrix.hpp"

namespace Matrices {

// Immutable GPU matrix: owns a device-side cl::Buffer plus the command
// queue used to read it back. Copying is disabled (the buffer is an owning
// raw pointer); moving transfers buffer ownership.
class GPU : public IMatrix {
protected:
  cl::Buffer *buffer;  // owning; released in the destructor
  cl::CommandQueue queue;

public:
  // Uploads `matrix` (row-major) to the device (defined in matrix.cpp).
  GPU(int rows, int cols, const std::vector<float> &matrix);
  ~GPU() { delete buffer; }  // delete on nullptr (moved-from) is a no-op

  GPU(const GPU &) = delete;
  GPU &operator=(const GPU &) = delete;

  // BUG FIX: the move operations were `= default`, which copies the raw
  // owning `buffer` pointer and leaves it set in the moved-from object —
  // both destructors would then delete the same cl::Buffer (double free).
  // Transfer ownership explicitly and null out the source instead.
  GPU(GPU &&other)
      : IMatrix(other.rows, other.cols), buffer(other.buffer),
        queue(other.queue) {
    other.buffer = nullptr;
  }
  GPU &operator=(GPU &&other) {
    if (this != &other) {
      delete buffer;
      buffer = other.buffer;
      other.buffer = nullptr;
      queue = other.queue;  // cl::CommandQueue is a ref-counted handle
      rows = other.rows;
      cols = other.cols;
    }
    return *this;
  }

  int getRows() const override { return rows; }
  int getCols() const override { return cols; }
  size_t getSize() const { return rows * cols; }

  // Non-owning view of the device buffer.
  const cl::Buffer *getBuffer() const { return buffer; }

  // Blocking device-to-host copy (defined in matrix.cpp).
  const std::vector<float> toVector() const;

  // CPU toCPU() const { return CPU(rows, cols, toVector()); };
};

} // namespace Matrices
|
||||
113
src/math/matrix/gpu/mutable_matrix.cpp
Normal file
113
src/math/matrix/gpu/mutable_matrix.cpp
Normal file
@@ -0,0 +1,113 @@
|
||||
#include "mutable_matrix.hpp"

// Uploads the matrix to the device (base-class constructor) and pre-builds
// one cl::Kernel per Method from the shared MATRIX program, so the
// in-place operations do not re-create kernels on every call.
MutableMatrices::GPU::GPU(int rows, int cols, const std::vector<float> &matrix)
    : Matrices::GPU(rows, cols, matrix) {
  for (const auto &entry : kernelsNames) {
    kernels[entry.first] =
        cl::Kernel(openCL.getProgram(OpenCL::Program::MATRIX), entry.second);
  }
}
|
||||
|
||||
// In-place GPU matrix product: *this = activate((*this) * m + bias).
// A fresh output buffer is allocated, the tiled "mult" kernel is enqueued,
// and the new buffer is swapped in; the old input buffer is freed
// asynchronously by the releaseBuffer callback once the kernel completes.
void MutableMatrices::GPU::mult(Matrices::GPU &m, float bias, Activate type,
                                float alpha) {
  validateMultDimensions(*this, m);

  // Result is rows x m.getCols(); raw owning pointer handed off to become
  // this matrix's new storage below.
  cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
                                 rows * m.getCols() * sizeof(float));

  // Must match the tile_size hard-coded in the kernel source. Global sizes
  // are rounded up to a tile multiple so the NDRange covers the whole
  // output; the kernel masks out-of-range work-items.
  const int tile_size = 16;
  cl::NDRange local_size(tile_size, tile_size);
  cl::NDRange global_size(((rows + tile_size - 1) / tile_size) * tile_size,
                          ((m.getCols() + tile_size - 1) / tile_size) *
                              tile_size);

  // Kernel signature: (A, B, C, bias, activation_type, alpha, M, N, K).
  kernels[Method::MULT].setArg(0, *buffer);
  kernels[Method::MULT].setArg(1, *m.getBuffer());
  kernels[Method::MULT].setArg(2, *b);
  kernels[Method::MULT].setArg(3, bias);
  kernels[Method::MULT].setArg(4, static_cast<int>(type));
  kernels[Method::MULT].setArg(5, alpha);
  kernels[Method::MULT].setArg(6, rows);
  kernels[Method::MULT].setArg(7, m.getCols());
  kernels[Method::MULT].setArg(8, cols);
  cl::Event event;
  queue.enqueueNDRangeKernel(kernels[Method::MULT], cl::NullRange, global_size,
                             local_size, nullptr, &event);

  // Keep the old buffer alive until the kernel has read it; the callback
  // deletes it on completion. NOTE(review): if the enqueue or setCallback
  // throws, `b` (and potentially the old buffer) leak — confirm whether
  // that is acceptable here.
  event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
  buffer = b;
  cols = m.getCols();
}
|
||||
|
||||
// In-place scalar multiply on the device. Output goes to a new buffer; the
// old buffer is freed by the event callback after the kernel completes.
void MutableMatrices::GPU::mult(float scalar) {
  cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
                                 rows * cols * sizeof(float));
  // Kernel signature: (A, B, scalar, M, N); one work-item per element.
  kernels[Method::SCALAR_MULT].setArg(0, *buffer);
  kernels[Method::SCALAR_MULT].setArg(1, *b);
  kernels[Method::SCALAR_MULT].setArg(2, scalar);
  kernels[Method::SCALAR_MULT].setArg(3, rows);
  kernels[Method::SCALAR_MULT].setArg(4, cols);
  cl::Event event;
  queue.enqueueNDRangeKernel(kernels[Method::SCALAR_MULT], cl::NullRange,
                             cl::NDRange(rows, cols), cl::NullRange, nullptr,
                             &event);

  // Old buffer is released asynchronously once the kernel has read it.
  event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
  buffer = b;
}
|
||||
|
||||
// In-place element-wise weighted sum on the device:
// *this = a * (*this) + b * m. Requires identical shapes.
void MutableMatrices::GPU::add(Matrices::GPU &m, float a, float b) {
  validateSameDimensions(*this, m);

  cl::Buffer *buf = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
                                   rows * cols * sizeof(float));
  // Kernel signature: (A, B, C, a, b, M, N); one work-item per element.
  kernels[Method::ADD].setArg(0, *buffer);
  kernels[Method::ADD].setArg(1, *m.getBuffer());
  kernels[Method::ADD].setArg(2, *buf);
  kernels[Method::ADD].setArg(3, a);
  kernels[Method::ADD].setArg(4, b);
  kernels[Method::ADD].setArg(5, rows);
  kernels[Method::ADD].setArg(6, cols);
  cl::Event event;
  queue.enqueueNDRangeKernel(kernels[Method::ADD], cl::NullRange,
                             cl::NDRange(rows, cols), cl::NullRange, nullptr,
                             &event);

  // Old buffer is released asynchronously once the kernel has read it.
  event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
  buffer = buf;
}
|
||||
|
||||
// In-place scalar addition on the device. Output goes to a new buffer; the
// old buffer is freed by the event callback after the kernel completes.
void MutableMatrices::GPU::add(float scalar) {
  cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
                                 rows * cols * sizeof(float));
  // Kernel signature: (A, B, scalar, M, N); one work-item per element.
  kernels[Method::SCALAR_ADD].setArg(0, *buffer);
  kernels[Method::SCALAR_ADD].setArg(1, *b);
  kernels[Method::SCALAR_ADD].setArg(2, scalar);
  kernels[Method::SCALAR_ADD].setArg(3, rows);
  kernels[Method::SCALAR_ADD].setArg(4, cols);
  cl::Event event;
  queue.enqueueNDRangeKernel(kernels[Method::SCALAR_ADD], cl::NullRange,
                             cl::NDRange(rows, cols), cl::NullRange, nullptr,
                             &event);

  // Old buffer is released asynchronously once the kernel has read it.
  event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
  buffer = b;
}
|
||||
|
||||
// In-place element-wise activation on the device. Output goes to a new
// buffer; the old one is freed by the event callback on kernel completion.
void MutableMatrices::GPU::activate(Activate type, float alpha) {
  cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
                                 rows * cols * sizeof(float));
  // Kernel signature: (input, output, activation_type, alpha, rows, cols);
  // the enum value is passed as the kernel's integer activation code.
  kernels[Method::ACTIVATE].setArg(0, *buffer);
  kernels[Method::ACTIVATE].setArg(1, *b);
  kernels[Method::ACTIVATE].setArg(2, static_cast<int>(type));
  kernels[Method::ACTIVATE].setArg(3, alpha);
  kernels[Method::ACTIVATE].setArg(4, rows);
  kernels[Method::ACTIVATE].setArg(5, cols);
  cl::Event event;
  queue.enqueueNDRangeKernel(kernels[Method::ACTIVATE], cl::NullRange,
                             cl::NDRange(rows, cols), cl::NullRange, nullptr,
                             &event);

  // Old buffer is released asynchronously once the kernel has read it.
  event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
  buffer = b;
}
|
||||
40
src/math/matrix/gpu/mutable_matrix.hpp
Normal file
40
src/math/matrix/gpu/mutable_matrix.hpp
Normal file
@@ -0,0 +1,40 @@
|
||||
#pragma once

#include "../../opencl/opencl.hpp"

#include "matrix.hpp"

#include "../mutable_matrix.hpp"

namespace MutableMatrices {

// Mutable GPU matrix: Matrices::GPU storage plus in-place mult/add/activate.
// Each operation writes into a freshly allocated buffer and frees the old
// one asynchronously once the kernel that reads it has completed.
class GPU : public Matrices::GPU, public IMutableMatrix<Matrices::GPU> {
private:
  // One pre-built kernel per operation (see the constructor), keyed by
  // Method.
  enum class Method { MULT, SCALAR_MULT, ADD, SCALAR_ADD, ACTIVATE };
  std::unordered_map<Method, cl::Kernel> kernels;
  // Kernel entry-point names inside the MATRIX program source.
  std::unordered_map<Method, std::string> kernelsNames = {
      {Method::MULT, "mult"},
      {Method::SCALAR_MULT, "mult_sc"},
      {Method::ADD, "add"},
      {Method::SCALAR_ADD, "add_sc"},
      {Method::ACTIVATE, "activate"}};

  // Event callback that frees an operation's input buffer after the kernel
  // consuming it has finished.
  static void CL_CALLBACK releaseBuffer(cl_event, cl_int status, void *buf) {
    if (status == CL_COMPLETE) {
      // FIX: use static_cast instead of the previous C-style cast — the
      // named cast documents the void* round-trip and is greppable.
      delete static_cast<cl::Buffer *>(buf);
    }
  }

public:
  GPU(int rows, int cols, const std::vector<float> &matrix);

  // *this = activate((*this) * m + bias); cols becomes m.getCols().
  void mult(Matrices::GPU &m, float bias = 0.0f,
            Activate type = Activate::LINEAR, float alpha = 0.01f);
  // Element-wise in-place scalar multiply.
  void mult(float scalar);
  // *this = a * (*this) + b * m, element-wise.
  void add(Matrices::GPU &m, float a = 1.0f, float b = 1.0f);
  // Element-wise in-place scalar add.
  void add(float scalar);
  // Element-wise in-place activation.
  void activate(Activate type, float alpha = 0.01f);
};

}; // namespace MutableMatrices
|
||||
28
src/math/matrix/matrix.hpp
Normal file
28
src/math/matrix/matrix.hpp
Normal file
@@ -0,0 +1,28 @@
|
||||
#pragma once

#include <stdexcept>
#include <vector>

// Abstract base for every matrix type: stores the dimensions and provides
// the validation helpers shared by the CPU and GPU implementations.
class IMatrix {
protected:
  int rows;
  int cols;

  // Rejects non-positive dimensions with std::invalid_argument.
  void validateDimensions(int r, int c) const {
    if (!(r > 0 && c > 0)) {
      throw std::invalid_argument("Matrix dimensions must be positive");
    }
  }

  // Rejects out-of-range element indices with std::out_of_range.
  void checkIndices(int row, int col) const {
    const bool row_ok = row >= 0 && row < rows;
    const bool col_ok = col >= 0 && col < cols;
    if (!(row_ok && col_ok)) {
      throw std::out_of_range("Matrix indices out of range");
    }
  }

public:
  IMatrix(int rows, int cols) : rows(rows), cols(cols) {}
  virtual ~IMatrix() = default;

  virtual int getRows() const = 0;
  virtual int getCols() const = 0;
  // Row-major copy of the matrix contents.
  virtual const std::vector<float> toVector() const = 0;
};
|
||||
29
src/math/matrix/mutable_matrix.hpp
Normal file
29
src/math/matrix/mutable_matrix.hpp
Normal file
@@ -0,0 +1,29 @@
|
||||
#pragma once

#include <type_traits>  // FIX: std::is_base_of was used without its header

#include "matrix.hpp"

// Interface for matrices that support in-place arithmetic. T is the
// immutable matrix type the operations consume (Matrices::CPU or GPU).
template <typename T> class IMutableMatrix {
  static_assert(std::is_base_of<IMatrix, T>::value,
                "T must be derived from IMatrix");

public:
  // Activation functions; the ordering must match the integer codes used
  // by the OpenCL kernels' activate_x().
  enum class Activate { LINEAR, SIGMOID, TANH, RELU, LEAKY_RELU, ELU, GELU };

  virtual void mult(T &m, float bias, Activate type, float alpha) = 0;
  virtual void mult(float s) = 0;
  virtual void add(T &m, float a, float b) = 0;
  virtual void add(float a) = 0;
  virtual void activate(Activate type, float alpha = 0.01f) = 0;

  // BUG FIX: for the product a * b the INNER dimensions must agree:
  // a.cols == b.rows (the CPU implementation sums over k < a.cols while
  // indexing b(k, j)). The previous check compared a.rows with b.cols —
  // the outer dimensions of the result — rejecting valid products and
  // accepting invalid ones.
  void validateMultDimensions(T &a, T &b) const {
    if (a.getCols() != b.getRows()) {
      throw std::invalid_argument(
          "Invalid matrix dimensions for multiplication");
    }
  };
  // Element-wise operations require identical shapes.
  void validateSameDimensions(T &a, T &b) const {
    if (a.getRows() != b.getRows() || a.getCols() != b.getCols()) {
      throw std::invalid_argument("Invalid matrix dimensions for addition");
    }
  };
};
|
||||
@@ -1,194 +0,0 @@
|
||||
#ifndef MUTABLE_MATRIX_H
|
||||
#define MUTABLE_MATRIX_H
|
||||
|
||||
#include "./opencl/opencl.hpp"
|
||||
|
||||
#include "matrix.hpp"
|
||||
|
||||
template <typename T> class IMutableMatrix {
|
||||
static_assert(std::is_base_of<IMatrix, T>::value,
|
||||
"T must be derived from IMatrix");
|
||||
|
||||
public:
|
||||
virtual void mult(T &m) = 0;
|
||||
virtual void mult(float s) = 0;
|
||||
virtual void add(T &m, float a, float b) = 0;
|
||||
virtual void add(float a) = 0;
|
||||
|
||||
void validateMultDimensions(T &a, T &b) {
|
||||
if (a.getRows() != b.getCols()) {
|
||||
throw std::invalid_argument(
|
||||
"Invalid matrix dimensions for multiplication");
|
||||
}
|
||||
}
|
||||
void validateSameDimensions(T &a, T &b) {
|
||||
if (a.getRows() != b.getRows() || a.getCols() != b.getCols()) {
|
||||
throw std::invalid_argument("Invalid matrix dimensions for addition");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
namespace MutableMatrices {
|
||||
class GPU : public Matrices::GPU, public IMutableMatrix<Matrices::GPU> {
|
||||
private:
|
||||
enum class Method { MULT, SCALAR_MULT, ADD, SCALAR_ADD };
|
||||
std::unordered_map<Method, cl::Kernel> kernels;
|
||||
std::unordered_map<Method, std::string> kernelsNames = {
|
||||
{Method::MULT, "mult"},
|
||||
{Method::SCALAR_MULT, "mult_sc"},
|
||||
{Method::ADD, "add"},
|
||||
{Method::SCALAR_ADD, "add_sc"}};
|
||||
|
||||
static void CL_CALLBACK releaseBuffer(cl_event event, cl_int status,
|
||||
void *buf) {
|
||||
if (status == CL_COMPLETE) {
|
||||
// std::cout << "Kernel complete!" << std::endl;
|
||||
delete buf;
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
GPU(int rows, int cols, const std::vector<float> &matrix)
|
||||
: Matrices::GPU(rows, cols, matrix) {
|
||||
for (const auto &[method, kernelName] : kernelsNames) {
|
||||
kernels[method] =
|
||||
cl::Kernel(openCL.getProgram(OpenCL::Program::MATRIX), kernelName);
|
||||
}
|
||||
}
|
||||
|
||||
void mult(Matrices::GPU &m) {
|
||||
validateMultDimensions(*this, m);
|
||||
|
||||
cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
|
||||
rows * m.getCols() * sizeof(float));
|
||||
|
||||
const int tile_size = 16;
|
||||
cl::NDRange local_size(tile_size, tile_size);
|
||||
cl::NDRange global_size(((rows + tile_size - 1) / tile_size) * tile_size,
|
||||
((m.getCols() + tile_size - 1) / tile_size) *
|
||||
tile_size);
|
||||
|
||||
kernels[Method::MULT].setArg(0, *buffer);
|
||||
kernels[Method::MULT].setArg(1, *m.getBuffer());
|
||||
kernels[Method::MULT].setArg(2, *b);
|
||||
kernels[Method::MULT].setArg(3, rows);
|
||||
kernels[Method::MULT].setArg(4, m.getCols());
|
||||
kernels[Method::MULT].setArg(5, cols);
|
||||
cl::Event event;
|
||||
queue.enqueueNDRangeKernel(kernels[Method::MULT], cl::NullRange,
|
||||
global_size, local_size, nullptr, &event);
|
||||
|
||||
event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
|
||||
buffer = b;
|
||||
cols = m.getCols();
|
||||
}
|
||||
|
||||
void mult(float scalar) {
|
||||
cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
|
||||
rows * cols * sizeof(float));
|
||||
kernels[Method::SCALAR_MULT].setArg(0, *buffer);
|
||||
kernels[Method::SCALAR_MULT].setArg(1, *b);
|
||||
kernels[Method::SCALAR_MULT].setArg(2, scalar);
|
||||
kernels[Method::SCALAR_MULT].setArg(3, rows);
|
||||
kernels[Method::SCALAR_MULT].setArg(4, cols);
|
||||
cl::Event event;
|
||||
queue.enqueueNDRangeKernel(kernels[Method::SCALAR_MULT], cl::NullRange,
|
||||
cl::NDRange(rows, cols), cl::NullRange, nullptr,
|
||||
&event);
|
||||
|
||||
event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
|
||||
buffer = b;
|
||||
}
|
||||
|
||||
void add(Matrices::GPU &m, float a = 1.0f, float b = 1.0f) {
|
||||
validateSameDimensions(*this, m);
|
||||
|
||||
cl::Buffer *buf = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
|
||||
rows * cols * sizeof(float));
|
||||
kernels[Method::ADD].setArg(0, *buffer);
|
||||
kernels[Method::ADD].setArg(1, *m.getBuffer());
|
||||
kernels[Method::ADD].setArg(2, *buf);
|
||||
kernels[Method::ADD].setArg(3, a);
|
||||
kernels[Method::ADD].setArg(4, b);
|
||||
kernels[Method::ADD].setArg(5, rows);
|
||||
kernels[Method::ADD].setArg(6, cols);
|
||||
cl::Event event;
|
||||
queue.enqueueNDRangeKernel(kernels[Method::ADD], cl::NullRange,
|
||||
cl::NDRange(rows, cols), cl::NullRange, nullptr,
|
||||
&event);
|
||||
|
||||
event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
|
||||
buffer = buf;
|
||||
}
|
||||
|
||||
void add(float scalar) {
|
||||
cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
|
||||
rows * cols * sizeof(float));
|
||||
kernels[Method::SCALAR_ADD].setArg(0, *buffer);
|
||||
kernels[Method::SCALAR_ADD].setArg(1, *b);
|
||||
kernels[Method::SCALAR_ADD].setArg(2, scalar);
|
||||
kernels[Method::SCALAR_ADD].setArg(3, rows);
|
||||
kernels[Method::SCALAR_ADD].setArg(4, cols);
|
||||
cl::Event event;
|
||||
queue.enqueueNDRangeKernel(kernels[Method::SCALAR_ADD], cl::NullRange,
|
||||
cl::NDRange(rows, cols), cl::NullRange, nullptr,
|
||||
&event);
|
||||
|
||||
event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
|
||||
buffer = b;
|
||||
}
|
||||
};
|
||||
class CPU : public Matrices::CPU, public IMutableMatrix<Matrices::CPU> {
|
||||
|
||||
public:
|
||||
CPU(int rows, int cols, const std::vector<float> &matrix)
|
||||
: Matrices::CPU(rows, cols, matrix) {}
|
||||
|
||||
void mult(Matrices::CPU &m) {
|
||||
validateMultDimensions(*this, m);
|
||||
|
||||
std::vector<float> result(rows * m.getCols(), 0.0f);
|
||||
for (int i = 0; i < rows; i++) {
|
||||
for (int j = 0; j < m.getCols(); j++) {
|
||||
float sum = 0.0f;
|
||||
for (int k = 0; k < cols; k++) {
|
||||
sum += (*this)(i, k) * m(k, j);
|
||||
}
|
||||
result[i * m.getCols() + j] = sum;
|
||||
}
|
||||
}
|
||||
data = std::move(result);
|
||||
cols = m.getCols();
|
||||
}
|
||||
|
||||
void mult(float scalar) {
|
||||
for (int i = 0; i < rows; i++) {
|
||||
for (int j = 0; j < cols; j++) {
|
||||
data[i * cols + j] *= scalar;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void add(Matrices::CPU &m, float a = 1.0f, float b = 1.0f) {
|
||||
validateSameDimensions(*this, m);
|
||||
|
||||
std::vector<float> result(rows * cols, 0.0f);
|
||||
for (int i = 0; i < rows; i++) {
|
||||
for (int j = 0; j < cols; j++) {
|
||||
result[i * cols + j] = ((*this)(i, j) * a) + (m(i, j) * b);
|
||||
}
|
||||
}
|
||||
data = std::move(result);
|
||||
}
|
||||
|
||||
void add(float scalar) {
|
||||
for (int i = 0; i < rows; i++) {
|
||||
for (int j = 0; j < cols; j++) {
|
||||
data[i * cols + j] += scalar;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}; // namespace MutableMatrices
|
||||
|
||||
#endif
|
||||
121
src/math/opencl/opencl.cpp
Normal file
121
src/math/opencl/opencl.cpp
Normal file
@@ -0,0 +1,121 @@
|
||||
#include "opencl.hpp"
|
||||
|
||||
// Reads the entire contents of a kernel source file into a string.
// Throws std::runtime_error when the file cannot be opened.
std::string OpenCL::readProgram(const std::string &filePath) {
  std::ifstream source(filePath, std::ios::binary);
  if (!source.is_open()) {
    throw std::runtime_error("Cannot open file: " + filePath);
  }

  std::ostringstream contents;
  contents << source.rdbuf();
  return contents.str();
}
|
||||
// Compiles the OpenCL program found at `file` for the selected device.
// On a build failure the compiler log is written to stderr and the
// cl::Error is rethrown to the caller.
cl::Program OpenCL::compileProgram(const std::string &file) {
  const std::string source = readProgram(file);
  cl::Program program(context, source);
  try {
    program.build({device});
  } catch (cl::Error &) {
    std::cerr << "Build log:\n"
              << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device)
              << std::endl;
    throw;
  }
  return program;
}
|
||||
void OpenCL::loadPrograms() {
|
||||
for (const auto &entry : programPaths) {
|
||||
programs[entry.first] = compileProgram(entry.second);
|
||||
std::cout << "Loaded program: " << entry.second << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void OpenCL::initializeDevice() {
|
||||
std::vector<cl::Platform> platforms;
|
||||
cl::Platform::get(&platforms);
|
||||
|
||||
if (platforms.empty()) {
|
||||
throw std::runtime_error("No OpenCL platforms found");
|
||||
}
|
||||
|
||||
std::vector<cl::Device> devices;
|
||||
bool deviceFound = false;
|
||||
|
||||
for (const auto &platform : platforms) {
|
||||
try {
|
||||
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
|
||||
if (!devices.empty()) {
|
||||
deviceFound = true;
|
||||
break;
|
||||
}
|
||||
} catch (const cl::Error &) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (!deviceFound) {
|
||||
for (const auto &platform : platforms) {
|
||||
try {
|
||||
platform.getDevices(CL_DEVICE_TYPE_CPU, &devices);
|
||||
if (!devices.empty()) {
|
||||
deviceFound = true;
|
||||
break;
|
||||
}
|
||||
} catch (const cl::Error &) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!deviceFound) {
|
||||
throw std::runtime_error("No suitable OpenCL devices found");
|
||||
}
|
||||
|
||||
device = devices[0];
|
||||
context = cl::Context(device);
|
||||
defaultQueue = cl::CommandQueue(context, device);
|
||||
|
||||
std::cout << "Using device: " << device.getInfo<CL_DEVICE_NAME>()
|
||||
<< "\nPlatform: " << platforms[0].getInfo<CL_PLATFORM_NAME>()
|
||||
<< "\nCompute units: "
|
||||
<< device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()
|
||||
<< "\nGlobal memory: "
|
||||
<< device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / (1024 * 1024)
|
||||
<< " MB" << std::endl;
|
||||
}
|
||||
|
||||
// Constructs the OpenCL helper: selects a device and compiles all
// registered kernel programs. cl::Error details are logged to stderr
// before the exception is rethrown to the caller.
OpenCL::OpenCL() {
  try {
    initializeDevice();
    loadPrograms();
  } catch (const cl::Error &error) {
    std::cerr << "OpenCL error: " << error.what() << " (" << error.err()
              << ")" << std::endl;
    throw;
  }
}
|
||||
|
||||
// Returns the compiled program registered under `program`.
// Throws std::invalid_argument when loadPrograms never compiled it.
cl::Program &OpenCL::getProgram(Program program) {
  const auto found = programs.find(program);
  if (found != programs.end()) {
    return found->second;
  }
  throw std::invalid_argument("Program not loaded: " +
                              std::to_string(static_cast<int>(program)));
}
|
||||
|
||||
// Prints a human-readable summary of the selected device to stdout.
// Memory sizes are reported in MB (global) and KB (local).
void OpenCL::printDeviceInfo() const {
  std::cout << "=== OpenCL Device Info ===" << std::endl
            << "Name: " << device.getInfo<CL_DEVICE_NAME>() << std::endl
            << "Vendor: " << device.getInfo<CL_DEVICE_VENDOR>() << std::endl
            << "Version: " << device.getInfo<CL_DEVICE_VERSION>() << std::endl
            << "Compute Units: "
            << device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << std::endl
            << "Global Memory: "
            << device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / (1024 * 1024)
            << " MB" << std::endl
            << "Local Memory: "
            << device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() / 1024 << " KB"
            << std::endl
            << "Max Work Group Size: "
            << device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() << std::endl;
}
|
||||
@@ -1,7 +1,9 @@
|
||||
#ifndef OPENCL_H
|
||||
#define OPENCL_H
|
||||
#pragma once
|
||||
|
||||
#define CL_HPP_ENABLE_EXCEPTIONS
|
||||
#define CL_HPP_TARGET_OPENCL_VERSION 300
|
||||
#include <CL/opencl.hpp>
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
@@ -22,108 +24,14 @@ private:
|
||||
std::unordered_map<Program, std::string> programPaths = {
|
||||
{Program::MATRIX, "./kernels/matrix.cl"}};
|
||||
|
||||
std::string readProgram(const std::string &filePath) {
|
||||
std::ifstream file(filePath, std::ios::binary);
|
||||
if (!file.is_open()) {
|
||||
throw std::runtime_error("Cannot open file: " + filePath);
|
||||
}
|
||||
std::string readProgram(const std::string &filePath);
|
||||
cl::Program compileProgram(const std::string &file);
|
||||
void loadPrograms();
|
||||
|
||||
std::stringstream buffer;
|
||||
buffer << file.rdbuf();
|
||||
return buffer.str();
|
||||
}
|
||||
|
||||
cl::Program compileProgram(const std::string &file) {
|
||||
std::string source = readProgram(file);
|
||||
cl::Program program(context, source);
|
||||
try {
|
||||
program.build({device});
|
||||
} catch (cl::Error &e) {
|
||||
std::string build_log =
|
||||
program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
|
||||
std::cerr << "Build log:\n" << build_log << std::endl;
|
||||
throw;
|
||||
}
|
||||
return program;
|
||||
}
|
||||
|
||||
void loadPrograms() {
|
||||
for (const auto &[programType, filePath] : programPaths) {
|
||||
try {
|
||||
programs[programType] = compileProgram(filePath);
|
||||
std::cout << "Loaded program: " << filePath << std::endl;
|
||||
} catch (const std::exception &e) {
|
||||
std::cerr << "Failed to load program " << filePath << ": " << e.what()
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void initializeDevice() {
|
||||
std::vector<cl::Platform> platforms;
|
||||
cl::Platform::get(&platforms);
|
||||
|
||||
if (platforms.empty()) {
|
||||
throw std::runtime_error("No OpenCL platforms found");
|
||||
}
|
||||
|
||||
std::vector<cl::Device> devices;
|
||||
bool deviceFound = false;
|
||||
|
||||
for (const auto &platform : platforms) {
|
||||
try {
|
||||
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
|
||||
if (!devices.empty()) {
|
||||
deviceFound = true;
|
||||
break;
|
||||
}
|
||||
} catch (const cl::Error &) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (!deviceFound) {
|
||||
for (const auto &platform : platforms) {
|
||||
try {
|
||||
platform.getDevices(CL_DEVICE_TYPE_CPU, &devices);
|
||||
if (!devices.empty()) {
|
||||
deviceFound = true;
|
||||
break;
|
||||
}
|
||||
} catch (const cl::Error &) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!deviceFound) {
|
||||
throw std::runtime_error("No suitable OpenCL devices found");
|
||||
}
|
||||
|
||||
device = devices[0];
|
||||
context = cl::Context(device);
|
||||
defaultQueue = cl::CommandQueue(context, device);
|
||||
|
||||
std::cout << "Using device: " << device.getInfo<CL_DEVICE_NAME>()
|
||||
<< "\nPlatform: " << platforms[0].getInfo<CL_PLATFORM_NAME>()
|
||||
<< "\nCompute units: "
|
||||
<< device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()
|
||||
<< "\nGlobal memory: "
|
||||
<< device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / (1024 * 1024)
|
||||
<< " MB" << std::endl;
|
||||
}
|
||||
void initializeDevice();
|
||||
|
||||
public:
|
||||
OpenCL() {
|
||||
try {
|
||||
initializeDevice();
|
||||
loadPrograms();
|
||||
} catch (const cl::Error &e) {
|
||||
std::cerr << "OpenCL error: " << e.what() << " (" << e.err() << ")"
|
||||
<< std::endl;
|
||||
throw;
|
||||
}
|
||||
}
|
||||
OpenCL();
|
||||
|
||||
OpenCL(const OpenCL &) = delete;
|
||||
OpenCL &operator=(const OpenCL &) = delete;
|
||||
@@ -134,38 +42,8 @@ public:
|
||||
cl::Context &getContext() { return context; }
|
||||
cl::CommandQueue &getDefaultQueue() { return defaultQueue; }
|
||||
|
||||
cl::Program &getProgram(Program program) {
|
||||
auto it = programs.find(program);
|
||||
if (it == programs.end()) {
|
||||
throw std::invalid_argument("Program not loaded: " +
|
||||
std::to_string(static_cast<int>(program)));
|
||||
}
|
||||
return it->second;
|
||||
}
|
||||
|
||||
void printDeviceInfo() const {
|
||||
std::cout << "=== OpenCL Device Info ===" << std::endl;
|
||||
std::cout << "Name: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
|
||||
std::cout << "Vendor: " << device.getInfo<CL_DEVICE_VENDOR>() << std::endl;
|
||||
std::cout << "Version: " << device.getInfo<CL_DEVICE_VERSION>()
|
||||
<< std::endl;
|
||||
std::cout << "Compute Units: "
|
||||
<< device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << std::endl;
|
||||
std::cout << "Global Memory: "
|
||||
<< device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / (1024 * 1024)
|
||||
<< " MB" << std::endl;
|
||||
std::cout << "Local Memory: "
|
||||
<< device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() / 1024 << " KB"
|
||||
<< std::endl;
|
||||
std::cout << "Max Work Group Size: "
|
||||
<< device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() << std::endl;
|
||||
}
|
||||
|
||||
bool hasProgram(Program program) const {
|
||||
return programs.find(program) != programs.end();
|
||||
}
|
||||
cl::Program &getProgram(Program program);
|
||||
void printDeviceInfo() const;
|
||||
};
|
||||
|
||||
extern OpenCL openCL;
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user