GPU vs CPU math test complete

2026-07-28 19:35:49 +04:00 · 2025-10-29 03:10:47 +04:00
parent d6dd49c9da
commit 2955fbbe42
10 changed files with 694 additions and 271 deletions
--- a/src/device.hpp
+++ b/src/device.hpp
@@ -1,131 +0,0 @@
-#ifndef DEVICE_H
-#define DEVICE_H
-
-#include <CL/cl.h>
-
-#include <iostream>
-#include <ostream>
-#include <string>
-#include <vector>
-
-#include "opencl.hpp"
-
-class CalcEngine {
-private:
-  cl_platform_id platform;
-  cl_device_id device;
-  cl_context context;
-  std::string device_name;
-
-  void initializeOpenCL() {
-    OpenCL::checkError(clGetPlatformIDs(1, &platform, nullptr),
-                       "clGetPlatformIDs");
-    OpenCL::checkError(
-        clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, nullptr),
-        "clGetDeviceIDs");
-
-    char name[128];
-    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(name), name, nullptr);
-    device_name = name;
-
-    context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, nullptr);
-    if (!context) {
-      throw OpenCLException(-1, "clCreateContext");
-    }
-
-    std::cout << "OpenCL initialized successfully" << std::endl;
-  }
-
-  void cleanup() {
-    if (context)
-      clReleaseContext(context);
-  }
-
-public:
-  CalcEngine() { initializeOpenCL(); }
-
-  ~CalcEngine() { cleanup(); }
-
-  const cl_platform_id getPlatform() const { return platform; };
-  const cl_device_id getDevice() const { return device; };
-  const cl_context getContext() const { return context; };
-  const std::string getDeviceName() const { return device_name; };
-
-  void printDeviceInfo() const {
-    std::cout << "Using OpenCL device: " << device_name << std::endl;
-  }
-
-  cl_mem createBuffer(cl_mem_flags flags, size_t size, void *host_ptr) {
-    cl_int ret;
-    cl_mem buffer = clCreateBuffer(context, flags, size, host_ptr, &ret);
-    OpenCL::checkError(ret, "clCreateBuffer");
-    return buffer;
-  }
-
-  cl_kernel loadKernel(const std::string &filename) {
-    std::string kernelSource = OpenCL::readFile(filename);
-
-    const char *source_str = kernelSource.c_str();
-    cl_program program =
-        clCreateProgramWithSource(context, 1, &source_str, nullptr, nullptr);
-    if (!program) {
-      throw OpenCLException(-1, "clCreateProgramWithSource");
-    }
-
-    cl_int ret = clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr);
-    if (ret != CL_SUCCESS) {
-      size_t log_size;
-      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, nullptr,
-                            &log_size);
-      std::vector<char> log(log_size);
-      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size,
-                            log.data(), nullptr);
-
-      std::cerr << "Build log:\n" << log.data() << std::endl;
-      throw OpenCLException(ret, "clBuildProgram");
-    }
-
-    cl_kernel kernel = clCreateKernel(program, "matrix_mult", nullptr);
-    if (!kernel) {
-      throw OpenCLException(-1, "clCreateKernel");
-    }
-
-    std::cout << "Kernel loaded and compiled successfully" << std::endl;
-
-    return kernel;
-  }
-
-  void runKernel(cl_command_queue queue, cl_kernel kernel, int M, int N) {
-    size_t globalSize[2] = {static_cast<size_t>(M), static_cast<size_t>(N)};
-    OpenCL::checkError(clEnqueueNDRangeKernel(queue, kernel, 2, nullptr,
-                                              globalSize, nullptr, 0, nullptr,
-                                              nullptr),
-                       "clEnqueueNDRangeKernel");
-  }
-
-  void readResult(cl_command_queue queue, cl_mem buf,
-                  std::vector<float> &result) {
-    OpenCL::checkError(clEnqueueReadBuffer(queue, buf, CL_TRUE, 0,
-                                           result.size() * sizeof(float),
-                                           result.data(), 0, nullptr, nullptr),
-                       "clEnqueueReadBuffer");
-  }
-
-  void setKernelArgs(cl_kernel kernel, cl_mem bufA, cl_mem bufB, cl_mem bufC,
-                     int M, int N, int K) {
-    OpenCL::checkError(clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA),
-                       "clSetKernelArg for A");
-    OpenCL::checkError(clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB),
-                       "clSetKernelArg for B");
-    OpenCL::checkError(clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufC),
-                       "clSetKernelArg for C");
-    OpenCL::checkError(clSetKernelArg(kernel, 3, sizeof(int), &M),
-                       "clSetKernelArg for M");
-    OpenCL::checkError(clSetKernelArg(kernel, 4, sizeof(int), &N),
-                       "clSetKernelArg for N");
-    OpenCL::checkError(clSetKernelArg(kernel, 5, sizeof(int), &K),
-                       "clSetKernelArg for K");
-  }
-};
-
-#endif
--- a/src/kernels/matrix.cl
+++ b/src/kernels/matrix.cl
@@ -0,0 +1,72 @@
+// Tiled single-precision matrix product C = A * B for row-major matrices,
+// where A is M x K, B is K x N and C is M x N.
+//
+// Dimension 0 of the NDRange indexes rows of C and dimension 1 indexes columns.
+// Each work-group stages 16 x 16 tiles of A and B in __local memory, so the
+// kernel is written for a 16 x 16 work-group. Tile entries that fall outside
+// the matrices are zero-filled, so M, N and K need not be multiples of 16.
+__kernel void mult(__global float* A, __global float* B, __global float* C,
+                            int M, int N, int K) {
+    const int tile_size = 16;
+
+    const int local_i = get_local_id(0);
+    const int local_j = get_local_id(1);
+
+    // Row and column of C handled by this work-item.
+    const int global_i = get_group_id(0) * get_local_size(0) + local_i;
+    const int global_j = get_group_id(1) * get_local_size(1) + local_j;
+
+    __local float tile_A[16][16];
+    __local float tile_B[16][16];
+
+    float sum = 0.0f;
+
+    const int num_tiles = (K + tile_size - 1) / tile_size;
+
+    for (int tile = 0; tile < num_tiles; tile++) {
+        // Column of A and row of B that this work-item stages for the tile.
+        const int a_col = tile * tile_size + local_j;
+        const int b_row = tile * tile_size + local_i;
+
+        // tile_A[k][i] holds A[row i of the group][tile column k].
+        tile_A[local_j][local_i] =
+            (global_i < M && a_col < K) ? A[global_i * K + a_col] : 0.0f;
+
+        // tile_B[j][k] holds B[tile row k][column j of the group].
+        tile_B[local_j][local_i] =
+            (b_row < K && global_j < N) ? B[b_row * N + global_j] : 0.0f;
+
+        // Both tiles must be fully written before any work-item reads them.
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        #pragma unroll
+        for (int k = 0; k < tile_size; k++) {
+            sum += tile_A[k][local_i] * tile_B[local_j][k];
+        }
+
+        // Do not overwrite the tiles while other work-items still read them.
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // Work-items in the padded part of the NDRange have nothing to store.
+    if (global_i < M && global_j < N) {
+        C[global_i * N + global_j] = sum;
+    }
+}
+
+// Element-wise scaling of a row-major M x N matrix: B[i][j] = A[i][j] * scalar.
+// One work-item per element; dimension 0 indexes rows, dimension 1 columns.
+__kernel void mult_sc(__global float* A, __global float* B, float scalar, int M, int N) {
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+    // Ignore work-items outside the matrix so that a padded global range
+    // cannot read or write out of bounds.
+    if (i >= M || j >= N) {
+        return;
+    }
+    B[i * N + j] = A[i * N + j] * scalar;
+}
+
+// Element-wise weighted sum of two row-major M x N matrices:
+// C[i][j] = A[i][j] * a + B[i][j] * b.
+// One work-item per element; dimension 0 indexes rows, dimension 1 columns.
+__kernel void add(__global float* A, __global float* B, __global float* C, float a, float b, int M, int N) {
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+    // Ignore work-items outside the matrix so that a padded global range
+    // cannot read or write out of bounds.
+    if (i >= M || j >= N) {
+        return;
+    }
+    C[i * N + j] = (A[i * N + j] * a) + (B[i * N + j] * b);
+}
+
+// Adds a scalar to every element of a row-major M x N matrix:
+// B[i][j] = A[i][j] + scalar.
+// One work-item per element; dimension 0 indexes rows, dimension 1 columns.
+__kernel void add_sc(__global float* A, __global float* B, float scalar, int M, int N) {
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+    // Ignore work-items outside the matrix so that a padded global range
+    // cannot read or write out of bounds.
+    if (i >= M || j >= N) {
+        return;
+    }
+    B[i * N + j] = A[i * N + j] + scalar;
+}
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,74 +1,135 @@
-#include <CL/cl.h>
-
+#include <chrono>
+#include <iostream>
+#include <random>
 #include <stdexcept>
 #include <vector>

-#include "device.hpp"
-#include "matrix.hpp"
+#include "./math/math.hpp"

-class MutableMatrix : public Matrix {
-private:
-  CalcEngine *calcEngine;
-  cl_command_queue queue;
-  cl_kernel kernel;
+typedef Matrices::CPU Matrix;
+typedef MutableMatrices::CPU MutableMatrix;

-public:
-  MutableMatrix(CalcEngine &calcEngine, size_t rows, size_t cols, float *matrix)
-      : Matrix(calcEngine, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, rows, cols,
-               matrix) {
-    this->calcEngine = &calcEngine;
-    kernel = calcEngine.loadKernel("matrix_mult.cl");
-    queue = clCreateCommandQueue(calcEngine.getContext(),
-                                 calcEngine.getDevice(), 0, nullptr);
-    if (!queue) {
-      throw OpenCLException(-1, "clCreateCommandQueue");
+OpenCL openCL;
+
+std::vector<float> generateRandomMatrix(int rows, int cols) {
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_real_distribution<float> dis(-1.0f, 1.0f);
+
+  std::vector<float> matrix(rows * cols);
+  for (int i = 0; i < rows * cols; ++i) {
+    matrix[i] = dis(gen);
  }
+  return matrix;
+}
+std::vector<float> generateIdentityMatrix(int size) {
+  std::vector<float> matrix(size * size, 0.0f);
+  for (int i = 0; i < size; ++i) {
+    matrix[i * size + i] = 1.0f;
  }
-
-  ~MutableMatrix() {
-    if (queue)
-      clReleaseCommandQueue(queue);
-  }
-
-  void mult_by(Matrix &m) {
-    if (cols != m.getRows()) {
-      throw std::invalid_argument("Invalid matrix dimensions");
-    }
-
-    cl_mem b =
-        calcEngine->createBuffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
-                                 rows * m.getCols() * sizeof(float), nullptr);
-
-    calcEngine->setKernelArgs(kernel, buf, m.getBuf(), b, rows, m.getCols(),
-                              cols);
-    calcEngine->runKernel(queue, kernel, rows, m.getCols());
-
-    clReleaseMemObject(buf);
-    buf = b;
-  }
-
-  std::vector<float> exportMatrix() {
-    std::vector<float> C(rows, cols);
-    calcEngine->readResult(queue, buf, C);
-    return C;
-  }
-};
+  return matrix;
+}

 int main() {
-  CalcEngine calcEngine;
-  calcEngine.printDeviceInfo();
+  const int SIZE = 1024;

-  float matrixA[2 * 3] = {1, 2, 3, 4, 5, 6};
-  MutableMatrix a(calcEngine, 2, 3, matrixA);
+  std::cout << "Testing with " << SIZE << "x" << SIZE << " matrices..."
+            << std::endl;

-  float matrixB[3 * 2] = {1, 2, 3, 4, 5, 6};
-  Matrix b(calcEngine, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 3, 2, matrixB);
+  std::vector<float> matrixA = generateRandomMatrix(SIZE, SIZE);
+  std::vector<float> matrixB = generateRandomMatrix(SIZE, SIZE);
+  std::vector<float> matrixC = generateRandomMatrix(SIZE, SIZE);

-  a.mult_by(b);
+  // std::vector<float> matrixA = generateIdentityMatrix(SIZE);
+  // std::vector<float> matrixB = generateIdentityMatrix(SIZE);
+  // std::vector<float> matrixC = generateIdentityMatrix(SIZE);

-  std::vector<float> v = a.exportMatrix();
-  for (const auto &element : v) {
-    std::cout << element << " ";
+  // Тестирование на CPU
+  {
+    std::cout << "\n=== CPU Version ===" << std::endl;
+
+    auto start = std::chrono::high_resolution_clock::now();
+
+    MutableMatrices::CPU a(SIZE, SIZE, matrixA);
+    Matrices::CPU b(SIZE, SIZE, matrixB);
+    Matrices::CPU c(SIZE, SIZE, matrixC);
+
+    auto gen_end = std::chrono::high_resolution_clock::now();
+
+    auto op_start = std::chrono::high_resolution_clock::now();
+
+    for (int i = 0; i < 10; i++) {
+      a.mult(b);
+    }
+
+    auto op_end = std::chrono::high_resolution_clock::now();
+
+    std::vector<float> v = a.toVector();
+
+    auto total_end = std::chrono::high_resolution_clock::now();
+
+    auto gen_duration =
+        std::chrono::duration_cast<std::chrono::milliseconds>(gen_end - start);
+    auto op_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+        op_end - op_start);
+    auto total_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+        total_end - start);
+
+    std::cout << "Matrix generation time: " << gen_duration.count() << " ms"
+              << std::endl;
+    std::cout << "Operations time: " << op_duration.count() << " ms"
+              << std::endl;
+    std::cout << "Total time: " << total_duration.count() << " ms" << std::endl;
+
+    std::cout << "First few elements: ";
+    for (int i = 0; i < 5 && i < v.size(); ++i) {
+      std::cout << v[i] << " ";
+    }
+    std::cout << std::endl;
+  }
+
+  // Тестирование на GPU
+  {
+    std::cout << "\n=== GPU Version ===" << std::endl;
+
+    auto start = std::chrono::high_resolution_clock::now();
+
+    MutableMatrices::GPU a(SIZE, SIZE, matrixA);
+    Matrices::GPU b(SIZE, SIZE, matrixB);
+    Matrices::GPU c(SIZE, SIZE, matrixC);
+
+    auto gen_end = std::chrono::high_resolution_clock::now();
+
+    auto op_start = std::chrono::high_resolution_clock::now();
+
+    for (int i = 0; i < 10; i++) {
+      a.mult(b);
+    }
+
+    auto op_end = std::chrono::high_resolution_clock::now();
+
+    std::vector<float> v = a.toVector();
+
+    auto total_end = std::chrono::high_resolution_clock::now();
+
+    auto gen_duration =
+        std::chrono::duration_cast<std::chrono::milliseconds>(gen_end - start);
+    auto op_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+        op_end - op_start);
+    auto total_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+        total_end - start);
+
+    std::cout << "Matrix generation time: " << gen_duration.count() << " ms"
+              << std::endl;
+    std::cout << "Operations time: " << op_duration.count() << " ms"
+              << std::endl;
+    std::cout << "Total time: " << total_duration.count() << " ms" << std::endl;
+
+    std::cout << "First few elements: ";
+    for (int i = 0; i < 5 && i < v.size(); ++i) {
+      std::cout << v[i] << " ";
+    }
+    std::cout << std::endl;
  }

  return 0;
--- a/src/math/math.hpp
+++ b/src/math/math.hpp
@@ -0,0 +1,11 @@
+#ifndef MATH_H
+#define MATH_H
+
+#define __CL_ENABLE_EXCEPTIONS
+#include <CL/opencl.hpp>
+
+#include "matrix.hpp"
+#include "mutable_matrix.hpp"
+#include "opencl/opencl.hpp"
+
+#endif
--- a/src/math/matrix.hpp
+++ b/src/math/matrix.hpp
@@ -0,0 +1,126 @@
+#ifndef MATRIX_H
+#define MATRIX_H
+
+#include "./opencl/opencl.hpp"
+#include <algorithm>
+#include <memory>
+#include <stdexcept>
+#include <vector>
+
+// Abstract base of all matrix types: stores the dimensions and provides
+// validation helpers for derived classes.
+class IMatrix {
+protected:
+  int rows;
+  int cols;
+
+  // Throws std::invalid_argument unless both dimensions are strictly positive.
+  void validateDimensions(int rows, int cols) {
+    const bool bothPositive = rows > 0 && cols > 0;
+    if (!bothPositive) {
+      throw std::invalid_argument("Matrix dimensions must be positive");
+    }
+  }
+
+  // Throws std::out_of_range unless (row, col) lies inside this matrix.
+  void checkIndices(int row, int col) const {
+    const bool rowInRange = 0 <= row && row < rows;
+    const bool colInRange = 0 <= col && col < cols;
+    if (!(rowInRange && colInRange)) {
+      throw std::out_of_range("Matrix indices out of range");
+    }
+  }
+
+public:
+  // Stores the dimensions as given; no validation is performed here.
+  IMatrix(int rows, int cols) : rows(rows), cols(cols) {}
+  virtual ~IMatrix() = default;
+
+  virtual int getRows() const = 0;
+  virtual int getCols() const = 0;
+
+  // Returns the matrix elements as a flat vector.
+  virtual const std::vector<float> toVector() const = 0;
+};
+
+namespace Matrices {
+class CPU;
+
+// Matrix whose elements live in an OpenCL buffer created on the context of the
+// global `openCL` instance. Each matrix has its own command queue.
+//
+// `buffer` is an owning raw pointer (derived classes replace it), therefore
+// copying is disabled and moving transfers the pointer. A moved-from object
+// holds no buffer and may only be destroyed or assigned to.
+class GPU : public IMatrix {
+protected:
+  cl::Buffer *buffer;
+  cl::CommandQueue queue;
+
+public:
+  // Uploads `matrix` (rows * cols floats) into a new device buffer.
+  // Throws std::invalid_argument if a dimension is not positive or if the
+  // amount of data does not match the dimensions.
+  GPU(int rows, int cols, const std::vector<float> &matrix)
+      : IMatrix(rows, cols), buffer(nullptr),
+        queue(openCL.getContext(), openCL.getDevice()) {
+    validateDimensions(rows, cols);
+    // Multiply in size_t so that large dimensions cannot overflow int.
+    const size_t elements = static_cast<size_t>(rows) * cols;
+    if (matrix.size() != elements) {
+      throw std::invalid_argument("Matrix data size doesn't match dimensions");
+    }
+
+    // CL_MEM_COPY_HOST_PTR copies the data during creation, so the host
+    // vector is only read; const_cast is needed because the API takes void*.
+    buffer = new cl::Buffer(
+        openCL.getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        elements * sizeof(float), const_cast<float *>(matrix.data()));
+  }
+  ~GPU() { delete buffer; }
+
+  GPU(const GPU &) = delete;
+  GPU &operator=(const GPU &) = delete;
+
+  // Takes over the buffer of `other`. The source pointer is cleared so that
+  // the two destructors do not delete the same buffer twice (which a
+  // defaulted move would cause). The queue handle is shared with `other`.
+  GPU(GPU &&other)
+      : IMatrix(other), buffer(other.buffer), queue(other.queue) {
+    other.buffer = nullptr;
+  }
+
+  // Releases the current buffer and takes over the one owned by `other`.
+  GPU &operator=(GPU &&other) {
+    if (this != &other) {
+      delete buffer;
+      rows = other.rows;
+      cols = other.cols;
+      buffer = other.buffer;
+      other.buffer = nullptr;
+      queue = other.queue;
+    }
+    return *this;
+  }
+
+  int getRows() const override { return rows; }
+  int getCols() const override { return cols; }
+  size_t getSize() const { return static_cast<size_t>(rows) * cols; }
+
+  const cl::Buffer *getBuffer() const { return buffer; }
+
+  // Reads the device buffer back into host memory with a blocking read on
+  // this matrix's queue.
+  const std::vector<float> toVector() const override {
+    const size_t elements = static_cast<size_t>(rows) * cols;
+    std::vector<float> result(elements);
+    queue.enqueueReadBuffer(*buffer, CL_TRUE, 0, elements * sizeof(float),
+                            result.data());
+    queue.finish();
+    return result;
+  }
+
+  // Downloads the data into a host-side matrix.
+  CPU toCPU() const;
+};
+
+// Matrix stored in host memory as a flat row-major std::vector<float>.
+class CPU : public IMatrix {
+protected:
+  std::vector<float> data;
+
+public:
+  // Creates a rows x cols matrix with every element set to `value`.
+  // Throws std::invalid_argument if a dimension is not positive. The
+  // dimensions are validated before any storage is allocated.
+  CPU(int rows, int cols, float value = 0.0f) : IMatrix(rows, cols) {
+    validateDimensions(rows, cols);
+    data.assign(static_cast<size_t>(rows) * cols, value);
+  }
+
+  // Creates a rows x cols matrix holding a copy of `matrix`.
+  // Throws std::invalid_argument if a dimension is not positive or if
+  // `matrix` does not contain exactly rows * cols elements. The input is
+  // copied only after both checks have passed.
+  CPU(int rows, int cols, const std::vector<float> &matrix)
+      : IMatrix(rows, cols) {
+    validateDimensions(rows, cols);
+    if (matrix.size() != static_cast<size_t>(rows) * cols) {
+      throw std::invalid_argument("Data size doesn't match matrix dimensions");
+    }
+    data = matrix;
+  }
+
+  CPU(const CPU &) = default;
+  CPU &operator=(const CPU &) = default;
+  CPU(CPU &&) = default;
+  CPU &operator=(CPU &&) = default;
+  ~CPU() override = default;
+
+  // Bounds-checked element access; throws std::out_of_range for indices
+  // outside the matrix.
+  float &operator()(int row, int col) {
+    checkIndices(row, col);
+    return data[row * cols + col];
+  }
+
+  const float &operator()(int row, int col) const {
+    checkIndices(row, col);
+    return data[row * cols + col];
+  }
+
+  // Returns a copy of the elements in row-major order.
+  const std::vector<float> toVector() const override { return data; }
+
+  int getRows() const override { return rows; }
+  int getCols() const override { return cols; }
+  size_t getSize() const { return data.size(); }
+
+  // Uploads the data into a new GPU matrix. The parameter is not used: the
+  // GPU constructor works with the global `openCL` instance.
+  GPU toGPU(OpenCL &openCL) const { return GPU(rows, cols, data); }
+};
+
+// Downloads the device data into a host-side matrix of the same dimensions.
+// Defined out of class because CPU is only forward-declared inside GPU, and
+// marked inline because this header may be included by several translation
+// units.
+inline CPU GPU::toCPU() const { return CPU(rows, cols, toVector()); }
+
+} // namespace Matrices
+
+#endif
--- a/src/math/mutable_matrix.hpp
+++ b/src/math/mutable_matrix.hpp
@@ -0,0 +1,194 @@
+#ifndef MUTABLE_MATRIX_H
+#define MUTABLE_MATRIX_H
+
+#include "./opencl/opencl.hpp"
+
+#include "matrix.hpp"
+
+// Interface of matrices that can be modified in place. T is the matrix type
+// accepted as the right-hand operand and must derive from IMatrix.
+template <typename T> class IMutableMatrix {
+  static_assert(std::is_base_of<IMatrix, T>::value,
+                "T must be derived from IMatrix");
+
+public:
+  virtual ~IMutableMatrix() = default;
+
+  // Replaces this matrix with the matrix product (this * m).
+  virtual void mult(T &m) = 0;
+  // Multiplies every element by s.
+  virtual void mult(float s) = 0;
+  // Replaces this matrix with (this * a + m * b), element by element.
+  virtual void add(T &m, float a, float b) = 0;
+  // Adds a to every element.
+  virtual void add(float a) = 0;
+
+  // Throws std::invalid_argument unless the product a * b is defined, i.e.
+  // the number of columns of a equals the number of rows of b.
+  void validateMultDimensions(T &a, T &b) {
+    if (a.getCols() != b.getRows()) {
+      throw std::invalid_argument(
+          "Invalid matrix dimensions for multiplication");
+    }
+  }
+
+  // Throws std::invalid_argument unless a and b have identical dimensions.
+  void validateSameDimensions(T &a, T &b) {
+    if (a.getRows() != b.getRows() || a.getCols() != b.getCols()) {
+      throw std::invalid_argument("Invalid matrix dimensions for addition");
+    }
+  }
+};
+
+namespace MutableMatrices {
+// Matrix stored in an OpenCL buffer and modified in place by the kernels of
+// the MATRIX program (kernelsNames maps each operation to its kernel name).
+//
+// Every operation writes into a newly allocated buffer, enqueues its kernel on
+// `queue` without waiting for completion and makes the new buffer current.
+// The previous buffer is deleted by an event callback once the kernel ends.
+class GPU : public Matrices::GPU, public IMutableMatrix<Matrices::GPU> {
+private:
+  enum class Method { MULT, SCALAR_MULT, ADD, SCALAR_ADD };
+  std::unordered_map<Method, cl::Kernel> kernels;
+  std::unordered_map<Method, std::string> kernelsNames = {
+      {Method::MULT, "mult"},
+      {Method::SCALAR_MULT, "mult_sc"},
+      {Method::ADD, "add"},
+      {Method::SCALAR_ADD, "add_sc"}};
+
+  // Event callback that frees a buffer replaced by an operation. `buf` is the
+  // cl::Buffer* handed to setCallback as user data. It must be cast back
+  // before deletion: deleting through void* is undefined behaviour and does
+  // not run ~cl::Buffer. The buffer is freed regardless of the final status
+  // so that a failed kernel does not leak it.
+  static void CL_CALLBACK releaseBuffer(cl_event /*event*/, cl_int /*status*/,
+                                        void *buf) {
+    delete static_cast<cl::Buffer *>(buf);
+  }
+
+  // Allocates a read-write device buffer for `elements` floats. The smart
+  // pointer frees it again if a later step of the operation throws.
+  static std::unique_ptr<cl::Buffer> makeResultBuffer(size_t elements) {
+    return std::unique_ptr<cl::Buffer>(new cl::Buffer(
+        openCL.getContext(), CL_MEM_READ_WRITE, elements * sizeof(float)));
+  }
+
+  // Schedules the current buffer for deletion when `done` completes and
+  // makes `result` the current buffer.
+  void replaceBuffer(cl::Event &done, std::unique_ptr<cl::Buffer> &result) {
+    done.setCallback(CL_COMPLETE, releaseBuffer, buffer);
+    buffer = result.release();
+  }
+
+public:
+  // Uploads `matrix` and creates one kernel object per supported operation.
+  GPU(int rows, int cols, const std::vector<float> &matrix)
+      : Matrices::GPU(rows, cols, matrix) {
+    for (const auto &[method, kernelName] : kernelsNames) {
+      kernels[method] =
+          cl::Kernel(openCL.getProgram(OpenCL::Program::MATRIX), kernelName);
+    }
+  }
+
+  // Replaces this matrix with the product (this * m). The result has the
+  // row count of this matrix and the column count of m.
+  void mult(Matrices::GPU &m) override {
+    validateMultDimensions(*this, m);
+
+    const int resultCols = m.getCols();
+    std::unique_ptr<cl::Buffer> result =
+        makeResultBuffer(static_cast<size_t>(rows) * resultCols);
+
+    // The kernel uses 16 x 16 tiles, so the global range is rounded up to a
+    // multiple of the tile size in both dimensions.
+    const int tile_size = 16;
+    cl::NDRange local_size(tile_size, tile_size);
+    cl::NDRange global_size(((rows + tile_size - 1) / tile_size) * tile_size,
+                            ((resultCols + tile_size - 1) / tile_size) *
+                                tile_size);
+
+    cl::Kernel &kernel = kernels[Method::MULT];
+    kernel.setArg(0, *buffer);
+    kernel.setArg(1, *m.getBuffer());
+    kernel.setArg(2, *result);
+    kernel.setArg(3, rows);
+    kernel.setArg(4, resultCols);
+    kernel.setArg(5, cols);
+    cl::Event event;
+    queue.enqueueNDRangeKernel(kernel, cl::NullRange, global_size, local_size,
+                               nullptr, &event);
+
+    replaceBuffer(event, result);
+    cols = resultCols;
+  }
+
+  // Multiplies every element by `scalar`.
+  void mult(float scalar) override {
+    std::unique_ptr<cl::Buffer> result =
+        makeResultBuffer(static_cast<size_t>(rows) * cols);
+
+    cl::Kernel &kernel = kernels[Method::SCALAR_MULT];
+    kernel.setArg(0, *buffer);
+    kernel.setArg(1, *result);
+    kernel.setArg(2, scalar);
+    kernel.setArg(3, rows);
+    kernel.setArg(4, cols);
+    cl::Event event;
+    queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(rows, cols),
+                               cl::NullRange, nullptr, &event);
+
+    replaceBuffer(event, result);
+  }
+
+  // Replaces this matrix with (this * a + m * b), element by element.
+  // Throws std::invalid_argument if the dimensions differ.
+  void add(Matrices::GPU &m, float a = 1.0f, float b = 1.0f) override {
+    validateSameDimensions(*this, m);
+
+    std::unique_ptr<cl::Buffer> result =
+        makeResultBuffer(static_cast<size_t>(rows) * cols);
+
+    cl::Kernel &kernel = kernels[Method::ADD];
+    kernel.setArg(0, *buffer);
+    kernel.setArg(1, *m.getBuffer());
+    kernel.setArg(2, *result);
+    kernel.setArg(3, a);
+    kernel.setArg(4, b);
+    kernel.setArg(5, rows);
+    kernel.setArg(6, cols);
+    cl::Event event;
+    queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(rows, cols),
+                               cl::NullRange, nullptr, &event);
+
+    replaceBuffer(event, result);
+  }
+
+  // Adds `scalar` to every element.
+  void add(float scalar) override {
+    std::unique_ptr<cl::Buffer> result =
+        makeResultBuffer(static_cast<size_t>(rows) * cols);
+
+    cl::Kernel &kernel = kernels[Method::SCALAR_ADD];
+    kernel.setArg(0, *buffer);
+    kernel.setArg(1, *result);
+    kernel.setArg(2, scalar);
+    kernel.setArg(3, rows);
+    kernel.setArg(4, cols);
+    cl::Event event;
+    queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(rows, cols),
+                               cl::NullRange, nullptr, &event);
+
+    replaceBuffer(event, result);
+  }
+};
+// Host-memory matrix that supports in-place arithmetic.
+class CPU : public Matrices::CPU, public IMutableMatrix<Matrices::CPU> {
+
+public:
+  CPU(int rows, int cols, const std::vector<float> &matrix)
+      : Matrices::CPU(rows, cols, matrix) {}
+
+  // Replaces this matrix with the product (this * m), computed with the
+  // straightforward triple loop and bounds-checked element access.
+  void mult(Matrices::CPU &m) {
+    validateMultDimensions(*this, m);
+
+    const int outCols = m.getCols();
+    std::vector<float> product(rows * outCols, 0.0f);
+    for (int r = 0; r < rows; ++r) {
+      for (int c = 0; c < outCols; ++c) {
+        float acc = 0.0f;
+        for (int inner = 0; inner < cols; ++inner) {
+          acc += (*this)(r, inner) * m(inner, c);
+        }
+        product[r * outCols + c] = acc;
+      }
+    }
+    // Commit only after the whole product has been computed.
+    data = std::move(product);
+    cols = outCols;
+  }
+
+  // Multiplies every element by `scalar`.
+  void mult(float scalar) {
+    for (float &element : data) {
+      element *= scalar;
+    }
+  }
+
+  // Replaces this matrix with (this * a + m * b), element by element.
+  void add(Matrices::CPU &m, float a = 1.0f, float b = 1.0f) {
+    validateSameDimensions(*this, m);
+
+    std::vector<float> combined;
+    combined.reserve(rows * cols);
+    for (int r = 0; r < rows; ++r) {
+      for (int c = 0; c < cols; ++c) {
+        combined.push_back(((*this)(r, c) * a) + (m(r, c) * b));
+      }
+    }
+    data = std::move(combined);
+  }
+
+  // Adds `scalar` to every element.
+  void add(float scalar) {
+    for (float &element : data) {
+      element += scalar;
+    }
+  }
+};
+}; // namespace MutableMatrices
+
+#endif
--- a/src/math/opencl/opencl.hpp
+++ b/src/math/opencl/opencl.hpp
@@ -0,0 +1,171 @@
+#ifndef OPENCL_H
+#define OPENCL_H
+
+#include <CL/opencl.hpp>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <unordered_map>
+
+// Owns the OpenCL device, context and default command queue used by the
+// application and keeps the compiled kernel programs.
+//
+// The constructor selects a device (a GPU if any platform offers one,
+// otherwise a CPU) and then compiles every program listed in programPaths.
+class OpenCL {
+public:
+  enum class Program { MATRIX, MATH, IMAGE_PROCESSING };
+
+private:
+  cl::Device device;
+  cl::Context context;
+  cl::CommandQueue defaultQueue;
+
+  std::unordered_map<Program, cl::Program> programs;
+  std::unordered_map<Program, std::string> programPaths = {
+      {Program::MATRIX, "./kernels/matrix.cl"}};
+
+  // Returns the whole content of `filePath`.
+  // Throws std::runtime_error if the file cannot be opened.
+  std::string readProgram(const std::string &filePath) {
+    std::ifstream file(filePath, std::ios::binary);
+    if (!file.is_open()) {
+      throw std::runtime_error("Cannot open file: " + filePath);
+    }
+
+    std::stringstream buffer;
+    buffer << file.rdbuf();
+    return buffer.str();
+  }
+
+  // Builds the program stored in `file` for the selected device. On a build
+  // failure the build log is written to stderr and the error is rethrown.
+  cl::Program compileProgram(const std::string &file) {
+    std::string source = readProgram(file);
+    cl::Program program(context, source);
+    try {
+      program.build({device});
+    } catch (const cl::Error &) {
+      std::string build_log =
+          program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
+      std::cerr << "Build log:\n" << build_log << std::endl;
+      throw;
+    }
+    return program;
+  }
+
+  // Compiles every program in programPaths. A failure is reported on stderr
+  // and the program is skipped; getProgram() throws for it later.
+  void loadPrograms() {
+    for (const auto &[programType, filePath] : programPaths) {
+      try {
+        programs[programType] = compileProgram(filePath);
+        std::cout << "Loaded program: " << filePath << std::endl;
+      } catch (const std::exception &e) {
+        std::cerr << "Failed to load program " << filePath << ": " << e.what()
+                  << std::endl;
+      }
+    }
+  }
+
+  // Looks for the first platform that offers a device of the given type.
+  // On success stores that platform and its first matching device in the
+  // output parameters and returns true; otherwise leaves them untouched.
+  // Platforms whose device query throws cl::Error are skipped.
+  static bool findDevice(const std::vector<cl::Platform> &platforms,
+                         cl_device_type type, cl::Platform &foundPlatform,
+                         cl::Device &foundDevice) {
+    for (const auto &platform : platforms) {
+      std::vector<cl::Device> devices;
+      try {
+        platform.getDevices(type, &devices);
+      } catch (const cl::Error &) {
+        continue;
+      }
+      if (!devices.empty()) {
+        foundPlatform = platform;
+        foundDevice = devices[0];
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Selects the device, creates the context and the default queue and prints
+  // a short summary. Throws std::runtime_error if there is no platform or no
+  // GPU/CPU device.
+  void initializeDevice() {
+    std::vector<cl::Platform> platforms;
+    cl::Platform::get(&platforms);
+
+    if (platforms.empty()) {
+      throw std::runtime_error("No OpenCL platforms found");
+    }
+
+    // Remember the platform the device belongs to, so that the summary below
+    // names that platform and not simply the first one in the list.
+    cl::Platform selectedPlatform;
+    const bool deviceFound =
+        findDevice(platforms, CL_DEVICE_TYPE_GPU, selectedPlatform, device) ||
+        findDevice(platforms, CL_DEVICE_TYPE_CPU, selectedPlatform, device);
+
+    if (!deviceFound) {
+      throw std::runtime_error("No suitable OpenCL devices found");
+    }
+
+    context = cl::Context(device);
+    defaultQueue = cl::CommandQueue(context, device);
+
+    std::cout << "Using device: " << device.getInfo<CL_DEVICE_NAME>()
+              << "\nPlatform: " << selectedPlatform.getInfo<CL_PLATFORM_NAME>()
+              << "\nCompute units: "
+              << device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()
+              << "\nGlobal memory: "
+              << device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / (1024 * 1024)
+              << " MB" << std::endl;
+  }
+
+public:
+  // Initializes the device and loads the programs. An OpenCL error is
+  // reported on stderr and rethrown.
+  OpenCL() {
+    try {
+      initializeDevice();
+      loadPrograms();
+    } catch (const cl::Error &e) {
+      std::cerr << "OpenCL error: " << e.what() << " (" << e.err() << ")"
+                << std::endl;
+      throw;
+    }
+  }
+
+  OpenCL(const OpenCL &) = delete;
+  OpenCL &operator=(const OpenCL &) = delete;
+  OpenCL(OpenCL &&) = delete;
+  OpenCL &operator=(OpenCL &&) = delete;
+
+  cl::Device &getDevice() { return device; }
+  cl::Context &getContext() { return context; }
+  cl::CommandQueue &getDefaultQueue() { return defaultQueue; }
+
+  // Returns the compiled program.
+  // Throws std::invalid_argument if it was not loaded successfully.
+  cl::Program &getProgram(Program program) {
+    auto it = programs.find(program);
+    if (it == programs.end()) {
+      throw std::invalid_argument("Program not loaded: " +
+                                  std::to_string(static_cast<int>(program)));
+    }
+    return it->second;
+  }
+
+  // Prints the properties of the selected device to stdout.
+  void printDeviceInfo() const {
+    std::cout << "=== OpenCL Device Info ===" << std::endl;
+    std::cout << "Name: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
+    std::cout << "Vendor: " << device.getInfo<CL_DEVICE_VENDOR>() << std::endl;
+    std::cout << "Version: " << device.getInfo<CL_DEVICE_VERSION>()
+              << std::endl;
+    std::cout << "Compute Units: "
+              << device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << std::endl;
+    std::cout << "Global Memory: "
+              << device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / (1024 * 1024)
+              << " MB" << std::endl;
+    std::cout << "Local Memory: "
+              << device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() / 1024 << " KB"
+              << std::endl;
+    std::cout << "Max Work Group Size: "
+              << device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() << std::endl;
+  }
+
+  // Tells whether the program was compiled successfully.
+  bool hasProgram(Program program) const {
+    return programs.find(program) != programs.end();
+  }
+};
+
+extern OpenCL openCL;
+
+#endif
--- a/src/matrix.hpp
+++ b/src/matrix.hpp
@@ -1,32 +0,0 @@
-#ifndef MATRIX_H
-#define MATRIX_H
-
-#include <stdexcept>
-
-#include "device.hpp"
-
-class Matrix {
-protected:
-  cl_mem buf;
-  size_t rows;
-  size_t cols;
-
-public:
-  Matrix(CalcEngine &calcEngine, cl_mem_flags flags, size_t rows, size_t cols,
-         float *matrix)
-      : rows(rows), cols(cols) {
-    if (rows == 0 || cols == 0) {
-      throw std::invalid_argument("Размеры матрицы должны быть больше 0");
-    }
-    buf = calcEngine.createBuffer(flags, rows * cols * sizeof(float), matrix);
-  }
-
-  ~Matrix() { clReleaseMemObject(buf); }
-
-  size_t getRows() const { return rows; }
-  size_t getCols() const { return cols; }
-
-  const cl_mem getBuf() const { return buf; }
-};
-
-#endif
--- a/src/matrix_mult.cl
+++ b/src/matrix_mult.cl
@@ -1,9 +0,0 @@
-__kernel void matrix_mult(__global float* A, __global float* B, __global float* C, int M, int N, int K) {
-    int i = get_global_id(0);
-    int j = get_global_id(1);
-    float sum = 0.0f;
-    for (int k = 0; k < K; k++) {
-        sum += A[i * K + k] * B[k * N + j];
-    }
-    C[i * N + j] = sum;
-}
--- a/src/opencl.hpp
+++ b/src/opencl.hpp
@@ -1,40 +0,0 @@
-#ifndef OPENCL_H
-#define OPENCL_H
-
-#include <CL/cl.h>
-#include <fstream>
-#include <stdexcept>
-
-class OpenCLException : public std::runtime_error {
-private:
-  cl_int error_code;
-
-public:
-  OpenCLException(cl_int error, const std::string &operation)
-      : std::runtime_error("Error during " + operation + ": " +
-                           std::to_string(error)),
-        error_code(error) {}
-
-  cl_int getErrorCode() const { return error_code; }
-};
-
-class OpenCL {
-public:
-  static void checkError(cl_int error, const std::string &operation) {
-    if (error != CL_SUCCESS) {
-      throw OpenCLException(error, operation);
-    }
-  }
-
-  static std::string readFile(const std::string &filename) {
-    std::ifstream file(filename);
-    if (!file.is_open()) {
-      throw std::runtime_error("Failed to open kernel file: " + filename);
-    }
-
-    return std::string((std::istreambuf_iterator<char>(file)),
-                       std::istreambuf_iterator<char>());
-  }
-};
-
-#endif