From 187492c6b06dcead8a861bdfa4d0efe9fb1e5ed1 Mon Sep 17 00:00:00 2001 From: StepanovPlaton Date: Wed, 29 Oct 2025 14:43:30 +0400 Subject: [PATCH] Complete refactor --- .gitignore | 2 + src/Makefile | 4 +- src/kernels/matrix.cl | 58 +++++++- src/main.cpp | 8 +- src/math/math.hpp | 14 +- src/math/matrix.hpp | 126 ---------------- src/math/matrix/cpu/matrix.cpp | 24 +++ src/math/matrix/cpu/matrix.hpp | 38 +++++ src/math/matrix/cpu/mutable_matrix.cpp | 76 ++++++++++ src/math/matrix/cpu/mutable_matrix.hpp | 28 ++++ src/math/matrix/gpu/matrix.cpp | 21 +++ src/math/matrix/gpu/matrix.hpp | 33 +++++ src/math/matrix/gpu/mutable_matrix.cpp | 113 ++++++++++++++ src/math/matrix/gpu/mutable_matrix.hpp | 40 +++++ src/math/matrix/matrix.hpp | 28 ++++ src/math/matrix/mutable_matrix.hpp | 29 ++++ src/math/mutable_matrix.hpp | 194 ------------------------- src/math/opencl/opencl.cpp | 121 +++++++++++++++ src/math/opencl/opencl.hpp | 144 ++---------------- 19 files changed, 631 insertions(+), 470 deletions(-) create mode 100644 .gitignore delete mode 100644 src/math/matrix.hpp create mode 100644 src/math/matrix/cpu/matrix.cpp create mode 100644 src/math/matrix/cpu/matrix.hpp create mode 100644 src/math/matrix/cpu/mutable_matrix.cpp create mode 100644 src/math/matrix/cpu/mutable_matrix.hpp create mode 100644 src/math/matrix/gpu/matrix.cpp create mode 100644 src/math/matrix/gpu/matrix.hpp create mode 100644 src/math/matrix/gpu/mutable_matrix.cpp create mode 100644 src/math/matrix/gpu/mutable_matrix.hpp create mode 100644 src/math/matrix/matrix.hpp create mode 100644 src/math/matrix/mutable_matrix.hpp delete mode 100644 src/math/mutable_matrix.hpp create mode 100644 src/math/opencl/opencl.cpp diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0f662bc --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.vscode +*.exe diff --git a/src/Makefile b/src/Makefile index c542ded..540c858 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,8 +1,8 @@ CXX = g++ -CXXFLAGS = -Wall 
-O2 -std=c++11 +CXXFLAGS = -Wall -Wextra -O2 -std=c++11 LIBS = -lOpenCL TARGET = main -SRC = main.cpp +SRC = main.cpp ./math/opencl/opencl.cpp ./math/matrix/cpu/matrix.cpp ./math/matrix/cpu/mutable_matrix.cpp ./math/matrix/gpu/matrix.cpp ./math/matrix/gpu/mutable_matrix.cpp INCLUDES = -I"A:/Programs/OpenCL/include" LIB_PATH = -L"A:/Programs/OpenCL/lib" diff --git a/src/kernels/matrix.cl b/src/kernels/matrix.cl index 327a83f..f710fd5 100644 --- a/src/kernels/matrix.cl +++ b/src/kernels/matrix.cl @@ -1,5 +1,52 @@ -__kernel void mult(__global float* A, __global float* B, __global float* C, - int M, int N, int K) { +float activate_x(float x, const int activation_type, const float alpha) { + switch(activation_type) { + case 0: // LINEAR + return x; + case 1: // SIGMOID + return 1.0f / (1.0f + exp(-x)); + case 2: // TANH + return tanh(x); + case 3: // RELU + return fmax(0.0f, x); + case 4: // LEAKY_RELU + return (x > 0.0f) ? x : alpha * x; + case 5: // ELU + return (x > 0.0f) ? x : alpha * (exp(x) - 1.0f); + case 6: // GELU + return 0.5f * x * (1.0f + tanh(sqrt(2.0f / M_PI_F) * (x + 0.044715f * x * x * x))); + default: + return x; + } +} + +__kernel void activate( + __global float* input, + __global float* output, + const int activation_type, + const float alpha, + const int rows, + const int cols) +{ + int row = get_global_id(0); + int col = get_global_id(1); + + if (row < rows && col < cols) { + int idx = row * cols + col; + output[idx] = activate_x(input[idx], activation_type, alpha); + } +} + +__kernel void mult( + __global float* A, + __global float* B, + __global float* C, + const float bias, + const int activation_type, + const float alpha, + const int M, + const int N, + const int K) +{ const int tile_size = 16; int local_i = get_local_id(0); @@ -49,7 +96,11 @@ __kernel void mult(__global float* A, __global float* B, __global float* C, } if (global_i < M && global_j < N) { - C[global_i * N + global_j] = sum; + float result = sum + bias; + if (activation_type != 
0) { + result = activate_x(result, activation_type, alpha); + } + C[global_i * N + global_j] = result; } } @@ -70,3 +121,4 @@ __kernel void add_sc(__global float* A, __global float* B, float scalar, int M, int j = get_global_id(1); B[i * N + j] = A[i * N + j] + scalar; } + diff --git a/src/main.cpp b/src/main.cpp index 414de44..d3e1492 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -59,7 +59,7 @@ int main() { auto op_start = std::chrono::high_resolution_clock::now(); for (int i = 0; i < 10; i++) { - a.mult(b); + a.mult(b, 0.2f, MutableMatrices::CPU::Activate::SIGMOID); } auto op_end = std::chrono::high_resolution_clock::now(); @@ -82,7 +82,7 @@ int main() { std::cout << "Total time: " << total_duration.count() << " ms" << std::endl; std::cout << "First few elements: "; - for (int i = 0; i < 5 && i < v.size(); ++i) { + for (size_t i = 0; i < 5 && i < v.size(); ++i) { std::cout << v[i] << " "; } std::cout << std::endl; @@ -103,7 +103,7 @@ int main() { auto op_start = std::chrono::high_resolution_clock::now(); for (int i = 0; i < 10; i++) { - a.mult(b); + a.mult(b, 0.2f, MutableMatrices::GPU::Activate::SIGMOID, 0.0f); } auto op_end = std::chrono::high_resolution_clock::now(); @@ -126,7 +126,7 @@ int main() { std::cout << "Total time: " << total_duration.count() << " ms" << std::endl; std::cout << "First few elements: "; - for (int i = 0; i < 5 && i < v.size(); ++i) { + for (size_t i = 0; i < 5 && i < v.size(); ++i) { std::cout << v[i] << " "; } std::cout << std::endl; diff --git a/src/math/math.hpp b/src/math/math.hpp index 7b0c4f4..b553fb6 100644 --- a/src/math/math.hpp +++ b/src/math/math.hpp @@ -1,11 +1,9 @@ -#ifndef MATH_H -#define MATH_H +#pragma once -#define __CL_ENABLE_EXCEPTIONS -#include - -#include "matrix.hpp" -#include "mutable_matrix.hpp" #include "opencl/opencl.hpp" -#endif \ No newline at end of file +#include "matrix/cpu/matrix.hpp" +#include "matrix/cpu/mutable_matrix.hpp" + +#include "matrix/gpu/matrix.hpp" +#include "matrix/gpu/mutable_matrix.hpp" 
diff --git a/src/math/matrix.hpp b/src/math/matrix.hpp deleted file mode 100644 index 8959009..0000000 --- a/src/math/matrix.hpp +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef MATRIX_H -#define MATRIX_H - -#include "./opencl/opencl.hpp" -#include -#include -#include -#include - -class IMatrix { -protected: - int rows; - int cols; - - void validateDimensions(int rows, int cols) { - if (rows <= 0 || cols <= 0) { - throw std::invalid_argument("Matrix dimensions must be positive"); - } - } - - void checkIndices(int row, int col) const { - if (row < 0 || row >= rows || col < 0 || col >= cols) { - throw std::out_of_range("Matrix indices out of range"); - } - } - -public: - IMatrix(int rows, int cols) : rows(rows), cols(cols) {} - virtual ~IMatrix() = default; - virtual int getRows() const = 0; - virtual int getCols() const = 0; - virtual const std::vector toVector() const = 0; -}; - -namespace Matrices { -class CPU; - -class GPU : public IMatrix { -protected: - cl::Buffer *buffer; - cl::CommandQueue queue; - -public: - GPU(int rows, int cols, const std::vector &matrix) - : IMatrix(rows, cols), queue(openCL.getContext(), openCL.getDevice()) { - validateDimensions(rows, cols); - if (matrix.size() != static_cast(rows * cols)) { - throw std::invalid_argument("Matrix data size doesn't match dimensions"); - } - - buffer = new cl::Buffer( - openCL.getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, - rows * cols * sizeof(float), const_cast(matrix.data())); - } - ~GPU() { delete buffer; } - - GPU(const GPU &) = delete; - GPU &operator=(const GPU &) = delete; - GPU(GPU &&other) = default; - GPU &operator=(GPU &&other) = default; - - int getRows() const override { return rows; } - int getCols() const override { return cols; } - size_t getSize() const { return rows * cols; } - - const cl::Buffer *getBuffer() const { return buffer; } - - const std::vector toVector() const { - std::vector result(rows * cols); - queue.enqueueReadBuffer(*buffer, CL_TRUE, 0, rows * cols * sizeof(float), - 
result.data()); - queue.finish(); - return result; - } - - CPU toCPU() const; -}; - -class CPU : public IMatrix { -protected: - std::vector data; - -public: - CPU(int rows, int cols, float value = 0.0f) - : IMatrix(rows, cols), data(rows * cols, value) { - validateDimensions(rows, cols); - } - - CPU(int rows, int cols, const std::vector &matrix) - : IMatrix(rows, cols), data(matrix) { - validateDimensions(rows, cols); - if (matrix.size() != static_cast(rows * cols)) { - throw std::invalid_argument("Data size doesn't match matrix dimensions"); - } - } - - CPU(const CPU &) = default; - CPU &operator=(const CPU &) = default; - CPU(CPU &&) = default; - CPU &operator=(CPU &&) = default; - ~CPU() override = default; - - float &operator()(int row, int col) { - checkIndices(row, col); - return data[row * cols + col]; - } - - const float &operator()(int row, int col) const { - checkIndices(row, col); - return data[row * cols + col]; - } - - const std::vector toVector() const { return data; } - - int getRows() const override { return rows; } - int getCols() const override { return cols; } - size_t getSize() const { return data.size(); } - - GPU toGPU(OpenCL &openCL) const { return GPU(rows, cols, data); } -}; - -CPU GPU::toCPU() const { return CPU(rows, cols, toVector()); } - -} // namespace Matrices - -#endif \ No newline at end of file diff --git a/src/math/matrix/cpu/matrix.cpp b/src/math/matrix/cpu/matrix.cpp new file mode 100644 index 0000000..ab07f5b --- /dev/null +++ b/src/math/matrix/cpu/matrix.cpp @@ -0,0 +1,24 @@ +#include "matrix.hpp" + +Matrices::CPU::CPU(int rows, int cols, float value) + : IMatrix(rows, cols), data(rows * cols, value) { + validateDimensions(rows, cols); +} + +Matrices::CPU::CPU(int rows, int cols, const std::vector &matrix) + : IMatrix(rows, cols), data(matrix) { + validateDimensions(rows, cols); + if (matrix.size() != static_cast(rows * cols)) { + throw std::invalid_argument("Data size doesn't match matrix dimensions"); + } +} + +float 
&Matrices::CPU::operator()(int row, int col) { + checkIndices(row, col); + return data[row * cols + col]; +} + +const float &Matrices::CPU::operator()(int row, int col) const { + checkIndices(row, col); + return data[row * cols + col]; +} \ No newline at end of file diff --git a/src/math/matrix/cpu/matrix.hpp b/src/math/matrix/cpu/matrix.hpp new file mode 100644 index 0000000..c8d6842 --- /dev/null +++ b/src/math/matrix/cpu/matrix.hpp @@ -0,0 +1,38 @@ +#pragma once + +#include +#include +#include +#include + +#include "../matrix.hpp" + +namespace Matrices { + +class CPU : public IMatrix { +protected: + std::vector data; + +public: + CPU(int rows, int cols, float value = 0.0f); + CPU(int rows, int cols, const std::vector &matrix); + + CPU(const CPU &) = default; + CPU &operator=(const CPU &) = default; + CPU(CPU &&) = default; + CPU &operator=(CPU &&) = default; + ~CPU() override = default; + + float &operator()(int row, int col); + const float &operator()(int row, int col) const; + + const std::vector toVector() const { return data; } + + int getRows() const override { return rows; } + int getCols() const override { return cols; } + size_t getSize() const { return data.size(); } + + // GPU toGPU(OpenCL &openCL) const { return GPU(rows, cols, data); } +}; + +} // namespace Matrices diff --git a/src/math/matrix/cpu/mutable_matrix.cpp b/src/math/matrix/cpu/mutable_matrix.cpp new file mode 100644 index 0000000..5ea45e1 --- /dev/null +++ b/src/math/matrix/cpu/mutable_matrix.cpp @@ -0,0 +1,76 @@ +#include "mutable_matrix.hpp" + +float MutableMatrices::CPU::activate_x(float x, Activate type, float alpha) { + switch (type) { + case Activate::LINEAR: + return x; + case Activate::SIGMOID: + return 1.0f / (1.0f + std::exp(-x)); + case Activate::TANH: + return std::tanh(x); + case Activate::RELU: + return std::max(0.0f, x); + case Activate::LEAKY_RELU: + return (x > 0.0f) ? x : alpha * x; + case Activate::ELU: + return (x > 0.0f) ? 
x : alpha * (std::exp(x) - 1.0f); + case Activate::GELU: + return 0.5f * x * + (1.0f + + std::tanh(std::sqrt(2.0f / M_PI) * (x + 0.044715f * x * x * x))); + default: + throw std::invalid_argument("Unknown activation type"); + } +} +void MutableMatrices::CPU::mult(Matrices::CPU &m, float bias, Activate type, + float alpha) { + validateMultDimensions(*this, m); + + std::vector result(rows * m.getCols(), 0.0f); + for (int i = 0; i < rows; i++) { + for (int j = 0; j < m.getCols(); j++) { + float sum = 0.0f; + for (int k = 0; k < cols; k++) { + sum += (*this)(i, k) * m(k, j); + } + result[i * m.getCols() + j] = activate_x(sum + bias, type, alpha); + } + } + data = std::move(result); + cols = m.getCols(); +} + +void MutableMatrices::CPU::mult(float scalar) { + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + data[i * cols + j] *= scalar; + } + } +} + +void MutableMatrices::CPU::add(Matrices::CPU &m, float a, float b) { + validateSameDimensions(*this, m); + + std::vector result(rows * cols, 0.0f); + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + result[i * cols + j] = ((*this)(i, j) * a) + (m(i, j) * b); + } + } + data = std::move(result); +} + +void MutableMatrices::CPU::add(float scalar) { + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + data[i * cols + j] += scalar; + } + } +} +void MutableMatrices::CPU::activate(Activate type, float alpha) { + for (int i = 0; i < rows; i++) { + for (int j = 0; j < cols; j++) { + data[i * cols + j] = activate_x(data[i * cols + j], type, alpha); + } + } +} \ No newline at end of file diff --git a/src/math/matrix/cpu/mutable_matrix.hpp b/src/math/matrix/cpu/mutable_matrix.hpp new file mode 100644 index 0000000..5110413 --- /dev/null +++ b/src/math/matrix/cpu/mutable_matrix.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include "matrix.hpp" + +#include "../mutable_matrix.hpp" + +#include + +#define M_PI 3.14159265358979323846 + +namespace MutableMatrices { + +class CPU : public 
Matrices::CPU, public IMutableMatrix { +private: + static float activate_x(float x, Activate type, float alpha = 0.01f); + +public: + CPU(int rows, int cols, const std::vector &matrix) + : Matrices::CPU(rows, cols, matrix) {} + + void mult(Matrices::CPU &m, float bias = 0.0f, + Activate type = Activate::LINEAR, float alpha = 0.01f); + void mult(float scalar); + void add(Matrices::CPU &m, float a = 1.0f, float b = 1.0f); + void add(float scalar); + void activate(Activate type, float alpha = 0.01f); +}; +}; // namespace MutableMatrices diff --git a/src/math/matrix/gpu/matrix.cpp b/src/math/matrix/gpu/matrix.cpp new file mode 100644 index 0000000..3b46ac3 --- /dev/null +++ b/src/math/matrix/gpu/matrix.cpp @@ -0,0 +1,21 @@ +#include "matrix.hpp" + +Matrices::GPU::GPU(int rows, int cols, const std::vector &matrix) + : IMatrix(rows, cols), queue(openCL.getContext(), openCL.getDevice()) { + validateDimensions(rows, cols); + if (matrix.size() != static_cast(rows * cols)) { + throw std::invalid_argument("Matrix data size doesn't match dimensions"); + } + + buffer = new cl::Buffer( + openCL.getContext(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + rows * cols * sizeof(float), const_cast(matrix.data())); +} + +const std::vector Matrices::GPU::toVector() const { + std::vector result(rows * cols); + queue.enqueueReadBuffer(*buffer, CL_TRUE, 0, rows * cols * sizeof(float), + result.data()); + queue.finish(); + return result; +} \ No newline at end of file diff --git a/src/math/matrix/gpu/matrix.hpp b/src/math/matrix/gpu/matrix.hpp new file mode 100644 index 0000000..c6b7fae --- /dev/null +++ b/src/math/matrix/gpu/matrix.hpp @@ -0,0 +1,33 @@ +#pragma once + +#include "../../opencl/opencl.hpp" + +#include "../matrix.hpp" + +namespace Matrices { +class GPU : public IMatrix { +protected: + cl::Buffer *buffer; + cl::CommandQueue queue; + +public: + GPU(int rows, int cols, const std::vector &matrix); + ~GPU() { delete buffer; } + + GPU(const GPU &) = delete; + GPU &operator=(const GPU 
&) = delete; + GPU(GPU &&other) = default; + GPU &operator=(GPU &&other) = default; + + int getRows() const override { return rows; } + int getCols() const override { return cols; } + size_t getSize() const { return rows * cols; } + + const cl::Buffer *getBuffer() const { return buffer; } + + const std::vector toVector() const; + + // CPU toCPU() const { return CPU(rows, cols, toVector()); }; +}; + +} // namespace Matrices diff --git a/src/math/matrix/gpu/mutable_matrix.cpp b/src/math/matrix/gpu/mutable_matrix.cpp new file mode 100644 index 0000000..1e09e0a --- /dev/null +++ b/src/math/matrix/gpu/mutable_matrix.cpp @@ -0,0 +1,113 @@ +#include "mutable_matrix.hpp" + +MutableMatrices::GPU::GPU(int rows, int cols, const std::vector &matrix) + : Matrices::GPU(rows, cols, matrix) { + for (const auto &entry : kernelsNames) { + kernels[entry.first] = + cl::Kernel(openCL.getProgram(OpenCL::Program::MATRIX), entry.second); + } +} + +void MutableMatrices::GPU::mult(Matrices::GPU &m, float bias, Activate type, + float alpha) { + validateMultDimensions(*this, m); + + cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE, + rows * m.getCols() * sizeof(float)); + + const int tile_size = 16; + cl::NDRange local_size(tile_size, tile_size); + cl::NDRange global_size(((rows + tile_size - 1) / tile_size) * tile_size, + ((m.getCols() + tile_size - 1) / tile_size) * + tile_size); + + kernels[Method::MULT].setArg(0, *buffer); + kernels[Method::MULT].setArg(1, *m.getBuffer()); + kernels[Method::MULT].setArg(2, *b); + kernels[Method::MULT].setArg(3, bias); + kernels[Method::MULT].setArg(4, static_cast(type)); + kernels[Method::MULT].setArg(5, alpha); + kernels[Method::MULT].setArg(6, rows); + kernels[Method::MULT].setArg(7, m.getCols()); + kernels[Method::MULT].setArg(8, cols); + cl::Event event; + queue.enqueueNDRangeKernel(kernels[Method::MULT], cl::NullRange, global_size, + local_size, nullptr, &event); + + event.setCallback(CL_COMPLETE, releaseBuffer, buffer); + buffer 
= b; + cols = m.getCols(); +} + +void MutableMatrices::GPU::mult(float scalar) { + cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE, + rows * cols * sizeof(float)); + kernels[Method::SCALAR_MULT].setArg(0, *buffer); + kernels[Method::SCALAR_MULT].setArg(1, *b); + kernels[Method::SCALAR_MULT].setArg(2, scalar); + kernels[Method::SCALAR_MULT].setArg(3, rows); + kernels[Method::SCALAR_MULT].setArg(4, cols); + cl::Event event; + queue.enqueueNDRangeKernel(kernels[Method::SCALAR_MULT], cl::NullRange, + cl::NDRange(rows, cols), cl::NullRange, nullptr, + &event); + + event.setCallback(CL_COMPLETE, releaseBuffer, buffer); + buffer = b; +} + +void MutableMatrices::GPU::add(Matrices::GPU &m, float a, float b) { + validateSameDimensions(*this, m); + + cl::Buffer *buf = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE, + rows * cols * sizeof(float)); + kernels[Method::ADD].setArg(0, *buffer); + kernels[Method::ADD].setArg(1, *m.getBuffer()); + kernels[Method::ADD].setArg(2, *buf); + kernels[Method::ADD].setArg(3, a); + kernels[Method::ADD].setArg(4, b); + kernels[Method::ADD].setArg(5, rows); + kernels[Method::ADD].setArg(6, cols); + cl::Event event; + queue.enqueueNDRangeKernel(kernels[Method::ADD], cl::NullRange, + cl::NDRange(rows, cols), cl::NullRange, nullptr, + &event); + + event.setCallback(CL_COMPLETE, releaseBuffer, buffer); + buffer = buf; +} + +void MutableMatrices::GPU::add(float scalar) { + cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE, + rows * cols * sizeof(float)); + kernels[Method::SCALAR_ADD].setArg(0, *buffer); + kernels[Method::SCALAR_ADD].setArg(1, *b); + kernels[Method::SCALAR_ADD].setArg(2, scalar); + kernels[Method::SCALAR_ADD].setArg(3, rows); + kernels[Method::SCALAR_ADD].setArg(4, cols); + cl::Event event; + queue.enqueueNDRangeKernel(kernels[Method::SCALAR_ADD], cl::NullRange, + cl::NDRange(rows, cols), cl::NullRange, nullptr, + &event); + + event.setCallback(CL_COMPLETE, releaseBuffer, buffer); + 
buffer = b; +} + +void MutableMatrices::GPU::activate(Activate type, float alpha) { + cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE, + rows * cols * sizeof(float)); + kernels[Method::ACTIVATE].setArg(0, *buffer); + kernels[Method::ACTIVATE].setArg(1, *b); + kernels[Method::ACTIVATE].setArg(2, static_cast(type)); + kernels[Method::ACTIVATE].setArg(3, alpha); + kernels[Method::ACTIVATE].setArg(4, rows); + kernels[Method::ACTIVATE].setArg(5, cols); + cl::Event event; + queue.enqueueNDRangeKernel(kernels[Method::ACTIVATE], cl::NullRange, + cl::NDRange(rows, cols), cl::NullRange, nullptr, + &event); + + event.setCallback(CL_COMPLETE, releaseBuffer, buffer); + buffer = b; +} \ No newline at end of file diff --git a/src/math/matrix/gpu/mutable_matrix.hpp b/src/math/matrix/gpu/mutable_matrix.hpp new file mode 100644 index 0000000..02df50f --- /dev/null +++ b/src/math/matrix/gpu/mutable_matrix.hpp @@ -0,0 +1,40 @@ +#pragma once + +#include "../../opencl/opencl.hpp" + +#include "matrix.hpp" + +#include "../mutable_matrix.hpp" + +namespace MutableMatrices { + +class GPU : public Matrices::GPU, public IMutableMatrix { +private: + enum class Method { MULT, SCALAR_MULT, ADD, SCALAR_ADD, ACTIVATE }; + std::unordered_map kernels; + std::unordered_map kernelsNames = { + {Method::MULT, "mult"}, + {Method::SCALAR_MULT, "mult_sc"}, + {Method::ADD, "add"}, + {Method::SCALAR_ADD, "add_sc"}, + {Method::ACTIVATE, "activate"}}; + + static void CL_CALLBACK releaseBuffer(cl_event, cl_int status, void *buf) { + if (status == CL_COMPLETE) { + // std::cout << "Kernel complete!" 
<< std::endl; + delete (cl::Buffer *)buf; + } + } + +public: + GPU(int rows, int cols, const std::vector &matrix); + + void mult(Matrices::GPU &m, float bias = 0.0f, + Activate type = Activate::LINEAR, float alpha = 0.01f); + void mult(float scalar); + void add(Matrices::GPU &m, float a = 1.0f, float b = 1.0f); + void add(float scalar); + void activate(Activate type, float alpha = 0.01f); +}; + +}; // namespace MutableMatrices diff --git a/src/math/matrix/matrix.hpp b/src/math/matrix/matrix.hpp new file mode 100644 index 0000000..fa8748b --- /dev/null +++ b/src/math/matrix/matrix.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include +#include + +class IMatrix { +protected: + int rows; + int cols; + + void validateDimensions(int rows, int cols) const { + if (rows <= 0 || cols <= 0) { + throw std::invalid_argument("Matrix dimensions must be positive"); + } + }; + void checkIndices(int row, int col) const { + if (row < 0 || row >= rows || col < 0 || col >= cols) { + throw std::out_of_range("Matrix indices out of range"); + } + }; + +public: + IMatrix(int rows, int cols) : rows(rows), cols(cols) {} + virtual ~IMatrix() = default; + virtual int getRows() const = 0; + virtual int getCols() const = 0; + virtual const std::vector toVector() const = 0; +}; diff --git a/src/math/matrix/mutable_matrix.hpp b/src/math/matrix/mutable_matrix.hpp new file mode 100644 index 0000000..5f84236 --- /dev/null +++ b/src/math/matrix/mutable_matrix.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include "matrix.hpp" + +template class IMutableMatrix { + static_assert(std::is_base_of::value, + "T must be derived from IMatrix"); + +public: + enum class Activate { LINEAR, SIGMOID, TANH, RELU, LEAKY_RELU, ELU, GELU }; + + virtual void mult(T &m, float bias, Activate type, float alpha) = 0; + virtual void mult(float s) = 0; + virtual void add(T &m, float a, float b) = 0; + virtual void add(float a) = 0; + virtual void activate(Activate type, float alpha = 0.01f) = 0; + + void validateMultDimensions(T &a, T &b) 
const { + if (a.getRows() != b.getCols()) { + throw std::invalid_argument( + "Invalid matrix dimensions for multiplication"); + } + }; + void validateSameDimensions(T &a, T &b) const { + if (a.getRows() != b.getRows() || a.getCols() != b.getCols()) { + throw std::invalid_argument("Invalid matrix dimensions for addition"); + } + }; +}; diff --git a/src/math/mutable_matrix.hpp b/src/math/mutable_matrix.hpp deleted file mode 100644 index 812eade..0000000 --- a/src/math/mutable_matrix.hpp +++ /dev/null @@ -1,194 +0,0 @@ -#ifndef MUTABLE_MATRIX_H -#define MUTABLE_MATRIX_H - -#include "./opencl/opencl.hpp" - -#include "matrix.hpp" - -template class IMutableMatrix { - static_assert(std::is_base_of::value, - "T must be derived from IMatrix"); - -public: - virtual void mult(T &m) = 0; - virtual void mult(float s) = 0; - virtual void add(T &m, float a, float b) = 0; - virtual void add(float a) = 0; - - void validateMultDimensions(T &a, T &b) { - if (a.getRows() != b.getCols()) { - throw std::invalid_argument( - "Invalid matrix dimensions for multiplication"); - } - } - void validateSameDimensions(T &a, T &b) { - if (a.getRows() != b.getRows() || a.getCols() != b.getCols()) { - throw std::invalid_argument("Invalid matrix dimensions for addition"); - } - } -}; - -namespace MutableMatrices { -class GPU : public Matrices::GPU, public IMutableMatrix { -private: - enum class Method { MULT, SCALAR_MULT, ADD, SCALAR_ADD }; - std::unordered_map kernels; - std::unordered_map kernelsNames = { - {Method::MULT, "mult"}, - {Method::SCALAR_MULT, "mult_sc"}, - {Method::ADD, "add"}, - {Method::SCALAR_ADD, "add_sc"}}; - - static void CL_CALLBACK releaseBuffer(cl_event event, cl_int status, - void *buf) { - if (status == CL_COMPLETE) { - // std::cout << "Kernel complete!" 
<< std::endl; - delete buf; - } - } - -public: - GPU(int rows, int cols, const std::vector &matrix) - : Matrices::GPU(rows, cols, matrix) { - for (const auto &[method, kernelName] : kernelsNames) { - kernels[method] = - cl::Kernel(openCL.getProgram(OpenCL::Program::MATRIX), kernelName); - } - } - - void mult(Matrices::GPU &m) { - validateMultDimensions(*this, m); - - cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE, - rows * m.getCols() * sizeof(float)); - - const int tile_size = 16; - cl::NDRange local_size(tile_size, tile_size); - cl::NDRange global_size(((rows + tile_size - 1) / tile_size) * tile_size, - ((m.getCols() + tile_size - 1) / tile_size) * - tile_size); - - kernels[Method::MULT].setArg(0, *buffer); - kernels[Method::MULT].setArg(1, *m.getBuffer()); - kernels[Method::MULT].setArg(2, *b); - kernels[Method::MULT].setArg(3, rows); - kernels[Method::MULT].setArg(4, m.getCols()); - kernels[Method::MULT].setArg(5, cols); - cl::Event event; - queue.enqueueNDRangeKernel(kernels[Method::MULT], cl::NullRange, - global_size, local_size, nullptr, &event); - - event.setCallback(CL_COMPLETE, releaseBuffer, buffer); - buffer = b; - cols = m.getCols(); - } - - void mult(float scalar) { - cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE, - rows * cols * sizeof(float)); - kernels[Method::SCALAR_MULT].setArg(0, *buffer); - kernels[Method::SCALAR_MULT].setArg(1, *b); - kernels[Method::SCALAR_MULT].setArg(2, scalar); - kernels[Method::SCALAR_MULT].setArg(3, rows); - kernels[Method::SCALAR_MULT].setArg(4, cols); - cl::Event event; - queue.enqueueNDRangeKernel(kernels[Method::SCALAR_MULT], cl::NullRange, - cl::NDRange(rows, cols), cl::NullRange, nullptr, - &event); - - event.setCallback(CL_COMPLETE, releaseBuffer, buffer); - buffer = b; - } - - void add(Matrices::GPU &m, float a = 1.0f, float b = 1.0f) { - validateSameDimensions(*this, m); - - cl::Buffer *buf = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE, - rows * cols * 
sizeof(float)); - kernels[Method::ADD].setArg(0, *buffer); - kernels[Method::ADD].setArg(1, *m.getBuffer()); - kernels[Method::ADD].setArg(2, *buf); - kernels[Method::ADD].setArg(3, a); - kernels[Method::ADD].setArg(4, b); - kernels[Method::ADD].setArg(5, rows); - kernels[Method::ADD].setArg(6, cols); - cl::Event event; - queue.enqueueNDRangeKernel(kernels[Method::ADD], cl::NullRange, - cl::NDRange(rows, cols), cl::NullRange, nullptr, - &event); - - event.setCallback(CL_COMPLETE, releaseBuffer, buffer); - buffer = buf; - } - - void add(float scalar) { - cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE, - rows * cols * sizeof(float)); - kernels[Method::SCALAR_ADD].setArg(0, *buffer); - kernels[Method::SCALAR_ADD].setArg(1, *b); - kernels[Method::SCALAR_ADD].setArg(2, scalar); - kernels[Method::SCALAR_ADD].setArg(3, rows); - kernels[Method::SCALAR_ADD].setArg(4, cols); - cl::Event event; - queue.enqueueNDRangeKernel(kernels[Method::SCALAR_ADD], cl::NullRange, - cl::NDRange(rows, cols), cl::NullRange, nullptr, - &event); - - event.setCallback(CL_COMPLETE, releaseBuffer, buffer); - buffer = b; - } -}; -class CPU : public Matrices::CPU, public IMutableMatrix { - -public: - CPU(int rows, int cols, const std::vector &matrix) - : Matrices::CPU(rows, cols, matrix) {} - - void mult(Matrices::CPU &m) { - validateMultDimensions(*this, m); - - std::vector result(rows * m.getCols(), 0.0f); - for (int i = 0; i < rows; i++) { - for (int j = 0; j < m.getCols(); j++) { - float sum = 0.0f; - for (int k = 0; k < cols; k++) { - sum += (*this)(i, k) * m(k, j); - } - result[i * m.getCols() + j] = sum; - } - } - data = std::move(result); - cols = m.getCols(); - } - - void mult(float scalar) { - for (int i = 0; i < rows; i++) { - for (int j = 0; j < cols; j++) { - data[i * cols + j] *= scalar; - } - } - } - - void add(Matrices::CPU &m, float a = 1.0f, float b = 1.0f) { - validateSameDimensions(*this, m); - - std::vector result(rows * cols, 0.0f); - for (int i = 0; i < 
rows; i++) { - for (int j = 0; j < cols; j++) { - result[i * cols + j] = ((*this)(i, j) * a) + (m(i, j) * b); - } - } - data = std::move(result); - } - - void add(float scalar) { - for (int i = 0; i < rows; i++) { - for (int j = 0; j < cols; j++) { - data[i * cols + j] += scalar; - } - } - } -}; -}; // namespace MutableMatrices - -#endif \ No newline at end of file diff --git a/src/math/opencl/opencl.cpp b/src/math/opencl/opencl.cpp new file mode 100644 index 0000000..853029d --- /dev/null +++ b/src/math/opencl/opencl.cpp @@ -0,0 +1,121 @@ +#include "opencl.hpp" + +std::string OpenCL::readProgram(const std::string &filePath) { + std::ifstream file(filePath, std::ios::binary); + if (!file.is_open()) { + throw std::runtime_error("Cannot open file: " + filePath); + } + + std::stringstream buffer; + buffer << file.rdbuf(); + return buffer.str(); +} +cl::Program OpenCL::compileProgram(const std::string &file) { + std::string source = readProgram(file); + cl::Program program(context, source); + try { + program.build({device}); + } catch (cl::Error &e) { + std::string build_log = program.getBuildInfo(device); + std::cerr << "Build log:\n" << build_log << std::endl; + throw; + } + return program; +} +void OpenCL::loadPrograms() { + for (const auto &entry : programPaths) { + programs[entry.first] = compileProgram(entry.second); + std::cout << "Loaded program: " << entry.second << std::endl; + } +} + +void OpenCL::initializeDevice() { + std::vector platforms; + cl::Platform::get(&platforms); + + if (platforms.empty()) { + throw std::runtime_error("No OpenCL platforms found"); + } + + std::vector devices; + bool deviceFound = false; + + for (const auto &platform : platforms) { + try { + platform.getDevices(CL_DEVICE_TYPE_GPU, &devices); + if (!devices.empty()) { + deviceFound = true; + break; + } + } catch (const cl::Error &) { + continue; + } + } + + if (!deviceFound) { + for (const auto &platform : platforms) { + try { + platform.getDevices(CL_DEVICE_TYPE_CPU, &devices); + 
if (!devices.empty()) { + deviceFound = true; + break; + } + } catch (const cl::Error &) { + continue; + } + } + } + + if (!deviceFound) { + throw std::runtime_error("No suitable OpenCL devices found"); + } + + device = devices[0]; + context = cl::Context(device); + defaultQueue = cl::CommandQueue(context, device); + + std::cout << "Using device: " << device.getInfo() + << "\nPlatform: " << platforms[0].getInfo() + << "\nCompute units: " + << device.getInfo() + << "\nGlobal memory: " + << device.getInfo() / (1024 * 1024) + << " MB" << std::endl; +} + +OpenCL::OpenCL() { + try { + initializeDevice(); + loadPrograms(); + } catch (const cl::Error &e) { + std::cerr << "OpenCL error: " << e.what() << " (" << e.err() << ")" + << std::endl; + throw; + } +} + +cl::Program &OpenCL::getProgram(Program program) { + auto it = programs.find(program); + if (it == programs.end()) { + throw std::invalid_argument("Program not loaded: " + + std::to_string(static_cast(program))); + } + return it->second; +} + +void OpenCL::printDeviceInfo() const { + std::cout << "=== OpenCL Device Info ===" << std::endl; + std::cout << "Name: " << device.getInfo() << std::endl; + std::cout << "Vendor: " << device.getInfo() << std::endl; + std::cout << "Version: " << device.getInfo() << std::endl; + std::cout << "Compute Units: " + << device.getInfo() << std::endl; + std::cout << "Global Memory: " + << device.getInfo() / (1024 * 1024) + << " MB" << std::endl; + std::cout << "Local Memory: " + << device.getInfo() / 1024 << " KB" + << std::endl; + std::cout << "Max Work Group Size: " + << device.getInfo() << std::endl; +} \ No newline at end of file diff --git a/src/math/opencl/opencl.hpp b/src/math/opencl/opencl.hpp index 7a314c2..8b32101 100644 --- a/src/math/opencl/opencl.hpp +++ b/src/math/opencl/opencl.hpp @@ -1,7 +1,9 @@ -#ifndef OPENCL_H -#define OPENCL_H +#pragma once +#define CL_HPP_ENABLE_EXCEPTIONS +#define CL_HPP_TARGET_OPENCL_VERSION 300 #include + #include #include #include @@ -22,108 
+24,14 @@ private: std::unordered_map<Program, std::string> programPaths = { {Program::MATRIX, "./kernels/matrix.cl"}}; - std::string readProgram(const std::string &filePath) { - std::ifstream file(filePath, std::ios::binary); - if (!file.is_open()) { - throw std::runtime_error("Cannot open file: " + filePath); - } + std::string readProgram(const std::string &filePath); + cl::Program compileProgram(const std::string &file); + void loadPrograms(); - std::stringstream buffer; - buffer << file.rdbuf(); - return buffer.str(); - } - - cl::Program compileProgram(const std::string &file) { - std::string source = readProgram(file); - cl::Program program(context, source); - try { - program.build({device}); - } catch (cl::Error &e) { - std::string build_log = - program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device); - std::cerr << "Build log:\n" << build_log << std::endl; - throw; - } - return program; - } - - void loadPrograms() { - for (const auto &[programType, filePath] : programPaths) { - try { - programs[programType] = compileProgram(filePath); - std::cout << "Loaded program: " << filePath << std::endl; - } catch (const std::exception &e) { - std::cerr << "Failed to load program " << filePath << ": " << e.what() - << std::endl; - } - } - } - - void initializeDevice() { - std::vector<cl::Platform> platforms; - cl::Platform::get(&platforms); - - if (platforms.empty()) { - throw std::runtime_error("No OpenCL platforms found"); - } - - std::vector<cl::Device> devices; - bool deviceFound = false; - - for (const auto &platform : platforms) { - try { - platform.getDevices(CL_DEVICE_TYPE_GPU, &devices); - if (!devices.empty()) { - deviceFound = true; - break; - } - } catch (const cl::Error &) { - continue; - } - } - - if (!deviceFound) { - for (const auto &platform : platforms) { - try { - platform.getDevices(CL_DEVICE_TYPE_CPU, &devices); - if (!devices.empty()) { - deviceFound = true; - break; - } - } catch (const cl::Error &) { - continue; - } - } - } - - if (!deviceFound) { - throw std::runtime_error("No suitable OpenCL devices found"); - } - - 
device = devices[0]; - context = cl::Context(device); - defaultQueue = cl::CommandQueue(context, device); - - std::cout << "Using device: " << device.getInfo<CL_DEVICE_NAME>() - << "\nPlatform: " << platforms[0].getInfo<CL_PLATFORM_NAME>() - << "\nCompute units: " - << device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() - << "\nGlobal memory: " - << device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / (1024 * 1024) - << " MB" << std::endl; - } + void initializeDevice(); public: - OpenCL() { - try { - initializeDevice(); - loadPrograms(); - } catch (const cl::Error &e) { - std::cerr << "OpenCL error: " << e.what() << " (" << e.err() << ")" - << std::endl; - throw; - } - } + OpenCL(); OpenCL(const OpenCL &) = delete; OpenCL &operator=(const OpenCL &) = delete; @@ -134,38 +42,8 @@ public: cl::Context &getContext() { return context; } cl::CommandQueue &getDefaultQueue() { return defaultQueue; } - cl::Program &getProgram(Program program) { - auto it = programs.find(program); - if (it == programs.end()) { - throw std::invalid_argument("Program not loaded: " + - std::to_string(static_cast<int>(program))); - } - return it->second; - } - - void printDeviceInfo() const { - std::cout << "=== OpenCL Device Info ===" << std::endl; - std::cout << "Name: " << device.getInfo<CL_DEVICE_NAME>() << std::endl; - std::cout << "Vendor: " << device.getInfo<CL_DEVICE_VENDOR>() << std::endl; - std::cout << "Version: " << device.getInfo<CL_DEVICE_VERSION>() - << std::endl; - std::cout << "Compute Units: " - << device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << std::endl; - std::cout << "Global Memory: " - << device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / (1024 * 1024) - << " MB" << std::endl; - std::cout << "Local Memory: " - << device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() / 1024 << " KB" - << std::endl; - std::cout << "Max Work Group Size: " - << device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() << std::endl; - } - - bool hasProgram(Program program) const { - return programs.find(program) != programs.end(); - } + cl::Program &getProgram(Program program); + void printDeviceInfo() const; }; extern OpenCL openCL; - -#endif \ No newline at end of file