From f1dfe1b33571d20fea08e33531d333c8d4631d29 Mon Sep 17 00:00:00 2001 From: StepanovPlaton Date: Sat, 1 Nov 2025 10:30:32 +0400 Subject: [PATCH] Complete tensors math --- src/Makefile | 2 +- src/benchmark.cpp | 52 ++--- src/kernels/matrix.cl | 239 +++++++++++--------- src/main.cpp | 74 ++----- src/math/math.hpp | 6 +- src/math/matrix/cpu/matrix.cpp | 24 -- src/math/matrix/cpu/matrix.hpp | 38 ---- src/math/matrix/cpu/mutable_matrix.cpp | 76 ------- src/math/matrix/cpu/mutable_matrix.hpp | 28 --- src/math/matrix/gpu/matrix.cpp | 41 ---- src/math/matrix/gpu/matrix.hpp | 40 ---- src/math/matrix/gpu/mutable_matrix.cpp | 120 ---------- src/math/matrix/gpu/mutable_matrix.hpp | 46 ---- src/math/matrix/matrix.hpp | 29 --- src/math/matrix/mutable_matrix.hpp | 29 --- src/math/opencl/opencl.hpp | 2 +- src/math/tensor/cpu/math.cpp | 1 + src/math/tensor/cpu/math.hpp | 103 +++++++++ src/math/tensor/cpu/tensor.cpp | 1 + src/math/tensor/cpu/tensor.hpp | 296 +++++++++++++++++++++++++ src/math/tensor/gpu/math.cpp | 1 + src/math/tensor/gpu/math.hpp | 164 ++++++++++++++ src/math/tensor/gpu/tensor.cpp | 1 + src/math/tensor/gpu/tensor.hpp | 282 +++++++++++++++++++++++ src/math/tensor/math.hpp | 58 +++++ src/math/tensor/tensor.hpp | 67 ++++++ 26 files changed, 1147 insertions(+), 673 deletions(-) delete mode 100644 src/math/matrix/cpu/matrix.cpp delete mode 100644 src/math/matrix/cpu/matrix.hpp delete mode 100644 src/math/matrix/cpu/mutable_matrix.cpp delete mode 100644 src/math/matrix/cpu/mutable_matrix.hpp delete mode 100644 src/math/matrix/gpu/matrix.cpp delete mode 100644 src/math/matrix/gpu/matrix.hpp delete mode 100644 src/math/matrix/gpu/mutable_matrix.cpp delete mode 100644 src/math/matrix/gpu/mutable_matrix.hpp delete mode 100644 src/math/matrix/matrix.hpp delete mode 100644 src/math/matrix/mutable_matrix.hpp create mode 100644 src/math/tensor/cpu/math.cpp create mode 100644 src/math/tensor/cpu/math.hpp create mode 100644 src/math/tensor/cpu/tensor.cpp create mode 100644 src/math/tensor/cpu/tensor.hpp create mode 100644 src/math/tensor/gpu/math.cpp create mode 100644 src/math/tensor/gpu/math.hpp create mode 100644 src/math/tensor/gpu/tensor.cpp create mode 100644 src/math/tensor/gpu/tensor.hpp create mode 100644 src/math/tensor/math.hpp create mode 100644 src/math/tensor/tensor.hpp diff --git a/src/Makefile b/src/Makefile index 8d6b1b3..dd89464 100644 --- a/src/Makefile +++ b/src/Makefile @@ -2,7 +2,7 @@ CXX = g++ CXXFLAGS = -Wall -Wextra -O2 -std=c++23 LIBS = -lOpenCL TARGET = main -COMMON_SRC = ./math/opencl/opencl.cpp ./math/matrix/cpu/matrix.cpp ./math/matrix/cpu/mutable_matrix.cpp ./math/matrix/gpu/matrix.cpp ./math/matrix/gpu/mutable_matrix.cpp +COMMON_SRC = ./math/opencl/opencl.cpp MAIN_SRC = main.cpp $(COMMON_SRC) BENCHMARK_SRC = benchmark.cpp $(COMMON_SRC) diff --git a/src/benchmark.cpp b/src/benchmark.cpp index d3e1492..58ff15a 100644 --- a/src/benchmark.cpp +++ b/src/benchmark.cpp @@ -6,8 +6,7 @@ #include "./math/math.hpp" -typedef Matrices::CPU Matrix; -typedef MutableMatrices::CPU MutableMatrix; +using namespace GPU; OpenCL openCL; @@ -31,40 +30,37 @@ std::vector generateIdentityMatrix(int size) { } int main() { - const int SIZE = 1024; + const int SIZE = 48; std::cout << "Testing with " << SIZE << "x" << SIZE << " matrices..." << std::endl; - std::vector matrixA = generateRandomMatrix(SIZE, SIZE); - std::vector matrixB = generateRandomMatrix(SIZE, SIZE); - std::vector matrixC = generateRandomMatrix(SIZE, SIZE); + // std::vector matrixA = generateRandomMatrix(SIZE, SIZE); + // std::vector matrixB = generateRandomMatrix(SIZE, SIZE); + // std::vector matrixC = generateRandomMatrix(SIZE, SIZE); - // std::vector matrixA = generateIdentityMatrix(SIZE); - // std::vector matrixB = generateIdentityMatrix(SIZE); - // std::vector matrixC = generateIdentityMatrix(SIZE); + std::vector matrixA = generateIdentityMatrix(SIZE); + std::vector matrixB = generateIdentityMatrix(SIZE); + std::vector matrixC = generateIdentityMatrix(SIZE); - // Тестирование на CPU + // Тестирование на GPU { - std::cout << "\n=== CPU Version ===" << std::endl; + std::cout << "\n=== GPU Version ===" << std::endl; auto start = std::chrono::high_resolution_clock::now(); - MutableMatrices::CPU a(SIZE, SIZE, matrixA); - Matrices::CPU b(SIZE, SIZE, matrixB); - Matrices::CPU c(SIZE, SIZE, matrixC); + MatrixMath mm; + Matrix a(SIZE, SIZE, matrixA); + Matrix b(SIZE, SIZE, matrixB); auto gen_end = std::chrono::high_resolution_clock::now(); - auto op_start = std::chrono::high_resolution_clock::now(); - - for (int i = 0; i < 10; i++) { - a.mult(b, 0.2f, MutableMatrices::CPU::Activate::SIGMOID); + for (int i = 0; i < 100; ++i) { + Matrix x = mm.mult(a, b); } - auto op_end = std::chrono::high_resolution_clock::now(); - std::vector v = a.toVector(); + std::vector v = a.toVector(&mm.getQueue()); auto total_end = std::chrono::high_resolution_clock::now(); @@ -88,24 +84,22 @@ int main() { std::cout << std::endl; } - // Тестирование на GPU + // Тестирование на CPU { - std::cout << "\n=== GPU Version ===" << std::endl; + std::cout << "\n=== CPU Version ===" << std::endl; auto start = std::chrono::high_resolution_clock::now(); - MutableMatrices::GPU a(SIZE, SIZE, matrixA); - Matrices::GPU b(SIZE, SIZE, matrixB); - Matrices::GPU c(SIZE, SIZE, matrixC); + CPU::MatrixMath mm; + CPU::Matrix a(SIZE, SIZE, matrixA); + CPU::Matrix b(SIZE, SIZE, matrixB); auto gen_end = std::chrono::high_resolution_clock::now(); auto op_start = std::chrono::high_resolution_clock::now(); - - for (int i = 0; i < 10; i++) { - a.mult(b, 0.2f, MutableMatrices::GPU::Activate::SIGMOID, 0.0f); + for (int i = 0; i < 100; ++i) { + CPU::Matrix x = mm.mult(a, b); } - auto op_end = std::chrono::high_resolution_clock::now(); std::vector v = a.toVector(); diff --git a/src/kernels/matrix.cl b/src/kernels/matrix.cl index f710fd5..95810b9 100644 --- a/src/kernels/matrix.cl +++ b/src/kernels/matrix.cl @@ -1,124 +1,147 @@ float activate_x(float x, const int activation_type, const float alpha) { - switch(activation_type) { - case 0: // LINEAR - return x; - case 1: // SIGMOID - return 1.0f / (1.0f + exp(-x)); - case 2: // TANH - return tanh(x); - case 3: // RELU - return fmax(0.0f, x); - case 4: // LEAKY_RELU - return (x > 0.0f) ? x : alpha * x; - case 5: // ELU - return (x > 0.0f) ? x : alpha * (exp(x) - 1.0f); - case 6: // GELU - return 0.5f * x * (1.0f + tanh(sqrt(2.0f / M_PI_F) * (x + 0.044715f * x * x * x))); - default: - return x; - } + switch (activation_type) { + case 0: // LINEAR + return x; + case 1: // SIGMOID + return 1.0f / (1.0f + exp(-x)); + case 2: // TANH + return tanh(x); + case 3: // RELU + return fmax(0.0f, x); + case 4: // LEAKY_RELU + return (x > 0.0f) ? x : alpha * x; + case 5: // ELU + return (x > 0.0f) ? x : alpha * (exp(x) - 1.0f); + case 6: // GELU + return 0.5f * x * + (1.0f + tanh(sqrt(2.0f / M_PI_F) * (x + 0.044715f * x * x * x))); + default: + return x; + } } -__kernel void activate( - __global float* input, - __global float* output, - const int activation_type, - const float alpha, - const int rows, - const int cols) -{ - int row = get_global_id(0); - int col = get_global_id(1); - - if (row < rows && col < cols) { - int idx = row * cols + col; - output[idx] = activate_x(input[idx], activation_type, alpha); - } +__kernel void activate(__global float *input, __global float *output, + const int activation_type, const float alpha) { + int i = get_global_id(0); + output[i] = activate_x(input[i], activation_type, alpha); } -__kernel void mult( - __global float* A, - __global float* B, - __global float* C, - const float bias, - const int activation_type, - const float alpha, - const int M, - const int N, - const int K) -{ - const int tile_size = 16; - - int local_i = get_local_id(0); - int local_j = get_local_id(1); - int local_size_i = get_local_size(0); - int local_size_j = get_local_size(1); - - int global_i = get_group_id(0) * local_size_i + local_i; - int global_j = get_group_id(1) * local_size_j + local_j; - - __local float tile_A[16][16]; - __local float tile_B[16][16]; - +__kernel void mult_small(__global float *A, __global float *B, + __global float *C, const float bias, + const int activation_type, const float alpha, + const int M, const int N, const int K, + const int transpose_B) { + const int row = get_global_id(0); + const int col = get_global_id(1); + + if (row < M && col < N) { float sum = 0.0f; - - int num_tiles = (K + tile_size - 1) / tile_size; - - for (int tile = 0; tile < num_tiles; tile++) { - int tile_offset = tile * tile_size; - - int load_i_A = tile_offset + local_i; - int load_j_A = tile_offset + local_j; - - if (global_i < M && load_j_A < K) { - tile_A[local_j][local_i] = A[global_i * K + load_j_A]; - } else { - tile_A[local_j][local_i] = 0.0f; - } - - int load_i_B = tile_offset + local_i; - int load_j_B = tile_offset + local_j; - - if (load_i_B < K && global_j < N) { - tile_B[local_j][local_i] = B[load_i_B * N + global_j]; - } else { - tile_B[local_j][local_i] = 0.0f; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - #pragma unroll - for (int k = 0; k < tile_size; k++) { - sum += tile_A[k][local_i] * tile_B[local_j][k]; - } - - barrier(CLK_LOCAL_MEM_FENCE); + for (int k = 0; k < K; k++) { + float a_val = A[row * K + k]; + + float b_val; + if (transpose_B) { + b_val = B[col * K + k]; + } else { + b_val = B[k * N + col]; + } + + sum += a_val * b_val; } - - if (global_i < M && global_j < N) { - float result = sum + bias; - if (activation_type != 0) { - result = activate_x(result, activation_type, alpha); - } - C[global_i * N + global_j] = result; + + float result = sum + bias; + if (activation_type != 0) { + result = activate_x(result, activation_type, alpha); } + C[row * N + col] = result; + } } -__kernel void mult_sc(__global float* A, __global float* B, float scalar, int M, int N) { - int i = get_global_id(0); - int j = get_global_id(1); - B[i * N + j] = A[i * N + j] * scalar; +__kernel void mult(__global float *A, __global float *B, __global float *C, + const float bias, const int activation_type, + const float alpha, const int M, const int N, const int K, + const int transpose_B) { + const int tile_size = 16; + + int local_i = get_local_id(0); + int local_j = get_local_id(1); + int local_size_i = get_local_size(0); + int local_size_j = get_local_size(1); + + int global_i = get_group_id(0) * local_size_i + local_i; + int global_j = get_group_id(1) * local_size_j + local_j; + + __local float tile_A[16][16]; + __local float tile_B[16][16]; + + float sum = 0.0f; + + int num_tiles = (K + tile_size - 1) / tile_size; + + for (int tile = 0; tile < num_tiles; tile++) { + int tile_offset = tile * tile_size; + + // Загрузка tile_A (без изменений) + int load_i_A = tile_offset + local_i; + int load_j_A = tile_offset + local_j; + + if (global_i < M && load_j_A < K) { + tile_A[local_j][local_i] = A[global_i * K + load_j_A]; + } else { + tile_A[local_j][local_i] = 0.0f; + } + + // Загрузка tile_B с учетом транспонирования + int load_i_B = tile_offset + local_i; + int load_j_B = tile_offset + local_j; + + if (transpose_B) { + // B транспонирована: обращаем индексы + if (load_i_B < N && global_j < K) { + tile_B[local_j][local_i] = B[global_j * N + load_i_B]; + } else { + tile_B[local_j][local_i] = 0.0f; + } + } else { + // B не транспонирована (оригинальная логика) + if (load_i_B < K && global_j < N) { + tile_B[local_j][local_i] = B[load_i_B * N + global_j]; + } else { + tile_B[local_j][local_i] = 0.0f; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + +#pragma unroll + for (int k = 0; k < tile_size; ++k) { + sum += tile_A[k][local_i] * tile_B[local_j][k]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (global_i < M && global_j < N) { + float result = sum + bias; + if (activation_type != 0) { + result = activate_x(result, activation_type, alpha); + } + C[global_i * N + global_j] = result; + } } -__kernel void add(__global float* A, __global float* B, __global float* C, float a, float b, int M, int N) { - int i = get_global_id(0); - int j = get_global_id(1); - C[i * N + j] = (A[i * N + j] * a) + (B[i * N + j] * b); +__kernel void mult_sc(__global float *A, __global float *B, float scalar) { + int i = get_global_id(0); + B[i] = A[i] * scalar; } -__kernel void add_sc(__global float* A, __global float* B, float scalar, int M, int N) { - int i = get_global_id(0); - int j = get_global_id(1); - B[i * N + j] = A[i * N + j] + scalar; +__kernel void add(__global float *A, __global float *B, __global float *C, + float x) { + int i = get_global_id(0); + C[i] = A[i] + (B[i] * x); } +__kernel void add_sc(__global float *A, __global float *B, float scalar) { + int i = get_global_id(0); + B[i] = A[i] + scalar; +} diff --git a/src/main.cpp b/src/main.cpp index cb9731c..f3fa5ec 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -3,73 +3,29 @@ #include #include -typedef Matrices::GPU M; -typedef MutableMatrices::GPU MM; - -class Layer { -protected: - int features; - float bias; - MM::Activate activate; - float alpha; - -public: - Layer(int features, MM::Activate activate = MM::Activate::LINEAR, - float bias = 0.0f, float alpha = 0.0f) - : features(features), activate(activate), bias(bias), alpha(alpha) {} - - int getFeatures() const { return features; } - float getBias() const { return bias; } - MM::Activate getActivate() const { return activate; } - float getAlpha() const { return alpha; } -}; - -class NeuralNetwork { -private: - std::vector layers; - std::vector weights; - -public: - NeuralNetwork(int n, std::initializer_list l) : layers(l) { - weights.emplace_back(n, layers[0].getFeatures()); - for (int i = 0; i < layers.size() - 1; i++) - weights.emplace_back(layers[i].getFeatures(), - layers[i + 1].getFeatures()); - } - - std::vector predict(std::vector i) { - if (i.size() != weights[0].getRows()) - std::invalid_argument("Invalid input size"); - MM input(1, (int)i.size(), i); - for (size_t i = 0; i < weights.size(); i++) - input.mult(weights[i], layers[i + 1].getBias(), - layers[i + 1].getActivate(), layers[i + 1].getAlpha()); - return input.toVector(); - } -}; +using namespace GPU; OpenCL openCL; int main() { - NeuralNetwork nn( - 2, {Layer(3, MM::Activate::RELU), Layer(1, MM::Activate::RELU)}); + MatrixMath mm; - for (int i = 0; i < 10; i++) { - int v1 = (i / 2) % 2; - int v2 = i % 2; + Matrix a(2, 2); + Matrix b(2, 2); - std::vector v = {static_cast(v1), static_cast(v2)}; + CPU::Matrix a_(2, 2, a.toVector()); + CPU::Matrix b_(2, 2, b.toVector()); - std::vector r = nn.predict(v); - float expected = static_cast(v1 ^ v2); + a_.print(); + b_.print(); - std::cout << "XOR(" << v1 << ", " << v2 << ") = " << expected; - std::cout << " | Network: "; - for (size_t j = 0; j < r.size(); ++j) { - std::cout << r[j] << " "; - } - std::cout << std::endl; - } + Matrix c = mm.add(a, b); + + CPU::Matrix c_(2, 2, c.toVector(&mm.getQueue())); + + mm.await(); + + c_.print(); return 0; } \ No newline at end of file diff --git a/src/math/math.hpp b/src/math/math.hpp index b553fb6..54ff3a1 100644 --- a/src/math/math.hpp +++ b/src/math/math.hpp @@ -2,8 +2,6 @@ #include "opencl/opencl.hpp" -#include "matrix/cpu/matrix.hpp" -#include "matrix/cpu/mutable_matrix.hpp" +#include "tensor/cpu/math.hpp" -#include "matrix/gpu/matrix.hpp" -#include "matrix/gpu/mutable_matrix.hpp" +#include "tensor/gpu/math.hpp" diff --git a/src/math/matrix/cpu/matrix.cpp b/src/math/matrix/cpu/matrix.cpp deleted file mode 100644 index ab07f5b..0000000 --- a/src/math/matrix/cpu/matrix.cpp +++ /dev/null @@ -1,24 +0,0 @@ -#include "matrix.hpp" - -Matrices::CPU::CPU(int rows, int cols, float value) - : IMatrix(rows, cols), data(rows * cols, value) { - validateDimensions(rows, cols); -} - -Matrices::CPU::CPU(int rows, int cols, const std::vector &matrix) - : IMatrix(rows, cols), data(matrix) { - validateDimensions(rows, cols); - if (matrix.size() != static_cast(rows * cols)) { - throw std::invalid_argument("Data size doesn't match matrix dimensions"); - } -} - -float &Matrices::CPU::operator()(int row, int col) { - checkIndices(row, col); - return data[row * cols + col]; -} - -const float &Matrices::CPU::operator()(int row, int col) const { - checkIndices(row, col); - return data[row * cols + col]; -} \ No newline at end of file diff --git a/src/math/matrix/cpu/matrix.hpp b/src/math/matrix/cpu/matrix.hpp deleted file mode 100644 index c8d6842..0000000 --- a/src/math/matrix/cpu/matrix.hpp +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include "../matrix.hpp" - -namespace Matrices { - -class CPU : public IMatrix { -protected: - std::vector data; - -public: - CPU(int rows, int cols, float value = 0.0f); - CPU(int rows, int cols, const std::vector &matrix); - - CPU(const CPU &) = default; - CPU &operator=(const CPU &) = default; - CPU(CPU &&) = default; - CPU &operator=(CPU &&) = default; - ~CPU() override = default; - - float &operator()(int row, int col); - const float &operator()(int row, int col) const; - - const std::vector toVector() const { return data; } - - int getRows() const override { return rows; } - int getCols() const override { return cols; } - size_t getSize() const { return data.size(); } - - // GPU toGPU(OpenCL &openCL) const { return GPU(rows, cols, data); } -}; - -} // namespace Matrices diff --git a/src/math/matrix/cpu/mutable_matrix.cpp b/src/math/matrix/cpu/mutable_matrix.cpp deleted file mode 100644 index 5ea45e1..0000000 --- a/src/math/matrix/cpu/mutable_matrix.cpp +++ /dev/null @@ -1,76 +0,0 @@ -#include "mutable_matrix.hpp" - -float MutableMatrices::CPU::activate_x(float x, Activate type, float alpha) { - switch (type) { - case Activate::LINEAR: - return x; - case Activate::SIGMOID: - return 1.0f / (1.0f + std::exp(-x)); - case Activate::TANH: - return std::tanh(x); - case Activate::RELU: - return std::max(0.0f, x); - case Activate::LEAKY_RELU: - return (x > 0.0f) ? x : alpha * x; - case Activate::ELU: - return (x > 0.0f) ? x : alpha * (std::exp(x) - 1.0f); - case Activate::GELU: - return 0.5f * x * - (1.0f + - std::tanh(std::sqrt(2.0f / M_PI) * (x + 0.044715f * x * x * x))); - default: - throw std::invalid_argument("Unknown activation type"); - } -} -void MutableMatrices::CPU::mult(Matrices::CPU &m, float bias, Activate type, - float alpha) { - validateMultDimensions(*this, m); - - std::vector result(rows * m.getCols(), 0.0f); - for (int i = 0; i < rows; i++) { - for (int j = 0; j < m.getCols(); j++) { - float sum = 0.0f; - for (int k = 0; k < cols; k++) { - sum += (*this)(i, k) * m(k, j); - } - result[i * m.getCols() + j] = activate_x(sum + bias, type, alpha); - } - } - data = std::move(result); - cols = m.getCols(); -} - -void MutableMatrices::CPU::mult(float scalar) { - for (int i = 0; i < rows; i++) { - for (int j = 0; j < cols; j++) { - data[i * cols + j] *= scalar; - } - } -} - -void MutableMatrices::CPU::add(Matrices::CPU &m, float a, float b) { - validateSameDimensions(*this, m); - - std::vector result(rows * cols, 0.0f); - for (int i = 0; i < rows; i++) { - for (int j = 0; j < cols; j++) { - result[i * cols + j] = ((*this)(i, j) * a) + (m(i, j) * b); - } - } - data = std::move(result); -} - -void MutableMatrices::CPU::add(float scalar) { - for (int i = 0; i < rows; i++) { - for (int j = 0; j < cols; j++) { - data[i * cols + j] += scalar; - } - } -} -void MutableMatrices::CPU::activate(Activate type, float alpha) { - for (int i = 0; i < rows; i++) { - for (int j = 0; j < cols; j++) { - data[i * cols + j] = activate_x(data[i * cols + j], type, alpha); - } - } -} \ No newline at end of file diff --git a/src/math/matrix/cpu/mutable_matrix.hpp b/src/math/matrix/cpu/mutable_matrix.hpp deleted file mode 100644 index 5110413..0000000 --- a/src/math/matrix/cpu/mutable_matrix.hpp +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include "matrix.hpp" - -#include "../mutable_matrix.hpp" - -#include - -#define M_PI 3.14159265358979323846 - -namespace MutableMatrices { - -class CPU : public Matrices::CPU, public IMutableMatrix { -private: - static float activate_x(float x, Activate type, float alpha = 0.01f); - -public: - CPU(int rows, int cols, const std::vector &matrix) - : Matrices::CPU(rows, cols, matrix) {} - - void mult(Matrices::CPU &m, float bias = 0.0f, - Activate type = Activate::LINEAR, float alpha = 0.01f); - void mult(float scalar); - void add(Matrices::CPU &m, float a = 1.0f, float b = 1.0f); - void add(float scalar); - void activate(Activate type, float alpha = 0.01f); -}; -}; // namespace MutableMatrices diff --git a/src/math/matrix/gpu/matrix.cpp b/src/math/matrix/gpu/matrix.cpp deleted file mode 100644 index 116b94f..0000000 --- a/src/math/matrix/gpu/matrix.cpp +++ /dev/null @@ -1,41 +0,0 @@ -#include - -#include "matrix.hpp" - -std::random_device rd; -std::mt19937 gen(rd()); - -Matrices::GPU::GPU(int rows, int cols) - : IMatrix(rows, cols), queue(openCL.getContext(), openCL.getDevice()) { - validateDimensions(rows, cols); - std::vector matrix; - matrix.reserve(rows * cols); - for (size_t i = 0; i < (size_t)rows * (size_t)cols; ++i) - matrix.push_back(std::generate_canonical(gen)); - buffer = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE, - rows * cols * sizeof(float)); - queue.enqueueWriteBuffer(*buffer, CL_TRUE, 0, rows * cols * sizeof(float), - matrix.data()); - queue.finish(); -} - -Matrices::GPU::GPU(int rows, int cols, const std::vector &matrix) - : IMatrix(rows, cols), queue(openCL.getContext(), openCL.getDevice()) { - validateDimensions(rows, cols); - if (matrix.size() != static_cast(rows * cols)) { - throw std::invalid_argument("Matrix data size doesn't match dimensions"); - } - buffer = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE, - rows * cols * sizeof(float)); - queue.enqueueWriteBuffer(*buffer, CL_TRUE, 0, rows * cols * sizeof(float), - matrix.data()); - queue.finish(); -} - -const std::vector Matrices::GPU::toVector() const { - std::vector result(rows * cols); - queue.enqueueReadBuffer(*buffer, CL_TRUE, 0, rows * cols * sizeof(float), - result.data()); - queue.finish(); - return result; -} \ No newline at end of file diff --git a/src/math/matrix/gpu/matrix.hpp b/src/math/matrix/gpu/matrix.hpp deleted file mode 100644 index 1c4b244..0000000 --- a/src/math/matrix/gpu/matrix.hpp +++ /dev/null @@ -1,40 +0,0 @@ -#pragma once - -#include "../../opencl/opencl.hpp" - -#include "../matrix.hpp" - -namespace Matrices { -class GPU : public IMatrix { -protected: - cl::Buffer *buffer; - cl::CommandQueue queue; - -public: - GPU(int rows, int cols); - GPU(int rows, int cols, const std::vector &matrix); - ~GPU() { delete buffer; } - - GPU(const GPU &) = delete; - GPU &operator=(const GPU &) = delete; - GPU(GPU &&other) - : IMatrix(other.rows, other.cols), buffer(other.buffer), - queue(std::move(other.queue)) { - other.buffer = nullptr; - other.rows = 0; - other.cols = 0; - } - GPU &operator=(GPU &&other) = default; - - int getRows() const override { return rows; } - int getCols() const override { return cols; } - size_t getSize() const { return rows * cols; } - - const cl::Buffer *getBuffer() const { return buffer; } - - const std::vector toVector() const; - - // CPU toCPU() const { return CPU(rows, cols, toVector()); }; -}; - -} // namespace Matrices diff --git a/src/math/matrix/gpu/mutable_matrix.cpp b/src/math/matrix/gpu/mutable_matrix.cpp deleted file mode 100644 index ebe2e03..0000000 --- a/src/math/matrix/gpu/mutable_matrix.cpp +++ /dev/null @@ -1,120 +0,0 @@ -#include "mutable_matrix.hpp" - -MutableMatrices::GPU::GPU(int rows, int cols) : Matrices::GPU(rows, cols) { - for (const auto &entry : kernelsNames) { - kernels[entry.first] = - cl::Kernel(openCL.getProgram(OpenCL::Program::MATRIX), entry.second); - } -} - -MutableMatrices::GPU::GPU(int rows, int cols, const std::vector &matrix) - : Matrices::GPU(rows, cols, matrix) { - for (const auto &entry : kernelsNames) { - kernels[entry.first] = - cl::Kernel(openCL.getProgram(OpenCL::Program::MATRIX), entry.second); - } -} - -void MutableMatrices::GPU::mult(Matrices::GPU &m, float bias, Activate type, - float alpha) { - validateMultDimensions(*this, m); - - cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE, - rows * m.getCols() * sizeof(float)); - - const int tile_size = 16; - cl::NDRange local_size(tile_size, tile_size); - cl::NDRange global_size(((rows + tile_size - 1) / tile_size) * tile_size, - ((m.getCols() + tile_size - 1) / tile_size) * - tile_size); - - kernels[Method::MULT].setArg(0, *buffer); - kernels[Method::MULT].setArg(1, *m.getBuffer()); - kernels[Method::MULT].setArg(2, *b); - kernels[Method::MULT].setArg(3, bias); - kernels[Method::MULT].setArg(4, static_cast(type)); - kernels[Method::MULT].setArg(5, alpha); - kernels[Method::MULT].setArg(6, rows); - kernels[Method::MULT].setArg(7, m.getCols()); - kernels[Method::MULT].setArg(8, cols); - cl::Event event; - queue.enqueueNDRangeKernel(kernels[Method::MULT], cl::NullRange, global_size, - local_size, nullptr, &event); - - event.setCallback(CL_COMPLETE, releaseBuffer, buffer); - buffer = b; - cols = m.getCols(); -} - -void MutableMatrices::GPU::mult(float scalar) { - cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE, - rows * cols * sizeof(float)); - kernels[Method::SCALAR_MULT].setArg(0, *buffer); - kernels[Method::SCALAR_MULT].setArg(1, *b); - kernels[Method::SCALAR_MULT].setArg(2, scalar); - kernels[Method::SCALAR_MULT].setArg(3, rows); - kernels[Method::SCALAR_MULT].setArg(4, cols); - cl::Event event; - queue.enqueueNDRangeKernel(kernels[Method::SCALAR_MULT], cl::NullRange, - cl::NDRange(rows, cols), cl::NullRange, nullptr, - &event); - - event.setCallback(CL_COMPLETE, releaseBuffer, buffer); - buffer = b; -} - -void MutableMatrices::GPU::add(Matrices::GPU &m, float a, float b) { - validateSameDimensions(*this, m); - - cl::Buffer *buf = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE, - rows * cols * sizeof(float)); - kernels[Method::ADD].setArg(0, *buffer); - kernels[Method::ADD].setArg(1, *m.getBuffer()); - kernels[Method::ADD].setArg(2, *buf); - kernels[Method::ADD].setArg(3, a); - kernels[Method::ADD].setArg(4, b); - kernels[Method::ADD].setArg(5, rows); - kernels[Method::ADD].setArg(6, cols); - cl::Event event; - queue.enqueueNDRangeKernel(kernels[Method::ADD], cl::NullRange, - cl::NDRange(rows, cols), cl::NullRange, nullptr, - &event); - - event.setCallback(CL_COMPLETE, releaseBuffer, buffer); - buffer = buf; -} - -void MutableMatrices::GPU::add(float scalar) { - cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE, - rows * cols * sizeof(float)); - kernels[Method::SCALAR_ADD].setArg(0, *buffer); - kernels[Method::SCALAR_ADD].setArg(1, *b); - kernels[Method::SCALAR_ADD].setArg(2, scalar); - kernels[Method::SCALAR_ADD].setArg(3, rows); - kernels[Method::SCALAR_ADD].setArg(4, cols); - cl::Event event; - queue.enqueueNDRangeKernel(kernels[Method::SCALAR_ADD], cl::NullRange, - cl::NDRange(rows, cols), cl::NullRange, nullptr, - &event); - - event.setCallback(CL_COMPLETE, releaseBuffer, buffer); - buffer = b; -} - -void MutableMatrices::GPU::activate(Activate type, float alpha) { - cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE, - rows * cols * sizeof(float)); - kernels[Method::ACTIVATE].setArg(0, *buffer); - kernels[Method::ACTIVATE].setArg(1, *b); - kernels[Method::ACTIVATE].setArg(2, static_cast(type)); - kernels[Method::ACTIVATE].setArg(3, alpha); - kernels[Method::ACTIVATE].setArg(4, rows); - kernels[Method::ACTIVATE].setArg(5, cols); - cl::Event event; - queue.enqueueNDRangeKernel(kernels[Method::ACTIVATE], cl::NullRange, - cl::NDRange(rows, cols), cl::NullRange, nullptr, - &event); - - event.setCallback(CL_COMPLETE, releaseBuffer, buffer); - buffer = b; -} \ No newline at end of file diff --git a/src/math/matrix/gpu/mutable_matrix.hpp b/src/math/matrix/gpu/mutable_matrix.hpp deleted file mode 100644 index 8cd6f9d..0000000 --- a/src/math/matrix/gpu/mutable_matrix.hpp +++ /dev/null @@ -1,46 +0,0 @@ -#pragma once - -#include "../../opencl/opencl.hpp" - -#include "matrix.hpp" - -#include "../mutable_matrix.hpp" - -namespace MutableMatrices { - -class GPU : public Matrices::GPU, public IMutableMatrix { -private: - enum class Method { MULT, SCALAR_MULT, ADD, SCALAR_ADD, ACTIVATE }; - std::unordered_map kernels; - std::unordered_map kernelsNames = { - {Method::MULT, "mult"}, - {Method::SCALAR_MULT, "mult_sc"}, - {Method::ADD, "add"}, - {Method::SCALAR_ADD, "add_sc"}, - {Method::ACTIVATE, "activate"}}; - - static void CL_CALLBACK releaseBuffer(cl_event, cl_int status, void *buf) { - if (status == CL_COMPLETE) { - // std::cout << "Kernel complete!" << std::endl; - delete (cl::Buffer *)buf; - } - } - -public: - GPU(int rows, int cols); - GPU(int rows, int cols, const std::vector &matrix); - - GPU(const GPU &) = delete; - GPU &operator=(const GPU &) = delete; - GPU(GPU &&other) = default; - GPU &operator=(GPU &&other) = default; - - void mult(Matrices::GPU &m, float bias = 0.0f, - Activate type = Activate::LINEAR, float alpha = 0.01f); - void mult(float scalar); - void add(Matrices::GPU &m, float a = 1.0f, float b = 1.0f); - void add(float scalar); - void activate(Activate type, float alpha = 0.01f); -}; - -}; // namespace MutableMatrices diff --git a/src/math/matrix/matrix.hpp b/src/math/matrix/matrix.hpp deleted file mode 100644 index 339509a..0000000 --- a/src/math/matrix/matrix.hpp +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once - -#include -#include -#include - -class IMatrix { -protected: - int rows; - int cols; - - void validateDimensions(int rows, int cols) const { - if (rows <= 0 || cols <= 0) { - throw std::invalid_argument("Matrix dimensions must be positive"); - } - }; - void checkIndices(int row, int col) const { - if (row < 0 || row >= rows || col < 0 || col >= cols) { - throw std::out_of_range("Matrix indices out of range"); - } - }; - -public: - IMatrix(int rows, int cols) : rows(rows), cols(cols) {} - virtual ~IMatrix() = default; - virtual int getRows() const = 0; - virtual int getCols() const = 0; - virtual const std::vector toVector() const = 0; -}; diff --git a/src/math/matrix/mutable_matrix.hpp b/src/math/matrix/mutable_matrix.hpp deleted file mode 100644 index 72e3867..0000000 --- a/src/math/matrix/mutable_matrix.hpp +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once - -#include "matrix.hpp" - -template class IMutableMatrix { - static_assert(std::is_base_of::value, - "T must be derived from IMatrix"); - -public: - enum class Activate { LINEAR, SIGMOID, TANH, RELU, LEAKY_RELU, ELU, GELU }; - - virtual void mult(T &m, float bias, Activate type, float alpha) = 0; - virtual void mult(float s) = 0; - virtual void add(T &m, float a, float b) = 0; - virtual void add(float a) = 0; - virtual void activate(Activate type, float alpha = 0.01f) = 0; - - void validateMultDimensions(T &a, T &b) const { - if (a.getCols() != b.getRows()) { - throw std::invalid_argument( - "Invalid matrix dimensions for multiplication"); - } - }; - void validateSameDimensions(T &a, T &b) const { - if (a.getRows() != b.getRows() || a.getCols() != b.getCols()) { - throw std::invalid_argument("Invalid matrix dimensions for addition"); - } - }; -}; diff --git a/src/math/opencl/opencl.hpp b/src/math/opencl/opencl.hpp index 5623655..e509b9d 100644 --- a/src/math/opencl/opencl.hpp +++ b/src/math/opencl/opencl.hpp @@ -40,7 +40,7 @@ public: cl::Device &getDevice() { return device; } cl::Context &getContext() { return context; } - cl::CommandQueue &getDefaultQueue() { return defaultQueue; } + const cl::CommandQueue &getDefaultQueue() { return defaultQueue; } cl::Program &getProgram(Program program); void printDeviceInfo() const; diff --git a/src/math/tensor/cpu/math.cpp b/src/math/tensor/cpu/math.cpp new file mode 100644 index 0000000..07a104b --- /dev/null +++ b/src/math/tensor/cpu/math.cpp @@ -0,0 +1 @@ +#include "math.hpp" diff --git a/src/math/tensor/cpu/math.hpp b/src/math/tensor/cpu/math.hpp new file mode 100644 index 0000000..d2c1b3f --- /dev/null +++ b/src/math/tensor/cpu/math.hpp @@ -0,0 +1,103 @@ +#pragma once + +#include "tensor.hpp" + +#include "../math.hpp" + +#include + +#define M_PI 3.14159265358979323846 + +namespace CPU { +template class TensorMath; +class Tensor0Math; +class Tensor1Math; +class Tensor2Math; +class Tensor3Math; + +template class TensorMath : public ITensorMath { +protected: + float activate_x(float x, Activation type, float alpha = 0.01f) { + switch (type) { + case Activation::LINEAR: + return x; + case Activation::SIGMOID: + return 1.0f / (1.0f + std::exp(-x)); + case Activation::TANH: + return std::tanh(x); + case Activation::RELU: + return std::max(0.0f, x); + case Activation::LEAKY_RELU: + return (x > 0.0f) ? x : alpha * x; + case Activation::ELU: + return (x > 0.0f) ? x : alpha * (std::exp(x) - 1.0f); + case Activation::GELU: + return 0.5f * x * + (1.0f + + std::tanh(std::sqrt(2.0f / M_PI) * (x + 0.044715f * x * x * x))); + default: + throw std::invalid_argument("Unknown activation type"); + } + } + +public: + T activate(const T &t, Activation type = Activation::LINEAR, + float alpha = 0.0f) override { + T result(t.getShape(), false); + for (size_t i = 0; i < t.getSize(); ++i) { + result[i] = activate_x(t[i], type, alpha); + } + return result; + } + + T mult(const T &t, float x) override { + T result(t.getShape(), false); + for (size_t i = 0; i < t.getSize(); ++i) + result[i] = t[i] * x; + return result; + } + T add(const T &a, const T &b, float x = 1.0f) override { + this->validateSameDimensions(a, b); + T result(a.getShape(), false); + for (size_t i = 0; i < a.getSize(); ++i) + result[i] = a[i] + (b[i] * x); + return result; + } + T add(const T &t, float x) override { + T result(t.getShape(), false); + for (size_t i = 0; i < t.getSize(); ++i) + result[i] = t[i] + x; + return result; + } +}; + +class Tensor0Math : public TensorMath, public ITensor0Math {}; + +class Tensor1Math : public TensorMath, public ITensor1Math {}; + +class Tensor2Math : public TensorMath, public ITensor2Math { +public: + Tensor2 mult(const Tensor2 &a, const Tensor2 &b, bool transpose = false, + float bias = 0.0f, Activation type = Activation::LINEAR, + float alpha = 0.01f) override { + validateMultDimensions(a, b, transpose); + Tensor2 result(a.getRows(), b.getCols(), 0.0f); + for (int i = 0; i < result.getRows(); ++i) { + for (int j = 0; j < result.getCols(); ++j) { + float sum = 0.0f; + for (int k = 0; k < a.getCols(); ++k) + sum += a(i, k) * (transpose ? b(j, k) : b(k, j)); + result(i, j) = activate_x(sum + bias, type, alpha); + } + } + return result; + } +}; + +class Tensor3Math : public TensorMath, public ITensor3Math {}; + +typedef Tensor0Math ScalarMath; +typedef Tensor1Math VectorMath; +typedef Tensor2Math MatrixMath; + +} // namespace CPU diff --git a/src/math/tensor/cpu/tensor.cpp b/src/math/tensor/cpu/tensor.cpp new file mode 100644 index 0000000..f7d3338 --- /dev/null +++ b/src/math/tensor/cpu/tensor.cpp @@ -0,0 +1 @@ +#include "tensor.hpp" diff --git a/src/math/tensor/cpu/tensor.hpp b/src/math/tensor/cpu/tensor.hpp new file mode 100644 index 0000000..ab67455 --- /dev/null +++ b/src/math/tensor/cpu/tensor.hpp @@ -0,0 +1,296 @@ +#pragma once + +#include +#include +#include +#include + +#include "../tensor.hpp" + +extern std::mt19937 gen; + +namespace CPU { +class Tensor; +class Tensor0; +class Tensor1; +class Tensor2; +class Tensor3; + +class Tensor : public ITensor { +protected: + std::vector data; + + void resize(size_t size) { data.resize(size); } + void resize(const std::vector &shape) { + size_t size = 1; + for (int dim : shape) + size *= dim; + resize(size); + } + +public: + Tensor(const std::vector &shape) : ITensor(shape) { + resize(shape); + std::generate(data.begin(), data.end(), + []() { return std::generate_canonical(gen); }); + } + Tensor(const std::vector &shape, float value) : ITensor(shape) { + resize(shape); + std::fill(data.begin(), data.end(), value); + } + Tensor(const std::vector &shape, bool fill) : ITensor(shape) { + resize(shape); + if (fill) + std::fill(data.begin(), data.end(), 0.0f); + } + Tensor(const Tensor &) = default; + Tensor &operator=(const Tensor &) = default; + Tensor(Tensor &&other) = default; + Tensor &operator=(Tensor &&other) = default; + + float &operator[](int index) { return data[index]; } + const float &operator[](int index) const { return data[index]; } + + virtual void print() const { + std::cout << "Tensor(" << getDim() << "): ["; + for (size_t i = 0; i < data.size(); ++i) { + std::cout << data[i]; + if (i > 15) { + std::cout << "... "; + break; + } + if (i != data.size() - 1) + std::cout << ", "; + } + std::cout << "]" << std::endl; + } + + std::vector toVector() const { return data; } + + static Tensor0 *asScalar(Tensor *tensor) { + return tensor->getType() == Type::SCALAR + ? reinterpret_cast(tensor) + : nullptr; + } + static const Tensor0 *asScalar(const Tensor *tensor) { + return tensor->getType() == Type::SCALAR + ? reinterpret_cast(tensor) + : nullptr; + } + static Tensor1 *asVector(Tensor *tensor) { + return tensor->getType() == Type::VECTOR + ? reinterpret_cast(tensor) + : nullptr; + } + static const Tensor1 *asVector(const Tensor *tensor) { + return tensor->getType() == Type::VECTOR + ? reinterpret_cast(tensor) + : nullptr; + } + static Tensor2 *asMatrix(Tensor *tensor) { + return tensor->getType() == Type::MATRIX + ? reinterpret_cast(tensor) + : nullptr; + } + static const Tensor2 *asMatrix(const Tensor *tensor) { + return tensor->getType() == Type::MATRIX + ? reinterpret_cast(tensor) + : nullptr; + } + static Tensor3 *asTensor3(Tensor *tensor) { + return tensor->getType() == Type::TENSOR3 + ? reinterpret_cast(tensor) + : nullptr; + } + static const Tensor3 *asTensor3(const Tensor *tensor) { + return tensor->getType() == Type::TENSOR3 + ? reinterpret_cast(tensor) + : nullptr; + } +}; + +class Tensor0 : public Tensor, public ITensor0 { +public: + Tensor0(const std::vector &shape) : Tensor(shape) { + if (shape.size() != 0) + throw std::invalid_argument("Tensor0 dimension must be 0"); + } + Tensor0(const std::vector &shape, float value) : Tensor(shape, value) { + if (shape.size() != 0) + throw std::invalid_argument("Tensor0 dimension must be 0"); + } + Tensor0() : Tensor({}) { + resize(1); + data[0] = std::generate_canonical(gen); + } + Tensor0(float value) : Tensor({}) { + resize(1); + data[0] = value; + } + Tensor0(const Tensor0 &) = default; + Tensor0 &operator=(const Tensor0 &) = default; + Tensor0(Tensor0 &&other) = default; + Tensor0 &operator=(Tensor0 &&other) = default; + + void print() const override { + std::cout << "Scalar: " << data[0] << std::endl; + } + + float &value() { return data[0]; } + const float &value() const { return data[0]; } +}; + +class Tensor1 : public Tensor, public ITensor1 { +public: + Tensor1(const std::vector &shape) : Tensor(shape) { + if (shape.size() != 1) + throw std::invalid_argument("Tensor1 dimension must be 1"); + } + Tensor1(const std::vector &shape, float value) : Tensor(shape, value) { + if (shape.size() != 1) + throw std::invalid_argument("Tensor1 dimension must be 1"); + } + Tensor1(int size) : Tensor({size}) {} + Tensor1(int size, float value) : Tensor({size}, value) {} + Tensor1(const std::vector &values) : Tensor({(int)values.size()}) { + data = values; + } + Tensor1(const Tensor1 &) = default; + Tensor1 &operator=(const Tensor1 &) = default; + Tensor1(Tensor1 &&other) = default; + Tensor1 &operator=(Tensor1 &&other) = default; + + void print() const override { + std::cout << "Vector(" << shape[0] << "): ["; + for (size_t i = 0; i < data.size(); ++i) { + std::cout << data[i]; + if (i != data.size() - 1) + std::cout << ", "; + } + std::cout << "]" << std::endl; + } + + float &operator()(int i) { return data[i]; } + const float &operator()(int i) const { return data[i]; } + + int getSize() const override { return shape[0]; } +}; + +class Tensor2 : public ITensor2, public Tensor { +public: + Tensor2(const std::vector &shape) : Tensor(shape) { + if (shape.size() != 2) + throw std::invalid_argument("Tensor2 dimension must be 2"); + } + Tensor2(const std::vector &shape, float value) : Tensor(shape, value) { + if (shape.size() != 2) + throw std::invalid_argument("Tensor2 dimension must be 2"); + } + Tensor2(int rows, int cols) : ITensor2(), Tensor({rows, cols}) {} + Tensor2(int rows, int cols, float value) + : ITensor2(), Tensor({rows, cols}, value) {} + Tensor2(int rows, int cols, const std::vector &values) + : Tensor({rows, cols}, false) { + for (int i = 0; i < shape[0]; ++i) { + for (int j = 0; j < shape[1]; ++j) { + data[i * shape[1] + j] = values[i * shape[1] + j]; + } + } + } + Tensor2(const std::vector> &values) + : Tensor({(int)values.size(), (int)values[0].size()}) { + for (int i = 0; i < shape[0]; ++i) { + for (int j = 0; j < shape[1]; ++j) { + data[i * shape[1] + j] = values[i][j]; + } + } + } + Tensor2(const Tensor2 &) = default; + Tensor2 &operator=(const Tensor2 &) = default; + Tensor2(Tensor2 &&other) = default; + Tensor2 &operator=(Tensor2 &&other) = default; + + void print() const override { + std::cout << "Matrix(" << shape[0] << "x" << shape[1] << "):\n"; + for (int i = 0; i < shape[0]; ++i) { + for (int j = 0; j < shape[1]; ++j) { + std::cout << data[i * shape[1] + j] << " "; + } + std::cout << std::endl; + } + } + + float &operator()(int i, int j) { return data[i * shape[1] + j]; } + const float &operator()(int i, int j) const { return data[i * shape[1] + j]; } + + int getRows() const override { return shape[0]; } + int getCols() const override { return shape[1]; } +}; + +class Tensor3 : public Tensor, public ITensor3 { +public: + Tensor3(const std::vector &shape) : Tensor(shape) { + if (shape.size() != 3) + throw std::invalid_argument("Tensor3 dimension must be 3"); + } + Tensor3(const std::vector &shape, float value) : Tensor(shape, value) { + if (shape.size() != 3) + throw std::invalid_argument("Tensor3 dimension must be 3"); + } + Tensor3(int d1, int d2, int d3) : Tensor({d1, d2, d3}) {} + Tensor3(int d1, int d2, int d3, float value) : Tensor({d1, d2, d3}, value) {} + Tensor3(int d1, int d2, int d3, const std::vector &values) + : Tensor({d1, d2, d3}, false) { + for (int i = 0; i < shape[0]; ++i) { + for (int j = 0; j < shape[1]; ++j) { + for (int k = 0; k < shape[2]; ++k) { + data[i * shape[1] * shape[2] + j * shape[2] + k] = + values[i * shape[1] * shape[2] + j * shape[2] + k]; + } + } + } + } + Tensor3(const std::vector>> &values) + : Tensor({(int)values.size(), (int)values[0].size(), + (int)values[0][0].size()}) { + for (int i = 0; i < shape[0]; ++i) { + for (int j = 0; j < shape[1]; ++j) { + for (int k = 0; k < shape[2]; ++k) { + data[i * shape[1] * shape[2] + j * shape[2] + k] = values[i][j][k]; + } + } + } + } + Tensor3(const Tensor3 &) = default; + Tensor3 &operator=(const Tensor3 &) = default; + Tensor3(Tensor3 &&other) = default; + Tensor3 &operator=(Tensor3 &&other) = default; + + void print() const override { + std::cout << "Tensor3(" << shape[0] << "x" << shape[1] << "x" << shape[2] + << "):\n"; + for (int i = 0; i < shape[0]; ++i) { + std::cout << "Slice " << i << ":\n"; + for (int j = 0; j < shape[1]; ++j) { + for (int k = 0; k < shape[2]; ++k) { + std::cout << data[i * shape[1] * shape[2] + j * shape[2] + k] << " "; + } + std::cout << std::endl; + } + std::cout << std::endl; + } + } + + float &operator()(int i, int j, int k) { + return data[i * shape[1] * shape[2] + j * shape[2] + k]; + } + const float &operator()(int i, int j, int k) const { + return data[i * shape[1] * shape[2] + j * shape[2] + k]; + } +}; + +typedef Tensor0 Scalar; +typedef Tensor1 Vector; +typedef Tensor2 Matrix; + +} // namespace CPU diff --git a/src/math/tensor/gpu/math.cpp b/src/math/tensor/gpu/math.cpp new file mode 100644 index 0000000..07a104b --- /dev/null +++ b/src/math/tensor/gpu/math.cpp @@ -0,0 +1 @@ +#include "math.hpp" diff --git a/src/math/tensor/gpu/math.hpp b/src/math/tensor/gpu/math.hpp new file mode 100644 index 0000000..d783d3a --- /dev/null +++ b/src/math/tensor/gpu/math.hpp @@ -0,0 +1,164 @@ +#pragma once + +#include "../../opencl/opencl.hpp" + +#include "tensor.hpp" + +#include "../math.hpp" + +namespace GPU { +template class TensorMath; +class Tensor0Math; +class Tensor1Math; +class Tensor2Math; +class Tensor3Math; + +template class TensorMath : public ITensorMath { +protected: + enum class Method { + MULT, + MULT_SMALL, + SCALAR_MULT, + ADD, + SCALAR_ADD, + ACTIVATE + }; + std::unordered_map kernels; + std::unordered_map kernelsNames = { + {Method::MULT, "mult"}, {Method::MULT_SMALL, "mult_small"}, + {Method::SCALAR_MULT, "mult_sc"}, {Method::ADD, "add"}, + {Method::SCALAR_ADD, "add_sc"}, {Method::ACTIVATE, "activate"}}; + + cl::CommandQueue queue; + +public: + TensorMath() { + queue = cl::CommandQueue(openCL.getContext(), openCL.getDevice()); + for (const auto &entry : kernelsNames) { + kernels[entry.first] = + cl::Kernel(openCL.getProgram(OpenCL::Program::MATRIX), entry.second); + } + } + + const cl::CommandQueue &getQueue() const { return queue; } + + void await() const { queue.finish(); } + + T activate(const T &t, Activation type = Activation::LINEAR, + float alpha = 0.0f) override { + T result(t.getShape(), false, &queue); + kernels[Method::ACTIVATE].setArg(0, *t.getBuffer()); + kernels[Method::ACTIVATE].setArg(1, *result.getBuffer()); + kernels[Method::ACTIVATE].setArg(2, static_cast(type)); + kernels[Method::ACTIVATE].setArg(3, alpha); + queue.enqueueNDRangeKernel(kernels[Method::ACTIVATE], cl::NullRange, + cl::NDRange(t.getSize())); + return result; + } + + T mult(const T &t, float x) override { + T result(t.getShape(), false, &queue); + kernels[Method::SCALAR_MULT].setArg(0, *t.getBuffer()); + kernels[Method::SCALAR_MULT].setArg(1, *result.getBuffer()); + kernels[Method::SCALAR_MULT].setArg(2, x); + queue.enqueueNDRangeKernel(kernels[Method::SCALAR_MULT], cl::NullRange, + cl::NDRange(t.getSize())); + return result; + } + + T add(const T &a, const T &b, float x = 1.0f) override { + this->validateSameDimensions(a, b); + T result(a.getShape(), false, &queue); + kernels[Method::ADD].setArg(0, *a.getBuffer()); + kernels[Method::ADD].setArg(1, *b.getBuffer()); + kernels[Method::ADD].setArg(2, *result.getBuffer()); + kernels[Method::ADD].setArg(3, x); + queue.enqueueNDRangeKernel(kernels[Method::ADD], cl::NullRange, + cl::NDRange(a.getSize())); + return result; + } + + T add(const T &t, float x) override { + T result(t.getShape(), false, &queue); + kernels[Method::SCALAR_ADD].setArg(0, *t.getBuffer()); + kernels[Method::SCALAR_ADD].setArg(1, *result.getBuffer()); + kernels[Method::SCALAR_ADD].setArg(2, x); + queue.enqueueNDRangeKernel(kernels[Method::SCALAR_ADD], cl::NullRange, + cl::NDRange(t.getSize())); + return result; + } +}; + +class Tensor0Math : public TensorMath, public ITensor0Math {}; + +class Tensor1Math : public TensorMath, public ITensor1Math {}; + +class Tensor2Math : public TensorMath, public ITensor2Math { +private: + Tensor2 mult_tiled(const Tensor2 &a, const Tensor2 &b, bool transpose = false, + float bias = 0.0f, Activation type = Activation::LINEAR, + float alpha = 0.01f) { + validateMultDimensions(a, b, transpose); + Tensor2 result(a.getRows(), transpose ? b.getRows() : b.getCols(), false, + &queue); + + const int tile_size = 16; + cl::NDRange local_size(tile_size, tile_size); + cl::NDRange global_size( + ((result.getRows() + tile_size - 1) / tile_size) * tile_size, + ((result.getCols() + tile_size - 1) / tile_size) * tile_size); + + kernels[Method::MULT].setArg(0, *a.getBuffer()); + kernels[Method::MULT].setArg(1, *b.getBuffer()); + kernels[Method::MULT].setArg(2, *result.getBuffer()); + kernels[Method::MULT].setArg(3, bias); + kernels[Method::MULT].setArg(4, static_cast(type)); + kernels[Method::MULT].setArg(5, alpha); + kernels[Method::MULT].setArg(6, result.getRows()); + kernels[Method::MULT].setArg(7, result.getCols()); + kernels[Method::MULT].setArg(8, a.getCols()); + kernels[Method::MULT].setArg(9, transpose ? 1 : 0); + queue.enqueueNDRangeKernel(kernels[Method::MULT], cl::NullRange, + global_size, local_size); + return result; + } + Tensor2 mult_small(const Tensor2 &a, const Tensor2 &b, bool transpose = false, + float bias = 0.0f, Activation type = Activation::LINEAR, + float alpha = 0.01f) { + validateMultDimensions(a, b, transpose); + Tensor2 result(a.getRows(), transpose ? b.getRows() : b.getCols(), false, + &queue); + kernels[Method::MULT_SMALL].setArg(0, *a.getBuffer()); + kernels[Method::MULT_SMALL].setArg(1, *b.getBuffer()); + kernels[Method::MULT_SMALL].setArg(2, *result.getBuffer()); + kernels[Method::MULT_SMALL].setArg(3, bias); + kernels[Method::MULT_SMALL].setArg(4, static_cast(type)); + kernels[Method::MULT_SMALL].setArg(5, alpha); + kernels[Method::MULT_SMALL].setArg(6, result.getRows()); + kernels[Method::MULT_SMALL].setArg(7, result.getCols()); + kernels[Method::MULT_SMALL].setArg(8, a.getCols()); + kernels[Method::MULT_SMALL].setArg(9, transpose ? 1 : 0); + queue.enqueueNDRangeKernel(kernels[Method::MULT_SMALL], cl::NullRange, + cl::NDRange(result.getRows(), result.getCols())); + return result; + } + +public: + Tensor2 mult(const Tensor2 &a, const Tensor2 &b, bool transpose = false, + float bias = 0.0f, Activation type = Activation::LINEAR, + float alpha = 0.01f) override { + if (a.getRows() > 64 || a.getCols() > 64 || b.getRows() > 64 || + b.getCols() > 64) + return mult_tiled(a, b, transpose, bias, type, alpha); + else + return mult_small(a, b, transpose, bias, type, alpha); + } +}; + +class Tensor3Math : public TensorMath, public ITensor3Math {}; + +typedef Tensor0Math ScalarMath; +typedef Tensor1Math VectorMath; +typedef Tensor2Math MatrixMath; + +} // namespace GPU diff --git a/src/math/tensor/gpu/tensor.cpp b/src/math/tensor/gpu/tensor.cpp new file mode 100644 index 0000000..f7d3338 --- /dev/null +++ b/src/math/tensor/gpu/tensor.cpp @@ -0,0 +1 @@ +#include "tensor.hpp" diff --git a/src/math/tensor/gpu/tensor.hpp b/src/math/tensor/gpu/tensor.hpp new file mode 100644 index 0000000..58fe563 --- /dev/null +++ b/src/math/tensor/gpu/tensor.hpp @@ -0,0 +1,282 @@ +#pragma once + +#include "../../opencl/opencl.hpp" + +#include +#include +#include +#include + +#include "../tensor.hpp" +#include "math.hpp" + +extern std::mt19937 gen; + +namespace GPU { +class Tensor; +class Tensor0; +class Tensor1; +class Tensor2; +class Tensor3; + +class Tensor : public ITensor { +protected: + cl::Buffer *buffer = nullptr; + + size_t getShapeSize(const std::vector &shape) { + size_t size = 1; + for (int dim : shape) + size *= dim; + return size; + } + void fillBuf(const std::vector &v, + const cl::CommandQueue *queue = nullptr) { + if (buffer != nullptr) + throw std::runtime_error("Tensor buffer already exists"); + buffer = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE, + v.size() * sizeof(float)); + cl::CommandQueue q = queue == nullptr ? openCL.getDefaultQueue() : *queue; + q.enqueueWriteBuffer(*buffer, CL_TRUE, 0, v.size() * sizeof(float), + v.data()); + q.finish(); + } + void createBuf(size_t size, const cl::CommandQueue *queue = nullptr) { + std::vector v(size); + std::generate(v.begin(), v.end(), + []() { return std::generate_canonical(gen); }); + fillBuf(v, queue); + } + void createBuf(size_t size, float value, + const cl::CommandQueue *queue = nullptr) { + std::vector v(size); + std::fill(v.begin(), v.end(), value); + fillBuf(v, queue); + } + +public: + Tensor(const std::vector &shape, const cl::CommandQueue *queue = nullptr) + : ITensor(shape) { + createBuf(getShapeSize(shape), queue); + } + Tensor(const std::vector &shape, float value, + const cl::CommandQueue *queue = nullptr) + : ITensor(shape) { + createBuf(getShapeSize(shape), value, queue); + } + Tensor(const std::vector &shape, bool fill, + const cl::CommandQueue *queue = nullptr) + : ITensor(shape) { + if (fill) + createBuf(getShapeSize(shape), 0.0f, queue); + } + Tensor(const Tensor &) = delete; + Tensor &operator=(const Tensor &) = delete; + Tensor(Tensor &&other) : ITensor(other.shape), buffer(other.buffer) { + other.buffer = nullptr; + }; + Tensor &operator=(Tensor &&other) = delete; + + std::vector toVector(const cl::CommandQueue *queue = nullptr) { + size_t size = getShapeSize(shape); + std::vector result(size); + cl::CommandQueue q = queue == nullptr ? openCL.getDefaultQueue() : *queue; + q.enqueueReadBuffer(*buffer, CL_TRUE, 0, size * sizeof(float), + result.data()); + q.finish(); + return result; + } + + const cl::Buffer *getBuffer() const { return buffer; } + + static Tensor0 *asScalar(Tensor *tensor) { + return tensor->getType() == Type::SCALAR + ? reinterpret_cast(tensor) + : nullptr; + } + static const Tensor0 *asScalar(const Tensor *tensor) { + return tensor->getType() == Type::SCALAR + ? reinterpret_cast(tensor) + : nullptr; + } + static Tensor1 *asVector(Tensor *tensor) { + return tensor->getType() == Type::VECTOR + ? reinterpret_cast(tensor) + : nullptr; + } + static const Tensor1 *asVector(const Tensor *tensor) { + return tensor->getType() == Type::VECTOR + ? reinterpret_cast(tensor) + : nullptr; + } + static Tensor2 *asMatrix(Tensor *tensor) { + return tensor->getType() == Type::MATRIX + ? reinterpret_cast(tensor) + : nullptr; + } + static const Tensor2 *asMatrix(const Tensor *tensor) { + return tensor->getType() == Type::MATRIX + ? reinterpret_cast(tensor) + : nullptr; + } + static Tensor3 *asTensor3(Tensor *tensor) { + return tensor->getType() == Type::TENSOR3 + ? reinterpret_cast(tensor) + : nullptr; + } + static const Tensor3 *asTensor3(const Tensor *tensor) { + return tensor->getType() == Type::TENSOR3 + ? reinterpret_cast(tensor) + : nullptr; + } +}; + +class Tensor0 : public Tensor, public ITensor0 { +public: + Tensor0(const std::vector &shape, + const cl::CommandQueue *queue = nullptr) + : Tensor(shape, queue) { + if (shape.size() != 0) + throw std::invalid_argument("Tensor0 dimension must be 0"); + } + Tensor0(const std::vector &shape, float value, + const cl::CommandQueue *queue = nullptr) + : Tensor(shape, value, queue) { + if (shape.size() != 0) + throw std::invalid_argument("Tensor0 dimension must be 0"); + } + Tensor0(const cl::CommandQueue *queue = nullptr) : Tensor({}, queue) { + createBuf(1, queue); + } + Tensor0(float value, const cl::CommandQueue *queue = nullptr) + : Tensor({}, queue) { + createBuf(1, value, queue); + } + Tensor0(const Tensor0 &) = delete; + Tensor0 &operator=(const Tensor0 &) = delete; + Tensor0(Tensor0 &&other) : Tensor(std::move(other)) {}; + Tensor0 &operator=(Tensor0 &&other) = delete; +}; + +class Tensor1 : public Tensor, public ITensor1 { +public: + Tensor1(const std::vector &shape, + const cl::CommandQueue *queue = nullptr) + : Tensor(shape, queue) { + if (shape.size() != 1) + throw std::invalid_argument("Tensor1 dimension must be 1"); + } + Tensor1(const std::vector &shape, float value, + const cl::CommandQueue *queue = nullptr) + : Tensor(shape, value, queue) { + if (shape.size() != 1) + throw std::invalid_argument("Tensor1 dimension must be 1"); + } + Tensor1(int size, const cl::CommandQueue *queue = nullptr) + : Tensor({size}, queue) {} + Tensor1(int size, float value, const cl::CommandQueue *queue = nullptr) + : Tensor({size}, value, queue) {} + Tensor1(const std::vector &values, + const cl::CommandQueue *queue = nullptr) + : Tensor({(int)values.size()}, false, queue) { + fillBuf(values, queue); + } + Tensor1(const Tensor1 &) = delete; + Tensor1 &operator=(const Tensor1 &) = delete; + Tensor1(Tensor1 &&other) : Tensor(std::move(other)) {} + Tensor1 &operator=(Tensor1 &&other) = delete; + + int getSize() const override { return shape[0]; } +}; + +class Tensor2 : public ITensor2, public Tensor { +public: + Tensor2(const std::vector &shape, + const cl::CommandQueue *queue = nullptr) + : Tensor(shape, queue) { + if (shape.size() != 2) + throw std::invalid_argument("Tensor2 dimension must be 2"); + } + Tensor2(const std::vector &shape, float value, + const cl::CommandQueue *queue = nullptr) + : Tensor(shape, value, queue) { + if (shape.size() != 2) + throw std::invalid_argument("Tensor2 dimension must be 2"); + } + Tensor2(int rows, int cols, const cl::CommandQueue *queue = nullptr) + : ITensor2(), Tensor({rows, cols}, queue) {} + Tensor2(int rows, int cols, float value, + const cl::CommandQueue *queue = nullptr) + : ITensor2(), Tensor({rows, cols}, value, queue) {} + Tensor2(int rows, int cols, const std::vector &values, + const cl::CommandQueue *queue = nullptr) + : Tensor({rows, cols}, false, queue) { + fillBuf(values, queue); + } + Tensor2(const std::vector> &values, + const cl::CommandQueue *queue = nullptr) + : Tensor({(int)values.size(), (int)values[0].size()}, false) { + std::vector v(values.size() * values[0].size()); + for (size_t i = 0; i < values.size(); ++i) { + for (size_t j = 0; j < values[i].size(); ++j) + v[i * values[0].size() + j] = values[i][j]; + } + fillBuf(v, queue); + } + + Tensor2(const Tensor2 &) = delete; + Tensor2 &operator=(const Tensor2 &) = delete; + Tensor2(Tensor2 &&other) : Tensor(std::move(other)) {} + Tensor2 &operator=(Tensor2 &&other) = delete; + + int getRows() const override { return shape[0]; } + int getCols() const override { return shape[1]; } +}; + +class Tensor3 : public Tensor, public ITensor3 { +public: + Tensor3(const std::vector &shape, + const cl::CommandQueue *queue = nullptr) + : Tensor(shape, queue) { + if (shape.size() != 3) + throw std::invalid_argument("Tensor3 dimension must be 3"); + } + Tensor3(const std::vector &shape, float value, + const cl::CommandQueue *queue = nullptr) + : Tensor(shape, value, queue) { + if (shape.size() != 3) + throw std::invalid_argument("Tensor3 dimension must be 3"); + } + Tensor3(int d1, int d2, int d3, const cl::CommandQueue *queue = nullptr) + : Tensor({d1, d2, d3}, queue) {} + Tensor3(int d1, int d2, int d3, float value, + const cl::CommandQueue *queue = nullptr) + : Tensor({d1, d2, d3}, value, queue) {} + Tensor3(int d1, int d2, int d3, const std::vector &values, + const cl::CommandQueue *queue = nullptr) + : Tensor({d1, d2, d3}, false, queue) { + fillBuf(values, queue); + } + Tensor3(const std::vector>> &values, + const cl::CommandQueue *queue = nullptr) + : Tensor({(int)values.size(), (int)values[0].size(), + (int)values[0][0].size()}, + false, queue) { + std::vector v(shape[0] * shape[1] * shape[2]); + for (int i = 0; i < shape[0]; ++i) { + for (int j = 0; j < shape[1]; ++j) + for (int k = 0; k < shape[2]; ++k) + v[i * shape[1] * shape[2] + j * shape[1] + k] = values[i][j][k]; + } + fillBuf(v, queue); + } + Tensor3(const Tensor3 &) = delete; + Tensor3 &operator=(const Tensor3 &) = delete; + Tensor3(Tensor3 &&other) : Tensor(std::move(other)) {} + Tensor3 &operator=(Tensor3 &&other) = delete; +}; + +typedef Tensor0 Scalar; +typedef Tensor1 Vector; +typedef Tensor2 Matrix; + +} // namespace GPU diff --git a/src/math/tensor/math.hpp b/src/math/tensor/math.hpp new file mode 100644 index 0000000..36794d9 --- /dev/null +++ b/src/math/tensor/math.hpp @@ -0,0 +1,58 @@ +#pragma once + +#include "tensor.hpp" + +enum class Activation { LINEAR, SIGMOID, TANH, RELU, LEAKY_RELU, ELU, GELU }; + +template +concept ITensorType = std::is_base_of_v; + +template +concept ITensor0Type = std::is_base_of_v; +template +concept ITensor1Type = std::is_base_of_v; +template +concept ITensor2Type = std::is_base_of_v; +template +concept ITensor3Type = std::is_base_of_v; + +template class ITensorMath { +protected: + void validateSameDimensions(const T &a, const T &b) const { + if (a.getDim() != b.getDim()) + throw std::invalid_argument("Tensors must have the same dimension"); + if (a.getSize() != b.getSize()) + throw std::invalid_argument("Tensors must have the same size"); + for (int i = 0; i < a.getDim(); ++i) { + if (a.getShape()[i] != b.getShape()[i]) + throw std::invalid_argument("Tensors must have the same shape"); + } + }; + +public: + virtual T activate(const T &m, Activation type, float alpha) = 0; + + virtual T mult(const T &m, float x) = 0; + virtual T add(const T &a, const T &b, float x) = 0; + virtual T add(const T &m, float x) = 0; +}; + +template class ITensor0Math {}; + +template class ITensor1Math {}; + +template class ITensor2Math { +public: + virtual T mult(const T &a, const T &b, bool transpose, float bias, + Activation type, float alpha) = 0; + + void validateMultDimensions(const T &a, const T &b, bool transpose) const { + if ((!transpose && a.getCols() != b.getRows()) || + (transpose && a.getCols() != b.getCols())) { + throw std::invalid_argument( + "Invalid matrix dimensions for multiplication"); + } + }; +}; + +template class ITensor3Math {}; \ No newline at end of file diff --git a/src/math/tensor/tensor.hpp b/src/math/tensor/tensor.hpp new file mode 100644 index 0000000..0ccc2dd --- /dev/null +++ b/src/math/tensor/tensor.hpp @@ -0,0 +1,67 @@ +#pragma once + +#include +#include + +std::random_device rd; +std::mt19937 gen(rd()); + +class ITensor { +protected: + std::vector shape; + + void validateDimensions(const std::vector &shape) const { + if (shape.empty()) + throw std::invalid_argument("Tensor shape cannot be empty"); + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] <= 0) + throw std::invalid_argument( + "All tensor dimensions must be positive, but dimension " + + std::to_string(i) + " is " + std::to_string(shape[i])); + } + }; + +public: + ITensor(const std::vector &shape) : shape(shape) {} + ITensor(const ITensor &) = default; + ITensor &operator=(const ITensor &) = default; + ITensor(ITensor &&other) = default; + ITensor &operator=(ITensor &&other) = default; + + const std::vector &getShape() const { return shape; } + int getDim() const { return static_cast(shape.size()); } + size_t getSize() const { + size_t size = 1; + for (int dim : shape) + size *= dim; + return size; + }; + + enum class Type { SCALAR, VECTOR, MATRIX, TENSOR3 }; + Type getType() const { return static_cast(shape.size()); }; +}; + +class ITensor0 {}; + +class ITensor1 { +public: + virtual int getSize() const = 0; +}; + +class ITensor2 { +public: + ITensor2() = default; + ITensor2(const ITensor2 &) = default; + ITensor2 &operator=(const ITensor2 &) = default; + ITensor2(ITensor2 &&other) = default; + ITensor2 &operator=(ITensor2 &&other) = default; + + virtual int getRows() const = 0; + virtual int getCols() const = 0; +}; + +class ITensor3 {}; + +typedef ITensor0 IScalar; +typedef ITensor1 IVector; +typedef ITensor2 IMatrix; \ No newline at end of file