Complete tensors math

This commit is contained in:
2025-11-01 10:30:32 +04:00
parent f728261354
commit f1dfe1b335
26 changed files with 1147 additions and 673 deletions

View File

@@ -2,7 +2,7 @@ CXX = g++
CXXFLAGS = -Wall -Wextra -O2 -std=c++23
LIBS = -lOpenCL
TARGET = main
COMMON_SRC = ./math/opencl/opencl.cpp ./math/matrix/cpu/matrix.cpp ./math/matrix/cpu/mutable_matrix.cpp ./math/matrix/gpu/matrix.cpp ./math/matrix/gpu/mutable_matrix.cpp
COMMON_SRC = ./math/opencl/opencl.cpp
MAIN_SRC = main.cpp $(COMMON_SRC)
BENCHMARK_SRC = benchmark.cpp $(COMMON_SRC)

View File

@@ -6,8 +6,7 @@
#include "./math/math.hpp"
typedef Matrices::CPU Matrix;
typedef MutableMatrices::CPU MutableMatrix;
using namespace GPU;
OpenCL openCL;
@@ -31,40 +30,37 @@ std::vector<float> generateIdentityMatrix(int size) {
}
int main() {
const int SIZE = 1024;
const int SIZE = 48;
std::cout << "Testing with " << SIZE << "x" << SIZE << " matrices..."
<< std::endl;
std::vector<float> matrixA = generateRandomMatrix(SIZE, SIZE);
std::vector<float> matrixB = generateRandomMatrix(SIZE, SIZE);
std::vector<float> matrixC = generateRandomMatrix(SIZE, SIZE);
// std::vector<float> matrixA = generateRandomMatrix(SIZE, SIZE);
// std::vector<float> matrixB = generateRandomMatrix(SIZE, SIZE);
// std::vector<float> matrixC = generateRandomMatrix(SIZE, SIZE);
// std::vector<float> matrixA = generateIdentityMatrix(SIZE);
// std::vector<float> matrixB = generateIdentityMatrix(SIZE);
// std::vector<float> matrixC = generateIdentityMatrix(SIZE);
std::vector<float> matrixA = generateIdentityMatrix(SIZE);
std::vector<float> matrixB = generateIdentityMatrix(SIZE);
std::vector<float> matrixC = generateIdentityMatrix(SIZE);
// Тестирование на CPU
// Тестирование на GPU
{
std::cout << "\n=== CPU Version ===" << std::endl;
std::cout << "\n=== GPU Version ===" << std::endl;
auto start = std::chrono::high_resolution_clock::now();
MutableMatrices::CPU a(SIZE, SIZE, matrixA);
Matrices::CPU b(SIZE, SIZE, matrixB);
Matrices::CPU c(SIZE, SIZE, matrixC);
MatrixMath mm;
Matrix a(SIZE, SIZE, matrixA);
Matrix b(SIZE, SIZE, matrixB);
auto gen_end = std::chrono::high_resolution_clock::now();
auto op_start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 10; i++) {
a.mult(b, 0.2f, MutableMatrices::CPU::Activate::SIGMOID);
for (int i = 0; i < 100; ++i) {
Matrix x = mm.mult(a, b);
}
auto op_end = std::chrono::high_resolution_clock::now();
std::vector<float> v = a.toVector();
std::vector<float> v = a.toVector(&mm.getQueue());
auto total_end = std::chrono::high_resolution_clock::now();
@@ -88,24 +84,22 @@ int main() {
std::cout << std::endl;
}
// Тестирование на GPU
// Тестирование на CPU
{
std::cout << "\n=== GPU Version ===" << std::endl;
std::cout << "\n=== CPU Version ===" << std::endl;
auto start = std::chrono::high_resolution_clock::now();
MutableMatrices::GPU a(SIZE, SIZE, matrixA);
Matrices::GPU b(SIZE, SIZE, matrixB);
Matrices::GPU c(SIZE, SIZE, matrixC);
CPU::MatrixMath mm;
CPU::Matrix a(SIZE, SIZE, matrixA);
CPU::Matrix b(SIZE, SIZE, matrixB);
auto gen_end = std::chrono::high_resolution_clock::now();
auto op_start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 10; i++) {
a.mult(b, 0.2f, MutableMatrices::GPU::Activate::SIGMOID, 0.0f);
for (int i = 0; i < 100; ++i) {
CPU::Matrix x = mm.mult(a, b);
}
auto op_end = std::chrono::high_resolution_clock::now();
std::vector<float> v = a.toVector();

View File

@@ -1,124 +1,147 @@
float activate_x(float x, const int activation_type, const float alpha) {
switch(activation_type) {
case 0: // LINEAR
return x;
case 1: // SIGMOID
return 1.0f / (1.0f + exp(-x));
case 2: // TANH
return tanh(x);
case 3: // RELU
return fmax(0.0f, x);
case 4: // LEAKY_RELU
return (x > 0.0f) ? x : alpha * x;
case 5: // ELU
return (x > 0.0f) ? x : alpha * (exp(x) - 1.0f);
case 6: // GELU
return 0.5f * x * (1.0f + tanh(sqrt(2.0f / M_PI_F) * (x + 0.044715f * x * x * x)));
default:
return x;
}
switch (activation_type) {
case 0: // LINEAR
return x;
case 1: // SIGMOID
return 1.0f / (1.0f + exp(-x));
case 2: // TANH
return tanh(x);
case 3: // RELU
return fmax(0.0f, x);
case 4: // LEAKY_RELU
return (x > 0.0f) ? x : alpha * x;
case 5: // ELU
return (x > 0.0f) ? x : alpha * (exp(x) - 1.0f);
case 6: // GELU
return 0.5f * x *
(1.0f + tanh(sqrt(2.0f / M_PI_F) * (x + 0.044715f * x * x * x)));
default:
return x;
}
}
__kernel void activate(
__global float* input,
__global float* output,
const int activation_type,
const float alpha,
const int rows,
const int cols)
{
int row = get_global_id(0);
int col = get_global_id(1);
if (row < rows && col < cols) {
int idx = row * cols + col;
output[idx] = activate_x(input[idx], activation_type, alpha);
}
// Element-wise activation: output[i] = f(input[i]).
// One work-item per element on a flat 1D launch; the caller must enqueue
// exactly element-count work-items — there is no bounds check here.
__kernel void activate(__global float *input, __global float *output,
                       const int activation_type, const float alpha) {
  int i = get_global_id(0);
  output[i] = activate_x(input[i], activation_type, alpha);
}
__kernel void mult(
__global float* A,
__global float* B,
__global float* C,
const float bias,
const int activation_type,
const float alpha,
const int M,
const int N,
const int K)
{
const int tile_size = 16;
int local_i = get_local_id(0);
int local_j = get_local_id(1);
int local_size_i = get_local_size(0);
int local_size_j = get_local_size(1);
int global_i = get_group_id(0) * local_size_i + local_i;
int global_j = get_group_id(1) * local_size_j + local_j;
__local float tile_A[16][16];
__local float tile_B[16][16];
__kernel void mult_small(__global float *A, __global float *B,
__global float *C, const float bias,
const int activation_type, const float alpha,
const int M, const int N, const int K,
const int transpose_B) {
const int row = get_global_id(0);
const int col = get_global_id(1);
if (row < M && col < N) {
float sum = 0.0f;
for (int k = 0; k < K; k++) {
float a_val = A[row * K + k];
int num_tiles = (K + tile_size - 1) / tile_size;
float b_val;
if (transpose_B) {
b_val = B[col * K + k];
} else {
b_val = B[k * N + col];
}
for (int tile = 0; tile < num_tiles; tile++) {
int tile_offset = tile * tile_size;
int load_i_A = tile_offset + local_i;
int load_j_A = tile_offset + local_j;
if (global_i < M && load_j_A < K) {
tile_A[local_j][local_i] = A[global_i * K + load_j_A];
} else {
tile_A[local_j][local_i] = 0.0f;
}
int load_i_B = tile_offset + local_i;
int load_j_B = tile_offset + local_j;
if (load_i_B < K && global_j < N) {
tile_B[local_j][local_i] = B[load_i_B * N + global_j];
} else {
tile_B[local_j][local_i] = 0.0f;
}
barrier(CLK_LOCAL_MEM_FENCE);
#pragma unroll
for (int k = 0; k < tile_size; k++) {
sum += tile_A[k][local_i] * tile_B[local_j][k];
}
barrier(CLK_LOCAL_MEM_FENCE);
sum += a_val * b_val;
}
if (global_i < M && global_j < N) {
float result = sum + bias;
if (activation_type != 0) {
result = activate_x(result, activation_type, alpha);
}
C[global_i * N + global_j] = result;
float result = sum + bias;
if (activation_type != 0) {
result = activate_x(result, activation_type, alpha);
}
C[row * N + col] = result;
}
}
__kernel void mult_sc(__global float* A, __global float* B, float scalar, int M, int N) {
int i = get_global_id(0);
int j = get_global_id(1);
B[i * N + j] = A[i * N + j] * scalar;
// C = activate(A x B + bias), tiled through 16x16 local memory.
//   A: M x K, row-major
//   B: K x N row-major, or N x K row-major when transpose_B != 0
//   C: M x N, row-major
// Global size must be M and N rounded up to multiples of 16; out-of-range
// work-items still take part in the tile loads and barriers (loading zeros)
// but never write to C.
__kernel void mult(__global float *A, __global float *B, __global float *C,
                   const float bias, const int activation_type,
                   const float alpha, const int M, const int N, const int K,
                   const int transpose_B) {
  const int tile_size = 16;
  int local_i = get_local_id(0);
  int local_j = get_local_id(1);
  int local_size_i = get_local_size(0);
  int local_size_j = get_local_size(1);
  int global_i = get_group_id(0) * local_size_i + local_i;
  int global_j = get_group_id(1) * local_size_j + local_j;
  __local float tile_A[16][16];
  __local float tile_B[16][16];
  float sum = 0.0f;
  int num_tiles = (K + tile_size - 1) / tile_size;
  for (int tile = 0; tile < num_tiles; tile++) {
    int tile_offset = tile * tile_size;
    // Load tile of A: the k index runs along local_j.
    int load_j_A = tile_offset + local_j;
    if (global_i < M && load_j_A < K) {
      tile_A[local_j][local_i] = A[global_i * K + load_j_A];
    } else {
      tile_A[local_j][local_i] = 0.0f;
    }
    // Load tile of B: the k index runs along local_i.
    int load_i_B = tile_offset + local_i;
    if (transpose_B) {
      // B stored N x K: element (k, col) lives at B[col * K + k]
      // (same addressing as the mult_small kernel).
      // BUG FIX: the previous bounds (load_i_B < N && global_j < K) and
      // stride B[global_j * N + load_i_B] swapped K and N, reading wrong /
      // out-of-range elements whenever K != N.
      if (load_i_B < K && global_j < N) {
        tile_B[local_j][local_i] = B[global_j * K + load_i_B];
      } else {
        tile_B[local_j][local_i] = 0.0f;
      }
    } else {
      // B stored K x N: element (k, col) lives at B[k * N + col].
      if (load_i_B < K && global_j < N) {
        tile_B[local_j][local_i] = B[load_i_B * N + global_j];
      } else {
        tile_B[local_j][local_i] = 0.0f;
      }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
#pragma unroll
    for (int k = 0; k < tile_size; ++k) {
      sum += tile_A[k][local_i] * tile_B[local_j][k];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  if (global_i < M && global_j < N) {
    float result = sum + bias;
    // activation_type 0 (LINEAR) is the identity — skip the call.
    if (activation_type != 0) {
      result = activate_x(result, activation_type, alpha);
    }
    C[global_i * N + global_j] = result;
  }
}
__kernel void add(__global float* A, __global float* B, __global float* C, float a, float b, int M, int N) {
int i = get_global_id(0);
int j = get_global_id(1);
C[i * N + j] = (A[i * N + j] * a) + (B[i * N + j] * b);
// Scalar multiply: B[i] = A[i] * scalar.
// Flat 1D launch, one work-item per element; global size must equal the
// element count (no bounds check).
__kernel void mult_sc(__global float *A, __global float *B, float scalar) {
  int i = get_global_id(0);
  B[i] = A[i] * scalar;
}
__kernel void add_sc(__global float* A, __global float* B, float scalar, int M, int N) {
int i = get_global_id(0);
int j = get_global_id(1);
B[i * N + j] = A[i * N + j] + scalar;
// Scaled element-wise add: C[i] = A[i] + B[i] * x.
// Flat 1D launch, one work-item per element; global size must equal the
// element count (no bounds check).
__kernel void add(__global float *A, __global float *B, __global float *C,
                  float x) {
  int i = get_global_id(0);
  C[i] = A[i] + (B[i] * x);
}
// Scalar add: B[i] = A[i] + scalar.
// Flat 1D launch, one work-item per element; global size must equal the
// element count (no bounds check).
__kernel void add_sc(__global float *A, __global float *B, float scalar) {
  int i = get_global_id(0);
  B[i] = A[i] + scalar;
}

View File

@@ -3,73 +3,29 @@
#include <chrono>
#include <thread>
typedef Matrices::GPU M;
typedef MutableMatrices::GPU MM;
// Describes one fully-connected layer: its output width plus the activation
// applied after the matrix product.
class Layer {
protected:
  int features;          // number of output neurons
  float bias;            // added to each product before activation
  MM::Activate activate; // activation applied element-wise
  float alpha;           // LEAKY_RELU / ELU parameter

public:
  Layer(int features, MM::Activate activate = MM::Activate::LINEAR,
        float bias = 0.0f, float alpha = 0.0f)
      : features(features), activate(activate), bias(bias), alpha(alpha) {}
  int getFeatures() const { return features; }
  float getBias() const { return bias; }
  MM::Activate getActivate() const { return activate; }
  float getAlpha() const { return alpha; }
};
class NeuralNetwork {
private:
std::vector<Layer> layers;
std::vector<MM> weights;
public:
NeuralNetwork(int n, std::initializer_list<Layer> l) : layers(l) {
weights.emplace_back(n, layers[0].getFeatures());
for (int i = 0; i < layers.size() - 1; i++)
weights.emplace_back(layers[i].getFeatures(),
layers[i + 1].getFeatures());
}
std::vector<float> predict(std::vector<float> i) {
if (i.size() != weights[0].getRows())
std::invalid_argument("Invalid input size");
MM input(1, (int)i.size(), i);
for (size_t i = 0; i < weights.size(); i++)
input.mult(weights[i], layers[i + 1].getBias(),
layers[i + 1].getActivate(), layers[i + 1].getAlpha());
return input.toVector();
}
};
using namespace GPU;
OpenCL openCL;
int main() {
NeuralNetwork nn(
2, {Layer(3, MM::Activate::RELU), Layer(1, MM::Activate::RELU)});
MatrixMath mm;
for (int i = 0; i < 10; i++) {
int v1 = (i / 2) % 2;
int v2 = i % 2;
Matrix a(2, 2);
Matrix b(2, 2);
std::vector<float> v = {static_cast<float>(v1), static_cast<float>(v2)};
CPU::Matrix a_(2, 2, a.toVector());
CPU::Matrix b_(2, 2, b.toVector());
std::vector<float> r = nn.predict(v);
float expected = static_cast<float>(v1 ^ v2);
a_.print();
b_.print();
std::cout << "XOR(" << v1 << ", " << v2 << ") = " << expected;
std::cout << " | Network: ";
for (size_t j = 0; j < r.size(); ++j) {
std::cout << r[j] << " ";
}
std::cout << std::endl;
}
Matrix c = mm.add(a, b);
CPU::Matrix c_(2, 2, c.toVector(&mm.getQueue()));
mm.await();
c_.print();
return 0;
}

View File

@@ -2,8 +2,6 @@
#include "opencl/opencl.hpp"
#include "matrix/cpu/matrix.hpp"
#include "matrix/cpu/mutable_matrix.hpp"
#include "tensor/cpu/math.hpp"
#include "matrix/gpu/matrix.hpp"
#include "matrix/gpu/mutable_matrix.hpp"
#include "tensor/gpu/math.hpp"

View File

@@ -1,24 +0,0 @@
#include "matrix.hpp"

namespace {
// Validates dimensions BEFORE any allocation happens.
// BUG FIX: validation previously ran in the constructor body, i.e. after the
// member-initializer list had already sized the data vector from
// `rows * cols` — a non-positive dimension became a huge size_t and raised
// bad_alloc/length_error instead of the intended std::invalid_argument.
size_t checkedSize(int rows, int cols) {
  if (rows <= 0 || cols <= 0) {
    throw std::invalid_argument("Matrix dimensions must be positive");
  }
  return static_cast<size_t>(rows) * static_cast<size_t>(cols);
}
} // namespace

// Fill constructor: rows x cols matrix with every element set to `value`.
Matrices::CPU::CPU(int rows, int cols, float value)
    : IMatrix(rows, cols), data(checkedSize(rows, cols), value) {}

// Copy-in constructor: takes row-major data; its size must equal rows * cols.
Matrices::CPU::CPU(int rows, int cols, const std::vector<float> &matrix)
    : IMatrix(rows, cols), data(matrix) {
  if (matrix.size() != checkedSize(rows, cols)) {
    throw std::invalid_argument("Data size doesn't match matrix dimensions");
  }
}

// Mutable element access (row-major), bounds-checked.
float &Matrices::CPU::operator()(int row, int col) {
  checkIndices(row, col);
  return data[row * cols + col];
}

// Const element access (row-major), bounds-checked.
const float &Matrices::CPU::operator()(int row, int col) const {
  checkIndices(row, col);
  return data[row * cols + col];
}

View File

@@ -1,38 +0,0 @@
#pragma once
#include <algorithm>
#include <memory>
#include <stdexcept>
#include <vector>
#include "../matrix.hpp"
namespace Matrices {
// Host-side float matrix stored row-major in a contiguous std::vector.
class CPU : public IMatrix {
protected:
  std::vector<float> data; // rows * cols elements, row-major

public:
  // Fill constructor: all elements set to `value`.
  CPU(int rows, int cols, float value = 0.0f);
  // Copy-in constructor; matrix.size() must equal rows * cols.
  CPU(int rows, int cols, const std::vector<float> &matrix);
  CPU(const CPU &) = default;
  CPU &operator=(const CPU &) = default;
  CPU(CPU &&) = default;
  CPU &operator=(CPU &&) = default;
  ~CPU() override = default;
  // Bounds-checked element access (throws std::out_of_range).
  float &operator()(int row, int col);
  const float &operator()(int row, int col) const;
  // Returns a copy of the underlying row-major data.
  // NOTE(review): the const-value return inhibits move at call sites; kept
  // to match the IMatrix::toVector signature.
  const std::vector<float> toVector() const { return data; }
  int getRows() const override { return rows; }
  int getCols() const override { return cols; }
  size_t getSize() const { return data.size(); }
  // GPU toGPU(OpenCL &openCL) const { return GPU(rows, cols, data); }
};
} // namespace Matrices

View File

@@ -1,76 +0,0 @@
#include "mutable_matrix.hpp"
// Scalar activation dispatcher shared by mult()/activate().
// `alpha` is only consulted by LEAKY_RELU (negative slope) and ELU
// (saturation factor); other activations ignore it.
float MutableMatrices::CPU::activate_x(float x, Activate type, float alpha) {
  switch (type) {
  case Activate::LINEAR:
    return x;
  case Activate::SIGMOID:
    return 1.0f / (1.0f + std::exp(-x));
  case Activate::TANH:
    return std::tanh(x);
  case Activate::RELU:
    return std::max(0.0f, x);
  case Activate::LEAKY_RELU:
    return (x > 0.0f) ? x : alpha * x;
  case Activate::ELU:
    return (x > 0.0f) ? x : alpha * (std::exp(x) - 1.0f);
  case Activate::GELU:
    // tanh approximation of GELU
    return 0.5f * x *
           (1.0f +
            std::tanh(std::sqrt(2.0f / M_PI) * (x + 0.044715f * x * x * x)));
  default:
    throw std::invalid_argument("Unknown activation type");
  }
}

// In-place product: this = activate(this * m + bias).
// Reshapes this matrix to rows x m.cols afterwards.
void MutableMatrices::CPU::mult(Matrices::CPU &m, float bias, Activate type,
                                float alpha) {
  validateMultDimensions(*this, m);
  std::vector<float> result(rows * m.getCols(), 0.0f);
  for (int i = 0; i < rows; i++) {
    for (int j = 0; j < m.getCols(); j++) {
      float sum = 0.0f;
      for (int k = 0; k < cols; k++) {
        sum += (*this)(i, k) * m(k, j);
      }
      result[i * m.getCols() + j] = activate_x(sum + bias, type, alpha);
    }
  }
  data = std::move(result);
  cols = m.getCols(); // shape changed: this is now rows x m.cols
}

// In-place scalar multiply: this *= scalar (element-wise).
void MutableMatrices::CPU::mult(float scalar) {
  for (int i = 0; i < rows; i++) {
    for (int j = 0; j < cols; j++) {
      data[i * cols + j] *= scalar;
    }
  }
}

// In-place weighted sum: this = this * a + m * b. Shapes must match.
void MutableMatrices::CPU::add(Matrices::CPU &m, float a, float b) {
  validateSameDimensions(*this, m);
  std::vector<float> result(rows * cols, 0.0f);
  for (int i = 0; i < rows; i++) {
    for (int j = 0; j < cols; j++) {
      result[i * cols + j] = ((*this)(i, j) * a) + (m(i, j) * b);
    }
  }
  data = std::move(result);
}

// In-place scalar add: this += scalar (element-wise).
void MutableMatrices::CPU::add(float scalar) {
  for (int i = 0; i < rows; i++) {
    for (int j = 0; j < cols; j++) {
      data[i * cols + j] += scalar;
    }
  }
}

// In-place element-wise activation.
void MutableMatrices::CPU::activate(Activate type, float alpha) {
  for (int i = 0; i < rows; i++) {
    for (int j = 0; j < cols; j++) {
      data[i * cols + j] = activate_x(data[i * cols + j], type, alpha);
    }
  }
}

View File

@@ -1,28 +0,0 @@
#pragma once
#include "matrix.hpp"
#include "../mutable_matrix.hpp"
#include <cmath>
// M_PI is a POSIX extension, not standard C++, so some <cmath> headers
// already define it. BUG FIX: guard the fallback so platforms that do
// provide it don't hit a macro-redefinition warning/error (the original
// unconditional #define did).
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
namespace MutableMatrices {
// Host-side mutable matrix: Matrices::CPU storage plus in-place math ops.
class CPU : public Matrices::CPU, public IMutableMatrix<Matrices::CPU> {
private:
  // Scalar activation dispatcher; `alpha` is the LEAKY_RELU/ELU parameter.
  static float activate_x(float x, Activate type, float alpha = 0.01f);

public:
  CPU(int rows, int cols, const std::vector<float> &matrix)
      : Matrices::CPU(rows, cols, matrix) {}
  // this = activate(this * m + bias); reshapes this to rows x m.cols.
  void mult(Matrices::CPU &m, float bias = 0.0f,
            Activate type = Activate::LINEAR, float alpha = 0.01f);
  // this *= scalar
  void mult(float scalar);
  // this = this * a + m * b
  void add(Matrices::CPU &m, float a = 1.0f, float b = 1.0f);
  // this += scalar
  void add(float scalar);
  // element-wise activation in place
  void activate(Activate type, float alpha = 0.01f);
};
}; // namespace MutableMatrices

View File

@@ -1,41 +0,0 @@
#include <random>
#include "matrix.hpp"
// Shared RNG used by the random-fill constructor.
std::random_device rd;
std::mt19937 gen(rd());

// Random-fill constructor: uploads rows*cols uniform [0,1) floats into a new
// device buffer. The blocking write (CL_TRUE) means the host vector may be
// destroyed as soon as the constructor returns.
// NOTE(review): `buffer` is a raw owning pointer — if enqueueWriteBuffer
// throws, it leaks; consider a smart pointer.
Matrices::GPU::GPU(int rows, int cols)
    : IMatrix(rows, cols), queue(openCL.getContext(), openCL.getDevice()) {
  validateDimensions(rows, cols);
  std::vector<float> matrix;
  matrix.reserve(rows * cols);
  for (size_t i = 0; i < (size_t)rows * (size_t)cols; ++i)
    matrix.push_back(std::generate_canonical<float, 32>(gen));
  buffer = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
                          rows * cols * sizeof(float));
  queue.enqueueWriteBuffer(*buffer, CL_TRUE, 0, rows * cols * sizeof(float),
                           matrix.data());
  queue.finish();
}

// Copy-in constructor: uploads caller-provided row-major data.
Matrices::GPU::GPU(int rows, int cols, const std::vector<float> &matrix)
    : IMatrix(rows, cols), queue(openCL.getContext(), openCL.getDevice()) {
  validateDimensions(rows, cols);
  if (matrix.size() != static_cast<size_t>(rows * cols)) {
    throw std::invalid_argument("Matrix data size doesn't match dimensions");
  }
  buffer = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
                          rows * cols * sizeof(float));
  queue.enqueueWriteBuffer(*buffer, CL_TRUE, 0, rows * cols * sizeof(float),
                           matrix.data());
  queue.finish();
}

// Blocking device -> host read of the whole buffer (row-major copy).
const std::vector<float> Matrices::GPU::toVector() const {
  std::vector<float> result(rows * cols);
  queue.enqueueReadBuffer(*buffer, CL_TRUE, 0, rows * cols * sizeof(float),
                          result.data());
  queue.finish();
  return result;
}

View File

@@ -1,40 +0,0 @@
#pragma once
#include "../../opencl/opencl.hpp"
#include "../matrix.hpp"
namespace Matrices {
// Float matrix whose storage lives in an OpenCL device buffer.
// Move-only: copying would require a device-side buffer copy.
class GPU : public IMatrix {
protected:
  cl::Buffer *buffer;     // owned device buffer; nullptr after being moved from
  cl::CommandQueue queue; // per-matrix in-order command queue

public:
  // Fills the matrix with random values (see matrix.cpp).
  GPU(int rows, int cols);
  // Uploads caller-provided row-major data.
  GPU(int rows, int cols, const std::vector<float> &matrix);
  ~GPU() { delete buffer; } // deleting nullptr after a move is a no-op
  GPU(const GPU &) = delete;
  GPU &operator=(const GPU &) = delete;
  GPU(GPU &&other)
      : IMatrix(other.rows, other.cols), buffer(other.buffer),
        queue(std::move(other.queue)) {
    other.buffer = nullptr;
    other.rows = 0;
    other.cols = 0;
  }
  // BUG FIX: this was `= default`, which "moves" the raw pointer member by
  // copying it — leaking our previously owned buffer and leaving both
  // objects pointing at the same cl::Buffer (double delete in ~GPU()).
  // Transfer ownership explicitly instead.
  GPU &operator=(GPU &&other) {
    if (this != &other) {
      delete buffer;
      buffer = other.buffer;
      queue = std::move(other.queue);
      rows = other.rows;
      cols = other.cols;
      other.buffer = nullptr;
      other.rows = 0;
      other.cols = 0;
    }
    return *this;
  }
  int getRows() const override { return rows; }
  int getCols() const override { return cols; }
  // Element count; widened before the multiply to avoid int overflow.
  size_t getSize() const { return static_cast<size_t>(rows) * cols; }
  const cl::Buffer *getBuffer() const { return buffer; }
  // Blocking read of the device buffer into host memory.
  const std::vector<float> toVector() const;
  // CPU toCPU() const { return CPU(rows, cols, toVector()); };
};
} // namespace Matrices

View File

@@ -1,120 +0,0 @@
#include "mutable_matrix.hpp"

// Constructors build one cl::Kernel per supported operation from the shared
// MATRIX program so later calls only pay for setArg + enqueue.
MutableMatrices::GPU::GPU(int rows, int cols) : Matrices::GPU(rows, cols) {
  for (const auto &entry : kernelsNames) {
    kernels[entry.first] =
        cl::Kernel(openCL.getProgram(OpenCL::Program::MATRIX), entry.second);
  }
}
MutableMatrices::GPU::GPU(int rows, int cols, const std::vector<float> &matrix)
    : Matrices::GPU(rows, cols, matrix) {
  for (const auto &entry : kernelsNames) {
    kernels[entry.first] =
        cl::Kernel(openCL.getProgram(OpenCL::Program::MATRIX), entry.second);
  }
}

// In-place product: this = activate(this * m + bias); reshapes to
// rows x m.cols. A fresh output buffer is swapped in immediately after the
// enqueue; the old input buffer is freed asynchronously by the event
// callback once the kernel completes (the kernel still reads it until then).
// Kernel args 6/7/8 are the dimensions M = rows, N = m.cols, K = cols.
void MutableMatrices::GPU::mult(Matrices::GPU &m, float bias, Activate type,
                                float alpha) {
  validateMultDimensions(*this, m);
  cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
                                 rows * m.getCols() * sizeof(float));
  // Global size is M and N rounded up to the 16x16 work-group tile.
  const int tile_size = 16;
  cl::NDRange local_size(tile_size, tile_size);
  cl::NDRange global_size(((rows + tile_size - 1) / tile_size) * tile_size,
                          ((m.getCols() + tile_size - 1) / tile_size) *
                              tile_size);
  kernels[Method::MULT].setArg(0, *buffer);
  kernels[Method::MULT].setArg(1, *m.getBuffer());
  kernels[Method::MULT].setArg(2, *b);
  kernels[Method::MULT].setArg(3, bias);
  kernels[Method::MULT].setArg(4, static_cast<int>(type));
  kernels[Method::MULT].setArg(5, alpha);
  kernels[Method::MULT].setArg(6, rows);
  kernels[Method::MULT].setArg(7, m.getCols());
  kernels[Method::MULT].setArg(8, cols);
  cl::Event event;
  queue.enqueueNDRangeKernel(kernels[Method::MULT], cl::NullRange, global_size,
                             local_size, nullptr, &event);
  // Old buffer is released only after the kernel that reads it finishes.
  event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
  buffer = b;
  cols = m.getCols();
}

// In-place scalar multiply: this *= scalar (one work-item per element).
void MutableMatrices::GPU::mult(float scalar) {
  cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
                                 rows * cols * sizeof(float));
  kernels[Method::SCALAR_MULT].setArg(0, *buffer);
  kernels[Method::SCALAR_MULT].setArg(1, *b);
  kernels[Method::SCALAR_MULT].setArg(2, scalar);
  kernels[Method::SCALAR_MULT].setArg(3, rows);
  kernels[Method::SCALAR_MULT].setArg(4, cols);
  cl::Event event;
  queue.enqueueNDRangeKernel(kernels[Method::SCALAR_MULT], cl::NullRange,
                             cl::NDRange(rows, cols), cl::NullRange, nullptr,
                             &event);
  event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
  buffer = b;
}

// In-place weighted sum: this = this * a + m * b. Shapes must match.
void MutableMatrices::GPU::add(Matrices::GPU &m, float a, float b) {
  validateSameDimensions(*this, m);
  cl::Buffer *buf = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
                                   rows * cols * sizeof(float));
  kernels[Method::ADD].setArg(0, *buffer);
  kernels[Method::ADD].setArg(1, *m.getBuffer());
  kernels[Method::ADD].setArg(2, *buf);
  kernels[Method::ADD].setArg(3, a);
  kernels[Method::ADD].setArg(4, b);
  kernels[Method::ADD].setArg(5, rows);
  kernels[Method::ADD].setArg(6, cols);
  cl::Event event;
  queue.enqueueNDRangeKernel(kernels[Method::ADD], cl::NullRange,
                             cl::NDRange(rows, cols), cl::NullRange, nullptr,
                             &event);
  event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
  buffer = buf;
}

// In-place scalar add: this += scalar (element-wise).
void MutableMatrices::GPU::add(float scalar) {
  cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
                                 rows * cols * sizeof(float));
  kernels[Method::SCALAR_ADD].setArg(0, *buffer);
  kernels[Method::SCALAR_ADD].setArg(1, *b);
  kernels[Method::SCALAR_ADD].setArg(2, scalar);
  kernels[Method::SCALAR_ADD].setArg(3, rows);
  kernels[Method::SCALAR_ADD].setArg(4, cols);
  cl::Event event;
  queue.enqueueNDRangeKernel(kernels[Method::SCALAR_ADD], cl::NullRange,
                             cl::NDRange(rows, cols), cl::NullRange, nullptr,
                             &event);
  event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
  buffer = b;
}

// In-place element-wise activation.
void MutableMatrices::GPU::activate(Activate type, float alpha) {
  cl::Buffer *b = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
                                 rows * cols * sizeof(float));
  kernels[Method::ACTIVATE].setArg(0, *buffer);
  kernels[Method::ACTIVATE].setArg(1, *b);
  kernels[Method::ACTIVATE].setArg(2, static_cast<int>(type));
  kernels[Method::ACTIVATE].setArg(3, alpha);
  kernels[Method::ACTIVATE].setArg(4, rows);
  kernels[Method::ACTIVATE].setArg(5, cols);
  cl::Event event;
  queue.enqueueNDRangeKernel(kernels[Method::ACTIVATE], cl::NullRange,
                             cl::NDRange(rows, cols), cl::NullRange, nullptr,
                             &event);
  event.setCallback(CL_COMPLETE, releaseBuffer, buffer);
  buffer = b;
}

View File

@@ -1,46 +0,0 @@
#pragma once
#include "../../opencl/opencl.hpp"
#include "matrix.hpp"
#include "../mutable_matrix.hpp"
namespace MutableMatrices {
// Device-side mutable matrix: each in-place op launches an OpenCL kernel and
// swaps in a freshly allocated result buffer.
class GPU : public Matrices::GPU, public IMutableMatrix<Matrices::GPU> {
private:
  // One cached cl::Kernel per operation, created in the constructors.
  enum class Method { MULT, SCALAR_MULT, ADD, SCALAR_ADD, ACTIVATE };
  std::unordered_map<Method, cl::Kernel> kernels;
  // Kernel entry-point names inside the MATRIX program.
  std::unordered_map<Method, std::string> kernelsNames = {
      {Method::MULT, "mult"},
      {Method::SCALAR_MULT, "mult_sc"},
      {Method::ADD, "add"},
      {Method::SCALAR_ADD, "add_sc"},
      {Method::ACTIVATE, "activate"}};
  // Event callback: frees the input buffer once the kernel that read from it
  // completes. NOTE(review): if the event finishes with an error status the
  // buffer is never freed — leak on failure paths.
  static void CL_CALLBACK releaseBuffer(cl_event, cl_int status, void *buf) {
    if (status == CL_COMPLETE) {
      // std::cout << "Kernel complete!" << std::endl;
      delete (cl::Buffer *)buf;
    }
  }

public:
  GPU(int rows, int cols);                                   // random fill
  GPU(int rows, int cols, const std::vector<float> &matrix); // copy-in
  GPU(const GPU &) = delete;
  GPU &operator=(const GPU &) = delete;
  GPU(GPU &&other) = default;
  // NOTE(review): relies on the base move-assignment transferring the raw
  // buffer pointer correctly — verify Matrices::GPU's operator=(GPU&&).
  GPU &operator=(GPU &&other) = default;
  // this = activate(this * m + bias); reshapes this to rows x m.cols.
  void mult(Matrices::GPU &m, float bias = 0.0f,
            Activate type = Activate::LINEAR, float alpha = 0.01f);
  // this *= scalar
  void mult(float scalar);
  // this = this * a + m * b
  void add(Matrices::GPU &m, float a = 1.0f, float b = 1.0f);
  // this += scalar
  void add(float scalar);
  // element-wise activation in place
  void activate(Activate type, float alpha = 0.01f);
};
}; // namespace MutableMatrices

View File

@@ -1,29 +0,0 @@
#pragma once
#include <random>
#include <stdexcept>
#include <vector>
// Abstract base for all matrix implementations (CPU- and GPU-backed).
class IMatrix {
protected:
  int rows;
  int cols;
  // Throws std::invalid_argument if either dimension is non-positive.
  void validateDimensions(int rows, int cols) const {
    if (rows <= 0 || cols <= 0) {
      throw std::invalid_argument("Matrix dimensions must be positive");
    }
  };
  // Throws std::out_of_range if (row, col) lies outside the matrix.
  void checkIndices(int row, int col) const {
    if (row < 0 || row >= rows || col < 0 || col >= cols) {
      throw std::out_of_range("Matrix indices out of range");
    }
  };

public:
  IMatrix(int rows, int cols) : rows(rows), cols(cols) {}
  virtual ~IMatrix() = default; // polymorphic base: virtual dtor required
  virtual int getRows() const = 0;
  virtual int getCols() const = 0;
  // Copies the matrix contents into host memory (row-major).
  virtual const std::vector<float> toVector() const = 0;
};

View File

@@ -1,29 +0,0 @@
#pragma once
#include "matrix.hpp"
// Mixin interface for matrices supporting in-place math; T is the immutable
// matrix type the operations consume.
// NOTE(review): no virtual destructor — never delete through this interface.
template <typename T> class IMutableMatrix {
  static_assert(std::is_base_of<IMatrix, T>::value,
                "T must be derived from IMatrix");

public:
  // Activation kinds; the enumerator order must stay in sync with the
  // integer activation_type codes used by the OpenCL kernels.
  enum class Activate { LINEAR, SIGMOID, TANH, RELU, LEAKY_RELU, ELU, GELU };
  // this = activate(this * m + bias)
  virtual void mult(T &m, float bias, Activate type, float alpha) = 0;
  // this *= s
  virtual void mult(float s) = 0;
  // this = this * a + m * b
  virtual void add(T &m, float a, float b) = 0;
  // this += a
  virtual void add(float a) = 0;
  // element-wise activation in place
  virtual void activate(Activate type, float alpha = 0.01f) = 0;
  // For a * b, a.cols must equal b.rows.
  void validateMultDimensions(T &a, T &b) const {
    if (a.getCols() != b.getRows()) {
      throw std::invalid_argument(
          "Invalid matrix dimensions for multiplication");
    }
  };
  // Element-wise ops require identical shapes.
  void validateSameDimensions(T &a, T &b) const {
    if (a.getRows() != b.getRows() || a.getCols() != b.getCols()) {
      throw std::invalid_argument("Invalid matrix dimensions for addition");
    }
  };
};

View File

@@ -40,7 +40,7 @@ public:
cl::Device &getDevice() { return device; }
cl::Context &getContext() { return context; }
cl::CommandQueue &getDefaultQueue() { return defaultQueue; }
const cl::CommandQueue &getDefaultQueue() { return defaultQueue; }
cl::Program &getProgram(Program program);
void printDeviceInfo() const;

View File

@@ -0,0 +1 @@
#include "math.hpp"

View File

@@ -0,0 +1,103 @@
#pragma once
#include "tensor.hpp"
#include "../math.hpp"
#include <cmath>
// M_PI is a POSIX extension, not standard C++. BUG FIX: guard the fallback
// definition so platforms whose <cmath> already provides it don't hit a
// macro-redefinition warning/error (the original unconditional #define did).
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
namespace CPU {
template <ITensorType T> class TensorMath;
class Tensor0Math;
class Tensor1Math;
class Tensor2Math;
class Tensor3Math;

// Element-wise math shared by all CPU tensor ranks. Every operation returns
// a new tensor; inputs are never modified.
template <ITensorType T> class TensorMath : public ITensorMath<T> {
protected:
  // Scalar activation dispatcher; `alpha` is used by LEAKY_RELU and ELU.
  float activate_x(float x, Activation type, float alpha = 0.01f) {
    switch (type) {
    case Activation::LINEAR:
      return x;
    case Activation::SIGMOID:
      return 1.0f / (1.0f + std::exp(-x));
    case Activation::TANH:
      return std::tanh(x);
    case Activation::RELU:
      return std::max(0.0f, x);
    case Activation::LEAKY_RELU:
      return (x > 0.0f) ? x : alpha * x;
    case Activation::ELU:
      return (x > 0.0f) ? x : alpha * (std::exp(x) - 1.0f);
    case Activation::GELU:
      // tanh approximation of GELU
      return 0.5f * x *
             (1.0f +
              std::tanh(std::sqrt(2.0f / M_PI) * (x + 0.044715f * x * x * x)));
    default:
      throw std::invalid_argument("Unknown activation type");
    }
  }

public:
  // Returns activate(t) element-wise.
  // NOTE(review): default alpha is 0.0f here but 0.01f in activate_x —
  // confirm which default LEAKY_RELU/ELU callers expect.
  T activate(const T &t, Activation type = Activation::LINEAR,
             float alpha = 0.0f) override {
    T result(t.getShape(), false);
    for (size_t i = 0; i < t.getSize(); ++i) {
      result[i] = activate_x(t[i], type, alpha);
    }
    return result;
  }
  // result = t * x (scalar multiply)
  T mult(const T &t, float x) override {
    T result(t.getShape(), false);
    for (size_t i = 0; i < t.getSize(); ++i)
      result[i] = t[i] * x;
    return result;
  }
  // result = a + b * x; shapes must match.
  T add(const T &a, const T &b, float x = 1.0f) override {
    this->validateSameDimensions(a, b);
    T result(a.getShape(), false);
    for (size_t i = 0; i < a.getSize(); ++i)
      result[i] = a[i] + (b[i] * x);
    return result;
  }
  // result = t + x (scalar add)
  T add(const T &t, float x) override {
    T result(t.getShape(), false);
    for (size_t i = 0; i < t.getSize(); ++i)
      result[i] = t[i] + x;
    return result;
  }
};
class Tensor0Math : public TensorMath<Tensor0>, public ITensor0Math<Tensor0> {};
class Tensor1Math : public TensorMath<Tensor1>, public ITensor1Math<Tensor1> {};
// Rank-2 math: adds matrix multiplication on top of the element-wise ops.
class Tensor2Math : public TensorMath<Tensor2>, public ITensor2Math<Tensor2> {
public:
  // result = activate(a * b + bias); with transpose, b is used as b^T
  // (b stored rows-of-the-transposed-operand, i.e. b(j, k) supplies column j).
  Tensor2 mult(const Tensor2 &a, const Tensor2 &b, bool transpose = false,
               float bias = 0.0f, Activation type = Activation::LINEAR,
               float alpha = 0.01f) override {
    validateMultDimensions(a, b, transpose);
    // BUG FIX: for a * b^T the result has b.rows columns, not b.cols — the
    // old unconditional b.getCols() produced wrongly-shaped output and read
    // b(j, k) out of range for non-square transposed operands.
    Tensor2 result(a.getRows(), transpose ? b.getRows() : b.getCols(), 0.0f);
    for (int i = 0; i < result.getRows(); ++i) {
      for (int j = 0; j < result.getCols(); ++j) {
        float sum = 0.0f;
        for (int k = 0; k < a.getCols(); ++k)
          sum += a(i, k) * (transpose ? b(j, k) : b(k, j));
        result(i, j) = activate_x(sum + bias, type, alpha);
      }
    }
    return result;
  }
};
class Tensor3Math : public TensorMath<Tensor3>, public ITensor3Math<Tensor3> {};
typedef Tensor0Math ScalarMath;
typedef Tensor1Math VectorMath;
typedef Tensor2Math MatrixMath;
} // namespace CPU

View File

@@ -0,0 +1 @@
#include "tensor.hpp"

View File

@@ -0,0 +1,296 @@
#pragma once
#include <algorithm>
#include <iostream>
#include <random>
#include <vector>
#include "../tensor.hpp"
extern std::mt19937 gen;
namespace CPU {
class Tensor;
class Tensor0;
class Tensor1;
class Tensor2;
class Tensor3;
// Base CPU tensor: dense float storage, row-major flattening of `shape`.
class Tensor : public ITensor {
protected:
  std::vector<float> data;
  // NOTE: vector::resize value-initializes new floats to 0.0f.
  void resize(size_t size) { data.resize(size); }
  void resize(const std::vector<int> &shape) {
    size_t size = 1; // empty shape => one element (scalar)
    for (int dim : shape)
      size *= dim;
    resize(size);
  }

public:
  // Random-fill constructor: uniform [0,1) values from the shared RNG.
  Tensor(const std::vector<int> &shape) : ITensor(shape) {
    resize(shape);
    std::generate(data.begin(), data.end(),
                  []() { return std::generate_canonical<float, 10>(gen); });
  }
  // Constant-fill constructor.
  Tensor(const std::vector<int> &shape, float value) : ITensor(shape) {
    resize(shape);
    std::fill(data.begin(), data.end(), value);
  }
  // "Maybe zero-fill" constructor. NOTE(review): resize() above already
  // zero-initializes the elements, so fill == false also yields zeros — the
  // flag is currently a no-op.
  Tensor(const std::vector<int> &shape, bool fill) : ITensor(shape) {
    resize(shape);
    if (fill)
      std::fill(data.begin(), data.end(), 0.0f);
  }
  Tensor(const Tensor &) = default;
  Tensor &operator=(const Tensor &) = default;
  Tensor(Tensor &&other) = default;
  Tensor &operator=(Tensor &&other) = default;
  // Flat element access into the row-major storage (no bounds check).
  float &operator[](int index) { return data[index]; }
  const float &operator[](int index) const { return data[index]; }
  // Debug dump; prints at most the first 17 elements, then "...".
  virtual void print() const {
    std::cout << "Tensor(" << getDim() << "): [";
    for (size_t i = 0; i < data.size(); ++i) {
      std::cout << data[i];
      if (i > 15) {
        std::cout << "... ";
        break;
      }
      if (i != data.size() - 1)
        std::cout << ", ";
    }
    std::cout << "]" << std::endl;
  }
  // Copy of the underlying flat storage.
  std::vector<float> toVector() const { return data; }
  // Checked downcasts: nullptr when the runtime type tag doesn't match.
  // NOTE(review): reinterpret_cast is used because the derived types are
  // only forward-declared here, but with multiple inheritance (e.g. Tensor2
  // lists ITensor2 BEFORE Tensor in its base list) reinterpret_cast does NOT
  // perform the pointer adjustment static_cast would — verify these casts,
  // or define them out-of-line after the derived classes are complete.
  static Tensor0 *asScalar(Tensor *tensor) {
    return tensor->getType() == Type::SCALAR
               ? reinterpret_cast<Tensor0 *>(tensor)
               : nullptr;
  }
  static const Tensor0 *asScalar(const Tensor *tensor) {
    return tensor->getType() == Type::SCALAR
               ? reinterpret_cast<const Tensor0 *>(tensor)
               : nullptr;
  }
  static Tensor1 *asVector(Tensor *tensor) {
    return tensor->getType() == Type::VECTOR
               ? reinterpret_cast<Tensor1 *>(tensor)
               : nullptr;
  }
  static const Tensor1 *asVector(const Tensor *tensor) {
    return tensor->getType() == Type::VECTOR
               ? reinterpret_cast<const Tensor1 *>(tensor)
               : nullptr;
  }
  static Tensor2 *asMatrix(Tensor *tensor) {
    return tensor->getType() == Type::MATRIX
               ? reinterpret_cast<Tensor2 *>(tensor)
               : nullptr;
  }
  static const Tensor2 *asMatrix(const Tensor *tensor) {
    return tensor->getType() == Type::MATRIX
               ? reinterpret_cast<const Tensor2 *>(tensor)
               : nullptr;
  }
  static Tensor3 *asTensor3(Tensor *tensor) {
    return tensor->getType() == Type::TENSOR3
               ? reinterpret_cast<Tensor3 *>(tensor)
               : nullptr;
  }
  static const Tensor3 *asTensor3(const Tensor *tensor) {
    return tensor->getType() == Type::TENSOR3
               ? reinterpret_cast<const Tensor3 *>(tensor)
               : nullptr;
  }
};
// Rank-0 tensor (scalar): always holds exactly one element.
class Tensor0 : public Tensor, public ITensor0 {
public:
  // Shape-based constructors require an empty shape.
  Tensor0(const std::vector<int> &shape) : Tensor(shape) {
    if (shape.size() != 0)
      throw std::invalid_argument("Tensor0 dimension must be 0");
  }
  Tensor0(const std::vector<int> &shape, float value) : Tensor(shape, value) {
    if (shape.size() != 0)
      throw std::invalid_argument("Tensor0 dimension must be 0");
  }
  // Random-valued scalar.
  Tensor0() : Tensor({}) {
    resize(1);
    data[0] = std::generate_canonical<float, 10>(gen);
  }
  // Scalar with an explicit value.
  Tensor0(float value) : Tensor({}) {
    resize(1);
    data[0] = value;
  }
  Tensor0(const Tensor0 &) = default;
  Tensor0 &operator=(const Tensor0 &) = default;
  Tensor0(Tensor0 &&other) = default;
  Tensor0 &operator=(Tensor0 &&other) = default;
  void print() const override {
    std::cout << "Scalar: " << data[0] << std::endl;
  }
  // Direct access to the single stored element.
  float &value() { return data[0]; }
  const float &value() const { return data[0]; }
};
// Rank-1 tensor (vector) backed by the flat `data` buffer.
class Tensor1 : public Tensor, public ITensor1 {
public:
  // Builds from an explicit shape vector; exactly one dimension is required.
  Tensor1(const std::vector<int> &shape) : Tensor(shape) {
    if (shape.size() != 1)
      throw std::invalid_argument("Tensor1 dimension must be 1");
  }
  // Same as above, with every element set to `value`.
  Tensor1(const std::vector<int> &shape, float value) : Tensor(shape, value) {
    if (shape.size() != 1)
      throw std::invalid_argument("Tensor1 dimension must be 1");
  }
  // Vector of `size` randomly initialized elements.
  Tensor1(int size) : Tensor({size}) {}
  // Vector of `size` elements, each set to `value`.
  Tensor1(int size, float value) : Tensor({size}, value) {}
  // Vector whose contents are copied from an existing std::vector.
  Tensor1(const std::vector<float> &values)
      : Tensor({static_cast<int>(values.size())}) {
    data = values;
  }
  Tensor1(const Tensor1 &) = default;
  Tensor1 &operator=(const Tensor1 &) = default;
  Tensor1(Tensor1 &&other) = default;
  Tensor1 &operator=(Tensor1 &&other) = default;
  // Prints "Vector(N): [v0, v1, ...]".
  void print() const override {
    std::cout << "Vector(" << shape[0] << "): [";
    for (size_t idx = 0; idx < data.size(); ++idx) {
      if (idx > 0)
        std::cout << ", ";
      std::cout << data[idx];
    }
    std::cout << "]" << std::endl;
  }
  // Unchecked element access.
  float &operator()(int i) { return data[i]; }
  const float &operator()(int i) const { return data[i]; }
  int getSize() const override { return shape[0]; }
};
// Rank-2 tensor (matrix) stored row-major in the flat `data` buffer.
// FIX: Tensor is now the *first* base class (it used to come after ITensor2).
// Tensor::asMatrix reinterpret_casts a Tensor* to Tensor2* without pointer
// adjustment, which is only correct when the Tensor subobject is at offset 0;
// this also matches the base order used by Tensor0/Tensor1/Tensor3.
class Tensor2 : public Tensor, public ITensor2 {
public:
  // Builds from an explicit shape vector; exactly two dimensions required.
  Tensor2(const std::vector<int> &shape) : Tensor(shape) {
    if (shape.size() != 2)
      throw std::invalid_argument("Tensor2 dimension must be 2");
  }
  // Same as above, with every element set to `value`.
  Tensor2(const std::vector<int> &shape, float value) : Tensor(shape, value) {
    if (shape.size() != 2)
      throw std::invalid_argument("Tensor2 dimension must be 2");
  }
  // rows x cols matrix with randomly initialized contents.
  Tensor2(int rows, int cols) : Tensor({rows, cols}) {}
  // rows x cols matrix with every element set to `value`.
  Tensor2(int rows, int cols, float value) : Tensor({rows, cols}, value) {}
  // rows x cols matrix copied from a flat row-major vector.
  // FIX: throws (instead of reading out of bounds) when `values` holds fewer
  // than rows*cols elements.
  Tensor2(int rows, int cols, const std::vector<float> &values)
      : Tensor({rows, cols}, false) {
    if (values.size() < static_cast<size_t>(rows) * static_cast<size_t>(cols))
      throw std::invalid_argument(
          "Tensor2: values has fewer than rows*cols elements");
    for (int i = 0; i < shape[0]; ++i) {
      for (int j = 0; j < shape[1]; ++j) {
        data[i * shape[1] + j] = values[i * shape[1] + j];
      }
    }
  }
  // Matrix copied from a vector of rows; all rows must have equal width.
  // FIX: ragged rows now throw instead of reading out of bounds.
  Tensor2(const std::vector<std::vector<float>> &values)
      : Tensor({(int)values.size(),
                values.empty() ? 0 : (int)values[0].size()}) {
    for (int i = 0; i < shape[0]; ++i) {
      if ((int)values[i].size() != shape[1])
        throw std::invalid_argument("Tensor2: ragged row in input values");
      for (int j = 0; j < shape[1]; ++j) {
        data[i * shape[1] + j] = values[i][j];
      }
    }
  }
  Tensor2(const Tensor2 &) = default;
  Tensor2 &operator=(const Tensor2 &) = default;
  Tensor2(Tensor2 &&other) = default;
  Tensor2 &operator=(Tensor2 &&other) = default;
  // Prints the matrix row by row.
  void print() const override {
    std::cout << "Matrix(" << shape[0] << "x" << shape[1] << "):\n";
    for (int i = 0; i < shape[0]; ++i) {
      for (int j = 0; j < shape[1]; ++j) {
        std::cout << data[i * shape[1] + j] << " ";
      }
      std::cout << std::endl;
    }
  }
  // Unchecked row-major element access.
  float &operator()(int i, int j) { return data[i * shape[1] + j]; }
  const float &operator()(int i, int j) const { return data[i * shape[1] + j]; }
  int getRows() const override { return shape[0]; }
  int getCols() const override { return shape[1]; }
};
// Rank-3 tensor stored row-major: flat index = (i * d2 + j) * d3 + k.
class Tensor3 : public Tensor, public ITensor3 {
public:
  // Builds from an explicit shape vector; exactly three dimensions required.
  Tensor3(const std::vector<int> &shape) : Tensor(shape) {
    if (shape.size() != 3)
      throw std::invalid_argument("Tensor3 dimension must be 3");
  }
  // Same as above, with every element set to `value`.
  Tensor3(const std::vector<int> &shape, float value) : Tensor(shape, value) {
    if (shape.size() != 3)
      throw std::invalid_argument("Tensor3 dimension must be 3");
  }
  // d1 x d2 x d3 tensor with randomly initialized contents.
  Tensor3(int d1, int d2, int d3) : Tensor({d1, d2, d3}) {}
  // d1 x d2 x d3 tensor with every element set to `value`.
  Tensor3(int d1, int d2, int d3, float value) : Tensor({d1, d2, d3}, value) {}
  // Copies d1*d2*d3 elements from a flat row-major vector.
  // FIX: throws (instead of reading out of bounds) when `values` is short.
  Tensor3(int d1, int d2, int d3, const std::vector<float> &values)
      : Tensor({d1, d2, d3}, false) {
    if (values.size() < static_cast<size_t>(d1) * static_cast<size_t>(d2) *
                            static_cast<size_t>(d3))
      throw std::invalid_argument(
          "Tensor3: values has fewer than d1*d2*d3 elements");
    for (int i = 0; i < shape[0]; ++i) {
      for (int j = 0; j < shape[1]; ++j) {
        for (int k = 0; k < shape[2]; ++k) {
          data[i * shape[1] * shape[2] + j * shape[2] + k] =
              values[i * shape[1] * shape[2] + j * shape[2] + k];
        }
      }
    }
  }
  // Copies from nested vectors; the input must be rectangular.
  // FIX: ragged slices/rows now throw instead of reading out of bounds.
  Tensor3(const std::vector<std::vector<std::vector<float>>> &values)
      : Tensor({(int)values.size(), (int)values[0].size(),
                (int)values[0][0].size()}) {
    for (int i = 0; i < shape[0]; ++i) {
      if ((int)values[i].size() != shape[1])
        throw std::invalid_argument("Tensor3: ragged input values");
      for (int j = 0; j < shape[1]; ++j) {
        if ((int)values[i][j].size() != shape[2])
          throw std::invalid_argument("Tensor3: ragged input values");
        for (int k = 0; k < shape[2]; ++k) {
          data[i * shape[1] * shape[2] + j * shape[2] + k] = values[i][j][k];
        }
      }
    }
  }
  Tensor3(const Tensor3 &) = default;
  Tensor3 &operator=(const Tensor3 &) = default;
  Tensor3(Tensor3 &&other) = default;
  Tensor3 &operator=(Tensor3 &&other) = default;
  // Prints each i-slice as a d2 x d3 matrix.
  void print() const override {
    std::cout << "Tensor3(" << shape[0] << "x" << shape[1] << "x" << shape[2]
              << "):\n";
    for (int i = 0; i < shape[0]; ++i) {
      std::cout << "Slice " << i << ":\n";
      for (int j = 0; j < shape[1]; ++j) {
        for (int k = 0; k < shape[2]; ++k) {
          std::cout << data[i * shape[1] * shape[2] + j * shape[2] + k] << " ";
        }
        std::cout << std::endl;
      }
      std::cout << std::endl;
    }
  }
  // Unchecked row-major element access.
  float &operator()(int i, int j, int k) {
    return data[i * shape[1] * shape[2] + j * shape[2] + k];
  }
  const float &operator()(int i, int j, int k) const {
    return data[i * shape[1] * shape[2] + j * shape[2] + k];
  }
};
// Convenience aliases: rank-based names for the concrete tensor types.
typedef Tensor0 Scalar;
typedef Tensor1 Vector;
typedef Tensor2 Matrix;
} // namespace CPU

View File

@@ -0,0 +1 @@
#include "math.hpp"

View File

@@ -0,0 +1,164 @@
#pragma once
#include "../../opencl/opencl.hpp"
#include "tensor.hpp"
#include "../math.hpp"
namespace GPU {
template <ITensorType T> class TensorMath;
class Tensor0Math;
class Tensor1Math;
class Tensor2Math;
class Tensor3Math;
// Element-wise GPU math shared by all tensor ranks. Each instance owns its
// own command queue and one cl::Kernel per operation.
// NOTE(review): setArg + enqueue on shared kernel objects is not thread-safe;
// presumably each TensorMath instance is used from a single thread - confirm.
template <ITensorType T> class TensorMath : public ITensorMath<T> {
protected:
  // Operations implemented by kernels in the MATRIX OpenCL program.
  enum class Method {
    MULT,
    MULT_SMALL,
    SCALAR_MULT,
    ADD,
    SCALAR_ADD,
    ACTIVATE
  };
  // One compiled kernel per operation, built in the constructor.
  std::unordered_map<Method, cl::Kernel> kernels;
  // Kernel entry-point name for each operation.
  std::unordered_map<Method, std::string> kernelsNames = {
      {Method::MULT, "mult"}, {Method::MULT_SMALL, "mult_small"},
      {Method::SCALAR_MULT, "mult_sc"}, {Method::ADD, "add"},
      {Method::SCALAR_ADD, "add_sc"}, {Method::ACTIVATE, "activate"}};
  cl::CommandQueue queue;
public:
  // Creates the command queue and instantiates every kernel up front.
  TensorMath() {
    queue = cl::CommandQueue(openCL.getContext(), openCL.getDevice());
    for (const auto &entry : kernelsNames) {
      kernels[entry.first] =
          cl::Kernel(openCL.getProgram(OpenCL::Program::MATRIX), entry.second);
    }
  }
  const cl::Queue &getQueue() const = delete; // (not present; see below)
  const cl::CommandQueue &getQueue() const { return queue; }
  // Blocks until all work enqueued on this object's queue has completed.
  void await() const { queue.finish(); }
  // Applies the activation kernel element-wise; work is enqueued
  // asynchronously - call await() (or read the result) to synchronize.
  // NOTE(review): `false` in `T result(..., false, &queue)` binds to the
  // float `value` overload of the rank-specific constructors (they expose no
  // bool overload), so the result buffer is zero-filled and uploaded before
  // the kernel overwrites it - consider a no-fill constructor.
  T activate(const T &t, Activation type = Activation::LINEAR,
             float alpha = 0.0f) override {
    T result(t.getShape(), false, &queue);
    kernels[Method::ACTIVATE].setArg(0, *t.getBuffer());
    kernels[Method::ACTIVATE].setArg(1, *result.getBuffer());
    kernels[Method::ACTIVATE].setArg(2, static_cast<int>(type));
    kernels[Method::ACTIVATE].setArg(3, alpha);
    queue.enqueueNDRangeKernel(kernels[Method::ACTIVATE], cl::NullRange,
                               cl::NDRange(t.getSize()));
    return result;
  }
  // Element-wise scale by the scalar x (one work-item per element).
  T mult(const T &t, float x) override {
    T result(t.getShape(), false, &queue);
    kernels[Method::SCALAR_MULT].setArg(0, *t.getBuffer());
    kernels[Method::SCALAR_MULT].setArg(1, *result.getBuffer());
    kernels[Method::SCALAR_MULT].setArg(2, x);
    queue.enqueueNDRangeKernel(kernels[Method::SCALAR_MULT], cl::NullRange,
                               cl::NDRange(t.getSize()));
    return result;
  }
  // Element-wise combination of a and b weighted by x (exact semantics live
  // in the "add" kernel - presumably a + x*b; confirm against the .cl file).
  // Shapes of a and b must match.
  T add(const T &a, const T &b, float x = 1.0f) override {
    this->validateSameDimensions(a, b);
    T result(a.getShape(), false, &queue);
    kernels[Method::ADD].setArg(0, *a.getBuffer());
    kernels[Method::ADD].setArg(1, *b.getBuffer());
    kernels[Method::ADD].setArg(2, *result.getBuffer());
    kernels[Method::ADD].setArg(3, x);
    queue.enqueueNDRangeKernel(kernels[Method::ADD], cl::NullRange,
                               cl::NDRange(a.getSize()));
    return result;
  }
  // Element-wise addition of the scalar x.
  T add(const T &t, float x) override {
    T result(t.getShape(), false, &queue);
    kernels[Method::SCALAR_ADD].setArg(0, *t.getBuffer());
    kernels[Method::SCALAR_ADD].setArg(1, *result.getBuffer());
    kernels[Method::SCALAR_ADD].setArg(2, x);
    queue.enqueueNDRangeKernel(kernels[Method::SCALAR_ADD], cl::NullRange,
                               cl::NDRange(t.getSize()));
    return result;
  }
};
// GPU math for scalars; all operations are inherited from TensorMath<Tensor0>.
class Tensor0Math : public TensorMath<Tensor0>, public ITensor0Math<Tensor0> {};
// GPU math for vectors; all operations are inherited from TensorMath<Tensor1>.
class Tensor1Math : public TensorMath<Tensor1>, public ITensor1Math<Tensor1> {};
// GPU matrix math: element-wise operations from TensorMath<Tensor2> plus
// matrix multiplication with optional transpose, bias and fused activation.
class Tensor2Math : public TensorMath<Tensor2>, public ITensor2Math<Tensor2> {
private:
  // Tiled multiply for larger matrices: 16x16 work-groups, with the global
  // size rounded up to a multiple of the tile size.
  Tensor2 mult_tiled(const Tensor2 &a, const Tensor2 &b, bool transpose = false,
                     float bias = 0.0f, Activation type = Activation::LINEAR,
                     float alpha = 0.01f) {
    validateMultDimensions(a, b, transpose);
    // NOTE(review): `false` binds to the float `value` overload (Tensor2 has
    // no bool overload), so the result is zero-filled before the kernel runs.
    Tensor2 result(a.getRows(), transpose ? b.getRows() : b.getCols(), false,
                   &queue);
    const int tile_size = 16;
    cl::NDRange local_size(tile_size, tile_size);
    cl::NDRange global_size(
        ((result.getRows() + tile_size - 1) / tile_size) * tile_size,
        ((result.getCols() + tile_size - 1) / tile_size) * tile_size);
    kernels[Method::MULT].setArg(0, *a.getBuffer());
    kernels[Method::MULT].setArg(1, *b.getBuffer());
    kernels[Method::MULT].setArg(2, *result.getBuffer());
    kernels[Method::MULT].setArg(3, bias);
    kernels[Method::MULT].setArg(4, static_cast<int>(type));
    kernels[Method::MULT].setArg(5, alpha);
    kernels[Method::MULT].setArg(6, result.getRows());
    kernels[Method::MULT].setArg(7, result.getCols());
    kernels[Method::MULT].setArg(8, a.getCols());
    kernels[Method::MULT].setArg(9, transpose ? 1 : 0);
    queue.enqueueNDRangeKernel(kernels[Method::MULT], cl::NullRange,
                               global_size, local_size);
    return result;
  }
  // Naive multiply: one work-item per output element; avoids the tiling
  // overhead for small matrices.
  Tensor2 mult_small(const Tensor2 &a, const Tensor2 &b, bool transpose = false,
                     float bias = 0.0f, Activation type = Activation::LINEAR,
                     float alpha = 0.01f) {
    validateMultDimensions(a, b, transpose);
    Tensor2 result(a.getRows(), transpose ? b.getRows() : b.getCols(), false,
                   &queue);
    kernels[Method::MULT_SMALL].setArg(0, *a.getBuffer());
    kernels[Method::MULT_SMALL].setArg(1, *b.getBuffer());
    kernels[Method::MULT_SMALL].setArg(2, *result.getBuffer());
    kernels[Method::MULT_SMALL].setArg(3, bias);
    kernels[Method::MULT_SMALL].setArg(4, static_cast<int>(type));
    kernels[Method::MULT_SMALL].setArg(5, alpha);
    kernels[Method::MULT_SMALL].setArg(6, result.getRows());
    kernels[Method::MULT_SMALL].setArg(7, result.getCols());
    kernels[Method::MULT_SMALL].setArg(8, a.getCols());
    kernels[Method::MULT_SMALL].setArg(9, transpose ? 1 : 0);
    queue.enqueueNDRangeKernel(kernels[Method::MULT_SMALL], cl::NullRange,
                               cl::NDRange(result.getRows(), result.getCols()));
    return result;
  }
public:
  // result = a * b (or a * b^T when `transpose`), plus `bias`, with the
  // activation applied in-kernel. Uses the tiled kernel once any dimension
  // exceeds 64, the naive kernel otherwise.
  Tensor2 mult(const Tensor2 &a, const Tensor2 &b, bool transpose = false,
               float bias = 0.0f, Activation type = Activation::LINEAR,
               float alpha = 0.01f) override {
    if (a.getRows() > 64 || a.getCols() > 64 || b.getRows() > 64 ||
        b.getCols() > 64)
      return mult_tiled(a, b, transpose, bias, type, alpha);
    else
      return mult_small(a, b, transpose, bias, type, alpha);
  }
};
// GPU math for rank-3 tensors; all operations come from TensorMath<Tensor3>.
class Tensor3Math : public TensorMath<Tensor3>, public ITensor3Math<Tensor3> {};
// Convenience aliases: rank-based names for the math classes.
typedef Tensor0Math ScalarMath;
typedef Tensor1Math VectorMath;
typedef Tensor2Math MatrixMath;
} // namespace GPU

View File

@@ -0,0 +1 @@
#include "tensor.hpp"

View File

@@ -0,0 +1,282 @@
#pragma once
#include "../../opencl/opencl.hpp"
#include <algorithm>
#include <iostream>
#include <random>
#include <vector>
#include "../tensor.hpp"
#include "math.hpp"
extern std::mt19937 gen;
namespace GPU {
class Tensor;
class Tensor0;
class Tensor1;
class Tensor2;
class Tensor3;
// GPU-resident tensor: the shape lives on the host (ITensor), the elements
// live in an OpenCL device buffer. Instances are move-only.
class Tensor : public ITensor {
protected:
  cl::Buffer *buffer = nullptr; // owned; released in the destructor
  // Product of all dimensions (1 for an empty/scalar shape).
  size_t getShapeSize(const std::vector<int> &shape) {
    size_t size = 1;
    for (int dim : shape)
      size *= dim;
    return size;
  }
  // Allocates the device buffer and synchronously uploads `v` into it.
  // Throws if a buffer was already created for this tensor.
  void fillBuf(const std::vector<float> &v,
               const cl::CommandQueue *queue = nullptr) {
    if (buffer != nullptr)
      throw std::runtime_error("Tensor buffer already exists");
    buffer = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
                            v.size() * sizeof(float));
    cl::CommandQueue q = queue == nullptr ? openCL.getDefaultQueue() : *queue;
    q.enqueueWriteBuffer(*buffer, CL_TRUE, 0, v.size() * sizeof(float),
                         v.data());
    q.finish();
  }
  // Creates the buffer filled with uniform random values in [0, 1).
  void createBuf(size_t size, const cl::CommandQueue *queue = nullptr) {
    std::vector<float> v(size);
    std::generate(v.begin(), v.end(),
                  []() { return std::generate_canonical<float, 10>(gen); });
    fillBuf(v, queue);
  }
  // Creates the buffer with every element set to `value`.
  void createBuf(size_t size, float value,
                 const cl::CommandQueue *queue = nullptr) {
    std::vector<float> v(size);
    std::fill(v.begin(), v.end(), value);
    fillBuf(v, queue);
  }
public:
  // Randomly initialized tensor of the given shape.
  Tensor(const std::vector<int> &shape, const cl::CommandQueue *queue = nullptr)
      : ITensor(shape) {
    createBuf(getShapeSize(shape), queue);
  }
  // Tensor of the given shape with every element set to `value`.
  Tensor(const std::vector<int> &shape, float value,
         const cl::CommandQueue *queue = nullptr)
      : ITensor(shape) {
    createBuf(getShapeSize(shape), value, queue);
  }
  // When `fill` is false the buffer is left unallocated so a derived class
  // can upload its own data via fillBuf(); until then toVector() throws.
  Tensor(const std::vector<int> &shape, bool fill,
         const cl::CommandQueue *queue = nullptr)
      : ITensor(shape) {
    if (fill)
      createBuf(getShapeSize(shape), 0.0f, queue);
  }
  Tensor(const Tensor &) = delete;
  Tensor &operator=(const Tensor &) = delete;
  // Move transfers buffer ownership; the source is left bufferless.
  Tensor(Tensor &&other) : ITensor(other.shape), buffer(other.buffer) {
    other.buffer = nullptr;
  }
  Tensor &operator=(Tensor &&other) = delete;
  // BUG FIX: the buffer was allocated with `new` but never deleted, leaking
  // host and device memory for every tensor created. Virtual so deleting a
  // derived tensor through a Tensor* is well-defined.
  virtual ~Tensor() { delete buffer; }
  // Synchronously downloads the device buffer into a host vector.
  // FIX: an uninitialized buffer (fill == false path) now throws instead of
  // dereferencing a null pointer.
  std::vector<float> toVector(const cl::CommandQueue *queue = nullptr) {
    if (buffer == nullptr)
      throw std::runtime_error("Tensor buffer is not initialized");
    size_t size = getShapeSize(shape);
    std::vector<float> result(size);
    cl::CommandQueue q = queue == nullptr ? openCL.getDefaultQueue() : *queue;
    q.enqueueReadBuffer(*buffer, CL_TRUE, 0, size * sizeof(float),
                        result.data());
    q.finish();
    return result;
  }
  const cl::Buffer *getBuffer() const { return buffer; }
  // Checked downcasts keyed on the runtime rank.
  // NOTE(review): reinterpret_cast performs no base-to-derived pointer
  // adjustment, so these are only correct when Tensor is the *first* base
  // class of the target type (Tensor subobject at offset 0) -- verify the
  // base order of each derived class, in particular Tensor2.
  static Tensor0 *asScalar(Tensor *tensor) {
    return tensor->getType() == Type::SCALAR
               ? reinterpret_cast<Tensor0 *>(tensor)
               : nullptr;
  }
  static const Tensor0 *asScalar(const Tensor *tensor) {
    return tensor->getType() == Type::SCALAR
               ? reinterpret_cast<const Tensor0 *>(tensor)
               : nullptr;
  }
  static Tensor1 *asVector(Tensor *tensor) {
    return tensor->getType() == Type::VECTOR
               ? reinterpret_cast<Tensor1 *>(tensor)
               : nullptr;
  }
  static const Tensor1 *asVector(const Tensor *tensor) {
    return tensor->getType() == Type::VECTOR
               ? reinterpret_cast<const Tensor1 *>(tensor)
               : nullptr;
  }
  static Tensor2 *asMatrix(Tensor *tensor) {
    return tensor->getType() == Type::MATRIX
               ? reinterpret_cast<Tensor2 *>(tensor)
               : nullptr;
  }
  static const Tensor2 *asMatrix(const Tensor *tensor) {
    return tensor->getType() == Type::MATRIX
               ? reinterpret_cast<const Tensor2 *>(tensor)
               : nullptr;
  }
  static Tensor3 *asTensor3(Tensor *tensor) {
    return tensor->getType() == Type::TENSOR3
               ? reinterpret_cast<Tensor3 *>(tensor)
               : nullptr;
  }
  static const Tensor3 *asTensor3(const Tensor *tensor) {
    return tensor->getType() == Type::TENSOR3
               ? reinterpret_cast<const Tensor3 *>(tensor)
               : nullptr;
  }
};
class Tensor0 : public Tensor, public ITensor0 {
public:
Tensor0(const std::vector<int> &shape,
const cl::CommandQueue *queue = nullptr)
: Tensor(shape, queue) {
if (shape.size() != 0)
throw std::invalid_argument("Tensor0 dimension must be 0");
}
Tensor0(const std::vector<int> &shape, float value,
const cl::CommandQueue *queue = nullptr)
: Tensor(shape, value, queue) {
if (shape.size() != 0)
throw std::invalid_argument("Tensor0 dimension must be 0");
}
Tensor0(const cl::CommandQueue *queue = nullptr) : Tensor({}, queue) {
createBuf(1, queue);
}
Tensor0(float value, const cl::CommandQueue *queue = nullptr)
: Tensor({}, queue) {
createBuf(1, value, queue);
}
Tensor0(const Tensor0 &) = delete;
Tensor0 &operator=(const Tensor0 &) = delete;
Tensor0(Tensor0 &&other) : Tensor(std::move(other)) {};
Tensor0 &operator=(Tensor0 &&other) = delete;
};
// GPU rank-1 tensor (vector).
class Tensor1 : public Tensor, public ITensor1 {
public:
  // Builds from an explicit shape vector; exactly one dimension required.
  Tensor1(const std::vector<int> &shape,
          const cl::CommandQueue *queue = nullptr)
      : Tensor(shape, queue) {
    if (shape.size() != 1)
      throw std::invalid_argument("Tensor1 dimension must be 1");
  }
  // Same as above, with every element set to `value`.
  Tensor1(const std::vector<int> &shape, float value,
          const cl::CommandQueue *queue = nullptr)
      : Tensor(shape, value, queue) {
    if (shape.size() != 1)
      throw std::invalid_argument("Tensor1 dimension must be 1");
  }
  // Vector of `size` randomly initialized elements.
  Tensor1(int size, const cl::CommandQueue *queue = nullptr)
      : Tensor({size}, queue) {}
  // Vector of `size` elements, each set to `value`.
  Tensor1(int size, float value, const cl::CommandQueue *queue = nullptr)
      : Tensor({size}, value, queue) {}
  // Vector uploaded from a host std::vector (base ctor skips allocation via
  // fill == false; fillBuf then allocates and uploads `values`).
  Tensor1(const std::vector<float> &values,
          const cl::CommandQueue *queue = nullptr)
      : Tensor({(int)values.size()}, false, queue) {
    fillBuf(values, queue);
  }
  Tensor1(const Tensor1 &) = delete;
  Tensor1 &operator=(const Tensor1 &) = delete;
  Tensor1(Tensor1 &&other) : Tensor(std::move(other)) {}
  Tensor1 &operator=(Tensor1 &&other) = delete;
  int getSize() const override { return shape[0]; }
};
// GPU rank-2 tensor (matrix), stored row-major on the device.
// FIX: Tensor is now the *first* base class (it used to come after ITensor2).
// Tensor::asMatrix reinterpret_casts a Tensor* to Tensor2* with no pointer
// adjustment, which is only correct when the Tensor subobject is at offset 0;
// this also matches the base order used by Tensor0/Tensor1/Tensor3.
class Tensor2 : public Tensor, public ITensor2 {
public:
  // Builds from an explicit shape vector; exactly two dimensions required.
  Tensor2(const std::vector<int> &shape,
          const cl::CommandQueue *queue = nullptr)
      : Tensor(shape, queue) {
    if (shape.size() != 2)
      throw std::invalid_argument("Tensor2 dimension must be 2");
  }
  // Same as above, with every element set to `value`.
  Tensor2(const std::vector<int> &shape, float value,
          const cl::CommandQueue *queue = nullptr)
      : Tensor(shape, value, queue) {
    if (shape.size() != 2)
      throw std::invalid_argument("Tensor2 dimension must be 2");
  }
  // rows x cols matrix with randomly initialized contents.
  Tensor2(int rows, int cols, const cl::CommandQueue *queue = nullptr)
      : Tensor({rows, cols}, queue) {}
  // rows x cols matrix with every element set to `value`.
  Tensor2(int rows, int cols, float value,
          const cl::CommandQueue *queue = nullptr)
      : Tensor({rows, cols}, value, queue) {}
  // rows x cols matrix uploaded from a flat row-major host vector.
  Tensor2(int rows, int cols, const std::vector<float> &values,
          const cl::CommandQueue *queue = nullptr)
      : Tensor({rows, cols}, false, queue) {
    fillBuf(values, queue);
  }
  // Matrix uploaded from a vector of rows.
  // FIX: `queue` is now forwarded to the base constructor (it was silently
  // dropped), and ragged rows throw instead of writing out of bounds.
  Tensor2(const std::vector<std::vector<float>> &values,
          const cl::CommandQueue *queue = nullptr)
      : Tensor({(int)values.size(), (int)values[0].size()}, false, queue) {
    std::vector<float> v(values.size() * values[0].size());
    for (size_t i = 0; i < values.size(); ++i) {
      if (values[i].size() != values[0].size())
        throw std::invalid_argument("Tensor2: ragged row in input values");
      for (size_t j = 0; j < values[i].size(); ++j)
        v[i * values[0].size() + j] = values[i][j];
    }
    fillBuf(v, queue);
  }
  Tensor2(const Tensor2 &) = delete;
  Tensor2 &operator=(const Tensor2 &) = delete;
  Tensor2(Tensor2 &&other) : Tensor(std::move(other)) {}
  Tensor2 &operator=(Tensor2 &&other) = delete;
  int getRows() const override { return shape[0]; }
  int getCols() const override { return shape[1]; }
};
// GPU rank-3 tensor stored row-major: flat index = (i * d2 + j) * d3 + k.
class Tensor3 : public Tensor, public ITensor3 {
public:
  // Builds from an explicit shape vector; exactly three dimensions required.
  Tensor3(const std::vector<int> &shape,
          const cl::CommandQueue *queue = nullptr)
      : Tensor(shape, queue) {
    if (shape.size() != 3)
      throw std::invalid_argument("Tensor3 dimension must be 3");
  }
  // Same as above, with every element set to `value`.
  Tensor3(const std::vector<int> &shape, float value,
          const cl::CommandQueue *queue = nullptr)
      : Tensor(shape, value, queue) {
    if (shape.size() != 3)
      throw std::invalid_argument("Tensor3 dimension must be 3");
  }
  // d1 x d2 x d3 tensor with randomly initialized contents.
  Tensor3(int d1, int d2, int d3, const cl::CommandQueue *queue = nullptr)
      : Tensor({d1, d2, d3}, queue) {}
  // d1 x d2 x d3 tensor with every element set to `value`.
  Tensor3(int d1, int d2, int d3, float value,
          const cl::CommandQueue *queue = nullptr)
      : Tensor({d1, d2, d3}, value, queue) {}
  // d1 x d2 x d3 tensor uploaded from a flat row-major host vector.
  Tensor3(int d1, int d2, int d3, const std::vector<float> &values,
          const cl::CommandQueue *queue = nullptr)
      : Tensor({d1, d2, d3}, false, queue) {
    fillBuf(values, queue);
  }
  // Tensor uploaded from nested host vectors.
  Tensor3(const std::vector<std::vector<std::vector<float>>> &values,
          const cl::CommandQueue *queue = nullptr)
      : Tensor({(int)values.size(), (int)values[0].size(),
                (int)values[0][0].size()},
               false, queue) {
    std::vector<float> v(shape[0] * shape[1] * shape[2]);
    for (int i = 0; i < shape[0]; ++i) {
      for (int j = 0; j < shape[1]; ++j)
        for (int k = 0; k < shape[2]; ++k)
          // BUG FIX: the j stride must be shape[2] (the innermost extent),
          // not shape[1]. The old index scrambled elements and wrote past
          // some slots whenever shape[1] != shape[2]; it now matches the
          // CPU Tensor3 layout.
          v[i * shape[1] * shape[2] + j * shape[2] + k] = values[i][j][k];
    }
    fillBuf(v, queue);
  }
  Tensor3(const Tensor3 &) = delete;
  Tensor3 &operator=(const Tensor3 &) = delete;
  Tensor3(Tensor3 &&other) : Tensor(std::move(other)) {}
  Tensor3 &operator=(Tensor3 &&other) = delete;
};
// Convenience aliases: rank-based names for the GPU tensor types.
typedef Tensor0 Scalar;
typedef Tensor1 Vector;
typedef Tensor2 Matrix;
} // namespace GPU

58
src/math/tensor/math.hpp Normal file
View File

@@ -0,0 +1,58 @@
#pragma once
#include <type_traits> // FIX: std::is_base_of_v was relied on transitively
#include "tensor.hpp"
// Activation functions that math backends can apply element-wise; the
// integer value of each enumerator is passed directly to the GPU kernels.
enum class Activation { LINEAR, SIGMOID, TANH, RELU, LEAKY_RELU, ELU, GELU };
// Satisfied by any concrete tensor type deriving from ITensor.
template <typename T>
concept ITensorType = std::is_base_of_v<ITensor, T>;
// Satisfied by rank-0 (scalar) tensor types.
template <typename T>
concept ITensor0Type = std::is_base_of_v<ITensor0, T>;
// Satisfied by rank-1 (vector) tensor types.
template <typename T>
concept ITensor1Type = std::is_base_of_v<ITensor1, T>;
// Satisfied by rank-2 (matrix) tensor types.
template <typename T>
concept ITensor2Type = std::is_base_of_v<ITensor2, T>;
// Satisfied by rank-3 tensor types.
template <typename T>
concept ITensor3Type = std::is_base_of_v<ITensor3, T>;
// Backend-agnostic element-wise math interface over a tensor type T.
template <ITensorType T> class ITensorMath {
protected:
  // Throws std::invalid_argument unless a and b agree in rank, total size
  // and every dimension.
  void validateSameDimensions(const T &a, const T &b) const {
    if (a.getDim() != b.getDim())
      throw std::invalid_argument("Tensors must have the same dimension");
    if (a.getSize() != b.getSize())
      throw std::invalid_argument("Tensors must have the same size");
    for (int i = 0; i < a.getDim(); ++i) {
      if (a.getShape()[i] != b.getShape()[i])
        throw std::invalid_argument("Tensors must have the same shape");
    }
  }
public:
  // FIX: virtual destructor -- this is a polymorphic base (pure virtual
  // methods); deleting an implementation through this interface was UB.
  virtual ~ITensorMath() = default;
  // Applies `type` element-wise; `alpha` parameterizes activations that
  // need it (e.g. leaky slopes).
  virtual T activate(const T &m, Activation type, float alpha) = 0;
  // Element-wise scale by the scalar x.
  virtual T mult(const T &m, float x) = 0;
  // Element-wise combination of a and b weighted by x (same-shape inputs).
  virtual T add(const T &a, const T &b, float x) = 0;
  // Element-wise addition of the scalar x.
  virtual T add(const T &m, float x) = 0;
};
// Rank-specific math extension points (empty for ranks without extra ops).
template <ITensor0Type T> class ITensor0Math {};
template <ITensor1Type T> class ITensor1Math {};
// Matrix math extension: adds matrix multiplication.
template <ITensor2Type T> class ITensor2Math {
public:
  // FIX: virtual destructor -- polymorphic base with a pure virtual method;
  // deleting an implementation through this interface was UB.
  virtual ~ITensor2Math() = default;
  // result = a * b (or a * b^T when `transpose`), plus `bias`, with the
  // activation applied to each output element.
  virtual T mult(const T &a, const T &b, bool transpose, float bias,
                 Activation type, float alpha) = 0;
  // Validates inner dimensions: a.cols == b.rows, or a.cols == b.cols when
  // multiplying by the transpose.
  void validateMultDimensions(const T &a, const T &b, bool transpose) const {
    if ((!transpose && a.getCols() != b.getRows()) ||
        (transpose && a.getCols() != b.getCols())) {
      throw std::invalid_argument(
          "Invalid matrix dimensions for multiplication");
    }
  }
};
template <ITensor3Type T> class ITensor3Math {};

View File

@@ -0,0 +1,67 @@
#pragma once
#include <random> // FIX: std::random_device / std::mt19937 were relied on transitively
#include <stdexcept>
#include <vector>
// Shared RNG used for random tensor initialization.
// NOTE(review): defining non-inline globals in a `#pragma once` header gives
// every including translation unit its own copy and risks ODR violations once
// the header is included from more than one TU; consider C++17 `inline`
// variables or moving these definitions to a .cpp -- verify against the
// `extern std::mt19937 gen;` declaration used by the GPU tensor header.
std::random_device rd;
std::mt19937 gen(rd());
// Shape-only base class shared by every tensor implementation (CPU and GPU).
class ITensor {
protected:
  std::vector<int> shape;
  // Rejects shapes that are empty or contain a non-positive dimension.
  void validateDimensions(const std::vector<int> &dims) const {
    if (dims.empty())
      throw std::invalid_argument("Tensor shape cannot be empty");
    for (size_t idx = 0; idx < dims.size(); ++idx) {
      if (dims[idx] > 0)
        continue;
      throw std::invalid_argument(
          "All tensor dimensions must be positive, but dimension " +
          std::to_string(idx) + " is " + std::to_string(dims[idx]));
    }
  }
public:
  ITensor(const std::vector<int> &shape) : shape(shape) {}
  ITensor(const ITensor &) = default;
  ITensor &operator=(const ITensor &) = default;
  ITensor(ITensor &&other) = default;
  ITensor &operator=(ITensor &&other) = default;
  // Dimension sizes, outermost first.
  const std::vector<int> &getShape() const { return shape; }
  // Rank of the tensor (0 for a scalar).
  int getDim() const { return static_cast<int>(shape.size()); }
  // Total element count: the product of all dimensions (1 for a scalar).
  size_t getSize() const {
    size_t total = 1;
    for (size_t idx = 0; idx < shape.size(); ++idx)
      total *= shape[idx];
    return total;
  }
  enum class Type { SCALAR, VECTOR, MATRIX, TENSOR3 };
  // Rank encoded as an enum; only ranks 0-3 map to named values.
  Type getType() const { return static_cast<Type>(shape.size()); }
};
// Marker interface for rank-0 tensors (scalars).
class ITensor0 {};
// Interface for rank-1 tensors (vectors).
class ITensor1 {
public:
  // FIX: virtual destructor -- this base has a pure virtual method and is
  // used polymorphically; deleting a derived vector through an ITensor1*
  // was undefined behavior.
  virtual ~ITensor1() = default;
  // Number of elements in the vector.
  virtual int getSize() const = 0;
};
// Interface for rank-2 tensors (matrices).
class ITensor2 {
public:
  ITensor2() = default;
  // FIX: virtual destructor for the same reason as ITensor1; the remaining
  // special members stay explicitly defaulted (rule of five).
  virtual ~ITensor2() = default;
  ITensor2(const ITensor2 &) = default;
  ITensor2 &operator=(const ITensor2 &) = default;
  ITensor2(ITensor2 &&other) = default;
  ITensor2 &operator=(ITensor2 &&other) = default;
  virtual int getRows() const = 0;
  virtual int getCols() const = 0;
};
// Marker interface for rank-3 tensors.
class ITensor3 {};
// Convenience aliases: rank-based names for the interfaces.
typedef ITensor0 IScalar;
typedef ITensor1 IVector;
typedef ITensor2 IMatrix;