Complete tensors math

2026-04-04 04:40:40 +04:00 · 2025-11-01 10:30:32 +04:00
parent f728261354
commit f1dfe1b335
26 changed files with 1147 additions and 673 deletions
--- a/src/math/tensor/gpu/math.cpp
+++ b/src/math/tensor/gpu/math.cpp
@@ -0,0 +1 @@
+#include "math.hpp"
--- a/src/math/tensor/gpu/math.hpp
+++ b/src/math/tensor/gpu/math.hpp
@@ -0,0 +1,164 @@
+#pragma once
+
+#include "../../opencl/opencl.hpp"
+
+#include "tensor.hpp"
+
+#include "../math.hpp"
+
+namespace GPU {
+template <ITensorType T> class TensorMath;
+class Tensor0Math;
+class Tensor1Math;
+class Tensor2Math;
+class Tensor3Math;
+
+/**
+ * OpenCL-backed implementation of the element-wise operations declared by
+ * ITensorMath<T>.
+ *
+ * Every instance creates its own command queue and one cl::Kernel per Method
+ * from the OpenCL::Program::MATRIX program. Each operation enqueues its kernel
+ * on that queue without waiting for it; call await() to block until the queue
+ * has finished.
+ */
+template <ITensorType T> class TensorMath : public ITensorMath<T> {
+protected:
+  /// Kernels of the MATRIX program that this class can launch.
+  enum class Method {
+    MULT,
+    MULT_SMALL,
+    SCALAR_MULT,
+    ADD,
+    SCALAR_ADD,
+    ACTIVATE
+  };
+  /// Kernel objects keyed by Method; filled in by the constructor.
+  std::unordered_map<Method, cl::Kernel> kernels;
+  /// Kernel function name looked up in the program for each Method.
+  std::unordered_map<Method, std::string> kernelsNames = {
+      {Method::MULT, "mult"},
+      {Method::MULT_SMALL, "mult_small"},
+      {Method::SCALAR_MULT, "mult_sc"},
+      {Method::ADD, "add"},
+      {Method::SCALAR_ADD, "add_sc"},
+      {Method::ACTIVATE, "activate"}};
+
+  /// Queue on which every operation of this instance is enqueued.
+  cl::CommandQueue queue;
+
+public:
+  /// Creates the command queue and instantiates one kernel per Method.
+  TensorMath() {
+    queue = cl::CommandQueue(openCL.getContext(), openCL.getDevice());
+    for (const auto &[method, kernelName] : kernelsNames) {
+      kernels[method] =
+          cl::Kernel(openCL.getProgram(OpenCL::Program::MATRIX), kernelName);
+    }
+  }
+
+  /// Queue used by this instance, e.g. to read results back on the same queue.
+  const cl::CommandQueue &getQueue() const { return queue; }
+
+  /// Blocks until everything enqueued on this instance's queue has completed.
+  void await() const { queue.finish(); }
+
+  /// Enqueues the "activate" kernel with one work-item per t.getSize() and
+  /// passes it the activation type (as int) and alpha; returns the tensor
+  /// whose buffer receives the output.
+  T activate(const T &t, Activation type = Activation::LINEAR,
+             float alpha = 0.0f) override {
+    T output(t.getShape(), false, &queue);
+    cl::Kernel &kernel = kernels[Method::ACTIVATE];
+    kernel.setArg(0, *t.getBuffer());
+    kernel.setArg(1, *output.getBuffer());
+    kernel.setArg(2, static_cast<int>(type));
+    kernel.setArg(3, alpha);
+    queue.enqueueNDRangeKernel(kernel, cl::NullRange,
+                               cl::NDRange(t.getSize()));
+    return output;
+  }
+
+  /// Enqueues the "mult_sc" kernel on `t` with the scalar `x`; returns the
+  /// tensor whose buffer receives the output.
+  T mult(const T &t, float x) override {
+    T output(t.getShape(), false, &queue);
+    cl::Kernel &kernel = kernels[Method::SCALAR_MULT];
+    kernel.setArg(0, *t.getBuffer());
+    kernel.setArg(1, *output.getBuffer());
+    kernel.setArg(2, x);
+    queue.enqueueNDRangeKernel(kernel, cl::NullRange,
+                               cl::NDRange(t.getSize()));
+    return output;
+  }
+
+  /// Enqueues the "add" kernel on `a` and `b` with the factor `x` after
+  /// validateSameDimensions(a, b); returns the tensor whose buffer receives
+  /// the output.
+  T add(const T &a, const T &b, float x = 1.0f) override {
+    this->validateSameDimensions(a, b);
+    T output(a.getShape(), false, &queue);
+    cl::Kernel &kernel = kernels[Method::ADD];
+    kernel.setArg(0, *a.getBuffer());
+    kernel.setArg(1, *b.getBuffer());
+    kernel.setArg(2, *output.getBuffer());
+    kernel.setArg(3, x);
+    queue.enqueueNDRangeKernel(kernel, cl::NullRange,
+                               cl::NDRange(a.getSize()));
+    return output;
+  }
+
+  /// Enqueues the "add_sc" kernel on `t` with the scalar `x`; returns the
+  /// tensor whose buffer receives the output.
+  T add(const T &t, float x) override {
+    T output(t.getShape(), false, &queue);
+    cl::Kernel &kernel = kernels[Method::SCALAR_ADD];
+    kernel.setArg(0, *t.getBuffer());
+    kernel.setArg(1, *output.getBuffer());
+    kernel.setArg(2, x);
+    queue.enqueueNDRangeKernel(kernel, cl::NullRange,
+                               cl::NDRange(t.getSize()));
+    return output;
+  }
+};
+
+// Math for Tensor0: combines TensorMath<Tensor0> with the ITensor0Math
+// interface and adds no members of its own.
+class Tensor0Math : public TensorMath<Tensor0>, public ITensor0Math<Tensor0> {};
+
+// Math for Tensor1: combines TensorMath<Tensor1> with the ITensor1Math
+// interface and adds no members of its own.
+class Tensor1Math : public TensorMath<Tensor1>, public ITensor1Math<Tensor1> {};
+
+/**
+ * Math for Tensor2: the element-wise operations of TensorMath<Tensor2> plus a
+ * matrix product that is dispatched to one of two kernels depending on the
+ * operand sizes.
+ */
+class Tensor2Math : public TensorMath<Tensor2>, public ITensor2Math<Tensor2> {
+private:
+  /// Binds the ten arguments that the "mult" and "mult_small" launches share:
+  /// both inputs, the output, bias, activation type (as int), alpha, the
+  /// output rows and columns, the column count of `a`, and the transpose flag
+  /// (as 0/1).
+  void bindMultArgs(cl::Kernel &kernel, const Tensor2 &a, const Tensor2 &b,
+                    const Tensor2 &result, bool transpose, float bias,
+                    Activation type, float alpha) {
+    kernel.setArg(0, *a.getBuffer());
+    kernel.setArg(1, *b.getBuffer());
+    kernel.setArg(2, *result.getBuffer());
+    kernel.setArg(3, bias);
+    kernel.setArg(4, static_cast<int>(type));
+    kernel.setArg(5, alpha);
+    kernel.setArg(6, result.getRows());
+    kernel.setArg(7, result.getCols());
+    kernel.setArg(8, a.getCols());
+    kernel.setArg(9, transpose ? 1 : 0);
+  }
+
+  /// Launches the "mult" kernel with 16x16 work-groups; the global range is
+  /// the output size rounded up to a multiple of the tile size in each
+  /// dimension.
+  Tensor2 mult_tiled(const Tensor2 &a, const Tensor2 &b, bool transpose = false,
+                     float bias = 0.0f, Activation type = Activation::LINEAR,
+                     float alpha = 0.01f) {
+    validateMultDimensions(a, b, transpose);
+    const int outCols = transpose ? b.getRows() : b.getCols();
+    // `false` selects Tensor2(int, int, float, queue) with value 0.0f, so the
+    // output buffer is allocated and zero-filled before the kernel runs.
+    Tensor2 result(a.getRows(), outCols, false, &queue);
+
+    const int tile_size = 16;
+    const auto roundUpToTile = [tile_size](int extent) {
+      return ((extent + tile_size - 1) / tile_size) * tile_size;
+    };
+    cl::NDRange local_size(tile_size, tile_size);
+    cl::NDRange global_size(roundUpToTile(result.getRows()),
+                            roundUpToTile(result.getCols()));
+
+    cl::Kernel &kernel = kernels[Method::MULT];
+    bindMultArgs(kernel, a, b, result, transpose, bias, type, alpha);
+    queue.enqueueNDRangeKernel(kernel, cl::NullRange, global_size, local_size);
+    return result;
+  }
+
+  /// Launches the "mult_small" kernel with one work-item per output element
+  /// and no explicit work-group size.
+  Tensor2 mult_small(const Tensor2 &a, const Tensor2 &b, bool transpose = false,
+                     float bias = 0.0f, Activation type = Activation::LINEAR,
+                     float alpha = 0.01f) {
+    validateMultDimensions(a, b, transpose);
+    const int outCols = transpose ? b.getRows() : b.getCols();
+    // See mult_tiled: `false` converts to 0.0f, giving a zero-filled output.
+    Tensor2 result(a.getRows(), outCols, false, &queue);
+
+    cl::Kernel &kernel = kernels[Method::MULT_SMALL];
+    bindMultArgs(kernel, a, b, result, transpose, bias, type, alpha);
+    queue.enqueueNDRangeKernel(kernel, cl::NullRange,
+                               cl::NDRange(result.getRows(), result.getCols()));
+    return result;
+  }
+
+public:
+  /// Enqueues the product of `a` and `b` with the given bias, activation type
+  /// and alpha; with `transpose` set the output has b.getRows() columns
+  /// instead of b.getCols(). Uses the "mult_small" kernel when every
+  /// dimension of both operands is at most 64, otherwise the tiled "mult"
+  /// kernel.
+  Tensor2 mult(const Tensor2 &a, const Tensor2 &b, bool transpose = false,
+               float bias = 0.0f, Activation type = Activation::LINEAR,
+               float alpha = 0.01f) override {
+    const bool fitsSmallKernel = a.getRows() <= 64 && a.getCols() <= 64 &&
+                                 b.getRows() <= 64 && b.getCols() <= 64;
+    if (fitsSmallKernel)
+      return mult_small(a, b, transpose, bias, type, alpha);
+    return mult_tiled(a, b, transpose, bias, type, alpha);
+  }
+};
+
+// Math for Tensor3: combines TensorMath<Tensor3> with the ITensor3Math
+// interface and adds no members of its own.
+class Tensor3Math : public TensorMath<Tensor3>, public ITensor3Math<Tensor3> {};
+
+// Shorter names for the rank-specific math classes.
+using ScalarMath = Tensor0Math;
+using VectorMath = Tensor1Math;
+using MatrixMath = Tensor2Math;
+
+} // namespace GPU
--- a/src/math/tensor/gpu/tensor.cpp
+++ b/src/math/tensor/gpu/tensor.cpp
@@ -0,0 +1 @@
+#include "tensor.hpp"
--- a/src/math/tensor/gpu/tensor.hpp
+++ b/src/math/tensor/gpu/tensor.hpp
@@ -0,0 +1,282 @@
+#pragma once
+
+#include "../../opencl/opencl.hpp"
+
+#include <algorithm>
+#include <iostream>
+#include <random>
+#include <vector>
+
+#include "../tensor.hpp"
+#include "math.hpp"
+
+extern std::mt19937 gen;
+
+namespace GPU {
+class Tensor;
+class Tensor0;
+class Tensor1;
+class Tensor2;
+class Tensor3;
+
+/**
+ * Base class of the GPU tensors: an ITensor shape plus an OpenCL buffer that
+ * holds the elements as floats.
+ *
+ * The tensor owns its buffer: the destructor releases it and the move
+ * constructor hands it over. Copying and assignment are disabled.
+ */
+class Tensor : public ITensor {
+protected:
+  /// Owned device buffer; nullptr until a buffer has been created (see the
+  /// `fill == false` constructor) and after the tensor has been moved from.
+  cl::Buffer *buffer = nullptr;
+
+  /// Product of all dimensions in `shape`; an empty shape yields 1.
+  size_t getShapeSize(const std::vector<int> &shape) {
+    size_t size = 1;
+    for (int dim : shape)
+      size *= dim;
+    return size;
+  }
+
+  /// Allocates the device buffer and uploads `v` into it with a blocking
+  /// write on `queue` (or on openCL.getDefaultQueue() when `queue` is null).
+  /// @throws std::runtime_error if this tensor already has a buffer.
+  void fillBuf(const std::vector<float> &v,
+               const cl::CommandQueue *queue = nullptr) {
+    if (buffer != nullptr)
+      throw std::runtime_error("Tensor buffer already exists");
+    buffer = new cl::Buffer(openCL.getContext(), CL_MEM_READ_WRITE,
+                            v.size() * sizeof(float));
+    cl::CommandQueue q = queue == nullptr ? openCL.getDefaultQueue() : *queue;
+    q.enqueueWriteBuffer(*buffer, CL_TRUE, 0, v.size() * sizeof(float),
+                         v.data());
+    q.finish();
+  }
+
+  /// Creates a buffer of `size` floats drawn from the global generator `gen`
+  /// via std::generate_canonical.
+  void createBuf(size_t size, const cl::CommandQueue *queue = nullptr) {
+    std::vector<float> v(size);
+    std::generate(v.begin(), v.end(),
+                  []() { return std::generate_canonical<float, 10>(gen); });
+    fillBuf(v, queue);
+  }
+
+  /// Creates a buffer of `size` floats that are all equal to `value`.
+  void createBuf(size_t size, float value,
+                 const cl::CommandQueue *queue = nullptr) {
+    std::vector<float> v(size);
+    std::fill(v.begin(), v.end(), value);
+    fillBuf(v, queue);
+  }
+
+public:
+  /// Creates a tensor of the given shape filled with random values.
+  Tensor(const std::vector<int> &shape, const cl::CommandQueue *queue = nullptr)
+      : ITensor(shape) {
+    createBuf(getShapeSize(shape), queue);
+  }
+
+  /// Creates a tensor of the given shape with every element set to `value`.
+  Tensor(const std::vector<int> &shape, float value,
+         const cl::CommandQueue *queue = nullptr)
+      : ITensor(shape) {
+    createBuf(getShapeSize(shape), value, queue);
+  }
+
+  /// Creates a zero-filled tensor when `fill` is true. When `fill` is false no
+  /// buffer is allocated; the derived class is expected to call fillBuf().
+  Tensor(const std::vector<int> &shape, bool fill,
+         const cl::CommandQueue *queue = nullptr)
+      : ITensor(shape) {
+    if (fill)
+      createBuf(getShapeSize(shape), 0.0f, queue);
+  }
+
+  /// Releases the owned device buffer. Without this the cl::Buffer allocated
+  /// in fillBuf() was never freed. Deleting nullptr (never allocated or moved
+  /// from) is a no-op.
+  ~Tensor() { delete buffer; }
+
+  Tensor(const Tensor &) = delete;
+  Tensor &operator=(const Tensor &) = delete;
+
+  /// Takes over the buffer of `other`, which is left without a buffer.
+  Tensor(Tensor &&other) : ITensor(other.shape), buffer(other.buffer) {
+    other.buffer = nullptr;
+  }
+  Tensor &operator=(Tensor &&other) = delete;
+
+  /// Reads the whole buffer back to the host with a blocking read on `queue`
+  /// (or on openCL.getDefaultQueue() when `queue` is null).
+  /// @throws std::runtime_error if the tensor has no buffer.
+  std::vector<float> toVector(const cl::CommandQueue *queue = nullptr) {
+    // A tensor built with `fill == false`, or one that was moved from, has no
+    // buffer; dereferencing it would be undefined behaviour.
+    if (buffer == nullptr)
+      throw std::runtime_error("Tensor buffer does not exist");
+    size_t size = getShapeSize(shape);
+    std::vector<float> result(size);
+    cl::CommandQueue q = queue == nullptr ? openCL.getDefaultQueue() : *queue;
+    q.enqueueReadBuffer(*buffer, CL_TRUE, 0, size * sizeof(float),
+                        result.data());
+    q.finish();
+    return result;
+  }
+
+  /// Device buffer of this tensor, or nullptr when none has been created.
+  const cl::Buffer *getBuffer() const { return buffer; }
+
+  // Checked downcasts: each returns the tensor as the rank-specific class when
+  // getType() matches and nullptr otherwise.
+  // NOTE(review): the derived classes are only forward-declared here, so these
+  // use reinterpret_cast, which applies no base-offset adjustment. That is
+  // only correct for classes whose Tensor subobject sits at offset 0
+  // (presumably those listing Tensor as their first base) - confirm for every
+  // TensorN.
+  static Tensor0 *asScalar(Tensor *tensor) {
+    return tensor->getType() == Type::SCALAR
+               ? reinterpret_cast<Tensor0 *>(tensor)
+               : nullptr;
+  }
+  static const Tensor0 *asScalar(const Tensor *tensor) {
+    return tensor->getType() == Type::SCALAR
+               ? reinterpret_cast<const Tensor0 *>(tensor)
+               : nullptr;
+  }
+  static Tensor1 *asVector(Tensor *tensor) {
+    return tensor->getType() == Type::VECTOR
+               ? reinterpret_cast<Tensor1 *>(tensor)
+               : nullptr;
+  }
+  static const Tensor1 *asVector(const Tensor *tensor) {
+    return tensor->getType() == Type::VECTOR
+               ? reinterpret_cast<const Tensor1 *>(tensor)
+               : nullptr;
+  }
+  static Tensor2 *asMatrix(Tensor *tensor) {
+    return tensor->getType() == Type::MATRIX
+               ? reinterpret_cast<Tensor2 *>(tensor)
+               : nullptr;
+  }
+  static const Tensor2 *asMatrix(const Tensor *tensor) {
+    return tensor->getType() == Type::MATRIX
+               ? reinterpret_cast<const Tensor2 *>(tensor)
+               : nullptr;
+  }
+  static Tensor3 *asTensor3(Tensor *tensor) {
+    return tensor->getType() == Type::TENSOR3
+               ? reinterpret_cast<Tensor3 *>(tensor)
+               : nullptr;
+  }
+  static const Tensor3 *asTensor3(const Tensor *tensor) {
+    return tensor->getType() == Type::TENSOR3
+               ? reinterpret_cast<const Tensor3 *>(tensor)
+               : nullptr;
+  }
+};
+
+/**
+ * Rank-0 GPU tensor (scalar): an empty shape backed by a one-element buffer.
+ */
+class Tensor0 : public Tensor, public ITensor0 {
+public:
+  /// Creates a scalar with a random value.
+  /// @throws std::invalid_argument if `shape` is not empty.
+  Tensor0(const std::vector<int> &shape,
+          const cl::CommandQueue *queue = nullptr)
+      : Tensor(shape, queue) {
+    if (shape.size() != 0)
+      throw std::invalid_argument("Tensor0 dimension must be 0");
+  }
+
+  /// Creates a scalar holding `value`.
+  /// @throws std::invalid_argument if `shape` is not empty.
+  Tensor0(const std::vector<int> &shape, float value,
+          const cl::CommandQueue *queue = nullptr)
+      : Tensor(shape, value, queue) {
+    if (shape.size() != 0)
+      throw std::invalid_argument("Tensor0 dimension must be 0");
+  }
+
+  /// Creates a scalar with a random value.
+  // The base is built with `fill == false` so that it allocates nothing:
+  // Tensor(shape, queue) already creates a one-element buffer for an empty
+  // shape, and a second createBuf() would then always throw
+  // "Tensor buffer already exists".
+  Tensor0(const cl::CommandQueue *queue = nullptr)
+      : Tensor(std::vector<int>{}, false, queue) {
+    createBuf(1, queue);
+  }
+
+  /// Creates a scalar holding `value`.
+  // Same reasoning as above: the base must not allocate before createBuf().
+  Tensor0(float value, const cl::CommandQueue *queue = nullptr)
+      : Tensor(std::vector<int>{}, false, queue) {
+    createBuf(1, value, queue);
+  }
+
+  Tensor0(const Tensor0 &) = delete;
+  Tensor0 &operator=(const Tensor0 &) = delete;
+  /// Takes over the buffer of `other`.
+  Tensor0(Tensor0 &&other) : Tensor(std::move(other)) {}
+  Tensor0 &operator=(Tensor0 &&other) = delete;
+};
+
+/**
+ * Rank-1 GPU tensor (vector) whose single dimension is its length.
+ */
+class Tensor1 : public Tensor, public ITensor1 {
+private:
+  /// Rejects any shape that does not have exactly one dimension.
+  static void requireRank1(const std::vector<int> &dims) {
+    if (dims.size() != 1)
+      throw std::invalid_argument("Tensor1 dimension must be 1");
+  }
+
+public:
+  /// Creates a vector of the given one-dimensional shape with random values.
+  /// @throws std::invalid_argument if `shape` does not have one dimension.
+  Tensor1(const std::vector<int> &shape,
+          const cl::CommandQueue *queue = nullptr)
+      : Tensor(shape, queue) {
+    requireRank1(shape);
+  }
+
+  /// Creates a vector of the given one-dimensional shape filled with `value`.
+  /// @throws std::invalid_argument if `shape` does not have one dimension.
+  Tensor1(const std::vector<int> &shape, float value,
+          const cl::CommandQueue *queue = nullptr)
+      : Tensor(shape, value, queue) {
+    requireRank1(shape);
+  }
+
+  /// Creates a vector of `size` random values.
+  Tensor1(int size, const cl::CommandQueue *queue = nullptr)
+      : Tensor(std::vector<int>{size}, queue) {}
+
+  /// Creates a vector of `size` elements that are all equal to `value`.
+  Tensor1(int size, float value, const cl::CommandQueue *queue = nullptr)
+      : Tensor(std::vector<int>{size}, value, queue) {}
+
+  /// Creates a vector that holds a copy of `values`.
+  Tensor1(const std::vector<float> &values,
+          const cl::CommandQueue *queue = nullptr)
+      : Tensor(std::vector<int>{static_cast<int>(values.size())}, false,
+               queue) {
+    // The base allocated nothing (fill == false); upload the values now.
+    fillBuf(values, queue);
+  }
+
+  Tensor1(const Tensor1 &) = delete;
+  Tensor1 &operator=(const Tensor1 &) = delete;
+  /// Takes over the buffer of `other`.
+  Tensor1(Tensor1 &&other) : Tensor(std::move(other)) {}
+  Tensor1 &operator=(Tensor1 &&other) = delete;
+
+  /// Length of the vector, i.e. its only dimension.
+  int getSize() const override { return shape[0]; }
+};
+
+/**
+ * Rank-2 GPU tensor (matrix) stored row by row: element (i, j) lives at
+ * index i * cols + j of the buffer.
+ *
+ * Tensor is listed as the first base, like in the other TensorN classes,
+ * because Tensor::asMatrix() converts a Tensor* with reinterpret_cast, which
+ * applies no base-offset adjustment.
+ */
+class Tensor2 : public Tensor, public ITensor2 {
+private:
+  /// Validates a list of rows and returns its {rows, cols} shape.
+  /// @throws std::invalid_argument if there are no rows or the rows differ in
+  ///         length.
+  static std::vector<int>
+  shapeOf(const std::vector<std::vector<float>> &values) {
+    if (values.empty())
+      throw std::invalid_argument("Tensor2 values must not be empty");
+    for (const auto &row : values) {
+      if (row.size() != values[0].size())
+        throw std::invalid_argument("Tensor2 rows must have equal length");
+    }
+    return {static_cast<int>(values.size()),
+            static_cast<int>(values[0].size())};
+  }
+
+public:
+  /// Creates a matrix of the given two-dimensional shape with random values.
+  /// @throws std::invalid_argument if `shape` does not have two dimensions.
+  Tensor2(const std::vector<int> &shape,
+          const cl::CommandQueue *queue = nullptr)
+      : Tensor(shape, queue) {
+    if (shape.size() != 2)
+      throw std::invalid_argument("Tensor2 dimension must be 2");
+  }
+
+  /// Creates a matrix of the given two-dimensional shape filled with `value`.
+  /// @throws std::invalid_argument if `shape` does not have two dimensions.
+  Tensor2(const std::vector<int> &shape, float value,
+          const cl::CommandQueue *queue = nullptr)
+      : Tensor(shape, value, queue) {
+    if (shape.size() != 2)
+      throw std::invalid_argument("Tensor2 dimension must be 2");
+  }
+
+  /// Creates a rows x cols matrix with random values.
+  Tensor2(int rows, int cols, const cl::CommandQueue *queue = nullptr)
+      : Tensor({rows, cols}, queue), ITensor2() {}
+
+  /// Creates a rows x cols matrix filled with `value`.
+  Tensor2(int rows, int cols, float value,
+          const cl::CommandQueue *queue = nullptr)
+      : Tensor({rows, cols}, value, queue), ITensor2() {}
+
+  /// Creates a rows x cols matrix from `values` given in row-major order.
+  /// @throws std::invalid_argument if `values` does not hold rows * cols
+  ///         elements.
+  Tensor2(int rows, int cols, const std::vector<float> &values,
+          const cl::CommandQueue *queue = nullptr)
+      : Tensor({rows, cols}, false, queue) {
+    // The buffer is sized from `values`, so a mismatch would leave the shape
+    // and the buffer disagreeing about the element count.
+    if (values.size() != static_cast<size_t>(rows) * static_cast<size_t>(cols))
+      throw std::invalid_argument("Tensor2 values size must be rows * cols");
+    fillBuf(values, queue);
+  }
+
+  /// Creates a matrix from a list of equally long rows.
+  /// @throws std::invalid_argument if `values` is empty or ragged.
+  Tensor2(const std::vector<std::vector<float>> &values,
+          const cl::CommandQueue *queue = nullptr)
+      : Tensor(shapeOf(values), false, queue) {
+    // Appending the rows one after another yields the row-major layout.
+    std::vector<float> flat;
+    flat.reserve(values.size() * values[0].size());
+    for (const auto &row : values)
+      flat.insert(flat.end(), row.begin(), row.end());
+    fillBuf(flat, queue);
+  }
+
+  Tensor2(const Tensor2 &) = delete;
+  Tensor2 &operator=(const Tensor2 &) = delete;
+  /// Takes over the buffer of `other`.
+  Tensor2(Tensor2 &&other) : Tensor(std::move(other)) {}
+  Tensor2 &operator=(Tensor2 &&other) = delete;
+
+  /// Number of rows (first dimension).
+  int getRows() const override { return shape[0]; }
+  /// Number of columns (second dimension).
+  int getCols() const override { return shape[1]; }
+};
+
+/**
+ * Rank-3 GPU tensor of shape {d1, d2, d3} stored with the last index varying
+ * fastest: element (i, j, k) lives at index i * d2 * d3 + j * d3 + k.
+ */
+class Tensor3 : public Tensor, public ITensor3 {
+private:
+  /// Validates a nested value list and returns its {d1, d2, d3} shape.
+  /// @throws std::invalid_argument if the list or its first plane is empty,
+  ///         or if the planes or rows differ in size.
+  static std::vector<int>
+  shapeOf(const std::vector<std::vector<std::vector<float>>> &values) {
+    if (values.empty() || values[0].empty())
+      throw std::invalid_argument("Tensor3 values must not be empty");
+    for (const auto &plane : values) {
+      if (plane.size() != values[0].size())
+        throw std::invalid_argument("Tensor3 planes must have equal size");
+      for (const auto &row : plane) {
+        if (row.size() != values[0][0].size())
+          throw std::invalid_argument("Tensor3 rows must have equal length");
+      }
+    }
+    return {static_cast<int>(values.size()),
+            static_cast<int>(values[0].size()),
+            static_cast<int>(values[0][0].size())};
+  }
+
+public:
+  /// Creates a tensor of the given three-dimensional shape with random values.
+  /// @throws std::invalid_argument if `shape` does not have three dimensions.
+  Tensor3(const std::vector<int> &shape,
+          const cl::CommandQueue *queue = nullptr)
+      : Tensor(shape, queue) {
+    if (shape.size() != 3)
+      throw std::invalid_argument("Tensor3 dimension must be 3");
+  }
+
+  /// Creates a tensor of the given three-dimensional shape filled with
+  /// `value`.
+  /// @throws std::invalid_argument if `shape` does not have three dimensions.
+  Tensor3(const std::vector<int> &shape, float value,
+          const cl::CommandQueue *queue = nullptr)
+      : Tensor(shape, value, queue) {
+    if (shape.size() != 3)
+      throw std::invalid_argument("Tensor3 dimension must be 3");
+  }
+
+  /// Creates a d1 x d2 x d3 tensor with random values.
+  Tensor3(int d1, int d2, int d3, const cl::CommandQueue *queue = nullptr)
+      : Tensor({d1, d2, d3}, queue) {}
+
+  /// Creates a d1 x d2 x d3 tensor filled with `value`.
+  Tensor3(int d1, int d2, int d3, float value,
+          const cl::CommandQueue *queue = nullptr)
+      : Tensor({d1, d2, d3}, value, queue) {}
+
+  /// Creates a d1 x d2 x d3 tensor from `values` given with the last index
+  /// varying fastest.
+  /// @throws std::invalid_argument if `values` does not hold d1 * d2 * d3
+  ///         elements.
+  Tensor3(int d1, int d2, int d3, const std::vector<float> &values,
+          const cl::CommandQueue *queue = nullptr)
+      : Tensor({d1, d2, d3}, false, queue) {
+    // The buffer is sized from `values`, so a mismatch would leave the shape
+    // and the buffer disagreeing about the element count.
+    if (values.size() != static_cast<size_t>(d1) * static_cast<size_t>(d2) *
+                             static_cast<size_t>(d3))
+      throw std::invalid_argument("Tensor3 values size must be d1 * d2 * d3");
+    fillBuf(values, queue);
+  }
+
+  /// Creates a tensor from nested lists indexed as values[i][j][k].
+  /// @throws std::invalid_argument if `values` is empty or ragged.
+  Tensor3(const std::vector<std::vector<std::vector<float>>> &values,
+          const cl::CommandQueue *queue = nullptr)
+      : Tensor(shapeOf(values), false, queue) {
+    // Appending the innermost rows in order gives index
+    // i * d2 * d3 + j * d3 + k. The previous hand-written index used d2 as
+    // the row stride, which scrambled the data whenever d2 != d3.
+    std::vector<float> flat;
+    flat.reserve(static_cast<size_t>(shape[0]) * shape[1] * shape[2]);
+    for (const auto &plane : values) {
+      for (const auto &row : plane)
+        flat.insert(flat.end(), row.begin(), row.end());
+    }
+    fillBuf(flat, queue);
+  }
+
+  Tensor3(const Tensor3 &) = delete;
+  Tensor3 &operator=(const Tensor3 &) = delete;
+  /// Takes over the buffer of `other`.
+  Tensor3(Tensor3 &&other) : Tensor(std::move(other)) {}
+  Tensor3 &operator=(Tensor3 &&other) = delete;
+};
+
+// Shorter names for the rank-specific tensor classes.
+using Scalar = Tensor0;
+using Vector = Tensor1;
+using Matrix = Tensor2;
+
+} // namespace GPU