New tensor lib

This commit is contained in:
2025-11-09 17:01:44 +04:00
parent bbafbf5574
commit d3ac52b8df
27 changed files with 497 additions and 1746 deletions

9
src/tensor/.clangd Normal file
View File

@@ -0,0 +1,9 @@
# clangd configuration for the tensor sources.
CompileFlags:
  # Flags appended to every translation unit's compile command.
  Add:
    - -std=c++23
    - -Wall
    - -Wextra
    - -Wpedantic
  Remove: []
Diagnostics:
  # Report #include lines whose symbols are never used.
  UnusedIncludes: Strict

39
src/tensor/Makefile Normal file
View File

@@ -0,0 +1,39 @@
CXX = g++
CXXFLAGS = -Wall -Wextra -O1 -g -std=c++23

# ---- Platform detection ----------------------------------------------------
ifeq ($(OS),Windows_NT)
DETECTED_OS := Windows
else
DETECTED_OS := $(shell uname -s)
endif

ifeq ($(DETECTED_OS),Windows)
TARGET = main.exe
MKDIR = powershell -Command "mkdir"
SHARED_LIB_EXT = pyd
else
TARGET = main
MKDIR = mkdir -p
SHARED_LIB_EXT = so
endif

BUILD_DIR = build
COMMON_SRC = tensor.cpp

# ---- Python / pybind11 -----------------------------------------------------
PYTHON_PATH = $(shell python -c "from sysconfig import get_paths; print(get_paths()['data'])")
# Forward slashes work with g++/MinGW on every platform; the previous
# backslashes were interpreted literally and broke the paths on non-Windows.
PYTHON_INCLUDE = $(PYTHON_PATH)/include
PYTHON_LIBS = $(PYTHON_PATH)/libs
# Interpreter import library; override for other versions,
# e.g. `make module PYTHON_LIB=python312`.
PYTHON_LIB ?= python313
PYBIND_INCLUDE = $(shell python -c "import pybind11; print(pybind11.get_include())")

# Linking against libpython is only required for Windows .pyd modules;
# Linux extension modules resolve Python symbols at import time.
ifeq ($(DETECTED_OS),Windows)
PYTHON_LINK = -L"$(PYTHON_LIBS)" -l$(PYTHON_LIB)
else
PYTHON_LINK =
endif

.DEFAULT_GOAL := $(TARGET)

$(BUILD_DIR):
	$(MKDIR) $(BUILD_DIR)

# Native test executable.
$(TARGET): $(COMMON_SRC) main.cpp | $(BUILD_DIR)
	$(CXX) $(CXXFLAGS) -o $@ $^

# Python extension module (tensor.pyd / tensor.so).
module: $(COMMON_SRC) pybind.cpp | $(BUILD_DIR)
	$(CXX) $(CXXFLAGS) -shared -fPIC -o tensor.$(SHARED_LIB_EXT) $^ -I"$(PYTHON_INCLUDE)" -I"$(PYBIND_INCLUDE)" $(PYTHON_LINK)

clean:
	rm -rf $(BUILD_DIR) $(TARGET) *.$(SHARED_LIB_EXT)

# `module` and `clean` produce no file with their own name.
.PHONY: module clean

2
src/tensor/main.cpp Normal file
View File

@@ -0,0 +1,2 @@
// Placeholder host entry point; the actual functionality lives in the
// tensor library (tensor.hpp) and the Python module (pybind.cpp).
int main() { return 0; }

View File

@@ -0,0 +1,144 @@
float activate_x(float x, const int activation_type, const float alpha) {
switch (activation_type) {
case 0: // LINEAR
return x;
case 1: // SIGMOID
return 1.0f / (1.0f + exp(-x));
case 2: // TANH
return tanh(x);
case 3: // RELU
return fmax(0.0f, x);
case 4: // LEAKY_RELU
return (x > 0.0f) ? x : alpha * x;
case 5: // ELU
return (x > 0.0f) ? x : alpha * (exp(x) - 1.0f);
default:
return x;
}
}
// Element-wise activation: each work-item transforms one element of
// `input` into `output` via activate_x. The 1-D global work size must
// equal the number of elements.
__kernel void activate(__global float *input, __global float *output,
                       const int activation_type, const float alpha) {
  int i = get_global_id(0);
  output[i] = activate_x(input[i], activation_type, alpha);
}
// Naive (un-tiled) matrix multiply with fused bias add and activation:
//   C = act(A * B  + bias)   when transpose_B == 0 (B stored K x N)
//   C = act(A * B^T + bias)  when transpose_B != 0 (B stored N x K)
// A is M x K, C is M x N, bias has one entry per output column.
// One work-item computes one C element; intended for matrices small
// enough that the tiled kernel's local-memory staging isn't worth it.
__kernel void mult_small(__global float *A, __global float *B,
                         __global float *C, __global float *bias,
                         const int activation_type, const float alpha,
                         const int M, const int N, const int K,
                         const int transpose_B) {
  const int row = get_global_id(0);
  const int col = get_global_id(1);
  // Guard: the global range may be rounded up past the matrix bounds.
  if (row < M && col < N) {
    float sum = 0.0f;
    for (int k = 0; k < K; k++) {
      float a_val = A[row * K + k];
      float b_val;
      if (transpose_B) {
        b_val = B[col * K + k]; // B^T access: row `col` of the N x K store
      } else {
        b_val = B[k * N + col];
      }
      sum += a_val * b_val;
    }
    float result = sum + bias[col];
    // Type 0 (linear) is the identity, so the call is skipped.
    if (activation_type != 0) {
      result = activate_x(result, activation_type, alpha);
    }
    C[row * N + col] = result;
  }
}
// Tiled matrix multiply with fused bias add and activation (same operand
// layout as mult_small; transpose_B selects B[j*N+i] vs B[i*N+j] access).
// Tiles of A and B are staged in local memory to reduce global traffic.
// NOTE(review): the local tiles are hard-coded to 16x16 while the index
// math uses get_local_size(); a launch with a work-group size other than
// 16x16 would index the tiles out of bounds — confirm the host always
// enqueues 16x16 work-groups.
__kernel void mult(__global float *A, __global float *B, __global float *C,
                   __global float *bias, const int activation_type,
                   const float alpha, const int M, const int N, const int K,
                   const int transpose_B) {
  const int tile_size = 16;
  int local_i = get_local_id(0);
  int local_j = get_local_id(1);
  int local_size_i = get_local_size(0);
  int local_size_j = get_local_size(1);
  int global_i = get_group_id(0) * local_size_i + local_i;
  int global_j = get_group_id(1) * local_size_j + local_j;
  __local float tile_A[16][16];
  __local float tile_B[16][16];
  float sum = 0.0f;
  // Walk the K dimension one tile at a time (last tile may be partial).
  int num_tiles = (K + tile_size - 1) / tile_size;
  for (int tile = 0; tile < num_tiles; tile++) {
    int tile_offset = tile * tile_size;
    // Load tile_A (unchanged by the transpose flag).
    int load_i_A = tile_offset + local_i;
    int load_j_A = tile_offset + local_j;
    if (global_i < M && load_j_A < K) {
      tile_A[local_j][local_i] = A[global_i * K + load_j_A];
    } else {
      tile_A[local_j][local_i] = 0.0f; // zero-pad outside the matrix
    }
    // Load tile_B, honoring the transpose flag.
    int load_i_B = tile_offset + local_i;
    int load_j_B = tile_offset + local_j;
    if (transpose_B) {
      // B is transposed: swap the index roles.
      if (load_i_B < N && global_j < K) {
        tile_B[local_j][local_i] = B[global_j * N + load_i_B];
      } else {
        tile_B[local_j][local_i] = 0.0f;
      }
    } else {
      // B is not transposed (original layout).
      if (load_i_B < K && global_j < N) {
        tile_B[local_j][local_i] = B[load_i_B * N + global_j];
      } else {
        tile_B[local_j][local_i] = 0.0f;
      }
    }
    // All work-items must finish writing the tiles before anyone reads.
    barrier(CLK_LOCAL_MEM_FENCE);
#pragma unroll
    for (int k = 0; k < tile_size; ++k) {
      sum += tile_A[k][local_i] * tile_B[local_j][k];
    }
    // ...and finish reading before the tiles are overwritten next round.
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  if (global_i < M && global_j < N) {
    float result = sum + bias[global_j];
    if (activation_type != 0) {
      result = activate_x(result, activation_type, alpha);
    }
    C[global_i * N + global_j] = result;
  }
}
// Element-wise scalar multiply: B[i] = A[i] * scalar.
__kernel void mult_sc(__global float *A, __global float *B, float scalar) {
  int i = get_global_id(0);
  B[i] = A[i] * scalar;
}
// Scaled element-wise add: C[i] = A[i] + B[i] * x
// (x = 1 gives plain addition, x = -1 gives subtraction).
__kernel void add(__global float *A, __global float *B, __global float *C,
                  float x) {
  int i = get_global_id(0);
  C[i] = A[i] + (B[i] * x);
}
// Element-wise scalar add: B[i] = A[i] + scalar.
__kernel void add_sc(__global float *A, __global float *B, float scalar) {
  int i = get_global_id(0);
  B[i] = A[i] + scalar;
}

View File

@@ -0,0 +1,121 @@
#include "opencl.hpp"
// Reads an entire kernel source file into a string.
// Throws std::runtime_error when the file cannot be opened.
std::string OpenCL::readProgram(const std::string &filePath) {
  std::ifstream source(filePath, std::ios::binary);
  if (!source.is_open())
    throw std::runtime_error("Cannot open file: " + filePath);
  std::ostringstream contents;
  contents << source.rdbuf();
  return contents.str();
}
// Compiles the OpenCL C source in `file` for the selected device.
// On a build failure the device's build log is dumped to stderr and the
// original cl::Error is rethrown to the caller.
cl::Program OpenCL::compileProgram(const std::string &file) {
  std::string source = readProgram(file);
  cl::Program program(context, source);
  try {
    program.build({device});
  } catch (cl::Error &e) {
    std::string build_log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
    std::cerr << "Build log:\n" << build_log << std::endl;
    throw;
  }
  return program;
}
// Builds every kernel file listed in programPaths and caches the compiled
// cl::Program objects in `programs`, keyed by the Program enum.
void OpenCL::loadPrograms() {
  for (const auto &entry : programPaths) {
    programs[entry.first] = compileProgram(entry.second);
    std::cout << "Loaded program: " << entry.second << std::endl;
  }
}
void OpenCL::initializeDevice() {
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if (platforms.empty()) {
throw std::runtime_error("No OpenCL platforms found");
}
std::vector<cl::Device> devices;
bool deviceFound = false;
for (const auto &platform : platforms) {
try {
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
if (!devices.empty()) {
deviceFound = true;
break;
}
} catch (const cl::Error &) {
continue;
}
}
if (!deviceFound) {
for (const auto &platform : platforms) {
try {
platform.getDevices(CL_DEVICE_TYPE_CPU, &devices);
if (!devices.empty()) {
deviceFound = true;
break;
}
} catch (const cl::Error &) {
continue;
}
}
}
if (!deviceFound) {
throw std::runtime_error("No suitable OpenCL devices found");
}
device = devices[0];
context = cl::Context(device);
queue = cl::CommandQueue(context, device);
std::cout << "Using device: " << device.getInfo<CL_DEVICE_NAME>()
<< "\nPlatform: " << platforms[0].getInfo<CL_PLATFORM_NAME>()
<< "\nCompute units: "
<< device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()
<< "\nGlobal memory: "
<< device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / (1024 * 1024)
<< " MB" << std::endl;
}
// Selects a device, creates the context/queue and pre-compiles all known
// kernel programs. Any cl::Error is logged with its numeric OpenCL error
// code and rethrown, so construction either fully succeeds or fails.
OpenCL::OpenCL() {
  try {
    initializeDevice();
    loadPrograms();
  } catch (const cl::Error &e) {
    std::cerr << "OpenCL error: " << e.what() << " (" << e.err() << ")"
              << std::endl;
    throw;
  }
}
// Looks up a previously compiled program by its enum key.
// Throws std::invalid_argument when the program was never loaded.
cl::Program &OpenCL::getProgram(Program program) {
  const auto found = programs.find(program);
  if (found == programs.end()) {
    throw std::invalid_argument("Program not loaded: " +
                                std::to_string(static_cast<int>(program)));
  }
  return found->second;
}
// Prints a human-readable summary of the selected device to stdout.
void OpenCL::printDeviceInfo() const {
  std::cout << "=== OpenCL Device Info ===" << std::endl
            << "Name: " << device.getInfo<CL_DEVICE_NAME>() << std::endl
            << "Vendor: " << device.getInfo<CL_DEVICE_VENDOR>() << std::endl
            << "Version: " << device.getInfo<CL_DEVICE_VERSION>() << std::endl
            << "Compute Units: "
            << device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << std::endl
            << "Global Memory: "
            << device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / (1024 * 1024)
            << " MB" << std::endl
            << "Local Memory: "
            << device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() / 1024 << " KB"
            << std::endl
            << "Max Work Group Size: "
            << device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() << std::endl;
}

View File

@@ -0,0 +1,47 @@
#pragma once
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 300
#include <CL/opencl.hpp>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <unordered_map>
// Owns a single OpenCL device with its context and command queue, plus a
// cache of pre-compiled kernel programs. Non-copyable and non-movable:
// exactly one object owns the device setup for its lifetime.
class OpenCL {
public:
  // Identifiers for the kernel source files this wrapper knows about.
  enum class Program { TENSOR };

private:
  cl::Device device;
  cl::Context context;
  cl::CommandQueue queue;
  // Compiled programs, populated by loadPrograms() during construction.
  std::unordered_map<Program, cl::Program> programs;
  // Kernel source locations, resolved relative to the working directory.
  std::unordered_map<Program, std::string> programPaths = {
      {Program::TENSOR, "./opencl/kernels/tensor.cl"}};

  // Reads a kernel source file into a string (throws on open failure).
  std::string readProgram(const std::string &filePath);
  // Builds `file` for the selected device; dumps the build log on error.
  cl::Program compileProgram(const std::string &file);
  // Compiles every entry of programPaths into `programs`.
  void loadPrograms();
  // Picks the first GPU (falling back to CPU) and creates context/queue.
  void initializeDevice();

public:
  OpenCL();
  OpenCL(const OpenCL &) = delete;
  OpenCL &operator=(const OpenCL &) = delete;
  OpenCL(OpenCL &&) = delete;
  OpenCL &operator=(OpenCL &&) = delete;

  cl::Device &getDevice() { return device; }
  cl::Context &getContext() { return context; }
  const cl::CommandQueue &getQueue() { return queue; }
  // Returns the compiled program for `program`; throws
  // std::invalid_argument when it was never loaded.
  cl::Program &getProgram(Program program);
  void printDeviceInfo() const;
};

102
src/tensor/pybind.cpp Normal file
View File

@@ -0,0 +1,102 @@
#include <pybind11/operators.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include "tensor.hpp"
namespace py = pybind11;
// Registers Tensor<T, Dim> with pybind11 under the Python name `name`,
// binding the constructors, indexed element access, the arithmetic
// operators, and — where the rank permits — matmul and transposition.
template <typename T, int Dim>
void register_tensor(py::module &m, const std::string &name) {
  auto tensor = py::class_<Tensor<T, Dim>>(m, name.c_str())
                    // shape / shape+fill / shape+data / shape+random[min,max]
                    .def(py::init<const std::array<size_t, Dim> &>())
                    .def(py::init<const std::array<size_t, Dim> &, T>())
                    .def(py::init<const std::array<size_t, Dim> &,
                                  const std::vector<T> &>())
                    .def(py::init<const std::array<size_t, Dim> &, T, T>())
                    .def("get_shape", &Tensor<T, Dim>::getShape)
                    .def("get_data", &Tensor<T, Dim>::getData)
                    .def("get_size", &Tensor<T, Dim>::getSize)
                    .def("get_axes", &Tensor<T, Dim>::getAxes)
                    // Flat (storage-order) element access with bounds checks
                    // converted to Python IndexError.
                    .def("__getitem__",
                         [](const Tensor<T, Dim> &t, size_t i) -> T {
                           if (i >= t.getSize())
                             throw py::index_error();
                           return t[i];
                         })
                    .def("__setitem__",
                         [](Tensor<T, Dim> &t, size_t i, T value) {
                           if (i >= t.getSize())
                             throw py::index_error();
                           t[i] = value;
                         })
                    // .def("__call__",
                    //      [](Tensor<T, Dim> &t, py::args args) -> T & {
                    //
                    //      })
                    // Tensor-tensor element-wise operators.
                    .def(py::self + py::self)
                    .def(py::self - py::self)
                    .def(py::self * py::self)
                    .def(py::self += py::self)
                    .def(py::self -= py::self)
                    .def(py::self *= py::self)
                    // Tensor-scalar and scalar-tensor operators.
                    .def(py::self + T())
                    .def(py::self - T())
                    .def(py::self * T())
                    .def(py::self / T())
                    .def(T() + py::self)
                    .def(T() - py::self)
                    .def(T() * py::self)
                    .def(py::self += T())
                    .def(py::self -= T())
                    .def(py::self *= T())
                    .def(py::self /= T())
                    .def("__pos__", [](const Tensor<T, Dim> &t) { return +t; })
                    .def("__neg__", [](const Tensor<T, Dim> &t) { return -t; })
                    .def("print", &Tensor<T, Dim>::print);
  // operator% implements dot/matrix product; exposed as Python's `@`.
  if constexpr (Dim == 1 || Dim == 2)
    tensor.def("__matmul__", &Tensor<T, Dim>::operator%);
  // Axis permutation only makes sense for rank >= 2.
  if constexpr (Dim >= 2) {
    tensor
        .def("transpose", py::overload_cast<const std::array<int, Dim> &>(
                              &Tensor<T, Dim>::transpose))
        .def("transpose",
             py::overload_cast<int, int>(&Tensor<T, Dim>::transpose))
        .def("t", &Tensor<T, Dim>::t);
  }
}
// Python module definition: exposes Tensor<T, Dim> for float, double and
// int at ranks 0-5. Unprefixed names are float; `d` = double, `i` = int.
PYBIND11_MODULE(tensor, m) {
  m.doc() = "Tensor math library";
  register_tensor<float, 0>(m, "Scalar");
  register_tensor<float, 1>(m, "Vector");
  register_tensor<float, 2>(m, "Matrix");
  register_tensor<float, 3>(m, "Tensor3");
  register_tensor<float, 4>(m, "Tensor4");
  register_tensor<float, 5>(m, "Tensor5");
  register_tensor<double, 0>(m, "dScalar");
  register_tensor<double, 1>(m, "dVector");
  register_tensor<double, 2>(m, "dMatrix");
  register_tensor<double, 3>(m, "dTensor3");
  register_tensor<double, 4>(m, "dTensor4");
  register_tensor<double, 5>(m, "dTensor5");
  register_tensor<int, 0>(m, "iScalar");
  register_tensor<int, 1>(m, "iVector");
  register_tensor<int, 2>(m, "iMatrix");
  register_tensor<int, 3>(m, "iTensor3");
  register_tensor<int, 4>(m, "iTensor4");
  register_tensor<int, 5>(m, "iTensor5");
}

1
src/tensor/tensor.cpp Normal file
View File

@@ -0,0 +1 @@
#include "tensor.hpp"

338
src/tensor/tensor.hpp Normal file
View File

@@ -0,0 +1,338 @@
#pragma once

#include <algorithm> // std::fill, std::min, std::swap
#include <array>
#include <cstddef> // size_t
#include <iostream>
#include <random>
#include <stdexcept>
#include <type_traits>
#include <typeinfo> // typeid in print()
#include <vector>
// Dense N-dimensional tensor with row-major storage and lazy transposition:
// transpose()/t() only permute the `axes_` view; the `data_` buffer itself
// is never reshuffled.
template <typename T, int Dim> class Tensor {
private:
  std::array<size_t, Dim> shape_; // storage shape (as constructed)
  std::array<int, Dim> axes_;     // logical axis -> storage axis permutation
  std::vector<T> data_;           // row-major element buffer

  // Maps logical indices (in the possibly-transposed view) to the flat
  // offset in data_, using row-major strides over the storage shape.
  template <typename... Indices> size_t computeIndex(Indices... indices) const {
    static_assert(sizeof...(Indices) == Dim, "Invalid number of indices");
    std::array<size_t, Dim> indicesArray = {static_cast<size_t>(indices)...};
    std::array<size_t, Dim> axesIndices;
    // Route each logical index to the storage axis it currently views.
    for (int i = 0; i < Dim; ++i)
      axesIndices[axes_[i]] = indicesArray[i];
    size_t index = 0;
    size_t stride = 1;
    for (int i = Dim - 1; i >= 0; --i) {
      index += axesIndices[i] * stride;
      stride *= shape_[i];
    }
    return index;
  }
  // Throws unless `other` has the same logical (axes-adjusted) shape.
  void checkItHasSameShape(const Tensor &other) {
    if (getShape() != other.getShape())
      throw std::invalid_argument("Tensor shapes must match");
  }
  // Throws unless `axis` is a valid axis index for this rank.
  void checkAxisInDim(int axis) {
    if (axis < 0 || axis >= Dim)
      throw std::invalid_argument("Invalid axis index");
  }

public:
  Tensor() = delete;
  // Allocates a tensor of `shape` with value-initialized elements.
  // Throws std::invalid_argument if any dimension is zero.
  Tensor(const std::array<size_t, Dim> &shape) {
    for (size_t d : shape)
      if (d == 0)
        throw std::invalid_argument("Invalid shape");
    shape_ = shape;
    // Identity permutation: logical axes == storage axes.
    for (int i = 0; i < Dim; ++i)
      axes_[i] = i;
    size_t total_size = 1;
    for (size_t dim : shape)
      total_size *= dim;
    data_.resize(total_size);
  }
  // Allocates and fills every element with `fill`.
  Tensor(const std::array<size_t, Dim> &shape, T fill) : Tensor(shape) {
    std::fill(data_.begin(), data_.end(), fill);
  }
  // Allocates and copies `data` (must match the element count exactly).
  Tensor(const std::array<size_t, Dim> &shape, const std::vector<T> &data)
      : Tensor(shape) {
    if (data.size() != data_.size())
      throw std::invalid_argument("Invalid data size");
    data_ = data;
  }
  // Allocates and fills with uniform random values in [min, max].
  // NOTE: the engine is `static`, i.e. shared by every Tensor in the
  // process and seeded once.
  Tensor(const std::array<size_t, Dim> &shape, T min, T max) : Tensor(shape) {
    static std::random_device rd;
    static std::mt19937 gen(rd());
    if constexpr (std::is_integral_v<T>) {
      std::uniform_int_distribution<T> dis(min, max);
      for (auto &element : data_)
        element = dis(gen);
    } else if constexpr (std::is_floating_point_v<T>) {
      std::uniform_real_distribution<T> dis(min, max);
      for (auto &element : data_)
        element = dis(gen);
    } else
      throw std::invalid_argument("Invalid randomized type");
  }
  Tensor(const Tensor &other)
      : shape_(other.shape_), axes_(other.axes_), data_(other.data_) {}
  Tensor &operator=(const Tensor &other) {
    shape_ = other.shape_;
    axes_ = other.axes_;
    data_ = other.data_;
    return *this;
  }
  Tensor(Tensor &&other) noexcept
      : shape_(std::move(other.shape_)), axes_(std::move(other.axes_)),
        data_(std::move(other.data_)) {}
  Tensor &operator=(Tensor &&other) noexcept {
    shape_ = std::move(other.shape_);
    axes_ = std::move(other.axes_);
    data_ = std::move(other.data_);
    return *this;
  }
  ~Tensor() = default;
  const std::array<int, Dim> &getAxes() const { return axes_; }
  // Raw storage-order elements (ignores any transposition).
  const std::vector<T> &getData() const { return data_; }
  size_t getSize() const { return data_.size(); }
  // Logical shape: storage shape viewed through the axes permutation.
  const std::array<size_t, Dim> getShape() const {
    std::array<size_t, Dim> result;
    for (int i = 0; i < Dim; ++i)
      result[i] = shape_[axes_[i]];
    return result;
  }
  // Flat, unchecked storage-order access.
  T &operator[](size_t i) { return data_[i]; }
  const T &operator[](size_t i) const { return data_[i]; }
  // Multi-index access in the logical (transposed) view.
  template <typename... Indices> T &operator()(Indices... indices) {
    return data_[computeIndex(indices...)];
  }
  template <typename... Indices> const T &operator()(Indices... indices) const {
    return data_[computeIndex(indices...)];
  }
  // Reorders the axes view to `new_axes` (must be a permutation of 0..Dim-1).
  // O(1): no data is moved.
  Tensor &transpose(const std::array<int, Dim> &new_axes) {
    std::array<bool, Dim> used{};
    for (int axis : new_axes) {
      checkAxisInDim(axis);
      if (used[axis])
        throw std::invalid_argument("Duplicate axis index");
      used[axis] = true;
    }
    axes_ = new_axes;
    return *this;
  }
  // Swaps two (distinct) axes of the view in place.
  Tensor &transpose(int axis_a, int axis_b) {
    checkAxisInDim(axis_a);
    checkAxisInDim(axis_b);
    if (axis_a == axis_b)
      throw std::invalid_argument("Duplicate axis index");
    std::swap(axes_[axis_a], axes_[axis_b]);
    return *this;
  }
  // Matrix-style transpose: swaps the two innermost axes.
  Tensor &t() {
    static_assert(Dim >= 2, "Can't change the only axis");
    std::swap(axes_[Dim - 1], axes_[Dim - 2]);
    return *this;
  }
  Tensor operator+() const { return *this; }
  // Element-wise negation (returns a copy).
  Tensor operator-() const {
    Tensor result = *this;
    for (T &e : result.data_)
      e = -e;
    return result;
  }
  // --- scalar arithmetic (applied to every element) ---
  Tensor &operator+=(const T &scalar) {
    for (T &e : data_)
      e += scalar;
    return *this;
  }
  Tensor operator+(const T &scalar) const {
    Tensor result = *this;
    result += scalar;
    return result;
  }
  friend Tensor operator+(const T &scalar, const Tensor &tensor) {
    return tensor + scalar;
  }
  Tensor &operator-=(const T &scalar) {
    for (T &e : data_)
      e -= scalar;
    return *this;
  }
  Tensor operator-(const T &scalar) const {
    Tensor result = *this;
    result -= scalar;
    return result;
  }
  // scalar - tensor: element-wise `scalar - e` (not commutative).
  friend Tensor operator-(const T &scalar, const Tensor &tensor) {
    Tensor result = tensor;
    for (T &e : result.data_)
      e = scalar - e;
    return result;
  }
  Tensor &operator*=(const T &scalar) {
    for (T &e : data_)
      e *= scalar;
    return *this;
  }
  Tensor operator*(const T &scalar) const {
    Tensor result = *this;
    result *= scalar;
    return result;
  }
  friend Tensor operator*(const T &scalar, const Tensor &tensor) {
    return tensor * scalar;
  }
  // Division guards against a zero scalar (exact comparison; for floating
  // T only a literal 0 divisor is rejected).
  Tensor &operator/=(const T &scalar) {
    if (scalar == T(0))
      throw std::invalid_argument("Division by zero");
    for (T &e : data_)
      e /= scalar;
    return *this;
  }
  Tensor operator/(const T &scalar) const {
    Tensor result = *this;
    result /= scalar;
    return result;
  }
  // --- element-wise tensor arithmetic ---
  // NOTE(review): these combine the raw buffers in storage order. Two
  // operands whose axes_ permutations differ (e.g. one transposed) can pass
  // the shape check yet pair elements at different logical positions —
  // confirm this is the intended semantics.
  Tensor &operator+=(const Tensor &other) {
    checkItHasSameShape(other);
    for (size_t i = 0; i < data_.size(); ++i)
      data_[i] += other.data_[i];
    return *this;
  }
  Tensor operator+(const Tensor &other) const {
    Tensor result = *this;
    result += other;
    return result;
  }
  Tensor &operator-=(const Tensor &other) {
    checkItHasSameShape(other);
    for (size_t i = 0; i < data_.size(); ++i)
      data_[i] -= other.data_[i];
    return *this;
  }
  Tensor operator-(const Tensor &other) const {
    Tensor result = *this;
    result -= other;
    return result;
  }
  // Hadamard (element-wise) product, not matrix multiplication.
  Tensor &operator*=(const Tensor &other) {
    checkItHasSameShape(other);
    for (size_t i = 0; i < data_.size(); ++i)
      data_[i] *= other.data_[i];
    return *this;
  }
  Tensor operator*(const Tensor &other) const {
    Tensor result = *this;
    result *= other;
    return result;
  }
  // Inner product: vector . vector -> rank-0 tensor; matrix % matrix ->
  // matrix product. The rank-2 path goes through operator(), so each
  // operand's transposition state is honored.
  Tensor<T, Dim == 1 ? 0 : 2> operator%(const Tensor &other) const {
    static_assert(Dim == 1 || Dim == 2,
                  "Inner product is only defined for vectors and matrices");
    if constexpr (Dim == 1) {
      if (data_.size() != other.data_.size())
        throw std::invalid_argument(
            "Vector sizes must match for inner product");
      T result_val = T(0);
      for (size_t i = 0; i < data_.size(); ++i)
        result_val += data_[i] * other.data_[i];
      return Tensor<T, 0>({}, {result_val});
    } else if constexpr (Dim == 2) {
      // Logical dims: this is m x n, other must be n x p.
      if (shape_[axes_[1]] != other.shape_[other.axes_[0]])
        throw std::invalid_argument(
            "Matrix dimensions must match for multiplication");
      size_t m = shape_[axes_[0]];
      size_t n = shape_[axes_[1]];
      size_t p = other.shape_[other.axes_[1]];
      Tensor<T, 2> result({m, p}, T(0));
      for (size_t i = 0; i < m; ++i) {
        for (size_t j = 0; j < p; ++j) {
          T sum = T(0);
          for (size_t k = 0; k < n; ++k)
            sum += (*this)(i, k) * other(k, j);
          result(i, j) = sum;
        }
      }
      return result;
    }
  }
  // Pretty-prints to stdout. Rank 0/1/2 are shown in full (rank 2 honors
  // transposition); higher ranks show the shape and the first 10 raw
  // storage-order elements.
  void print() const {
    if constexpr (Dim == 0) {
      std::cout << "Scalar<" << typeid(T).name() << ">: " << data_[0]
                << std::endl;
    } else if constexpr (Dim == 1) {
      std::cout << "Vector<" << typeid(T).name() << ">(" << shape_[0] << "): [";
      for (size_t i = 0; i < data_.size(); ++i) {
        std::cout << data_[i];
        if (i < data_.size() - 1)
          std::cout << ", ";
      }
      std::cout << "]" << std::endl;
    } else if constexpr (Dim == 2) {
      std::cout << "Matrix<" << typeid(T).name() << ">(" << shape_[axes_[0]]
                << "x" << shape_[axes_[1]] << "):" << std::endl;
      for (size_t i = 0; i < shape_[axes_[0]]; ++i) {
        std::cout << " [";
        for (size_t j = 0; j < shape_[axes_[1]]; ++j) {
          std::cout << (*this)(i, j);
          if (j < shape_[axes_[1]] - 1)
            std::cout << ", ";
        }
        std::cout << "]" << std::endl;
      }
    } else {
      std::cout << "Tensor" << Dim << "D<" << typeid(T).name() << ">" << "[";
      for (size_t i = 0; i < Dim; ++i) {
        std::cout << shape_[axes_[i]];
        if (i < Dim - 1)
          std::cout << "x";
      }
      std::cout << "]: [";
      size_t show = std::min(data_.size(), size_t(10));
      for (size_t i = 0; i < show; ++i) {
        std::cout << data_[i];
        if (i < show - 1)
          std::cout << ", ";
      }
      if (data_.size() > 10)
        std::cout << ", ...";
      std::cout << "]" << std::endl;
    }
  }
};
// Convenience aliases for the common ranks.
template <typename T> using Scalar = Tensor<T, 0>;
template <typename T> using Vector = Tensor<T, 1>;
template <typename T> using Matrix = Tensor<T, 2>;

// Static factory helpers; the tensor rank is deduced from the number of
// dimension arguments, e.g. Tensors::zero<float>(2, 3) -> Tensor<float, 2>.
class Tensors {
  Tensors() = delete; // static-only: not instantiable

public:
  // Tensor of the given dimensions. Note: elements are value-initialized
  // by the shape constructor, so "empty" is zeroed, not uninitialized.
  template <typename T, typename... Args> static auto empty(Args... args) {
    return Tensor<T, sizeof...(Args)>({static_cast<size_t>(args)...});
  }
  // Tensor explicitly filled with T(0).
  template <typename T, typename... Args> static auto zero(Args... args) {
    return Tensor<T, sizeof...(Args)>({static_cast<size_t>(args)...}, T(0));
  }
  // Tensor with elements drawn uniformly from [T(0), T(1)].
  template <typename T, typename... Args> static auto rand(Args... args) {
    return Tensor<T, sizeof...(Args)>({static_cast<size_t>(args)...}, T(0),
                                      T(1));
  }
};