New tensor lib

This commit is contained in:
2025-11-09 17:01:44 +04:00
parent bbafbf5574
commit d3ac52b8df
27 changed files with 497 additions and 1746 deletions

9
src/tensor/.clangd Normal file
View File

@@ -0,0 +1,9 @@
# clangd configuration for the tensor sources.
CompileFlags:
  # Flags appended to every translation unit's compile command.
  Add:
    - -std=c++23
    - -Wall
    - -Wextra
    - -Wpedantic
  Remove: []
Diagnostics:
  # Report #include lines whose symbols are never used.
  UnusedIncludes: Strict

39
src/tensor/Makefile Normal file
View File

@@ -0,0 +1,39 @@
CXX = g++
CXXFLAGS = -Wall -Wextra -O1 -g -std=c++23

# ---- Platform detection ----------------------------------------------------
ifeq ($(OS),Windows_NT)
DETECTED_OS := Windows
else
DETECTED_OS := $(shell uname -s)
endif

ifeq ($(DETECTED_OS),Windows)
TARGET = main.exe
MKDIR = powershell -Command "mkdir"
SHARED_LIB_EXT = pyd
else
TARGET = main
MKDIR = mkdir -p
SHARED_LIB_EXT = so
endif

BUILD_DIR = build
COMMON_SRC = tensor.cpp

# ---- Python / pybind11 -----------------------------------------------------
PYTHON_PATH = $(shell python -c "from sysconfig import get_paths; print(get_paths()['data'])")
# Forward slashes work with g++/MinGW on every platform; the previous
# backslashes were interpreted literally and broke the paths on non-Windows.
PYTHON_INCLUDE = $(PYTHON_PATH)/include
PYTHON_LIBS = $(PYTHON_PATH)/libs
# Interpreter import library; override for other versions,
# e.g. `make module PYTHON_LIB=python312`.
PYTHON_LIB ?= python313
PYBIND_INCLUDE = $(shell python -c "import pybind11; print(pybind11.get_include())")

# Linking against libpython is only required for Windows .pyd modules;
# Linux extension modules resolve Python symbols at import time.
ifeq ($(DETECTED_OS),Windows)
PYTHON_LINK = -L"$(PYTHON_LIBS)" -l$(PYTHON_LIB)
else
PYTHON_LINK =
endif

.DEFAULT_GOAL := $(TARGET)

$(BUILD_DIR):
	$(MKDIR) $(BUILD_DIR)

# Native test executable.
$(TARGET): $(COMMON_SRC) main.cpp | $(BUILD_DIR)
	$(CXX) $(CXXFLAGS) -o $@ $^

# Python extension module (tensor.pyd / tensor.so).
module: $(COMMON_SRC) pybind.cpp | $(BUILD_DIR)
	$(CXX) $(CXXFLAGS) -shared -fPIC -o tensor.$(SHARED_LIB_EXT) $^ -I"$(PYTHON_INCLUDE)" -I"$(PYBIND_INCLUDE)" $(PYTHON_LINK)

clean:
	rm -rf $(BUILD_DIR) $(TARGET) *.$(SHARED_LIB_EXT)

# `module` and `clean` produce no file with their own name.
.PHONY: module clean

2
src/tensor/main.cpp Normal file
View File

@@ -0,0 +1,2 @@
// Placeholder host entry point; the actual functionality lives in the
// tensor library (tensor.hpp) and the Python module (pybind.cpp).
int main() { return 0; }

View File

@@ -0,0 +1,144 @@
float activate_x(float x, const int activation_type, const float alpha) {
switch (activation_type) {
case 0: // LINEAR
return x;
case 1: // SIGMOID
return 1.0f / (1.0f + exp(-x));
case 2: // TANH
return tanh(x);
case 3: // RELU
return fmax(0.0f, x);
case 4: // LEAKY_RELU
return (x > 0.0f) ? x : alpha * x;
case 5: // ELU
return (x > 0.0f) ? x : alpha * (exp(x) - 1.0f);
default:
return x;
}
}
// Element-wise activation: each work-item transforms one element of
// `input` into `output` via activate_x. The 1-D global work size must
// equal the number of elements.
__kernel void activate(__global float *input, __global float *output,
                       const int activation_type, const float alpha) {
  int i = get_global_id(0);
  output[i] = activate_x(input[i], activation_type, alpha);
}
// Naive (un-tiled) matrix multiply with fused bias add and activation:
//   C = act(A * B  + bias)   when transpose_B == 0 (B stored K x N)
//   C = act(A * B^T + bias)  when transpose_B != 0 (B stored N x K)
// A is M x K, C is M x N, bias has one entry per output column.
// One work-item computes one C element; intended for matrices small
// enough that the tiled kernel's local-memory staging isn't worth it.
__kernel void mult_small(__global float *A, __global float *B,
                         __global float *C, __global float *bias,
                         const int activation_type, const float alpha,
                         const int M, const int N, const int K,
                         const int transpose_B) {
  const int row = get_global_id(0);
  const int col = get_global_id(1);
  // Guard: the global range may be rounded up past the matrix bounds.
  if (row < M && col < N) {
    float sum = 0.0f;
    for (int k = 0; k < K; k++) {
      float a_val = A[row * K + k];
      float b_val;
      if (transpose_B) {
        b_val = B[col * K + k]; // B^T access: row `col` of the N x K store
      } else {
        b_val = B[k * N + col];
      }
      sum += a_val * b_val;
    }
    float result = sum + bias[col];
    // Type 0 (linear) is the identity, so the call is skipped.
    if (activation_type != 0) {
      result = activate_x(result, activation_type, alpha);
    }
    C[row * N + col] = result;
  }
}
// Tiled matrix multiply with fused bias add and activation (same operand
// layout as mult_small; transpose_B selects B[j*N+i] vs B[i*N+j] access).
// Tiles of A and B are staged in local memory to reduce global traffic.
// NOTE(review): the local tiles are hard-coded to 16x16 while the index
// math uses get_local_size(); a launch with a work-group size other than
// 16x16 would index the tiles out of bounds — confirm the host always
// enqueues 16x16 work-groups.
__kernel void mult(__global float *A, __global float *B, __global float *C,
                   __global float *bias, const int activation_type,
                   const float alpha, const int M, const int N, const int K,
                   const int transpose_B) {
  const int tile_size = 16;
  int local_i = get_local_id(0);
  int local_j = get_local_id(1);
  int local_size_i = get_local_size(0);
  int local_size_j = get_local_size(1);
  int global_i = get_group_id(0) * local_size_i + local_i;
  int global_j = get_group_id(1) * local_size_j + local_j;
  __local float tile_A[16][16];
  __local float tile_B[16][16];
  float sum = 0.0f;
  // Walk the K dimension one tile at a time (last tile may be partial).
  int num_tiles = (K + tile_size - 1) / tile_size;
  for (int tile = 0; tile < num_tiles; tile++) {
    int tile_offset = tile * tile_size;
    // Load tile_A (unchanged by the transpose flag).
    int load_i_A = tile_offset + local_i;
    int load_j_A = tile_offset + local_j;
    if (global_i < M && load_j_A < K) {
      tile_A[local_j][local_i] = A[global_i * K + load_j_A];
    } else {
      tile_A[local_j][local_i] = 0.0f; // zero-pad outside the matrix
    }
    // Load tile_B, honoring the transpose flag.
    int load_i_B = tile_offset + local_i;
    int load_j_B = tile_offset + local_j;
    if (transpose_B) {
      // B is transposed: swap the index roles.
      if (load_i_B < N && global_j < K) {
        tile_B[local_j][local_i] = B[global_j * N + load_i_B];
      } else {
        tile_B[local_j][local_i] = 0.0f;
      }
    } else {
      // B is not transposed (original layout).
      if (load_i_B < K && global_j < N) {
        tile_B[local_j][local_i] = B[load_i_B * N + global_j];
      } else {
        tile_B[local_j][local_i] = 0.0f;
      }
    }
    // All work-items must finish writing the tiles before anyone reads.
    barrier(CLK_LOCAL_MEM_FENCE);
#pragma unroll
    for (int k = 0; k < tile_size; ++k) {
      sum += tile_A[k][local_i] * tile_B[local_j][k];
    }
    // ...and finish reading before the tiles are overwritten next round.
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  if (global_i < M && global_j < N) {
    float result = sum + bias[global_j];
    if (activation_type != 0) {
      result = activate_x(result, activation_type, alpha);
    }
    C[global_i * N + global_j] = result;
  }
}
// Element-wise scalar multiply: B[i] = A[i] * scalar.
__kernel void mult_sc(__global float *A, __global float *B, float scalar) {
  int i = get_global_id(0);
  B[i] = A[i] * scalar;
}
// Scaled element-wise add: C[i] = A[i] + B[i] * x
// (x = 1 gives plain addition, x = -1 gives subtraction).
__kernel void add(__global float *A, __global float *B, __global float *C,
                  float x) {
  int i = get_global_id(0);
  C[i] = A[i] + (B[i] * x);
}
// Element-wise scalar add: B[i] = A[i] + scalar.
__kernel void add_sc(__global float *A, __global float *B, float scalar) {
  int i = get_global_id(0);
  B[i] = A[i] + scalar;
}

View File

@@ -0,0 +1,121 @@
#include "opencl.hpp"
// Reads an entire kernel source file into a string.
// Throws std::runtime_error when the file cannot be opened.
std::string OpenCL::readProgram(const std::string &filePath) {
  std::ifstream source(filePath, std::ios::binary);
  if (!source.is_open())
    throw std::runtime_error("Cannot open file: " + filePath);
  std::ostringstream contents;
  contents << source.rdbuf();
  return contents.str();
}
// Compiles the OpenCL C source in `file` for the selected device.
// On a build failure the device's build log is dumped to stderr and the
// original cl::Error is rethrown to the caller.
cl::Program OpenCL::compileProgram(const std::string &file) {
  std::string source = readProgram(file);
  cl::Program program(context, source);
  try {
    program.build({device});
  } catch (cl::Error &e) {
    std::string build_log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
    std::cerr << "Build log:\n" << build_log << std::endl;
    throw;
  }
  return program;
}
// Builds every kernel file listed in programPaths and caches the compiled
// cl::Program objects in `programs`, keyed by the Program enum.
void OpenCL::loadPrograms() {
  for (const auto &entry : programPaths) {
    programs[entry.first] = compileProgram(entry.second);
    std::cout << "Loaded program: " << entry.second << std::endl;
  }
}
void OpenCL::initializeDevice() {
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if (platforms.empty()) {
throw std::runtime_error("No OpenCL platforms found");
}
std::vector<cl::Device> devices;
bool deviceFound = false;
for (const auto &platform : platforms) {
try {
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
if (!devices.empty()) {
deviceFound = true;
break;
}
} catch (const cl::Error &) {
continue;
}
}
if (!deviceFound) {
for (const auto &platform : platforms) {
try {
platform.getDevices(CL_DEVICE_TYPE_CPU, &devices);
if (!devices.empty()) {
deviceFound = true;
break;
}
} catch (const cl::Error &) {
continue;
}
}
}
if (!deviceFound) {
throw std::runtime_error("No suitable OpenCL devices found");
}
device = devices[0];
context = cl::Context(device);
queue = cl::CommandQueue(context, device);
std::cout << "Using device: " << device.getInfo<CL_DEVICE_NAME>()
<< "\nPlatform: " << platforms[0].getInfo<CL_PLATFORM_NAME>()
<< "\nCompute units: "
<< device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()
<< "\nGlobal memory: "
<< device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / (1024 * 1024)
<< " MB" << std::endl;
}
// Selects a device, creates the context/queue and pre-compiles all known
// kernel programs. Any cl::Error is logged with its numeric OpenCL error
// code and rethrown, so construction either fully succeeds or fails.
OpenCL::OpenCL() {
  try {
    initializeDevice();
    loadPrograms();
  } catch (const cl::Error &e) {
    std::cerr << "OpenCL error: " << e.what() << " (" << e.err() << ")"
              << std::endl;
    throw;
  }
}
// Looks up a previously compiled program by its enum key.
// Throws std::invalid_argument when the program was never loaded.
cl::Program &OpenCL::getProgram(Program program) {
  const auto found = programs.find(program);
  if (found == programs.end()) {
    throw std::invalid_argument("Program not loaded: " +
                                std::to_string(static_cast<int>(program)));
  }
  return found->second;
}
// Prints a human-readable summary of the selected device to stdout.
void OpenCL::printDeviceInfo() const {
  std::cout << "=== OpenCL Device Info ===" << std::endl
            << "Name: " << device.getInfo<CL_DEVICE_NAME>() << std::endl
            << "Vendor: " << device.getInfo<CL_DEVICE_VENDOR>() << std::endl
            << "Version: " << device.getInfo<CL_DEVICE_VERSION>() << std::endl
            << "Compute Units: "
            << device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << std::endl
            << "Global Memory: "
            << device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / (1024 * 1024)
            << " MB" << std::endl
            << "Local Memory: "
            << device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() / 1024 << " KB"
            << std::endl
            << "Max Work Group Size: "
            << device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() << std::endl;
}

View File

@@ -0,0 +1,47 @@
#pragma once
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 300
#include <CL/opencl.hpp>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <unordered_map>
// Owns a single OpenCL device with its context and command queue, plus a
// cache of pre-compiled kernel programs. Non-copyable and non-movable:
// exactly one object owns the device setup for its lifetime.
class OpenCL {
public:
  // Identifiers for the kernel source files this wrapper knows about.
  enum class Program { TENSOR };

private:
  cl::Device device;
  cl::Context context;
  cl::CommandQueue queue;
  // Compiled programs, populated by loadPrograms() during construction.
  std::unordered_map<Program, cl::Program> programs;
  // Kernel source locations, resolved relative to the working directory.
  std::unordered_map<Program, std::string> programPaths = {
      {Program::TENSOR, "./opencl/kernels/tensor.cl"}};

  // Reads a kernel source file into a string (throws on open failure).
  std::string readProgram(const std::string &filePath);
  // Builds `file` for the selected device; dumps the build log on error.
  cl::Program compileProgram(const std::string &file);
  // Compiles every entry of programPaths into `programs`.
  void loadPrograms();
  // Picks the first GPU (falling back to CPU) and creates context/queue.
  void initializeDevice();

public:
  OpenCL();
  OpenCL(const OpenCL &) = delete;
  OpenCL &operator=(const OpenCL &) = delete;
  OpenCL(OpenCL &&) = delete;
  OpenCL &operator=(OpenCL &&) = delete;

  cl::Device &getDevice() { return device; }
  cl::Context &getContext() { return context; }
  const cl::CommandQueue &getQueue() { return queue; }
  // Returns the compiled program for `program`; throws
  // std::invalid_argument when it was never loaded.
  cl::Program &getProgram(Program program);
  void printDeviceInfo() const;
};

102
src/tensor/pybind.cpp Normal file
View File

@@ -0,0 +1,102 @@
#include <pybind11/operators.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include "tensor.hpp"
namespace py = pybind11;
// Registers Tensor<T, Dim> with pybind11 under the Python name `name`,
// binding the constructors, indexed element access, the arithmetic
// operators, and — where the rank permits — matmul and transposition.
template <typename T, int Dim>
void register_tensor(py::module &m, const std::string &name) {
  auto tensor = py::class_<Tensor<T, Dim>>(m, name.c_str())
                    // shape / shape+fill / shape+data / shape+random[min,max]
                    .def(py::init<const std::array<size_t, Dim> &>())
                    .def(py::init<const std::array<size_t, Dim> &, T>())
                    .def(py::init<const std::array<size_t, Dim> &,
                                  const std::vector<T> &>())
                    .def(py::init<const std::array<size_t, Dim> &, T, T>())
                    .def("get_shape", &Tensor<T, Dim>::getShape)
                    .def("get_data", &Tensor<T, Dim>::getData)
                    .def("get_size", &Tensor<T, Dim>::getSize)
                    .def("get_axes", &Tensor<T, Dim>::getAxes)
                    // Flat (storage-order) element access with bounds checks
                    // converted to Python IndexError.
                    .def("__getitem__",
                         [](const Tensor<T, Dim> &t, size_t i) -> T {
                           if (i >= t.getSize())
                             throw py::index_error();
                           return t[i];
                         })
                    .def("__setitem__",
                         [](Tensor<T, Dim> &t, size_t i, T value) {
                           if (i >= t.getSize())
                             throw py::index_error();
                           t[i] = value;
                         })
                    // .def("__call__",
                    //      [](Tensor<T, Dim> &t, py::args args) -> T & {
                    //
                    //      })
                    // Tensor-tensor element-wise operators.
                    .def(py::self + py::self)
                    .def(py::self - py::self)
                    .def(py::self * py::self)
                    .def(py::self += py::self)
                    .def(py::self -= py::self)
                    .def(py::self *= py::self)
                    // Tensor-scalar and scalar-tensor operators.
                    .def(py::self + T())
                    .def(py::self - T())
                    .def(py::self * T())
                    .def(py::self / T())
                    .def(T() + py::self)
                    .def(T() - py::self)
                    .def(T() * py::self)
                    .def(py::self += T())
                    .def(py::self -= T())
                    .def(py::self *= T())
                    .def(py::self /= T())
                    .def("__pos__", [](const Tensor<T, Dim> &t) { return +t; })
                    .def("__neg__", [](const Tensor<T, Dim> &t) { return -t; })
                    .def("print", &Tensor<T, Dim>::print);
  // operator% implements dot/matrix product; exposed as Python's `@`.
  if constexpr (Dim == 1 || Dim == 2)
    tensor.def("__matmul__", &Tensor<T, Dim>::operator%);
  // Axis permutation only makes sense for rank >= 2.
  if constexpr (Dim >= 2) {
    tensor
        .def("transpose", py::overload_cast<const std::array<int, Dim> &>(
                              &Tensor<T, Dim>::transpose))
        .def("transpose",
             py::overload_cast<int, int>(&Tensor<T, Dim>::transpose))
        .def("t", &Tensor<T, Dim>::t);
  }
}
// Python module definition: exposes Tensor<T, Dim> for float, double and
// int at ranks 0-5. Unprefixed names are float; `d` = double, `i` = int.
PYBIND11_MODULE(tensor, m) {
  m.doc() = "Tensor math library";
  register_tensor<float, 0>(m, "Scalar");
  register_tensor<float, 1>(m, "Vector");
  register_tensor<float, 2>(m, "Matrix");
  register_tensor<float, 3>(m, "Tensor3");
  register_tensor<float, 4>(m, "Tensor4");
  register_tensor<float, 5>(m, "Tensor5");
  register_tensor<double, 0>(m, "dScalar");
  register_tensor<double, 1>(m, "dVector");
  register_tensor<double, 2>(m, "dMatrix");
  register_tensor<double, 3>(m, "dTensor3");
  register_tensor<double, 4>(m, "dTensor4");
  register_tensor<double, 5>(m, "dTensor5");
  register_tensor<int, 0>(m, "iScalar");
  register_tensor<int, 1>(m, "iVector");
  register_tensor<int, 2>(m, "iMatrix");
  register_tensor<int, 3>(m, "iTensor3");
  register_tensor<int, 4>(m, "iTensor4");
  register_tensor<int, 5>(m, "iTensor5");
}

1
src/tensor/tensor.cpp Normal file
View File

@@ -0,0 +1 @@
#include "tensor.hpp"

338
src/tensor/tensor.hpp Normal file
View File

@@ -0,0 +1,338 @@
#pragma once

#include <algorithm> // std::fill, std::min, std::swap
#include <array>
#include <cstddef> // size_t
#include <iostream>
#include <random>
#include <stdexcept>
#include <type_traits>
#include <typeinfo> // typeid in print()
#include <vector>
// Dense N-dimensional tensor with row-major storage and lazy transposition:
// transpose()/t() only permute the `axes_` view; the `data_` buffer itself
// is never reshuffled.
template <typename T, int Dim> class Tensor {
private:
  std::array<size_t, Dim> shape_; // storage shape (as constructed)
  std::array<int, Dim> axes_;     // logical axis -> storage axis permutation
  std::vector<T> data_;           // row-major element buffer

  // Maps logical indices (in the possibly-transposed view) to the flat
  // offset in data_, using row-major strides over the storage shape.
  template <typename... Indices> size_t computeIndex(Indices... indices) const {
    static_assert(sizeof...(Indices) == Dim, "Invalid number of indices");
    std::array<size_t, Dim> indicesArray = {static_cast<size_t>(indices)...};
    std::array<size_t, Dim> axesIndices;
    // Route each logical index to the storage axis it currently views.
    for (int i = 0; i < Dim; ++i)
      axesIndices[axes_[i]] = indicesArray[i];
    size_t index = 0;
    size_t stride = 1;
    for (int i = Dim - 1; i >= 0; --i) {
      index += axesIndices[i] * stride;
      stride *= shape_[i];
    }
    return index;
  }
  // Throws unless `other` has the same logical (axes-adjusted) shape.
  void checkItHasSameShape(const Tensor &other) {
    if (getShape() != other.getShape())
      throw std::invalid_argument("Tensor shapes must match");
  }
  // Throws unless `axis` is a valid axis index for this rank.
  void checkAxisInDim(int axis) {
    if (axis < 0 || axis >= Dim)
      throw std::invalid_argument("Invalid axis index");
  }

public:
  Tensor() = delete;
  // Allocates a tensor of `shape` with value-initialized elements.
  // Throws std::invalid_argument if any dimension is zero.
  Tensor(const std::array<size_t, Dim> &shape) {
    for (size_t d : shape)
      if (d == 0)
        throw std::invalid_argument("Invalid shape");
    shape_ = shape;
    // Identity permutation: logical axes == storage axes.
    for (int i = 0; i < Dim; ++i)
      axes_[i] = i;
    size_t total_size = 1;
    for (size_t dim : shape)
      total_size *= dim;
    data_.resize(total_size);
  }
  // Allocates and fills every element with `fill`.
  Tensor(const std::array<size_t, Dim> &shape, T fill) : Tensor(shape) {
    std::fill(data_.begin(), data_.end(), fill);
  }
  // Allocates and copies `data` (must match the element count exactly).
  Tensor(const std::array<size_t, Dim> &shape, const std::vector<T> &data)
      : Tensor(shape) {
    if (data.size() != data_.size())
      throw std::invalid_argument("Invalid data size");
    data_ = data;
  }
  // Allocates and fills with uniform random values in [min, max].
  // NOTE: the engine is `static`, i.e. shared by every Tensor in the
  // process and seeded once.
  Tensor(const std::array<size_t, Dim> &shape, T min, T max) : Tensor(shape) {
    static std::random_device rd;
    static std::mt19937 gen(rd());
    if constexpr (std::is_integral_v<T>) {
      std::uniform_int_distribution<T> dis(min, max);
      for (auto &element : data_)
        element = dis(gen);
    } else if constexpr (std::is_floating_point_v<T>) {
      std::uniform_real_distribution<T> dis(min, max);
      for (auto &element : data_)
        element = dis(gen);
    } else
      throw std::invalid_argument("Invalid randomized type");
  }
  Tensor(const Tensor &other)
      : shape_(other.shape_), axes_(other.axes_), data_(other.data_) {}
  Tensor &operator=(const Tensor &other) {
    shape_ = other.shape_;
    axes_ = other.axes_;
    data_ = other.data_;
    return *this;
  }
  Tensor(Tensor &&other) noexcept
      : shape_(std::move(other.shape_)), axes_(std::move(other.axes_)),
        data_(std::move(other.data_)) {}
  Tensor &operator=(Tensor &&other) noexcept {
    shape_ = std::move(other.shape_);
    axes_ = std::move(other.axes_);
    data_ = std::move(other.data_);
    return *this;
  }
  ~Tensor() = default;
  const std::array<int, Dim> &getAxes() const { return axes_; }
  // Raw storage-order elements (ignores any transposition).
  const std::vector<T> &getData() const { return data_; }
  size_t getSize() const { return data_.size(); }
  // Logical shape: storage shape viewed through the axes permutation.
  const std::array<size_t, Dim> getShape() const {
    std::array<size_t, Dim> result;
    for (int i = 0; i < Dim; ++i)
      result[i] = shape_[axes_[i]];
    return result;
  }
  // Flat, unchecked storage-order access.
  T &operator[](size_t i) { return data_[i]; }
  const T &operator[](size_t i) const { return data_[i]; }
  // Multi-index access in the logical (transposed) view.
  template <typename... Indices> T &operator()(Indices... indices) {
    return data_[computeIndex(indices...)];
  }
  template <typename... Indices> const T &operator()(Indices... indices) const {
    return data_[computeIndex(indices...)];
  }
  // Reorders the axes view to `new_axes` (must be a permutation of 0..Dim-1).
  // O(1): no data is moved.
  Tensor &transpose(const std::array<int, Dim> &new_axes) {
    std::array<bool, Dim> used{};
    for (int axis : new_axes) {
      checkAxisInDim(axis);
      if (used[axis])
        throw std::invalid_argument("Duplicate axis index");
      used[axis] = true;
    }
    axes_ = new_axes;
    return *this;
  }
  // Swaps two (distinct) axes of the view in place.
  Tensor &transpose(int axis_a, int axis_b) {
    checkAxisInDim(axis_a);
    checkAxisInDim(axis_b);
    if (axis_a == axis_b)
      throw std::invalid_argument("Duplicate axis index");
    std::swap(axes_[axis_a], axes_[axis_b]);
    return *this;
  }
  // Matrix-style transpose: swaps the two innermost axes.
  Tensor &t() {
    static_assert(Dim >= 2, "Can't change the only axis");
    std::swap(axes_[Dim - 1], axes_[Dim - 2]);
    return *this;
  }
  Tensor operator+() const { return *this; }
  // Element-wise negation (returns a copy).
  Tensor operator-() const {
    Tensor result = *this;
    for (T &e : result.data_)
      e = -e;
    return result;
  }
  // --- scalar arithmetic (applied to every element) ---
  Tensor &operator+=(const T &scalar) {
    for (T &e : data_)
      e += scalar;
    return *this;
  }
  Tensor operator+(const T &scalar) const {
    Tensor result = *this;
    result += scalar;
    return result;
  }
  friend Tensor operator+(const T &scalar, const Tensor &tensor) {
    return tensor + scalar;
  }
  Tensor &operator-=(const T &scalar) {
    for (T &e : data_)
      e -= scalar;
    return *this;
  }
  Tensor operator-(const T &scalar) const {
    Tensor result = *this;
    result -= scalar;
    return result;
  }
  // scalar - tensor: element-wise `scalar - e` (not commutative).
  friend Tensor operator-(const T &scalar, const Tensor &tensor) {
    Tensor result = tensor;
    for (T &e : result.data_)
      e = scalar - e;
    return result;
  }
  Tensor &operator*=(const T &scalar) {
    for (T &e : data_)
      e *= scalar;
    return *this;
  }
  Tensor operator*(const T &scalar) const {
    Tensor result = *this;
    result *= scalar;
    return result;
  }
  friend Tensor operator*(const T &scalar, const Tensor &tensor) {
    return tensor * scalar;
  }
  // Division guards against a zero scalar (exact comparison; for floating
  // T only a literal 0 divisor is rejected).
  Tensor &operator/=(const T &scalar) {
    if (scalar == T(0))
      throw std::invalid_argument("Division by zero");
    for (T &e : data_)
      e /= scalar;
    return *this;
  }
  Tensor operator/(const T &scalar) const {
    Tensor result = *this;
    result /= scalar;
    return result;
  }
  // --- element-wise tensor arithmetic ---
  // NOTE(review): these combine the raw buffers in storage order. Two
  // operands whose axes_ permutations differ (e.g. one transposed) can pass
  // the shape check yet pair elements at different logical positions —
  // confirm this is the intended semantics.
  Tensor &operator+=(const Tensor &other) {
    checkItHasSameShape(other);
    for (size_t i = 0; i < data_.size(); ++i)
      data_[i] += other.data_[i];
    return *this;
  }
  Tensor operator+(const Tensor &other) const {
    Tensor result = *this;
    result += other;
    return result;
  }
  Tensor &operator-=(const Tensor &other) {
    checkItHasSameShape(other);
    for (size_t i = 0; i < data_.size(); ++i)
      data_[i] -= other.data_[i];
    return *this;
  }
  Tensor operator-(const Tensor &other) const {
    Tensor result = *this;
    result -= other;
    return result;
  }
  // Hadamard (element-wise) product, not matrix multiplication.
  Tensor &operator*=(const Tensor &other) {
    checkItHasSameShape(other);
    for (size_t i = 0; i < data_.size(); ++i)
      data_[i] *= other.data_[i];
    return *this;
  }
  Tensor operator*(const Tensor &other) const {
    Tensor result = *this;
    result *= other;
    return result;
  }
  // Inner product: vector . vector -> rank-0 tensor; matrix % matrix ->
  // matrix product. The rank-2 path goes through operator(), so each
  // operand's transposition state is honored.
  Tensor<T, Dim == 1 ? 0 : 2> operator%(const Tensor &other) const {
    static_assert(Dim == 1 || Dim == 2,
                  "Inner product is only defined for vectors and matrices");
    if constexpr (Dim == 1) {
      if (data_.size() != other.data_.size())
        throw std::invalid_argument(
            "Vector sizes must match for inner product");
      T result_val = T(0);
      for (size_t i = 0; i < data_.size(); ++i)
        result_val += data_[i] * other.data_[i];
      return Tensor<T, 0>({}, {result_val});
    } else if constexpr (Dim == 2) {
      // Logical dims: this is m x n, other must be n x p.
      if (shape_[axes_[1]] != other.shape_[other.axes_[0]])
        throw std::invalid_argument(
            "Matrix dimensions must match for multiplication");
      size_t m = shape_[axes_[0]];
      size_t n = shape_[axes_[1]];
      size_t p = other.shape_[other.axes_[1]];
      Tensor<T, 2> result({m, p}, T(0));
      for (size_t i = 0; i < m; ++i) {
        for (size_t j = 0; j < p; ++j) {
          T sum = T(0);
          for (size_t k = 0; k < n; ++k)
            sum += (*this)(i, k) * other(k, j);
          result(i, j) = sum;
        }
      }
      return result;
    }
  }
  // Pretty-prints to stdout. Rank 0/1/2 are shown in full (rank 2 honors
  // transposition); higher ranks show the shape and the first 10 raw
  // storage-order elements.
  void print() const {
    if constexpr (Dim == 0) {
      std::cout << "Scalar<" << typeid(T).name() << ">: " << data_[0]
                << std::endl;
    } else if constexpr (Dim == 1) {
      std::cout << "Vector<" << typeid(T).name() << ">(" << shape_[0] << "): [";
      for (size_t i = 0; i < data_.size(); ++i) {
        std::cout << data_[i];
        if (i < data_.size() - 1)
          std::cout << ", ";
      }
      std::cout << "]" << std::endl;
    } else if constexpr (Dim == 2) {
      std::cout << "Matrix<" << typeid(T).name() << ">(" << shape_[axes_[0]]
                << "x" << shape_[axes_[1]] << "):" << std::endl;
      for (size_t i = 0; i < shape_[axes_[0]]; ++i) {
        std::cout << " [";
        for (size_t j = 0; j < shape_[axes_[1]]; ++j) {
          std::cout << (*this)(i, j);
          if (j < shape_[axes_[1]] - 1)
            std::cout << ", ";
        }
        std::cout << "]" << std::endl;
      }
    } else {
      std::cout << "Tensor" << Dim << "D<" << typeid(T).name() << ">" << "[";
      for (size_t i = 0; i < Dim; ++i) {
        std::cout << shape_[axes_[i]];
        if (i < Dim - 1)
          std::cout << "x";
      }
      std::cout << "]: [";
      size_t show = std::min(data_.size(), size_t(10));
      for (size_t i = 0; i < show; ++i) {
        std::cout << data_[i];
        if (i < show - 1)
          std::cout << ", ";
      }
      if (data_.size() > 10)
        std::cout << ", ...";
      std::cout << "]" << std::endl;
    }
  }
};
// Convenience aliases for the common ranks.
template <typename T> using Scalar = Tensor<T, 0>;
template <typename T> using Vector = Tensor<T, 1>;
template <typename T> using Matrix = Tensor<T, 2>;

// Static factory helpers; the tensor rank is deduced from the number of
// dimension arguments, e.g. Tensors::zero<float>(2, 3) -> Tensor<float, 2>.
class Tensors {
  Tensors() = delete; // static-only: not instantiable

public:
  // Tensor of the given dimensions. Note: elements are value-initialized
  // by the shape constructor, so "empty" is zeroed, not uninitialized.
  template <typename T, typename... Args> static auto empty(Args... args) {
    return Tensor<T, sizeof...(Args)>({static_cast<size_t>(args)...});
  }
  // Tensor explicitly filled with T(0).
  template <typename T, typename... Args> static auto zero(Args... args) {
    return Tensor<T, sizeof...(Args)>({static_cast<size_t>(args)...}, T(0));
  }
  // Tensor with elements drawn uniformly from [T(0), T(1)].
  template <typename T, typename... Args> static auto rand(Args... args) {
    return Tensor<T, sizeof...(Args)>({static_cast<size_t>(args)...}, T(0),
                                      T(1));
  }
};