From c1874212ae4fcda5150acbf2a339f805637ebecf Mon Sep 17 00:00:00 2001 From: StepanovPlaton Date: Tue, 18 Nov 2025 00:04:49 +0400 Subject: [PATCH] Complete first GPU tensors --- src/tensor/Makefile | 6 +- src/tensor/main.cpp | 11 +++- src/tensor/opencl/kernels/scalar.cl | 2 +- src/tensor/opencl/opencl.hpp | 2 +- src/tensor/opencl/tensor.hpp | 88 ++++++++++++++++++++++------- 5 files changed, 83 insertions(+), 26 deletions(-) diff --git a/src/tensor/Makefile b/src/tensor/Makefile index 1133f6c..fac7275 100644 --- a/src/tensor/Makefile +++ b/src/tensor/Makefile @@ -26,13 +26,17 @@ PYTHON_INCLUDE = $(shell python -c "import sysconfig; print(sysconfig.get_config PYTHON_LIBS = $(PYTHON_PATH)$(SP)libs PYBIND_INCLUDE = $(shell python -c "import pybind11; print(pybind11.get_include())") +OPENCL_INCLUDES = -I"A:/Programs/OpenCL/include" +OPENCL_LIB_PATH = -L"A:/Programs/OpenCL/lib" -lOpenCL +OPENCL_LIB = -L"A:/Programs/OpenCL/lib" -lOpenCL + .DEFAULT_GOAL := $(TARGET) $(BUILD_DIR): $(MKDIR) $(BUILD_DIR) $(TARGET): $(COMMON_SRC) main.cpp | $(BUILD_DIR) - $(CXX) $(CXXFLAGS) -o $@ $^ + $(CXX) $(CXXFLAGS) $(OPENCL_INCLUDES) $(OPENCL_LIB_PATH) -o $@ $^ $(OPENCL_LIB) module: $(COMMON_SRC) pybind.cpp | $(BUILD_DIR) $(CXX) $(CXXFLAGS) -shared -fPIC -o tensor.$(SHARED_LIB_EXT) $^ -I"$(PYTHON_INCLUDE)" -L"$(PYTHON_LIBS)" -lpython3.13 -I"$(PYBIND_INCLUDE)" diff --git a/src/tensor/main.cpp b/src/tensor/main.cpp index 4e03024..ee84d2f 100644 --- a/src/tensor/main.cpp +++ b/src/tensor/main.cpp @@ -5,9 +5,16 @@ // TODO: GENERIC KERNELS // TODO: Scalar mult +// TODO: TMult >2 + +OpenCL openCL; int main() { - Tensor<float, 2> a = Tensor<float, 2>({2, 4}); - std::cout << a.toString(); + Tensor<float, 2> a = Tensor<float, 2>({8192, 8192}, 1); + Tensor<float, 2> b = Tensor<float, 2>({8192, 8192}, 1); + auto c = a % b; + Tensor<float, 2> d = Tensor<float, 2>(c); + d += 1; + std::cout << d.toString(); return 0; } diff --git a/src/tensor/opencl/kernels/scalar.cl b/src/tensor/opencl/kernels/scalar.cl index 2b201b0..9a823c0 100644 --- a/src/tensor/opencl/kernels/scalar.cl +++
b/src/tensor/opencl/kernels/scalar.cl @@ -5,5 +5,5 @@ __kernel void add(__global float *A, float scalar) { __kernel void mult(__global float *A, float scalar) { int i = get_global_id(0); - B[i] = A[i] * scalar; + A[i] *= scalar; } diff --git a/src/tensor/opencl/opencl.hpp b/src/tensor/opencl/opencl.hpp index d0bf975..ac87ba9 100644 --- a/src/tensor/opencl/opencl.hpp +++ b/src/tensor/opencl/opencl.hpp @@ -70,4 +70,4 @@ public: void printDeviceInfo() const; }; -OpenCL openCL; +extern OpenCL openCL; diff --git a/src/tensor/opencl/tensor.hpp b/src/tensor/opencl/tensor.hpp index bad21be..116d5e5 100644 --- a/src/tensor/opencl/tensor.hpp +++ b/src/tensor/opencl/tensor.hpp @@ -4,12 +4,13 @@ #include "../tensor.hpp" +#include <sstream> #include <vector> - +#include <typeinfo> template <typename T, size_t Dim> class Tensor : public ITensor<T, Dim> { private: cl::Buffer *data_ = nullptr; - cl::Event *event_ = new cl::Event(); + cl::Event event_ = cl::Event(); class AutoEventList { private: @@ -19,7 +20,7 @@ AutoEventList(std::initializer_list<cl::Event> events) : events_(events) {} operator const std::vector<cl::Event> *() const { return &events_; } }; - template <typename... Events> AutoEventList all(Events &&...events) { + template <typename... Events> AutoEventList all(Events &&...events) const { return AutoEventList{std::forward<Events>(events)...}; } @@ -32,16 +33,15 @@ private: void fillBuf(const std::vector<T> &data) { createBuf(data.size()); - // event_ = event?!
openCL.getQueue().enqueueWriteBuffer(*data_, CL_FALSE, 0, data.size() * sizeof(T), data.data(), - all(*event_), event_); + nullptr, &event_); } void fillBuf(const Tensor &other) { createBuf(other.getSize()); - openCL.getQueue().enqueueCopyBuffer( - *other.getData(), *data_, 0, 0, other.getSize() * sizeof(T), - all(*event_, *other.getEvent()), event_); + openCL.getQueue().enqueueCopyBuffer(*other.getData(), *data_, 0, 0, + other.getSize() * sizeof(T), + all(other.getEvent()), &event_); } public: @@ -97,7 +97,7 @@ public: Tensor(Tensor &&other) noexcept : ITensor(std::move(other)) { data_ = other.data_; event_ = other.event_; - other.data = nullptr; + other.data_ = nullptr; } Tensor &operator=(Tensor &&other) noexcept { ITensor::operator=(std::move(other)); @@ -112,7 +112,7 @@ public: }; const cl::Buffer *getData() const { return data_; } - const cl::Event *getEvent() const { return event_; } + const cl::Event &getEvent() const { return event_; } // T &operator[](size_t i); // const T &operator[](size_t i) const; @@ -128,7 +128,7 @@ public: kernel.setArg(0, *data_); openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(getSize()), - cl::NullRange, all(*event_), event_); + cl::NullRange, all(event_), &event_); return *this; } @@ -137,7 +137,7 @@ public: kernel.setArg(0, *data_); openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(getSize()), - cl::NullRange, all(*event_), event_); + cl::NullRange, all(event_), &event_); return *this; } @@ -147,7 +147,7 @@ public: kernel.setArg(1, scalar); openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(getSize()), - cl::NullRange, all(*event_), event_); + cl::NullRange, all(event_), &event_); return *this; } @@ -157,7 +157,7 @@ public: kernel.setArg(1, scalar); openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(getSize()), - cl::NullRange, all(*event_), event_); + cl::NullRange, all(event_), &event_); return *this; } @@ -167,7 +167,7 @@ public: 
kernel.setArg(1, *other.getData()); openCL.getQueue().enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(getSize()), cl::NullRange, - all(*event_, *other.event_), event_); + all(event_, other.event_), &event_); return *this; } @@ -177,7 +177,7 @@ kernel.setArg(1, *other.getData()); openCL.getQueue().enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(getSize()), cl::NullRange, - all(*event_, *other.event_), event_); + all(event_, other.event_), &event_); return *this; } @@ -191,9 +191,9 @@ if (shape_[axes_[1]] != other.shape_[other.axes_[0]]) throw std::invalid_argument( "Matrix dimensions must match for multiplication"); - size_t m = shape_[axes_[0]]; - size_t k = shape_[axes_[1]]; - size_t n = other.shape_[other.axes_[1]]; + int m = (int)shape_[axes_[0]]; + int k = (int)shape_[axes_[1]]; + int n = (int)other.shape_[other.axes_[1]]; Tensor result({m, n}); cl::Kernel kernel = openCL.createKernel(OpenCL::Method::T_MULT); kernel.setArg(0, *data_); @@ -207,12 +207,58 @@ cl::NDRange local_size(TILE_SIZE, TILE_SIZE); openCL.getQueue().enqueueNDRangeKernel( kernel, cl::NullRange, global_size, local_size, - all(*event_, *other.event_), result.event_); + all(event_, other.event_), &result.event_); return result; } } - std::string toString() const override; + std::string toString() const override { + std::vector<T> result(getSize()); + openCL.getQueue().enqueueReadBuffer( + *data_, CL_TRUE, 0, getSize() * sizeof(T), result.data(), all(event_)); + std::ostringstream oss; + if constexpr (Dim == 0) { + oss << "Scalar<" << typeid(T).name() << ">: " << result[0]; + } else if constexpr (Dim == 1) { + oss << "Vector<" << typeid(T).name() << ">(" << shape_[0] << "): ["; + for (size_t i = 0; i < getSize(); ++i) { + oss << result[i]; + if (i < getSize() - 1) + oss << ", "; + } + oss << "]"; + } else if constexpr (Dim == 2) { + oss << "Matrix<" << typeid(T).name() << ">(" << shape_[axes_[0]] << "x" + << shape_[axes_[1]] << "):"; + for (size_t i = 0; i <
shape_[axes_[0]]; ++i) { + oss << "\n ["; + for (size_t j = 0; j < shape_[axes_[1]]; ++j) { + oss << result[i * shape_[axes_[1]] + j]; + if (j < shape_[axes_[1]] - 1) + oss << ", "; + } + oss << "]"; + } + } else { + oss << "Tensor" << Dim << "D<" << typeid(T).name() << ">" << "["; + for (size_t i = 0; i < Dim; ++i) { + oss << shape_[axes_[i]]; + if (i < Dim - 1) + oss << "x"; + } + oss << "]: ["; + size_t show = std::min(getSize(), size_t(10)); + for (size_t i = 0; i < show; ++i) { + oss << result[i]; + if (i < show - 1) + oss << ", "; + } + if (getSize() > 10) + oss << ", ..."; + oss << "]"; + } + return oss.str(); + } }; #include "tensor.tpp"