#pragma once

#include "opencl.hpp"

#include <format>
#include <mutex>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <utility>

/// Generates, compiles and caches OpenCL kernels for element-wise tensor
/// operations, specialized on element type T.
///
/// Kernel sources are produced at runtime from format templates. Each
/// work-item processes one `vector`-wide chunk via vloadN/vstoreN, with a
/// scalar tail loop for the final partial chunk, so callers must pass the
/// element count (`len`) and enqueue ceil(len / vector) work-items.
///
/// create() is thread-safe (guarded by compileMutex); compiled programs are
/// cached per Method.
template <typename T>
class Kernels {
public:
  /// Supported OpenCL vector widths; enumerator value == lane count.
  enum class Vector : int {
    type2 = 2,
    type4 = 4,
    type8 = 8,
    type16 = 16,
  };

  /// Operations a kernel can be created for.
  enum class Method {
    POSITIVE,   // unary +
    NEGATIVE,   // unary -
    S_ADD,      // tensor += scalar
    S_MULT,     // tensor *= scalar
    T_ADD,      // tensor += tensor (element-wise)
    T_HADAMARD, // tensor *= tensor (element-wise)
    T_MULT,     // matrix multiplication (source not yet ported — see TODO)
  };

  /// OpenCL C spelling of T. typeid(T).name() is an implementation-defined
  /// mangled string ("f", "i", ...) and must never appear in kernel source.
  /// NOTE(review): double additionally requires the cl_khr_fp64 extension on
  /// the device — confirm before instantiating Kernels<double>.
  static constexpr const char *typeName() {
    if constexpr (std::is_same_v<T, float>)
      return "float";
    else if constexpr (std::is_same_v<T, double>)
      return "double";
    else if constexpr (std::is_same_v<T, int>)
      return "int";
    else
      static_assert(sizeof(T) == 0, "Kernels<T>: unsupported element type");
  }

  // TODO: query CL_DEVICE_NATIVE_VECTOR_WIDTH_* instead of hard-coding.
  static inline Vector vector = Vector::type8;

private:
  // std::format has no named arguments; all templates below use positional
  // indices: {0}=kernel name, {1}=element type, {2}=vector width,
  // {3}=operator token. Literal braces in OpenCL C are escaped as {{ }}.

  /// Source for `name(A, len)` applying a unary prefix operator in place.
  static std::string unaryOperation(const std::string &name,
                                    const std::string &op) {
    const int vec = static_cast<int>(vector);
    return std::format(
        R"(
        __kernel void {0}(__global {1}* A, int len) {{
            int gid = get_global_id(0);
            int base = gid * {2};
            if (base + ({2} - 1) < len) {{
                {1}{2} data = vload{2}(gid, A);
                vstore{2}({3}data, gid, A);
            }} else {{
                for (int i = 0; i < {2}; i++) {{
                    int idx = base + i;
                    if (idx < len) A[idx] = {3}A[idx];
                }}
            }}
        }}
        )",
        name, typeName(), vec, op);
  }

  /// Source for `name(A, len, scalar)` applying `A[i] = A[i] op scalar`.
  static std::string scalarOperation(const std::string &name,
                                     const std::string &op) {
    const int vec = static_cast<int>(vector);
    return std::format(
        R"(
        __kernel void {0}(__global {1}* A, int len, {1} scalar) {{
            int gid = get_global_id(0);
            int base = gid * {2};
            if (base + ({2} - 1) < len) {{
                {1}{2} data = vload{2}(gid, A);
                data = data {3} scalar;
                vstore{2}(data, gid, A);
            }} else {{
                for (int i = 0; i < {2}; i++) {{
                    int idx = base + i;
                    if (idx < len) A[idx] = A[idx] {3} scalar;
                }}
            }}
        }}
        )",
        name, typeName(), vec, op);
  }

  /// Source for `name(A, B, len)` applying `A[i] = A[i] op B[i]`.
  static std::string binaryOperation(const std::string &name,
                                     const std::string &op) {
    const int vec = static_cast<int>(vector);
    return std::format(
        R"(
        __kernel void {0}(__global {1}* A, __global {1}* B, int len) {{
            int gid = get_global_id(0);
            int base = gid * {2};
            if (base + ({2} - 1) < len) {{
                {1}{2} dataA = vload{2}(gid, A);
                {1}{2} dataB = vload{2}(gid, B);
                vstore{2}(dataA {3} dataB, gid, A);
            }} else {{
                for (int i = 0; i < {2}; i++) {{
                    int idx = base + i;
                    if (idx < len) A[idx] = A[idx] {3} B[idx];
                }}
            }}
        }}
        )",
        name, typeName(), vec, op);
  }

  /// Method -> {kernel source, kernel entry-point name}. Function-local
  /// static: a non-inline static data member with an in-class initializer
  /// is ill-formed, and this also defers source generation to first use.
  static const std::unordered_map<Method, std::pair<std::string, std::string>> &
  programs() {
    static const std::unordered_map<Method, std::pair<std::string, std::string>>
        map = {
            {Method::POSITIVE, {unaryOperation("positive", "+"), "positive"}},
            {Method::NEGATIVE, {unaryOperation("negative", "-"), "negative"}},

            {Method::S_ADD, {scalarOperation("add", "+"), "add"}},
            {Method::S_MULT, {scalarOperation("mult", "*"), "mult"}},

            {Method::T_ADD, {binaryOperation("add", "+"), "add"}},
            {Method::T_HADAMARD,
             {binaryOperation("hadamard_mult", "*"), "hadamard_mult"}},
            // TODO: port the tiled matrix-multiplication kernel; an empty
            // source compiles but cl::Kernel("mult") will throw at create().
            {Method::T_MULT, {"", "mult"}},
        };
    return map;
  }

  static inline std::unordered_map<Method, cl::Program> compiledPrograms;
  static inline std::mutex compileMutex;

public:
  /// Returns a kernel for `method`, compiling and caching its program on
  /// first use.
  /// @throws std::runtime_error on unknown method or compilation failure
  ///         (with the device build log appended).
  static cl::Kernel create(Method method) {
    std::lock_guard<std::mutex> lock(compileMutex);

    if (auto cached = compiledPrograms.find(method);
        cached != compiledPrograms.end()) {
      return cl::Kernel(cached->second, programs().at(method).second.c_str());
    }

    auto entry = programs().find(method);
    if (entry == programs().end())
      throw std::runtime_error("Unknown method: " +
                               std::to_string(static_cast<int>(method)));
    const auto &[sourceCode, kernelName] = entry->second;

    cl::Program::Sources sources;
    sources.push_back({sourceCode.c_str(), sourceCode.length()});
    cl::Program program(openCL.getContext(), sources);
    try {
      program.build({openCL.getDevice()});
      compiledPrograms[method] = program;
      return cl::Kernel(program, kernelName.c_str());
    } catch (const cl::Error &e) {
      if (e.err() == CL_BUILD_PROGRAM_FAILURE) {
        // Query the log from the program we actually tried to build; asking
        // a freshly constructed, never-built program yields an empty log.
        auto buildInfo =
            program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(openCL.getDevice());
        throw std::runtime_error("OpenCL compilation failed: " +
                                 std::string(e.what()) + "\nBuild log:\n" +
                                 buildInfo);
      }
      throw std::runtime_error("OpenCL error: " + std::string(e.what()));
    }
  }
};
x : alpha * (exp(x) - 1.0f); - default: - return x; - } -} -__kernel void activate(__global float *input, __global float *output, - const int activation_type, const float alpha) { - int i = get_global_id(0); - output[i] = activate_x(input[i], activation_type, alpha); -} diff --git a/src/tensor/opencl/kernels/fusion.cl b/src/tensor/opencl/kernels/fusion.cl deleted file mode 100644 index e69de29..0000000 diff --git a/src/tensor/opencl/kernels/scalar.cl b/src/tensor/opencl/kernels/scalar.cl deleted file mode 100644 index 9a823c0..0000000 --- a/src/tensor/opencl/kernels/scalar.cl +++ /dev/null @@ -1,9 +0,0 @@ -__kernel void add(__global float *A, float scalar) { - int i = get_global_id(0); - A[i] += scalar; -} - -__kernel void mult(__global float *A, float scalar) { - int i = get_global_id(0); - A[i] *= scalar; -} diff --git a/src/tensor/opencl/kernels/tensor.cl b/src/tensor/opencl/kernels/tensor.cl deleted file mode 100644 index 6cd5a7b..0000000 --- a/src/tensor/opencl/kernels/tensor.cl +++ /dev/null @@ -1,81 +0,0 @@ -__kernel void add(__global float *A, __global float *B) { - int i = get_global_id(0); - A[i] += B[i]; -} -__kernel void hadamard_mult(__global float *A, __global float *B) { - int i = get_global_id(0); - A[i] *= B[i]; -} - -#define TILE_SIZE 16 -#define VEC_SIZE 4 -__kernel void mult(__global float *A, __global float *B, __global float *C, - const int M, const int N, const int K) { - - const int row = get_global_id(0) * VEC_SIZE; - const int col = get_global_id(1); - const int local_row = get_local_id(0); - const int local_col = get_local_id(1); - - __local float tile_A[TILE_SIZE][TILE_SIZE + 1]; // +1 для избежания bank conflicts - __local float tile_B[TILE_SIZE][TILE_SIZE + 1]; - - float4 sum[VEC_SIZE]; - for (int i = 0; i < VEC_SIZE; i++) { - sum[i] = (float4)(0.0f); - } - - const int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE; - - for (int t = 0; t < numTiles; t++) { - // Загрузка tile_A с векторизацией - int a_col = t * TILE_SIZE + local_col; - 
#pragma unroll - for (int v = 0; v < VEC_SIZE; v++) { - int current_row = row + v; - if (current_row < M && a_col < K) { - tile_A[local_row * VEC_SIZE + v][local_col] = A[current_row * K + a_col]; - } else { - tile_A[local_row * VEC_SIZE + v][local_col] = 0.0f; - } - } - - // Загрузка tile_B - int b_row = t * TILE_SIZE + local_row; - if (b_row < K && col < N) { - tile_B[local_row][local_col] = B[b_row * N + col]; - } else { - tile_B[local_row][local_col] = 0.0f; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // Векторизованное вычисление - #pragma unroll - for (int k = 0; k < TILE_SIZE; k++) { - float4 a_vals = (float4)( - tile_A[local_row * VEC_SIZE + 0][k], - tile_A[local_row * VEC_SIZE + 1][k], - tile_A[local_row * VEC_SIZE + 2][k], - tile_A[local_row * VEC_SIZE + 3][k] - ); - float b_val = tile_B[k][local_col]; - - sum[0] += a_vals.x * b_val; - sum[1] += a_vals.y * b_val; - sum[2] += a_vals.z * b_val; - sum[3] += a_vals.w * b_val; - } - - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Сохранение результатов с векторизацией - #pragma unroll - for (int v = 0; v < VEC_SIZE; v++) { - int current_row = row + v; - if (current_row < M && col < N) { - C[current_row * N + col] = sum[v].x + sum[v].y + sum[v].z + sum[v].w; - } - } -} \ No newline at end of file diff --git a/src/tensor/opencl/opencl.cpp b/src/tensor/opencl/opencl.cpp index c1de462..c16b4f0 100644 --- a/src/tensor/opencl/opencl.cpp +++ b/src/tensor/opencl/opencl.cpp @@ -1,66 +1,23 @@ #include "opencl.hpp" -#include #include -#include #include -std::string OpenCL::readProgram(const std::string &filePath) { - std::ifstream file(filePath, std::ios::binary); - if (!file.is_open()) { - throw std::runtime_error("Cannot open file: " + filePath); - } - - std::stringstream buffer; - buffer << file.rdbuf(); - return buffer.str(); -} -cl::Program OpenCL::compileProgram(const std::string &file) { - std::string source = readProgram(file); - cl::Program program(context, source); +OpenCL::OpenCL() { try { - program.build({device}); 
- } catch (cl::Error &e) { - std::string build_log = program.getBuildInfo(device); - std::cerr << "Build log:\n" << build_log << std::endl; - throw; - } - return program; -} -void OpenCL::loadPrograms(std::string &programsBasePath) { - for (const auto &entry : programPaths) { - programs[entry.first] = compileProgram(programsBasePath + entry.second); - std::cout << "Loaded program: " << entry.second << std::endl; - } -} + std::vector platforms; + cl::Platform::get(&platforms); -void OpenCL::initializeDevice() { - std::vector platforms; - cl::Platform::get(&platforms); - - if (platforms.empty()) { - throw std::runtime_error("No OpenCL platforms found"); - } - - std::vector devices; - bool deviceFound = false; - - for (const auto &platform : platforms) { - try { - platform.getDevices(CL_DEVICE_TYPE_GPU, &devices); - if (!devices.empty()) { - deviceFound = true; - break; - } - } catch (const cl::Error &) { - continue; + if (platforms.empty()) { + throw std::runtime_error("No OpenCL platforms found"); } - } - if (!deviceFound) { + std::vector devices; + bool deviceFound = false; + for (const auto &platform : platforms) { try { - platform.getDevices(CL_DEVICE_TYPE_CPU, &devices); + platform.getDevices(CL_DEVICE_TYPE_GPU, &devices); if (!devices.empty()) { deviceFound = true; break; @@ -69,32 +26,29 @@ void OpenCL::initializeDevice() { continue; } } - } - if (!deviceFound) { - throw std::runtime_error("No suitable OpenCL devices found"); - } + if (!deviceFound) { + for (const auto &platform : platforms) { + try { + platform.getDevices(CL_DEVICE_TYPE_CPU, &devices); + if (!devices.empty()) { + deviceFound = true; + break; + } + } catch (const cl::Error &) { + continue; + } + } + } - device = devices[0]; - context = cl::Context(device); - queue = - cl::CommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE); + if (!deviceFound) { + throw std::runtime_error("No suitable OpenCL devices found"); + } - std::cout << "Using device: " << device.getInfo() - << 
"\nPlatform: " << platforms[0].getInfo() - << "\nCompute units: " - << device.getInfo() - << "\nGlobal memory: " - << device.getInfo() / (1024 * 1024) - << " MB" << std::endl; -} - -OpenCL::OpenCL() {} - -void OpenCL::init(std::string programsBasePath) { - try { - initializeDevice(); - loadPrograms(programsBasePath); + device = devices[0]; + context = cl::Context(device); + queue = cl::CommandQueue(context, device, + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE); } catch (const cl::Error &e) { std::cerr << "OpenCL error: " << e.what() << " (" << e.err() << ")" << std::endl; @@ -102,26 +56,6 @@ void OpenCL::init(std::string programsBasePath) { } } -cl::Program &OpenCL::getProgram(Program program) { - auto it = programs.find(program); - if (it == programs.end()) - throw std::invalid_argument("Program not loaded: " + - std::to_string(static_cast(program))); - return it->second; -} - -cl::Kernel OpenCL::createKernel(Method method) { - auto methodProgram = methodPrograms.find(method); - if (methodProgram == methodPrograms.end()) - throw std::invalid_argument("Not found program for method: " + - std::to_string(static_cast(method))); - auto methodName = methodNames.find(method); - if (methodName == methodNames.end()) - throw std::invalid_argument("Not found name for method: " + - std::to_string(static_cast(method))); - return cl::Kernel(getProgram(methodProgram->second), methodName->second); -} - void OpenCL::printDeviceInfo() const { std::cout << "=== OpenCL Device Info ===" << std::endl; std::cout << "Name: " << device.getInfo() << std::endl; diff --git a/src/tensor/opencl/opencl.hpp b/src/tensor/opencl/opencl.hpp index 402e12d..8841af7 100644 --- a/src/tensor/opencl/opencl.hpp +++ b/src/tensor/opencl/opencl.hpp @@ -4,59 +4,15 @@ #define CL_HPP_TARGET_OPENCL_VERSION 300 #include -#include - class OpenCL { -public: - enum class Method { - POSITIVE, - NEGATIVE, - S_ADD, - S_MULT, - T_ADD, - T_HADAMARD, - T_MULT, - }; - enum class Program { ATOMIC, SCALAR, TENSOR, FUSION }; - 
private: cl::Device device; cl::Context context; cl::CommandQueue queue; - std::unordered_map programs; - std::unordered_map programPaths = { - {Program::ATOMIC, "opencl/kernels/atomic.cl"}, - {Program::SCALAR, "opencl/kernels/scalar.cl"}, - {Program::TENSOR, "opencl/kernels/tensor.cl"}, - {Program::FUSION, "opencl/kernels/fusion.cl"}}; - std::unordered_map methodPrograms = { - {Method::POSITIVE, Program::ATOMIC}, - {Method::NEGATIVE, Program::ATOMIC}, - {Method::S_ADD, Program::SCALAR}, - {Method::S_MULT, Program::SCALAR}, - {Method::T_ADD, Program::TENSOR}, - {Method::T_HADAMARD, Program::TENSOR}, - {Method::T_MULT, Program::TENSOR}, - }; - std::unordered_map methodNames = { - {Method::POSITIVE, "positive"}, {Method::NEGATIVE, "negative"}, - {Method::S_ADD, "add"}, {Method::S_MULT, "mult"}, - {Method::T_ADD, "add"}, {Method::T_HADAMARD, "hadamard_mult"}, - {Method::T_MULT, "mult"}, - }; - - std::string readProgram(const std::string &filePath); - cl::Program compileProgram(const std::string &file); - void loadPrograms(std::string &programsBasePath); - - void initializeDevice(); - public: OpenCL(); - void init(std::string programsBasePath); - OpenCL(const OpenCL &) = delete; OpenCL &operator=(const OpenCL &) = delete; OpenCL(OpenCL &&) = delete; @@ -66,9 +22,6 @@ public: cl::Context &getContext() { return context; } const cl::CommandQueue &getQueue() { return queue; } - cl::Program &getProgram(Program program); - cl::Kernel createKernel(Method method); - void printDeviceInfo() const; }; diff --git a/src/tensor/opencl/tensor.hpp b/src/tensor/opencl/tensor.hpp index 6fb3301..8c424bb 100644 --- a/src/tensor/opencl/tensor.hpp +++ b/src/tensor/opencl/tensor.hpp @@ -2,6 +2,8 @@ #include "opencl.hpp" +#include "kernels.hpp" + #include "../tensor.hpp" #include @@ -45,6 +47,7 @@ private: public: typedef class ITensor ITensor; + typedef class Kernels Kernels; using ITensor::axes_; using ITensor::checkAxisInDim; @@ -117,7 +120,7 @@ public: using ITensor::operator-; Tensor 
operator+() const override { - cl::Kernel kernel = openCL.createKernel(OpenCL::Method::POSITIVE); + cl::Kernel kernel = Kernels::create(Kernels::Method::POSITIVE); kernel.setArg(0, *data_); openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(getSize()), @@ -126,7 +129,7 @@ public: } Tensor operator-() const override { - cl::Kernel kernel = openCL.createKernel(OpenCL::Method::NEGATIVE); + cl::Kernel kernel = Kernels::create(Kernels::Method::NEGATIVE); kernel.setArg(0, *data_); openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(getSize()), @@ -135,7 +138,7 @@ public: } Tensor &operator+=(const T scalar) override { - cl::Kernel kernel = openCL.createKernel(OpenCL::Method::S_ADD); + cl::Kernel kernel = Kernels::create(Kernels::Method::S_ADD); kernel.setArg(0, *data_); kernel.setArg(1, scalar); openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange, @@ -145,7 +148,7 @@ public: } Tensor &operator*=(const T scalar) override { - cl::Kernel kernel = openCL.createKernel(OpenCL::Method::S_MULT); + cl::Kernel kernel = Kernels::create(Kernels::Method::S_MULT); kernel.setArg(0, *data_); kernel.setArg(1, scalar); openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange, @@ -155,7 +158,7 @@ public: } Tensor &operator+=(const Tensor &other) override { - cl::Kernel kernel = openCL.createKernel(OpenCL::Method::T_ADD); + cl::Kernel kernel = Kernels::create(Kernels::Method::T_ADD); kernel.setArg(0, *data_); kernel.setArg(1, *other.getData()); openCL.getQueue().enqueueNDRangeKernel( @@ -165,7 +168,7 @@ public: } Tensor &operator*=(const Tensor &other) override { - cl::Kernel kernel = openCL.createKernel(OpenCL::Method::T_HADAMARD); + cl::Kernel kernel = Kernels::create(Kernels::Method::T_HADAMARD); kernel.setArg(0, *data_); kernel.setArg(1, *other.getData()); openCL.getQueue().enqueueNDRangeKernel( @@ -189,7 +192,7 @@ public: size_t k = shape_[axes_[1]]; size_t n = other.shape_[other.axes_[1]]; Tensor result({m, n}); - cl::Kernel 
kernel = openCL.createKernel(OpenCL::Method::T_MULT); + cl::Kernel kernel = Kernels::create(Kernels::Method::T_MULT); kernel.setArg(0, *data_); kernel.setArg(1, *other.getData()); kernel.setArg(2, *result.getData());