New tensor library: OpenCL GEMM and element-wise kernels plus a device/program-management wrapper (OpenCL class).

This commit is contained in:
2025-11-09 17:01:44 +04:00
parent bbafbf5574
commit d3ac52b8df
27 changed files with 497 additions and 1746 deletions

View File

@@ -0,0 +1,144 @@
float activate_x(float x, const int activation_type, const float alpha) {
switch (activation_type) {
case 0: // LINEAR
return x;
case 1: // SIGMOID
return 1.0f / (1.0f + exp(-x));
case 2: // TANH
return tanh(x);
case 3: // RELU
return fmax(0.0f, x);
case 4: // LEAKY_RELU
return (x > 0.0f) ? x : alpha * x;
case 5: // ELU
return (x > 0.0f) ? x : alpha * (exp(x) - 1.0f);
default:
return x;
}
}
/* Element-wise activation: output[gid] = f(input[gid]).
 * One work-item per element; launch with a 1-D global size equal to
 * the number of elements. */
__kernel void activate(__global float *input, __global float *output,
                       const int activation_type, const float alpha) {
  const int gid = get_global_id(0);
  output[gid] = activate_x(input[gid], activation_type, alpha);
}
/* Naive (untiled) GEMM with fused bias add and optional activation:
 *   C[row][col] = act(sum_k A[row][k] * B'[k][col] + bias[col])
 * where B' = B (K x N, row-major) when transpose_B == 0, and B' = B^T
 * with B stored N x K row-major otherwise.
 * Launch with a 2-D global size covering at least M x N; intended for
 * matrices too small to benefit from the tiled `mult` kernel. */
__kernel void mult_small(__global float *A, __global float *B,
                         __global float *C, __global float *bias,
                         const int activation_type, const float alpha,
                         const int M, const int N, const int K,
                         const int transpose_B) {
  const int row = get_global_id(0);
  const int col = get_global_id(1);
  if (row >= M || col >= N)
    return;
  float acc = 0.0f;
  for (int k = 0; k < K; k++) {
    /* B is K x N normally, N x K when transposed. */
    const float b_val = transpose_B ? B[col * K + k] : B[k * N + col];
    acc += A[row * K + k] * b_val;
  }
  float out = acc + bias[col];
  if (activation_type != 0) {
    out = activate_x(out, activation_type, alpha);
  }
  C[row * N + col] = out;
}
/* Tiled GEMM with fused bias add and optional activation:
 *   C[i][j] = act(sum_k A[i][k] * B'[k][j] + bias[j])
 * where B' = B (K x N row-major) when transpose_B == 0 and B' = B^T
 * (B stored N x K row-major) otherwise — the same layout contract as
 * mult_small.
 *
 * Uses 16x16 __local tiles; the kernel must be launched with a 16x16
 * work-group so local sizes match the hard-coded tile dimensions.
 *
 * BUGFIX: the transposed-B tile load previously read
 * B[global_j * N + load_i_B] with guard (load_i_B < N && global_j < K).
 * Since B^T is stored N x K, the row stride is K and the correct bounds
 * are global_j < N (output column) and load_i_B < K (reduction index);
 * the old code produced wrong results whenever N != K. The fixed load
 * mirrors mult_small's B[col * K + k] access. Unused locals
 * (load_i_A, load_j_B) were removed. */
__kernel void mult(__global float *A, __global float *B, __global float *C,
                   __global float *bias, const int activation_type,
                   const float alpha, const int M, const int N, const int K,
                   const int transpose_B) {
  const int tile_size = 16;
  int local_i = get_local_id(0);
  int local_j = get_local_id(1);
  int local_size_i = get_local_size(0);
  int local_size_j = get_local_size(1);
  int global_i = get_group_id(0) * local_size_i + local_i;
  int global_j = get_group_id(1) * local_size_j + local_j;
  __local float tile_A[16][16];
  __local float tile_B[16][16];
  float sum = 0.0f;
  int num_tiles = (K + tile_size - 1) / tile_size;
  for (int tile = 0; tile < num_tiles; tile++) {
    int tile_offset = tile * tile_size;
    /* Load tile_A: element (global_i, tile_offset + local_j) of A,
       zero-padded outside the matrix so the unrolled loop below never
       reads garbage. */
    int load_j_A = tile_offset + local_j;
    if (global_i < M && load_j_A < K) {
      tile_A[local_j][local_i] = A[global_i * K + load_j_A];
    } else {
      tile_A[local_j][local_i] = 0.0f;
    }
    /* Load tile_B; local_i walks the reduction (k) dimension. */
    int load_i_B = tile_offset + local_i;
    if (transpose_B) {
      /* B^T stored N x K: row = output column global_j,
         column = reduction index load_i_B (stride K, not N). */
      if (global_j < N && load_i_B < K) {
        tile_B[local_j][local_i] = B[global_j * K + load_i_B];
      } else {
        tile_B[local_j][local_i] = 0.0f;
      }
    } else {
      /* B stored K x N row-major. */
      if (load_i_B < K && global_j < N) {
        tile_B[local_j][local_i] = B[load_i_B * N + global_j];
      } else {
        tile_B[local_j][local_i] = 0.0f;
      }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
#pragma unroll
    for (int k = 0; k < tile_size; ++k) {
      sum += tile_A[k][local_i] * tile_B[local_j][k];
    }
    /* Second barrier: no work-item may refill the tiles while another
       is still accumulating from them. */
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  if (global_i < M && global_j < N) {
    float result = sum + bias[global_j];
    if (activation_type != 0) {
      result = activate_x(result, activation_type, alpha);
    }
    C[global_i * N + global_j] = result;
  }
}
/* Scalar multiply: B[gid] = A[gid] * scalar, one work-item per element. */
__kernel void mult_sc(__global float *A, __global float *B, float scalar) {
  const int gid = get_global_id(0);
  B[gid] = A[gid] * scalar;
}
/* Scaled element-wise addition (axpy-like): C[gid] = A[gid] + B[gid] * x.
 * One work-item per element. */
__kernel void add(__global float *A, __global float *B, __global float *C,
                  float x) {
  const int gid = get_global_id(0);
  C[gid] = A[gid] + (B[gid] * x);
}
/* Scalar addition: B[gid] = A[gid] + scalar, one work-item per element. */
__kernel void add_sc(__global float *A, __global float *B, float scalar) {
  const int gid = get_global_id(0);
  B[gid] = A[gid] + scalar;
}

View File

@@ -0,0 +1,121 @@
#include "opencl.hpp"
// Reads the whole file at `filePath` into a string (binary mode, so
// kernel sources are returned byte-for-byte).
// Throws std::runtime_error when the file cannot be opened.
std::string OpenCL::readProgram(const std::string &filePath) {
  std::ifstream in(filePath, std::ios::binary);
  if (!in.is_open()) {
    throw std::runtime_error("Cannot open file: " + filePath);
  }
  std::ostringstream contents;
  contents << in.rdbuf();
  return contents.str();
}
// Compiles the OpenCL kernel source in `file` for the selected device.
// On a build failure the compiler log is written to stderr and the
// cl::Error is rethrown to the caller.
cl::Program OpenCL::compileProgram(const std::string &file) {
  const std::string source = readProgram(file);
  cl::Program prog(context, source);
  try {
    prog.build({device});
  } catch (cl::Error &err) {
    std::cerr << "Build log:\n"
              << prog.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << std::endl;
    throw;
  }
  return prog;
}
// Compiles every kernel source listed in programPaths and caches the
// resulting cl::Program objects, keyed by their Program id.
void OpenCL::loadPrograms() {
  for (const auto &[id, path] : programPaths) {
    programs[id] = compileProgram(path);
    std::cout << "Loaded program: " << path << std::endl;
  }
}
void OpenCL::initializeDevice() {
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if (platforms.empty()) {
throw std::runtime_error("No OpenCL platforms found");
}
std::vector<cl::Device> devices;
bool deviceFound = false;
for (const auto &platform : platforms) {
try {
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
if (!devices.empty()) {
deviceFound = true;
break;
}
} catch (const cl::Error &) {
continue;
}
}
if (!deviceFound) {
for (const auto &platform : platforms) {
try {
platform.getDevices(CL_DEVICE_TYPE_CPU, &devices);
if (!devices.empty()) {
deviceFound = true;
break;
}
} catch (const cl::Error &) {
continue;
}
}
}
if (!deviceFound) {
throw std::runtime_error("No suitable OpenCL devices found");
}
device = devices[0];
context = cl::Context(device);
queue = cl::CommandQueue(context, device);
std::cout << "Using device: " << device.getInfo<CL_DEVICE_NAME>()
<< "\nPlatform: " << platforms[0].getInfo<CL_PLATFORM_NAME>()
<< "\nCompute units: "
<< device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()
<< "\nGlobal memory: "
<< device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / (1024 * 1024)
<< " MB" << std::endl;
}
// Brings up the OpenCL runtime: picks a device/context/queue, then
// compiles all registered kernel programs. Any cl::Error is logged
// together with its numeric error code and rethrown.
OpenCL::OpenCL() {
  try {
    initializeDevice();
    loadPrograms();
  } catch (const cl::Error &err) {
    std::cerr << "OpenCL error: " << err.what() << " (" << err.err() << ")"
              << std::endl;
    throw;
  }
}
// Returns the compiled program registered under `program`.
// Throws std::invalid_argument when it was never loaded.
cl::Program &OpenCL::getProgram(Program program) {
  const auto found = programs.find(program);
  if (found == programs.end()) {
    throw std::invalid_argument("Program not loaded: " +
                                std::to_string(static_cast<int>(program)));
  }
  return found->second;
}
// Prints a human-readable summary of the selected device to stdout,
// one "Label: value" line at a time.
void OpenCL::printDeviceInfo() const {
  // Helper: emits one labelled line, flushed like the original's
  // per-line std::endl.
  const auto line = [](const char *label, const auto &value,
                       const char *suffix = "") {
    std::cout << label << value << suffix << std::endl;
  };
  std::cout << "=== OpenCL Device Info ===" << std::endl;
  line("Name: ", device.getInfo<CL_DEVICE_NAME>());
  line("Vendor: ", device.getInfo<CL_DEVICE_VENDOR>());
  line("Version: ", device.getInfo<CL_DEVICE_VERSION>());
  line("Compute Units: ", device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>());
  line("Global Memory: ",
       device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / (1024 * 1024), " MB");
  line("Local Memory: ", device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() / 1024,
       " KB");
  line("Max Work Group Size: ",
       device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
}

View File

@@ -0,0 +1,47 @@
#pragma once
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 300
#include <CL/opencl.hpp>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <unordered_map>
// Owns the process-wide OpenCL state: the selected compute device, its
// context and command queue, and the compiled kernel programs.
// Non-copyable and non-movable — construct one instance and share it
// by reference.
class OpenCL {
public:
// Identifiers for the kernel source files this wrapper manages.
enum class Program { TENSOR };
private:
cl::Device device;     // device picked by initializeDevice()
cl::Context context;   // context bound to `device`
cl::CommandQueue queue; // command queue on `context`/`device`
// Compiled programs, keyed by Program id; filled by loadPrograms().
std::unordered_map<Program, cl::Program> programs;
// Kernel source locations, resolved relative to the working directory.
std::unordered_map<Program, std::string> programPaths = {
{Program::TENSOR, "./opencl/kernels/tensor.cl"}};
// Reads a whole file into a string; throws std::runtime_error on failure.
std::string readProgram(const std::string &filePath);
// Compiles one kernel source file; logs the build log and rethrows on error.
cl::Program compileProgram(const std::string &file);
// Compiles and caches every entry of programPaths.
void loadPrograms();
// Selects a device (GPU preferred, CPU fallback) and creates context/queue.
void initializeDevice();
public:
// Initializes the device and compiles all programs; rethrows cl::Error.
OpenCL();
OpenCL(const OpenCL &) = delete;
OpenCL &operator=(const OpenCL &) = delete;
OpenCL(OpenCL &&) = delete;
OpenCL &operator=(OpenCL &&) = delete;
cl::Device &getDevice() { return device; }
cl::Context &getContext() { return context; }
const cl::CommandQueue &getQueue() { return queue; }
// Looks up a compiled program; throws std::invalid_argument if not loaded.
cl::Program &getProgram(Program program);
// Prints a summary of the selected device to stdout.
void printDeviceInfo() const;
};