Split headers and logic

2026-04-04 04:40:40 +04:00 · 2025-11-17 16:03:32 +04:00
parent bbd9c67c96
commit d7d93999a4
18 changed files with 589 additions and 394 deletions
--- a/src/tensor/opencl/kernels/atomic.cl
+++ b/src/tensor/opencl/kernels/atomic.cl
@@ -0,0 +1,34 @@
+__kernel void positive(__global float *A, __global float *B) { // Element-wise unary plus: writes +A[i] into B[i].
+  int i = get_global_id(0); // element index = this work-item's global id in dimension 0
+  B[i] = +A[i]; // no bounds check here; assumes the launch size never exceeds the buffer lengths — TODO confirm in host code
+}
+
+__kernel void negative(__global float *A, __global float *B) { // Element-wise negation: writes -A[i] into B[i].
+  int i = get_global_id(0); // element index = this work-item's global id in dimension 0
+  B[i] = -A[i]; // no bounds check here; assumes the launch size never exceeds the buffer lengths — TODO confirm in host code
+}
+
+
+float activate_x(float x, const int activation_type, const float alpha) { // Scalar helper: returns the activation selected by activation_type applied to x.
+  switch (activation_type) {
+  case 0: // LINEAR
+    return x; // identity
+  case 1: // SIGMOID
+    return 1.0f / (1.0f + exp(-x)); // logistic function 1 / (1 + e^-x)
+  case 2: // TANH
+    return tanh(x);
+  case 3: // RELU
+    return fmax(0.0f, x); // max(0, x)
+  case 4: // LEAKY_RELU
+    return (x > 0.0f) ? x : alpha * x; // alpha is the slope applied when x is not positive
+  case 5: // ELU
+    return (x > 0.0f) ? x : alpha * (exp(x) - 1.0f); // alpha scales the exponential branch taken when x is not positive
+  default:
+    return x; // unrecognised activation_type values fall back to the identity, same as LINEAR
+  }
+}
+__kernel void activate(__global float *input, __global float *output, // Element-wise activation: output[i] = activate_x(input[i], activation_type, alpha).
+                       const int activation_type, const float alpha) { // both values are forwarded unchanged to activate_x (alpha is only read by its cases 4 and 5)
+  int i = get_global_id(0); // element index = this work-item's global id in dimension 0
+  output[i] = activate_x(input[i], activation_type, alpha); // no bounds check here; assumes the launch size never exceeds the buffer lengths — TODO confirm
+}
--- a/src/tensor/opencl/kernels/fusion.cl
+++ b/src/tensor/opencl/kernels/fusion.cl
--- a/src/tensor/opencl/kernels/scalar.cl
+++ b/src/tensor/opencl/kernels/scalar.cl
@@ -0,0 +1,9 @@
+__kernel void add(__global float *A, __global float *B, float scalar) { // Element-wise scalar add: B[i] = A[i] + scalar.
+  int i = get_global_id(0); // element index = this work-item's global id in dimension 0
+  B[i] = A[i] + scalar; // no bounds check here; assumes the launch size never exceeds the buffer lengths — TODO confirm in host code
+}
+
+__kernel void mult(__global float *A, __global float *B, float scalar) { // Element-wise scalar multiply: B[i] = A[i] * scalar.
+  int i = get_global_id(0); // element index = this work-item's global id in dimension 0
+  B[i] = A[i] * scalar; // no bounds check here; assumes the launch size never exceeds the buffer lengths — TODO confirm in host code
+}
--- a/src/tensor/opencl/kernels/tensor.cl
+++ b/src/tensor/opencl/kernels/tensor.cl
@@ -1,4 +1,15 @@
-float activate_x(float x, const int activation_type, const float alpha) {
+__kernel void add(__global float *A, __global float *B, __global float *C, // Element-wise scaled add of two buffers: C[i] = A[i] + B[i] * x.
+                  float x) { // x is the factor applied to B before the addition
+  int i = get_global_id(0); // element index = this work-item's global id in dimension 0
+  C[i] = A[i] + (B[i] * x); // no bounds check here; assumes the launch size never exceeds the buffer lengths — TODO confirm in host code
+}
+__kernel void mult(__global float *A, __global float *B, __global float *C, // Element-wise scaled product of two buffers: C[i] = A[i] * (B[i] * x).
+                  float x) { // NOTE(review): a later hunk header in this diff shows tensor.cl already has a kernel named mult (the one whose tail adds bias[global_j]); OpenCL C has no overloading, so the two presumably conflict when the program is built — confirm and rename one
+  int i = get_global_id(0); // element index = this work-item's global id in dimension 0
+  C[i] = A[i] * (B[i] * x); // no bounds check here; assumes the launch size never exceeds the buffer lengths — TODO confirm in host code
+}
+
+float activate(float x, const int activation_type, const float alpha) {
  switch (activation_type) {
  case 0: // LINEAR
    return x;
@@ -17,12 +28,6 @@ float activate_x(float x, const int activation_type, const float alpha) {
  }
 }

-__kernel void activate(__global float *input, __global float *output,
-                       const int activation_type, const float alpha) {
-  int i = get_global_id(0);
-  output[i] = activate_x(input[i], activation_type, alpha);
-}
-
 __kernel void mult_small(__global float *A, __global float *B,
                         __global float *C, __global float *bias,
                         const int activation_type, const float alpha,
@@ -48,7 +53,7 @@ __kernel void mult_small(__global float *A, __global float *B,

    float result = sum + bias[col];
    if (activation_type != 0) {
-      result = activate_x(result, activation_type, alpha);
+      result = activate(result, activation_type, alpha);
    }
    C[row * N + col] = result;
  }
@@ -121,24 +126,9 @@ __kernel void mult(__global float *A, __global float *B, __global float *C,
  if (global_i < M && global_j < N) {
    float result = sum + bias[global_j];
    if (activation_type != 0) {
-      result = activate_x(result, activation_type, alpha);
+      result = activate(result, activation_type, alpha);
    }
    C[global_i * N + global_j] = result;
  }
 }

-__kernel void mult_sc(__global float *A, __global float *B, float scalar) {
-  int i = get_global_id(0);
-  B[i] = A[i] * scalar;
-}
-
-__kernel void add(__global float *A, __global float *B, __global float *C,
-                  float x) {
-  int i = get_global_id(0);
-  C[i] = A[i] + (B[i] * x);
-}
-
-__kernel void add_sc(__global float *A, __global float *B, float scalar) {
-  int i = get_global_id(0);
-  B[i] = A[i] + scalar;
-}
--- a/src/tensor/opencl/opencl.cpp
+++ b/src/tensor/opencl/opencl.cpp
@@ -1,5 +1,10 @@
 #include "opencl.hpp"

+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+
 std::string OpenCL::readProgram(const std::string &filePath) {
  std::ifstream file(filePath, std::ios::binary);
  if (!file.is_open()) {
@@ -118,4 +123,4 @@ void OpenCL::printDeviceInfo() const {
            << std::endl;
  std::cout << "Max Work Group Size: "
            << device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() << std::endl;
-}
+}
--- a/src/tensor/opencl/opencl.hpp
+++ b/src/tensor/opencl/opencl.hpp
@@ -4,16 +4,11 @@
 #define CL_HPP_TARGET_OPENCL_VERSION 300
 #include <CL/opencl.hpp>

-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <sstream>
-#include <stdexcept>
 #include <unordered_map>

 class OpenCL {
 public:
-  enum class Program { TENSOR };
+  enum class Program { ATOMIC, SCALAR, TENSOR, FUSION };

 private:
  cl::Device device;
@@ -22,7 +17,10 @@ private:

  std::unordered_map<Program, cl::Program> programs;
  std::unordered_map<Program, std::string> programPaths = {
-      {Program::TENSOR, "./opencl/kernels/tensor.cl"}};
+      {Program::ATOMIC, "./opencl/kernels/atomic.cl"},
+      {Program::SCALAR, "./opencl/kernels/scalar.cl"},
+      {Program::TENSOR, "./opencl/kernels/tensor.cl"},
+      {Program::FUSION, "./opencl/kernels/fusion.cl"}};

  std::string readProgram(const std::string &filePath);
  cl::Program compileProgram(const std::string &file);
--- a/src/tensor/opencl/tensor.hpp
+++ b/src/tensor/opencl/tensor.hpp
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "opencl.hpp"
+
+#include "../tensor.hpp"
--- a/src/tensor/opencl/tensor.tpp
+++ b/src/tensor/opencl/tensor.tpp