From 2db52adf0f52260d740003cd11c9454a35a710d0 Mon Sep 17 00:00:00 2001 From: StepanovPlaton Date: Tue, 25 Nov 2025 23:15:43 +0400 Subject: [PATCH] Check --- src/tensor/Makefile | 1 + src/tensor/main.cpp | 11 +- src/tensor/opencl/kernels.hpp | 285 +++++++++++++++++---------- src/tensor/opencl/opencl.cpp | 61 ++++++ src/tensor/opencl/tensor.hpp | 32 ++-- src/tensor/pybind.cpp | 6 - src/tensor/tensor.pyi | 351 ---------------------------------- 7 files changed, 270 insertions(+), 477 deletions(-) delete mode 100644 src/tensor/tensor.pyi diff --git a/src/tensor/Makefile b/src/tensor/Makefile index f5fe52c..455ef42 100644 --- a/src/tensor/Makefile +++ b/src/tensor/Makefile @@ -34,6 +34,7 @@ OPENCL_LIB_PATH = -L"A:/Programs/OpenCL/lib" OPENCL_LIB = -lOpenCL .DEFAULT_GOAL := cpu +.PHONY: cpu opencl cpu_module opencl_module clean $(BUILD_DIR): $(MKDIR) $(BUILD_DIR) diff --git a/src/tensor/main.cpp b/src/tensor/main.cpp index c7501a0..180a37c 100644 --- a/src/tensor/main.cpp +++ b/src/tensor/main.cpp @@ -1,7 +1,6 @@ #ifdef USE_OPENCL #include "opencl/tensor.hpp" OpenCL openCL; -// TODO: GENERIC KERNELS // TODO: Scalar mult #elif USE_CPU #include "cpu/tensor.hpp" @@ -21,19 +20,21 @@ public: auto end = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(end - start); - std::cout << operation << ": " << duration.count() << " μs\n"; + std::cout << operation << ": " << duration.count() << " ns\n"; } }; int main() { #ifdef USE_OPENCL - openCL.init("./"); + openCL.printDeviceInfo(); #endif Tensor a = Tensor({4096 * 2, 4096 * 2}, 1); Tensor b = Tensor({4096 * 2, 4096 * 2}, 1); + Profiler::measure("Matrix multiplication", [&]() { + auto result = a % b; + std::cout << result.toString(); + }); - Profiler::measure("Matrix multiplication", [&]() { auto result = a % b; }); - std::cout << a.toString(); return 0; } diff --git a/src/tensor/opencl/kernels.hpp b/src/tensor/opencl/kernels.hpp index 196a564..faf9a6f 100644 --- a/src/tensor/opencl/kernels.hpp 
+++ b/src/tensor/opencl/kernels.hpp @@ -1,10 +1,14 @@ -#include "opencl.hpp" #include + +#include "opencl.hpp" + #include +#include +#include #include #include -template class Kernels { +template class Kernels { public: enum class Vector { type2 = 2, @@ -21,136 +25,211 @@ public: T_HADAMARD, T_MULT, }; - constexpr const static std::string type = typeid(T).name(); - - // TODO: get native vector size - static Vector vector = Vector::type8; private: - static std::string unaryOperation(std::string name, std::string operation) { - return std::format( + constexpr std::string getTypeName() { return "unknown"; } + Vector vector; + std::string configuration; + + std::string format(std::string tmp, + std::unordered_map args) { + std::string result(tmp); + for (const auto &[key, value] : args) { + std::string placeholder = "{" + key + "}"; + size_t pos = 0; + while ((pos = result.find(placeholder, pos)) != std::string::npos) { + result.replace(pos, placeholder.length(), value); + pos += value.length(); + } + } + return result; + } + + std::string unaryOperation(std::string name, std::string operation) { + return format( R"( - __kernel void {method}(__global {type}* A, int len) {{ + __kernel void {method}(__global type* A, int len) { int gid = get_global_id(0); - int base = gid * {vector}; - if (base + ({vector}-1) < len) {{ - {type}{vector} data = vload{vector}(gid, A); - vstore{vector}({operation}data, gid, A); - }} else {{ - for (int i = 0; i < {vec_size}; i++) {{ + int base = gid * WIDTH; + if (base + WIDTH <= len) { + typeX data = vloadX(gid, A); + vstoreX({operation}data, gid, A); + } else { + for (int i = 0; i < WIDTH; i++) { int idx = base + i; if (idx < len) A[idx] = {operation}A[idx]; - }} - }} - }} - )", - std::make_format_args(std::make_pair("method", name), - std::make_pair("vector", vector), - std::make_pair("type", type), - std::make_pair("operation", operation))); + } + } + })", + {{"method", name}, {"operation", operation}}); } - static std::string 
scalarOperation(std::string name, std::string operation) { - return std::format( + std::string scalarOperation(std::string name, std::string operation) { + return format( R"( - __kernel void {method}(__global {type}* A, int len, {type} scalar) {{ + __kernel void {method}(__global type* A, type scalar, int len) { int gid = get_global_id(0); - int base = gid * {vector}; - if (base + ({vector}-1) < len) {{ - {type}{vector} data = vload{vector}(gid, A); + int base = gid * WIDTH; + if (base + WIDTH <= len) { + typeX data = vloadX(gid, A); data = data {operation} scalar; - vstore{vector}(data, gid, A); - }} else {{ - for (int i = 0; i < {vec_size}; i++) {{ + vstoreX(data, gid, A); + } else { + for (int i = 0; i < WIDTH; i++) { int idx = base + i; if (idx < len) A[idx] = A[idx] {operation} scalar; - }} - }} - }} - )", - std::make_format_args(std::make_pair("method", name), - std::make_pair("vector", vector), - std::make_pair("type", type), - std::make_pair("operation", operation))); + } + } + })", + {{"method", name}, {"operation", operation}}); } - static std::string binaryOperation(std::string name, std::string operation) { - return std::format( + std::string binaryOperation(std::string name, std::string operation) { + return format( R"( - __kernel void {method}(__global {type}* A, __global {type}* B, int len) {{ + __kernel void {method}(__global type* A, __global type* B, int len) { int gid = get_global_id(0); - int base = gid * {vector}; - if (base + ({vector}-1) < len) {{ - {type}{vector} dataA = vload{vector}(gid, A); - {type}{vector} dataB = vload{vector}(gid, B); - vstore{vector}(dataA {operation} dataB, gid, A); - }} else {{ - for (int i = 0; i < {vector}; i++) {{ + int base = gid * WIDTH; + if (base + WIDTH <= len) { + typeX dataA = vloadX(gid, A); + typeX dataB = vloadX(gid, B); + vstoreX(dataA {operation} dataB, gid, A); + } else { + for (int i = 0; i < WIDTH; i++) { int idx = base + i; if (idx < len) A[idx] = A[idx] {operation} B[idx]; - }} - }} - }} - )", -
std::make_format_args(std::make_pair("method", name), - std::make_pair("vector", vector), - std::make_pair("type", type), - std::make_pair("operation", operation))); + } + } + })", + {{"method", name}, {"operation", operation}}); } - static std::unordered_map> - programs = { - {Method::POSITIVE, {unaryOperation("positive", "+"), "positive"}}, - {Method::NEGATIVE, {unaryOperation("negative", "-")}, "negative"}, + std::string matrixMult(std::string name) { + return format( + R"( + #define TILE_SIZE (WIDTH*4) + __kernel void mult(const __global typeX* A, + const __global typeX* B, + __global typeX* C, const int M, const int N, const int K) { + const int row = get_local_id(0); + const int col = get_local_id(1); + const int globalRow = (TILE_SIZE/WIDTH)*get_group_id(0) + row; + const int globalCol = TILE_SIZE*get_group_id(1) + col; + __local typeX Asub[TILE_SIZE][TILE_SIZE/WIDTH]; + __local typeX Bsub[TILE_SIZE][TILE_SIZE/WIDTH]; + typeX acc = 0; + const int numTiles = K/TILE_SIZE; + for (int tile = 0; tile < numTiles; tile++) { + const int tiledRow = (TILE_SIZE/WIDTH)*tile + row; + const int tiledCol = TILE_SIZE*tile + col; + Asub[col][row] = A[tiledCol*(M/WIDTH) + globalRow]; + Bsub[col][row] = B[globalCol*(K/WIDTH) + tiledRow]; + barrier(CLK_LOCAL_MEM_FENCE); + typeX vecA, vecB; + type valB; + for (int k = 0; k < TILE_SIZE/WIDTH; k++) { + vecB = Bsub[col][k]; + for (int w = 0; w < WIDTH; w++) { + vecA = Asub[WIDTH*k + w][row]; + valB = vecB[w]; + for (int i = 0; i < WIDTH; i++) + acc[i] += vecA[i] * valB; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + C[globalCol*(M/WIDTH) + globalRow] = acc; + } + )", + {{"method", name}}); + } - {Method::S_ADD, {scalarOperation("add", "+")}, "add"}, - {Method::S_MULT, {scalarOperation("mult", "*")}, "mult"}, + std::unordered_map> programs = { + {Method::POSITIVE, {unaryOperation("positive", "+"), "positive"}}, + {Method::NEGATIVE, {unaryOperation("negative", "-"), "negative"}}, - {Method::T_ADD, {binaryOperation("add", "+")}, "add"},
- {Method::T_HADAMARD, - {binaryOperation("hadamard_mult", "*")}, - "hadamard_mult"}, - {Method::T_MULT, {"", "mult"}}, + {Method::S_ADD, {scalarOperation("add", "+"), "add"}}, + {Method::S_MULT, {scalarOperation("mult", "*"), "mult"}}, + + {Method::T_ADD, {binaryOperation("add", "+"), "add"}}, + {Method::T_HADAMARD, + {binaryOperation("hadamard_mult", "*"), "hadamard_mult"}}, + + {Method::T_MULT, {matrixMult("mult"), "mult"}}, }; - static inline std::unordered_map compiledPrograms; - static inline std::mutex compileMutex; + std::unordered_map compiledPrograms; public: - static cl::Kernel create(Method method) { - std::lock_guard lock(compileMutex); + Kernels(Vector vec = Vector::type4) : vector(vec) { + std::string extensions = openCL.getDevice().getInfo(); + if (extensions.find("cl_khr_fp16") != std::string::npos) + configuration = R"( + #pragma OPENCL EXTENSION cl_khr_fp16 : enable + typedef half _half; + typedef half2 _half2; + typedef half4 _half4; + typedef half8 _half8; + typedef half16 _half16; + )"; + else + configuration = R"( + typedef float _half; + typedef float2 _half2; + typedef float4 _half4; + typedef float8 _half8; + typedef float16 _half16; + )"; + configuration += format( + R"( + typedef {type} type; + typedef {type}{vector} typeX; + #define WIDTH {vector} + #define vloadX vload{vector} + #define vstoreX vstore{vector} + )", + {{"type", getTypeName()}, {"vector", std::to_string((int)vector)}}); - auto cache = compiledPrograms.find(method); - if (cache != compiledPrograms.end()) { - const auto &programName = std::get<1>(programs[method]); - return cl::Kernel(cache->second, programName.c_str()); - } - - auto program = programs.find(method); - if (program == programs.end()) - throw std::runtime_error("Unknown method: " + - std::to_string(static_cast(method))); - const auto &[sourceCode, kernelName] = program->second; - - try { - cl::Program::Sources sources; - sources.push_back({sourceCode.c_str(), sourceCode.length()}); - cl::Program 
program(openCL.getContext(), sources); - program.build({openCL.getDevice()}); - compiledPrograms[method] = program; - return cl::Kernel(program, kernelName.c_str()); - - } catch (const cl::Error &e) { - if (e.err() == CL_BUILD_PROGRAM_FAILURE) { - cl::Program program(openCL.getContext(), - {sourceCode.c_str(), sourceCode.length()}); - auto buildInfo = - program.getBuildInfo(openCL.getDevice()); - throw std::runtime_error( - "OpenCL compilation failed: " + std::string(e.what()) + - "\nBuild log:\n" + buildInfo); + for (const auto &[method, programInfo] : programs) { + const auto &[sourceCode, kernelName] = programInfo; + if (!sourceCode.empty()) { + cl::Program program(openCL.getContext(), configuration + sourceCode); + try { + program.build({openCL.getDevice()}); + compiledPrograms[method] = program; + } catch (const cl::Error &e) { + std::cerr << "OpenCL compilation error for method " + << static_cast(method) << ": " << e.what() + << std::endl; + std::string buildLog = + program.getBuildInfo(openCL.getDevice()); + std::cerr << "Build log for method " << static_cast(method) + << ":" << std::endl; + std::cerr << buildLog << std::endl; + } } - throw std::runtime_error("OpenCL error: " + std::string(e.what())); } } + + cl::Kernel create(Method method) { + auto it = compiledPrograms.find(method); + if (it == compiledPrograms.end()) + throw std::runtime_error("Program for method not found or not compiled"); + const auto &kernelName = std::get<1>(programs[method]); + return cl::Kernel(it->second, kernelName.c_str()); + } }; + +#define SPECIALIZE_KERNELS_TYPE(type, name) \ + template <> constexpr std::string Kernels::getTypeName() { \ + return name; \ + } +SPECIALIZE_KERNELS_TYPE(char, "char") +SPECIALIZE_KERNELS_TYPE(short, "short") +SPECIALIZE_KERNELS_TYPE(int, "int") +SPECIALIZE_KERNELS_TYPE(long, "long") +SPECIALIZE_KERNELS_TYPE(float, "float") +SPECIALIZE_KERNELS_TYPE(double, "double") + +typedef cl_half half; +SPECIALIZE_KERNELS_TYPE(half, "_half") \ No newline at 
end of file diff --git a/src/tensor/opencl/opencl.cpp b/src/tensor/opencl/opencl.cpp index c16b4f0..6967b1f 100644 --- a/src/tensor/opencl/opencl.cpp +++ b/src/tensor/opencl/opencl.cpp @@ -71,4 +71,65 @@ void OpenCL::printDeviceInfo() const { << std::endl; std::cout << "Max Work Group Size: " << device.getInfo() << std::endl; + std::string extensions = device.getInfo(); + + std::cout << "Optimal vector sizes:" << std::endl; + try { + cl_uint short_native = + device.getInfo(); + cl_uint short_preferred = + device.getInfo(); + std::cout << " short: native=" << short_native + << ", preferred=" << short_preferred << std::endl; + } catch (const cl::Error &e) { + std::cout << " short: N/A (error: " << e.what() << ")" << std::endl; + } + try { + cl_uint int_native = device.getInfo(); + cl_uint int_preferred = + device.getInfo(); + std::cout << " int: native=" << int_native + << ", preferred=" << int_preferred << std::endl; + } catch (const cl::Error &e) { + std::cout << " int: N/A (error: " << e.what() << ")" << std::endl; + } + try { + if (extensions.find("cl_khr_fp16") != std::string::npos) { + cl_uint half_native = + device.getInfo(); + cl_uint half_preferred = + device.getInfo(); + std::cout << " half: native=" << half_native + << ", preferred=" << half_preferred << std::endl; + } else { + std::cout << " half: not supported" << std::endl; + } + } catch (const cl::Error &e) { + std::cout << " half: N/A (error: " << e.what() << ")" << std::endl; + } + try { + cl_uint float_native = + device.getInfo(); + cl_uint float_preferred = + device.getInfo(); + std::cout << " float: native=" << float_native + << ", preferred=" << float_preferred << std::endl; + } catch (const cl::Error &e) { + std::cout << " float: N/A (error: " << e.what() << ")" << std::endl; + } + try { + if (extensions.find("cl_khr_fp64") != std::string::npos || + device.getInfo().find("1.0") == std::string::npos) { + cl_uint double_native = + device.getInfo(); + cl_uint double_preferred = + device.getInfo(); 
+ std::cout << " double: native=" << double_native + << ", preferred=" << double_preferred << std::endl; + } else { + std::cout << " double: not supported" << std::endl; + } + } catch (const cl::Error &e) { + std::cout << " double: N/A (error: " << e.what() << ")" << std::endl; + } } diff --git a/src/tensor/opencl/tensor.hpp b/src/tensor/opencl/tensor.hpp index 8c424bb..2c03f0f 100644 --- a/src/tensor/opencl/tensor.hpp +++ b/src/tensor/opencl/tensor.hpp @@ -45,9 +45,13 @@ private: all(other.getEvent()), &event_); } + static cl::Kernel createKernel(Kernels::Method method) { + static Kernels kernels(Kernels::Vector::type4); + return kernels.create(method); + } + public: typedef class ITensor ITensor; - typedef class Kernels Kernels; using ITensor::axes_; using ITensor::checkAxisInDim; @@ -105,7 +109,7 @@ public: ITensor::operator=(std::move(other)); data_ = other.data_; event_ = other.event_; - other.data = nullptr; + other.data_ = nullptr; return *this; } ~Tensor() { @@ -120,8 +124,9 @@ public: using ITensor::operator-; Tensor operator+() const override { - cl::Kernel kernel = Kernels::create(Kernels::Method::POSITIVE); + cl::Kernel kernel = createKernel(Kernels::Method::POSITIVE); kernel.setArg(0, *data_); + kernel.setArg(1, (int)getSize()); openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(getSize()), cl::NullRange, all(event_), &event_); @@ -129,8 +134,9 @@ public: } Tensor operator-() const override { - cl::Kernel kernel = Kernels::create(Kernels::Method::NEGATIVE); + cl::Kernel kernel = createKernel(Kernels::Method::NEGATIVE); kernel.setArg(0, *data_); + kernel.setArg(1, (int)getSize()); openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(getSize()), cl::NullRange, all(event_), &event_); @@ -138,9 +144,10 @@ public: } Tensor &operator+=(const T scalar) override { - cl::Kernel kernel = Kernels::create(Kernels::Method::S_ADD); + cl::Kernel kernel = createKernel(Kernels::Method::S_ADD); kernel.setArg(0, *data_); 
kernel.setArg(1, scalar); + kernel.setArg(2, (int)getSize()); openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(getSize()), cl::NullRange, all(event_), &event_); @@ -148,9 +155,10 @@ public: } Tensor &operator*=(const T scalar) override { - cl::Kernel kernel = Kernels::create(Kernels::Method::S_MULT); + cl::Kernel kernel = createKernel(Kernels::Method::S_MULT); kernel.setArg(0, *data_); kernel.setArg(1, scalar); + kernel.setArg(2, (int)getSize()); openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(getSize()), cl::NullRange, all(event_), &event_); @@ -158,9 +166,10 @@ public: } Tensor &operator+=(const Tensor &other) override { - cl::Kernel kernel = Kernels::create(Kernels::Method::T_ADD); + cl::Kernel kernel = createKernel(Kernels::Method::T_ADD); kernel.setArg(0, *data_); kernel.setArg(1, *other.getData()); + kernel.setArg(2, (int)getSize()); openCL.getQueue().enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(getSize()), cl::NullRange, all(event_, other.event_), &event_); @@ -168,9 +177,10 @@ public: } Tensor &operator*=(const Tensor &other) override { - cl::Kernel kernel = Kernels::create(Kernels::Method::T_HADAMARD); + cl::Kernel kernel = createKernel(Kernels::Method::T_HADAMARD); kernel.setArg(0, *data_); kernel.setArg(1, *other.getData()); + kernel.setArg(2, (int)getSize()); openCL.getQueue().enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(getSize()), cl::NullRange, all(event_, other.event_), &event_); @@ -192,16 +202,14 @@ public: size_t k = shape_[axes_[1]]; size_t n = other.shape_[other.axes_[1]]; Tensor result({m, n}); - cl::Kernel kernel = Kernels::create(Kernels::Method::T_MULT); + cl::Kernel kernel = createKernel(Kernels::Method::T_MULT); kernel.setArg(0, *data_); kernel.setArg(1, *other.getData()); kernel.setArg(2, *result.getData()); kernel.setArg(3, (int)m); kernel.setArg(4, (int)n); kernel.setArg(5, (int)k); - cl::NDRange global_size( - ((m + TILE_SIZE * VEC_SIZE - 1) / (TILE_SIZE * 
VEC_SIZE)) * TILE_SIZE, - ((n + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE); + cl::NDRange global_size(m / VEC_SIZE, n); cl::NDRange local_size(TILE_SIZE / VEC_SIZE, TILE_SIZE); openCL.getQueue().enqueueNDRangeKernel( kernel, cl::NullRange, global_size, local_size, diff --git a/src/tensor/pybind.cpp b/src/tensor/pybind.cpp index 54ed0e3..523a081 100644 --- a/src/tensor/pybind.cpp +++ b/src/tensor/pybind.cpp @@ -125,12 +125,6 @@ PYBIND11_MODULE(tensor, m) { register_tensor(m, "Matrix"); register_tensor(m, "Tensor3"); -#ifdef USE_OPENCL - m.def("init", [](const std::string &programsBasePath) { - openCL.init(programsBasePath); - }); -#endif - #ifndef USE_OPENCL register_tensor(m, "dScalar"); register_tensor(m, "dVector"); diff --git a/src/tensor/tensor.pyi b/src/tensor/tensor.pyi deleted file mode 100644 index d740736..0000000 --- a/src/tensor/tensor.pyi +++ /dev/null @@ -1,351 +0,0 @@ -""" -Tensor math library -""" -from __future__ import annotations -import collections.abc -import typing -__all__: list[str] = ['CPU', 'MODE', 'Matrix', 'OPENCL', 'PLATFORM', 'Scalar', 'Tensor3', 'Vector', 'init'] -class Matrix: - @typing.overload - def __add__(self, arg0: Matrix) -> Matrix: - ... - @typing.overload - def __add__(self, arg0: typing.SupportsFloat) -> Matrix: - ... - @typing.overload - def __iadd__(self, arg0: Matrix) -> Matrix: - ... - @typing.overload - def __iadd__(self, arg0: typing.SupportsFloat) -> Matrix: - ... - @typing.overload - def __imul__(self, arg0: Matrix) -> Matrix: - ... - @typing.overload - def __imul__(self, arg0: typing.SupportsFloat) -> Matrix: - ... - @typing.overload - def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(2)"]) -> None: - ... - @typing.overload - def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(2)"], arg1: typing.SupportsFloat) -> None: - ... 
- @typing.overload - def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(2)"], arg1: collections.abc.Sequence[typing.SupportsFloat]) -> None: - ... - @typing.overload - def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(2)"], arg1: typing.SupportsFloat, arg2: typing.SupportsFloat) -> None: - ... - @typing.overload - def __isub__(self, arg0: Matrix) -> Matrix: - ... - @typing.overload - def __isub__(self, arg0: typing.SupportsFloat) -> Matrix: - ... - def __itruediv__(self, arg0: typing.SupportsFloat) -> Matrix: - ... - def __matmul__(self, arg0: Matrix) -> Matrix: - ... - @typing.overload - def __mul__(self, arg0: Matrix) -> Matrix: - ... - @typing.overload - def __mul__(self, arg0: typing.SupportsFloat) -> Matrix: - ... - def __neg__(self) -> Matrix: - ... - def __pos__(self) -> Matrix: - ... - def __radd__(self, arg0: typing.SupportsFloat) -> Matrix: - ... - def __repr__(self) -> str: - ... - def __rmul__(self, arg0: typing.SupportsFloat) -> Matrix: - ... - def __rsub__(self, arg0: typing.SupportsFloat) -> Matrix: - ... - @typing.overload - def __sub__(self, arg0: Matrix) -> Matrix: - ... - @typing.overload - def __sub__(self, arg0: typing.SupportsFloat) -> Matrix: - ... - def __truediv__(self, arg0: typing.SupportsFloat) -> Matrix: - ... - def get_axes(self) -> typing.Annotated[list[int], "FixedSize(2)"]: - ... - def get_shape(self) -> typing.Annotated[list[int], "FixedSize(2)"]: - ... - def get_size(self) -> int: - ... - def t(self) -> Matrix: - ... - @typing.overload - def transpose(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(2)"]) -> Matrix: - ... - @typing.overload - def transpose(self, arg0: typing.SupportsInt, arg1: typing.SupportsInt) -> Matrix: - ... 
-class PLATFORM: - """ - Members: - - CPU - - OPENCL - """ - CPU: typing.ClassVar[PLATFORM] # value = - OPENCL: typing.ClassVar[PLATFORM] # value = - __members__: typing.ClassVar[dict[str, PLATFORM]] # value = {'CPU': , 'OPENCL': } - def __eq__(self, other: typing.Any) -> bool: - ... - def __getstate__(self) -> int: - ... - def __hash__(self) -> int: - ... - def __index__(self) -> int: - ... - def __init__(self, value: typing.SupportsInt) -> None: - ... - def __int__(self) -> int: - ... - def __ne__(self, other: typing.Any) -> bool: - ... - def __repr__(self) -> str: - ... - def __setstate__(self, state: typing.SupportsInt) -> None: - ... - def __str__(self) -> str: - ... - @property - def name(self) -> str: - ... - @property - def value(self) -> int: - ... -class Scalar: - @typing.overload - def __add__(self, arg0: Scalar) -> Scalar: - ... - @typing.overload - def __add__(self, arg0: typing.SupportsFloat) -> Scalar: - ... - @typing.overload - def __iadd__(self, arg0: Scalar) -> Scalar: - ... - @typing.overload - def __iadd__(self, arg0: typing.SupportsFloat) -> Scalar: - ... - @typing.overload - def __imul__(self, arg0: Scalar) -> Scalar: - ... - @typing.overload - def __imul__(self, arg0: typing.SupportsFloat) -> Scalar: - ... - @typing.overload - def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(0)"]) -> None: - ... - @typing.overload - def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(0)"], arg1: typing.SupportsFloat) -> None: - ... - @typing.overload - def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(0)"], arg1: collections.abc.Sequence[typing.SupportsFloat]) -> None: - ... - @typing.overload - def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(0)"], arg1: typing.SupportsFloat, arg2: typing.SupportsFloat) -> None: - ... 
- @typing.overload - def __isub__(self, arg0: Scalar) -> Scalar: - ... - @typing.overload - def __isub__(self, arg0: typing.SupportsFloat) -> Scalar: - ... - def __itruediv__(self, arg0: typing.SupportsFloat) -> Scalar: - ... - @typing.overload - def __mul__(self, arg0: Scalar) -> Scalar: - ... - @typing.overload - def __mul__(self, arg0: typing.SupportsFloat) -> Scalar: - ... - def __neg__(self) -> Scalar: - ... - def __pos__(self) -> Scalar: - ... - def __radd__(self, arg0: typing.SupportsFloat) -> Scalar: - ... - def __repr__(self) -> str: - ... - def __rmul__(self, arg0: typing.SupportsFloat) -> Scalar: - ... - def __rsub__(self, arg0: typing.SupportsFloat) -> Scalar: - ... - @typing.overload - def __sub__(self, arg0: Scalar) -> Scalar: - ... - @typing.overload - def __sub__(self, arg0: typing.SupportsFloat) -> Scalar: - ... - def __truediv__(self, arg0: typing.SupportsFloat) -> Scalar: - ... - def get_axes(self) -> typing.Annotated[list[int], "FixedSize(0)"]: - ... - def get_shape(self) -> typing.Annotated[list[int], "FixedSize(0)"]: - ... - def get_size(self) -> int: - ... -class Tensor3: - @typing.overload - def __add__(self, arg0: Tensor3) -> Tensor3: - ... - @typing.overload - def __add__(self, arg0: typing.SupportsFloat) -> Tensor3: - ... - @typing.overload - def __iadd__(self, arg0: Tensor3) -> Tensor3: - ... - @typing.overload - def __iadd__(self, arg0: typing.SupportsFloat) -> Tensor3: - ... - @typing.overload - def __imul__(self, arg0: Tensor3) -> Tensor3: - ... - @typing.overload - def __imul__(self, arg0: typing.SupportsFloat) -> Tensor3: - ... - @typing.overload - def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(3)"]) -> None: - ... - @typing.overload - def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(3)"], arg1: typing.SupportsFloat) -> None: - ... 
- @typing.overload - def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(3)"], arg1: collections.abc.Sequence[typing.SupportsFloat]) -> None: - ... - @typing.overload - def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(3)"], arg1: typing.SupportsFloat, arg2: typing.SupportsFloat) -> None: - ... - @typing.overload - def __isub__(self, arg0: Tensor3) -> Tensor3: - ... - @typing.overload - def __isub__(self, arg0: typing.SupportsFloat) -> Tensor3: - ... - def __itruediv__(self, arg0: typing.SupportsFloat) -> Tensor3: - ... - @typing.overload - def __mul__(self, arg0: Tensor3) -> Tensor3: - ... - @typing.overload - def __mul__(self, arg0: typing.SupportsFloat) -> Tensor3: - ... - def __neg__(self) -> Tensor3: - ... - def __pos__(self) -> Tensor3: - ... - def __radd__(self, arg0: typing.SupportsFloat) -> Tensor3: - ... - def __repr__(self) -> str: - ... - def __rmul__(self, arg0: typing.SupportsFloat) -> Tensor3: - ... - def __rsub__(self, arg0: typing.SupportsFloat) -> Tensor3: - ... - @typing.overload - def __sub__(self, arg0: Tensor3) -> Tensor3: - ... - @typing.overload - def __sub__(self, arg0: typing.SupportsFloat) -> Tensor3: - ... - def __truediv__(self, arg0: typing.SupportsFloat) -> Tensor3: - ... - def get_axes(self) -> typing.Annotated[list[int], "FixedSize(3)"]: - ... - def get_shape(self) -> typing.Annotated[list[int], "FixedSize(3)"]: - ... - def get_size(self) -> int: - ... - def t(self) -> Tensor3: - ... - @typing.overload - def transpose(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(3)"]) -> Tensor3: - ... - @typing.overload - def transpose(self, arg0: typing.SupportsInt, arg1: typing.SupportsInt) -> Tensor3: - ... -class Vector: - @typing.overload - def __add__(self, arg0: Vector) -> Vector: - ... - @typing.overload - def __add__(self, arg0: typing.SupportsFloat) -> Vector: - ... 
- @typing.overload - def __iadd__(self, arg0: Vector) -> Vector: - ... - @typing.overload - def __iadd__(self, arg0: typing.SupportsFloat) -> Vector: - ... - @typing.overload - def __imul__(self, arg0: Vector) -> Vector: - ... - @typing.overload - def __imul__(self, arg0: typing.SupportsFloat) -> Vector: - ... - @typing.overload - def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(1)"]) -> None: - ... - @typing.overload - def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(1)"], arg1: typing.SupportsFloat) -> None: - ... - @typing.overload - def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(1)"], arg1: collections.abc.Sequence[typing.SupportsFloat]) -> None: - ... - @typing.overload - def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(1)"], arg1: typing.SupportsFloat, arg2: typing.SupportsFloat) -> None: - ... - @typing.overload - def __isub__(self, arg0: Vector) -> Vector: - ... - @typing.overload - def __isub__(self, arg0: typing.SupportsFloat) -> Vector: - ... - def __itruediv__(self, arg0: typing.SupportsFloat) -> Vector: - ... - @typing.overload - def __mul__(self, arg0: Vector) -> Vector: - ... - @typing.overload - def __mul__(self, arg0: typing.SupportsFloat) -> Vector: - ... - def __neg__(self) -> Vector: - ... - def __pos__(self) -> Vector: - ... - def __radd__(self, arg0: typing.SupportsFloat) -> Vector: - ... - def __repr__(self) -> str: - ... - def __rmul__(self, arg0: typing.SupportsFloat) -> Vector: - ... - def __rsub__(self, arg0: typing.SupportsFloat) -> Vector: - ... - @typing.overload - def __sub__(self, arg0: Vector) -> Vector: - ... - @typing.overload - def __sub__(self, arg0: typing.SupportsFloat) -> Vector: - ... - def __truediv__(self, arg0: typing.SupportsFloat) -> Vector: - ... - def get_axes(self) -> typing.Annotated[list[int], "FixedSize(1)"]: - ... 
- def get_shape(self) -> typing.Annotated[list[int], "FixedSize(1)"]: - ... - def get_size(self) -> int: - ... -def init(arg0: str) -> None: - ... -CPU: PLATFORM # value = -MODE: PLATFORM # value = -OPENCL: PLATFORM # value =