From 8d5a57a8c00e1d701d0b65bacc87f80485e63f2e Mon Sep 17 00:00:00 2001
From: StepanovPlaton
Date: Sun, 23 Nov 2025 01:15:51 +0400
Subject: [PATCH] Fix vectorized matmul kernel tile loading and accumulation

---
 src/run.py                          | 14 ++++----
 src/tensor/main.cpp                 | 20 ++++++++--
 src/tensor/opencl/kernels/tensor.cl | 68 ++++++++++++++++++------------
 src/tensor/opencl/tensor.hpp        |  8 ++--
 4 files changed, 79 insertions(+), 31 deletions(-)

diff --git a/src/run.py b/src/run.py
index abf48d1..638da5d 100644
--- a/src/run.py
+++ b/src/run.py
@@ -5,26 +5,26 @@ import time
 if (MODE == PLATFORM.OPENCL):
     init("./tensor/")
 
-a = Matrix([1024, 1024], 1)
-b = Matrix([1024, 1024], 1)
+a = Matrix([4096*4, 4096*4], 1)
+b = Matrix([4096*4, 4096*4], 1)
 
 
 def benchmark_tensor():
-    c = ((a @ b) @ (a @ b)) @ ((a @ b) @ (a @ b))
+    c = a + b
     return c
 
 
-a_np = np.ones([1024, 1024], dtype=np.float32)
-b_np = np.ones([1024, 1024], dtype=np.float32)
+a_np = np.ones([4096*4, 4096*4], dtype=np.float32)
+b_np = np.ones([4096*4, 4096*4], dtype=np.float32)
 
 
 def benchmark_numpy():
-    c = ((a_np @ b_np) @ (a_np @ b_np)) @ ((a_np @ b_np) @ (a_np @ b_np))
+    c = a_np + b_np
     return c
 
 
 # Многократное выполнение для более точного измерения
-iterations = 5
+iterations = 2
 
 print("Бенчмарк Tensor:")
 tensor_times = []
diff --git a/src/tensor/main.cpp b/src/tensor/main.cpp
index 58fd9d9..c7501a0 100644
--- a/src/tensor/main.cpp
+++ b/src/tensor/main.cpp
@@ -7,16 +7,34 @@ OpenCL openCL;
 #include "cpu/tensor.hpp"
 #endif
 
+#include <chrono>
+#include <functional>
 #include <iostream>
+#include <string>
 
 // TODO: TMult >2
 
+// Minimal wall-clock profiler: runs `op` once and prints its duration.
+class Profiler {
+public:
+  static void measure(const std::string &operation, std::function<void()> op) {
+    auto start = std::chrono::high_resolution_clock::now();
+    op();
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration =
+        std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+    std::cout << operation << ": " << duration.count() << " μs\n";
+  }
+};
+
 int main() {
 #ifdef USE_OPENCL
   openCL.init("./");
 #endif
-  Tensor<2> a = Tensor<2>({32, 32}, 2);
+  Tensor<2> a = Tensor<2>({4096 * 2, 4096 * 2}, 1);
+  Tensor<2> b = Tensor<2>({4096 * 2, 4096 * 2}, 1);
+
+  Profiler::measure("Matrix multiplication", [&]() { auto result = a % b; });
 
   std::cout << a.toString();
   return 0;
 }
diff --git a/src/tensor/opencl/kernels/tensor.cl b/src/tensor/opencl/kernels/tensor.cl
index 59dc4ea..6cd5a7b 100644
--- a/src/tensor/opencl/kernels/tensor.cl
+++ b/src/tensor/opencl/kernels/tensor.cl
@@ -8,46 +8,74 @@ __kernel void hadamard_mult(__global float *A, __global float *B) {
 }
 
 #define TILE_SIZE 16
+#define VEC_SIZE 4
 __kernel void mult(__global float *A, __global float *B, __global float *C,
                    const int M, const int N, const int K) {
-  const int row = get_global_id(0);
+  const int row = get_global_id(0) * VEC_SIZE;
   const int col = get_global_id(1);
   const int local_row = get_local_id(0);
   const int local_col = get_local_id(1);
 
-  __local float tile_A[TILE_SIZE][TILE_SIZE];
-  __local float tile_B[TILE_SIZE][TILE_SIZE];
+  // +1 column of padding avoids local-memory bank conflicts on column reads
+  __local float tile_A[TILE_SIZE][TILE_SIZE + 1];
+  __local float tile_B[TILE_SIZE][TILE_SIZE + 1];
 
-  float sum = 0.0f;
+  // One scalar accumulator per output row handled by this work-item.
+  float sum[VEC_SIZE];
+  for (int i = 0; i < VEC_SIZE; i++) {
+    sum[i] = 0.0f;
+  }
 
-  for (int t = 0; t < (K - 1) / TILE_SIZE + 1; t++) {
-
+  const int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;
+
+  for (int t = 0; t < numTiles; t++) {
+    // Each work-item loads VEC_SIZE consecutive rows of tile_A.
     int a_col = t * TILE_SIZE + local_col;
-    if (row < M && a_col < K) {
-      tile_A[local_row][local_col] = A[row * K + a_col];
-    } else {
-      tile_A[local_row][local_col] = 0.0f;
+    #pragma unroll
+    for (int v = 0; v < VEC_SIZE; v++) {
+      int current_row = row + v;
+      if (current_row < M && a_col < K) {
+        tile_A[local_row * VEC_SIZE + v][local_col] = A[current_row * K + a_col];
+      } else {
+        tile_A[local_row * VEC_SIZE + v][local_col] = 0.0f;
+      }
     }
 
-    int b_row = t * TILE_SIZE + local_row;
-    if (b_row < K && col < N) {
-      tile_B[local_row][local_col] = B[b_row * N + col];
-    } else {
-      tile_B[local_row][local_col] = 0.0f;
+    // Each work-item must also load VEC_SIZE rows of tile_B: the work-group
+    // is only TILE_SIZE / VEC_SIZE rows tall, so one row per item would
+    // leave rows [TILE_SIZE/VEC_SIZE, TILE_SIZE) of tile_B uninitialized.
+    #pragma unroll
+    for (int v = 0; v < VEC_SIZE; v++) {
+      int b_row = t * TILE_SIZE + local_row * VEC_SIZE + v;
+      if (b_row < K && col < N) {
+        tile_B[local_row * VEC_SIZE + v][local_col] = B[b_row * N + col];
+      } else {
+        tile_B[local_row * VEC_SIZE + v][local_col] = 0.0f;
+      }
     }
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    int k_max = min(TILE_SIZE, K - t * TILE_SIZE);
-    for (int k = 0; k < k_max; k++) {
-      sum += tile_A[local_row][k] * tile_B[k][local_col];
+    // Tiles are zero-padded, so the k-loop can always run the full TILE_SIZE.
+    #pragma unroll
+    for (int k = 0; k < TILE_SIZE; k++) {
+      float b_val = tile_B[k][local_col];
+      #pragma unroll
+      for (int v = 0; v < VEC_SIZE; v++) {
+        sum[v] += tile_A[local_row * VEC_SIZE + v][k] * b_val;
+      }
     }
 
     barrier(CLK_LOCAL_MEM_FENCE);
   }
 
-  if (row < M && col < N) {
-    C[row * N + col] = sum;
+  // Write back the VEC_SIZE output rows computed by this work-item.
+  #pragma unroll
+  for (int v = 0; v < VEC_SIZE; v++) {
+    int current_row = row + v;
+    if (current_row < M && col < N) {
+      C[current_row * N + col] = sum[v];
+    }
   }
 }
 
diff --git a/src/tensor/opencl/tensor.hpp b/src/tensor/opencl/tensor.hpp
index 0c2a3fe..6fb3301 100644
--- a/src/tensor/opencl/tensor.hpp
+++ b/src/tensor/opencl/tensor.hpp
@@ -175,6 +175,7 @@ public:
   }
 
 #define TILE_SIZE 16
+#define VEC_SIZE 4
   Tensor operator%(const Tensor &other) const {
     static_assert(Dim == 1 || Dim == 2,
                   "Inner product is only defined for vectors and matrices");
@@ -195,9 +196,10 @@ public:
     kernel.setArg(3, (int)m);
     kernel.setArg(4, (int)n);
     kernel.setArg(5, (int)k);
-    cl::NDRange global_size(((m + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE,
-                            ((n + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE);
-    cl::NDRange local_size(TILE_SIZE, TILE_SIZE);
+    cl::NDRange global_size(
+        ((m + TILE_SIZE * VEC_SIZE - 1) / (TILE_SIZE * VEC_SIZE)) * TILE_SIZE,
+        ((n + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE);
+    cl::NDRange local_size(TILE_SIZE / VEC_SIZE, TILE_SIZE);
     openCL.getQueue().enqueueNDRangeKernel(
         kernel, cl::NullRange, global_size, local_size,
         all(event_, other.event_), &result.event_);