Fixes

2026-04-03 20:30:39 +04:00 · 2025-11-23 01:15:51 +04:00
parent 0455d9bd5b
commit 8d5a57a8c0
4 changed files with 74 additions and 28 deletions
--- a/src/run.py
+++ b/src/run.py
@@ -5,26 +5,26 @@ import time
 if (MODE == PLATFORM.OPENCL):
    init("./tensor/")

-a = Matrix([1024, 1024], 1)
-b = Matrix([1024, 1024], 1)
+a = Matrix([4096*4, 4096*4], 1)
+b = Matrix([4096*4, 4096*4], 1)


 def benchmark_tensor():
-    c = ((a @ b) @ (a @ b)) @ ((a @ b) @ (a @ b))
+    c = a + b  # the benchmark now times a single Matrix addition instead of the chained matmuls
    return c


-a_np = np.ones([1024, 1024], dtype=np.float32)
-b_np = np.ones([1024, 1024], dtype=np.float32)
+a_np = np.ones([4096*4, 4096*4], dtype=np.float32)
+b_np = np.ones([4096*4, 4096*4], dtype=np.float32)


 def benchmark_numpy():
-    c = ((a_np @ b_np) @ (a_np @ b_np)) @ ((a_np @ b_np) @ (a_np @ b_np))
+    c = a_np + b_np  # NumPy counterpart of the Tensor benchmark: one addition of the two float32 arrays
    return c


 # Многократное выполнение для более точного измерения
-iterations = 5
+iterations = 2

 print("Бенчмарк Tensor:")
 tensor_times = []
--- a/src/tensor/main.cpp
+++ b/src/tensor/main.cpp
@@ -7,16 +7,33 @@ OpenCL openCL;
 #include "cpu/tensor.hpp"
 #endif

+#include <chrono>
+#include <functional>
 #include <iostream>

 // TODO: TMult >2

+class Profiler { // Timing helper: runs a callable once and prints how long it took to stdout.
+public:
+  static void measure(const std::string &operation, std::function<void()> op) { // operation: label printed before the time; op: callable to time
+    auto start = std::chrono::high_resolution_clock::now(); // NOTE(review): high_resolution_clock is not guaranteed to be monotonic; steady_clock is the usual choice for measuring intervals
+    op();
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration =
+        std::chrono::duration_cast<std::chrono::microseconds>(end - start); // truncated to whole microseconds
+    std::cout << operation << ": " << duration.count() << " μs\n";
+  }
+};
+
 int main() {
 #ifdef USE_OPENCL
  openCL.init("./");
 #endif

-  Tensor<float, 2> a = Tensor<float, 2>({32, 32}, 2);
+  Tensor<float, 2> a = Tensor<float, 2>({4096 * 2, 4096 * 2}, 1); // 8192 x 8192 operand; presumably the second argument is the fill value — confirm
+  Tensor<float, 2> b = Tensor<float, 2>({4096 * 2, 4096 * 2}, 1); // NOTE(review): the unchanged a.toString() print below now receives an 8192 x 8192 tensor — confirm that output volume is intended
+
+  Profiler::measure("Matrix multiplication", [&]() { auto result = a % b; }); // result is discarded when the lambda returns; NOTE(review): if operator% only enqueues the OpenCL kernel, this times the enqueue rather than the execution — confirm
  std::cout << a.toString();
  return 0;
 }
--- a/src/tensor/opencl/kernels/tensor.cl
+++ b/src/tensor/opencl/kernels/tensor.cl
@@ -8,28 +8,39 @@ __kernel void hadamard_mult(__global float *A, __global float *B) {
 }

 #define TILE_SIZE 16
+#define VEC_SIZE 4
 __kernel void mult(__global float *A, __global float *B, __global float *C,
                   const int M, const int N, const int K) {
    
-    const int row = get_global_id(0);
+    const int row = get_global_id(0) * VEC_SIZE; // first of the VEC_SIZE consecutive rows this work-item handles
    const int col = get_global_id(1);
    const int local_row = get_local_id(0);
    const int local_col = get_local_id(1);
    
-    __local float tile_A[TILE_SIZE][TILE_SIZE];
-    __local float tile_B[TILE_SIZE][TILE_SIZE];
+    __local float tile_A[TILE_SIZE][TILE_SIZE + 1]; // +1 column of padding to avoid bank conflicts
+    __local float tile_B[TILE_SIZE][TILE_SIZE + 1];
    
-    float sum = 0.0f;
+    float4 sum[VEC_SIZE]; // one accumulator per row handled by this work-item; NOTE(review): only scalars are ever added to it, so a plain float per row looks sufficient — confirm
+    for (int i = 0; i < VEC_SIZE; i++) {
+        sum[i] = (float4)(0.0f);
+    }
    
-    for (int t = 0; t < (K - 1) / TILE_SIZE + 1; t++) {
+    const int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE; // ceil(K / TILE_SIZE)
    
+    for (int t = 0; t < numTiles; t++) {
+        // Load tile_A (vectorized: VEC_SIZE rows per work-item)
        int a_col = t * TILE_SIZE + local_col;
-        if (row < M && a_col < K) {
-            tile_A[local_row][local_col] = A[row * K + a_col];
-        } else {
-            tile_A[local_row][local_col] = 0.0f;
+        #pragma unroll
+        for (int v = 0; v < VEC_SIZE; v++) {
+            int current_row = row + v;
+            if (current_row < M && a_col < K) {
+                tile_A[local_row * VEC_SIZE + v][local_col] = A[current_row * K + a_col];
+            } else {
+                tile_A[local_row * VEC_SIZE + v][local_col] = 0.0f; // zero-fill entries that fall outside A
+            }
        }
        
+        // Load tile_B. NOTE(review): the unchanged load below writes only row local_row of tile_B; the host launch in tensor.hpp uses TILE_SIZE / VEC_SIZE work-items in dimension 0, so unless the lines elided between hunks fill the remaining rows, the k-loop reads tile_B rows that were never written — confirm
        int b_row = t * TILE_SIZE + local_row;
        if (b_row < K && col < N) {
            tile_B[local_row][local_col] = B[b_row * N + col];
@@ -39,16 +50,32 @@ __kernel void mult(__global float *A, __global float *B, __global float *C,
        
        barrier(CLK_LOCAL_MEM_FENCE);
        
-        int k_max = min(TILE_SIZE, K - t * TILE_SIZE);
-        for (int k = 0; k < k_max; k++) {
-            sum += tile_A[local_row][k] * tile_B[k][local_col];
+        // Vectorized compute
+        #pragma unroll
+        for (int k = 0; k < TILE_SIZE; k++) { // always runs the full tile; relies on out-of-range tile entries having been zero-filled during the loads
+            float4 a_vals = (float4)(
+                tile_A[local_row * VEC_SIZE + 0][k],
+                tile_A[local_row * VEC_SIZE + 1][k],
+                tile_A[local_row * VEC_SIZE + 2][k],
+                tile_A[local_row * VEC_SIZE + 3][k]
+            );
+            float b_val = tile_B[k][local_col];
+            
+            sum[0] += a_vals.x * b_val; // NOTE(review): a scalar product added to a float4 is broadcast to all four lanes, so the lanes of each sum[] entry stay identical
+            sum[1] += a_vals.y * b_val;
+            sum[2] += a_vals.z * b_val;
+            sum[3] += a_vals.w * b_val;
        }
        
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    
-    if (row < M && col < N) {
-        C[row * N + col] = sum;
+    // Store results (vectorized: VEC_SIZE rows per work-item)
+    #pragma unroll
+    for (int v = 0; v < VEC_SIZE; v++) {
+        int current_row = row + v;
+        if (current_row < M && col < N) {
+            C[current_row * N + col] = sum[v].x + sum[v].y + sum[v].z + sum[v].w; // NOTE(review): the four lanes hold the same value, so this horizontal add appears to yield 4x the dot product — confirm against a reference result
+        }
    }
 }
-
--- a/src/tensor/opencl/tensor.hpp
+++ b/src/tensor/opencl/tensor.hpp
@@ -175,6 +175,7 @@ public:
  }

 #define TILE_SIZE 16
+#define VEC_SIZE 4
  Tensor<T, Dim == 1 ? 0 : 2> operator%(const Tensor &other) const {
    static_assert(Dim == 1 || Dim == 2,
                  "Inner product is only defined for vectors and matrices");
@@ -195,9 +196,10 @@ public:
      kernel.setArg(3, (int)m);
      kernel.setArg(4, (int)n);
      kernel.setArg(5, (int)k);
-      cl::NDRange global_size(((m + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE,
-                              ((n + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE);
-      cl::NDRange local_size(TILE_SIZE, TILE_SIZE);
+      cl::NDRange global_size(
+          ((m + TILE_SIZE * VEC_SIZE - 1) / (TILE_SIZE * VEC_SIZE)) * TILE_SIZE,
+          ((n + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE);
+      cl::NDRange local_size(TILE_SIZE / VEC_SIZE, TILE_SIZE);
      openCL.getQueue().enqueueNDRangeKernel(
          kernel, cl::NullRange, global_size, local_size,
          all(event_, other.event_), &result.event_);