From 8d5a57a8c00e1d701d0b65bacc87f80485e63f2e Mon Sep 17 00:00:00 2001
From: StepanovPlaton
Date: Sun, 23 Nov 2025 01:15:51 +0400
Subject: [PATCH] Fix vectorized matmul kernel tile loading and accumulation

---
 src/run.py                          | 14 ++++----
 src/tensor/main.cpp                 | 20 ++++++++--
 src/tensor/opencl/kernels/tensor.cl | 68 ++++++++++++++++++------------
 src/tensor/opencl/tensor.hpp        |  8 ++--
 4 files changed, 79 insertions(+), 31 deletions(-)

diff --git a/src/run.py b/src/run.py
index abf48d1..638da5d 100644
--- a/src/run.py
+++ b/src/run.py
@@ -5,26 +5,26 @@ import time
 if (MODE == PLATFORM.OPENCL):
     init("./tensor/")
 
-a = Matrix([1024, 1024], 1)
-b = Matrix([1024, 1024], 1)
+a = Matrix([4096*4, 4096*4], 1)
+b = Matrix([4096*4, 4096*4], 1)
 
 
 def benchmark_tensor():
-    c = ((a @ b) @ (a @ b)) @ ((a @ b) @ (a @ b))
+    c = a + b
     return c
 
 
-a_np = np.ones([1024, 1024], dtype=np.float32)
-b_np = np.ones([1024, 1024], dtype=np.float32)
+a_np = np.ones([4096*4, 4096*4], dtype=np.float32)
+b_np = np.ones([4096*4, 4096*4], dtype=np.float32)
 
 
 def benchmark_numpy():
-    c = ((a_np @ b_np) @ (a_np @ b_np)) @ ((a_np @ b_np) @ (a_np @ b_np))
+    c = a_np + b_np
     return c
 
 
 # Многократное выполнение для более точного измерения
-iterations = 5
+iterations = 2
 
 print("Бенчмарк Tensor:")
 tensor_times = []
diff --git a/src/tensor/main.cpp b/src/tensor/main.cpp
index 58fd9d9..c7501a0 100644
--- a/src/tensor/main.cpp
+++ b/src/tensor/main.cpp
@@ -7,16 +7,34 @@ OpenCL openCL;
 #include "cpu/tensor.hpp"
 #endif
 
+#include <chrono>
+#include <functional>
 #include <iostream>
+#include <string>
 
 // TODO: TMult >2
 
+// Minimal wall-clock profiler: runs `op` once and prints its duration.
+class Profiler {
+public:
+  static void measure(const std::string &operation, std::function<void()> op) {
+    auto start = std::chrono::high_resolution_clock::now();
+    op();
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration =
+        std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+    std::cout << operation << ": " << duration.count() << " μs\n";
+  }
+};
+
 int main() {
 #ifdef USE_OPENCL
   openCL.init("./");
 #endif
-  Tensor<2> a = Tensor<2>({32, 32}, 2);
+  Tensor<2> a = Tensor<2>({4096 * 2, 4096 * 2}, 1);
+  Tensor<2> b = Tensor<2>({4096 * 2, 4096 * 2}, 1);
+
+  Profiler::measure("Matrix multiplication", [&]() { auto result = a % b; });
 
   std::cout << a.toString();
   return 0;
 }
diff --git a/src/tensor/opencl/kernels/tensor.cl b/src/tensor/opencl/kernels/tensor.cl
index 59dc4ea..6cd5a7b 100644
--- a/src/tensor/opencl/kernels/tensor.cl
+++ b/src/tensor/opencl/kernels/tensor.cl
@@ -8,46 +8,74 @@ __kernel void hadamard_mult(__global float *A, __global float *B) {
 }
 
 #define TILE_SIZE 16
+#define VEC_SIZE 4
 __kernel void mult(__global float *A, __global float *B, __global float *C,
                    const int M, const int N, const int K) {
-  const int row = get_global_id(0);
+  const int row = get_global_id(0) * VEC_SIZE;
   const int col = get_global_id(1);
   const int local_row = get_local_id(0);
   const int local_col = get_local_id(1);
 
-  __local float tile_A[TILE_SIZE][TILE_SIZE];
-  __local float tile_B[TILE_SIZE][TILE_SIZE];
+  // +1 column of padding avoids local-memory bank conflicts on column reads
+  __local float tile_A[TILE_SIZE][TILE_SIZE + 1];
+  __local float tile_B[TILE_SIZE][TILE_SIZE + 1];
 
-  float sum = 0.0f;
+  // One scalar accumulator per output row handled by this work-item.
+  float sum[VEC_SIZE];
+  for (int i = 0; i < VEC_SIZE; i++) {
+    sum[i] = 0.0f;
+  }
 
-  for (int t = 0; t < (K - 1) / TILE_SIZE + 1; t++) {
-
+  const int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;
+
+  for (int t = 0; t < numTiles; t++) {
+    // Each work-item loads VEC_SIZE consecutive rows of tile_A.
     int a_col = t * TILE_SIZE + local_col;
-    if (row < M && a_col < K) {
-      tile_A[local_row][local_col] = A[row * K + a_col];
-    } else {
-      tile_A[local_row][local_col] = 0.0f;
+    #pragma unroll
+    for (int v = 0; v < VEC_SIZE; v++) {
+      int current_row = row + v;
+      if (current_row < M && a_col < K) {
+        tile_A[local_row * VEC_SIZE + v][local_col] = A[current_row * K + a_col];
+      } else {
+        tile_A[local_row * VEC_SIZE + v][local_col] = 0.0f;
+      }
     }
 
-    int b_row = t * TILE_SIZE + local_row;
-    if (b_row < K && col < N) {
-      tile_B[local_row][local_col] = B[b_row * N + col];
-    } else {
-      tile_B[local_row][local_col] = 0.0f;
+    // Each work-item must also load VEC_SIZE rows of tile_B: the work-group
+    // is only TILE_SIZE / VEC_SIZE rows tall, so one row per item would
+    // leave rows [TILE_SIZE/VEC_SIZE, TILE_SIZE) of tile_B uninitialized.
+    #pragma unroll
+    for (int v = 0; v < VEC_SIZE; v++) {
+      int b_row = t * TILE_SIZE + local_row * VEC_SIZE + v;
+      if (b_row < K && col < N) {
+        tile_B[local_row * VEC_SIZE + v][local_col] = B[b_row * N + col];
+      } else {
+        tile_B[local_row * VEC_SIZE + v][local_col] = 0.0f;
+      }
     }
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    int k_max = min(TILE_SIZE, K - t * TILE_SIZE);
-    for (int k = 0; k < k_max; k++) {
-      sum += tile_A[local_row][k] * tile_B[k][local_col];
+    // Tiles are zero-padded, so the k-loop can always run the full TILE_SIZE.
+    #pragma unroll
+    for (int k = 0; k < TILE_SIZE; k++) {
+      float b_val = tile_B[k][local_col];
+      #pragma unroll
+      for (int v = 0; v < VEC_SIZE; v++) {
+        sum[v] += tile_A[local_row * VEC_SIZE + v][k] * b_val;
+      }
     }
 
     barrier(CLK_LOCAL_MEM_FENCE);
   }
 
-  if (row < M && col < N) {
-    C[row * N + col] = sum;
+  // Write back the VEC_SIZE output rows computed by this work-item.
+  #pragma unroll
+  for (int v = 0; v < VEC_SIZE; v++) {
+    int current_row = row + v;
+    if (current_row < M && col < N) {
+      C[current_row * N + col] = sum[v];
+    }
   }
 }
 
diff --git a/src/tensor/opencl/tensor.hpp b/src/tensor/opencl/tensor.hpp
index 0c2a3fe..6fb3301 100644
--- a/src/tensor/opencl/tensor.hpp
+++ b/src/tensor/opencl/tensor.hpp
@@ -175,6 +175,7 @@ public:
   }
 
 #define TILE_SIZE 16
+#define VEC_SIZE 4
   Tensor operator%(const Tensor &other) const {
     static_assert(Dim == 1 || Dim == 2,
                   "Inner product is only defined for vectors and matrices");
@@ -195,9 +196,10 @@ public:
     kernel.setArg(3, (int)m);
     kernel.setArg(4, (int)n);
     kernel.setArg(5, (int)k);
-    cl::NDRange global_size(((m + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE,
-                            ((n + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE);
-    cl::NDRange local_size(TILE_SIZE, TILE_SIZE);
+    cl::NDRange global_size(
+        ((m + TILE_SIZE * VEC_SIZE - 1) / (TILE_SIZE * VEC_SIZE)) * TILE_SIZE,
+        ((n + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE);
+    cl::NDRange local_size(TILE_SIZE / VEC_SIZE, TILE_SIZE);
     openCL.getQueue().enqueueNDRangeKernel(
         kernel, cl::NullRange, global_size, local_size,
         all(event_, other.event_), &result.event_);