This commit is contained in:
2025-11-23 01:15:51 +04:00
parent 0455d9bd5b
commit 8d5a57a8c0
4 changed files with 74 additions and 28 deletions

View File

@@ -5,26 +5,26 @@ import time
if (MODE == PLATFORM.OPENCL): if (MODE == PLATFORM.OPENCL):
init("./tensor/") init("./tensor/")
a = Matrix([1024, 1024], 1) a = Matrix([4096*4, 4096*4], 1)
b = Matrix([1024, 1024], 1) b = Matrix([4096*4, 4096*4], 1)
def benchmark_tensor(): def benchmark_tensor():
c = ((a @ b) @ (a @ b)) @ ((a @ b) @ (a @ b)) c = a + b
return c return c
a_np = np.ones([1024, 1024], dtype=np.float32) a_np = np.ones([4096*4, 4096*4], dtype=np.float32)
b_np = np.ones([1024, 1024], dtype=np.float32) b_np = np.ones([4096*4, 4096*4], dtype=np.float32)
def benchmark_numpy(): def benchmark_numpy():
c = ((a_np @ b_np) @ (a_np @ b_np)) @ ((a_np @ b_np) @ (a_np @ b_np)) c = a_np + b_np
return c return c
# Многократное выполнение для более точного измерения # Многократное выполнение для более точного измерения
iterations = 5 iterations = 2
print("Бенчмарк Tensor:") print("Бенчмарк Tensor:")
tensor_times = [] tensor_times = []

View File

@@ -7,16 +7,33 @@ OpenCL openCL;
#include "cpu/tensor.hpp" #include "cpu/tensor.hpp"
#endif #endif
#include <chrono>
#include <functional>
#include <iostream> #include <iostream>
// TODO: TMult >2 // TODO: TMult >2
/// Minimal wall-clock profiler: times a single invocation of a callable and
/// reports the result on standard output.
class Profiler {
public:
  /// Runs `op` exactly once and prints "<operation>: <elapsed> μs\n" to
  /// std::cout.
  ///
  /// @param operation Label printed in front of the measured duration.
  /// @param op        Callable to time. It is invoked synchronously on the
  ///                  calling thread; only the time until `op()` returns is
  ///                  measured.
  /// @throws std::bad_function_call if `op` is empty. Anything thrown by
  ///         `op` itself propagates, and nothing is printed in that case.
  static void measure(const std::string &operation, std::function<void()> op) {
    // steady_clock is monotonic, so the interval cannot be distorted (or go
    // negative) when the system time is adjusted. high_resolution_clock is
    // allowed to be an alias of the adjustable system_clock.
    const auto start = std::chrono::steady_clock::now();
    op();
    const auto end = std::chrono::steady_clock::now();

    const auto duration =
        std::chrono::duration_cast<std::chrono::microseconds>(end - start);
    std::cout << operation << ": " << duration.count() << " μs\n";
  }
};
// Benchmark entry point. Rows below come from a side-by-side diff: where a
// row exists in both revisions, the pre-commit text is followed by the
// post-commit text on the same line.
int main() { int main() {
// The OpenCL runtime is initialised only when the build defines USE_OPENCL.
#ifdef USE_OPENCL #ifdef USE_OPENCL
openCL.init("./"); openCL.init("./");
#endif #endif
// Pre-commit: one tensor with shape {32, 32}. Post-commit: two tensors with
// shape {4096 * 2, 4096 * 2}. The second constructor argument is presumably
// the fill value -- TODO confirm against the Tensor constructor.
Tensor<float, 2> a = Tensor<float, 2>({32, 32}, 2); Tensor<float, 2> a = Tensor<float, 2>({4096 * 2, 4096 * 2}, 1);
Tensor<float, 2> b = Tensor<float, 2>({4096 * 2, 4096 * 2}, 1);
// Times `a % b`; `result` is local to the lambda and is dropped when it returns.
// NOTE(review): the operator% hunk shows the kernel being enqueued with an
// output event. Whether anything waits for that event before the lambda
// returns is not visible here -- confirm the measurement covers kernel
// execution and not just the enqueue.
Profiler::measure("Matrix multiplication", [&]() { auto result = a % b; });
// NOTE(review): this prints the input `a`, not the product. With the
// post-commit shape the dump is presumably very large -- confirm it is
// still wanted.
std::cout << a.toString(); std::cout << a.toString();
return 0; return 0;
} }

View File

@@ -8,28 +8,39 @@ __kernel void hadamard_mult(__global float *A, __global float *B) {
} }
#define TILE_SIZE 16 #define TILE_SIZE 16
#define VEC_SIZE 4
// Tiled matrix product C = A * B on row-major buffers.
//
//   A : M x K   (element (r, c) at A[r * K + c])
//   B : K x N   (element (r, c) at B[r * N + c])
//   C : M x N   (element (r, c) at C[r * N + c])
//
// Launch contract (must match the host enqueue):
//   * work-group size is (TILE_SIZE / VEC_SIZE, TILE_SIZE);
//   * each work-item owns VEC_SIZE consecutive rows of one column of C, so a
//     work-group covers one TILE_SIZE x TILE_SIZE tile of C;
//   * the global size may overshoot M and N; out-of-range rows/columns are
//     masked on load (zero fill) and on store.
__kernel void mult(__global float *A, __global float *B, __global float *C,
                   const int M, const int N, const int K) {
  // First of the VEC_SIZE output rows handled by this work-item.
  const int row = get_global_id(0) * VEC_SIZE;
  const int col = get_global_id(1);
  const int local_row = get_local_id(0);
  const int local_col = get_local_id(1);

  // The extra column of padding is intended to avoid local-memory bank
  // conflicts.
  __local float tile_A[TILE_SIZE][TILE_SIZE + 1];
  __local float tile_B[TILE_SIZE][TILE_SIZE + 1];

  // One scalar accumulator per owned output row. (Accumulating scalars into
  // float4 lanes and then summing the lanes would count every product four
  // times, because a scalar operand is widened to all lanes.)
  float sum[VEC_SIZE];
  for (int v = 0; v < VEC_SIZE; v++) {
    sum[v] = 0.0f;
  }

  const int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;
  for (int t = 0; t < numTiles; t++) {
    // Cooperative load. The work-group has only TILE_SIZE / VEC_SIZE items
    // along dimension 0, so every work-item fills VEC_SIZE rows of BOTH
    // tiles; otherwise most rows of tile_B would be read uninitialised.
    const int a_col = t * TILE_SIZE + local_col;
    #pragma unroll
    for (int v = 0; v < VEC_SIZE; v++) {
      const int tile_row = local_row * VEC_SIZE + v;

      const int a_row = row + v;
      if (a_row < M && a_col < K) {
        tile_A[tile_row][local_col] = A[a_row * K + a_col];
      } else {
        tile_A[tile_row][local_col] = 0.0f;
      }

      const int b_row = t * TILE_SIZE + tile_row;
      if (b_row < K && col < N) {
        tile_B[tile_row][local_col] = B[b_row * N + col];
      } else {
        tile_B[tile_row][local_col] = 0.0f;
      }
    }

    // Both tiles must be fully written before any work-item reads them.
    barrier(CLK_LOCAL_MEM_FENCE);

    // The zero fill above makes it safe to always run the full TILE_SIZE
    // steps, including on the last, partially filled tile.
    #pragma unroll
    for (int k = 0; k < TILE_SIZE; k++) {
      const float b_val = tile_B[k][local_col];
      #pragma unroll
      for (int v = 0; v < VEC_SIZE; v++) {
        sum[v] += tile_A[local_row * VEC_SIZE + v][k] * b_val;
      }
    }

    // Do not overwrite the tiles while other work-items are still reading.
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // Store the owned rows, skipping anything outside the M x N result.
  #pragma unroll
  for (int v = 0; v < VEC_SIZE; v++) {
    const int current_row = row + v;
    if (current_row < M && col < N) {
      C[current_row * N + col] = sum[v];
    }
  }
}

View File

@@ -175,6 +175,7 @@ public:
} }
#define TILE_SIZE 16 #define TILE_SIZE 16
#define VEC_SIZE 4
Tensor<T, Dim == 1 ? 0 : 2> operator%(const Tensor &other) const { Tensor<T, Dim == 1 ? 0 : 2> operator%(const Tensor &other) const {
static_assert(Dim == 1 || Dim == 2, static_assert(Dim == 1 || Dim == 2,
"Inner product is only defined for vectors and matrices"); "Inner product is only defined for vectors and matrices");
@@ -195,9 +196,10 @@ public:
kernel.setArg(3, (int)m); kernel.setArg(3, (int)m);
kernel.setArg(4, (int)n); kernel.setArg(4, (int)n);
kernel.setArg(5, (int)k); kernel.setArg(5, (int)k);
cl::NDRange global_size(((m + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE, cl::NDRange global_size(
((n + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE); ((m + TILE_SIZE * VEC_SIZE - 1) / (TILE_SIZE * VEC_SIZE)) * TILE_SIZE,
cl::NDRange local_size(TILE_SIZE, TILE_SIZE); ((n + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE);
cl::NDRange local_size(TILE_SIZE / VEC_SIZE, TILE_SIZE);
openCL.getQueue().enqueueNDRangeKernel( openCL.getQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_size, local_size, kernel, cl::NullRange, global_size, local_size,
all(event_, other.event_), &result.event_); all(event_, other.event_), &result.event_);