This commit is contained in:
2025-11-23 01:15:51 +04:00
parent 0455d9bd5b
commit 8d5a57a8c0
4 changed files with 74 additions and 28 deletions

View File

@@ -5,26 +5,26 @@ import time
if (MODE == PLATFORM.OPENCL):
init("./tensor/")
a = Matrix([1024, 1024], 1)
b = Matrix([1024, 1024], 1)
a = Matrix([4096*4, 4096*4], 1)
b = Matrix([4096*4, 4096*4], 1)
def benchmark_tensor():
c = ((a @ b) @ (a @ b)) @ ((a @ b) @ (a @ b))
c = a + b
return c
a_np = np.ones([1024, 1024], dtype=np.float32)
b_np = np.ones([1024, 1024], dtype=np.float32)
a_np = np.ones([4096*4, 4096*4], dtype=np.float32)
b_np = np.ones([4096*4, 4096*4], dtype=np.float32)
def benchmark_numpy():
c = ((a_np @ b_np) @ (a_np @ b_np)) @ ((a_np @ b_np) @ (a_np @ b_np))
c = a_np + b_np
return c
# Run multiple iterations for a more accurate measurement
iterations = 5
iterations = 2
print("Бенчмарк Tensor:")
tensor_times = []

View File

@@ -7,16 +7,33 @@ OpenCL openCL;
#include "cpu/tensor.hpp"
#endif
#include <chrono>
#include <functional>
#include <iostream>
// TODO: TMult >2
// Minimal profiling helper: times a callable and reports the elapsed
// wall-clock time in microseconds on stdout.
class Profiler {
public:
  /// Runs `op` exactly once and prints "<operation>: <elapsed> μs".
  /// @param operation  Label printed in front of the measurement.
  /// @param op         Callable to measure; invoked exactly once.
  static void measure(const std::string &operation, std::function<void()> op) {
    // steady_clock is guaranteed monotonic; high_resolution_clock may be an
    // alias of system_clock on some platforms and can jump backwards mid-run,
    // which would corrupt the measured interval.
    const auto start = std::chrono::steady_clock::now();
    op();
    const auto end = std::chrono::steady_clock::now();
    const auto duration =
        std::chrono::duration_cast<std::chrono::microseconds>(end - start);
    std::cout << operation << ": " << duration.count() << " μs\n";
  }
};
// Entry point: benchmarks one matrix multiplication of two large square
// matrices filled with ones.
// Fix: the stale duplicate declaration of `a` ({32, 32} left over from a
// previous revision) redeclared the same name and would not compile.
int main() {
#ifdef USE_OPENCL
  openCL.init("./");
#endif
  Tensor<float, 2> a = Tensor<float, 2>({4096 * 2, 4096 * 2}, 1);
  Tensor<float, 2> b = Tensor<float, 2>({4096 * 2, 4096 * 2}, 1);
  Profiler::measure("Matrix multiplication", [&]() { auto result = a % b; });
  // NOTE(review): this prints the 8192x8192 INPUT matrix, not the product —
  // toString() on a matrix this size is enormous; confirm this is intended.
  std::cout << a.toString();
  return 0;
}

View File

@@ -8,28 +8,39 @@ __kernel void hadamard_mult(__global float *A, __global float *B) {
}
#define TILE_SIZE 16
#define VEC_SIZE 4
__kernel void mult(__global float *A, __global float *B, __global float *C,
const int M, const int N, const int K) {
const int row = get_global_id(0);
const int row = get_global_id(0) * VEC_SIZE;
const int col = get_global_id(1);
const int local_row = get_local_id(0);
const int local_col = get_local_id(1);
__local float tile_A[TILE_SIZE][TILE_SIZE];
__local float tile_B[TILE_SIZE][TILE_SIZE];
__local float tile_A[TILE_SIZE][TILE_SIZE + 1]; // +1 для избежания bank conflicts
__local float tile_B[TILE_SIZE][TILE_SIZE + 1];
float sum = 0.0f;
float4 sum[VEC_SIZE];
for (int i = 0; i < VEC_SIZE; i++) {
sum[i] = (float4)(0.0f);
}
for (int t = 0; t < (K - 1) / TILE_SIZE + 1; t++) {
const int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;
for (int t = 0; t < numTiles; t++) {
// Load tile_A with vectorization
int a_col = t * TILE_SIZE + local_col;
if (row < M && a_col < K) {
tile_A[local_row][local_col] = A[row * K + a_col];
} else {
tile_A[local_row][local_col] = 0.0f;
#pragma unroll
for (int v = 0; v < VEC_SIZE; v++) {
int current_row = row + v;
if (current_row < M && a_col < K) {
tile_A[local_row * VEC_SIZE + v][local_col] = A[current_row * K + a_col];
} else {
tile_A[local_row * VEC_SIZE + v][local_col] = 0.0f;
}
}
// Load tile_B
int b_row = t * TILE_SIZE + local_row;
if (b_row < K && col < N) {
tile_B[local_row][local_col] = B[b_row * N + col];
@@ -39,16 +50,32 @@ __kernel void mult(__global float *A, __global float *B, __global float *C,
barrier(CLK_LOCAL_MEM_FENCE);
int k_max = min(TILE_SIZE, K - t * TILE_SIZE);
for (int k = 0; k < k_max; k++) {
sum += tile_A[local_row][k] * tile_B[k][local_col];
// Vectorized computation
#pragma unroll
for (int k = 0; k < TILE_SIZE; k++) {
float4 a_vals = (float4)(
tile_A[local_row * VEC_SIZE + 0][k],
tile_A[local_row * VEC_SIZE + 1][k],
tile_A[local_row * VEC_SIZE + 2][k],
tile_A[local_row * VEC_SIZE + 3][k]
);
float b_val = tile_B[k][local_col];
sum[0] += a_vals.x * b_val;
sum[1] += a_vals.y * b_val;
sum[2] += a_vals.z * b_val;
sum[3] += a_vals.w * b_val;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (row < M && col < N) {
C[row * N + col] = sum;
// Store results with vectorization
#pragma unroll
for (int v = 0; v < VEC_SIZE; v++) {
int current_row = row + v;
if (current_row < M && col < N) {
C[current_row * N + col] = sum[v].x + sum[v].y + sum[v].z + sum[v].w;
}
}
}
}

View File

@@ -175,6 +175,7 @@ public:
}
#define TILE_SIZE 16
#define VEC_SIZE 4
Tensor<T, Dim == 1 ? 0 : 2> operator%(const Tensor &other) const {
static_assert(Dim == 1 || Dim == 2,
"Inner product is only defined for vectors and matrices");
@@ -195,9 +196,10 @@ public:
kernel.setArg(3, (int)m);
kernel.setArg(4, (int)n);
kernel.setArg(5, (int)k);
cl::NDRange global_size(((m + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE,
((n + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE);
cl::NDRange local_size(TILE_SIZE, TILE_SIZE);
cl::NDRange global_size(
((m + TILE_SIZE * VEC_SIZE - 1) / (TILE_SIZE * VEC_SIZE)) * TILE_SIZE,
((n + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE);
cl::NDRange local_size(TILE_SIZE / VEC_SIZE, TILE_SIZE);
openCL.getQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_size, local_size,
all(event_, other.event_), &result.event_);