mirror of
https://github.com/StepanovPlaton/NeuralNetwork.git
synced 2026-04-03 20:30:39 +04:00
Fixes
This commit is contained in:
14
src/run.py
14
src/run.py
@@ -5,26 +5,26 @@ import time
|
||||
if (MODE == PLATFORM.OPENCL):
|
||||
init("./tensor/")
|
||||
|
||||
a = Matrix([1024, 1024], 1)
|
||||
b = Matrix([1024, 1024], 1)
|
||||
a = Matrix([4096*4, 4096*4], 1)
|
||||
b = Matrix([4096*4, 4096*4], 1)
|
||||
|
||||
|
||||
# Benchmark body for the project's Matrix type: computes `c` from the
# module-level matrices `a` and `b` and returns it.
# NOTE(review): two assignments to `c` are visible below (chained `@` products,
# then `a + b`) -- presumably the removed and added sides of this diff hunk;
# if both were kept, only the last assignment would determine the result.
def benchmark_tensor():
|
||||
c = ((a @ b) @ (a @ b)) @ ((a @ b) @ (a @ b))
|
||||
c = a + b
|
||||
return c
|
||||
|
||||
|
||||
a_np = np.ones([1024, 1024], dtype=np.float32)
|
||||
b_np = np.ones([1024, 1024], dtype=np.float32)
|
||||
a_np = np.ones([4096*4, 4096*4], dtype=np.float32)
|
||||
b_np = np.ones([4096*4, 4096*4], dtype=np.float32)
|
||||
|
||||
|
||||
# NumPy counterpart of the Tensor benchmark: computes `c` from the
# module-level arrays `a_np` and `b_np` and returns it.
# NOTE(review): two assignments to `c` are visible below (chained `@` products,
# then `a_np + b_np`) -- presumably the removed and added sides of this diff
# hunk; if both were kept, only the last assignment would determine the result.
def benchmark_numpy():
|
||||
c = ((a_np @ b_np) @ (a_np @ b_np)) @ ((a_np @ b_np) @ (a_np @ b_np))
|
||||
c = a_np + b_np
|
||||
return c
|
||||
|
||||
|
||||
# Run several iterations for a more accurate measurement
|
||||
iterations = 5
|
||||
iterations = 2
|
||||
|
||||
print("Бенчмарк Tensor:")
|
||||
tensor_times = []
|
||||
|
||||
@@ -7,16 +7,33 @@ OpenCL openCL;
|
||||
#include "cpu/tensor.hpp"
|
||||
#endif
|
||||
|
||||
#include <chrono>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
|
||||
// TODO: TMult >2
|
||||
|
||||
// Minimal timing helper: runs a callable once and prints how long the call
// took, in microseconds, to std::cout.
class Profiler {
|
||||
public:
|
||||
// Invokes `op` exactly once and prints "<operation>: <elapsed> μs".
//   operation - label printed in front of the measurement
//   op        - callable to time; it is called synchronously between the two
//               clock reads, so only work finished before `op` returns is
//               included in the measurement
// NOTE(review): std::chrono::high_resolution_clock is not guaranteed to be
// monotonic on every implementation; std::chrono::steady_clock is the usual
// choice for interval timing -- confirm whether that matters here.
static void measure(const std::string &operation, std::function<void()> op) {
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
op();
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
// Truncate the elapsed time to whole microseconds for printing.
auto duration =
|
||||
std::chrono::duration_cast<std::chrono::microseconds>(end - start);
|
||||
std::cout << operation << ": " << duration.count() << " μs\n";
|
||||
}
|
||||
};
|
||||
|
||||
// Entry point of the benchmark program: optionally initialises OpenCL, builds
// two float matrices, times `a % b` once via Profiler::measure, then prints
// `a.toString()`.
// NOTE(review): two declarations of `a` are visible below ({32, 32} filled
// with 2, then {4096 * 2, 4096 * 2} filled with 1) -- presumably the removed
// and added sides of this diff hunk; both cannot coexist in one scope.
int main() {
|
||||
#ifdef USE_OPENCL
|
||||
// Only compiled in when USE_OPENCL is defined.
openCL.init("./");
|
||||
#endif
|
||||
|
||||
Tensor<float, 2> a = Tensor<float, 2>({32, 32}, 2);
|
||||
Tensor<float, 2> a = Tensor<float, 2>({4096 * 2, 4096 * 2}, 1);
|
||||
Tensor<float, 2> b = Tensor<float, 2>({4096 * 2, 4096 * 2}, 1);
|
||||
|
||||
// The product is stored in a lambda-local `result` and dropped when the
// lambda returns.
// NOTE(review): the OpenCL operator% shown elsewhere in this diff enqueues a
// kernel and records an event; if it does not wait for completion, this may
// time the enqueue rather than the multiplication -- confirm.
Profiler::measure("Matrix multiplication", [&]() { auto result = a % b; });
|
||||
// NOTE(review): this prints `a`, not the product; with the larger matrix
// size the output may be very large -- confirm this is intended.
std::cout << a.toString();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -8,28 +8,39 @@ __kernel void hadamard_mult(__global float *A, __global float *B) {
|
||||
}
|
||||
|
||||
#define TILE_SIZE 16
|
||||
#define VEC_SIZE 4
|
||||
__kernel void mult(__global float *A, __global float *B, __global float *C,
|
||||
const int M, const int N, const int K) {
|
||||
|
||||
const int row = get_global_id(0);
|
||||
const int row = get_global_id(0) * VEC_SIZE;
|
||||
const int col = get_global_id(1);
|
||||
const int local_row = get_local_id(0);
|
||||
const int local_col = get_local_id(1);
|
||||
|
||||
__local float tile_A[TILE_SIZE][TILE_SIZE];
|
||||
__local float tile_B[TILE_SIZE][TILE_SIZE];
|
||||
__local float tile_A[TILE_SIZE][TILE_SIZE + 1]; // +1 для избежания bank conflicts
|
||||
__local float tile_B[TILE_SIZE][TILE_SIZE + 1];
|
||||
|
||||
float sum = 0.0f;
|
||||
float4 sum[VEC_SIZE];
|
||||
for (int i = 0; i < VEC_SIZE; i++) {
|
||||
sum[i] = (float4)(0.0f);
|
||||
}
|
||||
|
||||
for (int t = 0; t < (K - 1) / TILE_SIZE + 1; t++) {
|
||||
const int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;
|
||||
|
||||
for (int t = 0; t < numTiles; t++) {
|
||||
// Load tile_A with vectorization (VEC_SIZE rows per work-item)
|
||||
int a_col = t * TILE_SIZE + local_col;
|
||||
if (row < M && a_col < K) {
|
||||
tile_A[local_row][local_col] = A[row * K + a_col];
|
||||
} else {
|
||||
tile_A[local_row][local_col] = 0.0f;
|
||||
#pragma unroll
|
||||
for (int v = 0; v < VEC_SIZE; v++) {
|
||||
int current_row = row + v;
|
||||
if (current_row < M && a_col < K) {
|
||||
tile_A[local_row * VEC_SIZE + v][local_col] = A[current_row * K + a_col];
|
||||
} else {
|
||||
tile_A[local_row * VEC_SIZE + v][local_col] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
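// Load tile_B
// NOTE(review): each work-item writes a single tile_B row indexed by
// local_row. With the host-side local size of (TILE_SIZE / VEC_SIZE, TILE_SIZE)
// local_row covers only TILE_SIZE / VEC_SIZE values, while the compute loop
// reads TILE_SIZE rows of tile_B -- confirm that every row is actually loaded.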
|
||||
int b_row = t * TILE_SIZE + local_row;
|
||||
if (b_row < K && col < N) {
|
||||
tile_B[local_row][local_col] = B[b_row * N + col];
|
||||
@@ -39,16 +50,32 @@ __kernel void mult(__global float *A, __global float *B, __global float *C,
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int k_max = min(TILE_SIZE, K - t * TILE_SIZE);
|
||||
for (int k = 0; k < k_max; k++) {
|
||||
sum += tile_A[local_row][k] * tile_B[k][local_col];
|
||||
// Vectorized computation
|
||||
#pragma unroll
|
||||
for (int k = 0; k < TILE_SIZE; k++) {
|
||||
float4 a_vals = (float4)(
|
||||
tile_A[local_row * VEC_SIZE + 0][k],
|
||||
tile_A[local_row * VEC_SIZE + 1][k],
|
||||
tile_A[local_row * VEC_SIZE + 2][k],
|
||||
tile_A[local_row * VEC_SIZE + 3][k]
|
||||
);
|
||||
float b_val = tile_B[k][local_col];
|
||||
|
||||
sum[0] += a_vals.x * b_val;
|
||||
sum[1] += a_vals.y * b_val;
|
||||
sum[2] += a_vals.z * b_val;
|
||||
sum[3] += a_vals.w * b_val;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (row < M && col < N) {
|
||||
C[row * N + col] = sum;
|
||||
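// Store the results with vectorization (VEC_SIZE rows per work-item)
// NOTE(review): each sum[v] is a float4 that is only ever updated with a
// scalar product (sum[v] += a * b), which OpenCL C broadcasts to all four
// lanes; adding .x + .y + .z + .w below therefore looks like it yields 4x the
// intended dot product -- confirm.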
|
||||
#pragma unroll
|
||||
for (int v = 0; v < VEC_SIZE; v++) {
|
||||
int current_row = row + v;
|
||||
if (current_row < M && col < N) {
|
||||
C[current_row * N + col] = sum[v].x + sum[v].y + sum[v].z + sum[v].w;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -175,6 +175,7 @@ public:
|
||||
}
|
||||
|
||||
#define TILE_SIZE 16
|
||||
#define VEC_SIZE 4
|
||||
Tensor<T, Dim == 1 ? 0 : 2> operator%(const Tensor &other) const {
|
||||
static_assert(Dim == 1 || Dim == 2,
|
||||
"Inner product is only defined for vectors and matrices");
|
||||
@@ -195,9 +196,10 @@ public:
|
||||
kernel.setArg(3, (int)m);
|
||||
kernel.setArg(4, (int)n);
|
||||
kernel.setArg(5, (int)k);
|
||||
cl::NDRange global_size(((m + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE,
|
||||
((n + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE);
|
||||
cl::NDRange local_size(TILE_SIZE, TILE_SIZE);
|
||||
cl::NDRange global_size(
|
||||
((m + TILE_SIZE * VEC_SIZE - 1) / (TILE_SIZE * VEC_SIZE)) * TILE_SIZE,
|
||||
((n + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE);
|
||||
cl::NDRange local_size(TILE_SIZE / VEC_SIZE, TILE_SIZE);
|
||||
openCL.getQueue().enqueueNDRangeKernel(
|
||||
kernel, cl::NullRange, global_size, local_size,
|
||||
all(event_, other.event_), &result.event_);
|
||||
|
||||
Reference in New Issue
Block a user