mirror of
https://github.com/StepanovPlaton/NeuralNetwork.git
synced 2026-04-03 20:30:39 +04:00
Fixes
This commit is contained in:
14
src/run.py
14
src/run.py
@@ -5,26 +5,26 @@ import time
|
|||||||
if (MODE == PLATFORM.OPENCL):
|
if (MODE == PLATFORM.OPENCL):
|
||||||
init("./tensor/")
|
init("./tensor/")
|
||||||
|
|
||||||
a = Matrix([1024, 1024], 1)
|
a = Matrix([4096*4, 4096*4], 1)
|
||||||
b = Matrix([1024, 1024], 1)
|
b = Matrix([4096*4, 4096*4], 1)
|
||||||
|
|
||||||
|
|
||||||
def benchmark_tensor():
|
def benchmark_tensor():
|
||||||
c = ((a @ b) @ (a @ b)) @ ((a @ b) @ (a @ b))
|
c = a + b
|
||||||
return c
|
return c
|
||||||
|
|
||||||
|
|
||||||
a_np = np.ones([1024, 1024], dtype=np.float32)
|
a_np = np.ones([4096*4, 4096*4], dtype=np.float32)
|
||||||
b_np = np.ones([1024, 1024], dtype=np.float32)
|
b_np = np.ones([4096*4, 4096*4], dtype=np.float32)
|
||||||
|
|
||||||
|
|
||||||
def benchmark_numpy():
|
def benchmark_numpy():
|
||||||
c = ((a_np @ b_np) @ (a_np @ b_np)) @ ((a_np @ b_np) @ (a_np @ b_np))
|
c = a_np + b_np
|
||||||
return c
|
return c
|
||||||
|
|
||||||
|
|
||||||
# Run the benchmark several times for a more accurate measurement
|
# Run the benchmark several times for a more accurate measurement
|
||||||
iterations = 5
|
iterations = 2
|
||||||
|
|
||||||
print("Бенчмарк Tensor:")
|
print("Бенчмарк Tensor:")
|
||||||
tensor_times = []
|
tensor_times = []
|
||||||
|
|||||||
@@ -7,16 +7,33 @@ OpenCL openCL;
|
|||||||
#include "cpu/tensor.hpp"
|
#include "cpu/tensor.hpp"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include <chrono>
|
||||||
|
#include <functional>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
// TODO: TMult >2
|
// TODO: TMult >2
|
||||||
|
|
||||||
|
// Minimal wall-clock profiler: runs a callable once and prints how long it
// took.
class Profiler {
public:
  // Invokes `op` exactly once and prints "<operation>: <elapsed> μs" to
  // std::cout, where <elapsed> is the duration in whole microseconds.
  //
  // operation: label printed in front of the measurement.
  // op:        callable to time.
  //
  // Timing uses std::chrono::steady_clock because it is monotonic;
  // std::chrono::high_resolution_clock is permitted to be an alias of
  // system_clock, whose adjustments could skew or even negate the interval.
  // If `op` throws, the exception propagates and nothing is printed.
  static void measure(const std::string &operation, std::function<void()> op) {
    const auto start = std::chrono::steady_clock::now();
    op();
    const auto end = std::chrono::steady_clock::now();
    const auto duration =
        std::chrono::duration_cast<std::chrono::microseconds>(end - start);
    std::cout << operation << ": " << duration.count() << " μs\n";
  }
};
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
#ifdef USE_OPENCL
|
#ifdef USE_OPENCL
|
||||||
openCL.init("./");
|
openCL.init("./");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
Tensor<float, 2> a = Tensor<float, 2>({32, 32}, 2);
|
Tensor<float, 2> a = Tensor<float, 2>({4096 * 2, 4096 * 2}, 1);
|
||||||
|
Tensor<float, 2> b = Tensor<float, 2>({4096 * 2, 4096 * 2}, 1);
|
||||||
|
|
||||||
|
Profiler::measure("Matrix multiplication", [&]() { auto result = a % b; });
|
||||||
std::cout << a.toString();
|
std::cout << a.toString();
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,28 +8,39 @@ __kernel void hadamard_mult(__global float *A, __global float *B) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define TILE_SIZE 16
|
#define TILE_SIZE 16
|
||||||
|
#define VEC_SIZE 4
|
||||||
__kernel void mult(__global float *A, __global float *B, __global float *C,
|
__kernel void mult(__global float *A, __global float *B, __global float *C,
|
||||||
const int M, const int N, const int K) {
|
const int M, const int N, const int K) {
|
||||||
|
|
||||||
const int row = get_global_id(0);
|
const int row = get_global_id(0) * VEC_SIZE;
|
||||||
const int col = get_global_id(1);
|
const int col = get_global_id(1);
|
||||||
const int local_row = get_local_id(0);
|
const int local_row = get_local_id(0);
|
||||||
const int local_col = get_local_id(1);
|
const int local_col = get_local_id(1);
|
||||||
|
|
||||||
__local float tile_A[TILE_SIZE][TILE_SIZE];
|
__local float tile_A[TILE_SIZE][TILE_SIZE + 1]; // +1 для избежания bank conflicts
|
||||||
__local float tile_B[TILE_SIZE][TILE_SIZE];
|
__local float tile_B[TILE_SIZE][TILE_SIZE + 1];
|
||||||
|
|
||||||
float sum = 0.0f;
|
float4 sum[VEC_SIZE];
|
||||||
|
for (int i = 0; i < VEC_SIZE; i++) {
|
||||||
|
sum[i] = (float4)(0.0f);
|
||||||
|
}
|
||||||
|
|
||||||
for (int t = 0; t < (K - 1) / TILE_SIZE + 1; t++) {
|
const int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;
|
||||||
|
|
||||||
|
for (int t = 0; t < numTiles; t++) {
|
||||||
|
// Load tile_A: each work-item fills VEC_SIZE consecutive tile rows, writing 0.0f for elements outside the M x K bounds.
|
||||||
int a_col = t * TILE_SIZE + local_col;
|
int a_col = t * TILE_SIZE + local_col;
|
||||||
if (row < M && a_col < K) {
|
#pragma unroll
|
||||||
tile_A[local_row][local_col] = A[row * K + a_col];
|
for (int v = 0; v < VEC_SIZE; v++) {
|
||||||
} else {
|
int current_row = row + v;
|
||||||
tile_A[local_row][local_col] = 0.0f;
|
if (current_row < M && a_col < K) {
|
||||||
|
tile_A[local_row * VEC_SIZE + v][local_col] = A[current_row * K + a_col];
|
||||||
|
} else {
|
||||||
|
tile_A[local_row * VEC_SIZE + v][local_col] = 0.0f;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
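// Load tile_B.
// NOTE(review): this still writes only tile_B[local_row][local_col]. The host
// side of this commit launches TILE_SIZE / VEC_SIZE work-items along
// dimension 0, so local_row appears to cover only the first
// TILE_SIZE / VEC_SIZE rows of tile_B, while the compute loop reads
// tile_B[k][local_col] for every k < TILE_SIZE. Confirm that the remaining
// rows are loaded somewhere.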
|
||||||
int b_row = t * TILE_SIZE + local_row;
|
int b_row = t * TILE_SIZE + local_row;
|
||||||
if (b_row < K && col < N) {
|
if (b_row < K && col < N) {
|
||||||
tile_B[local_row][local_col] = B[b_row * N + col];
|
tile_B[local_row][local_col] = B[b_row * N + col];
|
||||||
@@ -39,16 +50,32 @@ __kernel void mult(__global float *A, __global float *B, __global float *C,
|
|||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
int k_max = min(TILE_SIZE, K - t * TILE_SIZE);
|
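// Vectorized accumulation: one partial sum per output row handled by this work-item.
// NOTE(review): sum[] holds float4 values, but each update adds a scalar
// product, which OpenCL C widens to all four lanes. The final store adds
// x + y + z + w, so the written value looks like 4x the intended dot
// product. Confirm, and consider plain float accumulators.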
|
||||||
for (int k = 0; k < k_max; k++) {
|
#pragma unroll
|
||||||
sum += tile_A[local_row][k] * tile_B[k][local_col];
|
for (int k = 0; k < TILE_SIZE; k++) {
|
||||||
|
float4 a_vals = (float4)(
|
||||||
|
tile_A[local_row * VEC_SIZE + 0][k],
|
||||||
|
tile_A[local_row * VEC_SIZE + 1][k],
|
||||||
|
tile_A[local_row * VEC_SIZE + 2][k],
|
||||||
|
tile_A[local_row * VEC_SIZE + 3][k]
|
||||||
|
);
|
||||||
|
float b_val = tile_B[k][local_col];
|
||||||
|
|
||||||
|
sum[0] += a_vals.x * b_val;
|
||||||
|
sum[1] += a_vals.y * b_val;
|
||||||
|
sum[2] += a_vals.z * b_val;
|
||||||
|
sum[3] += a_vals.w * b_val;
|
||||||
}
|
}
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (row < M && col < N) {
|
// Store the results: one output element per row handled by this work-item, skipping rows and columns outside the M x N bounds.
|
||||||
C[row * N + col] = sum;
|
#pragma unroll
|
||||||
|
for (int v = 0; v < VEC_SIZE; v++) {
|
||||||
|
int current_row = row + v;
|
||||||
|
if (current_row < M && col < N) {
|
||||||
|
C[current_row * N + col] = sum[v].x + sum[v].y + sum[v].z + sum[v].w;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -175,6 +175,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define TILE_SIZE 16
|
#define TILE_SIZE 16
|
||||||
|
#define VEC_SIZE 4
|
||||||
Tensor<T, Dim == 1 ? 0 : 2> operator%(const Tensor &other) const {
|
Tensor<T, Dim == 1 ? 0 : 2> operator%(const Tensor &other) const {
|
||||||
static_assert(Dim == 1 || Dim == 2,
|
static_assert(Dim == 1 || Dim == 2,
|
||||||
"Inner product is only defined for vectors and matrices");
|
"Inner product is only defined for vectors and matrices");
|
||||||
@@ -195,9 +196,10 @@ public:
|
|||||||
kernel.setArg(3, (int)m);
|
kernel.setArg(3, (int)m);
|
||||||
kernel.setArg(4, (int)n);
|
kernel.setArg(4, (int)n);
|
||||||
kernel.setArg(5, (int)k);
|
kernel.setArg(5, (int)k);
|
||||||
cl::NDRange global_size(((m + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE,
|
cl::NDRange global_size(
|
||||||
((n + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE);
|
((m + TILE_SIZE * VEC_SIZE - 1) / (TILE_SIZE * VEC_SIZE)) * TILE_SIZE,
|
||||||
cl::NDRange local_size(TILE_SIZE, TILE_SIZE);
|
((n + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE);
|
||||||
|
cl::NDRange local_size(TILE_SIZE / VEC_SIZE, TILE_SIZE);
|
||||||
openCL.getQueue().enqueueNDRangeKernel(
|
openCL.getQueue().enqueueNDRangeKernel(
|
||||||
kernel, cl::NullRange, global_size, local_size,
|
kernel, cl::NullRange, global_size, local_size,
|
||||||
all(event_, other.event_), &result.event_);
|
all(event_, other.event_), &result.event_);
|
||||||
|
|||||||
Reference in New Issue
Block a user