mirror of
https://github.com/StepanovPlaton/NeuralNetwork.git
synced 2026-04-03 12:20:39 +04:00
Finally it works
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -3,5 +3,6 @@
|
|||||||
*.exe
|
*.exe
|
||||||
*.so
|
*.so
|
||||||
*.pyd
|
*.pyd
|
||||||
|
*pyi
|
||||||
|
|
||||||
src/tensor/build
|
src/tensor/build
|
||||||
125
src/run.py
125
src/run.py
@@ -1,54 +1,91 @@
|
|||||||
from tensor.tensor import *
|
import tensor.tensor as T
|
||||||
import numpy as np
|
|
||||||
import time
|
|
||||||
|
|
||||||
if (MODE == PLATFORM.OPENCL):
|
if (T.MODE == T.PLATFORM.OPENCL):
|
||||||
init("./tensor/")
|
T.init()
|
||||||
|
|
||||||
a = Matrix([4096*4, 4096*4], 1)
|
|
||||||
b = Matrix([4096*4, 4096*4], 1)
|
|
||||||
|
|
||||||
|
|
||||||
def benchmark_tensor():
|
class Layer:
|
||||||
c = a + b
|
inputFeatures: int
|
||||||
return c
|
outputFeatures: int
|
||||||
|
weights: T.Matrix
|
||||||
|
bias: T.Matrix # T.Vector
|
||||||
|
activation: T.FUNCTION
|
||||||
|
|
||||||
|
internal: T.Matrix # T.Vector
|
||||||
|
outputs: T.Matrix # T.Vector
|
||||||
|
|
||||||
|
def __init__(self, inputFeatures: int, outputFeatures: int, activation: T.FUNCTION):
|
||||||
|
self.inputFeatures = inputFeatures
|
||||||
|
self.outputFeatures = outputFeatures
|
||||||
|
self.weights = T.Matrix([outputFeatures, inputFeatures], 0, 1)*0.1
|
||||||
|
self.bias = T.Matrix([outputFeatures, 1], 0)
|
||||||
|
self.activation = activation
|
||||||
|
|
||||||
|
self.internal = T.Matrix([outputFeatures, 1], 0)
|
||||||
|
self.outputs = T.Matrix([outputFeatures, 1], 0)
|
||||||
|
|
||||||
|
|
||||||
a_np = np.ones([4096*4, 4096*4], dtype=np.float32)
|
class NN:
|
||||||
b_np = np.ones([4096*4, 4096*4], dtype=np.float32)
|
layers: list[Layer]
|
||||||
|
|
||||||
|
def __init__(self, layers: list[Layer]):
|
||||||
|
self.layers = layers
|
||||||
|
|
||||||
|
def forward(self, inputs: T.Matrix) -> T.Matrix:
|
||||||
|
for i, layer in enumerate(self.layers):
|
||||||
|
layer.internal = (
|
||||||
|
layer.weights @
|
||||||
|
(inputs if i == 0 else self.layers[i-1].outputs)
|
||||||
|
) + layer.bias
|
||||||
|
layer.outputs = layer.internal(layer.activation)
|
||||||
|
return self.layers[len(self.layers)-1].outputs
|
||||||
|
|
||||||
|
def learn(self, inputs: T.Matrix, target: T.Matrix):
|
||||||
|
self.forward(inputs)
|
||||||
|
|
||||||
|
lossVector = self.layers[len(self.layers) -
|
||||||
|
1].outputs - target
|
||||||
|
# print("loss", lossVector(T.FUNCTION.MSE))
|
||||||
|
dAnl = lossVector(T.FUNCTION.MSE, True)
|
||||||
|
for i in range(len(self.layers)-1, -1, -1):
|
||||||
|
dZl = dAnl * \
|
||||||
|
self.layers[i].internal(self.layers[i].activation, True)
|
||||||
|
dWl = dZl @ (inputs if i ==
|
||||||
|
0 else self.layers[i-1].outputs).t()
|
||||||
|
dbl = dZl
|
||||||
|
# dbl = dZl.sum(axis=1).reshape(dZl.shape[0], 1)
|
||||||
|
dAnl = self.layers[i].weights.t() @ dZl
|
||||||
|
self.layers[i].weights.t()
|
||||||
|
self.layers[i].weights += (dWl * -0.3)
|
||||||
|
self.layers[i].bias += (dbl * -0.3)
|
||||||
|
|
||||||
|
|
||||||
def benchmark_numpy():
|
nn = NN([Layer(2, 3, T.FUNCTION.SIGMOID), Layer(3, 1, T.FUNCTION.LINEAR)])
|
||||||
c = a_np + b_np
|
|
||||||
return c
|
|
||||||
|
|
||||||
|
print("Обучение...")
|
||||||
|
for epoch in range(1000):
|
||||||
|
total_loss = 0
|
||||||
|
for i in range(0, 2):
|
||||||
|
for j in range(0, 2):
|
||||||
|
input = T.Matrix([2, 1], [i, j])
|
||||||
|
output = T.Matrix([1, 1], [i ^ j])
|
||||||
|
nn.learn(input, output)
|
||||||
|
|
||||||
# Многократное выполнение для более точного измерения
|
if epoch % 100 == 0:
|
||||||
iterations = 2
|
print(f"Эпоха {epoch}")
|
||||||
|
for i in range(0, 2):
|
||||||
|
for j in range(0, 2):
|
||||||
|
input = T.Matrix([2, 1], [i, j])
|
||||||
|
predicted = nn.forward(input)
|
||||||
|
print(
|
||||||
|
f"{i} XOR {j} = {i ^ j}, NN: ", predicted)
|
||||||
|
print()
|
||||||
|
|
||||||
print("Бенчмарк Tensor:")
|
print("Финальные результаты:")
|
||||||
tensor_times = []
|
for i in range(0, 2):
|
||||||
for i in range(iterations):
|
for j in range(0, 2):
|
||||||
start = time.time()
|
input = T.Matrix([2, 1], [i, j])
|
||||||
result_tensor = benchmark_tensor()
|
predicted = nn.forward(input)
|
||||||
print(result_tensor)
|
print(
|
||||||
tensor_times.append(time.time() - start)
|
f"{i} XOR {j} = {i ^ j}, NN: ", predicted)
|
||||||
|
print()
|
||||||
print("Бенчмарк NumPy:")
|
|
||||||
numpy_times = []
|
|
||||||
for i in range(iterations):
|
|
||||||
start = time.time()
|
|
||||||
result_numpy = benchmark_numpy()
|
|
||||||
print(result_numpy)
|
|
||||||
numpy_times.append(time.time() - start)
|
|
||||||
|
|
||||||
print(
|
|
||||||
f"\nСреднее время Tensor: {np.mean(tensor_times):.4f} ± {np.std(tensor_times):.4f} сек")
|
|
||||||
print(
|
|
||||||
f"Среднее время NumPy: {np.mean(numpy_times):.4f} ± {np.std(numpy_times):.4f} сек")
|
|
||||||
|
|
||||||
ratio = np.mean(numpy_times) / np.mean(tensor_times)
|
|
||||||
if ratio > 1:
|
|
||||||
print(f"Tensor быстрее в {ratio:.2f} раз")
|
|
||||||
else:
|
|
||||||
print(f"NumPy быстрее в {1/ratio:.2f} раз")
|
|
||||||
|
|||||||
@@ -54,4 +54,4 @@ opencl_module: $(COMMON_SRC) $(OPENCL_SRC) pybind.cpp | $(BUILD_DIR)
|
|||||||
PYTHONPATH=. pybind11-stubgen tensor -o .
|
PYTHONPATH=. pybind11-stubgen tensor -o .
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -rf $(BUILD_DIR) $(TARGET) *.$(SHARED_LIB_EXT)
|
rm -rf $(BUILD_DIR) $(TARGET) *.$(SHARED_LIB_EXT) *.pyi
|
||||||
|
|||||||
@@ -51,6 +51,8 @@ public:
|
|||||||
|
|
||||||
Tensor<T, Dim == 1 ? 0 : 2> operator%(const Tensor &other) const;
|
Tensor<T, Dim == 1 ? 0 : 2> operator%(const Tensor &other) const;
|
||||||
|
|
||||||
|
Tensor apply(Function f, bool derivative = false) const override;
|
||||||
|
|
||||||
std::string toString() const override;
|
std::string toString() const override;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -149,14 +149,49 @@ Tensor<T, Dim>::operator%(const Tensor &other) const {
|
|||||||
for (size_t j = 0; j < p; ++j) {
|
for (size_t j = 0; j < p; ++j) {
|
||||||
T sum = T(0);
|
T sum = T(0);
|
||||||
for (size_t k = 0; k < n; ++k)
|
for (size_t k = 0; k < n; ++k)
|
||||||
sum += (*this)(i, k) * other(k, j);
|
sum += (*this)[i * n + k] * other[k * p + j];
|
||||||
result(i, j) = sum;
|
result[i * p + j] = sum;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename T, int Dim>
|
||||||
|
Tensor<T, Dim> Tensor<T, Dim>::apply(Function f, bool derivative) const {
|
||||||
|
Tensor result = *this;
|
||||||
|
auto func = [f, derivative](T x) -> T {
|
||||||
|
switch (f) {
|
||||||
|
case Function::SIGMOID:
|
||||||
|
if (!derivative)
|
||||||
|
return T(1) / (T(1) + std::exp(-x));
|
||||||
|
else {
|
||||||
|
T sigmoid = T(1) / (T(1) + std::exp(-x));
|
||||||
|
return sigmoid * (T(1) - sigmoid);
|
||||||
|
}
|
||||||
|
case Function::RELU:
|
||||||
|
if (!derivative)
|
||||||
|
return std::max(T(0), x);
|
||||||
|
else
|
||||||
|
return (x > T(0)) ? T(1) : T(0);
|
||||||
|
case Function::MSE:
|
||||||
|
if (!derivative)
|
||||||
|
return x * x;
|
||||||
|
else
|
||||||
|
return T(2) * x;
|
||||||
|
case Function::LINEAR:
|
||||||
|
default:
|
||||||
|
if (!derivative)
|
||||||
|
return x;
|
||||||
|
else
|
||||||
|
return T(1);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
for (size_t i = 0; i < getSize(); ++i)
|
||||||
|
result[i] = func((*this)[i]);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
// ===== UTILS =====
|
// ===== UTILS =====
|
||||||
template <typename T, int Dim> std::string Tensor<T, Dim>::toString() const {
|
template <typename T, int Dim> std::string Tensor<T, Dim>::toString() const {
|
||||||
return ITensor::format(data_);
|
return ITensor::format(data_);
|
||||||
|
|||||||
@@ -20,20 +20,22 @@ public:
|
|||||||
auto end = std::chrono::high_resolution_clock::now();
|
auto end = std::chrono::high_resolution_clock::now();
|
||||||
auto duration =
|
auto duration =
|
||||||
std::chrono::duration_cast<std::chrono::microseconds>(end - start);
|
std::chrono::duration_cast<std::chrono::microseconds>(end - start);
|
||||||
std::cout << operation << ": " << duration.count() << " ns\n";
|
std::cout << operation << ": " << duration.count() / 1000000.0f << "s\n";
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
#ifdef USE_OPENCL
|
#ifdef USE_OPENCL
|
||||||
openCL.printDeviceInfo();
|
openCL.init();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
Tensor<float, 2> a = Tensor<float, 2>({4096 * 2, 4096 * 2}, 1);
|
Tensor<float, 2> a = Tensor<float, 2>({2, 3}, 0, 1);
|
||||||
Tensor<float, 2> b = Tensor<float, 2>({4096 * 2, 4096 * 2}, 1);
|
std::cout << a.toString() << std::endl;
|
||||||
Profiler::measure("Matrix multiplication", [&]() {
|
Tensor<float, 2> b = Tensor<float, 2>({2, 3}, 0, 1);
|
||||||
auto result = a % b;
|
std::cout << b.toString() << std::endl;
|
||||||
std::cout << result.toString();
|
Profiler::measure("Time", [&]() {
|
||||||
|
auto result = a * b;
|
||||||
|
std::cout << result.toString() << std::endl;
|
||||||
});
|
});
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|||||||
@@ -11,6 +11,7 @@
|
|||||||
template <typename T> class Kernels {
|
template <typename T> class Kernels {
|
||||||
public:
|
public:
|
||||||
enum class Vector {
|
enum class Vector {
|
||||||
|
type1 = 1,
|
||||||
type2 = 2,
|
type2 = 2,
|
||||||
type4 = 4,
|
type4 = 4,
|
||||||
type8 = 8,
|
type8 = 8,
|
||||||
@@ -24,6 +25,7 @@ public:
|
|||||||
T_ADD,
|
T_ADD,
|
||||||
T_HADAMARD,
|
T_HADAMARD,
|
||||||
T_MULT,
|
T_MULT,
|
||||||
|
FUNC
|
||||||
};
|
};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@@ -42,6 +44,7 @@ private:
|
|||||||
pos += value.length();
|
pos += value.length();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// std::cout << result << std::endl;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -50,6 +53,7 @@ private:
|
|||||||
R"(
|
R"(
|
||||||
__kernel void {method}(__global type* A, int len) {
|
__kernel void {method}(__global type* A, int len) {
|
||||||
int gid = get_global_id(0);
|
int gid = get_global_id(0);
|
||||||
|
#if WIDTH != 1
|
||||||
int base = gid * WIDTH;
|
int base = gid * WIDTH;
|
||||||
if (base + WIDTH <= len) {
|
if (base + WIDTH <= len) {
|
||||||
typeX data = vloadX(gid, A);
|
typeX data = vloadX(gid, A);
|
||||||
@@ -60,6 +64,9 @@ private:
|
|||||||
if (idx < len) A[idx] = {operation}A[idx];
|
if (idx < len) A[idx] = {operation}A[idx];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
A[gid] = {operation}A[gid];
|
||||||
|
#endif
|
||||||
})",
|
})",
|
||||||
{{"method", name}, {"operation", operation}});
|
{{"method", name}, {"operation", operation}});
|
||||||
}
|
}
|
||||||
@@ -69,6 +76,7 @@ private:
|
|||||||
R"(
|
R"(
|
||||||
__kernel void {method}(__global type* A, int len, type scalar) {
|
__kernel void {method}(__global type* A, int len, type scalar) {
|
||||||
int gid = get_global_id(0);
|
int gid = get_global_id(0);
|
||||||
|
#if WIDTH != 1
|
||||||
int base = gid * WIDTH;
|
int base = gid * WIDTH;
|
||||||
if (base + WIDTH <= len) {
|
if (base + WIDTH <= len) {
|
||||||
typeX data = vloadX(gid, A);
|
typeX data = vloadX(gid, A);
|
||||||
@@ -80,6 +88,9 @@ private:
|
|||||||
if (idx < len) A[idx] = A[idx] {operation} scalar;
|
if (idx < len) A[idx] = A[idx] {operation} scalar;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
A[gid] = A[gid] {operation} scalar;
|
||||||
|
#endif
|
||||||
})",
|
})",
|
||||||
{{"method", name}, {"operation", operation}});
|
{{"method", name}, {"operation", operation}});
|
||||||
}
|
}
|
||||||
@@ -89,6 +100,7 @@ private:
|
|||||||
R"(
|
R"(
|
||||||
__kernel void {method}(__global type* A, __global type* B, int len) {
|
__kernel void {method}(__global type* A, __global type* B, int len) {
|
||||||
int gid = get_global_id(0);
|
int gid = get_global_id(0);
|
||||||
|
#if WIDTH != 1
|
||||||
int base = gid * WIDTH;
|
int base = gid * WIDTH;
|
||||||
if (base + WIDTH <= len) {
|
if (base + WIDTH <= len) {
|
||||||
typeX dataA = vloadX(gid, A);
|
typeX dataA = vloadX(gid, A);
|
||||||
@@ -100,48 +112,65 @@ private:
|
|||||||
if (idx < len) A[idx] = A[idx] {operation} B[idx];
|
if (idx < len) A[idx] = A[idx] {operation} B[idx];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
A[gid] = A[gid] {operation} B[gid];
|
||||||
|
#endif
|
||||||
})",
|
})",
|
||||||
{{"method", name}, {"operation", operation}});
|
{{"method", name}, {"operation", operation}});
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string matrixMult(std::string name) {
|
std::string matrixMult() {
|
||||||
return format(
|
return R"(
|
||||||
R"(
|
__kernel void mult(const __global type* A,
|
||||||
#define TILE_SIZE WIDTH*4
|
const __global type* B,
|
||||||
__kernel void mult(const __global typeX* A,
|
__global type* C,
|
||||||
const __global typeX* B,
|
const int M, const int N, const int K) {
|
||||||
__global typeX* C, const int M, const int N, const int K) {
|
const int row = get_global_id(0);
|
||||||
const int row = get_local_id(0);
|
const int col = get_global_id(1);
|
||||||
const int col = get_local_id(1);
|
if (row < M && col < N) {
|
||||||
const int globalRow = (TILE_SIZE/WIDTH)*get_group_id(0) + row;
|
type sum = 0.0f;
|
||||||
const int globalCol = TILE_SIZE*get_group_id(1) + col;
|
for (int k = 0; k < K; k++)
|
||||||
__local typeX Asub[TILE_SIZE][TILE_SIZE/WIDTH];
|
sum += A[row * K + k] * B[k * N + col];
|
||||||
__local typeX Bsub[TILE_SIZE][TILE_SIZE/WIDTH];
|
C[row * N + col] = sum;
|
||||||
typeX acc = 0;
|
}
|
||||||
const int numTiles = K/TILE_SIZE;
|
})";
|
||||||
for (int tile = 0; tile < numTiles; tile++) {
|
}
|
||||||
const int tiledRow = (TILE_SIZE/WIDTH)*tile + row;
|
|
||||||
const int tiledCol = TILE_SIZE*tile + col;
|
std::string func() {
|
||||||
Asub[col][row] = A[tiledCol*(M/WIDTH) + globalRow];
|
return R"(
|
||||||
Bsub[col][row] = B[globalCol*(K/WIDTH) + tiledRow];
|
__kernel void func(__global type* A, const int f, const int derivative) {
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
int gid = get_global_id(0);
|
||||||
typeX vecA, vecB;
|
type x = A[gid];
|
||||||
type valB;
|
switch (f) {
|
||||||
for (int k = 0; k < TILE_SIZE/WIDTH; k++) {
|
case 0: // SIGMOID
|
||||||
vecB = Bsub[col][k];
|
if (!derivative)
|
||||||
for (int w = 0; w < WIDTH; w++) {
|
A[gid] = (type)1 / ((type)1 + exp(-x));
|
||||||
vecA = Asub[WIDTH*k + w][row];
|
else {
|
||||||
valB = vecB[w];
|
type sigmoid = (type)1 / ((type)1 + exp(-x));
|
||||||
for (int i = 0; i < WIDTH; i++)
|
A[gid] = sigmoid * ((type)1 - sigmoid);
|
||||||
acc[i] += vecA[i] * valB;
|
}
|
||||||
}
|
break;
|
||||||
}
|
case 1: // RELU
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
if (!derivative)
|
||||||
}
|
A[gid] = fmax((type)0, x);
|
||||||
C[globalCol*(M/WIDTH) + globalRow] = acc;
|
else
|
||||||
}
|
A[gid] = (x > (type)0) ? (type)1 : (type)0;
|
||||||
)",
|
break;
|
||||||
{{"method", name}});
|
case 2: // MSE (здесь это скорее квадратная функция)
|
||||||
|
if (!derivative)
|
||||||
|
A[gid] = x * x;
|
||||||
|
else
|
||||||
|
A[gid] = (type)2 * x;
|
||||||
|
break;
|
||||||
|
case 3: // LINEAR
|
||||||
|
default:
|
||||||
|
if (!derivative)
|
||||||
|
A[gid] = x;
|
||||||
|
else
|
||||||
|
A[gid] = (type)1.0f;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
})";
|
||||||
}
|
}
|
||||||
|
|
||||||
std::unordered_map<Method, std::tuple<std::string, std::string>> programs = {
|
std::unordered_map<Method, std::tuple<std::string, std::string>> programs = {
|
||||||
@@ -155,13 +184,18 @@ private:
|
|||||||
{Method::T_HADAMARD,
|
{Method::T_HADAMARD,
|
||||||
{binaryOperation("hadamard_mult", "*"), "hadamard_mult"}},
|
{binaryOperation("hadamard_mult", "*"), "hadamard_mult"}},
|
||||||
|
|
||||||
{Method::T_MULT, {matrixMult("mult"), "mult"}},
|
{Method::T_MULT, {matrixMult(), "mult"}},
|
||||||
|
|
||||||
|
{Method::FUNC, {func(), "func"}},
|
||||||
};
|
};
|
||||||
|
|
||||||
std::unordered_map<Method, cl::Program> compiledPrograms;
|
std::unordered_map<Method, cl::Program> compiledPrograms;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
Kernels(Vector vec = Vector::type4) : vector(vec) {
|
Kernels(Vector vec) : vector(vec) {
|
||||||
|
std::cout << "Compile " << getTypeName()
|
||||||
|
<< " kernels with vector size = " << std::to_string((int)vector)
|
||||||
|
<< " ";
|
||||||
std::string extensions = openCL.getDevice().getInfo<CL_DEVICE_EXTENSIONS>();
|
std::string extensions = openCL.getDevice().getInfo<CL_DEVICE_EXTENSIONS>();
|
||||||
if (extensions.find("cl_khr_fp16") != std::string::npos)
|
if (extensions.find("cl_khr_fp16") != std::string::npos)
|
||||||
configuration = R"(
|
configuration = R"(
|
||||||
@@ -183,10 +217,12 @@ public:
|
|||||||
configuration += format(
|
configuration += format(
|
||||||
R"(
|
R"(
|
||||||
typedef {type} type;
|
typedef {type} type;
|
||||||
typedef {type}{vector} typeX;
|
|
||||||
#define WIDTH {vector}
|
#define WIDTH {vector}
|
||||||
#define vloadX vload{vector}
|
#if WIDTH != 1
|
||||||
#define vstoreX vstore{vector}
|
typedef {type}{vector} typeX;
|
||||||
|
#define vloadX vload{vector}
|
||||||
|
#define vstoreX vstore{vector}
|
||||||
|
#endif
|
||||||
)",
|
)",
|
||||||
{{"type", getTypeName()}, {"vector", std::to_string((int)vector)}});
|
{{"type", getTypeName()}, {"vector", std::to_string((int)vector)}});
|
||||||
|
|
||||||
@@ -209,6 +245,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
std::cout << "completed" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
cl::Kernel create(Method method) {
|
cl::Kernel create(Method method) {
|
||||||
|
|||||||
@@ -3,18 +3,16 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
|
||||||
OpenCL::OpenCL() {
|
OpenCL::OpenCL() {}
|
||||||
|
|
||||||
|
void OpenCL::init() {
|
||||||
try {
|
try {
|
||||||
std::vector<cl::Platform> platforms;
|
std::vector<cl::Platform> platforms;
|
||||||
cl::Platform::get(&platforms);
|
cl::Platform::get(&platforms);
|
||||||
|
if (platforms.empty())
|
||||||
if (platforms.empty()) {
|
|
||||||
throw std::runtime_error("No OpenCL platforms found");
|
throw std::runtime_error("No OpenCL platforms found");
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<cl::Device> devices;
|
std::vector<cl::Device> devices;
|
||||||
bool deviceFound = false;
|
bool deviceFound = false;
|
||||||
|
|
||||||
for (const auto &platform : platforms) {
|
for (const auto &platform : platforms) {
|
||||||
try {
|
try {
|
||||||
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
|
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
|
||||||
@@ -26,7 +24,6 @@ OpenCL::OpenCL() {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!deviceFound) {
|
if (!deviceFound) {
|
||||||
for (const auto &platform : platforms) {
|
for (const auto &platform : platforms) {
|
||||||
try {
|
try {
|
||||||
@@ -40,12 +37,10 @@ OpenCL::OpenCL() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (!deviceFound)
|
||||||
if (!deviceFound) {
|
|
||||||
throw std::runtime_error("No suitable OpenCL devices found");
|
throw std::runtime_error("No suitable OpenCL devices found");
|
||||||
}
|
|
||||||
|
|
||||||
device = devices[0];
|
device = devices[0];
|
||||||
|
printDeviceInfo();
|
||||||
context = cl::Context(device);
|
context = cl::Context(device);
|
||||||
queue = cl::CommandQueue(context, device,
|
queue = cl::CommandQueue(context, device,
|
||||||
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
|
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
|
||||||
|
|||||||
@@ -13,6 +13,8 @@ private:
|
|||||||
public:
|
public:
|
||||||
OpenCL();
|
OpenCL();
|
||||||
|
|
||||||
|
void init();
|
||||||
|
|
||||||
OpenCL(const OpenCL &) = delete;
|
OpenCL(const OpenCL &) = delete;
|
||||||
OpenCL &operator=(const OpenCL &) = delete;
|
OpenCL &operator=(const OpenCL &) = delete;
|
||||||
OpenCL(OpenCL &&) = delete;
|
OpenCL(OpenCL &&) = delete;
|
||||||
|
|||||||
@@ -45,8 +45,12 @@ private:
|
|||||||
all(other.getEvent()), &event_);
|
all(other.getEvent()), &event_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
constexpr const static Kernels<T>::Vector vector = Kernels<T>::Vector::type1;
|
||||||
|
constexpr const static int vectorSize = (int)vector;
|
||||||
|
constexpr const static int tileSize = vectorSize * 4;
|
||||||
|
|
||||||
static cl::Kernel createKernel(Kernels<T>::Method method) {
|
static cl::Kernel createKernel(Kernels<T>::Method method) {
|
||||||
static Kernels<T> kernels(Kernels<T>::Vector::type4);
|
static Kernels<T> kernels(vector);
|
||||||
return kernels.create(method);
|
return kernels.create(method);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -56,7 +60,7 @@ public:
|
|||||||
using ITensor::axes_;
|
using ITensor::axes_;
|
||||||
using ITensor::checkAxisInDim;
|
using ITensor::checkAxisInDim;
|
||||||
using ITensor::checkItHasSameShape;
|
using ITensor::checkItHasSameShape;
|
||||||
using ITensor::computeIndex;
|
// using ITensor::computeIndex;
|
||||||
using ITensor::getSize;
|
using ITensor::getSize;
|
||||||
using ITensor::shape_;
|
using ITensor::shape_;
|
||||||
|
|
||||||
@@ -124,30 +128,32 @@ public:
|
|||||||
using ITensor::operator-;
|
using ITensor::operator-;
|
||||||
|
|
||||||
Tensor operator+() const override {
|
Tensor operator+() const override {
|
||||||
|
Tensor result = *this;
|
||||||
cl::Kernel kernel = createKernel(Kernels<T>::Method::POSITIVE);
|
cl::Kernel kernel = createKernel(Kernels<T>::Method::POSITIVE);
|
||||||
kernel.setArg(0, *data_);
|
kernel.setArg(0, *result.getData());
|
||||||
kernel.setArg(1, (int)getSize());
|
kernel.setArg(1, (int)result.getSize());
|
||||||
openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange,
|
openCL.getQueue().enqueueNDRangeKernel(
|
||||||
cl::NDRange(getSize()),
|
kernel, cl::NullRange, cl::NDRange(result.getSize()), cl::NullRange,
|
||||||
cl::NullRange, all(event_), &event_);
|
all(result.event_), &result.event_);
|
||||||
return *this;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
Tensor operator-() const override {
|
Tensor operator-() const override {
|
||||||
|
Tensor result = *this;
|
||||||
cl::Kernel kernel = createKernel(Kernels<T>::Method::NEGATIVE);
|
cl::Kernel kernel = createKernel(Kernels<T>::Method::NEGATIVE);
|
||||||
kernel.setArg(0, *data_);
|
kernel.setArg(0, *result.getData());
|
||||||
kernel.setArg(1, (int)getSize());
|
kernel.setArg(1, (int)result.getSize());
|
||||||
openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange,
|
openCL.getQueue().enqueueNDRangeKernel(
|
||||||
cl::NDRange(getSize()),
|
kernel, cl::NullRange, cl::NDRange(result.getSize()), cl::NullRange,
|
||||||
cl::NullRange, all(event_), &event_);
|
all(result.event_), &result.event_);
|
||||||
return *this;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
Tensor &operator+=(const T scalar) override {
|
Tensor &operator+=(const T scalar) override {
|
||||||
cl::Kernel kernel = createKernel(Kernels<T>::Method::S_ADD);
|
cl::Kernel kernel = createKernel(Kernels<T>::Method::S_ADD);
|
||||||
kernel.setArg(0, *data_);
|
kernel.setArg(0, *data_);
|
||||||
kernel.setArg(1, scalar);
|
kernel.setArg(1, (int)getSize());
|
||||||
kernel.setArg(2, (int)getSize());
|
kernel.setArg(2, scalar);
|
||||||
openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange,
|
openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange,
|
||||||
cl::NDRange(getSize()),
|
cl::NDRange(getSize()),
|
||||||
cl::NullRange, all(event_), &event_);
|
cl::NullRange, all(event_), &event_);
|
||||||
@@ -157,8 +163,8 @@ public:
|
|||||||
Tensor &operator*=(const T scalar) override {
|
Tensor &operator*=(const T scalar) override {
|
||||||
cl::Kernel kernel = createKernel(Kernels<T>::Method::S_MULT);
|
cl::Kernel kernel = createKernel(Kernels<T>::Method::S_MULT);
|
||||||
kernel.setArg(0, *data_);
|
kernel.setArg(0, *data_);
|
||||||
kernel.setArg(1, scalar);
|
kernel.setArg(1, (int)getSize());
|
||||||
kernel.setArg(2, (int)getSize());
|
kernel.setArg(2, scalar);
|
||||||
openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange,
|
openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange,
|
||||||
cl::NDRange(getSize()),
|
cl::NDRange(getSize()),
|
||||||
cl::NullRange, all(event_), &event_);
|
cl::NullRange, all(event_), &event_);
|
||||||
@@ -166,6 +172,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
Tensor &operator+=(const Tensor &other) override {
|
Tensor &operator+=(const Tensor &other) override {
|
||||||
|
checkItHasSameShape(other);
|
||||||
cl::Kernel kernel = createKernel(Kernels<T>::Method::T_ADD);
|
cl::Kernel kernel = createKernel(Kernels<T>::Method::T_ADD);
|
||||||
kernel.setArg(0, *data_);
|
kernel.setArg(0, *data_);
|
||||||
kernel.setArg(1, *other.getData());
|
kernel.setArg(1, *other.getData());
|
||||||
@@ -177,18 +184,17 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
Tensor &operator*=(const Tensor &other) override {
|
Tensor &operator*=(const Tensor &other) override {
|
||||||
|
checkItHasSameShape(other);
|
||||||
cl::Kernel kernel = createKernel(Kernels<T>::Method::T_HADAMARD);
|
cl::Kernel kernel = createKernel(Kernels<T>::Method::T_HADAMARD);
|
||||||
kernel.setArg(0, *data_);
|
kernel.setArg(0, *data_);
|
||||||
kernel.setArg(1, *other.getData());
|
kernel.setArg(1, *other.getData());
|
||||||
kernel.setArg(2, getSize());
|
kernel.setArg(2, (int)getSize());
|
||||||
openCL.getQueue().enqueueNDRangeKernel(
|
openCL.getQueue().enqueueNDRangeKernel(
|
||||||
kernel, cl::NullRange, cl::NDRange(getSize()), cl::NullRange,
|
kernel, cl::NullRange, cl::NDRange(getSize()), cl::NullRange,
|
||||||
all(event_, other.event_), &event_);
|
all(event_, other.event_), &event_);
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define TILE_SIZE 16
|
|
||||||
#define VEC_SIZE 4
|
|
||||||
Tensor<T, Dim == 1 ? 0 : 2> operator%(const Tensor &other) const {
|
Tensor<T, Dim == 1 ? 0 : 2> operator%(const Tensor &other) const {
|
||||||
static_assert(Dim == 1 || Dim == 2,
|
static_assert(Dim == 1 || Dim == 2,
|
||||||
"Inner product is only defined for vectors and matrices");
|
"Inner product is only defined for vectors and matrices");
|
||||||
@@ -209,17 +215,28 @@ public:
|
|||||||
kernel.setArg(3, (int)m);
|
kernel.setArg(3, (int)m);
|
||||||
kernel.setArg(4, (int)n);
|
kernel.setArg(4, (int)n);
|
||||||
kernel.setArg(5, (int)k);
|
kernel.setArg(5, (int)k);
|
||||||
cl::NDRange global_size(m / VEC_SIZE, n);
|
cl::NDRange globalSize(m, n);
|
||||||
cl::NDRange local_size(TILE_SIZE / VEC_SIZE, TILE_SIZE);
|
|
||||||
openCL.getQueue().enqueueNDRangeKernel(
|
openCL.getQueue().enqueueNDRangeKernel(
|
||||||
kernel, cl::NullRange, global_size, local_size,
|
kernel, cl::NullRange, globalSize, cl::NullRange,
|
||||||
all(event_, other.event_), &result.event_);
|
all(event_, other.event_), &result.event_);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Tensor apply(Function f, bool derivative = false) const override {
|
||||||
|
Tensor result = *this;
|
||||||
|
cl::Kernel kernel = createKernel(Kernels<T>::Method::FUNC);
|
||||||
|
kernel.setArg(0, *result.getData());
|
||||||
|
kernel.setArg(1, (int)f);
|
||||||
|
kernel.setArg(2, (int)derivative);
|
||||||
|
openCL.getQueue().enqueueNDRangeKernel(
|
||||||
|
kernel, cl::NullRange, cl::NDRange(result.getSize()), cl::NullRange,
|
||||||
|
all(result.event_), &result.event_);
|
||||||
|
return result;
|
||||||
|
};
|
||||||
|
|
||||||
std::string toString() const override {
|
std::string toString() const override {
|
||||||
std::vector<float> result(getSize());
|
std::vector<T> result(getSize());
|
||||||
openCL.getQueue().enqueueReadBuffer(*data_, CL_FALSE, 0,
|
openCL.getQueue().enqueueReadBuffer(*data_, CL_FALSE, 0,
|
||||||
getSize() * sizeof(T), result.data(),
|
getSize() * sizeof(T), result.data(),
|
||||||
all(event_), &event_);
|
all(event_), &event_);
|
||||||
|
|||||||
@@ -4,6 +4,7 @@
|
|||||||
|
|
||||||
#ifdef USE_OPENCL
|
#ifdef USE_OPENCL
|
||||||
#include "opencl/tensor.hpp"
|
#include "opencl/tensor.hpp"
|
||||||
|
#include <iostream>
|
||||||
OpenCL openCL;
|
OpenCL openCL;
|
||||||
#elif USE_CPU
|
#elif USE_CPU
|
||||||
#include "cpu/tensor.hpp"
|
#include "cpu/tensor.hpp"
|
||||||
@@ -15,40 +16,48 @@ enum class TENSOR_PLATFORM { CPU, OPENCL };
|
|||||||
|
|
||||||
template <typename T, int Dim>
|
template <typename T, int Dim>
|
||||||
void register_tensor(py::module &m, const std::string &name) {
|
void register_tensor(py::module &m, const std::string &name) {
|
||||||
auto tensor = py::class_<Tensor<T, Dim>>(m, name.c_str())
|
auto tensor =
|
||||||
.def(py::init<const std::array<size_t, Dim> &>())
|
py::class_<Tensor<T, Dim>>(m, name.c_str())
|
||||||
.def(py::init<const std::array<size_t, Dim> &, T>())
|
.def(py::init<const std::array<size_t, Dim> &>())
|
||||||
.def(py::init<const std::array<size_t, Dim> &,
|
.def(py::init<const std::array<size_t, Dim> &, T>())
|
||||||
const std::vector<T> &>())
|
.def(py::init<const std::array<size_t, Dim> &,
|
||||||
.def(py::init<const std::array<size_t, Dim> &, T, T>())
|
const std::vector<T> &>())
|
||||||
|
.def(py::init<const std::array<size_t, Dim> &, T, T>())
|
||||||
|
|
||||||
.def("get_shape", &Tensor<T, Dim>::getShape)
|
.def("get_shape", &Tensor<T, Dim>::getShape)
|
||||||
.def("get_axes", &Tensor<T, Dim>::getAxes)
|
.def("get_axes", &Tensor<T, Dim>::getAxes)
|
||||||
.def("get_size", &Tensor<T, Dim>::getSize)
|
.def("get_size", &Tensor<T, Dim>::getSize)
|
||||||
|
|
||||||
.def(py::self + py::self)
|
.def(py::self + py::self)
|
||||||
.def(py::self - py::self)
|
.def(py::self - py::self)
|
||||||
.def(py::self * py::self)
|
.def(py::self * py::self)
|
||||||
.def(py::self += py::self)
|
.def(py::self += py::self)
|
||||||
.def(py::self -= py::self)
|
.def(py::self -= py::self)
|
||||||
.def(py::self *= py::self)
|
.def(py::self *= py::self)
|
||||||
|
|
||||||
.def(py::self + T())
|
.def(py::self + T())
|
||||||
.def(py::self - T())
|
.def(py::self - T())
|
||||||
.def(py::self * T())
|
.def(py::self * T())
|
||||||
.def(py::self / T())
|
.def(py::self / T())
|
||||||
.def(py::self += T())
|
.def(py::self += T())
|
||||||
.def(py::self -= T())
|
.def(py::self -= T())
|
||||||
.def(py::self *= T())
|
.def(py::self *= T())
|
||||||
.def(py::self /= T())
|
.def(py::self /= T())
|
||||||
.def(T() + py::self)
|
.def(T() + py::self)
|
||||||
.def(T() - py::self)
|
.def(T() - py::self)
|
||||||
.def(T() * py::self)
|
.def(T() * py::self)
|
||||||
|
|
||||||
.def("__pos__", [](const Tensor<T, Dim> &t) { return +t; })
|
.def("__pos__", [](const Tensor<T, Dim> &t) { return +t; })
|
||||||
.def("__neg__", [](const Tensor<T, Dim> &t) { return -t; })
|
.def("__neg__", [](const Tensor<T, Dim> &t) { return -t; })
|
||||||
|
|
||||||
.def("__repr__", &Tensor<T, Dim>::toString);
|
.def("__call__", [](const Tensor<T, Dim> &self,
|
||||||
|
Function f) { return self.apply(f); })
|
||||||
|
.def("__call__",
|
||||||
|
[](const Tensor<T, Dim> &self, Function f, bool derivative) {
|
||||||
|
return self.apply(f, derivative);
|
||||||
|
})
|
||||||
|
|
||||||
|
.def("__repr__", &Tensor<T, Dim>::toString);
|
||||||
|
|
||||||
if constexpr (Dim >= 2) {
|
if constexpr (Dim >= 2) {
|
||||||
tensor
|
tensor
|
||||||
@@ -101,7 +110,6 @@ void register_tensor(py::module &m, const std::string &name) {
|
|||||||
});
|
});
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// if constexpr (Dim == 1 || Dim == 2)
|
|
||||||
if constexpr (Dim == 2)
|
if constexpr (Dim == 2)
|
||||||
tensor.def("__matmul__", &Tensor<T, Dim>::operator%);
|
tensor.def("__matmul__", &Tensor<T, Dim>::operator%);
|
||||||
}
|
}
|
||||||
@@ -114,18 +122,28 @@ PYBIND11_MODULE(tensor, m) {
|
|||||||
.value("OPENCL", TENSOR_PLATFORM::OPENCL)
|
.value("OPENCL", TENSOR_PLATFORM::OPENCL)
|
||||||
.export_values();
|
.export_values();
|
||||||
|
|
||||||
|
py::enum_<Function>(m, "FUNCTION")
|
||||||
|
.value("SIGMOID", Function::SIGMOID)
|
||||||
|
.value("RELU", Function::RELU)
|
||||||
|
.value("MSE", Function::MSE)
|
||||||
|
.value("LINEAR", Function::LINEAR)
|
||||||
|
.export_values();
|
||||||
|
|
||||||
#ifdef USE_OPENCL
|
#ifdef USE_OPENCL
|
||||||
m.attr("MODE") = TENSOR_PLATFORM::OPENCL;
|
m.attr("MODE") = TENSOR_PLATFORM::OPENCL;
|
||||||
#elif USE_CPU
|
#elif USE_CPU
|
||||||
m.attr("MODE") = TENSOR_PLATFORM::CPU;
|
m.attr("MODE") = TENSOR_PLATFORM::CPU;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef USE_OPENCL
|
||||||
|
m.def("init", []() { openCL.init(); });
|
||||||
|
#endif
|
||||||
|
|
||||||
register_tensor<float, 0>(m, "Scalar");
|
register_tensor<float, 0>(m, "Scalar");
|
||||||
register_tensor<float, 1>(m, "Vector");
|
register_tensor<float, 1>(m, "Vector");
|
||||||
register_tensor<float, 2>(m, "Matrix");
|
register_tensor<float, 2>(m, "Matrix");
|
||||||
register_tensor<float, 3>(m, "Tensor3");
|
register_tensor<float, 3>(m, "Tensor3");
|
||||||
|
|
||||||
#ifndef USE_OPENCL
|
|
||||||
register_tensor<double, 0>(m, "dScalar");
|
register_tensor<double, 0>(m, "dScalar");
|
||||||
register_tensor<double, 1>(m, "dVector");
|
register_tensor<double, 1>(m, "dVector");
|
||||||
register_tensor<double, 2>(m, "dMatrix");
|
register_tensor<double, 2>(m, "dMatrix");
|
||||||
@@ -135,5 +153,11 @@ PYBIND11_MODULE(tensor, m) {
|
|||||||
register_tensor<int, 1>(m, "iVector");
|
register_tensor<int, 1>(m, "iVector");
|
||||||
register_tensor<int, 2>(m, "iMatrix");
|
register_tensor<int, 2>(m, "iMatrix");
|
||||||
register_tensor<int, 3>(m, "iTensor3");
|
register_tensor<int, 3>(m, "iTensor3");
|
||||||
|
|
||||||
|
#ifdef USE_OPENCL
|
||||||
|
register_tensor<half, 0>(m, "hScalar");
|
||||||
|
register_tensor<half, 1>(m, "hVector");
|
||||||
|
register_tensor<half, 2>(m, "hMatrix");
|
||||||
|
register_tensor<half, 3>(m, "hTensor3");
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,6 +6,7 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
template <typename T, int Dim> class Tensor;
|
template <typename T, int Dim> class Tensor;
|
||||||
|
enum class Function { SIGMOID, RELU, MSE, LINEAR };
|
||||||
|
|
||||||
template <typename T, int Dim> class ITensor {
|
template <typename T, int Dim> class ITensor {
|
||||||
protected:
|
protected:
|
||||||
@@ -73,6 +74,8 @@ public:
|
|||||||
|
|
||||||
Tensor operator*(const Tensor &other) const;
|
Tensor operator*(const Tensor &other) const;
|
||||||
|
|
||||||
|
virtual Tensor apply(Function f, bool derivative = false) const = 0;
|
||||||
|
|
||||||
// === Utils ===
|
// === Utils ===
|
||||||
virtual std::string toString() const = 0;
|
virtual std::string toString() const = 0;
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user