mirror of
https://github.com/StepanovPlaton/NeuralNetwork.git
synced 2026-04-03 20:30:39 +04:00
Check
This commit is contained in:
@@ -34,6 +34,7 @@ OPENCL_LIB_PATH = -L"A:/Programs/OpenCL/lib"
|
||||
OPENCL_LIB = -lOpenCL
|
||||
|
||||
.DEFAULT_GOAL := cpu
|
||||
.PHONY: cpu opencl cpu_module opencl_module clean
|
||||
|
||||
$(BUILD_DIR):
|
||||
$(MKDIR) $(BUILD_DIR)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#ifdef USE_OPENCL
|
||||
#include "opencl/tensor.hpp"
|
||||
OpenCL openCL;
|
||||
// TODO: GENERIC KERNELS
|
||||
// TODO: Scalar mult
|
||||
#elif USE_CPU
|
||||
#include "cpu/tensor.hpp"
|
||||
@@ -21,19 +20,21 @@ public:
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
auto duration =
|
||||
std::chrono::duration_cast<std::chrono::microseconds>(end - start);
|
||||
std::cout << operation << ": " << duration.count() << " μs\n";
|
||||
std::cout << operation << ": " << duration.count() << " ns\n";
|
||||
}
|
||||
};
|
||||
|
||||
int main() {
|
||||
#ifdef USE_OPENCL
|
||||
openCL.init("./");
|
||||
openCL.printDeviceInfo();
|
||||
#endif
|
||||
|
||||
Tensor<float, 2> a = Tensor<float, 2>({4096 * 2, 4096 * 2}, 1);
|
||||
Tensor<float, 2> b = Tensor<float, 2>({4096 * 2, 4096 * 2}, 1);
|
||||
Profiler::measure("Matrix multiplication", [&]() {
|
||||
auto result = a % b;
|
||||
std::cout << result.toString();
|
||||
});
|
||||
|
||||
Profiler::measure("Matrix multiplication", [&]() { auto result = a % b; });
|
||||
std::cout << a.toString();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1,10 +1,14 @@
|
||||
#include "opencl.hpp"
|
||||
#include <CL/opencl.hpp>
|
||||
|
||||
#include "opencl.hpp"
|
||||
|
||||
#include <format>
|
||||
#include <iostream>
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
template <typename T, int Dim> class Kernels {
|
||||
template <typename T> class Kernels {
|
||||
public:
|
||||
enum class Vector {
|
||||
type2 = 2,
|
||||
@@ -21,136 +25,211 @@ public:
|
||||
T_HADAMARD,
|
||||
T_MULT,
|
||||
};
|
||||
constexpr const static std::string type = typeid(T).name();
|
||||
|
||||
// TODO: get native vector size
|
||||
static Vector vector = Vector::type8;
|
||||
|
||||
private:
|
||||
static std::string unaryOperation(std::string name, std::string operation) {
|
||||
return std::format(
|
||||
constexpr std::string getTypeName() { return "unknown"; }
|
||||
Vector vector;
|
||||
std::string configuration;
|
||||
|
||||
// Expands `{key}` placeholders in `tmp` using the entries of `args`.
//
// The template is scanned once, left to right. Whenever the text between a
// '{' and the next '}' is exactly a key of `args`, the whole `{key}` span is
// replaced by the mapped value. Anything else is copied verbatim, so the
// braces of the generated OpenCL C code and unknown placeholders survive
// untouched.
//
// Substituted values are never re-scanned. The result therefore does not
// depend on the iteration order of the unordered_map, which it did when each
// key was replaced in a separate pass over the whole string.
//
// Parameters:
//   tmp  - template text containing zero or more `{key}` placeholders.
//   args - placeholder name -> replacement text. Keys containing '}' can
//          never match, because a placeholder ends at the first '}'.
// Returns: the expanded text.
std::string format(const std::string &tmp,
                   const std::unordered_map<std::string, std::string> &args) {
  std::string result;
  result.reserve(tmp.size());

  std::size_t cursor = 0;
  while (cursor < tmp.size()) {
    const std::size_t open = tmp.find('{', cursor);
    if (open == std::string::npos) {
      // No more candidates: copy the tail and stop.
      result.append(tmp, cursor, std::string::npos);
      break;
    }
    result.append(tmp, cursor, open - cursor);

    const std::size_t close = tmp.find('}', open + 1);
    if (close == std::string::npos) {
      // Unterminated brace: nothing after it can be a placeholder.
      result.append(tmp, open, std::string::npos);
      break;
    }

    const auto match = args.find(tmp.substr(open + 1, close - open - 1));
    if (match != args.end()) {
      result += match->second;
      cursor = close + 1;
    } else {
      // Not a placeholder: keep the brace and retry right after it, so a
      // placeholder nested inside a code block (e.g. "{ {op}x; }") is found.
      result += '{';
      cursor = open + 1;
    }
  }
  return result;
}
|
||||
|
||||
std::string unaryOperation(std::string name, std::string operation) {
|
||||
return format(
|
||||
R"(
|
||||
__kernel void {method}(__global {type}* A, int len) {{
|
||||
__kernel void {method}(__global type* A, int len) {
|
||||
int gid = get_global_id(0);
|
||||
int base = gid * {vector};
|
||||
if (base + ({vector}-1) < len) {{
|
||||
{type}{vector} data = vload{vector}(gid, A);
|
||||
vstore{vector}({operation}data, gid, A);
|
||||
}} else {{
|
||||
for (int i = 0; i < {vec_size}; i++) {{
|
||||
int base = gid * WIDTH;
|
||||
if (base + WIDTH <= len) {
|
||||
typeX data = vloadX(gid, A);
|
||||
vstoreX({operation}data, gid, A);
|
||||
} else {
|
||||
for (int i = 0; i < WIDTH; i++) {
|
||||
int idx = base + i;
|
||||
if (idx < len) A[idx] = {operation}A[idx];
|
||||
}}
|
||||
}}
|
||||
}}
|
||||
)",
|
||||
std::make_format_args(std::make_pair("method", name),
|
||||
std::make_pair("vector", vector),
|
||||
std::make_pair("type", type),
|
||||
std::make_pair("operation", operation)));
|
||||
}
|
||||
}
|
||||
})",
|
||||
{{"method", name}, {"operation", operation}});
|
||||
}
|
||||
|
||||
static std::string scalarOperation(std::string name, std::string operation) {
|
||||
return std::format(
|
||||
std::string scalarOperation(std::string name, std::string operation) {
|
||||
return format(
|
||||
R"(
|
||||
__kernel void {method}(__global {type}* A, int len, {type} scalar) {{
|
||||
__kernel void {method}(__global type* A, int len, type scalar) {
|
||||
int gid = get_global_id(0);
|
||||
int base = gid * {vector};
|
||||
if (base + ({vector}-1) < len) {{
|
||||
{type}{vector} data = vload{vector}(gid, A);
|
||||
int base = gid * WIDTH;
|
||||
if (base + WIDTH <= len) {
|
||||
typeX data = vloadX(gid, A);
|
||||
data = data {operation} scalar;
|
||||
vstore{vector}(data, gid, A);
|
||||
}} else {{
|
||||
for (int i = 0; i < {vec_size}; i++) {{
|
||||
vstoreX(data, gid, A);
|
||||
} else {
|
||||
for (int i = 0; i < WIDTH; i++) {
|
||||
int idx = base + i;
|
||||
if (idx < len) A[idx] = A[idx] {operation} scalar;
|
||||
}}
|
||||
}}
|
||||
}}
|
||||
)",
|
||||
std::make_format_args(std::make_pair("method", name),
|
||||
std::make_pair("vector", vector),
|
||||
std::make_pair("type", type),
|
||||
std::make_pair("operation", operation)));
|
||||
}
|
||||
}
|
||||
})",
|
||||
{{"method", name}, {"operation", operation}});
|
||||
}
|
||||
|
||||
static std::string binaryOperation(std::string name, std::string operation) {
|
||||
return std::format(
|
||||
std::string binaryOperation(std::string name, std::string operation) {
|
||||
return format(
|
||||
R"(
|
||||
__kernel void {method}(__global {type}* A, __global {type}* B, int len) {{
|
||||
__kernel void {method}(__global type* A, __global type* B, int len) {
|
||||
int gid = get_global_id(0);
|
||||
int base = gid * {vector};
|
||||
if (base + ({vector}-1) < len) {{
|
||||
{type}{vector} dataA = vload{vector}(gid, A);
|
||||
{type}{vector} dataB = vload{vector}(gid, B);
|
||||
vstore{vector}(dataA {operation} dataB, gid, A);
|
||||
}} else {{
|
||||
for (int i = 0; i < {vector}; i++) {{
|
||||
int base = gid * WIDTH;
|
||||
if (base + WIDTH <= len) {
|
||||
typeX dataA = vloadX(gid, A);
|
||||
typeX dataB = vloadX(gid, B);
|
||||
vstoreX(dataA {operation} dataB, gid, A);
|
||||
} else {
|
||||
for (int i = 0; i < WIDTH; i++) {
|
||||
int idx = base + i;
|
||||
if (idx < len) A[idx] = A[idx] {operation} B[idx];
|
||||
}}
|
||||
}}
|
||||
}}
|
||||
)",
|
||||
std::make_format_args(std::make_pair("method", name),
|
||||
std::make_pair("vector", vector),
|
||||
std::make_pair("type", type),
|
||||
std::make_pair("operation", operation)));
|
||||
}
|
||||
}
|
||||
})",
|
||||
{{"method", name}, {"operation", operation}});
|
||||
}
|
||||
|
||||
static std::unordered_map<Method, std::tuple<std::string, std::string>>
|
||||
programs = {
|
||||
std::string matrixMult(std::string name) {
|
||||
return format(
|
||||
R"(
|
||||
#define TILE_SIZE WIDTH*4
|
||||
__kernel void mult(const __global typeX* A,
|
||||
const __global typeX* B,
|
||||
__global typeX* C, const int M, const int N, const int K) {
|
||||
const int row = get_local_id(0);
|
||||
const int col = get_local_id(1);
|
||||
const int globalRow = (TILE_SIZE/WIDTH)*get_group_id(0) + row;
|
||||
const int globalCol = TILE_SIZE*get_group_id(1) + col;
|
||||
__local typeX Asub[TILE_SIZE][TILE_SIZE/WIDTH];
|
||||
__local typeX Bsub[TILE_SIZE][TILE_SIZE/WIDTH];
|
||||
typeX acc = 0;
|
||||
const int numTiles = K/TILE_SIZE;
|
||||
for (int tile = 0; tile < numTiles; tile++) {
|
||||
const int tiledRow = (TILE_SIZE/WIDTH)*tile + row;
|
||||
const int tiledCol = TILE_SIZE*tile + col;
|
||||
Asub[col][row] = A[tiledCol*(M/WIDTH) + globalRow];
|
||||
Bsub[col][row] = B[globalCol*(K/WIDTH) + tiledRow];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
typeX vecA, vecB;
|
||||
type valB;
|
||||
for (int k = 0; k < TILE_SIZE/WIDTH; k++) {
|
||||
vecB = Bsub[col][k];
|
||||
for (int w = 0; w < WIDTH; w++) {
|
||||
vecA = Asub[WIDTH*k + w][row];
|
||||
valB = vecB[w];
|
||||
for (int i = 0; i < WIDTH; i++)
|
||||
acc[i] += vecA[i] * valB;
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
C[globalCol*(M/WIDTH) + globalRow] = acc;
|
||||
}
|
||||
)",
|
||||
{{"method", name}});
|
||||
}
|
||||
|
||||
std::unordered_map<Method, std::tuple<std::string, std::string>> programs = {
|
||||
{Method::POSITIVE, {unaryOperation("positive", "+"), "positive"}},
|
||||
{Method::NEGATIVE, {unaryOperation("negative", "-")}, "negative"},
|
||||
{Method::NEGATIVE, {unaryOperation("negative", "-"), "negative"}},
|
||||
|
||||
{Method::S_ADD, {scalarOperation("add", "+")}, "add"},
|
||||
{Method::S_MULT, {scalarOperation("mult", "*")}, "mult"},
|
||||
{Method::S_ADD, {scalarOperation("add", "+"), "add"}},
|
||||
{Method::S_MULT, {scalarOperation("mult", "*"), "mult"}},
|
||||
|
||||
{Method::T_ADD, {binaryOperation("add", "+")}, "add"},
|
||||
{Method::T_ADD, {binaryOperation("add", "+"), "add"}},
|
||||
{Method::T_HADAMARD,
|
||||
{binaryOperation("hadamard_mult", "*")},
|
||||
"hadamard_mult"},
|
||||
{Method::T_MULT, {"", "mult"}},
|
||||
{binaryOperation("hadamard_mult", "*"), "hadamard_mult"}},
|
||||
|
||||
{Method::T_MULT, {matrixMult("mult"), "mult"}},
|
||||
};
|
||||
|
||||
static inline std::unordered_map<Method, cl::Program> compiledPrograms;
|
||||
static inline std::mutex compileMutex;
|
||||
std::unordered_map<Method, cl::Program> compiledPrograms;
|
||||
|
||||
public:
|
||||
static cl::Kernel create(Method method) {
|
||||
std::lock_guard<std::mutex> lock(compileMutex);
|
||||
|
||||
auto cache = compiledPrograms.find(method);
|
||||
if (cache != compiledPrograms.end()) {
|
||||
const auto &programName = std::get<1>(programs[method]);
|
||||
return cl::Kernel(cache->second, programName.c_str());
|
||||
}
|
||||
|
||||
auto program = programs.find(method);
|
||||
if (program == programs.end())
|
||||
throw std::runtime_error("Unknown method: " +
|
||||
std::to_string(static_cast<int>(method)));
|
||||
const auto &[sourceCode, kernelName] = program->second;
|
||||
Kernels(Vector vec = Vector::type4) : vector(vec) {
|
||||
std::string extensions = openCL.getDevice().getInfo<CL_DEVICE_EXTENSIONS>();
|
||||
if (extensions.find("cl_khr_fp16") != std::string::npos)
|
||||
configuration = R"(
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
typedef half _half;
|
||||
typedef half2 _half2;
|
||||
typedef half4 _half4;
|
||||
typedef half8 _half8;
|
||||
typedef half16 _half16;
|
||||
)";
|
||||
else
|
||||
configuration = R"(
|
||||
typedef float _half;
|
||||
typedef float2 _half2;
|
||||
typedef float4 _half4;
|
||||
typedef float8 _half8;
|
||||
typedef float16 _half16;
|
||||
)";
|
||||
configuration += format(
|
||||
R"(
|
||||
typedef {type} type;
|
||||
typedef {type}{vector} typeX;
|
||||
#define WIDTH {vector}
|
||||
#define vloadX vload{vector}
|
||||
#define vstoreX vstore{vector}
|
||||
)",
|
||||
{{"type", getTypeName()}, {"vector", std::to_string((int)vector)}});
|
||||
|
||||
for (const auto &[method, programInfo] : programs) {
|
||||
const auto &[sourceCode, kernelName] = programInfo;
|
||||
if (!sourceCode.empty()) {
|
||||
cl::Program program(openCL.getContext(), configuration + sourceCode);
|
||||
try {
|
||||
cl::Program::Sources sources;
|
||||
sources.push_back({sourceCode.c_str(), sourceCode.length()});
|
||||
cl::Program program(openCL.getContext(), sources);
|
||||
program.build({openCL.getDevice()});
|
||||
compiledPrograms[method] = program;
|
||||
return cl::Kernel(program, kernelName.c_str());
|
||||
|
||||
} catch (const cl::Error &e) {
|
||||
if (e.err() == CL_BUILD_PROGRAM_FAILURE) {
|
||||
cl::Program program(openCL.getContext(),
|
||||
{sourceCode.c_str(), sourceCode.length()});
|
||||
auto buildInfo =
|
||||
std::cerr << "OpenCL compilation error for method "
|
||||
<< static_cast<int>(method) << ": " << e.what()
|
||||
<< std::endl;
|
||||
std::string buildLog =
|
||||
program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(openCL.getDevice());
|
||||
throw std::runtime_error(
|
||||
"OpenCL compilation failed: " + std::string(e.what()) +
|
||||
"\nBuild log:\n" + buildInfo);
|
||||
std::cerr << "Build log for method " << static_cast<int>(method)
|
||||
<< ":" << std::endl;
|
||||
std::cerr << buildLog << std::endl;
|
||||
}
|
||||
throw std::runtime_error("OpenCL error: " + std::string(e.what()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Creates a kernel object for `method` from the program already stored in
// `compiledPrograms`.
//
// Parameters:
//   method - operation whose kernel is requested.
// Returns: a new cl::Kernel bound to the kernel name registered for `method`
//          in `programs`.
// Throws: std::runtime_error when no compiled program is stored for `method`.
cl::Kernel create(Method method) {
  const auto compiled = compiledPrograms.find(method);
  if (compiled != compiledPrograms.end()) {
    // The second tuple element of a `programs` entry is the kernel name.
    const std::string &entryPoint = std::get<1>(programs[method]);
    return cl::Kernel(compiled->second, entryPoint.c_str());
  }
  throw std::runtime_error("Program for method not found or not compiled");
}
|
||||
};
|
||||
|
||||
#define SPECIALIZE_KERNELS_TYPE(type, name) \
|
||||
template <> constexpr std::string Kernels<type>::getTypeName() { \
|
||||
return name; \
|
||||
}
|
||||
SPECIALIZE_KERNELS_TYPE(char, "char")
|
||||
SPECIALIZE_KERNELS_TYPE(short, "short")
|
||||
SPECIALIZE_KERNELS_TYPE(int, "int")
|
||||
SPECIALIZE_KERNELS_TYPE(long, "long")
|
||||
SPECIALIZE_KERNELS_TYPE(float, "float")
|
||||
SPECIALIZE_KERNELS_TYPE(double, "double")
|
||||
|
||||
typedef cl_half half;
|
||||
SPECIALIZE_KERNELS_TYPE(half, "_half")
|
||||
@@ -71,4 +71,65 @@ void OpenCL::printDeviceInfo() const {
|
||||
<< std::endl;
|
||||
std::cout << "Max Work Group Size: "
|
||||
<< device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() << std::endl;
|
||||
std::string extensions = device.getInfo<CL_DEVICE_EXTENSIONS>();
|
||||
|
||||
std::cout << "Optimal vector sizes:" << std::endl;
|
||||
try {
|
||||
cl_uint short_native =
|
||||
device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT>();
|
||||
cl_uint short_preferred =
|
||||
device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT>();
|
||||
std::cout << " short: native=" << short_native
|
||||
<< ", preferred=" << short_preferred << std::endl;
|
||||
} catch (const cl::Error &e) {
|
||||
std::cout << " short: N/A (error: " << e.what() << ")" << std::endl;
|
||||
}
|
||||
try {
|
||||
cl_uint int_native = device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_INT>();
|
||||
cl_uint int_preferred =
|
||||
device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT>();
|
||||
std::cout << " int: native=" << int_native
|
||||
<< ", preferred=" << int_preferred << std::endl;
|
||||
} catch (const cl::Error &e) {
|
||||
std::cout << " int: N/A (error: " << e.what() << ")" << std::endl;
|
||||
}
|
||||
try {
|
||||
if (extensions.find("cl_khr_fp16") != std::string::npos) {
|
||||
cl_uint half_native =
|
||||
device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF>();
|
||||
cl_uint half_preferred =
|
||||
device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF>();
|
||||
std::cout << " half: native=" << half_native
|
||||
<< ", preferred=" << half_preferred << std::endl;
|
||||
} else {
|
||||
std::cout << " half: not supported" << std::endl;
|
||||
}
|
||||
} catch (const cl::Error &e) {
|
||||
std::cout << " half: N/A (error: " << e.what() << ")" << std::endl;
|
||||
}
|
||||
try {
|
||||
cl_uint float_native =
|
||||
device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT>();
|
||||
cl_uint float_preferred =
|
||||
device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT>();
|
||||
std::cout << " float: native=" << float_native
|
||||
<< ", preferred=" << float_preferred << std::endl;
|
||||
} catch (const cl::Error &e) {
|
||||
std::cout << " float: N/A (error: " << e.what() << ")" << std::endl;
|
||||
}
|
||||
try {
|
||||
if (extensions.find("cl_khr_fp64") != std::string::npos ||
|
||||
device.getInfo<CL_DEVICE_VERSION>().find("1.0") == std::string::npos) {
|
||||
cl_uint double_native =
|
||||
device.getInfo<CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE>();
|
||||
cl_uint double_preferred =
|
||||
device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE>();
|
||||
std::cout << " double: native=" << double_native
|
||||
<< ", preferred=" << double_preferred << std::endl;
|
||||
} else {
|
||||
std::cout << " double: not supported" << std::endl;
|
||||
}
|
||||
} catch (const cl::Error &e) {
|
||||
std::cout << " double: N/A (error: " << e.what() << ")" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45,9 +45,13 @@ private:
|
||||
all(other.getEvent()), &event_);
|
||||
}
|
||||
|
||||
// Returns a kernel for `method`, created by a Kernels<T> instance that is
// shared by all calls for this element type.
//
// The instance is a function-local static, so it is constructed on the first
// call (with Vector::type4) and reused afterwards.
static cl::Kernel createKernel(typename Kernels<T>::Method method) {
  static Kernels<T> sharedKernels(Kernels<T>::Vector::type4);
  cl::Kernel kernel = sharedKernels.create(method);
  return kernel;
}
|
||||
|
||||
public:
|
||||
typedef class ITensor<T, Dim> ITensor;
|
||||
typedef class Kernels<T, Dim> Kernels;
|
||||
|
||||
using ITensor::axes_;
|
||||
using ITensor::checkAxisInDim;
|
||||
@@ -105,7 +109,7 @@ public:
|
||||
ITensor::operator=(std::move(other));
|
||||
data_ = other.data_;
|
||||
event_ = other.event_;
|
||||
other.data = nullptr;
|
||||
other.data_ = nullptr;
|
||||
return *this;
|
||||
}
|
||||
~Tensor() {
|
||||
@@ -120,8 +124,9 @@ public:
|
||||
using ITensor::operator-;
|
||||
|
||||
Tensor operator+() const override {
|
||||
cl::Kernel kernel = Kernels::create(Kernels::Method::POSITIVE);
|
||||
cl::Kernel kernel = createKernel(Kernels<T>::Method::POSITIVE);
|
||||
kernel.setArg(0, *data_);
|
||||
kernel.setArg(1, (int)getSize());
|
||||
openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange,
|
||||
cl::NDRange(getSize()),
|
||||
cl::NullRange, all(event_), &event_);
|
||||
@@ -129,8 +134,9 @@ public:
|
||||
}
|
||||
|
||||
Tensor operator-() const override {
|
||||
cl::Kernel kernel = Kernels::create(Kernels::Method::NEGATIVE);
|
||||
cl::Kernel kernel = createKernel(Kernels<T>::Method::NEGATIVE);
|
||||
kernel.setArg(0, *data_);
|
||||
kernel.setArg(1, (int)getSize());
|
||||
openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange,
|
||||
cl::NDRange(getSize()),
|
||||
cl::NullRange, all(event_), &event_);
|
||||
@@ -138,9 +144,10 @@ public:
|
||||
}
|
||||
|
||||
Tensor &operator+=(const T scalar) override {
|
||||
cl::Kernel kernel = Kernels::create(Kernels::Method::S_ADD);
|
||||
cl::Kernel kernel = createKernel(Kernels<T>::Method::S_ADD);
|
||||
kernel.setArg(0, *data_);
|
||||
kernel.setArg(1, scalar);
|
||||
kernel.setArg(2, (int)getSize());
|
||||
openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange,
|
||||
cl::NDRange(getSize()),
|
||||
cl::NullRange, all(event_), &event_);
|
||||
@@ -148,9 +155,10 @@ public:
|
||||
}
|
||||
|
||||
Tensor &operator*=(const T scalar) override {
|
||||
cl::Kernel kernel = Kernels::create(Kernels::Method::S_MULT);
|
||||
cl::Kernel kernel = createKernel(Kernels<T>::Method::S_MULT);
|
||||
kernel.setArg(0, *data_);
|
||||
kernel.setArg(1, scalar);
|
||||
kernel.setArg(2, (int)getSize());
|
||||
openCL.getQueue().enqueueNDRangeKernel(kernel, cl::NullRange,
|
||||
cl::NDRange(getSize()),
|
||||
cl::NullRange, all(event_), &event_);
|
||||
@@ -158,9 +166,10 @@ public:
|
||||
}
|
||||
|
||||
Tensor &operator+=(const Tensor &other) override {
|
||||
cl::Kernel kernel = Kernels::create(Kernels::Method::T_ADD);
|
||||
cl::Kernel kernel = createKernel(Kernels<T>::Method::T_ADD);
|
||||
kernel.setArg(0, *data_);
|
||||
kernel.setArg(1, *other.getData());
|
||||
kernel.setArg(2, (int)getSize());
|
||||
openCL.getQueue().enqueueNDRangeKernel(
|
||||
kernel, cl::NullRange, cl::NDRange(getSize()), cl::NullRange,
|
||||
all(event_, other.event_), &event_);
|
||||
@@ -168,9 +177,10 @@ public:
|
||||
}
|
||||
|
||||
Tensor &operator*=(const Tensor &other) override {
|
||||
cl::Kernel kernel = Kernels::create(Kernels::Method::T_HADAMARD);
|
||||
cl::Kernel kernel = createKernel(Kernels<T>::Method::T_HADAMARD);
|
||||
kernel.setArg(0, *data_);
|
||||
kernel.setArg(1, *other.getData());
|
||||
kernel.setArg(2, getSize());
|
||||
openCL.getQueue().enqueueNDRangeKernel(
|
||||
kernel, cl::NullRange, cl::NDRange(getSize()), cl::NullRange,
|
||||
all(event_, other.event_), &event_);
|
||||
@@ -192,16 +202,14 @@ public:
|
||||
size_t k = shape_[axes_[1]];
|
||||
size_t n = other.shape_[other.axes_[1]];
|
||||
Tensor<T, 2> result({m, n});
|
||||
cl::Kernel kernel = Kernels::create(Kernels::Method::T_MULT);
|
||||
cl::Kernel kernel = createKernel(Kernels<T>::Method::T_MULT);
|
||||
kernel.setArg(0, *data_);
|
||||
kernel.setArg(1, *other.getData());
|
||||
kernel.setArg(2, *result.getData());
|
||||
kernel.setArg(3, (int)m);
|
||||
kernel.setArg(4, (int)n);
|
||||
kernel.setArg(5, (int)k);
|
||||
cl::NDRange global_size(
|
||||
((m + TILE_SIZE * VEC_SIZE - 1) / (TILE_SIZE * VEC_SIZE)) * TILE_SIZE,
|
||||
((n + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE);
|
||||
cl::NDRange global_size(m / VEC_SIZE, n);
|
||||
cl::NDRange local_size(TILE_SIZE / VEC_SIZE, TILE_SIZE);
|
||||
openCL.getQueue().enqueueNDRangeKernel(
|
||||
kernel, cl::NullRange, global_size, local_size,
|
||||
|
||||
@@ -125,12 +125,6 @@ PYBIND11_MODULE(tensor, m) {
|
||||
register_tensor<float, 2>(m, "Matrix");
|
||||
register_tensor<float, 3>(m, "Tensor3");
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
m.def("init", [](const std::string &programsBasePath) {
|
||||
openCL.init(programsBasePath);
|
||||
});
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENCL
|
||||
register_tensor<double, 0>(m, "dScalar");
|
||||
register_tensor<double, 1>(m, "dVector");
|
||||
|
||||
@@ -1,351 +0,0 @@
|
||||
"""
|
||||
Tensor math library
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import collections.abc
|
||||
import typing
|
||||
__all__: list[str] = ['CPU', 'MODE', 'Matrix', 'OPENCL', 'PLATFORM', 'Scalar', 'Tensor3', 'Vector', 'init']
|
||||
class Matrix:
|
||||
@typing.overload
|
||||
def __add__(self, arg0: Matrix) -> Matrix:
|
||||
...
|
||||
@typing.overload
|
||||
def __add__(self, arg0: typing.SupportsFloat) -> Matrix:
|
||||
...
|
||||
@typing.overload
|
||||
def __iadd__(self, arg0: Matrix) -> Matrix:
|
||||
...
|
||||
@typing.overload
|
||||
def __iadd__(self, arg0: typing.SupportsFloat) -> Matrix:
|
||||
...
|
||||
@typing.overload
|
||||
def __imul__(self, arg0: Matrix) -> Matrix:
|
||||
...
|
||||
@typing.overload
|
||||
def __imul__(self, arg0: typing.SupportsFloat) -> Matrix:
|
||||
...
|
||||
@typing.overload
|
||||
def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(2)"]) -> None:
|
||||
...
|
||||
@typing.overload
|
||||
def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(2)"], arg1: typing.SupportsFloat) -> None:
|
||||
...
|
||||
@typing.overload
|
||||
def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(2)"], arg1: collections.abc.Sequence[typing.SupportsFloat]) -> None:
|
||||
...
|
||||
@typing.overload
|
||||
def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(2)"], arg1: typing.SupportsFloat, arg2: typing.SupportsFloat) -> None:
|
||||
...
|
||||
@typing.overload
|
||||
def __isub__(self, arg0: Matrix) -> Matrix:
|
||||
...
|
||||
@typing.overload
|
||||
def __isub__(self, arg0: typing.SupportsFloat) -> Matrix:
|
||||
...
|
||||
def __itruediv__(self, arg0: typing.SupportsFloat) -> Matrix:
|
||||
...
|
||||
def __matmul__(self, arg0: Matrix) -> Matrix:
|
||||
...
|
||||
@typing.overload
|
||||
def __mul__(self, arg0: Matrix) -> Matrix:
|
||||
...
|
||||
@typing.overload
|
||||
def __mul__(self, arg0: typing.SupportsFloat) -> Matrix:
|
||||
...
|
||||
def __neg__(self) -> Matrix:
|
||||
...
|
||||
def __pos__(self) -> Matrix:
|
||||
...
|
||||
def __radd__(self, arg0: typing.SupportsFloat) -> Matrix:
|
||||
...
|
||||
def __repr__(self) -> str:
|
||||
...
|
||||
def __rmul__(self, arg0: typing.SupportsFloat) -> Matrix:
|
||||
...
|
||||
def __rsub__(self, arg0: typing.SupportsFloat) -> Matrix:
|
||||
...
|
||||
@typing.overload
|
||||
def __sub__(self, arg0: Matrix) -> Matrix:
|
||||
...
|
||||
@typing.overload
|
||||
def __sub__(self, arg0: typing.SupportsFloat) -> Matrix:
|
||||
...
|
||||
def __truediv__(self, arg0: typing.SupportsFloat) -> Matrix:
|
||||
...
|
||||
def get_axes(self) -> typing.Annotated[list[int], "FixedSize(2)"]:
|
||||
...
|
||||
def get_shape(self) -> typing.Annotated[list[int], "FixedSize(2)"]:
|
||||
...
|
||||
def get_size(self) -> int:
|
||||
...
|
||||
def t(self) -> Matrix:
|
||||
...
|
||||
@typing.overload
|
||||
def transpose(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(2)"]) -> Matrix:
|
||||
...
|
||||
@typing.overload
|
||||
def transpose(self, arg0: typing.SupportsInt, arg1: typing.SupportsInt) -> Matrix:
|
||||
...
|
||||
class PLATFORM:
|
||||
"""
|
||||
Members:
|
||||
|
||||
CPU
|
||||
|
||||
OPENCL
|
||||
"""
|
||||
CPU: typing.ClassVar[PLATFORM] # value = <PLATFORM.CPU: 0>
|
||||
OPENCL: typing.ClassVar[PLATFORM] # value = <PLATFORM.OPENCL: 1>
|
||||
__members__: typing.ClassVar[dict[str, PLATFORM]] # value = {'CPU': <PLATFORM.CPU: 0>, 'OPENCL': <PLATFORM.OPENCL: 1>}
|
||||
def __eq__(self, other: typing.Any) -> bool:
|
||||
...
|
||||
def __getstate__(self) -> int:
|
||||
...
|
||||
def __hash__(self) -> int:
|
||||
...
|
||||
def __index__(self) -> int:
|
||||
...
|
||||
def __init__(self, value: typing.SupportsInt) -> None:
|
||||
...
|
||||
def __int__(self) -> int:
|
||||
...
|
||||
def __ne__(self, other: typing.Any) -> bool:
|
||||
...
|
||||
def __repr__(self) -> str:
|
||||
...
|
||||
def __setstate__(self, state: typing.SupportsInt) -> None:
|
||||
...
|
||||
def __str__(self) -> str:
|
||||
...
|
||||
@property
|
||||
def name(self) -> str:
|
||||
...
|
||||
@property
|
||||
def value(self) -> int:
|
||||
...
|
||||
class Scalar:
|
||||
@typing.overload
|
||||
def __add__(self, arg0: Scalar) -> Scalar:
|
||||
...
|
||||
@typing.overload
|
||||
def __add__(self, arg0: typing.SupportsFloat) -> Scalar:
|
||||
...
|
||||
@typing.overload
|
||||
def __iadd__(self, arg0: Scalar) -> Scalar:
|
||||
...
|
||||
@typing.overload
|
||||
def __iadd__(self, arg0: typing.SupportsFloat) -> Scalar:
|
||||
...
|
||||
@typing.overload
|
||||
def __imul__(self, arg0: Scalar) -> Scalar:
|
||||
...
|
||||
@typing.overload
|
||||
def __imul__(self, arg0: typing.SupportsFloat) -> Scalar:
|
||||
...
|
||||
@typing.overload
|
||||
def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(0)"]) -> None:
|
||||
...
|
||||
@typing.overload
|
||||
def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(0)"], arg1: typing.SupportsFloat) -> None:
|
||||
...
|
||||
@typing.overload
|
||||
def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(0)"], arg1: collections.abc.Sequence[typing.SupportsFloat]) -> None:
|
||||
...
|
||||
@typing.overload
|
||||
def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(0)"], arg1: typing.SupportsFloat, arg2: typing.SupportsFloat) -> None:
|
||||
...
|
||||
@typing.overload
|
||||
def __isub__(self, arg0: Scalar) -> Scalar:
|
||||
...
|
||||
@typing.overload
|
||||
def __isub__(self, arg0: typing.SupportsFloat) -> Scalar:
|
||||
...
|
||||
def __itruediv__(self, arg0: typing.SupportsFloat) -> Scalar:
|
||||
...
|
||||
@typing.overload
|
||||
def __mul__(self, arg0: Scalar) -> Scalar:
|
||||
...
|
||||
@typing.overload
|
||||
def __mul__(self, arg0: typing.SupportsFloat) -> Scalar:
|
||||
...
|
||||
def __neg__(self) -> Scalar:
|
||||
...
|
||||
def __pos__(self) -> Scalar:
|
||||
...
|
||||
def __radd__(self, arg0: typing.SupportsFloat) -> Scalar:
|
||||
...
|
||||
def __repr__(self) -> str:
|
||||
...
|
||||
def __rmul__(self, arg0: typing.SupportsFloat) -> Scalar:
|
||||
...
|
||||
def __rsub__(self, arg0: typing.SupportsFloat) -> Scalar:
|
||||
...
|
||||
@typing.overload
|
||||
def __sub__(self, arg0: Scalar) -> Scalar:
|
||||
...
|
||||
@typing.overload
|
||||
def __sub__(self, arg0: typing.SupportsFloat) -> Scalar:
|
||||
...
|
||||
def __truediv__(self, arg0: typing.SupportsFloat) -> Scalar:
|
||||
...
|
||||
def get_axes(self) -> typing.Annotated[list[int], "FixedSize(0)"]:
|
||||
...
|
||||
def get_shape(self) -> typing.Annotated[list[int], "FixedSize(0)"]:
|
||||
...
|
||||
def get_size(self) -> int:
|
||||
...
|
||||
class Tensor3:
|
||||
@typing.overload
|
||||
def __add__(self, arg0: Tensor3) -> Tensor3:
|
||||
...
|
||||
@typing.overload
|
||||
def __add__(self, arg0: typing.SupportsFloat) -> Tensor3:
|
||||
...
|
||||
@typing.overload
|
||||
def __iadd__(self, arg0: Tensor3) -> Tensor3:
|
||||
...
|
||||
@typing.overload
|
||||
def __iadd__(self, arg0: typing.SupportsFloat) -> Tensor3:
|
||||
...
|
||||
@typing.overload
|
||||
def __imul__(self, arg0: Tensor3) -> Tensor3:
|
||||
...
|
||||
@typing.overload
|
||||
def __imul__(self, arg0: typing.SupportsFloat) -> Tensor3:
|
||||
...
|
||||
@typing.overload
|
||||
def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(3)"]) -> None:
|
||||
...
|
||||
@typing.overload
|
||||
def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(3)"], arg1: typing.SupportsFloat) -> None:
|
||||
...
|
||||
@typing.overload
|
||||
def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(3)"], arg1: collections.abc.Sequence[typing.SupportsFloat]) -> None:
|
||||
...
|
||||
@typing.overload
|
||||
def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(3)"], arg1: typing.SupportsFloat, arg2: typing.SupportsFloat) -> None:
|
||||
...
|
||||
@typing.overload
|
||||
def __isub__(self, arg0: Tensor3) -> Tensor3:
|
||||
...
|
||||
@typing.overload
|
||||
def __isub__(self, arg0: typing.SupportsFloat) -> Tensor3:
|
||||
...
|
||||
def __itruediv__(self, arg0: typing.SupportsFloat) -> Tensor3:
|
||||
...
|
||||
@typing.overload
|
||||
def __mul__(self, arg0: Tensor3) -> Tensor3:
|
||||
...
|
||||
@typing.overload
|
||||
def __mul__(self, arg0: typing.SupportsFloat) -> Tensor3:
|
||||
...
|
||||
def __neg__(self) -> Tensor3:
|
||||
...
|
||||
def __pos__(self) -> Tensor3:
|
||||
...
|
||||
def __radd__(self, arg0: typing.SupportsFloat) -> Tensor3:
|
||||
...
|
||||
def __repr__(self) -> str:
|
||||
...
|
||||
def __rmul__(self, arg0: typing.SupportsFloat) -> Tensor3:
|
||||
...
|
||||
def __rsub__(self, arg0: typing.SupportsFloat) -> Tensor3:
|
||||
...
|
||||
@typing.overload
|
||||
def __sub__(self, arg0: Tensor3) -> Tensor3:
|
||||
...
|
||||
@typing.overload
|
||||
def __sub__(self, arg0: typing.SupportsFloat) -> Tensor3:
|
||||
...
|
||||
def __truediv__(self, arg0: typing.SupportsFloat) -> Tensor3:
|
||||
...
|
||||
def get_axes(self) -> typing.Annotated[list[int], "FixedSize(3)"]:
|
||||
...
|
||||
def get_shape(self) -> typing.Annotated[list[int], "FixedSize(3)"]:
|
||||
...
|
||||
def get_size(self) -> int:
|
||||
...
|
||||
def t(self) -> Tensor3:
|
||||
...
|
||||
@typing.overload
|
||||
def transpose(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(3)"]) -> Tensor3:
|
||||
...
|
||||
@typing.overload
|
||||
def transpose(self, arg0: typing.SupportsInt, arg1: typing.SupportsInt) -> Tensor3:
|
||||
...
|
||||
class Vector:
|
||||
@typing.overload
|
||||
def __add__(self, arg0: Vector) -> Vector:
|
||||
...
|
||||
@typing.overload
|
||||
def __add__(self, arg0: typing.SupportsFloat) -> Vector:
|
||||
...
|
||||
@typing.overload
|
||||
def __iadd__(self, arg0: Vector) -> Vector:
|
||||
...
|
||||
@typing.overload
|
||||
def __iadd__(self, arg0: typing.SupportsFloat) -> Vector:
|
||||
...
|
||||
@typing.overload
|
||||
def __imul__(self, arg0: Vector) -> Vector:
|
||||
...
|
||||
@typing.overload
|
||||
def __imul__(self, arg0: typing.SupportsFloat) -> Vector:
|
||||
...
|
||||
@typing.overload
|
||||
def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(1)"]) -> None:
|
||||
...
|
||||
@typing.overload
|
||||
def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(1)"], arg1: typing.SupportsFloat) -> None:
|
||||
...
|
||||
@typing.overload
|
||||
def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(1)"], arg1: collections.abc.Sequence[typing.SupportsFloat]) -> None:
|
||||
...
|
||||
@typing.overload
|
||||
def __init__(self, arg0: typing.Annotated[collections.abc.Sequence[typing.SupportsInt], "FixedSize(1)"], arg1: typing.SupportsFloat, arg2: typing.SupportsFloat) -> None:
|
||||
...
|
||||
@typing.overload
|
||||
def __isub__(self, arg0: Vector) -> Vector:
|
||||
...
|
||||
@typing.overload
|
||||
def __isub__(self, arg0: typing.SupportsFloat) -> Vector:
|
||||
...
|
||||
def __itruediv__(self, arg0: typing.SupportsFloat) -> Vector:
|
||||
...
|
||||
@typing.overload
|
||||
def __mul__(self, arg0: Vector) -> Vector:
|
||||
...
|
||||
@typing.overload
|
||||
def __mul__(self, arg0: typing.SupportsFloat) -> Vector:
|
||||
...
|
||||
def __neg__(self) -> Vector:
|
||||
...
|
||||
def __pos__(self) -> Vector:
|
||||
...
|
||||
def __radd__(self, arg0: typing.SupportsFloat) -> Vector:
|
||||
...
|
||||
def __repr__(self) -> str:
|
||||
...
|
||||
def __rmul__(self, arg0: typing.SupportsFloat) -> Vector:
|
||||
...
|
||||
def __rsub__(self, arg0: typing.SupportsFloat) -> Vector:
|
||||
...
|
||||
@typing.overload
|
||||
def __sub__(self, arg0: Vector) -> Vector:
|
||||
...
|
||||
@typing.overload
|
||||
def __sub__(self, arg0: typing.SupportsFloat) -> Vector:
|
||||
...
|
||||
def __truediv__(self, arg0: typing.SupportsFloat) -> Vector:
|
||||
...
|
||||
def get_axes(self) -> typing.Annotated[list[int], "FixedSize(1)"]:
|
||||
...
|
||||
def get_shape(self) -> typing.Annotated[list[int], "FixedSize(1)"]:
|
||||
...
|
||||
def get_size(self) -> int:
|
||||
...
|
||||
def init(arg0: str) -> None:
|
||||
...
|
||||
CPU: PLATFORM # value = <PLATFORM.CPU: 0>
|
||||
MODE: PLATFORM # value = <PLATFORM.OPENCL: 1>
|
||||
OPENCL: PLATFORM # value = <PLATFORM.OPENCL: 1>
|
||||
Reference in New Issue
Block a user