mirror of
https://github.com/StepanovPlaton/NeuralNetwork.git
synced 2026-04-03 20:30:39 +04:00
First XOR NN
This commit is contained in:
594
src/main.cpp
594
src/main.cpp
@@ -1,275 +1,383 @@
|
|||||||
#define NOGPU
|
#include <CL/cl.h>
|
||||||
|
#include <chrono>
|
||||||
|
#include <fstream>
|
||||||
|
#include <iostream>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include "math/math.hpp"
|
// Чтение файла в строку
|
||||||
|
std::string readFile(const char *filename) {
|
||||||
|
std::ifstream file(filename);
|
||||||
|
if (!file.is_open()) {
|
||||||
|
throw std::runtime_error(std::string("Failed to open file: ") + filename);
|
||||||
|
}
|
||||||
|
return std::string((std::istreambuf_iterator<char>(file)),
|
||||||
|
std::istreambuf_iterator<char>());
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef NOGPU
|
// Получение ошибки OpenCL в виде строки
|
||||||
using namespace CPU;
|
const char *getErrorString(cl_int error) {
|
||||||
#else
|
switch (error) {
|
||||||
using namespace GPU;
|
case CL_SUCCESS:
|
||||||
#endif
|
return "CL_SUCCESS";
|
||||||
|
case CL_DEVICE_NOT_FOUND:
|
||||||
|
return "CL_DEVICE_NOT_FOUND";
|
||||||
|
case CL_DEVICE_NOT_AVAILABLE:
|
||||||
|
return "CL_DEVICE_NOT_AVAILABLE";
|
||||||
|
case CL_COMPILER_NOT_AVAILABLE:
|
||||||
|
return "CL_COMPILER_NOT_AVAILABLE";
|
||||||
|
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
|
||||||
|
return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
|
||||||
|
case CL_OUT_OF_RESOURCES:
|
||||||
|
return "CL_OUT_OF_RESOURCES";
|
||||||
|
case CL_OUT_OF_HOST_MEMORY:
|
||||||
|
return "CL_OUT_OF_HOST_MEMORY";
|
||||||
|
case CL_PROFILING_INFO_NOT_AVAILABLE:
|
||||||
|
return "CL_PROFILING_INFO_NOT_AVAILABLE";
|
||||||
|
case CL_MEM_COPY_OVERLAP:
|
||||||
|
return "CL_MEM_COPY_OVERLAP";
|
||||||
|
case CL_IMAGE_FORMAT_MISMATCH:
|
||||||
|
return "CL_IMAGE_FORMAT_MISMATCH";
|
||||||
|
case CL_IMAGE_FORMAT_NOT_SUPPORTED:
|
||||||
|
return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
|
||||||
|
case CL_BUILD_PROGRAM_FAILURE:
|
||||||
|
return "CL_BUILD_PROGRAM_FAILURE";
|
||||||
|
case CL_MAP_FAILURE:
|
||||||
|
return "CL_MAP_FAILURE";
|
||||||
|
case CL_INVALID_VALUE:
|
||||||
|
return "CL_INVALID_VALUE";
|
||||||
|
case CL_INVALID_DEVICE_TYPE:
|
||||||
|
return "CL_INVALID_DEVICE_TYPE";
|
||||||
|
case CL_INVALID_PLATFORM:
|
||||||
|
return "CL_INVALID_PLATFORM";
|
||||||
|
case CL_INVALID_DEVICE:
|
||||||
|
return "CL_INVALID_DEVICE";
|
||||||
|
case CL_INVALID_CONTEXT:
|
||||||
|
return "CL_INVALID_CONTEXT";
|
||||||
|
case CL_INVALID_QUEUE_PROPERTIES:
|
||||||
|
return "CL_INVALID_QUEUE_PROPERTIES";
|
||||||
|
case CL_INVALID_COMMAND_QUEUE:
|
||||||
|
return "CL_INVALID_COMMAND_QUEUE";
|
||||||
|
case CL_INVALID_HOST_PTR:
|
||||||
|
return "CL_INVALID_HOST_PTR";
|
||||||
|
case CL_INVALID_MEM_OBJECT:
|
||||||
|
return "CL_INVALID_MEM_OBJECT";
|
||||||
|
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
|
||||||
|
return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
|
||||||
|
case CL_INVALID_IMAGE_SIZE:
|
||||||
|
return "CL_INVALID_IMAGE_SIZE";
|
||||||
|
case CL_INVALID_SAMPLER:
|
||||||
|
return "CL_INVALID_SAMPLER";
|
||||||
|
case CL_INVALID_BINARY:
|
||||||
|
return "CL_INVALID_BINARY";
|
||||||
|
case CL_INVALID_BUILD_OPTIONS:
|
||||||
|
return "CL_INVALID_BUILD_OPTIONS";
|
||||||
|
case CL_INVALID_PROGRAM:
|
||||||
|
return "CL_INVALID_PROGRAM";
|
||||||
|
case CL_INVALID_PROGRAM_EXECUTABLE:
|
||||||
|
return "CL_INVALID_PROGRAM_EXECUTABLE";
|
||||||
|
case CL_INVALID_KERNEL_NAME:
|
||||||
|
return "CL_INVALID_KERNEL_NAME";
|
||||||
|
case CL_INVALID_KERNEL_DEFINITION:
|
||||||
|
return "CL_INVALID_KERNEL_DEFINITION";
|
||||||
|
case CL_INVALID_KERNEL:
|
||||||
|
return "CL_INVALID_KERNEL";
|
||||||
|
case CL_INVALID_ARG_INDEX:
|
||||||
|
return "CL_INVALID_ARG_INDEX";
|
||||||
|
case CL_INVALID_ARG_VALUE:
|
||||||
|
return "CL_INVALID_ARG_VALUE";
|
||||||
|
case CL_INVALID_ARG_SIZE:
|
||||||
|
return "CL_INVALID_ARG_SIZE";
|
||||||
|
case CL_INVALID_KERNEL_ARGS:
|
||||||
|
return "CL_INVALID_KERNEL_ARGS";
|
||||||
|
case CL_INVALID_WORK_DIMENSION:
|
||||||
|
return "CL_INVALID_WORK_DIMENSION";
|
||||||
|
case CL_INVALID_WORK_GROUP_SIZE:
|
||||||
|
return "CL_INVALID_WORK_GROUP_SIZE";
|
||||||
|
case CL_INVALID_WORK_ITEM_SIZE:
|
||||||
|
return "CL_INVALID_WORK_ITEM_SIZE";
|
||||||
|
case CL_INVALID_GLOBAL_OFFSET:
|
||||||
|
return "CL_INVALID_GLOBAL_OFFSET";
|
||||||
|
case CL_INVALID_EVENT_WAIT_LIST:
|
||||||
|
return "CL_INVALID_EVENT_WAIT_LIST";
|
||||||
|
case CL_INVALID_EVENT:
|
||||||
|
return "CL_INVALID_EVENT";
|
||||||
|
case CL_INVALID_OPERATION:
|
||||||
|
return "CL_INVALID_OPERATION";
|
||||||
|
case CL_INVALID_GL_OBJECT:
|
||||||
|
return "CL_INVALID_GL_OBJECT";
|
||||||
|
case CL_INVALID_BUFFER_SIZE:
|
||||||
|
return "CL_INVALID_BUFFER_SIZE";
|
||||||
|
case CL_INVALID_MIP_LEVEL:
|
||||||
|
return "CL_INVALID_MIP_LEVEL";
|
||||||
|
case CL_INVALID_GLOBAL_WORK_SIZE:
|
||||||
|
return "CL_INVALID_GLOBAL_WORK_SIZE";
|
||||||
|
default:
|
||||||
|
return "Unknown OpenCL error";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
class Layer {
|
// Проверка ошибок OpenCL
|
||||||
protected:
|
void checkError(cl_int err, const char *operation) {
|
||||||
int outputFeatures;
|
if (err != CL_SUCCESS) {
|
||||||
Vector bias;
|
std::cerr << "Error during " << operation << ": " << getErrorString(err)
|
||||||
Activation activation;
|
<< " (" << err << ")" << std::endl;
|
||||||
float alpha;
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public:
|
// Код ядра для матричного умножения с тайлингом
|
||||||
Layer(int outputFeatures, Activation activation, float alpha = 0.0f)
|
const char *kernelSource = R"(
|
||||||
: outputFeatures(outputFeatures), bias(outputFeatures),
|
__kernel void matmul_tiled(__global const float* A,
|
||||||
activation(activation), alpha(alpha) {}
|
__global const float* B,
|
||||||
|
__global float* C,
|
||||||
|
const int N,
|
||||||
|
const int TILE_SIZE) {
|
||||||
|
|
||||||
int getOuputFeatures() const { return outputFeatures; }
|
int row = get_global_id(1);
|
||||||
Activation getActivation() const { return activation; }
|
int col = get_global_id(0);
|
||||||
float getAlpha() const { return alpha; }
|
|
||||||
|
|
||||||
const Vector &getBias() const { return bias; }
|
__local float tileA[16][16];
|
||||||
void setBias(const Vector &b) { bias = b; }
|
__local float tileB[16][16];
|
||||||
};
|
|
||||||
|
|
||||||
class ConnectedLayer : public Layer {
|
float sum = 0.0f;
|
||||||
protected:
|
|
||||||
int inputFeatures;
|
|
||||||
Matrix weights;
|
|
||||||
|
|
||||||
public:
|
int numTiles = (N + TILE_SIZE - 1) / TILE_SIZE;
|
||||||
ConnectedLayer(int inputFeatures, const Layer &layer)
|
|
||||||
: Layer(layer), inputFeatures(inputFeatures),
|
|
||||||
weights(layer.getOuputFeatures(), inputFeatures) {}
|
|
||||||
ConnectedLayer(const Layer &a, const Layer &b)
|
|
||||||
: ConnectedLayer(a.getOuputFeatures(), b) {}
|
|
||||||
|
|
||||||
int getInputFeatures() const { return inputFeatures; }
|
for (int t = 0; t < numTiles; t++) {
|
||||||
const Matrix &getWeights() const { return weights; }
|
// Загрузка тайлов в локальную память
|
||||||
void setWeights(const Matrix &w) { weights = w; }
|
int tileRow = get_local_id(1);
|
||||||
};
|
int tileCol = get_local_id(0);
|
||||||
|
|
||||||
class LearnLayer : public ConnectedLayer {
|
int loadRow = row;
|
||||||
protected:
|
int loadCol = t * TILE_SIZE + tileCol;
|
||||||
Matrix internal;
|
if (loadRow < N && loadCol < N) {
|
||||||
Matrix outputs;
|
tileA[tileRow][tileCol] = A[loadRow * N + loadCol];
|
||||||
|
} else {
|
||||||
public:
|
tileA[tileRow][tileCol] = 0.0f;
|
||||||
LearnLayer(int inputFeatures, const Layer &layer)
|
|
||||||
: ConnectedLayer(inputFeatures, layer),
|
|
||||||
internal(layer.getOuputFeatures(), inputFeatures, false),
|
|
||||||
outputs(layer.getOuputFeatures(), inputFeatures, false) {}
|
|
||||||
LearnLayer(const Layer &a, const Layer &b)
|
|
||||||
: LearnLayer(a.getOuputFeatures(), b) {}
|
|
||||||
|
|
||||||
const Matrix &getInternal() const { return internal; }
|
|
||||||
const Matrix &getOutputs() const { return outputs; }
|
|
||||||
void setInternal(const Matrix &i) { internal = i; }
|
|
||||||
void setOutputs(const Matrix &o) { outputs = o; }
|
|
||||||
};
|
|
||||||
|
|
||||||
class NeuralNetwork {
|
|
||||||
private:
|
|
||||||
std::vector<ConnectedLayer> layers;
|
|
||||||
|
|
||||||
public:
|
|
||||||
NeuralNetwork(int inputFeatures, std::vector<Layer> l) {
|
|
||||||
// employ back
|
|
||||||
layers.push_back(ConnectedLayer(inputFeatures, l[0]));
|
|
||||||
for (size_t i = 1; i < l.size(); i++)
|
|
||||||
layers.push_back(ConnectedLayer(l[i - 1].getOuputFeatures(), l[i]));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Matrix predict(Matrix inputs) {
|
loadRow = t * TILE_SIZE + tileRow;
|
||||||
MatrixMath mm;
|
loadCol = col;
|
||||||
std::vector<Matrix> steps;
|
if (loadRow < N && loadCol < N) {
|
||||||
steps.push_back(inputs);
|
tileB[tileRow][tileCol] = B[loadRow * N + loadCol];
|
||||||
for (size_t i = 0; i < layers.size(); i++)
|
} else {
|
||||||
steps.push_back(mm.dot(steps[steps.size() - 1], layers[i].getWeights(),
|
tileB[tileRow][tileCol] = 0.0f;
|
||||||
false, true, &layers[i].getBias(),
|
|
||||||
layers[i].getActivation(), layers[i].getAlpha()));
|
|
||||||
mm.await();
|
|
||||||
return steps[steps.size() - 1];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const ConnectedLayer &getLayer(int i) const { return layers[i]; }
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
};
|
|
||||||
|
|
||||||
class LearnNerualNetrowk {
|
// Вычисление частичной суммы
|
||||||
private:
|
for (int k = 0; k < TILE_SIZE; k++) {
|
||||||
std::vector<LearnLayer> layers;
|
sum += tileA[tileRow][k] * tileB[k][tileCol];
|
||||||
|
|
||||||
public:
|
|
||||||
LearnNerualNetrowk(int inputFeatures, std::vector<Layer> l) {
|
|
||||||
// employ back
|
|
||||||
layers.push_back(LearnLayer(inputFeatures, l[0]));
|
|
||||||
for (size_t i = 1; i < l.size(); i++)
|
|
||||||
layers.push_back(LearnLayer(l[i - 1], l[i]));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Matrix learn(Matrix inputs, Matrix target, float speed = 1.0f) {
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
MatrixMath mm;
|
|
||||||
VectorMath vm;
|
|
||||||
for (size_t i = 0; i < layers.size(); i++) {
|
|
||||||
layers[i].setInternal(mm.dot(layers[i].getWeights(),
|
|
||||||
i == 0 ? inputs : layers[i - 1].getOutputs(),
|
|
||||||
false, false, &layers[i].getBias()));
|
|
||||||
layers[i].setOutputs(mm.activate(layers[i].getInternal(),
|
|
||||||
layers[i].getActivation(),
|
|
||||||
layers[i].getAlpha()));
|
|
||||||
}
|
|
||||||
mm.await();
|
|
||||||
|
|
||||||
std::vector<float> io = inputs.toVector();
|
|
||||||
std::cout << "I: ";
|
|
||||||
for (size_t i = 0; i < io.size(); ++i)
|
|
||||||
printf("%4.2f ", io[i]);
|
|
||||||
|
|
||||||
std::vector<float> ni = layers[layers.size() - 1].getInternal().toVector();
|
|
||||||
std::cout << "| NNI: ";
|
|
||||||
for (size_t i = 0; i < ni.size(); ++i)
|
|
||||||
printf("%4.2f ", ni[i]);
|
|
||||||
|
|
||||||
std::vector<float> no = layers[layers.size() - 1].getOutputs().toVector();
|
|
||||||
std::cout << "| NNO: ";
|
|
||||||
for (size_t i = 0; i < no.size(); ++i)
|
|
||||||
printf("%4.2f ", no[i]);
|
|
||||||
|
|
||||||
std::vector<float> to = target.toVector();
|
|
||||||
std::cout << "| T: ";
|
|
||||||
for (size_t i = 0; i < to.size(); ++i)
|
|
||||||
printf("%4.2f ", to[i]);
|
|
||||||
|
|
||||||
Matrix mse =
|
|
||||||
mm.loss(layers[layers.size() - 1].getOutputs(), target, Loss::MSE);
|
|
||||||
|
|
||||||
std::vector<float> lo = mse.toVector();
|
|
||||||
std::cout << "| L: ";
|
|
||||||
for (size_t i = 0; i < lo.size(); ++i)
|
|
||||||
printf("%5.3f ", lo[i]);
|
|
||||||
std::cout << std::endl;
|
|
||||||
|
|
||||||
Matrix dAnl =
|
|
||||||
mm.d_loss(layers[layers.size() - 1].getOutputs(), target, Loss::MSE);
|
|
||||||
|
|
||||||
for (int i = layers.size() - 1; i >= 0; --i) {
|
|
||||||
printf("=== Layer %d ===\n", i + 1);
|
|
||||||
printf("dAnl: ");
|
|
||||||
dAnl.print();
|
|
||||||
|
|
||||||
Matrix dZl = mm.mult(dAnl, mm.d_activate(layers[i].getInternal()));
|
|
||||||
printf("dZl: ");
|
|
||||||
dZl.print();
|
|
||||||
|
|
||||||
Matrix dWl =
|
|
||||||
mm.mult(mm.dot(dZl, i == 0 ? inputs : layers[i - 1].getOutputs(),
|
|
||||||
false, true),
|
|
||||||
1.0f / (float)inputs.getRows());
|
|
||||||
printf("dWl: ");
|
|
||||||
dWl.print();
|
|
||||||
|
|
||||||
Vector dbl = mm.axis_sum(mm.mult(dZl, 1.0f / (float)inputs.getRows()));
|
|
||||||
printf("dbl: ");
|
|
||||||
dbl.print();
|
|
||||||
|
|
||||||
dAnl = mm.dot(layers[i].getWeights(), dZl, true); // false true?!
|
|
||||||
|
|
||||||
mm.await();
|
|
||||||
|
|
||||||
layers[i].setWeights(mm.add(layers[i].getWeights(), dWl, -speed));
|
|
||||||
printf("Weights %d: ", i + 1);
|
|
||||||
layers[i].getWeights().print();
|
|
||||||
|
|
||||||
layers[i].setBias(
|
|
||||||
vm.add(layers[i].getBias(), dbl, -speed / (float)inputs.getRows()));
|
|
||||||
printf("Bias %d: ", i + 1);
|
|
||||||
layers[i].getBias().print();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return mse;
|
if (row < N && col < N) {
|
||||||
|
C[row * N + col] = sum;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
const LearnLayer &getLayer(int i) const { return layers[i]; }
|
)";
|
||||||
|
|
||||||
// delete
|
|
||||||
LearnLayer &getLayer(int i) { return layers[i]; }
|
|
||||||
};
|
|
||||||
|
|
||||||
#ifndef NOGPU
|
|
||||||
OpenCL openCL;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
// LearnNerualNetrowk nn(
|
cl_int err;
|
||||||
// 3, {Layer(3, Activation::SIGMOID), Layer(3, Activation::SIGMOID)});
|
|
||||||
//
|
|
||||||
// Matrix weights1(3, 3,
|
|
||||||
// {0.88f, 0.39f, 0.9f, 0.37f, 0.14f, 0.41f, 0.96f, 0.5f,
|
|
||||||
// 0.6f});
|
|
||||||
// Matrix weights2(
|
|
||||||
// 3, 3, {0.29f, 0.57f, 0.36f, 0.73f, 0.53f, 0.68f, 0.01f, 0.02f, 0.58f});
|
|
||||||
//
|
|
||||||
// Vector bias1(std::vector<float>{0.23f, 0.89f, 0.08f});
|
|
||||||
// Vector bias2(std::vector<float>{0.78f, 0.83f, 0.8f});
|
|
||||||
//
|
|
||||||
// nn.getLayer(0).setWeights(weights1);
|
|
||||||
// nn.getLayer(0).setBias(bias1);
|
|
||||||
//
|
|
||||||
// nn.getLayer(1).setWeights(weights2);
|
|
||||||
// nn.getLayer(1).setBias(bias2);
|
|
||||||
//
|
|
||||||
// std::cout << std::endl;
|
|
||||||
//
|
|
||||||
// Matrix input(3, 1, {0.03f, 0.72f, 0.49f});
|
|
||||||
// Matrix target(3, 1, {0.93f, 0.74f, 0.17f});
|
|
||||||
//
|
|
||||||
// // for (int i = 0; i < 1000; i++)
|
|
||||||
// nn.learn(input, target, 0.01f);
|
|
||||||
|
|
||||||
LearnNerualNetrowk nn(
|
// Параметры матрицы
|
||||||
2, {Layer(3, Activation::SIGMOID), Layer(1, Activation::SIGMOID)});
|
const int N = 1024; // Размер матрицы (уменьшено для демонстрации)
|
||||||
|
const int TILE_SIZE = 16;
|
||||||
|
const size_t matrixSize = N * N * sizeof(float);
|
||||||
|
|
||||||
Matrix input(2, 4);
|
std::cout << "Matrix size: " << N << "x" << N << " (" << N * N << " elements)"
|
||||||
Matrix target(1, 4);
|
<< std::endl;
|
||||||
|
std::cout << "Total data: " << matrixSize / (1024 * 1024) << " MB per matrix"
|
||||||
|
<< std::endl;
|
||||||
|
|
||||||
float min = 100.0f;
|
// Инициализация данных
|
||||||
for (int batch = 0; batch < 4; batch++) {
|
std::vector<float> A(N * N);
|
||||||
for (int i = 0; i < 4; i++) {
|
std::vector<float> B(N * N);
|
||||||
int v1 = (i / 2) % 2;
|
std::vector<float> C(N * N, 0.0f);
|
||||||
int v2 = i % 2;
|
|
||||||
|
|
||||||
input(0, i) = static_cast<float>(v1);
|
// Заполнение матриц тестовыми данными
|
||||||
input(1, i) = static_cast<float>(v2);
|
for (int i = 0; i < N * N; i++) {
|
||||||
target(0, i) = static_cast<float>(v1 ^ v2);
|
A[i] = static_cast<float>(i % 100) * 0.1f;
|
||||||
}
|
B[i] = static_cast<float>((i + 1) % 100) * 0.1f;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < 1000; i++) {
|
// 1. Получение платформы
|
||||||
printf("%4d | ", i + 1);
|
cl_platform_id platform;
|
||||||
Matrix mse = nn.learn(input, target, 0.0001f * std::pow(0.99f, i));
|
err = clGetPlatformIDs(1, &platform, NULL);
|
||||||
std::vector<float> lv = mse.toVector();
|
checkError(err, "clGetPlatformIDs");
|
||||||
float loss = 0.0f;
|
|
||||||
for (size_t i = 0; i < lv.size(); ++i)
|
|
||||||
loss += lv[i];
|
|
||||||
if (loss < min)
|
|
||||||
min = loss;
|
|
||||||
}
|
|
||||||
std::cout << min << std::endl;
|
|
||||||
|
|
||||||
// LearnNerualNetrowk nn(
|
// 2. Получение устройства (GPU)
|
||||||
// 2, {Layer(3, Activation::SIGMOID), Layer(1, Activation::SIGMOID)});
|
cl_device_id device;
|
||||||
// float min = 100.0f;
|
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
|
||||||
// for (int i = 0; i < 4 * 10000; i++) {
|
if (err != CL_SUCCESS) {
|
||||||
// int v1 = (i / 2) % 2;
|
std::cout << "GPU not found, trying CPU..." << std::endl;
|
||||||
// int v2 = i % 2;
|
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
|
||||||
//
|
checkError(err, "clGetDeviceIDs");
|
||||||
// Matrix input(2, 1, {static_cast<float>(v1), static_cast<float>(v2)});
|
std::cout << "Using CPU" << std::endl;
|
||||||
// Matrix target(1, 1, static_cast<float>(v1 ^ v2));
|
} else {
|
||||||
//
|
std::cout << "Using GPU" << std::endl;
|
||||||
// printf("%5d | ", i + 1);
|
}
|
||||||
// Matrix mse = nn.learn(input, target, 0.0001f * std::pow(0.95f, i));
|
|
||||||
// if (i % 4 == 3)
|
// 3. Создание контекста
|
||||||
// std::cout << std::endl;
|
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
|
||||||
// if (mse[0] < min)
|
checkError(err, "clCreateContext");
|
||||||
// min = mse[0];
|
|
||||||
// }
|
// 4. Создание очереди команд
|
||||||
// std::cout << min << std::endl;
|
cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err);
|
||||||
|
checkError(err, "clCreateCommandQueue");
|
||||||
|
|
||||||
|
// 5. Создание буферов
|
||||||
|
cl_mem bufferA =
|
||||||
|
clCreateBuffer(context, CL_MEM_READ_ONLY, matrixSize, NULL, &err);
|
||||||
|
checkError(err, "clCreateBuffer A");
|
||||||
|
|
||||||
|
cl_mem bufferB =
|
||||||
|
clCreateBuffer(context, CL_MEM_READ_ONLY, matrixSize, NULL, &err);
|
||||||
|
checkError(err, "clCreateBuffer B");
|
||||||
|
|
||||||
|
cl_mem bufferC =
|
||||||
|
clCreateBuffer(context, CL_MEM_WRITE_ONLY, matrixSize, NULL, &err);
|
||||||
|
checkError(err, "clCreateBuffer C");
|
||||||
|
|
||||||
|
// 6. Копирование данных на устройство
|
||||||
|
auto copy_start = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
err = clEnqueueWriteBuffer(queue, bufferA, CL_TRUE, 0, matrixSize, A.data(),
|
||||||
|
0, NULL, NULL);
|
||||||
|
checkError(err, "clEnqueueWriteBuffer A");
|
||||||
|
|
||||||
|
err = clEnqueueWriteBuffer(queue, bufferB, CL_TRUE, 0, matrixSize, B.data(),
|
||||||
|
0, NULL, NULL);
|
||||||
|
checkError(err, "clEnqueueWriteBuffer B");
|
||||||
|
|
||||||
|
auto copy_end = std::chrono::high_resolution_clock::now();
|
||||||
|
auto copy_time = std::chrono::duration_cast<std::chrono::microseconds>(
|
||||||
|
copy_end - copy_start);
|
||||||
|
|
||||||
|
// 7. Создание программы
|
||||||
|
auto program_start = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
cl_program program =
|
||||||
|
clCreateProgramWithSource(context, 1, &kernelSource, NULL, &err);
|
||||||
|
checkError(err, "clCreateProgramWithSource");
|
||||||
|
|
||||||
|
// Компиляция программы
|
||||||
|
err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
// Получение логов компиляции
|
||||||
|
size_t log_size;
|
||||||
|
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL,
|
||||||
|
&log_size);
|
||||||
|
std::vector<char> log(log_size);
|
||||||
|
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size,
|
||||||
|
log.data(), NULL);
|
||||||
|
std::cerr << "Build failed:\n" << log.data() << std::endl;
|
||||||
|
checkError(err, "clBuildProgram");
|
||||||
|
}
|
||||||
|
|
||||||
|
auto program_end = std::chrono::high_resolution_clock::now();
|
||||||
|
auto program_time = std::chrono::duration_cast<std::chrono::microseconds>(
|
||||||
|
program_end - program_start);
|
||||||
|
|
||||||
|
// 8. Создание ядра
|
||||||
|
auto kernel_start = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
cl_kernel kernel = clCreateKernel(program, "matmul_tiled", &err);
|
||||||
|
checkError(err, "clCreateKernel");
|
||||||
|
|
||||||
|
auto kernel_end = std::chrono::high_resolution_clock::now();
|
||||||
|
auto kernel_time = std::chrono::duration_cast<std::chrono::microseconds>(
|
||||||
|
kernel_end - kernel_start);
|
||||||
|
|
||||||
|
// 9. Установка аргументов ядра
|
||||||
|
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferA);
|
||||||
|
checkError(err, "clSetKernelArg 0");
|
||||||
|
|
||||||
|
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferB);
|
||||||
|
checkError(err, "clSetKernelArg 1");
|
||||||
|
|
||||||
|
err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufferC);
|
||||||
|
checkError(err, "clSetKernelArg 2");
|
||||||
|
|
||||||
|
err = clSetKernelArg(kernel, 3, sizeof(int), &N);
|
||||||
|
checkError(err, "clSetKernelArg 3");
|
||||||
|
|
||||||
|
err = clSetKernelArg(kernel, 4, sizeof(int), &TILE_SIZE);
|
||||||
|
checkError(err, "clSetKernelArg 4");
|
||||||
|
|
||||||
|
// 10. Запуск матричного умножения
|
||||||
|
size_t global[2] = {N, N};
|
||||||
|
size_t local[2] = {TILE_SIZE, TILE_SIZE};
|
||||||
|
|
||||||
|
auto matmul_start = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL,
|
||||||
|
NULL);
|
||||||
|
checkError(err, "clEnqueueNDRangeKernel");
|
||||||
|
|
||||||
|
clFinish(queue);
|
||||||
|
|
||||||
|
auto matmul_end = std::chrono::high_resolution_clock::now();
|
||||||
|
auto matmul_time = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||||
|
matmul_end - matmul_start);
|
||||||
|
|
||||||
|
// 11. Чтение результатов
|
||||||
|
auto read_start = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
err = clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, matrixSize, C.data(), 0,
|
||||||
|
NULL, NULL);
|
||||||
|
checkError(err, "clEnqueueReadBuffer");
|
||||||
|
|
||||||
|
auto read_end = std::chrono::high_resolution_clock::now();
|
||||||
|
auto read_time = std::chrono::duration_cast<std::chrono::microseconds>(
|
||||||
|
read_end - read_start);
|
||||||
|
|
||||||
|
// Вывод результатов измерений
|
||||||
|
std::cout << "\n=== TIMING RESULTS ===" << std::endl;
|
||||||
|
std::cout << "Data copy to device: " << copy_time.count() << " ns"
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << "Program creation: " << program_time.count() << " ns"
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << "Kernel creation: " << kernel_time.count() << " ns" << std::endl;
|
||||||
|
std::cout << "Matrix multiplication: " << matmul_time.count() << " ms"
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << "Data read from device: " << read_time.count() << " ns"
|
||||||
|
<< std::endl;
|
||||||
|
|
||||||
|
// Расчет отношения времени выполнения к времени создания ядра
|
||||||
|
if (kernel_time.count() > 0) {
|
||||||
|
long long ratio = (matmul_time.count() * 1000) /
|
||||||
|
kernel_time.count(); // переводим ms в ns для сравнения
|
||||||
|
std::cout << "Kernel creation vs execution ratio: 1 : " << ratio
|
||||||
|
<< std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Расчет производительности
|
||||||
|
long long total_flops = 2LL * N * N * N; // 2*N^3 FLOP
|
||||||
|
double gflops = (double)total_flops / (matmul_time.count() * 1e6); // GFLOP/s
|
||||||
|
std::cout << "Performance: " << gflops << " GFLOP/s" << std::endl;
|
||||||
|
|
||||||
|
// Проверка результата (простая валидация)
|
||||||
|
float checksum = 0.0f;
|
||||||
|
for (int i = 0; i < N * N; i++) {
|
||||||
|
checksum += C[i];
|
||||||
|
}
|
||||||
|
std::cout << "Result checksum: " << checksum << std::endl;
|
||||||
|
|
||||||
|
// 12. Освобождение ресурсов
|
||||||
|
clReleaseMemObject(bufferA);
|
||||||
|
clReleaseMemObject(bufferB);
|
||||||
|
clReleaseMemObject(bufferC);
|
||||||
|
clReleaseKernel(kernel);
|
||||||
|
clReleaseProgram(program);
|
||||||
|
clReleaseCommandQueue(queue);
|
||||||
|
clReleaseContext(context);
|
||||||
|
|
||||||
|
std::cout << "\nDone!" << std::endl;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -116,13 +116,13 @@ private:
|
|||||||
Tensor2 mse(const Tensor2 &a, const Tensor2 &b) {
|
Tensor2 mse(const Tensor2 &a, const Tensor2 &b) {
|
||||||
Tensor2 result(a.getShape(), false);
|
Tensor2 result(a.getShape(), false);
|
||||||
for (size_t i = 0; i < result.getSize(); ++i)
|
for (size_t i = 0; i < result.getSize(); ++i)
|
||||||
result[i] = (a[i] - b[i]) * (a[i] - b[i]) / (float)a.getCols();
|
result[i] = (a[i] - b[i]) * (a[i] - b[i]) / (float)a.getSize();
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
Tensor2 d_mse(const Tensor2 &a, const Tensor2 &b) {
|
Tensor2 d_mse(const Tensor2 &a, const Tensor2 &b) {
|
||||||
Tensor2 result(a.getShape(), false);
|
Tensor2 result(a.getShape(), false);
|
||||||
for (size_t i = 0; i < result.getSize(); ++i)
|
for (size_t i = 0; i < result.getSize(); ++i)
|
||||||
result[i] = 2 * (a[i] - b[i]) / (float)a.getCols();
|
result[i] = 2 * (a[i] - b[i]) / (float)a.getSize();
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -7,6 +7,8 @@
|
|||||||
|
|
||||||
#include "../tensor.hpp"
|
#include "../tensor.hpp"
|
||||||
|
|
||||||
|
#include "../../../utils/output.h"
|
||||||
|
|
||||||
extern std::mt19937 gen;
|
extern std::mt19937 gen;
|
||||||
|
|
||||||
namespace CPU {
|
namespace CPU {
|
||||||
@@ -52,17 +54,17 @@ public:
|
|||||||
const float &operator[](int index) const { return data[index]; }
|
const float &operator[](int index) const { return data[index]; }
|
||||||
|
|
||||||
virtual void print() const {
|
virtual void print() const {
|
||||||
std::cout << "Tensor(" << getDim() << "): [";
|
debugi("Tensor(%d): [", getDim());
|
||||||
for (size_t i = 0; i < data.size(); ++i) {
|
for (size_t i = 0; i < data.size(); ++i) {
|
||||||
std::cout << data[i];
|
debugi("%4.3f", data[i]);
|
||||||
if (i > 15) {
|
if (i > 15) {
|
||||||
std::cout << "... ";
|
debugi("... ");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (i != data.size() - 1)
|
if (i != data.size() - 1)
|
||||||
std::cout << ", ";
|
debugi(" ");
|
||||||
}
|
}
|
||||||
std::cout << "]" << std::endl;
|
debug("]");
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<float> toVector() const { return data; }
|
std::vector<float> toVector() const { return data; }
|
||||||
@@ -132,9 +134,7 @@ public:
|
|||||||
Tensor0(Tensor0 &&other) = default;
|
Tensor0(Tensor0 &&other) = default;
|
||||||
Tensor0 &operator=(Tensor0 &&other) = default;
|
Tensor0 &operator=(Tensor0 &&other) = default;
|
||||||
|
|
||||||
void print() const override {
|
void print() const override { debug("Scalar: %4.3f", data[0]); }
|
||||||
std::cout << "Scalar: " << data[0] << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
float &value() { return data[0]; }
|
float &value() { return data[0]; }
|
||||||
const float &value() const { return data[0]; }
|
const float &value() const { return data[0]; }
|
||||||
@@ -161,13 +161,13 @@ public:
|
|||||||
Tensor1 &operator=(Tensor1 &&other) = default;
|
Tensor1 &operator=(Tensor1 &&other) = default;
|
||||||
|
|
||||||
void print() const override {
|
void print() const override {
|
||||||
std::cout << "Vector(" << shape[0] << "): [";
|
debugi("Vector(%d): [", shape[0]);
|
||||||
for (size_t i = 0; i < data.size(); ++i) {
|
for (size_t i = 0; i < data.size(); ++i) {
|
||||||
std::cout << data[i];
|
debugi("%4.3f", data[i]);
|
||||||
if (i != data.size() - 1)
|
if (i != data.size() - 1)
|
||||||
std::cout << ", ";
|
debugi(" ");
|
||||||
}
|
}
|
||||||
std::cout << "]" << std::endl;
|
debug("]");
|
||||||
}
|
}
|
||||||
|
|
||||||
float &operator()(int i) { return data[i]; }
|
float &operator()(int i) { return data[i]; }
|
||||||
@@ -209,12 +209,11 @@ public:
|
|||||||
Tensor2 &operator=(Tensor2 &&other) = default;
|
Tensor2 &operator=(Tensor2 &&other) = default;
|
||||||
|
|
||||||
void print() const override {
|
void print() const override {
|
||||||
std::cout << "Matrix(" << shape[0] << "x" << shape[1] << "):\n";
|
debug("Matrix(%dx%d):", shape[0], shape[1]);
|
||||||
for (int i = 0; i < shape[0]; ++i) {
|
for (int i = 0; i < shape[0]; ++i) {
|
||||||
for (int j = 0; j < shape[1]; ++j) {
|
for (int j = 0; j < shape[1]; ++j)
|
||||||
std::cout << data[i * shape[1] + j] << " ";
|
debugi("%4.3f ", data[i * shape[1] + j]);
|
||||||
}
|
debugi("\n");
|
||||||
std::cout << std::endl;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -265,17 +264,15 @@ public:
|
|||||||
Tensor3 &operator=(Tensor3 &&other) = default;
|
Tensor3 &operator=(Tensor3 &&other) = default;
|
||||||
|
|
||||||
void print() const override {
|
void print() const override {
|
||||||
std::cout << "Tensor3(" << shape[0] << "x" << shape[1] << "x" << shape[2]
|
debugi("Tensor3(%dx%dx%d):", shape[0], shape[1], shape[2]);
|
||||||
<< "):\n";
|
|
||||||
for (int i = 0; i < shape[0]; ++i) {
|
for (int i = 0; i < shape[0]; ++i) {
|
||||||
std::cout << "Slice " << i << ":\n";
|
debug("Slice %d", i);
|
||||||
for (int j = 0; j < shape[1]; ++j) {
|
for (int j = 0; j < shape[1]; ++j) {
|
||||||
for (int k = 0; k < shape[2]; ++k) {
|
for (int k = 0; k < shape[2]; ++k)
|
||||||
std::cout << data[i * shape[1] * shape[2] + j * shape[2] + k] << " ";
|
debugi("%4.3f ", data[i * shape[1] * shape[2] + j * shape[2] + k]);
|
||||||
|
debugi("\n");
|
||||||
}
|
}
|
||||||
std::cout << std::endl;
|
debugi("\n");
|
||||||
}
|
|
||||||
std::cout << std::endl;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
61
src/utils/output.h
Normal file
61
src/utils/output.h
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
|
||||||
|
// Color and style definitions (ANSI escape sequences)
|
||||||
|
#define RESET "\033[0m"
|
||||||
|
#define BOLD "\033[1m"
|
||||||
|
#define ITALIC "\033[3m"
|
||||||
|
#define UNDERLINE "\033[4m"
|
||||||
|
|
||||||
|
// Text (foreground) colors
|
||||||
|
#define BLACK "\033[30m"
|
||||||
|
#define RED "\033[31m"
|
||||||
|
#define GREEN "\033[32m"
|
||||||
|
#define YELLOW "\033[33m"
|
||||||
|
#define BLUE "\033[34m"
|
||||||
|
#define MAGENTA "\033[35m"
|
||||||
|
#define CYAN "\033[36m"
|
||||||
|
#define WHITE "\033[37m"
|
||||||
|
|
||||||
|
// Background colors
|
||||||
|
#define BG_BLACK "\033[40m"
|
||||||
|
#define BG_RED "\033[41m"
|
||||||
|
#define BG_GREEN "\033[42m"
|
||||||
|
#define BG_YELLOW "\033[43m"
|
||||||
|
#define BG_BLUE "\033[44m"
|
||||||
|
#define BG_MAGENTA "\033[45m"
|
||||||
|
#define BG_CYAN "\033[46m"
|
||||||
|
#define BG_WHITE "\033[47m"
|
||||||
|
|
||||||
|
// printff(style, ...): styled printf.
// Writes the `style` string first (callers pass escape-code strings such as
// the color/style macros of this header), then the printf-style message made
// of the remaining arguments, and finally the reset sequence "\033[0m"
// followed by a newline. Unlike the debug* macros it is always active.
#define printff(style, ...)                                                    \
  do {                                                                         \
    printf("%s", style);                                                       \
    printf(__VA_ARGS__);                                                       \
    printf("\033[0m\n");                                                       \
  } while (0)
|
||||||
|
|
||||||
|
// Debug output helpers.
// When DEBUG_MODE is defined they forward to printf; otherwise every macro
// expands to nothing, so their arguments are not evaluated at all.
#ifdef DEBUG_MODE
// debug(fmt, ...): prints the formatted message followed by a newline.
#define debug(fmt, ...)                                                        \
  do {                                                                         \
    printf(fmt, ##__VA_ARGS__);                                                \
    printf("\n");                                                              \
  } while (0)
// debugi(fmt, ...): prints the formatted message without a trailing newline.
#define debugi(fmt, ...)                                                       \
  do {                                                                         \
    printf(fmt, ##__VA_ARGS__);                                                \
  } while (0)
// debugf(format_codes, fmt, ...): prints the style codes, a "[file:line] "
// prefix, the formatted message, a newline, then the reset sequence and
// another newline.
// NOTE(review): the two consecutive newlines leave a blank line after every
// message; kept as in the original - confirm whether that is intended.
#define debugf(format_codes, fmt, ...)                                         \
  do {                                                                         \
    printf("%s", format_codes);                                                \
    printf("[%s:%d] ", __FILE__, __LINE__);                                    \
    printf(fmt, ##__VA_ARGS__);                                                \
    printf("\n");                                                              \
    printf("\033[0m\n");                                                       \
  } while (0)
// loge(fmt, ...): error message in underlined red, routed through debugf.
// The original expanded to `logff`, which is not defined in this header.
#define loge(fmt, ...) debugf(RED UNDERLINE, fmt, ##__VA_ARGS__)
#else
#define debug(fmt, ...)
#define debugi(fmt, ...)
#define debugf(format_codes, fmt, ...)
// Empty stub so that code calling loge() also compiles in non-debug builds.
#define loge(fmt, ...)
#endif
|
||||||
Reference in New Issue
Block a user