diff --git a/src/main b/src/main
index cc9cc83..45b2c56 100755
Binary files a/src/main and b/src/main differ
diff --git a/src/main.cpp b/src/main.cpp
index 1bea81d..2f4187f 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -25,6 +25,7 @@ public:
   float getAlpha() const { return alpha; }
 
   const Vector &getBias() const { return bias; }
+  void setBias(const Vector &b) { this->bias = b; } // overwrite stored bias
 };
 
 class ConnectedLayer : public Layer {
@@ -41,6 +42,7 @@ public:
 
   int getInputFeatures() const { return inputFeatures; }
   const Matrix &getWeights() const { return weights; }
+  void setWeights(const Matrix &w) { this->weights = w; } // overwrite weights
 };
 
 class LearnLayer : public ConnectedLayer {
@@ -81,7 +83,7 @@ public:
     steps.push_back(inputs);
     for (size_t i = 0; i < layers.size(); i++)
       steps.push_back(mm.dot(steps[steps.size() - 1], layers[i].getWeights(),
-                             true, &layers[i].getBias(),
+                             false, true, &layers[i].getBias(),
                              layers[i].getActivation(), layers[i].getAlpha()));
     mm.await();
     return steps[steps.size() - 1];
@@ -102,11 +104,12 @@ public:
       layers.push_back(LearnLayer(l[i - 1].getOuputFeatures(), l[i]));
   }
 
-  Matrix learn(Matrix inputs, Matrix target) {
+  Matrix learn(Matrix inputs, Matrix target, float speed = 1.0f) {
     MatrixMath mm;
+    VectorMath vm;
     for (size_t i = 0; i < layers.size(); i++) {
       layers[i].setInternal(mm.dot(i == 0 ? inputs : layers[i - 1].getOutputs(),
-                                   layers[i].getWeights(), true,
+                                   layers[i].getWeights(), false, true,
                                    &layers[i].getBias()));
       layers[i].setOutputs(mm.activate(layers[i].getInternal(),
                                        layers[i].getActivation(),
@@ -134,10 +137,22 @@ public:
       printf("%5.3f ", lo[i]);
     std::cout << std::endl;
 
-    // Matrix dA2 =
-    //     mm.d_loss(layers[layers.size() - 1].getOutputs(), target, Loss::MSE);
-    // Matrix  = mm.dot(dA2,
-    // mm.d_activate(layers[layers.size()-1].getOutputs()));
+    Matrix dAnl =
+        mm.d_loss(layers[layers.size() - 1].getOutputs(), target, Loss::MSE);
+    const float invBatch = 1.0f / static_cast<float>(inputs.getRows());
+    for (int i = layers.size() - 1; i >= 0; --i) {
+      Matrix dZl = mm.mult(dAnl, mm.d_activate(layers[i].getInternal(),
+                                               layers[i].getActivation(),
+                                               layers[i].getAlpha()));
+      Matrix dWl = mm.mult(
+          mm.dot(dZl, i == 0 ? inputs : layers[i - 1].getOutputs(), true),
+          invBatch);
+      Vector dbl = mm.axis_sum(mm.mult(dZl, invBatch)); // batch mean, once only
+      dAnl = mm.dot(dZl, layers[i].getWeights(), false, false); // fwd: x * W^T
+      mm.await();
+      layers[i].setWeights(mm.add(layers[i].getWeights(), dWl, -speed));
+      layers[i].setBias(vm.add(layers[i].getBias(), dbl, -speed));
+    }
 
     return mse;
   }
@@ -151,19 +166,39 @@ OpenCL openCL;
 
 int main() {
   LearnNerualNetrowk nn(
-      2, {Layer(3, Activation::SIGMOID), Layer(3, Activation::SIGMOID)});
-  std::cout << "NN created!" << std::endl;
+      2, {Layer(2, Activation::TANH), Layer(1, Activation::SIGMOID)});
+  std::cout << std::endl;
 
-  for (int i = 0; i < 4; i++) {
+  // Matrix input(4, 2);
+  // Matrix target(4, 1);
+  //
+  // for (int batch = 0; batch < 4; batch++) {
+  //   for (int i = 0; i < 4; i++) {
+  //     int v1 = (i / 2) % 2;
+  //     int v2 = i % 2;
+  //
+  //     input(i, 0) = static_cast<float>(v1);
+  //     input(i, 1) = static_cast<float>(v2);
+  //     target(i, 0) = static_cast<float>(v1 ^ v2);
+  //   }
+  // }
+  //
+  // for (int i = 0; i < 10; i++) {
+  //   printf("%4d | ", i + 1);
+  //   Matrix mse = nn.learn(input, target, 0.1f * std::pow(0.99, i));
+  // }
+
+  for (int i = 0; i < 4 * 1000; i++) {
     int v1 = (i / 2) % 2;
     int v2 = i % 2;
 
     Matrix input(1, 2, {static_cast<float>(v1), static_cast<float>(v2)});
-    Matrix target(1, 3,
-                  {static_cast<float>(v1 ^ v2), static_cast<float>(v1 & v2),
-                   static_cast<float>(v1 | v2)});
+    Matrix target(1, 1, static_cast<float>(v1 ^ v2));
 
-    nn.learn(input, target);
+    printf("%5d | ", i + 1);
+    Matrix mse = nn.learn(input, target, 0.00003f);
+    if (i % 4 == 3)
+      std::cout << std::endl;
   }
 
   return 0;
diff --git a/src/math/tensor/cpu/math.hpp b/src/math/tensor/cpu/math.hpp
index efdef18..cb34ed6 100644
--- a/src/math/tensor/cpu/math.hpp
+++ b/src/math/tensor/cpu/math.hpp
@@ -35,20 +35,24 @@ protected:
       throw std::invalid_argument("Unknown activation type");
     }
   }
-  float d_activateX(float f, Activation type, float alpha = 0.01f) {
+  float d_activateX(float x, Activation type, float alpha = 0.01f) {
     switch (type) {
     case Activation::LINEAR:
       return 1.0f;
-    case Activation::SIGMOID:
-      return f * (1.0f - f);
-    case Activation::TANH:
-      return 1.0f - f * f;
+    case Activation::SIGMOID: {
+      float sigmoid = 1.0f / (1.0f + std::exp(-x));
+      return sigmoid * (1.0f - sigmoid);
+    }
+    case Activation::TANH: {
+      float tanh_x = std::tanh(x);
+      return 1.0f - tanh_x * tanh_x;
+    }
     case Activation::RELU:
-      return (f > 0.0f) ? 1.0f : 0.0f;
+      return (x > 0.0f) ? 1.0f : 0.0f;
     case Activation::LEAKY_RELU:
-      return (f > 0.0f) ? 1.0f : alpha;
+      return (x > 0.0f) ? 1.0f : alpha;
     case Activation::ELU:
-      return (f > 0.0f) ? 1.0f : f + alpha;
+      return (x > 0.0f) ? 1.0f : alpha * std::exp(x);
     default:
       throw std::invalid_argument("Unknown activation type");
     }
@@ -72,6 +76,13 @@ public:
     return result;
   }
 
+  T mult(const T &a, const T &b) override { // element-wise a[i] * b[i]
+    this->validateSameDimensions(a, b);
+    T product(a.getShape(), false);
+    for (size_t idx = 0, count = a.getSize(); idx < count; ++idx)
+      product[idx] = a[idx] * b[idx];
+    return product;
+  }
   T mult(const T &t, float x) override {
     T result(t.getShape(), false);
     for (size_t i = 0; i < t.getSize(); ++i)
@@ -116,19 +127,21 @@ private:
   }
 
 public:
-  Tensor2 dot(const Tensor2 &a, const Tensor2 &b, bool transpose = false,
-              const Vector *bias = nullptr,
+  Tensor2 dot(const Tensor2 &a, const Tensor2 &b, bool transpose_a = false,
+              bool transpose_b = false, const Vector *bias = nullptr,
               Activation type = Activation::LINEAR,
               float alpha = 0.01f) override {
-    validateMultDimensions(a, b, transpose);
+    validateMultDimensions(a, b, transpose_a, transpose_b);
     if (bias != nullptr)
-      validateBiasDimensions(b, *bias, transpose);
-    Tensor2 result(a.getRows(), transpose ? b.getRows() : b.getCols(), 0.0f);
+      validateBiasDimensions(b, *bias, transpose_b);
+    Tensor2 result(transpose_a ? a.getCols() : a.getRows(), // rows of op(a)
+                   transpose_b ? b.getRows() : b.getCols(), 0.0f);
     for (int i = 0; i < result.getRows(); ++i) {
       for (int j = 0; j < result.getCols(); ++j) {
         float sum = 0.0f;
-        for (int k = 0; k < a.getCols(); ++k)
-          sum += a(i, k) * (transpose ? b(j, k) : b(k, j));
+        for (int k = 0; k < (transpose_a ? a.getRows() : a.getCols()); ++k)
+          sum += (transpose_a ? a(k, i) : a(i, k)) * // k spans the shared dim
+                 (transpose_b ? b(j, k) : b(k, j));
         result(i, j) =
             activateX(sum + (bias == nullptr ? 0.0f : (*bias)(j)), type, alpha);
       }
@@ -154,6 +167,17 @@ public:
       throw std::invalid_argument("Unknown loss type");
     }
   }
+
+  Tensor1 axis_sum(const Tensor2 &m) override { // per-column sum over rows
+    Tensor1 totals(m.getCols(), 0.0f);
+    for (int col = 0; col < m.getCols(); ++col) {
+      float acc = 0.0f;
+      for (int row = 0; row < m.getRows(); ++row)
+        acc += m(row, col);
+      totals(col) = acc;
+    }
+    return totals;
+  }
 };
 
 class Tensor3Math : public TensorMath<Tensor3>, public ITensor3Math<Tensor3> {};
diff --git a/src/math/tensor/cpu/tensor.hpp b/src/math/tensor/cpu/tensor.hpp
index ab67455..59ae653 100644
--- a/src/math/tensor/cpu/tensor.hpp
+++ b/src/math/tensor/cpu/tensor.hpp
@@ -172,8 +172,6 @@ public:
 
   float &operator()(int i) { return data[i]; }
   const float &operator()(int i) const { return data[i]; }
-
-  int getSize() const override { return shape[0]; }
 };
 
 class Tensor2 : public ITensor2, public Tensor {
diff --git a/src/math/tensor/math.hpp b/src/math/tensor/math.hpp
index 12db29d..b0c435f 100644
--- a/src/math/tensor/math.hpp
+++ b/src/math/tensor/math.hpp
@@ -34,6 +34,7 @@ public:
   virtual T activate(const T &m, Activation type, float alpha) = 0;
   virtual T d_activate(const T &m, Activation type, float alpha) = 0;
 
+  virtual T mult(const T &a, const T &b) = 0;
   virtual T mult(const T &m, float x) = 0;
   virtual T add(const T &a, const T &b, float x) = 0;
   virtual T add(const T &m, float x) = 0;
@@ -47,24 +48,26 @@ template <ITensor1Type T> class ITensor1Math {};
 
 template <ITensor2Type M, ITensor1Type V> class ITensor2Math {
 public:
-  virtual M dot(const M &a, const M &b, bool transpose, const V *bias,
-                Activation type, float alpha) = 0;
+  virtual M dot(const M &a, const M &b, bool transpose_a, bool transpose_b,
+                const V *bias, Activation type, float alpha) = 0;
 
   virtual M loss(const M &a, const M &b, Loss type) = 0;
   virtual M d_loss(const M &a, const M &b, Loss type) = 0;
 
-  void validateMultDimensions(const M &a, const M &b, bool transpose) const {
-    if ((!transpose && a.getCols() != b.getRows()) ||
-        (transpose && a.getCols() != b.getCols())) {
+  virtual V axis_sum(const M &m) = 0; // reduces a matrix to a vector
+
+  void validateMultDimensions(const M &a, const M &b, bool transpose_a,
+                              bool transpose_b) const {
+    const int inner_a = transpose_a ? a.getRows() : a.getCols(); // cols of op(a)
+    const int inner_b = transpose_b ? b.getCols() : b.getRows(); // rows of op(b)
+    if (inner_a != inner_b)
       throw std::invalid_argument(
           "Invalid matrix dimensions for multiplication");
-    }
   };
   void validateBiasDimensions(const M &a, const V &b, bool transpose) const {
     if ((!transpose && a.getCols() != b.getSize()) ||
-        (transpose && a.getRows() != b.getSize())) {
+        (transpose && a.getRows() != b.getSize())) // bias length != output cols
       throw std::invalid_argument("Invalid matrix bias");
-    }
   };
 };
 
diff --git a/src/math/tensor/tensor.hpp b/src/math/tensor/tensor.hpp
index b590dcd..4b1ff81 100644
--- a/src/math/tensor/tensor.hpp
+++ b/src/math/tensor/tensor.hpp
@@ -50,10 +50,7 @@ public:
 
 class ITensor0 {};
 
-class ITensor1 {
-public:
-  virtual int getSize() const = 0;
-};
+class ITensor1 {};
 
 class ITensor2 {
 public: