This repository was archived by the owner on May 29, 2023. It is now read-only.

torch.lstsq specific case #36

Open · wants to merge 8 commits into base: master
9 changes: 6 additions & 3 deletions .github/workflows/main.yml
@@ -69,9 +69,12 @@ jobs:

     - name: Install OpenVINO
       run: |
-        curl ${{env.DIST_WIN}} -o openvino.exe
-        start /WAIT openvino.exe --s --a install --eula=accept --output=log.txt
-      shell: cmd
+        Invoke-WebRequest ${{env.DIST_WIN}} -OutFile openvino.exe
+        Start-Process -Wait -FilePath "openvino.exe" -ArgumentList "-s --a --silent --eula accept --output=log.txt"
+        ls "C:\Program Files (x86)"
+        ls "C:\Program Files (x86)\Intel"
+        ls "C:\Program Files (x86)\intel"
+      shell: pwsh
 
     - name: Build CPU extensions
       run: |
1 change: 1 addition & 0 deletions README.md
@@ -7,6 +7,7 @@ Repository with guides to enable some layers from PyTorch in Intel OpenVINO:
 * [nn.functional.grid_sample](https://github.com/dkurt/openvino_pytorch_layers/tree/master/examples/grid_sample)
 * [torchvision.ops.DeformConv2d](examples/deformable_conv)
 * [SparseConv](examples/sparse_conv) from [Open3D](https://github.com/isl-org/Open3D)
+* [torch.lstsq](https://pytorch.org/docs/stable/generated/torch.lstsq.html)


## OpenVINO Model Optimizer extension
32 changes: 32 additions & 0 deletions examples/lstsq/export_model.py
@@ -0,0 +1,32 @@
import numpy as np
import torch
from torch import nn
from .lstsq import LSTSQ


class Model(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, A, B):
        # torch.lstsq takes the arguments in (B, A) order
        return LSTSQ.apply(B, A)


# Solves min_X ||AX - B|| where A has a shape Mx2 and B has a shape MxN
def export(M, N):
    np.random.seed(324)
    torch.manual_seed(32)

    model = Model()
    A = torch.rand([M, 2])
    B = torch.rand([M, N])

    with torch.no_grad():
        torch.onnx.export(model, (A, B), 'model.onnx',
                          input_names=['input', 'input1'],
                          output_names=['output'],
                          operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK)

    ref = model(A, B)
    np.save('inp', A.detach().numpy())
    np.save('inp1', B.detach().numpy())
    np.save('ref', ref.detach().numpy())
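
For reference, the exporter can be exercised directly (a minimal sketch; it assumes the command is run from the repository root so that the `examples.lstsq` package resolves, as in `tests/run_tests.py`):

from examples.lstsq.export_model import export

# Writes model.onnx plus the inp.npy, inp1.npy and ref.npy test artifacts
# into the current working directory.
export(M=5, N=1000)
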
32 changes: 32 additions & 0 deletions examples/lstsq/lstsq.py
@@ -0,0 +1,32 @@
import torch

# Reference implementation: solves min_X ||AX - B|| for a two-column A
# via Gram-Schmidt QR decomposition.
def solve_squares(B, A):
    def prod(vec0, vec1):
        return (vec0 * vec1).sum()

    def normalize(vec):
        return vec / (vec * vec).sum().sqrt()

    # 1. QR decomposition of A: orthonormalize its two columns
    col0 = normalize(A[:, 0])
    col1 = normalize(A[:, 1] - prod(A[:, 1], col0) * col0)

    Q = torch.stack((col0, col1), dim=1)
    R = torch.tensor([[prod(A[:, 0], col0), prod(A[:, 1], col0)],
                      [0, prod(A[:, 1], col1)]])

    # 2. X = inverse(R) * transpose(Q) * B
    X = torch.matmul(torch.inverse(R), Q.transpose(1, 0))
    X = torch.matmul(X, B)
    return X


class LSTSQ(torch.autograd.Function):
    @staticmethod
    def symbolic(g, input, A):
        return g.op("lstsq", input, A)

    @staticmethod
    def forward(ctx, input, A):
        # torch.lstsq pads its solution to M rows; the first two rows hold X
        return torch.lstsq(input, A)[0][:2]
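
A quick numerical check of the Gram-Schmidt path against the reference solver (a sketch; it assumes a PyTorch version that still ships the deprecated torch.lstsq):

import torch
from examples.lstsq.lstsq import solve_squares

torch.manual_seed(0)
A = torch.rand(5, 2)
B = torch.rand(5, 3)

# torch.lstsq pads its solution to M rows; the first two rows hold X
ref = torch.lstsq(B, A)[0][:2]
print(torch.allclose(solve_squares(B, A), ref, atol=1e-5))  # expected: True
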
8 changes: 8 additions & 0 deletions tests/run_tests.py
@@ -32,6 +32,7 @@ def run_test(convert_ir=True, test_onnx=False, num_inputs=1, threshold=1e-5):
     ref = np.load('ref.npy')
 
     ie = IECore()
+    print(get_extensions_path())
     ie.add_extension(get_extensions_path(), 'CPU')
     ie.set_config({'CONFIG_FILE': 'user_ie_extensions/gpu_extensions.xml'}, 'GPU')

@@ -145,3 +146,10 @@ def test_deformable_conv():
     )
     run_test(num_inputs=2, threshold=2e-5)
     run_test(num_inputs=2, test_onnx=True, threshold=2e-5)
+
+
+def test_lstsq():
+    from examples.lstsq.export_model import export
+
+    export(5, 1000)
+    run_test(num_inputs=2, test_onnx=True)
16 changes: 16 additions & 0 deletions user_ie_extensions/cpu_kernel.hpp
@@ -127,4 +127,20 @@ class CalculateGridImpl : public InferenceEngine::ILayerExecImpl {
    std::string error;
};

class LSTSQImpl : public InferenceEngine::ILayerExecImpl {
public:
    explicit LSTSQImpl(const std::shared_ptr<ngraph::Node>& node);
    InferenceEngine::StatusCode getSupportedConfigurations(std::vector<InferenceEngine::LayerConfig> &conf,
                                                           InferenceEngine::ResponseDesc *resp) noexcept override;
    InferenceEngine::StatusCode init(InferenceEngine::LayerConfig &config,
                                     InferenceEngine::ResponseDesc *resp) noexcept override;
    InferenceEngine::StatusCode execute(std::vector<InferenceEngine::Blob::Ptr> &inputs,
                                        std::vector<InferenceEngine::Blob::Ptr> &outputs,
                                        InferenceEngine::ResponseDesc *resp) noexcept override;
private:
    std::vector<ngraph::Shape> inShapes;
    ngraph::Shape outShape;
    std::string error;
};

} // namespace TemplateExtension
10 changes: 10 additions & 0 deletions user_ie_extensions/extension.cpp
@@ -49,6 +49,10 @@ Extension::Extension() {
         ngraph::OutputVector ng_inputs {node.get_ng_inputs()};
         return {std::make_shared<CalculateGridOp>(ng_inputs.at(0))};
     });
+    ngraph::onnx_import::register_operator(LSTSQOp::type_info.name, 1, "", [](const ngraph::onnx_import::Node& node) -> ngraph::OutputVector {
+        ngraph::OutputVector ng_inputs {node.get_ng_inputs()};
+        return {std::make_shared<LSTSQOp>(ng_inputs.at(0), ng_inputs.at(1))};
+    });
 }
 
 Extension::~Extension() {
@@ -59,6 +63,7 @@ Extension::~Extension() {
     ngraph::onnx_import::unregister_operator(SparseConvOp::type_info.name, 1, "org.open3d");
     ngraph::onnx_import::unregister_operator(SparseConvTransposeOp::type_info.name, 1, "org.open3d");
     ngraph::onnx_import::unregister_operator(CalculateGridOp::type_info.name, 1, "org.open3d");
+    ngraph::onnx_import::unregister_operator(LSTSQOp::type_info.name, 1, "");
 }

//! [extension:GetVersion]
@@ -85,6 +90,7 @@ std::map<std::string, ngraph::OpSet> Extension::getOpSets() {
     opset.insert<SparseConvOp>();
     opset.insert<SparseConvTransposeOp>();
     opset.insert<CalculateGridOp>();
+    opset.insert<LSTSQOp>();
     opsets["extension"] = opset;
     return opsets;
 }
@@ -98,6 +104,7 @@ std::vector<std::string> Extension::getImplTypes(const std::shared_ptr<ngraph::Node>& node) {
         std::dynamic_pointer_cast<SparseConvOp>(node) ||
         std::dynamic_pointer_cast<SparseConvTransposeOp>(node) ||
         std::dynamic_pointer_cast<CalculateGridOp>(node) ||
+        std::dynamic_pointer_cast<LSTSQOp>(node) ||
         std::dynamic_pointer_cast<IFFTOp>(node) ||
         std::dynamic_pointer_cast<FFTOp>(node)) {
         return {"CPU"};
@@ -129,6 +136,9 @@ InferenceEngine::ILayerImpl::Ptr Extension::getImplementation(const std::shared_ptr<ngraph::Node>& node, const std::string& implType) {
     if (std::dynamic_pointer_cast<CalculateGridOp>(node) && implType == "CPU") {
         return std::make_shared<CalculateGridImpl>(node);
     }
+    if (std::dynamic_pointer_cast<LSTSQOp>(node) && implType == "CPU") {
+        return std::make_shared<LSTSQImpl>(node);
+    }
     return nullptr;
 }
//! [extension:getImplementation]
162 changes: 162 additions & 0 deletions user_ie_extensions/lstsq_impl.cpp
@@ -0,0 +1,162 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "cpu_kernel.hpp"
#include "op.hpp"
#include <details/ie_exception.hpp>
#include <ie_layouts.h>
#include "ie_parallel.hpp"

using namespace TemplateExtension;

//! [cpu_implementation:ctor]
LSTSQImpl::LSTSQImpl(const std::shared_ptr<ngraph::Node> &node) {
    try {
        auto castedNode = std::dynamic_pointer_cast<LSTSQOp>(node);
        if (!castedNode)
            THROW_IE_EXCEPTION << "Cannot create implementation for unknown operation!";
        if (castedNode->inputs().size() != 2 || castedNode->outputs().size() != 1)
            THROW_IE_EXCEPTION << "Cannot create implementation for operation with incorrect number of inputs or outputs!";
        if (castedNode->get_input_partial_shape(0).is_dynamic() || castedNode->get_output_partial_shape(0).is_dynamic())
            THROW_IE_EXCEPTION << "Cannot create implementation for op with dynamic shapes!";
        if (castedNode->get_input_shape(0).size() != 2 || castedNode->get_output_shape(0).size() != 2)
            THROW_IE_EXCEPTION << "Operation supports only 2d tensors for input and output.";
        if (castedNode->get_input_element_type(0) != ngraph::element::f32 || castedNode->get_output_element_type(0) != ngraph::element::f32)
            THROW_IE_EXCEPTION << "Operation supports only FP32 tensors.";
        inShapes.resize(2);
        for (size_t i = 0; i < inShapes.size(); ++i)
            inShapes[i] = castedNode->get_input_shape(i);
        outShape = castedNode->get_output_shape(0);
    } catch (InferenceEngine::details::InferenceEngineException& ex) {
        error = ex.what();
    }
}
//! [cpu_implementation:ctor]

//! [cpu_implementation:getSupportedConfigurations]
InferenceEngine::StatusCode LSTSQImpl::getSupportedConfigurations(std::vector<InferenceEngine::LayerConfig> &conf,
                                                                  InferenceEngine::ResponseDesc *resp) noexcept {
    std::vector<InferenceEngine::DataConfig> inDataConfig;
    std::vector<InferenceEngine::DataConfig> outDataConfig;
    // Allow any offset before data
    size_t offset((std::numeric_limits<size_t>::max)());

    // Input shapes
    for (const auto& shape : inShapes) {
        InferenceEngine::SizeVector order(shape.size());
        std::iota(order.begin(), order.end(), 0);

        InferenceEngine::DataConfig inpConf;
        inpConf.desc = InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32, shape, {shape, order, offset});
        inDataConfig.push_back(inpConf);
    }

    // Output shape
    InferenceEngine::SizeVector order(outShape.size());
    std::iota(order.begin(), order.end(), 0);

    InferenceEngine::DataConfig outConf;
    outConf.desc = InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32, outShape, {outShape, order, offset});
    outDataConfig.push_back(outConf);

    InferenceEngine::LayerConfig layerConfig;
    layerConfig.inConfs = inDataConfig;
    layerConfig.outConfs = outDataConfig;

    conf.push_back(layerConfig);
    return InferenceEngine::StatusCode::OK;
}
//! [cpu_implementation:getSupportedConfigurations]

//! [cpu_implementation:init]
InferenceEngine::StatusCode LSTSQImpl::init(InferenceEngine::LayerConfig &config, InferenceEngine::ResponseDesc *resp) noexcept {
    try {
        if (config.inConfs.size() != 2 || config.outConfs.size() != 1) {
            THROW_IE_EXCEPTION << "Operation cannot be initialized with incorrect number of inputs/outputs!";
        }

        if (config.inConfs[0].desc.getDims().size() != 2 || config.outConfs[0].desc.getDims().size() != 2) {
            THROW_IE_EXCEPTION << "Operation can be initialized only with 2d input/output tensors!";
        }

        if (config.outConfs[0].desc.getPrecision() != InferenceEngine::Precision::FP32 ||
            config.inConfs[0].desc.getPrecision() != InferenceEngine::Precision::FP32) {
            THROW_IE_EXCEPTION << "Operation supports only FP32 precisions!";
        }
    } catch (InferenceEngine::details::InferenceEngineException& ex) {
        // Report the message of the exception that was actually thrown
        if (resp) {
            strncpy(resp->msg, ex.what(), sizeof(resp->msg) - 1);
            resp->msg[sizeof(resp->msg) - 1] = 0;
        }
        return InferenceEngine::GENERAL_ERROR;
    }

    return InferenceEngine::OK;
}
//! [cpu_implementation:init]

//! [cpu_implementation:execute]
InferenceEngine::StatusCode LSTSQImpl::execute(std::vector<InferenceEngine::Blob::Ptr> &inputs,
                                               std::vector<InferenceEngine::Blob::Ptr> &outputs,
                                               InferenceEngine::ResponseDesc *resp) noexcept {
    const float* B = inputs[0]->cbuffer().as<float*>();
    const float* A = inputs[1]->cbuffer().as<float*>();
    float* out = outputs[0]->buffer().as<float*>();

    // Perform the QR factorization A = QR. This implementation expects A to have 2 columns.
    const size_t M = inputs[0]->getTensorDesc().getDims()[0];
    const size_t N = inputs[0]->getTensorDesc().getDims()[1];

    std::vector<float> Q(M * 2);
    std::vector<float> R(4, 0.0f);  // 2x2 upper-triangular factor, stored row-major
    float norm0 = 0.0f;
    float product = 0.0f;  // dot product of the second column of A with the first column of Q
    for (size_t i = 0; i < M; ++i) {
        float val = A[i * 2];
        product += A[i * 2 + 1] * val;
        norm0 += val * val;
    }
    norm0 = sqrtf(norm0);
    product /= norm0;
    R[1] = product;

    float norm1 = 0.0f;
    for (size_t i = 0; i < M; ++i) {
        float val = A[i * 2] / norm0;
        Q[i * 2] = val;
        R[0] += A[i * 2] * val;

        val = A[i * 2 + 1] - product * val;
        Q[i * 2 + 1] = val;
        norm1 += val * val;
        R[3] += A[i * 2 + 1] * val;
    }
    norm1 = sqrtf(norm1);
    for (size_t i = 0; i < M; ++i) {
        Q[i * 2 + 1] /= norm1;
    }
    R[3] /= norm1;

    // Invert the upper-triangular R matrix
    float scale = 1.0f / (R[0] * R[3]);
    std::vector<float> R_inv{R[3] * scale, -R[1] * scale, 0.0f, R[0] * scale};

    // Output is inverse(R) * transpose(Q) * B; fold inverse(R) into Q first
    for (size_t i = 0; i < M; ++i) {
        Q[i * 2] = R_inv[0] * Q[i * 2] + R_inv[1] * Q[i * 2 + 1];
        Q[i * 2 + 1] *= R_inv[3];
    }

    for (size_t i = 0; i < N; ++i) {
        out[i] = 0.0f;
        out[N + i] = 0.0f;
        for (size_t j = 0; j < M; ++j) {
            out[i] += Q[j * 2] * B[j * N + i];
            out[N + i] += Q[j * 2 + 1] * B[j * N + i];
        }
    }
    return InferenceEngine::OK;
}
//! [cpu_implementation:execute]
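
The kernel's math can be sanity-checked in NumPy (an illustrative sketch, not part of the PR; `lstsq_2col` is a hypothetical helper mirroring the loops above):

import numpy as np

def lstsq_2col(A, B):
    # Gram-Schmidt QR on the two columns of A
    q0 = A[:, 0] / np.linalg.norm(A[:, 0])
    v = A[:, 1] - (A[:, 1] @ q0) * q0
    q1 = v / np.linalg.norm(v)
    Q = np.stack([q0, q1], axis=1)           # M x 2, orthonormal columns
    R = np.array([[A[:, 0] @ q0, A[:, 1] @ q0],
                  [0.0, A[:, 1] @ q1]])      # 2 x 2, upper triangular
    # Least-squares solution X = inv(R) @ Q.T @ B, shape 2 x N
    return np.linalg.inv(R) @ Q.T @ B

rng = np.random.RandomState(0)
A = rng.rand(5, 2).astype(np.float32)
B = rng.rand(5, 1000).astype(np.float32)
expected = np.linalg.lstsq(A, B, rcond=None)[0]
print(np.allclose(lstsq_2col(A, B), expected, atol=1e-4))  # expected: True
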
41 changes: 41 additions & 0 deletions user_ie_extensions/lstsq_op.cpp
@@ -0,0 +1,41 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "op.hpp"

using namespace TemplateExtension;

constexpr ngraph::NodeTypeInfo LSTSQOp::type_info;

//! [op:ctor]
LSTSQOp::LSTSQOp(const ngraph::Output<ngraph::Node>& B,
                 const ngraph::Output<ngraph::Node>& A)
    : Op({B, A}) {
    constructor_validate_and_infer_types();
}
//! [op:ctor]

//! [op:validate]
void LSTSQOp::validate_and_infer_types() {
    // B is MxN; the solution X keeps N columns but has exactly 2 rows
    auto outShape = get_input_partial_shape(0);
    outShape[0] = 2;
    set_output_type(0, get_input_element_type(0), outShape);
}
//! [op:validate]

//! [op:copy]
std::shared_ptr<ngraph::Node> LSTSQOp::clone_with_new_inputs(const ngraph::OutputVector &new_args) const {
    if (new_args.size() != 2) {
        throw ngraph::ngraph_error("Incorrect number of new arguments");
    }
    return std::make_shared<LSTSQOp>(new_args.at(0), new_args.at(1));
}
//! [op:copy]

//! [op:visit_attributes]
bool LSTSQOp::visit_attributes(ngraph::AttributeVisitor &visitor) {
    return true;
}
//! [op:visit_attributes]
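
End to end, the exported model can be run through the Inference Engine with the compiled extension (a sketch following `tests/run_tests.py`; the extension library path is a placeholder that depends on the build output):

import numpy as np
from openvino.inference_engine import IECore

ie = IECore()
ie.add_extension('build/libuser_cpu_extension.so', 'CPU')  # placeholder path
net = ie.read_network('model.onnx')
exec_net = ie.load_network(net, 'CPU')

out = exec_net.infer({'input': np.load('inp.npy'), 'input1': np.load('inp1.npy')})
print(np.max(np.abs(out['output'] - np.load('ref.npy'))))  # should be ~1e-6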