diff --git a/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst b/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst
index df47427a3df..a2555cd0658 100644
--- a/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst
+++ b/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst
@@ -26,6 +26,7 @@ REGISTER_KERNEL(FullyConnected)
 REGISTER_KERNEL(Gather)
 REGISTER_KERNEL(Gelu)
 REGISTER_KERNEL(Greater)
+REGISTER_KERNEL(GRU)
 REGISTER_KERNEL(GreaterEqual)
 REGISTER_KERNEL(HardSwish)
 REGISTER_KERNEL(If)
diff --git a/compiler/luci-interpreter/pal/linux/PALGRU.h b/compiler/luci-interpreter/pal/linux/PALGRU.h
new file mode 100644
index 00000000000..cacb6a6d2fb
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALGRU.h
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_GRU_H
+#define LUCI_INTERPRETER_PAL_GRU_H
+
+#include <cmath>
+#include "PALreference_ops.h"
+namespace luci_interpreter_pal
+{
+
+// tflite's Logistic does not provide inplace Logistic kernel
+void Logistic(const int flat_size, const float *input_data, float *output_data)
+{
+  const float cutoff_upper = 16.619047164916992188f;
+  const float cutoff_lower = -9.f;
+
+  // Rational for using approximation in reference kernel.
+  // 0. This approximation gives enough precision for float.
+  // 1.
+  // This works around an issue on an embedded chipset where exp() does not
+  //    return correctly as expected - exp(x) should return inf when overflown
+  //    not 1.701417   IEEE 754 defines representation for inf.
+  // 2. This will speed up calculation and is matching the behavior in the
+  //    optimized kernels. (check the definition of scalar_logistic_op)
+
+  for (int i = 0; i < flat_size; i++)
+  {
+    float val = input_data[i];
+    float result;
+    if (val > cutoff_upper)
+    {
+      result = 1.0f;
+    }
+    else if (val < cutoff_lower)
+    {
+      result = std::exp(val);
+    }
+    else
+    {
+      result = 1.f / (1.f + std::exp(-val));
+    }
+    output_data[i] = result;
+  }
+}
+
+void calculateGRU(const float *input_data, const float *weight_input_data,
+                  const float *weight_hidden_data, const float *bias_input_data,
+                  const float *bias_hidden_data, float *output_data,
+                  const tflite::RuntimeShape &input_shape, const tflite::RuntimeShape &output_shape,
+                  const tflite::RuntimeShape &weight_input_shape,
+                  const tflite::RuntimeShape &weight_hidden_shape, float *output_input_data,
+                  float *output_hidden_data, const tflite::RuntimeShape &output_shape_fc,
+                  float *intermediate_buffer)
+{
+  tflite::FullyConnectedParams op_params{};
+  // As FC nodes doesn't have any activations inside GRU, let's use just numeric limits
+  op_params.float_activation_min = std::numeric_limits<float>::lowest();
+  op_params.float_activation_max = std::numeric_limits<float>::max();
+
+  // FC Input
+  tflite::RuntimeShape bias_input_shape{weight_input_shape.Dims(0)};
+  tflite::reference_ops::FullyConnected(op_params, output_shape, output_data, weight_input_shape,
+                                        weight_input_data, bias_input_shape, bias_input_data,
+                                        output_shape_fc, output_input_data);
+
+  // FC Hidden
+  tflite::RuntimeShape bias_hidden_shape{weight_hidden_shape.Dims(0)};
+  // Note: input for this FC node will be saved without intermediate buffer
+  tflite::reference_ops::FullyConnected(op_params, input_shape, input_data, weight_hidden_shape,
+                                        weight_hidden_data, bias_hidden_shape,
bias_hidden_data, + output_shape_fc, output_hidden_data); + + int num_elements = output_shape_fc.Dims(1) / 3; + + float *second_hidden_part = output_hidden_data + num_elements; + float *second_input_part = output_input_data + num_elements; + + float *third_hidden_part = second_hidden_part + num_elements; + float *third_input_part = second_input_part + num_elements; + + // Calculate Left part + for (int i = 0; i < num_elements; ++i) + { + output_input_data[i] += output_hidden_data[i]; + } + + Logistic(num_elements, output_input_data, output_input_data); + + // Calculate most left mul + float *most_left_part_final = output_input_data; + float *first_part = output_input_data; + for (int i = 0; i < num_elements; ++i) + { + output_data[i] *= most_left_part_final[i]; + first_part[i] = 1.0f - first_part[i]; + } + + // Calc second part + for (int i = 0; i < num_elements; ++i) + { + second_hidden_part[i] += second_input_part[i]; + } + + Logistic(num_elements, second_hidden_part, second_hidden_part); + + for (int i = 0; i < num_elements; ++i) + { + second_hidden_part[i] *= third_input_part[i]; + second_hidden_part[i] += third_hidden_part[i]; + } + + for (int i = 0; i < num_elements; ++i) + { + if (second_hidden_part[i] > 19) + { + second_hidden_part[i] = 1; + } + else if (second_hidden_part[i] < -19) + { + second_hidden_part[i] = -1; + } + else + { + second_hidden_part[i] = std::tanh(second_hidden_part[i]); + } + } + + for (int i = 0; i < num_elements; ++i) + { + second_hidden_part[i] *= first_part[i]; + output_data[i] += second_hidden_part[i]; + } +} + +void GRU(const float *input_data, const float *weight_input_data, const float *weight_hidden_data, + const float *bias_input_data, const float *bias_hidden_data, + const float *hidden_state_data, float *output_data, float *output_input_data, + float *output_hidden_data, const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &output_shape, const tflite::RuntimeShape &weight_input_shape, + const 
tflite::RuntimeShape &weight_hidden_shape, const size_t intermediate_buffer_size,
+         float *intermediate_buffer)
+{
+  const int32_t time = input_shape.Dims(0);
+
+  tflite::RuntimeShape output_shape_fc(2);
+  output_shape_fc.SetDim(0, 1);
+  output_shape_fc.SetDim(1, weight_hidden_shape.Dims(0));
+
+  std::memcpy(output_data, hidden_state_data, output_shape.FlatSize() * sizeof(float));
+
+  for (int i = 0; i < time; ++i)
+  {
+    calculateGRU(input_data, weight_input_data, weight_hidden_data, bias_input_data,
+                 bias_hidden_data, output_data, input_shape, output_shape, weight_input_shape,
+                 weight_hidden_shape, output_input_data, output_hidden_data, output_shape_fc,
+                 intermediate_buffer);
+    input_data += input_shape.Dims(2);
+  }
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_GRU_H
diff --git a/compiler/luci-interpreter/src/core/KernelParams.h b/compiler/luci-interpreter/src/core/KernelParams.h
index cc6a83e08c7..d7ea8629370 100644
--- a/compiler/luci-interpreter/src/core/KernelParams.h
+++ b/compiler/luci-interpreter/src/core/KernelParams.h
@@ -111,6 +111,13 @@ struct GeluParams
   bool approximate;
 };
 
+struct GRUParams
+{
+  Activation fused_act_function = Activation::NONE;
+  bool return_sequences = false;
+  bool time_major = false;
+};
+
 struct InstanceNormParams
 {
   float epsilon;
diff --git a/compiler/luci-interpreter/src/kernels/GRU.cpp b/compiler/luci-interpreter/src/kernels/GRU.cpp
new file mode 100644
index 00000000000..505b9c97471
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/GRU.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/GRU.h"
+
+#include "kernels/Utils.h"
+
+#include "PALFullyConnected.h"
+#include "PALGRU.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+GRU::GRU(const Tensor *input, const Tensor *hidden_hidden, const Tensor *hidden_hidden_bias,
+         const Tensor *hidden_input, const Tensor *hidden_input_bias, const Tensor *state,
+         Tensor *output, const GRUParams &params)
+  : KernelWithParams<GRUParams>(
+      {input, hidden_hidden, hidden_hidden_bias, hidden_input, hidden_input_bias, state}, {output},
+      params)
+{
+}
+
+void GRU::configure()
+{
+  auto hidden_hidden_shape = getTensorShape(hidden_hidden());
+  auto hidden_input_shape = getTensorShape(hidden_input());
+  LUCI_INTERPRETER_CHECK(hidden_hidden_shape.Dims(0) == hidden_input_shape.Dims(0));
+
+  const int32_t div_factor = 3;
+
+  auto output_shape = getTensorShape(output());
+  auto state_shape = getTensorShape(state());
+
+  output()->resize(state()->shape());
+
+  LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+}
+
+void GRU::execute() const
+{
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+      evalFloat();
+      break;
+    default:
+      throw std::runtime_error("luci-GRU Unsupported data type.");
+  }
+}
+
+void GRU::evalFloat() const
+{
+  uint8_t *output_hidden_data;
+  uint8_t *output_input_data;
+
+  // allocate output datas above
+  output_hidden_data = new uint8_t[getTensorShape(hidden_hidden()).FlatSize() * sizeof(float)];
+  output_input_data = new uint8_t[getTensorShape(hidden_input()).FlatSize() * sizeof(float)];
+
+  luci_interpreter_pal::GRU(
+    getTensorData<float>(input()), getTensorData<float>(hidden_input()),
+    getTensorData<float>(hidden_hidden()), getTensorData<float>(hidden_input_bias()),
+    getTensorData<float>(hidden_hidden_bias()), getTensorData<float>(state()),
+    getTensorData<float>(output()), reinterpret_cast<float *>(output_input_data),
+    reinterpret_cast<float *>(output_hidden_data), getTensorShape(input()),
+    getTensorShape(output()), getTensorShape(hidden_input()), getTensorShape(hidden_hidden()), 0,
+    nullptr);
+
+  delete[] output_hidden_data;
+  delete[] output_input_data;
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/GRU.h b/compiler/luci-interpreter/src/kernels/GRU.h
new file mode 100644
index 00000000000..ac5ec085b26
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/GRU.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_GRU_H
+#define LUCI_INTERPRETER_KERNELS_GRU_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class GRU : public KernelWithParams<GRUParams>
+{
+public:
+  GRU(const Tensor *input, const Tensor *hidden_hidden, const Tensor *hidden_hidden_bias,
+      const Tensor *hidden_input, const Tensor *hidden_input_bias, const Tensor *state,
+      Tensor *output, const GRUParams &params);
+
+  const Tensor *input() const { return _inputs[0]; }
+  const Tensor *hidden_hidden() const { return _inputs[1]; }
+  const Tensor *hidden_hidden_bias() const { return _inputs[2]; }
+  const Tensor *hidden_input() const { return _inputs[3]; }
+  const Tensor *hidden_input_bias() const { return _inputs[4]; }
+  const Tensor *state() const { return _inputs[5]; }
+  Tensor *output() const { return _outputs[0]; }
+
+  void configure() override;
+  void execute() const override;
+
+private:
+  void evalFloat() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_GRU_H
diff --git a/compiler/luci-interpreter/src/kernels/GRU.test.cpp b/compiler/luci-interpreter/src/kernels/GRU.test.cpp
new file mode 100644
index 00000000000..586286b9189
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/GRU.test.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/GRU.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class GRUTest : public ::testing::Test
+{
+protected:
+  void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+  std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(GRUTest, floatTest)
+{
+  Shape input_shape{2, 1, 2};
+  std::vector<float> input_data{0.98045033, 0.39546537, 0.5209594, 0.72873044};
+
+  Shape ref_output_shape{1, 1, 2};
+  std::vector<float> ref_output_data{0.22777566, -0.1976251};
+
+  Shape hidden_hidden_shape{6, 2};
+  std::vector<float> hidden_hidden_data{
+    0.8073279857635498,   -0.5218740105628967, 0.1166749969124794,  0.33110499382019043,
+    0.2770330011844635,   0.23767800629138947, 0.1293960064649582,  0.17175200581550598,
+    -0.15584999322891235, 0.8137810230255127,  -0.2667199969291687, -0.23028500378131866};
+  Shape hidden_input_shape{6, 2};
+  std::vector<float> hidden_input_data{
+    -0.1928129941225052, -0.4582270085811615, -0.17884500324726105, -0.27543601393699646,
+    0.704787015914917,   0.1874309927225113,  -0.28071099519729614, -0.40605801343917847,
+    -0.4156219959259033, 0.6752780079841614,  0.4272859990596771,   -0.24114100635051727};
+
+  Shape state_shape{1, 2};
+  std::vector<float> state_data{0.0, 0.0};
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Tensor hidden_hidden_tensor = makeInputTensor<DataType::FLOAT32>(
+    hidden_hidden_shape, hidden_hidden_data, _memory_manager.get());
+
+  Tensor hidden_input_tensor = makeInputTensor<DataType::FLOAT32>(
+    hidden_input_shape, hidden_input_data, _memory_manager.get());
+
+  Tensor state_tensor =
+    makeInputTensor<DataType::FLOAT32>(state_shape, state_data, _memory_manager.get());
+
+  GRUParams params{};
+
+  GRU kernel(&input_tensor, &hidden_hidden_tensor, nullptr, &hidden_input_tensor, nullptr,
+             &state_tensor, &output_tensor, params);
+  kernel.configure();
+  _memory_manager->allocate_memory(output_tensor);
+  kernel.execute();
+
+  EXPECT_THAT(extractTensorData<float>(output_tensor),
+              ::testing::ElementsAreArray(ref_output_data));
+}
+
+TEST_F(GRUTest, Unmatched_io_type_NEG)
+{
+  Shape input_shape{2, 1, 2};
+  std::vector<float> input_data{0.98045033, 0.39546537, 0.5209594, 0.72873044};
+
+  Shape ref_output_shape{1, 1, 2};
+  std::vector<float> ref_output_data{0.22777566, -0.1976251};
+
+  Shape hidden_hidden_shape{6, 2};
+  std::vector<float> hidden_hidden_data{
+    0.8073279857635498,   -0.5218740105628967, 0.1166749969124794,  0.33110499382019043,
+    0.2770330011844635,   0.23767800629138947, 0.1293960064649582,  0.17175200581550598,
+    -0.15584999322891235, 0.8137810230255127,  -0.2667199969291687, -0.23028500378131866};
+  Shape hidden_input_shape{6, 2};
+  std::vector<float> hidden_input_data{
+    -0.1928129941225052, -0.4582270085811615, -0.17884500324726105, -0.27543601393699646,
+    0.704787015914917,   0.1874309927225113,  -0.28071099519729614, -0.40605801343917847,
+    -0.4156219959259033, 0.6752780079841614,  0.4272859990596771,   -0.24114100635051727};
+
+  Shape state_shape{1, 2};
+  std::vector<float> state_data{0.0, 0.0};
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::U32);
+
+  Tensor hidden_hidden_tensor = makeInputTensor<DataType::FLOAT32>(
+    hidden_hidden_shape, hidden_hidden_data, _memory_manager.get());
+
+  Tensor hidden_input_tensor = makeInputTensor<DataType::FLOAT32>(
+    hidden_input_shape, hidden_input_data, _memory_manager.get());
+
+  Tensor state_tensor =
+    makeInputTensor<DataType::FLOAT32>(state_shape, state_data, _memory_manager.get());
+
+  GRUParams params{};
+
+  GRU kernel(&input_tensor, &hidden_hidden_tensor, nullptr, &hidden_input_tensor, nullptr,
+             &state_tensor, &output_tensor, params);
+
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(GRUTest, Unmatched_weight_size_NEG)
+{
+  Shape input_shape{2, 1, 2};
+  std::vector<float> input_data{0.98045033, 0.39546537, 0.5209594, 0.72873044};
+
+  Shape ref_output_shape{1, 1, 2};
+  std::vector<float> ref_output_data{0.22777566, -0.1976251};
+
+  Shape hidden_hidden_shape{1, 2};
+  std::vector<float> hidden_hidden_data{-0.2667199969291687, -0.23028500378131866};
+  Shape hidden_input_shape{6, 2};
+  std::vector<float> hidden_input_data{
+    -0.1928129941225052, -0.4582270085811615, -0.17884500324726105, -0.27543601393699646,
+    0.704787015914917,   0.1874309927225113,  -0.28071099519729614, -0.40605801343917847,
+    -0.4156219959259033, 0.6752780079841614,  0.4272859990596771,   -0.24114100635051727};
+
+  Shape state_shape{1, 2};
+  std::vector<float> state_data{0.0, 0.0};
+
+  Tensor input_tensor =
+    makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get());
+  Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+  Tensor hidden_hidden_tensor = makeInputTensor<DataType::FLOAT32>(
+    hidden_hidden_shape, hidden_hidden_data, _memory_manager.get());
+
+  Tensor hidden_input_tensor = makeInputTensor<DataType::FLOAT32>(
+    hidden_input_shape, hidden_input_data, _memory_manager.get());
+
+  Tensor state_tensor =
+    makeInputTensor<DataType::FLOAT32>(state_shape, state_data, _memory_manager.get());
+
+  GRUParams params{};
+
+  GRU kernel(&input_tensor, &hidden_hidden_tensor, nullptr, &hidden_input_tensor, nullptr,
+             &state_tensor, &output_tensor, params);
+
+  EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/GRU.cpp b/compiler/luci-interpreter/src/loader/nodes/GRU.cpp
new file mode 100644
index 00000000000..f6e5ddbc000
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/GRU.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/GRU.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleGRU(const luci::CircleNode *circle_node,
+                                               KernelBuilderHelper &helper)
+{
+  const auto *node = loco::must_cast<const luci::CircleGRU *>(circle_node);
+  assert(node->arity() == 6);
+
+  const Tensor *input = helper.getInputTensor(node->input());
+  const Tensor *hidden_hidden = helper.getInputTensor(node->hidden_hidden());
+  const Tensor *hidden_hidden_bias = helper.getInputTensor(node->hidden_hidden_bias());
+  const Tensor *hidden_input = helper.getInputTensor(node->hidden_input());
+  const Tensor *hidden_input_bias = helper.getInputTensor(node->hidden_input_bias());
+  const Tensor *state = helper.getInputTensor(node->state());
+
+  Tensor *output = helper.getOutputTensor(node);
+
+  GRUParams params{};
+  params.fused_act_function = node->fusedActivationFunction();
+  params.return_sequences = node->returnSequences();
+  params.time_major = node->timeMajor();
+
+  return std::make_unique<kernels::GRU>(input, hidden_hidden, hidden_hidden_bias, hidden_input,
+                                        hidden_input_bias, state, output, params);
+}
+
+} // namespace luci_interpreter