diff --git a/runtime/onert/api/nnfw/include/nnfw_experimental.h b/runtime/onert/api/nnfw/include/nnfw_experimental.h
index 27dc0c6f7dd..7ac10f95958 100644
--- a/runtime/onert/api/nnfw/include/nnfw_experimental.h
+++ b/runtime/onert/api/nnfw/include/nnfw_experimental.h
@@ -558,6 +558,58 @@ NNFW_STATUS nnfw_set_codegen_model_path(nnfw_session *session, const char *path)
  */
 NNFW_STATUS nnfw_codegen(nnfw_session *session, const char *target, NNFW_CODEGEN_PREF pref);
 
+/**
+ * @brief Set the MinMax records count for auto compilation mode with the on-device compiler
+ *
+ * This function sets the number of MinMax records to collect for quantization in auto
+ * compilation mode. To enable auto compilation mode, use {@link nnfw_run_with_auto_compilation}.
+ *
+ * @param[in] session nnfw_session
+ * @param[in] minmax_records_count MinMax records count
+ * @return @c NNFW_STATUS_NO_ERROR if successful, otherwise return @c NNFW_STATUS_ERROR
+ */
+NNFW_STATUS nnfw_set_odc_param_minmax_records_count(nnfw_session *session,
+                                                    int minmax_records_count);
+
+/**
+ * @brief Delete the MinMax file of the on-device compiler
+ *
+ * @param[in] session nnfw_session
+ * @return @c NNFW_STATUS_NO_ERROR if successful, otherwise return @c NNFW_STATUS_ERROR
+ */
+NNFW_STATUS nnfw_odc_delete_minmax_file(nnfw_session *session);
+
+/**
+ * @brief Run inference with auto compilation
+ *
+ * This function runs inference with automatic compilation and internally replaces
+ * the original model with a quantized or compiled model.
+ * During inference, MinMax statistics are collected; once enough records have been
+ * gathered, quantization is performed.
+ * If quantization succeeds, code generation for the target backend is attempted;
+ * otherwise the original float model keeps running.
+ * If compilation succeeds, the compiled model is run; otherwise the quantized model is run.
+ * The on-device compiler (ODC) provides the quantization and compilation functionality.
+ * This function should be called after the model is loaded by {@link nnfw_load_model_from_file},
+ * the session is prepared for inference by {@link nnfw_prepare}, and the input and output
+ * buffers are set by {@link nnfw_set_input} and {@link nnfw_set_output}.
+ *
+ * Additionally, the following parameters should be set up:
+ * 1. Quantization type {@link nnfw_set_quantization_type}
+ * 2. Quantized model path {@link nnfw_set_quantized_model_path}
+ * 3. MinMax records threshold for quantization {@link nnfw_set_odc_param_minmax_records_count}
+ * 4. Compiled model path {@link nnfw_set_codegen_model_path}
+ *
+ * The file with MinMax statistics can be removed by {@link nnfw_odc_delete_minmax_file}.
+ *
+ * @param[in] session nnfw_session
+ * @param[in] target Target backend to generate code for, as in {@link nnfw_codegen}
+ * @param[in] pref @c NNFW_CODEGEN_PREF
+ * @return @c NNFW_STATUS_NO_ERROR if successful, otherwise return @c NNFW_STATUS_ERROR
+ */
+NNFW_STATUS nnfw_run_with_auto_compilation(nnfw_session *session, const char *target,
+                                           NNFW_CODEGEN_PREF pref);
+
 //////////////////////////////////////////////
 // APIs for configuration
 //////////////////////////////////////////////
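For reference, a minimal, hypothetical usage sketch of the new API (not part of this patch). The paths, the record count, the quantization type value, and the "xxx-gen" target name are illustrative placeholders; per the implementation below, the target name must end with "-gen". Input/output buffer setup is elided.

  // Hypothetical usage sketch; identifiers marked as placeholders are not defined by this patch.
  #include "nnfw.h"
  #include "nnfw_experimental.h"

  void run_auto_compile_example(void)
  {
    nnfw_session *session = NULL;
    nnfw_create_session(&session);
    nnfw_load_model_from_file(session, "model.circle"); // placeholder path
    nnfw_prepare(session);

    // Parameters that must be set before nnfw_run_with_auto_compilation
    nnfw_set_quantization_type(session, NNFW_QUANTIZE_TYPE_U8_ASYM); // assumed quantization type
    nnfw_set_quantized_model_path(session, "model.q.circle");        // placeholder path
    nnfw_set_odc_param_minmax_records_count(session, 10);            // placeholder threshold
    nnfw_set_codegen_model_path(session, "model.bin");               // placeholder path

    // ... set input/output buffers via nnfw_set_input / nnfw_set_output ...

    // "xxx-gen" is a placeholder backend; real target names end with "-gen"
    nnfw_run_with_auto_compilation(session, "xxx-gen", NNFW_CODEGEN_PREF_DEFAULT);

    nnfw_close_session(session);
  }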
diff --git a/runtime/onert/api/nnfw/src/nnfw_api.cc b/runtime/onert/api/nnfw/src/nnfw_api.cc
index 7720ed27584..ea217c50e9f 100644
--- a/runtime/onert/api/nnfw/src/nnfw_api.cc
+++ b/runtime/onert/api/nnfw/src/nnfw_api.cc
@@ -508,6 +508,25 @@ NNFW_STATUS nnfw_codegen(nnfw_session *session, const char *target, NNFW_CODEGEN
   return session->codegen(target, pref);
 }
 
+NNFW_STATUS nnfw_set_odc_param_minmax_records_count(nnfw_session *session, int minmax_records_count)
+{
+  NNFW_RETURN_ERROR_IF_NULL(session);
+  return session->set_odc_param_minmax_records_count(minmax_records_count);
+}
+
+NNFW_STATUS nnfw_odc_delete_minmax_file(nnfw_session *session)
+{
+  NNFW_RETURN_ERROR_IF_NULL(session);
+  return session->delete_odc_minmax_file();
+}
+
+NNFW_STATUS nnfw_run_with_auto_compilation(nnfw_session *session, const char *target,
+                                           NNFW_CODEGEN_PREF pref)
+{
+  NNFW_RETURN_ERROR_IF_NULL(session);
+  return session->run_with_auto_compilation(target, pref);
+}
+
 // Configuration
 
 NNFW_STATUS nnfw_set_prepare_config(nnfw_session *session, const NNFW_PREPARE_CONFIG key,
diff --git a/runtime/onert/api/nnfw/src/nnfw_api_internal.cc b/runtime/onert/api/nnfw/src/nnfw_api_internal.cc
index 084579490fb..3a579c2ab59 100644
--- a/runtime/onert/api/nnfw/src/nnfw_api_internal.cc
+++ b/runtime/onert/api/nnfw/src/nnfw_api_internal.cc
@@ -2028,3 +2028,300 @@ NNFW_STATUS nnfw_session::reset_execute_config()
 
   return NNFW_STATUS_NO_ERROR;
 }
+
+NNFW_STATUS nnfw_session::set_odc_param_minmax_records_count(int minmax_records_count)
+{
+  if (isStateInitialized() || isStateRunning())
+  {
+    std::cerr << "invalid state" << std::endl;
+    return NNFW_STATUS_INVALID_STATE;
+  }
+
+  if (_quant_manager->setMinMaxRecordsThreshold(minmax_records_count))
+    return NNFW_STATUS_NO_ERROR;
+  else
+    return NNFW_STATUS_ERROR;
+}
+
+NNFW_STATUS nnfw_session::delete_odc_minmax_file()
+{
+  if (isStateRunning())
+  {
+    std::cerr << "invalid state" << std::endl;
+    return NNFW_STATUS_INVALID_STATE;
+  }
+
+  if (_quant_manager->deleteMinMaxFile())
+    return NNFW_STATUS_NO_ERROR;
+  else
+    return NNFW_STATUS_ERROR;
+}
+
+// Run with auto compilation
+NNFW_STATUS nnfw_session::run_with_auto_compilation(const char *target, NNFW_CODEGEN_PREF pref)
+{
+  if (!isStatePreparedOrFinishedRun())
+  {
+    std::cerr << "Error during nnfw_session::run_with_auto_compilation : "
+              << "run should be called after preparation" << std::endl;
+    return NNFW_STATUS_INVALID_STATE;
+  }
+
+  // Check quantization and code-generation parameters
+  std::string target_str{target};
+  if (_quant_manager->exportModelPath().empty() || _codegen_manager->exportModelPath().empty() ||
+      target_str.size() < 4 || target_str.substr(target_str.size() - 4) != "-gen")
+  {
+    std::cerr << "Error during nnfw_session::run_with_auto_compilation : "
+              << "quantization and code generation parameters should be set" << std::endl;
+    return NNFW_STATUS_INVALID_STATE;
+  }
+
+  // ODC: auto compilation with a hidden switching mechanism.
+  // Check whether the model has already been quantized or compiled.
+  std::ifstream file_quantized_model(_quant_manager->exportModelPath());
+  std::ifstream file_compiled_model(_codegen_manager->exportModelPath());
+
+  if (!file_quantized_model.good() && !file_compiled_model.good())
+  {
+    // Run the float model and try to quantize it
+    {
+      // Save execution options
+      auto saved_options = _execution->executionOptions();
+      // Turn on minmax recording
+      _execution->executionOptions().dump_minmax = true;
+
+      try
+      {
+        _execution->execute();
+      }
+      catch (const onert::InsufficientBufferSizeException &e)
+      {
+        // Currently an insufficient buffer always means the output buffer.
+        std::cerr << "Error during nnfw_session::run_with_auto_compilation : " << e.what()
+                  << std::endl;
+        return NNFW_STATUS_INSUFFICIENT_OUTPUT_SIZE;
+      }
+      catch (const std::exception &e)
+      {
+        std::cerr << "Error during nnfw_session::run_with_auto_compilation : " << e.what()
+                  << std::endl;
+        return NNFW_STATUS_ERROR;
+      }
+
+      _state = State::FINISHED_RUN;
+
+      // Restore the dump_minmax option to the user-defined state
+      _execution->executionOptions().dump_minmax = saved_options.dump_minmax;
+
+      // If enough statistics have been collected, run quantization
+      if (_quant_manager->readyForQuantize())
+      {
+        try
+        {
+          if (isStateInitialized() || isStateRunning())
+          {
+            std::cerr << "invalid state" << std::endl;
+            return NNFW_STATUS_INVALID_STATE;
+          }
+
+          auto result = _quant_manager->quantize(_model_path);
+          if (!result)
+            return NNFW_STATUS_INVALID_STATE;
+
+          // Remove the minmax file
+          result = _quant_manager->deleteMinMaxFile();
+          if (!result)
+            return NNFW_STATUS_INVALID_STATE;
+        }
+        catch (const std::exception &e)
+        {
+          std::cerr
+            << "Error during nnfw_session::run_with_auto_compilation in quantize operation: "
+            << e.what() << std::endl;
+          return NNFW_STATUS_ERROR;
+        }
+      }
+    }
+  }
+  else
+  {
+    // Run the compiled or quantized model
+    NNFW_STATUS status;
+
+    // Turn off minmax recording
+    _execution->executionOptions().dump_minmax = false;
+
+    // Save the initial buffers if neither the quantized nor the compiled model is loaded yet
+    if (_autoCompilationState == nnfw_session::AutoCompilationState::INITIAL_STATE)
+    {
+      auto dotidx = _codegen_manager->exportModelPath().rfind('.');
+      if (dotidx == std::string::npos)
+      {
+        std::cerr << "Error during nnfw_session::run_with_auto_compilation : Invalid compiled "
+                     "model path. Please use a path that includes the extension."
+                  << std::endl;
+        return NNFW_STATUS_ERROR;
+      }
+
+      std::string compiled_model_type =
+        _codegen_manager->exportModelPath().substr(dotidx + 1); // +1 to exclude the dot
+
+      dotidx = _quant_manager->exportModelPath().rfind('.');
+      if (dotidx == std::string::npos)
+      {
+        std::cerr << "Error during nnfw_session::run_with_auto_compilation : Invalid quantized "
+                     "model path. Please use a path that includes the extension."
+                  << std::endl;
+        return NNFW_STATUS_ERROR;
+      }
+      std::string quantized_model_type =
+        _quant_manager->exportModelPath().substr(dotidx + 1); // +1 to exclude the dot
+
+      // Save the initial (float) input and output buffers
+      auto input_size = _compiler_artifact->_executors->inputSize();
+      auto output_size = _compiler_artifact->_executors->outputSize();
+
+      std::vector<const void *> _input_buffers;
+      std::vector<void *> _output_buffers;
+
+      // Save input buffers
+      for (size_t input_index = 0; input_index < input_size; input_index++)
+      {
+        auto io_input_index = onert::ir::IOIndex(input_index);
+        auto input_Shape = _execution->getInputShape(io_input_index);
+        auto input_buffer = _execution->getInputBuffer(io_input_index);
+
+        _input_buffers.push_back(input_buffer);
+      }
+
+      // Save output buffers
+      for (size_t output_index = 0; output_index < output_size; output_index++)
+      {
+        auto io_output_index = onert::ir::IOIndex(output_index);
+        auto output_Shape = _execution->getOutputShape(io_output_index);
+        auto output_buffer = _execution->getOutputBuffer(io_output_index);
+
+        _output_buffers.push_back(output_buffer);
+      }
+
+      // Save execution options
+      auto saved_options = _execution->executionOptions();
+
+      // If there is a compiled model, try to load it
+      if (file_compiled_model.good())
+      {
+        // Load the compiled model
+        status = loadModelFile(_codegen_manager->exportModelPath(), compiled_model_type);
+        if (status == NNFW_STATUS_NO_ERROR)
+        {
+          _autoCompilationState = nnfw_session::AutoCompilationState::COMPILED_MODEL_LOADED;
+        }
+      }
+      else // there is no compiled model - try to compile and load it
+      {
+        // To avoid code duplication, reuse the existing "codegen" function. Set up _model_path
+        // for the codegen function.
+        // TODO: change this if the codegen function is generalized
+        _model_path = _quant_manager->exportModelPath();
+
+        // Try to compile and load the compiled model
+        status = codegen(target, pref);
+        if (status == NNFW_STATUS_NO_ERROR)
+        {
+          _autoCompilationState = nnfw_session::AutoCompilationState::COMPILED_MODEL_LOADED;
+          // TODO: delete the quantized model
+        }
+      }
+
+      // If loading the compiled model failed, try to load the quantized model
+      if (_autoCompilationState != nnfw_session::AutoCompilationState::COMPILED_MODEL_LOADED)
+      {
+        // Load the quantized model
+        status = loadModelFile(_quant_manager->exportModelPath(), quantized_model_type);
+        if (status != NNFW_STATUS_NO_ERROR)
+          return status;
+        else
+          _autoCompilationState = nnfw_session::AutoCompilationState::QUANTIZED_MODEL_LOADED;
+      }
+
+      status = prepare();
+      if (status != NNFW_STATUS_NO_ERROR)
+        return status;
+
+      // Restore execution options
+      _execution->executionOptions() = saved_options;
+
+      // Restore inputs to the quantized or compiled model
+      for (uint32_t input_index = 0; input_index < _input_buffers.size(); input_index++)
+      {
+        nnfw_tensorinfo ti;
+        status = input_tensorinfo(input_index, &ti);
+        if (status != NNFW_STATUS_NO_ERROR)
+          return status;
+
+        ti.dtype = NNFW_TYPE_TENSOR_FLOAT32;
+        auto input_size_in_bytes = getBufSize(&ti);
+
+        status = set_input(input_index, ti.dtype, _input_buffers[input_index], input_size_in_bytes);
+        if (status != NNFW_STATUS_NO_ERROR)
+          return status;
+      }
+
+      // Restore outputs to the quantized or compiled model
+      for (uint32_t output_index = 0; output_index < _output_buffers.size(); output_index++)
+      {
+        nnfw_tensorinfo ti;
+        status = output_tensorinfo(output_index, &ti);
+        if (status != NNFW_STATUS_NO_ERROR)
+          return status;
+
+        ti.dtype = NNFW_TYPE_TENSOR_FLOAT32;
+        uint64_t output_size_in_bytes = getBufSize(&ti);
+
+        status =
+          set_output(output_index, ti.dtype, _output_buffers[output_index], output_size_in_bytes);
+        if (status != NNFW_STATUS_NO_ERROR)
+          return status;
+      }
+    }
+
+    // Run the quantized or compiled model
+    if (!isStatePreparedOrFinishedRun())
+    {
+      std::cerr << "Error during nnfw_session::run_with_auto_compilation : "
+                << "run should be called after prepare" << std::endl;
+      return NNFW_STATUS_INVALID_STATE;
+    }
+
+    try
+    {
+      _execution->execute();
+    }
+    catch (const onert::InsufficientBufferSizeException &e)
+    {
+      // Currently an insufficient buffer always means the output buffer.
+      std::cerr << "Error during nnfw_session::run_with_auto_compilation : " << e.what()
+                << std::endl;
+      return NNFW_STATUS_INSUFFICIENT_OUTPUT_SIZE;
+    }
+    catch (const std::exception &e)
+    {
+      std::cerr << "Error during nnfw_session::run_with_auto_compilation : " << e.what()
+                << std::endl;
+      return NNFW_STATUS_ERROR;
+    }
+
+    _state = State::FINISHED_RUN;
+  }
+
+  return NNFW_STATUS_NO_ERROR;
+}
diff --git a/runtime/onert/api/nnfw/src/nnfw_api_internal.h b/runtime/onert/api/nnfw/src/nnfw_api_internal.h
index badfb24849c..56471c14b59 100644
--- a/runtime/onert/api/nnfw/src/nnfw_api_internal.h
+++ b/runtime/onert/api/nnfw/src/nnfw_api_internal.h
@@ -86,6 +86,13 @@ struct nnfw_session
     FINISHED_TRAINING //< Trained at least once
   };
 
+  enum class AutoCompilationState
+  {
+    INITIAL_STATE,          //< Initial state
+    QUANTIZED_MODEL_LOADED, //< Quantized model is loaded
+    COMPILED_MODEL_LOADED   //< Compiled model is loaded
+  };
+
 public:
   /**
    * @brief Factory method. It creates and initialize nnfw_session
@@ -141,6 +148,14 @@ struct nnfw_session
   NNFW_STATUS register_custom_operation(const std::string &id, nnfw_custom_eval eval_func);
   NNFW_STATUS input_tensorindex(const char *tensorname, uint32_t *index);
   NNFW_STATUS output_tensorindex(const char *tensorname, uint32_t *index);
+
+  // Run inference with auto compilation
+  NNFW_STATUS run_with_auto_compilation(const char *target, NNFW_CODEGEN_PREF pref);
+  // Set ODC parameter: minmax_records_count for quantization in auto compilation mode
+  NNFW_STATUS set_odc_param_minmax_records_count(int minmax_records_count);
+  // Delete the MinMax file of the on-device compiler
+  NNFW_STATUS delete_odc_minmax_file();
+
   /**
    * @brief Set backends with string-encoded mapping from operation index to backend type
    * (cpu, acl_cl)
@@ -203,6 +218,7 @@ struct nnfw_session
   std::unique_ptr<onert::ir::train::TrainingInfo> _train_info;
   std::unique_ptr<onert::odc::QuantManager> _quant_manager;
   std::unique_ptr<onert::odc::CodegenManager> _codegen_manager;
+  AutoCompilationState _autoCompilationState = AutoCompilationState::INITIAL_STATE;
   // Remember path to loaded original model
   // It may be used for on-device compiler / on-device training.
   //
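Behaviorally, repeated calls drive the hidden switching: while neither exported file exists, each call runs the float model and dumps MinMax records; once the threshold set via nnfw_set_odc_param_minmax_records_count is reached, quantization runs; on the next call the runtime compiles for the target (falling back to the quantized model on failure) and from then on executes the replacement model. A hypothetical calling loop, continuing the earlier sketch (num_runs and refresh_inputs are placeholders, not part of this patch):

  // Each iteration may run the float, quantized, or compiled model,
  // depending on the current AutoCompilationState.
  for (int i = 0; i < num_runs; ++i)
  {
    refresh_inputs(session); // placeholder: update input buffers between runs
    if (nnfw_run_with_auto_compilation(session, "xxx-gen", NNFW_CODEGEN_PREF_DEFAULT) !=
        NNFW_STATUS_NO_ERROR)
      break;
  }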