From 85a1c46c6194ccd76c5ee434d56db33e5709aa17 Mon Sep 17 00:00:00 2001 From: Guido Reina Date: Tue, 2 Jul 2024 16:15:28 +0200 Subject: [PATCH 1/3] added TableStandardize --- plugins/datatools/src/datatools.cpp | 2 + .../datatools/src/table/TableStandardize.cpp | 238 ++++++++++++++++++ .../datatools/src/table/TableStandardize.h | 103 ++++++++ 3 files changed, 343 insertions(+) create mode 100644 plugins/datatools/src/table/TableStandardize.cpp create mode 100644 plugins/datatools/src/table/TableStandardize.h diff --git a/plugins/datatools/src/datatools.cpp b/plugins/datatools/src/datatools.cpp index 7e85a8387d..67cc7535d6 100644 --- a/plugins/datatools/src/datatools.cpp +++ b/plugins/datatools/src/datatools.cpp @@ -88,6 +88,7 @@ #include "table/TableSelectionTx.h" #include "table/TableSort.h" #include "table/TableSplit.h" +#include "table/TableStandardize.h" #include "table/TableToLines.h" #include "table/TableToParticles.h" #include "table/TableWhere.h" @@ -179,6 +180,7 @@ class DatatoolsPluginInstance : public megamol::core::factories::AbstractPluginI this->module_descriptions.RegisterAutoDescription(); this->module_descriptions.RegisterAutoDescription(); this->module_descriptions.RegisterAutoDescription(); + this->module_descriptions.RegisterAutoDescription(); this->module_descriptions.RegisterAutoDescription(); this->module_descriptions.RegisterAutoDescription(); this->module_descriptions.RegisterAutoDescription(); diff --git a/plugins/datatools/src/table/TableStandardize.cpp b/plugins/datatools/src/table/TableStandardize.cpp new file mode 100644 index 0000000000..e185368067 --- /dev/null +++ b/plugins/datatools/src/table/TableStandardize.cpp @@ -0,0 +1,238 @@ +/* + * TableStandardize.cpp + * + * Copyright (C) 2024 by VISUS (University of Stuttgart) + * Alle Rechte vorbehalten. 
+ */ + +#include "TableStandardize.h" + +#include "mmcore/param/EnumParam.h" +#include "mmcore/utility/log/Log.h" + +#include +#include +#include + +using namespace megamol::datatools; +using namespace megamol::datatools::table; +using namespace megamol; + +std::string TableStandardize::ModuleName = std::string("TableStandardize"); + +template +static inline double Lerp(T v0, T v1, T t) { + return (1 - t) * v0 + t * v1; +} + +// https://stackoverflow.com/a/37708864/705750 +template +static inline std::vector Quantile(const std::vector& inData, const std::vector& probs) { + if (inData.empty()) { + return std::vector(); + } + + if (1 == inData.size()) { + return std::vector(probs.size(), inData[0]); // one value per requested quantile, callers index by probability + } + + std::vector data = inData; + std::sort(data.begin(), data.end()); + std::vector quantiles; + + for (size_t i = 0; i < probs.size(); ++i) { + T poi = Lerp(-0.5, data.size() - 0.5, probs[i]); + + size_t left = std::max(int64_t(std::floor(poi)), int64_t(0)); + size_t right = std::min(int64_t(std::ceil(poi)), int64_t(data.size() - 1)); + + T datLeft = data.at(left); + T datRight = data.at(right); + + T quantile = Lerp(datLeft, datRight, poi - left); + + quantiles.push_back(quantile); + } + + return quantiles; +} + +TableStandardize::TableStandardize() + : core::Module() + , dataOutSlot("dataOut", "Output") + , dataInSlot("dataIn", "Input") + , stratSlot("strategy", "which standardization approach to use") + , frameID(-1) + , in_datahash(std::numeric_limits::max()) + , out_datahash(0) { + + this->dataInSlot.SetCompatibleCall(); + this->MakeSlotAvailable(&this->dataInSlot); + + this->dataOutSlot.SetCallback( + TableDataCall::ClassName(), TableDataCall::FunctionName(0), &TableStandardize::processData); + this->dataOutSlot.SetCallback( + TableDataCall::ClassName(), TableDataCall::FunctionName(1), &TableStandardize::getExtent); + this->MakeSlotAvailable(&this->dataOutSlot); + + auto enump = new core::param::EnumParam(Strategies::STANDARD); + enump->SetTypePair(Strategies::OFF, "Off 
(nop)"); + enump->SetTypePair(Strategies::STANDARD, "Standard"); + enump->SetTypePair(Strategies::MINMAX, "MinMax"); + enump->SetTypePair(Strategies::MAXABS, "MaxAbs"); + enump->SetTypePair(Strategies::ROBUST, "Robust"); + this->stratSlot << enump; + this->MakeSlotAvailable(&this->stratSlot); +} + +TableStandardize::~TableStandardize() { + this->Release(); +} + +bool TableStandardize::create() { + return true; +} + +void TableStandardize::release() {} + +bool TableStandardize::processData(core::Call& c) { + try { + TableDataCall* outCall = dynamic_cast(&c); + if (outCall == NULL) + return false; + + TableDataCall* inCall = this->dataInSlot.CallAs(); + if (inCall == NULL) + return false; + + inCall->SetFrameID(outCall->GetFrameID()); + if (!(*inCall)()) + return false; + + if (this->in_datahash != inCall->DataHash() || this->frameID != inCall->GetFrameID() || + this->AnyParameterDirty()) { + this->in_datahash = inCall->DataHash(); + this->frameID = inCall->GetFrameID(); + this->ResetAllDirtyFlags(); + this->out_datahash++; + + column_count = inCall->GetColumnsCount(); + column_infos = inCall->GetColumnsInfos(); + row_count = inCall->GetRowsCount(); + in_data = inCall->GetData(); + + const auto strat = this->stratSlot.Param()->Value(); + + this->info.clear(); + this->info.resize(column_count); + this->data.clear(); + this->data.reserve(column_count * row_count); + this->data.insert(this->data.end(), &in_data[0], &in_data[column_count * row_count]); + + // eigen would have liked column-major, but... 
+ Eigen::Map> out_mat( + this->data.data(), row_count, column_count); + + switch (strat) { + case Strategies::OFF: + // bypass + for (int x = 0; x < column_count; ++x) { + this->info[x].SetName(column_infos[x].Name()); + } + break; + case Strategies::STANDARD: { + for (int x = 0; x < column_count; ++x) { + auto mean = out_mat.col(x).mean(); + out_mat.col(x) -= Eigen::VectorXf::Constant(row_count, mean); + auto std_dev = sqrt(out_mat.col(x).cwiseProduct(out_mat.col(x)).sum() / (row_count - 1)); + if (std_dev > 0) out_mat.col(x) /= std_dev; // zero/NaN deviation (constant column, single row): keep the centered values + this->info[x].SetName(column_infos[x].Name() + "_std"); + } + break; + } + case Strategies::MINMAX: { + for (int x = 0; x < column_count; ++x) { + auto min = out_mat.col(x).minCoeff(); + out_mat.col(x) -= Eigen::VectorXf::Constant(row_count, min); + auto max = out_mat.col(x).maxCoeff(); + if (max > 0) out_mat.col(x) /= max; // scale in place; a constant column has zero range and stays at zero + this->info[x].SetName(column_infos[x].Name() + "_minmax"); + } + break; + } + case Strategies::MAXABS: { + for (int x = 0; x < column_count; ++x) { + auto maxabs = out_mat.col(x).cwiseAbs().maxCoeff(); + if (maxabs > 0) out_mat.col(x) /= maxabs; // scale in place; an all-zero column is left untouched + this->info[x].SetName(column_infos[x].Name() + "_maxabs"); + } + break; + } + case Strategies::ROBUST: { + for (int x = 0; x < column_count; ++x) { + // https://stackoverflow.com/a/62698308/705750 + // auto copy = out_mat.col(x).replicate(1, 1).reshaped(); + //std::sort(copy.begin(), copy.end()); + //auto median = + // copy.size() % 2 == 0 ? 
copy.segment((copy.size() - 2) / 2, 2).mean() : copy(copy.size() / 2); + const Eigen::VectorXf col = out_mat.col(x); auto copy = std::vector(col.data(), col.data() + row_count); // dense copy first: a column of the mapped matrix need not be contiguous + auto quartiles = Quantile(copy, {0.25, 0.5, 0.75}); + + out_mat.col(x) -= Eigen::VectorXf::Constant(row_count, quartiles[1]); + if (quartiles[2] > quartiles[0]) out_mat.col(x) /= (quartiles[2] - quartiles[0]); // zero interquartile range: keep the centered values + this->info[x].SetName(column_infos[x].Name() + "_robust"); + } + break; + } + } + + for (int x = 0; x < column_count; ++x) { + this->info[x].SetType(column_infos[x].Type()); + this->info[x].SetMinimumValue(out_mat.col(x).minCoeff()); + this->info[x].SetMaximumValue(out_mat.col(x).maxCoeff()); + } + } + + outCall->SetFrameCount(inCall->GetFrameCount()); + outCall->SetFrameID(this->frameID); + outCall->SetDataHash(this->out_datahash); + + if (!this->info.empty()) { + outCall->Set( + this->info.size(), this->data.size() / this->info.size(), this->info.data(), this->data.data()); + } else { + outCall->Set(0, 0, NULL, NULL); + } + } catch (...) { + megamol::core::utility::log::Log::DefaultLog.WriteError( + _T("%hs: Failed to execute processData\n"), ModuleName.c_str()); + return false; + } + + return true; +} + +bool TableStandardize::getExtent(core::Call& c) { + try { + TableDataCall* outCall = dynamic_cast(&c); + if (outCall == NULL) + return false; + + TableDataCall* inCall = this->dataInSlot.CallAs(); + if (inCall == NULL) + return false; + + inCall->SetFrameID(outCall->GetFrameID()); + if (!(*inCall)(1)) + return false; + + outCall->SetFrameCount(inCall->GetFrameCount()); + outCall->SetDataHash(this->out_datahash); // TODO: this is actually crap if somebody properly checks it + } catch (...) 
{ + megamol::core::utility::log::Log::DefaultLog.WriteError( + _T("Failed to execute %hs::getExtent\n"), ModuleName.c_str()); + return false; + } + + return true; +} diff --git a/plugins/datatools/src/table/TableStandardize.h b/plugins/datatools/src/table/TableStandardize.h new file mode 100644 index 0000000000..571edcd3bc --- /dev/null +++ b/plugins/datatools/src/table/TableStandardize.h @@ -0,0 +1,103 @@ +/* + * TableStandardize.h + * + * Copyright (C) 2024 by VISUS (University of Stuttgart) + * Alle Rechte vorbehalten. + */ + +#pragma once + +#include "mmcore/Call.h" +#include "mmcore/CalleeSlot.h" +#include "mmcore/CallerSlot.h" +#include "mmcore/LuaAPI.h" +#include "mmcore/Module.h" + +#include "mmcore/param/ParamSlot.h" + +#include "datatools/table/TableDataCall.h" + +namespace megamol::datatools::table { + +/* + * Module that standardizes the columns of a table (copy) using a selectable strategy. + */ +class TableStandardize : public core::Module { +public: + static std::string ModuleName; + static std::string defaultScript; + + /** Return module class name */ + static const char* ClassName() { + return ModuleName.c_str(); + } + + /** Return module class description */ + static const char* Description() { + return "Standardize table data (copy). 
Helps dimensionality reduction and clustering by reducing bias towards " + "large values."; + } + + /** Module is always available */ + static bool IsAvailable() { + return true; + } + + /** Ctor */ + TableStandardize(); + + /** Dtor */ + ~TableStandardize() override; + +protected: + /** Lazy initialization of the module */ + bool create() override; + + /** Resource release */ + void release() override; + +private: + /** Data callback */ + bool processData(core::Call& c); + + bool getExtent(core::Call& c); + + /** Data output slot */ + core::CalleeSlot dataOutSlot; + + /** Data input slot */ + core::CallerSlot dataInSlot; + + /** Parameter slot for strategy selection */ + core::param::ParamSlot stratSlot; + + // inspired by SciKit learn + enum Strategies { OFF, STANDARD, MINMAX, MAXABS, ROBUST }; + + /** ID of the current frame */ + int frameID; + + /** Hash of the incoming data and hash reported for the outgoing data */ + size_t in_datahash, out_datahash; + + /** Vector storing the actual float data */ + std::vector data; + + /** Vector storing information about columns */ + std::vector info; + + /** number of columns coming in */ + int column_count = 0; + + /** info about the columns coming in */ + const TableDataCall::ColumnInfo* column_infos = nullptr; + + /** number of rows coming in */ + int row_count = 0; + + /** the data coming in */ + const float* in_data = nullptr; + +}; /* end class TableStandardize */ + +} // namespace megamol::datatools::table From 3c1bc66b6b60b9ceeefae19e2e39c01f505ab95c Mon Sep 17 00:00:00 2001 From: Guido Reina Date: Tue, 2 Jul 2024 18:52:09 +0200 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: moritz-h <7849248+moritz-h@users.noreply.github.com> --- .../datatools/src/table/TableStandardize.cpp | 18 +++++++++--------- plugins/datatools/src/table/TableStandardize.h | 13 +++++-------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/plugins/datatools/src/table/TableStandardize.cpp b/plugins/datatools/src/table/TableStandardize.cpp index 
e185368067..90ed03b569 100644 --- a/plugins/datatools/src/table/TableStandardize.cpp +++ b/plugins/datatools/src/table/TableStandardize.cpp @@ -1,19 +1,19 @@ -/* - * TableStandardize.cpp - * - * Copyright (C) 2024 by VISUS (University of Stuttgart) - * Alle Rechte vorbehalten. +/** + * MegaMol + * Copyright (c) 2024, MegaMol Dev Team + * All rights reserved. */ #include "TableStandardize.h" -#include "mmcore/param/EnumParam.h" -#include "mmcore/utility/log/Log.h" - -#include #include #include +#include + +#include "mmcore/param/EnumParam.h" +#include "mmcore/utility/log/Log.h" + using namespace megamol::datatools; using namespace megamol::datatools::table; using namespace megamol; diff --git a/plugins/datatools/src/table/TableStandardize.h b/plugins/datatools/src/table/TableStandardize.h index 571edcd3bc..47423f513b 100644 --- a/plugins/datatools/src/table/TableStandardize.h +++ b/plugins/datatools/src/table/TableStandardize.h @@ -1,22 +1,19 @@ -/* - * TableStandardize.h - * - * Copyright (C) 2024 by VISUS (University of Stuttgart) - * Alle Rechte vorbehalten. +/** + * MegaMol + * Copyright (c) 2024, MegaMol Dev Team + * All rights reserved. 
*/ #pragma once +#include "datatools/table/TableDataCall.h" #include "mmcore/Call.h" #include "mmcore/CalleeSlot.h" #include "mmcore/CallerSlot.h" #include "mmcore/LuaAPI.h" #include "mmcore/Module.h" - #include "mmcore/param/ParamSlot.h" -#include "datatools/table/TableDataCall.h" - namespace megamol::datatools::table { /* From 73db82e69783a6669f36caccaf40957a822c1455 Mon Sep 17 00:00:00 2001 From: Guido Reina Date: Tue, 2 Jul 2024 18:53:27 +0200 Subject: [PATCH 3/3] useless include --- plugins/datatools/src/table/TableStandardize.h | 1 - 1 file changed, 1 deletion(-) diff --git a/plugins/datatools/src/table/TableStandardize.h b/plugins/datatools/src/table/TableStandardize.h index 47423f513b..c190459b5d 100644 --- a/plugins/datatools/src/table/TableStandardize.h +++ b/plugins/datatools/src/table/TableStandardize.h @@ -10,7 +10,6 @@ #include "mmcore/Call.h" #include "mmcore/CalleeSlot.h" #include "mmcore/CallerSlot.h" -#include "mmcore/LuaAPI.h" #include "mmcore/Module.h" #include "mmcore/param/ParamSlot.h"