Skip to content

Commit 8270229

Browse files
committed
GDALDataset::ComputeInterBandCovarianceMatrix(): use parallel openmp when available
1 parent f085cc9 commit 8270229

File tree

5 files changed

+191
-136
lines changed

5 files changed

+191
-136
lines changed

cmake/helpers/CheckDependentLibraries.cmake

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,11 @@ gdal_check_package(JXL "JPEG-XL compression" CAN_DISABLE)
434434
define_find_package2(JXL_THREADS jxl/resizable_parallel_runner.h jxl_threads PKGCONFIG_NAME libjxl_threads)
435435
gdal_check_package(JXL_THREADS "JPEG-XL threading" CAN_DISABLE)
436436

437-
# unused for now gdal_check_package(OpenMP "")
437+
# Failure at execution on testperf_gdal_minmax_element with ICC
438+
if (NOT (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM"))
439+
gdal_check_package(OpenMP "Whether GDAL should use OpenMP" COMPONENTS "C" CAN_DISABLE)
440+
endif()
441+
438442
gdal_check_package(Crnlib "enable gdal_DDS driver" CAN_DISABLE)
439443
gdal_check_package(basisu "Enable BASISU driver" CONFIG CAN_DISABLE)
440444
gdal_check_package(IDB "enable ogr_IDB driver" CAN_DISABLE)

gcore/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,8 +212,8 @@ if (CMAKE_SYSTEM_NAME MATCHES "Windows")
212212
gdal_target_link_libraries(gcore PRIVATE psapi)
213213
endif ()
214214

215-
if (HAVE_OPENMP_SIMD)
216-
target_compile_definitions(gcore PRIVATE HAVE_OPENMP_SIMD)
215+
if (GDAL_USE_OPENMP AND OpenMP_C_LIBRARIES)
216+
gdal_target_link_libraries(gcore PRIVATE ${OpenMP_C_LIBRARIES})
217217
endif()
218218

219219
include(TargetPublicHeader)

gcore/gdaldataset.cpp

Lines changed: 167 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <new>
2929
#include <set>
3030
#include <string>
31+
#include <type_traits>
3132
#include <utility>
3233

3334
#include "cpl_conv.h"
@@ -70,6 +71,10 @@
7071
#include "../sqlite/ogrsqliteexecutesql.h"
7172
#endif
7273

74+
#ifdef HAVE_OPENMP
75+
#include <omp.h>
76+
#endif
77+
7378
extern const swq_field_type SpecialFieldTypes[SPECIAL_FIELD_COUNT];
7479

7580
CPL_C_START
@@ -12686,12 +12691,19 @@ static CPLErr ComputeInterBandCovarianceMatrixInternal(
1268612691

1268712692
// Allocate temporary matrices and vectors
1268812693
const auto nMatrixSize = static_cast<size_t>(nBandCount) * nBandCount;
12694+
12695+
using MySignedSize_t = std::make_signed_t<size_t>;
12696+
const auto kMax =
12697+
static_cast<MySignedSize_t>(nBandCount) * (nBandCount + 1) / 2;
12698+
std::vector<std::pair<int, int>> anMapLinearIdxToIJ;
1268912699
try
1269012700
{
1269112701
anCount.resize(nMatrixSize);
1269212702
adfMean.resize(nMatrixSize);
1269312703
anCountMean.resize(nMatrixSize);
1269412704

12705+
anMapLinearIdxToIJ.resize(kMax);
12706+
1269512707
adfCurBlockPixelsAllBands.resize(nPixelsInBlock * nBandCount);
1269612708

1269712709
adfCurBlockMean.resize(nBandCount);
@@ -12708,6 +12720,18 @@ static CPLErr ComputeInterBandCovarianceMatrixInternal(
1270812720
return CE_Failure;
1270912721
}
1271012722

12723+
{
12724+
MySignedSize_t nLinearIdx = 0;
12725+
for (int i = 0; i < nBandCount; ++i)
12726+
{
12727+
for (int j = i; j < nBandCount; ++j)
12728+
{
12729+
anMapLinearIdxToIJ[nLinearIdx] = {i, j};
12730+
++nLinearIdx;
12731+
}
12732+
}
12733+
}
12734+
1271112735
// Fetch nodata values and mask bands
1271212736
bool bAllBandsSameMask = false;
1271312737
bool bIsAllInteger = false;
@@ -12767,6 +12791,21 @@ static CPLErr ComputeInterBandCovarianceMatrixInternal(
1276712791
cpl::div_round_up(poDS->GetRasterYSize(), nBlockYSize);
1276812792
uint64_t nCurIter = 0;
1276912793

12794+
#ifdef HAVE_OPENMP
12795+
int nNumThreads = 1;
12796+
if (nBandCount >= 100)
12797+
{
12798+
const int nNumCPUs = CPLGetNumCPUs();
12799+
nNumThreads = std::max(1, nNumCPUs / 2);
12800+
const char *pszThreads =
12801+
CPLGetConfigOption("GDAL_NUM_THREADS", nullptr);
12802+
if (pszThreads && !EQUAL(pszThreads, "ALL_CPUS"))
12803+
{
12804+
nNumThreads = std::clamp(atoi(pszThreads), 1, nNumCPUs);
12805+
}
12806+
}
12807+
#endif
12808+
1277012809
// Iterate over all blocks
1277112810
for (const auto &window : papoBands[panBandList[0] - 1]->IterateWindows())
1277212811
{
@@ -12839,159 +12878,159 @@ static CPLErr ComputeInterBandCovarianceMatrixInternal(
1283912878
}
1284012879
}
1284112880

12842-
for (int i = 0; i < nBandCount; ++i)
12881+
#ifdef HAVE_OPENMP
12882+
omp_set_num_threads(nNumThreads);
12883+
12884+
#pragma omp parallel for schedule(static)
12885+
#endif
12886+
for (MySignedSize_t k = 0; k < kMax; ++k)
1284312887
{
12844-
// The covariance matrix is symmetric. So start at i
12845-
for (int j = i; j < nBandCount; ++j)
12888+
int i, j;
12889+
std::tie(i, j) = anMapLinearIdxToIJ[k];
12890+
12891+
// Now compute the moment of (i, j), but just for this block
12892+
size_t nCount = 0;
12893+
T dfComoment = 0;
12894+
const T *padfI =
12895+
adfCurBlockPixelsAllBands.data() + i * nThisBlockPixelCount;
12896+
const T *padfJ =
12897+
adfCurBlockPixelsAllBands.data() + j * nThisBlockPixelCount;
12898+
12899+
// Use https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass
12900+
// for the current block
12901+
if ((anCurBlockCount[i] == nThisBlockPixelCount &&
12902+
anCurBlockCount[j] == nThisBlockPixelCount) ||
12903+
(bNoneHasMaskOrNodata && bIsAllInteger))
1284612904
{
12847-
// Now compute the moment of (i, j), but just for this block
12848-
size_t nCount = 0;
12849-
T dfComoment = 0;
12850-
const T *padfI =
12851-
adfCurBlockPixelsAllBands.data() + i * nThisBlockPixelCount;
12852-
const T *padfJ =
12853-
adfCurBlockPixelsAllBands.data() + j * nThisBlockPixelCount;
12854-
12855-
// Use https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass
12856-
// for the current block
12857-
if (bNoneHasMaskOrNodata && bIsAllInteger)
12858-
{
12859-
// Most optimized code path: integer, no nodata, no mask
12905+
// Most optimized code path: integer, no nodata, no mask
1286012906
#ifdef HAVE_OPENMP_SIMD
1286112907
#pragma omp simd reduction(+ : dfComoment)
1286212908
#endif
12863-
for (size_t iPixel = 0; iPixel < nThisBlockPixelCount;
12864-
++iPixel)
12865-
{
12866-
dfComoment += padfI[iPixel] * padfJ[iPixel];
12867-
}
12868-
nCount = nThisBlockPixelCount;
12869-
}
12870-
else if (bNoneHasMaskOrNodata)
12909+
for (size_t iPixel = 0; iPixel < nThisBlockPixelCount; ++iPixel)
1287112910
{
12872-
// Floating-point code path with no nodata and no mask
12911+
dfComoment += padfI[iPixel] * padfJ[iPixel];
12912+
}
12913+
nCount = nThisBlockPixelCount;
12914+
}
12915+
else if (bNoneHasMaskOrNodata)
12916+
{
12917+
// Floating-point code path with no nodata and no mask
1287312918
#ifdef HAVE_OPENMP_SIMD
1287412919
#pragma omp simd reduction(+ : dfComoment)
1287512920
#endif
12876-
for (size_t iPixel = 0; iPixel < nThisBlockPixelCount;
12877-
++iPixel)
12878-
{
12879-
const T dfAcc = padfI[iPixel] * padfJ[iPixel];
12880-
const bool bIsValid = !std::isnan(dfAcc);
12881-
nCount += bIsValid;
12882-
dfComoment += bIsValid ? dfAcc : ZERO;
12883-
}
12884-
}
12885-
else if (!std::isnan(adfNoData[i]) && !std::isnan(adfNoData[j]))
12921+
for (size_t iPixel = 0; iPixel < nThisBlockPixelCount; ++iPixel)
1288612922
{
12887-
// Code path when there are both nodata values
12888-
const T shiftedNoDataI = adfNoData[i] - adfCurBlockMean[i];
12889-
const T shiftedNoDataJ = adfNoData[j] - adfCurBlockMean[j];
12923+
const T dfAcc = padfI[iPixel] * padfJ[iPixel];
12924+
const bool bIsValid = !std::isnan(dfAcc);
12925+
nCount += bIsValid;
12926+
dfComoment += bIsValid ? dfAcc : ZERO;
12927+
}
12928+
}
12929+
else if (!std::isnan(adfNoData[i]) && !std::isnan(adfNoData[j]))
12930+
{
12931+
// Code path when there are both nodata values
12932+
const T shiftedNoDataI = adfNoData[i] - adfCurBlockMean[i];
12933+
const T shiftedNoDataJ = adfNoData[j] - adfCurBlockMean[j];
1289012934

1289112935
#ifdef HAVE_OPENMP_SIMD
1289212936
#pragma omp simd reduction(+ : dfComoment)
1289312937
#endif
12894-
for (size_t iPixel = 0; iPixel < nThisBlockPixelCount;
12895-
++iPixel)
12896-
{
12897-
const T dfI = padfI[iPixel];
12898-
const T dfJ = padfJ[iPixel];
12899-
const T dfAcc = dfI * dfJ;
12900-
const bool bIsValid = !std::isnan(dfAcc) &&
12901-
dfI != shiftedNoDataI &&
12902-
dfJ != shiftedNoDataJ;
12903-
nCount += bIsValid;
12904-
dfComoment += bIsValid ? dfAcc : ZERO;
12905-
}
12906-
}
12907-
else
12938+
for (size_t iPixel = 0; iPixel < nThisBlockPixelCount; ++iPixel)
1290812939
{
12909-
// Generic code path
12910-
const T shiftedNoDataI = adfNoData[i] - adfCurBlockMean[i];
12911-
const T shiftedNoDataJ = adfNoData[j] - adfCurBlockMean[j];
12940+
const T dfI = padfI[iPixel];
12941+
const T dfJ = padfJ[iPixel];
12942+
const T dfAcc = dfI * dfJ;
12943+
const bool bIsValid = !std::isnan(dfAcc) &&
12944+
dfI != shiftedNoDataI &&
12945+
dfJ != shiftedNoDataJ;
12946+
nCount += bIsValid;
12947+
dfComoment += bIsValid ? dfAcc : ZERO;
12948+
}
12949+
}
12950+
else
12951+
{
12952+
// Generic code path
12953+
const T shiftedNoDataI = adfNoData[i] - adfCurBlockMean[i];
12954+
const T shiftedNoDataJ = adfNoData[j] - adfCurBlockMean[j];
1291212955

1291312956
#ifdef HAVE_OPENMP_SIMD
1291412957
#pragma omp simd reduction(+ : dfComoment)
1291512958
#endif
12916-
for (size_t iPixel = 0; iPixel < nThisBlockPixelCount;
12917-
++iPixel)
12918-
{
12919-
const T dfI = padfI[iPixel];
12920-
const T dfJ = padfJ[iPixel];
12921-
const T dfAcc = dfI * dfJ;
12922-
const bool bIsValid =
12923-
!std::isnan(dfAcc) && dfI != shiftedNoDataI &&
12924-
dfJ != shiftedNoDataJ &&
12925-
(!pabyCurBlockMask[i] ||
12926-
(*pabyCurBlockMask[i])[iPixel]) &&
12927-
(!pabyCurBlockMask[j] ||
12928-
(*pabyCurBlockMask[j])[iPixel]);
12929-
nCount += bIsValid;
12930-
dfComoment += bIsValid ? dfAcc : ZERO;
12931-
}
12932-
}
12933-
12934-
const auto idxInMatrixI =
12935-
static_cast<size_t>(i) * nBandCount + j;
12936-
const auto idxInMatrixJ =
12937-
static_cast<size_t>(j) * nBandCount + i;
12938-
12939-
// Update the total comoment using last formula of paragraph
12940-
// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online :
12941-
// CoMoment(A+B) = CoMoment(A) + CoMoment(B) +
12942-
// (mean_I(A) - mean_I(B)) *
12943-
// (mean_J(A) - mean_J(B)) *
12944-
// (count(A) * count(B)) / (count(A) + count(B))
12945-
//
12946-
// There might be a small gotcha in the fact that the set of
12947-
// pixels on which the means are computed is not always the
12948-
// same as the the one on which the comoment is computed, if
12949-
// pixels are not valid/invalid at the same indices among bands
12950-
// It is not obvious (to me) what should be the correct behavior.
12951-
// The current approach has the benefit to avoid recomputing
12952-
// the mean for each (i,j) tuple, but only for all i.
12953-
if (anCount[idxInMatrixI] == 0)
12954-
padfComomentMatrix[idxInMatrixI] = 0;
12955-
if (nCount > 0)
12959+
for (size_t iPixel = 0; iPixel < nThisBlockPixelCount; ++iPixel)
1295612960
{
12957-
padfComomentMatrix[idxInMatrixI] +=
12958-
static_cast<double>(dfComoment);
12959-
padfComomentMatrix[idxInMatrixI] +=
12960-
static_cast<double>(adfMean[idxInMatrixI] -
12961-
adfCurBlockMean[i]) *
12962-
static_cast<double>(adfMean[idxInMatrixJ] -
12963-
adfCurBlockMean[j]) *
12964-
(static_cast<double>(anCount[idxInMatrixI]) *
12965-
static_cast<double>(nCount) /
12966-
static_cast<double>(anCount[idxInMatrixI] + nCount));
12961+
const T dfI = padfI[iPixel];
12962+
const T dfJ = padfJ[iPixel];
12963+
const T dfAcc = dfI * dfJ;
12964+
const bool bIsValid = !std::isnan(dfAcc) &&
12965+
dfI != shiftedNoDataI &&
12966+
dfJ != shiftedNoDataJ &&
12967+
(!pabyCurBlockMask[i] ||
12968+
(*pabyCurBlockMask[i])[iPixel]) &&
12969+
(!pabyCurBlockMask[j] ||
12970+
(*pabyCurBlockMask[j])[iPixel]);
12971+
nCount += bIsValid;
12972+
dfComoment += bIsValid ? dfAcc : ZERO;
1296712973
}
12968-
anCount[idxInMatrixI] += nCount;
12974+
}
1296912975

12970-
// Update means
12971-
if (anCountMean[idxInMatrixI] + anCurBlockCount[i] > 0)
12972-
{
12973-
adfMean[idxInMatrixI] +=
12974-
(adfCurBlockMean[i] - adfMean[idxInMatrixI]) *
12975-
static_cast<T>(
12976-
static_cast<double>(anCurBlockCount[i]) /
12977-
static_cast<double>(anCountMean[idxInMatrixI] +
12978-
anCurBlockCount[i]));
12979-
}
12980-
anCountMean[idxInMatrixI] += anCurBlockCount[i];
12976+
const auto idxInMatrixI = static_cast<size_t>(i) * nBandCount + j;
12977+
const auto idxInMatrixJ = static_cast<size_t>(j) * nBandCount + i;
12978+
12979+
// Update the total comoment using last formula of paragraph
12980+
// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online :
12981+
// CoMoment(A+B) = CoMoment(A) + CoMoment(B) +
12982+
// (mean_I(A) - mean_I(B)) *
12983+
// (mean_J(A) - mean_J(B)) *
12984+
// (count(A) * count(B)) / (count(A) + count(B))
12985+
//
12986+
// There might be a small gotcha in the fact that the set of
12987+
// pixels on which the means are computed is not always the
12988+
// same as the the one on which the comoment is computed, if
12989+
// pixels are not valid/invalid at the same indices among bands
12990+
// It is not obvious (to me) what should be the correct behavior.
12991+
// The current approach has the benefit to avoid recomputing
12992+
// the mean for each (i,j) tuple, but only for all i.
12993+
if (anCount[idxInMatrixI] == 0)
12994+
padfComomentMatrix[idxInMatrixI] = 0;
12995+
if (nCount > 0)
12996+
{
12997+
padfComomentMatrix[idxInMatrixI] +=
12998+
static_cast<double>(dfComoment);
12999+
padfComomentMatrix[idxInMatrixI] +=
13000+
static_cast<double>(adfMean[idxInMatrixI] -
13001+
adfCurBlockMean[i]) *
13002+
static_cast<double>(adfMean[idxInMatrixJ] -
13003+
adfCurBlockMean[j]) *
13004+
(static_cast<double>(anCount[idxInMatrixI]) *
13005+
static_cast<double>(nCount) /
13006+
static_cast<double>(anCount[idxInMatrixI] + nCount));
13007+
}
13008+
anCount[idxInMatrixI] += nCount;
13009+
13010+
// Update means
13011+
if (anCountMean[idxInMatrixI] + anCurBlockCount[i] > 0)
13012+
{
13013+
adfMean[idxInMatrixI] +=
13014+
(adfCurBlockMean[i] - adfMean[idxInMatrixI]) *
13015+
static_cast<T>(
13016+
static_cast<double>(anCurBlockCount[i]) /
13017+
static_cast<double>(anCountMean[idxInMatrixI] +
13018+
anCurBlockCount[i]));
13019+
}
13020+
anCountMean[idxInMatrixI] += anCurBlockCount[i];
1298113021

12982-
if (idxInMatrixI != idxInMatrixJ)
13022+
if (idxInMatrixI != idxInMatrixJ)
13023+
{
13024+
if (anCountMean[idxInMatrixJ] + anCurBlockCount[j] > 0)
1298313025
{
12984-
if (anCountMean[idxInMatrixJ] + anCurBlockCount[j] > 0)
12985-
{
12986-
adfMean[idxInMatrixJ] +=
12987-
(adfCurBlockMean[j] - adfMean[idxInMatrixJ]) *
12988-
static_cast<T>(
12989-
static_cast<double>(anCurBlockCount[j]) /
12990-
static_cast<double>(anCountMean[idxInMatrixJ] +
12991-
anCurBlockCount[j]));
12992-
}
12993-
anCountMean[idxInMatrixJ] += anCurBlockCount[j];
13026+
adfMean[idxInMatrixJ] +=
13027+
(adfCurBlockMean[j] - adfMean[idxInMatrixJ]) *
13028+
static_cast<T>(
13029+
static_cast<double>(anCurBlockCount[j]) /
13030+
static_cast<double>(anCountMean[idxInMatrixJ] +
13031+
anCurBlockCount[j]));
1299413032
}
13033+
anCountMean[idxInMatrixJ] += anCurBlockCount[j];
1299513034
}
1299613035
}
1299713036

0 commit comments

Comments
 (0)