Skip to content

Commit e6b6d2e

Browse files
committed
GDALDataset::ComputeInterBandCovarianceMatrix(): use parallel openmp when available
1 parent f085cc9 commit e6b6d2e

File tree

4 files changed

+180
-135
lines changed

4 files changed

+180
-135
lines changed

gcore/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,8 +212,8 @@ if (CMAKE_SYSTEM_NAME MATCHES "Windows")
212212
gdal_target_link_libraries(gcore PRIVATE psapi)
213213
endif ()
214214

215-
if (HAVE_OPENMP_SIMD)
216-
target_compile_definitions(gcore PRIVATE HAVE_OPENMP_SIMD)
215+
if (OpenMP_CXX_LIBRARIES)
216+
gdal_target_link_libraries(gcore PRIVATE ${OpenMP_CXX_LIBRARIES})
217217
endif()
218218

219219
include(TargetPublicHeader)

gcore/gdaldataset.cpp

Lines changed: 162 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@
7070
#include "../sqlite/ogrsqliteexecutesql.h"
7171
#endif
7272

73+
#ifdef HAVE_OPENMP
74+
#include <omp.h>
75+
#endif
76+
7377
extern const swq_field_type SpecialFieldTypes[SPECIAL_FIELD_COUNT];
7478

7579
CPL_C_START
@@ -12686,12 +12690,17 @@ static CPLErr ComputeInterBandCovarianceMatrixInternal(
1268612690

1268712691
// Allocate temporary matrices and vectors
1268812692
const auto nMatrixSize = static_cast<size_t>(nBandCount) * nBandCount;
12693+
12694+
const auto kMax = static_cast<size_t>(nBandCount) * (nBandCount + 1) / 2;
12695+
std::vector<std::pair<int, int>> anMapLinearIdxToIJ;
1268912696
try
1269012697
{
1269112698
anCount.resize(nMatrixSize);
1269212699
adfMean.resize(nMatrixSize);
1269312700
anCountMean.resize(nMatrixSize);
1269412701

12702+
anMapLinearIdxToIJ.resize(kMax);
12703+
1269512704
adfCurBlockPixelsAllBands.resize(nPixelsInBlock * nBandCount);
1269612705

1269712706
adfCurBlockMean.resize(nBandCount);
@@ -12708,6 +12717,18 @@ static CPLErr ComputeInterBandCovarianceMatrixInternal(
1270812717
return CE_Failure;
1270912718
}
1271012719

12720+
{
12721+
size_t nLinearIdx = 0;
12722+
for (int i = 0; i < nBandCount; ++i)
12723+
{
12724+
for (int j = i; j < nBandCount; ++j)
12725+
{
12726+
anMapLinearIdxToIJ[nLinearIdx] = {i, j};
12727+
++nLinearIdx;
12728+
}
12729+
}
12730+
}
12731+
1271112732
// Fetch nodata values and mask bands
1271212733
bool bAllBandsSameMask = false;
1271312734
bool bIsAllInteger = false;
@@ -12767,6 +12788,21 @@ static CPLErr ComputeInterBandCovarianceMatrixInternal(
1276712788
cpl::div_round_up(poDS->GetRasterYSize(), nBlockYSize);
1276812789
uint64_t nCurIter = 0;
1276912790

12791+
#ifdef HAVE_OPENMP
12792+
int nNumThreads = 1;
12793+
if (nBandCount >= 100)
12794+
{
12795+
const int nNumCPUs = CPLGetNumCPUs();
12796+
nNumThreads = std::max(1, nNumCPUs / 2);
12797+
const char *pszThreads =
12798+
CPLGetConfigOption("GDAL_NUM_THREADS", nullptr);
12799+
if (pszThreads && !EQUAL(pszThreads, "ALL_CPUS"))
12800+
{
12801+
nNumThreads = std::clamp(atoi(pszThreads), 1, nNumCPUs);
12802+
}
12803+
}
12804+
#endif
12805+
1277012806
// Iterate over all blocks
1277112807
for (const auto &window : papoBands[panBandList[0] - 1]->IterateWindows())
1277212808
{
@@ -12839,159 +12875,157 @@ static CPLErr ComputeInterBandCovarianceMatrixInternal(
1283912875
}
1284012876
}
1284112877

12842-
for (int i = 0; i < nBandCount; ++i)
12878+
#ifdef HAVE_OPENMP
12879+
omp_set_num_threads(nNumThreads);
12880+
12881+
#pragma omp parallel for schedule(static)
12882+
#endif
12883+
for (size_t k = 0; k < kMax; ++k)
1284312884
{
12844-
// The covariance matrix is symmetric. So start at i
12845-
for (int j = i; j < nBandCount; ++j)
12885+
int i, j;
12886+
std::tie(i, j) = anMapLinearIdxToIJ[k];
12887+
12888+
// Now compute the moment of (i, j), but just for this block
12889+
size_t nCount = 0;
12890+
T dfComoment = 0;
12891+
const T *padfI =
12892+
adfCurBlockPixelsAllBands.data() + i * nThisBlockPixelCount;
12893+
const T *padfJ =
12894+
adfCurBlockPixelsAllBands.data() + j * nThisBlockPixelCount;
12895+
12896+
// Use https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass
12897+
// for the current block
12898+
if (bNoneHasMaskOrNodata && bIsAllInteger)
1284612899
{
12847-
// Now compute the moment of (i, j), but just for this block
12848-
size_t nCount = 0;
12849-
T dfComoment = 0;
12850-
const T *padfI =
12851-
adfCurBlockPixelsAllBands.data() + i * nThisBlockPixelCount;
12852-
const T *padfJ =
12853-
adfCurBlockPixelsAllBands.data() + j * nThisBlockPixelCount;
12854-
12855-
// Use https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass
12856-
// for the current block
12857-
if (bNoneHasMaskOrNodata && bIsAllInteger)
12858-
{
12859-
// Most optimized code path: integer, no nodata, no mask
12900+
// Most optimized code path: integer, no nodata, no mask
1286012901
#ifdef HAVE_OPENMP_SIMD
1286112902
#pragma omp simd reduction(+ : dfComoment)
1286212903
#endif
12863-
for (size_t iPixel = 0; iPixel < nThisBlockPixelCount;
12864-
++iPixel)
12865-
{
12866-
dfComoment += padfI[iPixel] * padfJ[iPixel];
12867-
}
12868-
nCount = nThisBlockPixelCount;
12869-
}
12870-
else if (bNoneHasMaskOrNodata)
12904+
for (size_t iPixel = 0; iPixel < nThisBlockPixelCount; ++iPixel)
1287112905
{
12872-
// Floating-point code path with no nodata and no mask
12906+
dfComoment += padfI[iPixel] * padfJ[iPixel];
12907+
}
12908+
nCount = nThisBlockPixelCount;
12909+
}
12910+
else if (bNoneHasMaskOrNodata)
12911+
{
12912+
// Floating-point code path with no nodata and no mask
1287312913
#ifdef HAVE_OPENMP_SIMD
1287412914
#pragma omp simd reduction(+ : dfComoment)
1287512915
#endif
12876-
for (size_t iPixel = 0; iPixel < nThisBlockPixelCount;
12877-
++iPixel)
12878-
{
12879-
const T dfAcc = padfI[iPixel] * padfJ[iPixel];
12880-
const bool bIsValid = !std::isnan(dfAcc);
12881-
nCount += bIsValid;
12882-
dfComoment += bIsValid ? dfAcc : ZERO;
12883-
}
12884-
}
12885-
else if (!std::isnan(adfNoData[i]) && !std::isnan(adfNoData[j]))
12916+
for (size_t iPixel = 0; iPixel < nThisBlockPixelCount; ++iPixel)
1288612917
{
12887-
// Code path when there are both nodata values
12888-
const T shiftedNoDataI = adfNoData[i] - adfCurBlockMean[i];
12889-
const T shiftedNoDataJ = adfNoData[j] - adfCurBlockMean[j];
12918+
const T dfAcc = padfI[iPixel] * padfJ[iPixel];
12919+
const bool bIsValid = !std::isnan(dfAcc);
12920+
nCount += bIsValid;
12921+
dfComoment += bIsValid ? dfAcc : ZERO;
12922+
}
12923+
}
12924+
else if (!std::isnan(adfNoData[i]) && !std::isnan(adfNoData[j]))
12925+
{
12926+
// Code path when there are both nodata values
12927+
const T shiftedNoDataI = adfNoData[i] - adfCurBlockMean[i];
12928+
const T shiftedNoDataJ = adfNoData[j] - adfCurBlockMean[j];
1289012929

1289112930
#ifdef HAVE_OPENMP_SIMD
1289212931
#pragma omp simd reduction(+ : dfComoment)
1289312932
#endif
12894-
for (size_t iPixel = 0; iPixel < nThisBlockPixelCount;
12895-
++iPixel)
12896-
{
12897-
const T dfI = padfI[iPixel];
12898-
const T dfJ = padfJ[iPixel];
12899-
const T dfAcc = dfI * dfJ;
12900-
const bool bIsValid = !std::isnan(dfAcc) &&
12901-
dfI != shiftedNoDataI &&
12902-
dfJ != shiftedNoDataJ;
12903-
nCount += bIsValid;
12904-
dfComoment += bIsValid ? dfAcc : ZERO;
12905-
}
12906-
}
12907-
else
12933+
for (size_t iPixel = 0; iPixel < nThisBlockPixelCount; ++iPixel)
1290812934
{
12909-
// Generic code path
12910-
const T shiftedNoDataI = adfNoData[i] - adfCurBlockMean[i];
12911-
const T shiftedNoDataJ = adfNoData[j] - adfCurBlockMean[j];
12935+
const T dfI = padfI[iPixel];
12936+
const T dfJ = padfJ[iPixel];
12937+
const T dfAcc = dfI * dfJ;
12938+
const bool bIsValid = !std::isnan(dfAcc) &&
12939+
dfI != shiftedNoDataI &&
12940+
dfJ != shiftedNoDataJ;
12941+
nCount += bIsValid;
12942+
dfComoment += bIsValid ? dfAcc : ZERO;
12943+
}
12944+
}
12945+
else
12946+
{
12947+
// Generic code path
12948+
const T shiftedNoDataI = adfNoData[i] - adfCurBlockMean[i];
12949+
const T shiftedNoDataJ = adfNoData[j] - adfCurBlockMean[j];
1291212950

1291312951
#ifdef HAVE_OPENMP_SIMD
1291412952
#pragma omp simd reduction(+ : dfComoment)
1291512953
#endif
12916-
for (size_t iPixel = 0; iPixel < nThisBlockPixelCount;
12917-
++iPixel)
12918-
{
12919-
const T dfI = padfI[iPixel];
12920-
const T dfJ = padfJ[iPixel];
12921-
const T dfAcc = dfI * dfJ;
12922-
const bool bIsValid =
12923-
!std::isnan(dfAcc) && dfI != shiftedNoDataI &&
12924-
dfJ != shiftedNoDataJ &&
12925-
(!pabyCurBlockMask[i] ||
12926-
(*pabyCurBlockMask[i])[iPixel]) &&
12927-
(!pabyCurBlockMask[j] ||
12928-
(*pabyCurBlockMask[j])[iPixel]);
12929-
nCount += bIsValid;
12930-
dfComoment += bIsValid ? dfAcc : ZERO;
12931-
}
12932-
}
12933-
12934-
const auto idxInMatrixI =
12935-
static_cast<size_t>(i) * nBandCount + j;
12936-
const auto idxInMatrixJ =
12937-
static_cast<size_t>(j) * nBandCount + i;
12938-
12939-
// Update the total comoment using last formula of paragraph
12940-
// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online :
12941-
// CoMoment(A+B) = CoMoment(A) + CoMoment(B) +
12942-
// (mean_I(A) - mean_I(B)) *
12943-
// (mean_J(A) - mean_J(B)) *
12944-
// (count(A) * count(B)) / (count(A) + count(B))
12945-
//
12946-
// There might be a small gotcha in the fact that the set of
12947-
// pixels on which the means are computed is not always the
12948-
// same as the the one on which the comoment is computed, if
12949-
// pixels are not valid/invalid at the same indices among bands
12950-
// It is not obvious (to me) what should be the correct behavior.
12951-
// The current approach has the benefit to avoid recomputing
12952-
// the mean for each (i,j) tuple, but only for all i.
12953-
if (anCount[idxInMatrixI] == 0)
12954-
padfComomentMatrix[idxInMatrixI] = 0;
12955-
if (nCount > 0)
12954+
for (size_t iPixel = 0; iPixel < nThisBlockPixelCount; ++iPixel)
1295612955
{
12957-
padfComomentMatrix[idxInMatrixI] +=
12958-
static_cast<double>(dfComoment);
12959-
padfComomentMatrix[idxInMatrixI] +=
12960-
static_cast<double>(adfMean[idxInMatrixI] -
12961-
adfCurBlockMean[i]) *
12962-
static_cast<double>(adfMean[idxInMatrixJ] -
12963-
adfCurBlockMean[j]) *
12964-
(static_cast<double>(anCount[idxInMatrixI]) *
12965-
static_cast<double>(nCount) /
12966-
static_cast<double>(anCount[idxInMatrixI] + nCount));
12956+
const T dfI = padfI[iPixel];
12957+
const T dfJ = padfJ[iPixel];
12958+
const T dfAcc = dfI * dfJ;
12959+
const bool bIsValid = !std::isnan(dfAcc) &&
12960+
dfI != shiftedNoDataI &&
12961+
dfJ != shiftedNoDataJ &&
12962+
(!pabyCurBlockMask[i] ||
12963+
(*pabyCurBlockMask[i])[iPixel]) &&
12964+
(!pabyCurBlockMask[j] ||
12965+
(*pabyCurBlockMask[j])[iPixel]);
12966+
nCount += bIsValid;
12967+
dfComoment += bIsValid ? dfAcc : ZERO;
1296712968
}
12968-
anCount[idxInMatrixI] += nCount;
12969+
}
1296912970

12970-
// Update means
12971-
if (anCountMean[idxInMatrixI] + anCurBlockCount[i] > 0)
12972-
{
12973-
adfMean[idxInMatrixI] +=
12974-
(adfCurBlockMean[i] - adfMean[idxInMatrixI]) *
12975-
static_cast<T>(
12976-
static_cast<double>(anCurBlockCount[i]) /
12977-
static_cast<double>(anCountMean[idxInMatrixI] +
12978-
anCurBlockCount[i]));
12979-
}
12980-
anCountMean[idxInMatrixI] += anCurBlockCount[i];
12971+
const auto idxInMatrixI = static_cast<size_t>(i) * nBandCount + j;
12972+
const auto idxInMatrixJ = static_cast<size_t>(j) * nBandCount + i;
12973+
12974+
// Update the total comoment using last formula of paragraph
12975+
// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online :
12976+
// CoMoment(A+B) = CoMoment(A) + CoMoment(B) +
12977+
// (mean_I(A) - mean_I(B)) *
12978+
// (mean_J(A) - mean_J(B)) *
12979+
// (count(A) * count(B)) / (count(A) + count(B))
12980+
//
12981+
// There might be a small gotcha in the fact that the set of
12982+
// pixels on which the means are computed is not always the
12983+
// same as the the one on which the comoment is computed, if
12984+
// pixels are not valid/invalid at the same indices among bands
12985+
// It is not obvious (to me) what should be the correct behavior.
12986+
// The current approach has the benefit to avoid recomputing
12987+
// the mean for each (i,j) tuple, but only for all i.
12988+
if (anCount[idxInMatrixI] == 0)
12989+
padfComomentMatrix[idxInMatrixI] = 0;
12990+
if (nCount > 0)
12991+
{
12992+
padfComomentMatrix[idxInMatrixI] +=
12993+
static_cast<double>(dfComoment);
12994+
padfComomentMatrix[idxInMatrixI] +=
12995+
static_cast<double>(adfMean[idxInMatrixI] -
12996+
adfCurBlockMean[i]) *
12997+
static_cast<double>(adfMean[idxInMatrixJ] -
12998+
adfCurBlockMean[j]) *
12999+
(static_cast<double>(anCount[idxInMatrixI]) *
13000+
static_cast<double>(nCount) /
13001+
static_cast<double>(anCount[idxInMatrixI] + nCount));
13002+
}
13003+
anCount[idxInMatrixI] += nCount;
13004+
13005+
// Update means
13006+
if (anCountMean[idxInMatrixI] + anCurBlockCount[i] > 0)
13007+
{
13008+
adfMean[idxInMatrixI] +=
13009+
(adfCurBlockMean[i] - adfMean[idxInMatrixI]) *
13010+
static_cast<T>(
13011+
static_cast<double>(anCurBlockCount[i]) /
13012+
static_cast<double>(anCountMean[idxInMatrixI] +
13013+
anCurBlockCount[i]));
13014+
}
13015+
anCountMean[idxInMatrixI] += anCurBlockCount[i];
1298113016

12982-
if (idxInMatrixI != idxInMatrixJ)
13017+
if (idxInMatrixI != idxInMatrixJ)
13018+
{
13019+
if (anCountMean[idxInMatrixJ] + anCurBlockCount[j] > 0)
1298313020
{
12984-
if (anCountMean[idxInMatrixJ] + anCurBlockCount[j] > 0)
12985-
{
12986-
adfMean[idxInMatrixJ] +=
12987-
(adfCurBlockMean[j] - adfMean[idxInMatrixJ]) *
12988-
static_cast<T>(
12989-
static_cast<double>(anCurBlockCount[j]) /
12990-
static_cast<double>(anCountMean[idxInMatrixJ] +
12991-
anCurBlockCount[j]));
12992-
}
12993-
anCountMean[idxInMatrixJ] += anCurBlockCount[j];
13021+
adfMean[idxInMatrixJ] +=
13022+
(adfCurBlockMean[j] - adfMean[idxInMatrixJ]) *
13023+
static_cast<T>(
13024+
static_cast<double>(anCurBlockCount[j]) /
13025+
static_cast<double>(anCountMean[idxInMatrixJ] +
13026+
anCurBlockCount[j]));
1299413027
}
13028+
anCountMean[idxInMatrixJ] += anCurBlockCount[j];
1299513029
}
1299613030
}
1299713031

gdal.cmake

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -427,10 +427,21 @@ if(OGR_ENABLE_DRIVER_TAB AND
427427
option(OGR_ENABLE_DRIVER_TAB_PLUGIN "Set ON to build OGR MapInfo TAB and MIF/MID driver as plugin" ON)
428428
endif()
429429

430-
include(CheckCXXCompilerFlag)
431-
check_cxx_compiler_flag(-fopenmp-simd HAVE_OPENMP_SIMD)
432-
if (HAVE_OPENMP_SIMD)
433-
add_compile_options("-fopenmp-simd")
430+
find_package(OpenMP COMPONENTS CXX)
431+
if (OpenMP_CXX_VERSION VERSION_GREATER_EQUAL "2.0")
432+
add_compile_definitions(HAVE_OPENMP)
433+
add_compile_definitions(HAVE_OPENMP_SIMD)
434+
add_compile_options(${OpenMP_CXX_FLAGS})
435+
if (OpenMP_CXX_INCLUDE_DIRS)
436+
include_directories(${OpenMP_CXX_INCLUDE_DIRS})
437+
endif()
438+
else()
439+
include(CheckCXXCompilerFlag)
440+
check_cxx_compiler_flag(-fopenmp-simd HAVE_OPENMP_SIMD)
441+
add_compile_definitions(HAVE_OPENMP_SIMD)
442+
if (HAVE_OPENMP_SIMD)
443+
add_compile_options("-fopenmp-simd")
444+
endif()
434445
endif()
435446

436447
# Precompiled header

0 commit comments

Comments
 (0)