2828#include < new>
2929#include < set>
3030#include < string>
31+ #include < type_traits>
3132#include < utility>
3233
3334#include " cpl_conv.h"
7071#include " ../sqlite/ogrsqliteexecutesql.h"
7172#endif
7273
74+ #ifdef HAVE_OPENMP
75+ #include < omp.h>
76+ #endif
77+
7378extern const swq_field_type SpecialFieldTypes[SPECIAL_FIELD_COUNT];
7479
7580CPL_C_START
@@ -12686,12 +12691,19 @@ static CPLErr ComputeInterBandCovarianceMatrixInternal(
1268612691
1268712692 // Allocate temporary matrices and vectors
1268812693 const auto nMatrixSize = static_cast <size_t >(nBandCount) * nBandCount;
12694+
12695+ using MySignedSize_t = std::make_signed_t <size_t >;
12696+ const auto kMax =
12697+ static_cast <MySignedSize_t>(nBandCount) * (nBandCount + 1 ) / 2 ;
12698+ std::vector<std::pair<int , int >> anMapLinearIdxToIJ;
1268912699 try
1269012700 {
1269112701 anCount.resize (nMatrixSize);
1269212702 adfMean.resize (nMatrixSize);
1269312703 anCountMean.resize (nMatrixSize);
1269412704
12705+ anMapLinearIdxToIJ.resize (kMax );
12706+
1269512707 adfCurBlockPixelsAllBands.resize (nPixelsInBlock * nBandCount);
1269612708
1269712709 adfCurBlockMean.resize (nBandCount);
@@ -12708,6 +12720,18 @@ static CPLErr ComputeInterBandCovarianceMatrixInternal(
1270812720 return CE_Failure;
1270912721 }
1271012722
12723+ {
12724+ MySignedSize_t nLinearIdx = 0 ;
12725+ for (int i = 0 ; i < nBandCount; ++i)
12726+ {
12727+ for (int j = i; j < nBandCount; ++j)
12728+ {
12729+ anMapLinearIdxToIJ[nLinearIdx] = {i, j};
12730+ ++nLinearIdx;
12731+ }
12732+ }
12733+ }
12734+
1271112735 // Fetch nodata values and mask bands
1271212736 bool bAllBandsSameMask = false ;
1271312737 bool bIsAllInteger = false ;
@@ -12767,6 +12791,21 @@ static CPLErr ComputeInterBandCovarianceMatrixInternal(
1276712791 cpl::div_round_up (poDS->GetRasterYSize (), nBlockYSize);
1276812792 uint64_t nCurIter = 0 ;
1276912793
12794+ #ifdef HAVE_OPENMP
12795+ int nNumThreads = 1 ;
12796+ if (nBandCount >= 100 )
12797+ {
12798+ const int nNumCPUs = CPLGetNumCPUs ();
12799+ nNumThreads = std::max (1 , nNumCPUs / 2 );
12800+ const char *pszThreads =
12801+ CPLGetConfigOption (" GDAL_NUM_THREADS" , nullptr );
12802+ if (pszThreads && !EQUAL (pszThreads, " ALL_CPUS" ))
12803+ {
12804+ nNumThreads = std::clamp (atoi (pszThreads), 1 , nNumCPUs);
12805+ }
12806+ }
12807+ #endif
12808+
1277012809 // Iterate over all blocks
1277112810 for (const auto &window : papoBands[panBandList[0 ] - 1 ]->IterateWindows ())
1277212811 {
@@ -12839,159 +12878,159 @@ static CPLErr ComputeInterBandCovarianceMatrixInternal(
1283912878 }
1284012879 }
1284112880
12842- for (int i = 0 ; i < nBandCount; ++i)
12881+ #ifdef HAVE_OPENMP
12882+ omp_set_num_threads (nNumThreads);
12883+
12884+ #pragma omp parallel for schedule(static)
12885+ #endif
12886+ for (MySignedSize_t k = 0 ; k < kMax ; ++k)
1284312887 {
12844- // The covariance matrix is symmetric. So start at i
12845- for (int j = i; j < nBandCount; ++j)
12888+ int i, j;
12889+ std::tie (i, j) = anMapLinearIdxToIJ[k];
12890+
12891+ // Now compute the moment of (i, j), but just for this block
12892+ size_t nCount = 0 ;
12893+ T dfComoment = 0 ;
12894+ const T *padfI =
12895+ adfCurBlockPixelsAllBands.data () + i * nThisBlockPixelCount;
12896+ const T *padfJ =
12897+ adfCurBlockPixelsAllBands.data () + j * nThisBlockPixelCount;
12898+
12899+ // Use https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass
12900+ // for the current block
12901+ if ((anCurBlockCount[i] == nThisBlockPixelCount &&
12902+ anCurBlockCount[j] == nThisBlockPixelCount) ||
12903+ (bNoneHasMaskOrNodata && bIsAllInteger))
1284612904 {
12847- // Now compute the moment of (i, j), but just for this block
12848- size_t nCount = 0 ;
12849- T dfComoment = 0 ;
12850- const T *padfI =
12851- adfCurBlockPixelsAllBands.data () + i * nThisBlockPixelCount;
12852- const T *padfJ =
12853- adfCurBlockPixelsAllBands.data () + j * nThisBlockPixelCount;
12854-
12855- // Use https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass
12856- // for the current block
12857- if (bNoneHasMaskOrNodata && bIsAllInteger)
12858- {
12859- // Most optimized code path: integer, no nodata, no mask
12905+ // Most optimized code path: integer, no nodata, no mask
1286012906#ifdef HAVE_OPENMP_SIMD
1286112907#pragma omp simd reduction(+ : dfComoment)
1286212908#endif
12863- for (size_t iPixel = 0 ; iPixel < nThisBlockPixelCount;
12864- ++iPixel)
12865- {
12866- dfComoment += padfI[iPixel] * padfJ[iPixel];
12867- }
12868- nCount = nThisBlockPixelCount;
12869- }
12870- else if (bNoneHasMaskOrNodata)
12909+ for (size_t iPixel = 0 ; iPixel < nThisBlockPixelCount; ++iPixel)
1287112910 {
12872- // Floating-point code path with no nodata and no mask
12911+ dfComoment += padfI[iPixel] * padfJ[iPixel];
12912+ }
12913+ nCount = nThisBlockPixelCount;
12914+ }
12915+ else if (bNoneHasMaskOrNodata)
12916+ {
12917+ // Floating-point code path with no nodata and no mask
1287312918#ifdef HAVE_OPENMP_SIMD
1287412919#pragma omp simd reduction(+ : dfComoment)
1287512920#endif
12876- for (size_t iPixel = 0 ; iPixel < nThisBlockPixelCount;
12877- ++iPixel)
12878- {
12879- const T dfAcc = padfI[iPixel] * padfJ[iPixel];
12880- const bool bIsValid = !std::isnan (dfAcc);
12881- nCount += bIsValid;
12882- dfComoment += bIsValid ? dfAcc : ZERO;
12883- }
12884- }
12885- else if (!std::isnan (adfNoData[i]) && !std::isnan (adfNoData[j]))
12921+ for (size_t iPixel = 0 ; iPixel < nThisBlockPixelCount; ++iPixel)
1288612922 {
12887- // Code path when there are both nodata values
12888- const T shiftedNoDataI = adfNoData[i] - adfCurBlockMean[i];
12889- const T shiftedNoDataJ = adfNoData[j] - adfCurBlockMean[j];
12923+ const T dfAcc = padfI[iPixel] * padfJ[iPixel];
12924+ const bool bIsValid = !std::isnan (dfAcc);
12925+ nCount += bIsValid;
12926+ dfComoment += bIsValid ? dfAcc : ZERO;
12927+ }
12928+ }
12929+ else if (!std::isnan (adfNoData[i]) && !std::isnan (adfNoData[j]))
12930+ {
12931+ // Code path when there are both nodata values
12932+ const T shiftedNoDataI = adfNoData[i] - adfCurBlockMean[i];
12933+ const T shiftedNoDataJ = adfNoData[j] - adfCurBlockMean[j];
1289012934
1289112935#ifdef HAVE_OPENMP_SIMD
1289212936#pragma omp simd reduction(+ : dfComoment)
1289312937#endif
12894- for (size_t iPixel = 0 ; iPixel < nThisBlockPixelCount;
12895- ++iPixel)
12896- {
12897- const T dfI = padfI[iPixel];
12898- const T dfJ = padfJ[iPixel];
12899- const T dfAcc = dfI * dfJ;
12900- const bool bIsValid = !std::isnan (dfAcc) &&
12901- dfI != shiftedNoDataI &&
12902- dfJ != shiftedNoDataJ;
12903- nCount += bIsValid;
12904- dfComoment += bIsValid ? dfAcc : ZERO;
12905- }
12906- }
12907- else
12938+ for (size_t iPixel = 0 ; iPixel < nThisBlockPixelCount; ++iPixel)
1290812939 {
12909- // Generic code path
12910- const T shiftedNoDataI = adfNoData[i] - adfCurBlockMean[i];
12911- const T shiftedNoDataJ = adfNoData[j] - adfCurBlockMean[j];
12940+ const T dfI = padfI[iPixel];
12941+ const T dfJ = padfJ[iPixel];
12942+ const T dfAcc = dfI * dfJ;
12943+ const bool bIsValid = !std::isnan (dfAcc) &&
12944+ dfI != shiftedNoDataI &&
12945+ dfJ != shiftedNoDataJ;
12946+ nCount += bIsValid;
12947+ dfComoment += bIsValid ? dfAcc : ZERO;
12948+ }
12949+ }
12950+ else
12951+ {
12952+ // Generic code path
12953+ const T shiftedNoDataI = adfNoData[i] - adfCurBlockMean[i];
12954+ const T shiftedNoDataJ = adfNoData[j] - adfCurBlockMean[j];
1291212955
1291312956#ifdef HAVE_OPENMP_SIMD
1291412957#pragma omp simd reduction(+ : dfComoment)
1291512958#endif
12916- for (size_t iPixel = 0 ; iPixel < nThisBlockPixelCount;
12917- ++iPixel)
12918- {
12919- const T dfI = padfI[iPixel];
12920- const T dfJ = padfJ[iPixel];
12921- const T dfAcc = dfI * dfJ;
12922- const bool bIsValid =
12923- !std::isnan (dfAcc) && dfI != shiftedNoDataI &&
12924- dfJ != shiftedNoDataJ &&
12925- (!pabyCurBlockMask[i] ||
12926- (*pabyCurBlockMask[i])[iPixel]) &&
12927- (!pabyCurBlockMask[j] ||
12928- (*pabyCurBlockMask[j])[iPixel]);
12929- nCount += bIsValid;
12930- dfComoment += bIsValid ? dfAcc : ZERO;
12931- }
12932- }
12933-
12934- const auto idxInMatrixI =
12935- static_cast <size_t >(i) * nBandCount + j;
12936- const auto idxInMatrixJ =
12937- static_cast <size_t >(j) * nBandCount + i;
12938-
12939- // Update the total comoment using last formula of paragraph
12940- // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online :
12941- // CoMoment(A+B) = CoMoment(A) + CoMoment(B) +
12942- // (mean_I(A) - mean_I(B)) *
12943- // (mean_J(A) - mean_J(B)) *
12944- // (count(A) * count(B)) / (count(A) + count(B))
12945- //
12946- // There might be a small gotcha in the fact that the set of
12947- // pixels on which the means are computed is not always the
12948- // same as the the one on which the comoment is computed, if
12949- // pixels are not valid/invalid at the same indices among bands
12950- // It is not obvious (to me) what should be the correct behavior.
12951- // The current approach has the benefit to avoid recomputing
12952- // the mean for each (i,j) tuple, but only for all i.
12953- if (anCount[idxInMatrixI] == 0 )
12954- padfComomentMatrix[idxInMatrixI] = 0 ;
12955- if (nCount > 0 )
12959+ for (size_t iPixel = 0 ; iPixel < nThisBlockPixelCount; ++iPixel)
1295612960 {
12957- padfComomentMatrix[idxInMatrixI] +=
12958- static_cast <double >(dfComoment);
12959- padfComomentMatrix[idxInMatrixI] +=
12960- static_cast <double >(adfMean[idxInMatrixI] -
12961- adfCurBlockMean[i]) *
12962- static_cast <double >(adfMean[idxInMatrixJ] -
12963- adfCurBlockMean[j]) *
12964- (static_cast <double >(anCount[idxInMatrixI]) *
12965- static_cast <double >(nCount) /
12966- static_cast <double >(anCount[idxInMatrixI] + nCount));
12961+ const T dfI = padfI[iPixel];
12962+ const T dfJ = padfJ[iPixel];
12963+ const T dfAcc = dfI * dfJ;
12964+ const bool bIsValid = !std::isnan (dfAcc) &&
12965+ dfI != shiftedNoDataI &&
12966+ dfJ != shiftedNoDataJ &&
12967+ (!pabyCurBlockMask[i] ||
12968+ (*pabyCurBlockMask[i])[iPixel]) &&
12969+ (!pabyCurBlockMask[j] ||
12970+ (*pabyCurBlockMask[j])[iPixel]);
12971+ nCount += bIsValid;
12972+ dfComoment += bIsValid ? dfAcc : ZERO;
1296712973 }
12968- anCount[idxInMatrixI] += nCount;
12974+ }
1296912975
12970- // Update means
12971- if (anCountMean[idxInMatrixI] + anCurBlockCount[i] > 0 )
12972- {
12973- adfMean[idxInMatrixI] +=
12974- (adfCurBlockMean[i] - adfMean[idxInMatrixI]) *
12975- static_cast <T>(
12976- static_cast <double >(anCurBlockCount[i]) /
12977- static_cast <double >(anCountMean[idxInMatrixI] +
12978- anCurBlockCount[i]));
12979- }
12980- anCountMean[idxInMatrixI] += anCurBlockCount[i];
12976+ const auto idxInMatrixI = static_cast <size_t >(i) * nBandCount + j;
12977+ const auto idxInMatrixJ = static_cast <size_t >(j) * nBandCount + i;
12978+
12979+ // Update the total comoment using last formula of paragraph
12980+ // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online :
12981+ // CoMoment(A+B) = CoMoment(A) + CoMoment(B) +
12982+ // (mean_I(A) - mean_I(B)) *
12983+ // (mean_J(A) - mean_J(B)) *
12984+ // (count(A) * count(B)) / (count(A) + count(B))
12985+ //
12986+ // There might be a small gotcha in the fact that the set of
12987+ // pixels on which the means are computed is not always the
12988+ // same as the the one on which the comoment is computed, if
12989+ // pixels are not valid/invalid at the same indices among bands
12990+ // It is not obvious (to me) what should be the correct behavior.
12991+ // The current approach has the benefit to avoid recomputing
12992+ // the mean for each (i,j) tuple, but only for all i.
12993+ if (anCount[idxInMatrixI] == 0 )
12994+ padfComomentMatrix[idxInMatrixI] = 0 ;
12995+ if (nCount > 0 )
12996+ {
12997+ padfComomentMatrix[idxInMatrixI] +=
12998+ static_cast <double >(dfComoment);
12999+ padfComomentMatrix[idxInMatrixI] +=
13000+ static_cast <double >(adfMean[idxInMatrixI] -
13001+ adfCurBlockMean[i]) *
13002+ static_cast <double >(adfMean[idxInMatrixJ] -
13003+ adfCurBlockMean[j]) *
13004+ (static_cast <double >(anCount[idxInMatrixI]) *
13005+ static_cast <double >(nCount) /
13006+ static_cast <double >(anCount[idxInMatrixI] + nCount));
13007+ }
13008+ anCount[idxInMatrixI] += nCount;
13009+
13010+ // Update means
13011+ if (anCountMean[idxInMatrixI] + anCurBlockCount[i] > 0 )
13012+ {
13013+ adfMean[idxInMatrixI] +=
13014+ (adfCurBlockMean[i] - adfMean[idxInMatrixI]) *
13015+ static_cast <T>(
13016+ static_cast <double >(anCurBlockCount[i]) /
13017+ static_cast <double >(anCountMean[idxInMatrixI] +
13018+ anCurBlockCount[i]));
13019+ }
13020+ anCountMean[idxInMatrixI] += anCurBlockCount[i];
1298113021
12982- if (idxInMatrixI != idxInMatrixJ)
13022+ if (idxInMatrixI != idxInMatrixJ)
13023+ {
13024+ if (anCountMean[idxInMatrixJ] + anCurBlockCount[j] > 0 )
1298313025 {
12984- if (anCountMean[idxInMatrixJ] + anCurBlockCount[j] > 0 )
12985- {
12986- adfMean[idxInMatrixJ] +=
12987- (adfCurBlockMean[j] - adfMean[idxInMatrixJ]) *
12988- static_cast <T>(
12989- static_cast <double >(anCurBlockCount[j]) /
12990- static_cast <double >(anCountMean[idxInMatrixJ] +
12991- anCurBlockCount[j]));
12992- }
12993- anCountMean[idxInMatrixJ] += anCurBlockCount[j];
13026+ adfMean[idxInMatrixJ] +=
13027+ (adfCurBlockMean[j] - adfMean[idxInMatrixJ]) *
13028+ static_cast <T>(
13029+ static_cast <double >(anCurBlockCount[j]) /
13030+ static_cast <double >(anCountMean[idxInMatrixJ] +
13031+ anCurBlockCount[j]));
1299413032 }
13033+ anCountMean[idxInMatrixJ] += anCurBlockCount[j];
1299513034 }
1299613035 }
1299713036
0 commit comments