7070#include " ../sqlite/ogrsqliteexecutesql.h"
7171#endif
7272
73+ #ifdef HAVE_OPENMP
74+ #include <omp.h>
75+ #endif
76+
7377extern const swq_field_type SpecialFieldTypes[SPECIAL_FIELD_COUNT];
7478
7579CPL_C_START
@@ -12686,12 +12690,17 @@ static CPLErr ComputeInterBandCovarianceMatrixInternal(
1268612690
1268712691 // Allocate temporary matrices and vectors
1268812692 const auto nMatrixSize = static_cast <size_t >(nBandCount) * nBandCount;
12693+
12694+ const auto kMax = static_cast <size_t >(nBandCount) * (nBandCount + 1 ) / 2 ;
12695+ std::vector<std::pair<int , int >> anMapLinearIdxToIJ;
1268912696 try
1269012697 {
1269112698 anCount.resize (nMatrixSize);
1269212699 adfMean.resize (nMatrixSize);
1269312700 anCountMean.resize (nMatrixSize);
1269412701
12702+ anMapLinearIdxToIJ.resize (kMax );
12703+
1269512704 adfCurBlockPixelsAllBands.resize (nPixelsInBlock * nBandCount);
1269612705
1269712706 adfCurBlockMean.resize (nBandCount);
@@ -12708,6 +12717,18 @@ static CPLErr ComputeInterBandCovarianceMatrixInternal(
1270812717 return CE_Failure;
1270912718 }
1271012719
12720+ {
12721+ size_t nLinearIdx = 0 ;
12722+ for (int i = 0 ; i < nBandCount; ++i)
12723+ {
12724+ for (int j = i; j < nBandCount; ++j)
12725+ {
12726+ anMapLinearIdxToIJ[nLinearIdx] = {i, j};
12727+ ++nLinearIdx;
12728+ }
12729+ }
12730+ }
12731+
1271112732 // Fetch nodata values and mask bands
1271212733 bool bAllBandsSameMask = false ;
1271312734 bool bIsAllInteger = false ;
@@ -12767,6 +12788,21 @@ static CPLErr ComputeInterBandCovarianceMatrixInternal(
1276712788 cpl::div_round_up (poDS->GetRasterYSize (), nBlockYSize);
1276812789 uint64_t nCurIter = 0 ;
1276912790
12791+ #ifdef HAVE_OPENMP
12792+ int nNumThreads = 1 ;
12793+ if (nBandCount >= 100 )
12794+ {
12795+ const int nNumCPUs = CPLGetNumCPUs ();
12796+ nNumThreads = std::max (1 , nNumCPUs / 2 );
12797+ const char *pszThreads =
12798+ CPLGetConfigOption (" GDAL_NUM_THREADS" , nullptr );
12799+ if (pszThreads && !EQUAL (pszThreads, " ALL_CPUS" ))
12800+ {
12801+ nNumThreads = std::clamp (atoi (pszThreads), 1 , nNumCPUs);
12802+ }
12803+ }
12804+ #endif
12805+
1277012806 // Iterate over all blocks
1277112807 for (const auto &window : papoBands[panBandList[0 ] - 1 ]->IterateWindows ())
1277212808 {
@@ -12839,159 +12875,157 @@ static CPLErr ComputeInterBandCovarianceMatrixInternal(
1283912875 }
1284012876 }
1284112877
12842- for (int i = 0 ; i < nBandCount; ++i)
12878+ #ifdef HAVE_OPENMP
12879+ omp_set_num_threads (nNumThreads);
12880+
12881+ #pragma omp parallel for schedule(static)
12882+ #endif
12883+ for (size_t k = 0 ; k < kMax ; ++k)
1284312884 {
12844- // The covariance matrix is symmetric. So start at i
12845- for (int j = i; j < nBandCount; ++j)
12885+ int i, j;
12886+ std::tie (i, j) = anMapLinearIdxToIJ[k];
12887+
12888+ // Now compute the moment of (i, j), but just for this block
12889+ size_t nCount = 0 ;
12890+ T dfComoment = 0 ;
12891+ const T *padfI =
12892+ adfCurBlockPixelsAllBands.data () + i * nThisBlockPixelCount;
12893+ const T *padfJ =
12894+ adfCurBlockPixelsAllBands.data () + j * nThisBlockPixelCount;
12895+
12896+ // Use https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass
12897+ // for the current block
12898+ if (bNoneHasMaskOrNodata && bIsAllInteger)
1284612899 {
12847- // Now compute the moment of (i, j), but just for this block
12848- size_t nCount = 0 ;
12849- T dfComoment = 0 ;
12850- const T *padfI =
12851- adfCurBlockPixelsAllBands.data () + i * nThisBlockPixelCount;
12852- const T *padfJ =
12853- adfCurBlockPixelsAllBands.data () + j * nThisBlockPixelCount;
12854-
12855- // Use https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass
12856- // for the current block
12857- if (bNoneHasMaskOrNodata && bIsAllInteger)
12858- {
12859- // Most optimized code path: integer, no nodata, no mask
12900+ // Most optimized code path: integer, no nodata, no mask
1286012901#ifdef HAVE_OPENMP_SIMD
1286112902#pragma omp simd reduction(+ : dfComoment)
1286212903#endif
12863- for (size_t iPixel = 0 ; iPixel < nThisBlockPixelCount;
12864- ++iPixel)
12865- {
12866- dfComoment += padfI[iPixel] * padfJ[iPixel];
12867- }
12868- nCount = nThisBlockPixelCount;
12869- }
12870- else if (bNoneHasMaskOrNodata)
12904+ for (size_t iPixel = 0 ; iPixel < nThisBlockPixelCount; ++iPixel)
1287112905 {
12872- // Floating-point code path with no nodata and no mask
12906+ dfComoment += padfI[iPixel] * padfJ[iPixel];
12907+ }
12908+ nCount = nThisBlockPixelCount;
12909+ }
12910+ else if (bNoneHasMaskOrNodata)
12911+ {
12912+ // Floating-point code path with no nodata and no mask
1287312913#ifdef HAVE_OPENMP_SIMD
1287412914#pragma omp simd reduction(+ : dfComoment)
1287512915#endif
12876- for (size_t iPixel = 0 ; iPixel < nThisBlockPixelCount;
12877- ++iPixel)
12878- {
12879- const T dfAcc = padfI[iPixel] * padfJ[iPixel];
12880- const bool bIsValid = !std::isnan (dfAcc);
12881- nCount += bIsValid;
12882- dfComoment += bIsValid ? dfAcc : ZERO;
12883- }
12884- }
12885- else if (!std::isnan (adfNoData[i]) && !std::isnan (adfNoData[j]))
12916+ for (size_t iPixel = 0 ; iPixel < nThisBlockPixelCount; ++iPixel)
1288612917 {
12887- // Code path when there are both nodata values
12888- const T shiftedNoDataI = adfNoData[i] - adfCurBlockMean[i];
12889- const T shiftedNoDataJ = adfNoData[j] - adfCurBlockMean[j];
12918+ const T dfAcc = padfI[iPixel] * padfJ[iPixel];
12919+ const bool bIsValid = !std::isnan (dfAcc);
12920+ nCount += bIsValid;
12921+ dfComoment += bIsValid ? dfAcc : ZERO;
12922+ }
12923+ }
12924+ else if (!std::isnan (adfNoData[i]) && !std::isnan (adfNoData[j]))
12925+ {
12926+ // Code path when there are both nodata values
12927+ const T shiftedNoDataI = adfNoData[i] - adfCurBlockMean[i];
12928+ const T shiftedNoDataJ = adfNoData[j] - adfCurBlockMean[j];
1289012929
1289112930#ifdef HAVE_OPENMP_SIMD
1289212931#pragma omp simd reduction(+ : dfComoment)
1289312932#endif
12894- for (size_t iPixel = 0 ; iPixel < nThisBlockPixelCount;
12895- ++iPixel)
12896- {
12897- const T dfI = padfI[iPixel];
12898- const T dfJ = padfJ[iPixel];
12899- const T dfAcc = dfI * dfJ;
12900- const bool bIsValid = !std::isnan (dfAcc) &&
12901- dfI != shiftedNoDataI &&
12902- dfJ != shiftedNoDataJ;
12903- nCount += bIsValid;
12904- dfComoment += bIsValid ? dfAcc : ZERO;
12905- }
12906- }
12907- else
12933+ for (size_t iPixel = 0 ; iPixel < nThisBlockPixelCount; ++iPixel)
1290812934 {
12909- // Generic code path
12910- const T shiftedNoDataI = adfNoData[i] - adfCurBlockMean[i];
12911- const T shiftedNoDataJ = adfNoData[j] - adfCurBlockMean[j];
12935+ const T dfI = padfI[iPixel];
12936+ const T dfJ = padfJ[iPixel];
12937+ const T dfAcc = dfI * dfJ;
12938+ const bool bIsValid = !std::isnan (dfAcc) &&
12939+ dfI != shiftedNoDataI &&
12940+ dfJ != shiftedNoDataJ;
12941+ nCount += bIsValid;
12942+ dfComoment += bIsValid ? dfAcc : ZERO;
12943+ }
12944+ }
12945+ else
12946+ {
12947+ // Generic code path
12948+ const T shiftedNoDataI = adfNoData[i] - adfCurBlockMean[i];
12949+ const T shiftedNoDataJ = adfNoData[j] - adfCurBlockMean[j];
1291212950
1291312951#ifdef HAVE_OPENMP_SIMD
1291412952#pragma omp simd reduction(+ : dfComoment)
1291512953#endif
12916- for (size_t iPixel = 0 ; iPixel < nThisBlockPixelCount;
12917- ++iPixel)
12918- {
12919- const T dfI = padfI[iPixel];
12920- const T dfJ = padfJ[iPixel];
12921- const T dfAcc = dfI * dfJ;
12922- const bool bIsValid =
12923- !std::isnan (dfAcc) && dfI != shiftedNoDataI &&
12924- dfJ != shiftedNoDataJ &&
12925- (!pabyCurBlockMask[i] ||
12926- (*pabyCurBlockMask[i])[iPixel]) &&
12927- (!pabyCurBlockMask[j] ||
12928- (*pabyCurBlockMask[j])[iPixel]);
12929- nCount += bIsValid;
12930- dfComoment += bIsValid ? dfAcc : ZERO;
12931- }
12932- }
12933-
12934- const auto idxInMatrixI =
12935- static_cast <size_t >(i) * nBandCount + j;
12936- const auto idxInMatrixJ =
12937- static_cast <size_t >(j) * nBandCount + i;
12938-
12939- // Update the total comoment using last formula of paragraph
12940- // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online :
12941- // CoMoment(A+B) = CoMoment(A) + CoMoment(B) +
12942- // (mean_I(A) - mean_I(B)) *
12943- // (mean_J(A) - mean_J(B)) *
12944- // (count(A) * count(B)) / (count(A) + count(B))
12945- //
12946- // There might be a small gotcha in the fact that the set of
12947- // pixels on which the means are computed is not always the
12948- // same as the the one on which the comoment is computed, if
12949- // pixels are not valid/invalid at the same indices among bands
12950- // It is not obvious (to me) what should be the correct behavior.
12951- // The current approach has the benefit to avoid recomputing
12952- // the mean for each (i,j) tuple, but only for all i.
12953- if (anCount[idxInMatrixI] == 0 )
12954- padfComomentMatrix[idxInMatrixI] = 0 ;
12955- if (nCount > 0 )
12954+ for (size_t iPixel = 0 ; iPixel < nThisBlockPixelCount; ++iPixel)
1295612955 {
12957- padfComomentMatrix[idxInMatrixI] +=
12958- static_cast <double >(dfComoment);
12959- padfComomentMatrix[idxInMatrixI] +=
12960- static_cast <double >(adfMean[idxInMatrixI] -
12961- adfCurBlockMean[i]) *
12962- static_cast <double >(adfMean[idxInMatrixJ] -
12963- adfCurBlockMean[j]) *
12964- (static_cast <double >(anCount[idxInMatrixI]) *
12965- static_cast <double >(nCount) /
12966- static_cast <double >(anCount[idxInMatrixI] + nCount));
12956+ const T dfI = padfI[iPixel];
12957+ const T dfJ = padfJ[iPixel];
12958+ const T dfAcc = dfI * dfJ;
12959+ const bool bIsValid = !std::isnan (dfAcc) &&
12960+ dfI != shiftedNoDataI &&
12961+ dfJ != shiftedNoDataJ &&
12962+ (!pabyCurBlockMask[i] ||
12963+ (*pabyCurBlockMask[i])[iPixel]) &&
12964+ (!pabyCurBlockMask[j] ||
12965+ (*pabyCurBlockMask[j])[iPixel]);
12966+ nCount += bIsValid;
12967+ dfComoment += bIsValid ? dfAcc : ZERO;
1296712968 }
12968- anCount[idxInMatrixI] += nCount;
12969+ }
1296912970
12970- // Update means
12971- if (anCountMean[idxInMatrixI] + anCurBlockCount[i] > 0 )
12972- {
12973- adfMean[idxInMatrixI] +=
12974- (adfCurBlockMean[i] - adfMean[idxInMatrixI]) *
12975- static_cast <T>(
12976- static_cast <double >(anCurBlockCount[i]) /
12977- static_cast <double >(anCountMean[idxInMatrixI] +
12978- anCurBlockCount[i]));
12979- }
12980- anCountMean[idxInMatrixI] += anCurBlockCount[i];
12971+ const auto idxInMatrixI = static_cast <size_t >(i) * nBandCount + j;
12972+ const auto idxInMatrixJ = static_cast <size_t >(j) * nBandCount + i;
12973+
12974+ // Update the total comoment using last formula of paragraph
12975+ // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online :
12976+ // CoMoment(A+B) = CoMoment(A) + CoMoment(B) +
12977+ // (mean_I(A) - mean_I(B)) *
12978+ // (mean_J(A) - mean_J(B)) *
12979+ // (count(A) * count(B)) / (count(A) + count(B))
12980+ //
12981+ // There might be a small gotcha in the fact that the set of
12982+ // pixels on which the means are computed is not always the
12984+ // same as the one on which the comoment is computed, if
12984+ // pixels are not valid/invalid at the same indices among bands
12985+ // It is not obvious (to me) what should be the correct behavior.
12986+ // The current approach has the benefit to avoid recomputing
12987+ // the mean for each (i,j) tuple, but only for all i.
12988+ if (anCount[idxInMatrixI] == 0 )
12989+ padfComomentMatrix[idxInMatrixI] = 0 ;
12990+ if (nCount > 0 )
12991+ {
12992+ padfComomentMatrix[idxInMatrixI] +=
12993+ static_cast <double >(dfComoment);
12994+ padfComomentMatrix[idxInMatrixI] +=
12995+ static_cast <double >(adfMean[idxInMatrixI] -
12996+ adfCurBlockMean[i]) *
12997+ static_cast <double >(adfMean[idxInMatrixJ] -
12998+ adfCurBlockMean[j]) *
12999+ (static_cast <double >(anCount[idxInMatrixI]) *
13000+ static_cast <double >(nCount) /
13001+ static_cast <double >(anCount[idxInMatrixI] + nCount));
13002+ }
13003+ anCount[idxInMatrixI] += nCount;
13004+
13005+ // Update means
13006+ if (anCountMean[idxInMatrixI] + anCurBlockCount[i] > 0 )
13007+ {
13008+ adfMean[idxInMatrixI] +=
13009+ (adfCurBlockMean[i] - adfMean[idxInMatrixI]) *
13010+ static_cast <T>(
13011+ static_cast <double >(anCurBlockCount[i]) /
13012+ static_cast <double >(anCountMean[idxInMatrixI] +
13013+ anCurBlockCount[i]));
13014+ }
13015+ anCountMean[idxInMatrixI] += anCurBlockCount[i];
1298113016
12982- if (idxInMatrixI != idxInMatrixJ)
13017+ if (idxInMatrixI != idxInMatrixJ)
13018+ {
13019+ if (anCountMean[idxInMatrixJ] + anCurBlockCount[j] > 0 )
1298313020 {
12984- if (anCountMean[idxInMatrixJ] + anCurBlockCount[j] > 0 )
12985- {
12986- adfMean[idxInMatrixJ] +=
12987- (adfCurBlockMean[j] - adfMean[idxInMatrixJ]) *
12988- static_cast <T>(
12989- static_cast <double >(anCurBlockCount[j]) /
12990- static_cast <double >(anCountMean[idxInMatrixJ] +
12991- anCurBlockCount[j]));
12992- }
12993- anCountMean[idxInMatrixJ] += anCurBlockCount[j];
13021+ adfMean[idxInMatrixJ] +=
13022+ (adfCurBlockMean[j] - adfMean[idxInMatrixJ]) *
13023+ static_cast <T>(
13024+ static_cast <double >(anCurBlockCount[j]) /
13025+ static_cast <double >(anCountMean[idxInMatrixJ] +
13026+ anCurBlockCount[j]));
1299413027 }
13028+ anCountMean[idxInMatrixJ] += anCurBlockCount[j];
1299513029 }
1299613030 }
1299713031
0 commit comments