From f694c3f308a4aba01d796ef2554ecf75695c7016 Mon Sep 17 00:00:00 2001 From: Torsten Sprenger Date: Mon, 10 Feb 2025 21:15:10 +0100 Subject: [PATCH 1/3] Add geomean aggregation function --- docs/api/sql/aggregate-functions.md | 6 ++++++ .../core/src/preagg/sufficient-statistics.js | 19 ++++++++++++++++++- packages/core/test/preaggregator.test.js | 9 ++++++++- packages/spec/src/config/transforms.js | 1 + packages/spec/src/spec/Transform.ts | 8 ++++++++ packages/sql/src/functions/aggregate.js | 9 +++++++++ packages/sql/src/index.js | 2 +- packages/sql/test/aggregate.test.js | 6 +++++- packages/vgplot/src/api.js | 1 + 9 files changed, 57 insertions(+), 4 deletions(-) diff --git a/docs/api/sql/aggregate-functions.md b/docs/api/sql/aggregate-functions.md index 48e0a01e..4bd73c26 100644 --- a/docs/api/sql/aggregate-functions.md +++ b/docs/api/sql/aggregate-functions.md @@ -99,6 +99,12 @@ Create an aggregate function that calculates the sum of the input _expression_. Create an aggregate function that calculates the product of the input _expression_. +## geomean + +`geomean(expression)` + +Create an aggregate function that calculates the geometric mean of the input _expression_. + ## median `median(expression)` diff --git a/packages/core/src/preagg/sufficient-statistics.js b/packages/core/src/preagg/sufficient-statistics.js index 25f167ee..f73cdb67 100644 --- a/packages/core/src/preagg/sufficient-statistics.js +++ b/packages/core/src/preagg/sufficient-statistics.js @@ -1,4 +1,4 @@ -import { AggregateNode, and, argmax, argmin, count, div, ExprNode, isNotNull, max, min, mul, pow, regrAvgX, regrAvgY, regrCount, sql, sqrt, sub, sum } from '@uwdata/mosaic-sql'; +import { AggregateNode, and, argmax, argmin, count, div, ExprNode, isNotNull, max, min, mul, pow, product, regrAvgX, regrAvgY, regrCount, sql, sqrt, sub, sum } from '@uwdata/mosaic-sql'; import { fnv_hash } from '../util/hash.js'; /** @@ -18,6 +18,8 @@ export function sufficientStatistics(node, preagg, avg) { return sumExpr(preagg, node); case 'avg': return avgExpr(preagg, node); + case 'geomean': + return geomeanExpr(preagg, node); case 'arg_max': return argmaxExpr(preagg, node); case 'arg_min': @@ -155,6 +157,21 @@ function avgExpr(preagg, node) { return div(sum(mul(as, name)), expr); } +/** + * Generate an expression for calculating geometric means over data dimensions. + * As a side effect, this method adds a column to the input *preagg* object + * to track the count of non-null values per-partition. + * @param {Record} preagg A map of columns (such as + * sufficient statistics) to pre-aggregate. + * @param {AggregateNode} [node] The originating aggregate function call. + * @returns {ExprNode} An aggregate expression over pre-aggregated dimensions. + */ +function geomeanExpr(preagg, node) { + const as = addStat(preagg, node); + const { expr, name } = countExpr(preagg, node); + return pow(product(pow(as, name)), div(1, expr)); +} + /** * Generate an expression for calculating argmax over data dimensions. * As a side effect, this method adds a column to the input *preagg* object diff --git a/packages/core/test/preaggregator.test.js b/packages/core/test/preaggregator.test.js index 6f0704f5..8593fa26 100644 --- a/packages/core/test/preaggregator.test.js +++ b/packages/core/test/preaggregator.test.js @@ -1,5 +1,5 @@ import { describe, it, expect } from 'vitest'; -import { Query, add, argmax, argmin, avg, corr, count, covarPop, covariance, gt, isNotDistinct, literal, loadObjects, max, min, product, regrAvgX, regrAvgY, regrCount, regrIntercept, regrR2, regrSXX, regrSXY, regrSYY, regrSlope, stddev, stddevPop, sum, varPop, variance } from '@uwdata/mosaic-sql'; +import { Query, add, argmax, argmin, avg, corr, count, covarPop, covariance, geomean, gt, isNotDistinct, literal, loadObjects, max, min, product, regrAvgX, regrAvgY, regrCount, regrIntercept, regrR2, regrSXX, regrSXY, regrSYY, regrSlope, stddev, stddevPop, sum, varPop, variance } from '@uwdata/mosaic-sql'; import { Coordinator, Selection } from '../src/index.js'; import { nodeConnector } from './util/node-connector.js'; import { TestClient } from './util/test-client.js'; @@ -61,6 +61,13 @@ describe('PreAggregator', () => { expect(await run(avg('x'))).toStrictEqual([3.5, true]); }); + it('supports geomean aggregate', async () => { + const [result, optimized] = await run(geomean('x')); + + expect(result).toBeCloseTo(Math.pow(12, 1 / 2), 10); + expect(optimized).toBe(true); + }); + it('supports min aggregate', async () => { expect(await run(min('x'))).toStrictEqual([3, true]); }); diff --git a/packages/spec/src/config/transforms.js b/packages/spec/src/config/transforms.js index 680e9c6c..f09d6beb 100644 --- a/packages/spec/src/config/transforms.js +++ b/packages/spec/src/config/transforms.js @@ -18,6 +18,7 @@ export function transformNames(overrides = []) { 'dateMonthDay', 'dateDay', 'first', + 'geomean', 'geojson', 'last', 'max', diff --git a/packages/spec/src/spec/Transform.ts b/packages/spec/src/spec/Transform.ts index c7a78e4e..432bd216 100644 --- a/packages/spec/src/spec/Transform.ts +++ b/packages/spec/src/spec/Transform.ts @@ -229,6 +229,14 @@ export interface First extends AggregateOptions, WindowOptions { first: Arg1; } +/* A geometric mean aggregate transform. */ +export interface Geomean extends AggregateOptions, WindowOptions { + /** + * Compute the geometric mean value of the given column. + */ + first: Arg1; +} + /* A last aggregate transform. */ export interface Last extends AggregateOptions, WindowOptions { /** diff --git a/packages/sql/src/functions/aggregate.js b/packages/sql/src/functions/aggregate.js index f602c4a8..4cff1dcd 100644 --- a/packages/sql/src/functions/aggregate.js +++ b/packages/sql/src/functions/aggregate.js @@ -98,6 +98,15 @@ export function first(expr) { return aggFn('first', expr); } +/** + * Compute a geomean aggregate. + * @param {import('../types.js').ExprValue} expr The expression to aggregate. + * @returns {AggregateNode} A SQL aggregate function call. + */ +export function geomean(expr) { + return aggFn('geomean', expr); +} + /** * Compute a sample kurtosis aggregate. * @param {import('../types.js').ExprValue} expr The expression to aggregate. diff --git a/packages/sql/src/index.js b/packages/sql/src/index.js index 015373b2..c5f588f0 100644 --- a/packages/sql/src/index.js +++ b/packages/sql/src/index.js @@ -24,7 +24,7 @@ export { VerbatimNode } from './ast/verbatim.js'; export { WindowClauseNode, WindowDefNode, WindowFrameNode, WindowFunctionNode, WindowNode } from './ast/window.js'; export { WithClauseNode } from './ast/with.js'; -export { argmax, argmin, arrayAgg, avg, corr, count, covariance, covarPop, entropy, first, kurtosis, mad, max, median, min, mode, last, product, quantile, regrAvgX, regrAvgY, regrCount, regrIntercept, regrR2, regrSXX, regrSXY, regrSYY, regrSlope, skewness, stddev, stddevPop, stringAgg, sum, variance, varPop } from './functions/aggregate.js'; +export { argmax, argmin, arrayAgg, avg, corr, count, covariance, covarPop, entropy, first, geomean, kurtosis, mad, max, median, min, mode, last, product, quantile, regrAvgX, regrAvgY, regrCount, regrIntercept, regrR2, regrSXX, regrSXY, regrSYY, regrSlope, skewness, stddev, stddevPop, stringAgg, sum, variance, varPop } from './functions/aggregate.js'; export { cond } from './functions/case.js'; export { cast, float32, float64, int32 } from './functions/cast.js'; export { column } from './functions/column.js'; diff --git a/packages/sql/test/aggregate.test.js b/packages/sql/test/aggregate.test.js index 5395d80d..c8d72ed1 100644 --- a/packages/sql/test/aggregate.test.js +++ b/packages/sql/test/aggregate.test.js @@ -1,6 +1,6 @@ import { expect, describe, it } from 'vitest'; import { columns } from './util/columns.js'; -import { argmax, argmin, arrayAgg, avg, column, corr, count, covariance, covarPop, entropy, first, gt, kurtosis, last, mad, max, median, min, mode, product, quantile, regrAvgX, regrAvgY, regrCount, regrIntercept, regrR2, regrSlope, regrSXX, regrSXY, regrSYY, skewness, stddev, stddevPop, stringAgg, sum, variance, varPop } from '../src/index.js'; +import { argmax, argmin, arrayAgg, avg, column, corr, count, covariance, covarPop, entropy, first, geomean, gt, kurtosis, last, mad, max, median, min, mode, product, quantile, regrAvgX, regrAvgY, regrCount, regrIntercept, regrR2, regrSlope, regrSXX, regrSXY, regrSYY, skewness, stddev, stddevPop, stringAgg, sum, variance, varPop } from '../src/index.js'; describe('Aggregate functions', () => { it('include accessible metadata', () => { @@ -63,6 +63,10 @@ describe('Aggregate functions', () => { expect(String(first('foo'))).toBe('first("foo")'); }); + it('include geomean', () => { + expect(String(geomean('foo'))).toBe('geomean("foo")'); + }); + it('include kurtosis', () => { expect(String(kurtosis('foo'))).toBe('kurtosis("foo")'); }); diff --git a/packages/vgplot/src/api.js b/packages/vgplot/src/api.js index 9291bda0..1732a28e 100644 --- a/packages/vgplot/src/api.js +++ b/packages/vgplot/src/api.js @@ -28,6 +28,7 @@ export { covarPop, entropy, first, + geomean, kurtosis, mad, max, From 8b7fc53bdf773f4d11f291d4090796216bb343fe Mon Sep 17 00:00:00 2001 From: Torsten Sprenger Date: Mon, 10 Feb 2025 21:50:41 +0100 Subject: [PATCH 2/3] Fix geomean in Transform.ts --- packages/spec/src/spec/Transform.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/spec/src/spec/Transform.ts b/packages/spec/src/spec/Transform.ts index 432bd216..4cd9099c 100644 --- a/packages/spec/src/spec/Transform.ts +++ b/packages/spec/src/spec/Transform.ts @@ -234,7 +234,7 @@ export interface Geomean extends AggregateOptions, WindowOptions { /** * Compute the geometric mean value of the given column. */ - first: Arg1; + geomean: Arg1; } /* A last aggregate transform. */ From a14853b42a1d10ee2b59431137b0478a9b1aa9a4 Mon Sep 17 00:00:00 2001 From: Torsten Sprenger Date: Tue, 11 Feb 2025 05:50:00 +0100 Subject: [PATCH 3/3] Use logarithmic approach to compute geomean --- .../core/src/preagg/sufficient-statistics.js | 17 ++++++++++------- packages/core/test/preaggregator.test.js | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/packages/core/src/preagg/sufficient-statistics.js b/packages/core/src/preagg/sufficient-statistics.js index f73cdb67..8cb8976f 100644 --- a/packages/core/src/preagg/sufficient-statistics.js +++ b/packages/core/src/preagg/sufficient-statistics.js @@ -1,4 +1,4 @@ -import { AggregateNode, and, argmax, argmin, count, div, ExprNode, isNotNull, max, min, mul, pow, product, regrAvgX, regrAvgY, regrCount, sql, sqrt, sub, sum } from '@uwdata/mosaic-sql'; +import { AggregateNode, and, argmax, argmin, count, div, exp, ExprNode, isNotNull, ln, max, min, mul, pow, regrAvgX, regrAvgY, regrCount, sql, sqrt, sub, sum } from '@uwdata/mosaic-sql'; import { fnv_hash } from '../util/hash.js'; /** @@ -159,17 +159,20 @@ function avgExpr(preagg, node) { /** * Generate an expression for calculating geometric means over data dimensions. - * As a side effect, this method adds a column to the input *preagg* object - * to track the count of non-null values per-partition. + * This method uses log-based computations to ensure numerical stability. The + * geomean calculation uses two sufficient statistics: the sum of log values + * and the count of non-null values. As a side effect, this method adds columns + * for these statistics to the input *preagg* object. * @param {Record} preagg A map of columns (such as * sufficient statistics) to pre-aggregate. - * @param {AggregateNode} [node] The originating aggregate function call. + * @param {AggregateNode} node The originating aggregate function call. * @returns {ExprNode} An aggregate expression over pre-aggregated dimensions. */ function geomeanExpr(preagg, node) { - const as = addStat(preagg, node); - const { expr, name } = countExpr(preagg, node); - return pow(product(pow(as, name)), div(1, expr)); + const x = node.args[0]; + const expr = addStat(preagg, sum(ln(x)), node); + const { expr: n } = countExpr(preagg, node); + return exp(div(sum(expr), n)); } /** diff --git a/packages/core/test/preaggregator.test.js b/packages/core/test/preaggregator.test.js index 8593fa26..928a4a58 100644 --- a/packages/core/test/preaggregator.test.js +++ b/packages/core/test/preaggregator.test.js @@ -64,7 +64,7 @@ describe('PreAggregator', () => { it('supports geomean aggregate', async () => { const [result, optimized] = await run(geomean('x')); - expect(result).toBeCloseTo(Math.pow(12, 1 / 2), 10); + expect(result).toBeCloseTo(Math.sqrt(12), 10); expect(optimized).toBe(true); });