Skip to content

Commit

Permalink
Add third data quality metric (#11939)
Browse files Browse the repository at this point in the history
  • Loading branch information
marthasharkey authored Jan 30, 2025
1 parent caec1e6 commit 96b3a97
Show file tree
Hide file tree
Showing 8 changed files with 237 additions and 54 deletions.
19 changes: 17 additions & 2 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ polyglot java import org.enso.base.Time_Utils
polyglot java import org.enso.table.data.column.operation.cast.CastProblemAggregator
polyglot java import org.enso.table.data.column.operation.CountNothing
polyglot java import org.enso.table.data.column.operation.CountUntrimmed
polyglot java import org.enso.table.data.column.operation.CountNonTrivialWhitespace
polyglot java import org.enso.table.data.column.operation.SampleOperation
polyglot java import org.enso.table.data.column.operation.unary.DatePartOperation
polyglot java import org.enso.table.data.column.operation.unary.DateTruncateOperation
polyglot java import org.enso.table.data.column.operation.unary.IsEmptyOperation
Expand Down Expand Up @@ -2234,13 +2236,26 @@ type Column
Used for data quality indicator in Table Viz.
count_untrimmed : Integer -> Integer | Nothing
count_untrimmed self sample_size:Integer=Column.default_sample_size =
if (self.value_type == Value_Type.Mixed || self.value_type.is_text).not then Nothing else
if (Column.can_contain_text self.value_type).not then Nothing else
CountUntrimmed.apply self.java_column sample_size

## PRIVATE
Counts the number of text values with non trivial whitespace.
Used for data quality indicator in Table Viz.
count_non_trivial_whitespace : Integer -> Integer | Nothing
count_non_trivial_whitespace self sample_size:Integer=Column.default_sample_size =
if (Column.can_contain_text self.value_type).not then Nothing else
CountNonTrivialWhitespace.apply self.java_column sample_size

## PRIVATE
Determines if a value type is eligible for a text data quality count
private can_contain_text value_type:Value_Type -> Boolean =
value_type == Value_Type.Mixed || value_type.is_text

## PRIVATE
Default size for sampling data quality indicators.
default_sample_size -> Integer =
CountUntrimmed.DEFAULT_SAMPLE_SIZE
SampleOperation.DEFAULT_SAMPLE_SIZE

## GROUP Standard.Base.Metadata
ICON metadata
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -205,10 +205,11 @@ make_json_for_table dataframe max_rows all_rows_count include_index_col is_db_ta
child_label = ["child_label", "row"]
data_quality_metrics = if is_db_table then [] else
number_nothing = JS_Object.from_pairs [["name", "Count nothings"], ["percentage_value", columns.map .count_nothing]]
number_untrimmed = case all_rows_count > Column.default_sample_size of
False -> JS_Object.from_pairs [["name", "Count untrimmed whitespace"], ["percentage_value", columns.map .count_untrimmed]]
True -> JS_Object.from_pairs [["name", "Count untrimmed whitespace (sampled)"], ["percentage_value", columns.map .count_untrimmed]]
[number_nothing, number_untrimmed]
sampled_label = if all_rows_count > Column.default_sample_size then " (sampled)" else ""
number_untrimmed = JS_Object.from_pairs [["name", "Count untrimmed whitespace" + sampled_label], ["percentage_value", columns.map .count_untrimmed]]
number_non_triv = JS_Object.from_pairs [["name", "Count non trivial whitespace" + sampled_label], ["percentage_value", columns.map .count_non_trivial_whitespace]]
JS_Object.from_pairs
[number_nothing, number_untrimmed, number_non_triv]
pairs = [header, value_type, data, all_rows, has_index_col, links, ["data_quality_metrics", data_quality_metrics] ,["type", "Table"], child_label]
JS_Object.from_pairs pairs

Expand Down
79 changes: 59 additions & 20 deletions std-bits/base/src/main/java/org/enso/base/Text_Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,19 @@ public static boolean has_leading_trailing_whitespace(String s) {
if (trailing != null && is_all_whitespace(trailing)) {
return true;
}

return false;
}

/**
 * Checks if the string contains any non trivial whitespace, i.e. any character that ICU reports
 * as Unicode whitespace other than the plain ASCII space (U+0020).
 *
 * @param s the string to check
 * @return {@code true} if at least one whitespace character other than {@code ' '} occurs in
 *     {@code s}
 */
public static boolean has_non_trivial_whitespace(String s) {
  // Stay on the primitive IntStream: boxing every UTF-16 unit into a Character just to test it
  // is wasted allocation. The cheap comparison against ' ' goes first so that ordinary spaces
  // never reach the ICU property lookup.
  return s.chars().anyMatch(c -> c != ' ' && UCharacter.isUWhiteSpace(c));
}

/**
* Returns a new string containing characters starting at the given UTF-16 index.
*
Expand Down Expand Up @@ -240,8 +249,12 @@ public static int compare_normalized_ignoring_case(String a, String b, Locale lo
public static boolean contains(String string, String substring) {
// {@code StringSearch} does not handle empty strings as we would want, so we need these special
// cases.
if (substring.isEmpty()) return true;
if (string.isEmpty()) return false;
if (substring.isEmpty()) {
return true;
}
if (string.isEmpty()) {
return false;
}
StringSearch searcher = new StringSearch(substring, string);
return searcher.first() != StringSearch.DONE;
}
Expand All @@ -268,8 +281,12 @@ public static boolean ends_with(String string, String suffix) {
public static boolean contains_case_insensitive(String string, String substring, Locale locale) {
// {@code StringSearch} does not handle empty strings as we would want, so we need these special
// cases.
if (substring.isEmpty()) return true;
if (string.isEmpty()) return false;
if (substring.isEmpty()) {
return true;
}
if (string.isEmpty()) {
return false;
}

Fold fold = CaseFoldedString.caseFoldAlgorithmForLocale(locale);
StringSearch searcher = new StringSearch(fold.apply(substring), fold.apply(string));
Expand Down Expand Up @@ -335,12 +352,18 @@ public static String take_suffix(String str, long grapheme_length) {
* @return a UTF-16 code unit span of the first needle or null if not found.
*/
public static Utf16Span span_of(String haystack, String needle) {
if (needle.isEmpty()) return new Utf16Span(0, 0);
if (haystack.isEmpty()) return null;
if (needle.isEmpty()) {
return new Utf16Span(0, 0);
}
if (haystack.isEmpty()) {
return null;
}

StringSearch search = new StringSearch(needle, haystack);
int pos = search.first();
if (pos == StringSearch.DONE) return null;
if (pos == StringSearch.DONE) {
return null;
}
return new Utf16Span(pos, pos + search.getMatchLength());
}

Expand All @@ -356,11 +379,15 @@ public static Utf16Span last_span_of(String haystack, String needle) {
int afterLast = haystack.length();
return new Utf16Span(afterLast, afterLast);
}
if (haystack.isEmpty()) return null;
if (haystack.isEmpty()) {
return null;
}

StringSearch search = new StringSearch(needle, haystack);
int pos = search.last();
if (pos == StringSearch.DONE) return null;
if (pos == StringSearch.DONE) {
return null;
}
return new Utf16Span(pos, pos + search.getMatchLength());
}

Expand All @@ -372,10 +399,13 @@ public static Utf16Span last_span_of(String haystack, String needle) {
* @return a list of UTF-16 code unit spans at which the needle occurs in the haystack
*/
public static List<Utf16Span> span_of_all(String haystack, String needle) {
if (needle.isEmpty())
if (needle.isEmpty()) {
throw new IllegalArgumentException(
"The operation `span_of_all` does not support searching for an empty term.");
if (haystack.isEmpty()) return List.of();
}
if (haystack.isEmpty()) {
return List.of();
}

StringSearch search = new StringSearch(needle, haystack);
ArrayList<Utf16Span> occurrences = new ArrayList<>();
Expand All @@ -396,10 +426,13 @@ public static List<Utf16Span> span_of_all(String haystack, String needle) {
* @return a list of UTF-16 code unit spans at which the needle occurs in the haystack
*/
public static List<Utf16Span> span_of_all_multiple(String haystack, List<String> needles) {
if (needles.isEmpty() || needles.stream().anyMatch(String::isEmpty))
if (needles.isEmpty() || needles.stream().anyMatch(String::isEmpty)) {
throw new IllegalArgumentException(
"The operation `span_of_all_multiple` does not support searching for an empty term.");
if (haystack.isEmpty()) return List.of();
}
if (haystack.isEmpty()) {
return List.of();
}

StringSearch stringSearches[] =
IntStream.range(0, needles.size())
Expand Down Expand Up @@ -514,10 +547,13 @@ public static long[] utf16_indices_to_grapheme_indices(String text, List<Long> c
*/
public static GraphemeSpan span_of_case_insensitive(
String haystack, String needle, Locale locale, boolean searchForLast) {
if (needle.isEmpty())
if (needle.isEmpty()) {
throw new IllegalArgumentException(
"The operation `span_of_case_insensitive` does not support searching for an empty term.");
if (haystack.isEmpty()) return null;
}
if (haystack.isEmpty()) {
return null;
}

CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale);
String foldedNeedle = CaseFoldedString.simpleFold(needle, locale);
Expand Down Expand Up @@ -545,11 +581,14 @@ public static GraphemeSpan span_of_case_insensitive(
*/
public static List<GraphemeSpan> span_of_all_case_insensitive(
String haystack, String needle, Locale locale) {
if (needle.isEmpty())
if (needle.isEmpty()) {
throw new IllegalArgumentException(
"The operation `span_of_all_case_insensitive` does not support searching for an empty"
+ " term.");
if (haystack.isEmpty()) return List.of();
}
if (haystack.isEmpty()) {
return List.of();
}

CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale);
String foldedNeedle = CaseFoldedString.simpleFold(needle, locale);
Expand Down Expand Up @@ -647,11 +686,11 @@ public static String normalize(String str) {
/**
* Normalizes the string to its canonical Unicode form using the specified name and mode.
*
* @param name the normalization name, must be "nfc", "nfkc", or "nfkc_cf"
* @param mode the normalization mode
* @see https://unicode-org.github.io/icu-docs/apidoc/dev/icu4j/com/ibm/icu/text/Normalizer2.html
* @see
* https://unicode-org.github.io/icu-docs/apidoc/dev/icu4j/com/ibm/icu/text/Normalizer2.Mode.html
* @param name the normalization name, must be "nfc", "nfkc", or "nfkc_cf"
* @param mode the normalization mode
*/
public static String normalizeWithMode(String str, String name, Mode mode) {
return Normalizer2.getInstance(null, name, mode).normalize(str);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package org.enso.table.data.column.operation;

import static org.enso.table.data.column.operation.SampleOperation.DEFAULT_SAMPLE_SIZE;
import static org.enso.table.data.column.operation.SampleOperation.RANDOM_SEED;

import java.util.Random;
import org.enso.base.Text_Utils;
import org.enso.table.data.column.storage.ColumnStorage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.data.table.Column;
import org.graalvm.polyglot.Context;

public class CountNonTrivialWhitespace {
  /**
   * Counts the number of cells in the column with non trivial whitespace.
   *
   * @param column the column whose storage is inspected
   * @param sampleSize the number of cells to sample
   * @return the count for the column's storage, see {@link #applyToStorage}
   */
  public static Long apply(Column column, long sampleSize) throws InterruptedException {
    return applyToStorage(column.getStorage(), sampleSize);
  }

  /**
   * Counts the number of cells in the given storage with non trivial whitespace.
   *
   * <p>When the default sample size is requested and the storage is a {@link StringStorage}, the
   * value comes from {@code cachedWhitespaceCount()}; in every other case it is computed directly
   * via {@link #compute}.
   *
   * @param storage the storage to inspect
   * @param sampleSize the number of cells to sample
   * @return the cached or freshly computed count
   */
  public static Long applyToStorage(ColumnStorage storage, long sampleSize)
      throws InterruptedException {
    if (sampleSize == DEFAULT_SAMPLE_SIZE && storage instanceof StringStorage stringStorage) {
      return stringStorage.cachedWhitespaceCount();
    }
    return compute(storage, sampleSize, Context.getCurrent());
  }

  /**
   * Internal method performing the calculation on a storage.
   *
   * <p>If {@code sampleSize} is smaller than the storage size, {@code sampleSize} randomly chosen
   * cells (seeded with {@code RANDOM_SEED}, drawn with replacement) are inspected and the hit
   * count is scaled up to the full size, rounded up and capped at the size. Otherwise every cell
   * is inspected and the exact count is returned.
   *
   * @param storage the storage to inspect
   * @param sampleSize the number of cells to sample
   * @param context the polyglot context to poll for safepoints, or {@code null} to skip polling
   * @return the exact or extrapolated number of cells with non trivial whitespace
   */
  public static long compute(ColumnStorage storage, long sampleSize, Context context) {
    long size = storage.getSize();

    if (sampleSize >= size) {
      // The sample would cover everything anyway: scan each cell once for an exact count.
      long exact = 0;
      for (long i = 0; i < size; i++) {
        if (isNonTrivial(storage.getItemBoxed(i))) {
          exact++;
        }
        pollSafepoint(context);
      }
      return exact;
    }

    var rng = new Random(RANDOM_SEED);
    long hits = 0;
    for (long i = 0; i < sampleSize; i++) {
      long idx = rng.nextInt(Math.toIntExact(size));
      if (isNonTrivial(storage.getItemBoxed(idx))) {
        hits++;
      }
      pollSafepoint(context);
    }
    // Extrapolate the sampled ratio to the whole storage, never reporting more than its size.
    return Math.min(size, (long) Math.ceil((double) hits / sampleSize * size));
  }

  /** Tells whether a cell value is a string containing non trivial whitespace. */
  private static boolean isNonTrivial(Object value) {
    return value instanceof String str && Text_Utils.has_non_trivial_whitespace(str);
  }

  /** Polls the context for a safepoint when a context is available. */
  private static void pollSafepoint(Context context) {
    if (context != null) {
      context.safepoint();
    }
  }
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
package org.enso.table.data.column.operation;

import static org.enso.table.data.column.operation.SampleOperation.DEFAULT_SAMPLE_SIZE;
import static org.enso.table.data.column.operation.SampleOperation.RANDOM_SEED;

import java.util.Random;
import org.enso.base.Text_Utils;
import org.enso.table.data.column.storage.ColumnStorage;
Expand All @@ -8,13 +11,6 @@
import org.graalvm.polyglot.Context;

public class CountUntrimmed {
// Default seed for random number generation (no specific reason for this value, just stability on
// result).
private static final long RANDOM_SEED = 677280131;

// Default sample size for counting untrimmed cells.
public static final long DEFAULT_SAMPLE_SIZE = 10000;

/** Counts the number of cells in the columns with leading or trailing whitespace. */
public static Long apply(Column column, long sampleSize) throws InterruptedException {
var storage = column.getStorage();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package org.enso.table.data.column.operation;

/** Constants shared by the sampled untrimmed and non-trivial whitespace counts. */
public class SampleOperation {

  /** Default number of cells to sample when computing a sampled count. */
  public static final long DEFAULT_SAMPLE_SIZE = 10_000L;

  /**
   * Seed for random number generation. The value itself has no special meaning; it is fixed only
   * so that results are stable.
   */
  public static final long RANDOM_SEED = 677_280_131L;
}
Loading

0 comments on commit 96b3a97

Please sign in to comment.