-
Notifications
You must be signed in to change notification settings - Fork 326
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add third data quality metric (#11939)
- closes #6332  
- Loading branch information
1 parent
caec1e6
commit 96b3a97
Showing
8 changed files
with
237 additions
and
54 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
66 changes: 66 additions & 0 deletions
66
...s/table/src/main/java/org/enso/table/data/column/operation/CountNonTrivialWhitespace.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
package org.enso.table.data.column.operation; | ||
|
||
import static org.enso.table.data.column.operation.SampleOperation.DEFAULT_SAMPLE_SIZE; | ||
import static org.enso.table.data.column.operation.SampleOperation.RANDOM_SEED; | ||
|
||
import java.util.Random; | ||
import org.enso.base.Text_Utils; | ||
import org.enso.table.data.column.storage.ColumnStorage; | ||
import org.enso.table.data.column.storage.StringStorage; | ||
import org.enso.table.data.table.Column; | ||
import org.graalvm.polyglot.Context; | ||
|
||
public class CountNonTrivialWhitespace { | ||
/** Counts the number of cells in the columns with non trivial whitespace */ | ||
public static Long apply(Column column, long sampleSize) throws InterruptedException { | ||
ColumnStorage storage = column.getStorage(); | ||
return applyToStorage(storage, sampleSize); | ||
} | ||
|
||
/** | ||
* Counts the number of cells in the given storage with non trivial whitespace | ||
* | ||
* @return | ||
*/ | ||
public static Long applyToStorage(ColumnStorage storage, long sampleSize) | ||
throws InterruptedException { | ||
return (sampleSize == DEFAULT_SAMPLE_SIZE && storage instanceof StringStorage stringStorage) | ||
? stringStorage.cachedWhitespaceCount() | ||
: (Long) compute(storage, sampleSize, Context.getCurrent()); | ||
} | ||
|
||
/** Internal method performing the calculation on a storage. */ | ||
public static long compute(ColumnStorage storage, long sampleSize, Context context) { | ||
long size = storage.getSize(); | ||
|
||
long count = 0; | ||
if (sampleSize < size) { | ||
var rng = new Random(RANDOM_SEED); | ||
for (int i = 0; i < sampleSize; i++) { | ||
long idx = rng.nextInt(Math.toIntExact(size)); | ||
var val = storage.getItemBoxed(idx); | ||
if (val instanceof String str && Text_Utils.has_non_trivial_whitespace(str)) { | ||
count++; | ||
} | ||
|
||
if (context != null) { | ||
context.safepoint(); | ||
} | ||
} | ||
count = Math.min(size, (long) Math.ceil((double) count / sampleSize * size)); | ||
} else { | ||
for (long i = 0; i < storage.getSize(); i++) { | ||
var val = storage.getItemBoxed(i); | ||
if (val instanceof String str && Text_Utils.has_non_trivial_whitespace(str)) { | ||
count++; | ||
} | ||
|
||
if (context != null) { | ||
context.safepoint(); | ||
} | ||
} | ||
} | ||
|
||
return count; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
12 changes: 12 additions & 0 deletions
12
std-bits/table/src/main/java/org/enso/table/data/column/operation/SampleOperation.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
package org.enso.table.data.column.operation; | ||
|
||
/** Base class for untrimmed and non-trivial whitespace counts. */
public class SampleOperation {

  /**
   * Default seed for random number generation. There is no specific reason for this value; it is
   * fixed only so that results are stable.
   */
  public static final long RANDOM_SEED = 677_280_131L;

  /** Default sample size for counting untrimmed cells. */
  public static final long DEFAULT_SAMPLE_SIZE = 10_000L;
}
Oops, something went wrong.