Skip to content

Commit

Permalink
Improved fallback mechanism of FastDoubleParser to take into account …
Browse files Browse the repository at this point in the history
…all other locales, not just ROOT. Finished parse tests
  • Loading branch information
Jolanrensen committed Feb 11, 2025
1 parent 6594189 commit 89e0d41
Show file tree
Hide file tree
Showing 4 changed files with 255 additions and 99 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ import org.jetbrains.kotlinx.dataframe.impl.api.toLocalDateTime
import org.jetbrains.kotlinx.dataframe.impl.api.toLocalTime
import org.jetbrains.kotlinx.dataframe.impl.api.withRowCellImpl
import org.jetbrains.kotlinx.dataframe.impl.headPlusArray
import org.jetbrains.kotlinx.dataframe.io.toDataFrame
import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser
import org.jetbrains.kotlinx.dataframe.io.toDataFrame
import java.math.BigDecimal
import java.math.BigInteger
import java.net.URL
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion
import org.jetbrains.kotlinx.dataframe.columns.size
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException
import org.jetbrains.kotlinx.dataframe.hasNulls
import org.jetbrains.kotlinx.dataframe.impl.asNullable
import org.jetbrains.kotlinx.dataframe.impl.canParse
import org.jetbrains.kotlinx.dataframe.impl.catchSilent
import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import ch.randelshofer.fastdoubleparser.NumberFormatSymbols
import io.github.oshai.kotlinlogging.KotlinLogging
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
import org.jetbrains.kotlinx.dataframe.api.parser
import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
import java.nio.charset.Charset
import java.text.DecimalFormatSymbols
Expand All @@ -15,19 +14,24 @@ import java.util.Locale

private val logger = KotlinLogging.logger {}

// (lowercase) strings that are recognized to represent infinity and NaN in doubles in all locales
private val INFINITIES = arrayOf("", "inf", "infinity", "infty")
private val PLUS_INFINITIES = INFINITIES.map { "+$it" }
private val MINUS_INFINITIES = INFINITIES.map { "-$it" }
private val NANS = arrayOf("nan", "na", "n/a")

/**
* Parses a [String]/[CharSequence], [CharArray], or [ByteArray] into a [Double].
*
* If [ParserOptions.useFastDoubleParser] is enabled, it will try to parse the input with the
* fast double parser library, [FastDoubleParser](https://github.com/wrandelshofer/FastDoubleParser).
* If not, or if it fails, it will use [NumberFormat] to parse the input.
*
* The [locale][locale] used by the double parser is defined like:
*
* [parserOptions][parserOptions]`?.`[locale][ParserOptions.locale]` ?: `[Parsers.locale][Parsers.locale]` :? `[Locale.getDefault()][Locale.getDefault]
*
* [FastDoubleParser] has a fallback mechanism; In practice, this means it can recognize symbols and notations
* of any locale recognized by Java as long as that symbol does not conflict with the given locale.
*
* For example, if your locale uses ',' as decimal separator, it will NOT recognize ',' as thousands separator,
* but it will recognize ' ', '٬', '_', ' ', etc. as such.
* The same holds for characters like "e", "inf", "×10^", "NaN", etc.
*
* Public, so it can be used in other modules.
*
* @param parserOptions can be supplied to configure the parser.
Expand All @@ -41,106 +45,103 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)

private val useFastDoubleParser = parserOptions?.useFastDoubleParser ?: Parsers.useFastDoubleParser
private val locale = parserOptions?.locale ?: Parsers.locale
private val fallbackLocale = Locale.ROOT

private val localDecimalFormatSymbols = DecimalFormatSymbols.getInstance(locale)
private val fallbackDecimalFormatSymbols = DecimalFormatSymbols.getInstance(fallbackLocale)

private val parser = ConfigurableDoubleParser(/* symbols = */ setupNumberFormatSymbols(), /* ignoreCase = */ true)

/**
* Sets up the [NumberFormatSymbols] for the [ConfigurableDoubleParser] based on
* [localDecimalFormatSymbols] with fallbacks from [fallbackDecimalFormatSymbols].
* the [locale] with fallbacks from all other locales.
*
* Fallback characters/strings are only added if they're not clashing with local characters/strings.
*/
private fun setupNumberFormatSymbols(): NumberFormatSymbols {
// collect all chars and strings that are locale-specific such that we can check whether
// fallback chars and strings are safe to add
val localChars = with(localDecimalFormatSymbols) {
buildSet {
add(decimalSeparator.lowercaseChar())
add(groupingSeparator.lowercaseChar())
add(minusSign.lowercaseChar())
add('+')
add(zeroDigit.lowercaseChar())
private fun setupNumberFormatSymbols(): NumberFormatSymbols =
numberFormatSymbolsCache.getOrPut(locale) {
val localDecimalFormatSymbols = DecimalFormatSymbols.getInstance(locale)

// collect all chars and strings that are locale-specific such that we can check whether
// fallback chars and strings are safe to add
val localChars = with(localDecimalFormatSymbols) {
buildSet {
add(decimalSeparator.lowercaseChar())
add(groupingSeparator.lowercaseChar())
add(minusSign.lowercaseChar())
add('+')
// we don't include zeroDigit here, for notations like ×10^
}
}
}
val localStrings = with(localDecimalFormatSymbols) {
buildSet {
add(exponentSeparator.lowercase())
add(infinity.lowercase())
add(naN.lowercase())
val localStrings = with(localDecimalFormatSymbols) {
buildSet {
add(exponentSeparator.lowercase())
add(infinity.lowercase())
add(naN.lowercase())
}
}
}

/**
* Builds a set with the specified char from [localDecimalFormatSymbols] and
* its fallback char from [fallbackDecimalFormatSymbols] if it's safe to do so.
* [additionals] will be added to the set too, when they're safe to add.
*/
fun ((DecimalFormatSymbols) -> Char).fromLocalWithFallBack(vararg additionals: Char): Set<Char> =
buildSet {
val getChar = this@fromLocalWithFallBack
val char = getChar(localDecimalFormatSymbols).lowercaseChar()
add(char)

// add fallback char if it's safe to do so
val fallbackChar = getChar(fallbackDecimalFormatSymbols).lowercaseChar()
if (fallbackChar !in localChars && !localStrings.any { fallbackChar in it }) {
add(fallbackChar)
}
/**
* Builds a set with the specified char from [this] and
* [fallbackChars] will be added to the set too, when they're safe to add.
*/
fun Char.withFallback(fallbackChars: CharArray): Set<Char> =
buildSet {
val char = this@withFallback.lowercaseChar()
add(char)

// Fixes NBSP and other whitespace characters not being recognized if the user writes space instead.
if (char.isWhitespace()) add(' ')
// Treat NBSP and other whitespace characters the same.
if (char.isWhitespace()) addAll(WHITE_SPACES.asIterable())

// add additional chars if needed
for (additional in additionals) {
val lowercase = additional.lowercaseChar()
if (lowercase !in localChars && !localStrings.any { lowercase in it }) {
add(lowercase)
// add fallback chars if needed
for (char in fallbackChars) {
val lowercase = char.lowercaseChar()
if (lowercase !in localChars && !localStrings.any { lowercase in it }) {
add(lowercase)
}

// Treat NBSP and other whitespace characters the same.
if (char.isWhitespace()) addAll(WHITE_SPACES.asIterable())
}
}
}

/**
* Builds a set with the specified string from [localDecimalFormatSymbols] and
* its fallback string from [fallbackDecimalFormatSymbols] if it's safe to do so.
* [additionals] will be added to the set too, when they're safe to add.
*/
fun ((DecimalFormatSymbols) -> String).fromLocalWithFallBack(vararg additionals: String): Set<String> =
buildSet {
val getString = this@fromLocalWithFallBack
val string = getString(localDecimalFormatSymbols).lowercase()
add(string)

// add fallback string if it's safe to do so
val fallbackString = getString(fallbackDecimalFormatSymbols).lowercase()
if (!fallbackString.any { it in localChars } && fallbackString !in localStrings) {
add(fallbackString)
}
/**
* Builds a set with the specified string from [this] and
* [fallbackStrings] will be added to the set too, when they're safe to add.
*/
fun String.withFallback(fallbackStrings: Array<String>): Set<String> =
buildSet {
val string = this@withFallback.lowercase()
add(string)

// Treat NBSP and other whitespace characters the same.
if (string.isBlank()) addAll(WHITE_SPACES.map { it.toString() })

// Fixes NBSP and other whitespace characters not being recognized if the user writes space instead.
if (string.isBlank()) add(" ")
// add fallback strings if needed
for (string in fallbackStrings) {
val lowercase = string.lowercase()
if (!lowercase.any { it in localChars } && lowercase !in localStrings) {
add(lowercase)
}

// add additional strings if needed
for (additional in additionals) {
val lowercase = additional.lowercase()
if (!lowercase.any { it in localChars } && lowercase !in localStrings) {
add(lowercase)
// Treat NBSP and other whitespace characters the same.
if (string.isBlank()) addAll(WHITE_SPACES.map { it.toString() })
}
}
}

return NumberFormatSymbols.fromDecimalFormatSymbols(localDecimalFormatSymbols)
.withPlusSign(setOf('+'))
.withDecimalSeparator(DecimalFormatSymbols::getDecimalSeparator.fromLocalWithFallBack())
.withGroupingSeparator(DecimalFormatSymbols::getGroupingSeparator.fromLocalWithFallBack())
.withExponentSeparator(DecimalFormatSymbols::getExponentSeparator.fromLocalWithFallBack())
.withMinusSign(DecimalFormatSymbols::getMinusSign.fromLocalWithFallBack())
.withInfinity(DecimalFormatSymbols::getInfinity.fromLocalWithFallBack(*INFINITIES))
.withNaN(DecimalFormatSymbols::getNaN.fromLocalWithFallBack(*NANS))
}
NumberFormatSymbols.fromDecimalFormatSymbols(localDecimalFormatSymbols)
.withPlusSign(
setOf('+'),
).withDecimalSeparator(
localDecimalFormatSymbols.decimalSeparator.withFallback(DECIMAL_SEPARATORS),
).withGroupingSeparator(
localDecimalFormatSymbols.groupingSeparator.withFallback(GROUPING_SEPARATORS),
).withExponentSeparator(
localDecimalFormatSymbols.exponentSeparator.withFallback(EXPONENTS),
).withMinusSign(
localDecimalFormatSymbols.minusSign.withFallback(MINUS_SIGNS),
).withInfinity(
localDecimalFormatSymbols.infinity.withFallback(INFINITIES),
).withNaN(
localDecimalFormatSymbols.naN.withFallback(NANS),
)
}

/** Fallback method for parsing doubles. */
private fun String.parseToDoubleOrNullFallback(): Double? =
Expand All @@ -152,7 +153,7 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
in NANS -> Double.NaN

else -> {
// not thread safe; must be created here
// NumberFormat is not thread safe; must be created in the function body
val numberFormat = NumberFormat.getInstance(locale)
val parsePosition = ParsePosition(0)
val result = numberFormat.parse(this, parsePosition)?.toDouble()
Expand Down Expand Up @@ -235,4 +236,49 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
}
return String(chars = ca, offset = offset, length = length).parseToDoubleOrNullFallback()
}

/**
* Here we store all possible decimal format symbols of all locales on the system.
* These will be used as fallbacks for the selected locale.
* They are only added by [withFallback] if they don't interfere with symbols already in the provided [locale]
* (so ',' is not added as grouping separator if '.' is already the locale's decimal separator).
*/
internal companion object {
private val allDecimalFormatSymbols by lazy {
Locale.getAvailableLocales().map { DecimalFormatSymbols.getInstance(it) }
}
val MINUS_SIGNS by lazy {
allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.minusSign }.toCharArray()
}
val INFINITIES by lazy {
allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.infinity }
.plus(arrayOf("", "inf", "infinity", "infty"))
.toTypedArray()
}
val PLUS_INFINITIES by lazy { INFINITIES.map { "+$it" }.toTypedArray() }
val MINUS_INFINITIES by lazy {
INFINITIES.flatMap { inf -> MINUS_SIGNS.map { min -> min + inf } }.toTypedArray()
}
val NANS by lazy {
allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.naN }
.plus(arrayOf("nan", "na", "n/a"))
.toTypedArray()
}
val WHITE_SPACES = charArrayOf(' ', '\u00A0', '\u2009', '\u202F', '\t')
val GROUPING_SEPARATORS by lazy {
allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.groupingSeparator }
.plus(arrayOf('\'', '˙', *WHITE_SPACES.toTypedArray()))
.toCharArray()
}
val DECIMAL_SEPARATORS by lazy {
allDecimalFormatSymbols.flatMapTo(mutableSetOf()) {
listOfNotNull(it.decimalSeparator, it.monetaryDecimalSeparator)
}.plus(arrayOf('·', ''))
.toCharArray()
}
val EXPONENTS by lazy {
allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.exponentSeparator }.toTypedArray()
}
val numberFormatSymbolsCache = mutableMapOf<Locale, NumberFormatSymbols>()
}
}
Loading

0 comments on commit 89e0d41

Please sign in to comment.