Skip to content

Commit 903f58b

Browse files
authored
Merge pull request #935 from Kotlin/fast-double-parser
Fast double parser
2 parents 6aab5fa + 0d6083b commit 903f58b

File tree

11 files changed

+655
-80
lines changed

11 files changed

+655
-80
lines changed

core/api/core.api

+21-2
Original file line numberDiff line numberDiff line change
@@ -3817,9 +3817,13 @@ public final class org/jetbrains/kotlinx/dataframe/api/ConvertKt {
38173817
public static final fun convertToByteFromT (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
38183818
public static final fun convertToDouble (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
38193819
public static final fun convertToDoubleFromString (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
3820+
public static final fun convertToDoubleFromString (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;Z)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
38203821
public static synthetic fun convertToDoubleFromString$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
3822+
public static synthetic fun convertToDoubleFromString$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
38213823
public static final fun convertToDoubleFromStringNullable (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
3824+
public static final fun convertToDoubleFromStringNullable (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;Z)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
38223825
public static synthetic fun convertToDoubleFromStringNullable$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
3826+
public static synthetic fun convertToDoubleFromStringNullable$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Ljava/util/Locale;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
38233827
public static final fun convertToDoubleFromT (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
38243828
public static final fun convertToFloat (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
38253829
public static final fun convertToFloatFromT (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
@@ -6482,19 +6486,25 @@ public final class org/jetbrains/kotlinx/dataframe/api/ParseKt {
64826486

64836487
public final class org/jetbrains/kotlinx/dataframe/api/ParserOptions {
64846488
public fun <init> ()V
6485-
public fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;)V
6489+
public synthetic fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;)V
64866490
public synthetic fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ILkotlin/jvm/internal/DefaultConstructorMarker;)V
6491+
public fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Z)V
6492+
public synthetic fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ZILkotlin/jvm/internal/DefaultConstructorMarker;)V
64876493
public final fun component1 ()Ljava/util/Locale;
64886494
public final fun component2 ()Ljava/time/format/DateTimeFormatter;
64896495
public final fun component3 ()Ljava/lang/String;
64906496
public final fun component4 ()Ljava/util/Set;
6491-
public final fun copy (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
6497+
public final fun component5 ()Z
6498+
public final synthetic fun copy (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
6499+
public final fun copy (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Z)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
64926500
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
6501+
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
64936502
public fun equals (Ljava/lang/Object;)Z
64946503
public final fun getDateTimeFormatter ()Ljava/time/format/DateTimeFormatter;
64956504
public final fun getDateTimePattern ()Ljava/lang/String;
64966505
public final fun getLocale ()Ljava/util/Locale;
64976506
public final fun getNullStrings ()Ljava/util/Set;
6507+
public final fun getUseFastDoubleParser ()Z
64986508
public fun hashCode ()I
64996509
public fun toString ()Ljava/lang/String;
65006510
}
@@ -10198,6 +10208,15 @@ public final class org/jetbrains/kotlinx/dataframe/impl/columns/UtilsKt {
1019810208
public static final fun asAnyFrameColumn (Lorg/jetbrains/kotlinx/dataframe/DataColumn;)Lorg/jetbrains/kotlinx/dataframe/columns/FrameColumn;
1019910209
}
1020010210

10211+
public final class org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser {
10212+
public fun <init> (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)V
10213+
public final fun parseOrNull (Ljava/lang/CharSequence;)Ljava/lang/Double;
10214+
public final fun parseOrNull ([BIILjava/nio/charset/Charset;)Ljava/lang/Double;
10215+
public final fun parseOrNull ([CII)Ljava/lang/Double;
10216+
public static synthetic fun parseOrNull$default (Lorg/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser;[BIILjava/nio/charset/Charset;ILjava/lang/Object;)Ljava/lang/Double;
10217+
public static synthetic fun parseOrNull$default (Lorg/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser;[CIIILjava/lang/Object;)Ljava/lang/Double;
10218+
}
10219+
1020110220
public final class org/jetbrains/kotlinx/dataframe/impl/schema/DataFrameSchemaImpl : org/jetbrains/kotlinx/dataframe/schema/DataFrameSchema {
1020210221
public fun <init> (Ljava/util/Map;)V
1020310222
public fun compare (Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema;)Lorg/jetbrains/kotlinx/dataframe/schema/CompareResult;

core/build.gradle.kts

+1
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ dependencies {
7272
implementation(libs.commonsIo)
7373
implementation(libs.serialization.core)
7474
implementation(libs.serialization.json)
75+
implementation(libs.fastDoubleParser)
7576

7677
implementation(libs.fuel)
7778

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt

+33-10
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
2424
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
2525
import org.jetbrains.kotlinx.dataframe.dataTypes.IFRAME
2626
import org.jetbrains.kotlinx.dataframe.dataTypes.IMG
27+
import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources
2728
import org.jetbrains.kotlinx.dataframe.exceptions.CellConversionException
2829
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException
2930
import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
@@ -185,21 +186,43 @@ public fun <T : Any> DataColumn<T>.convertToDouble(): DataColumn<Double> = conve
185186
public fun <T : Any> DataColumn<T?>.convertToDouble(): DataColumn<Double?> = convertTo()
186187

187188
/**
188-
* Parse String column to Double considering locale (number format).
189+
* Parses a String column to Double considering locale (number format).
189190
* If the [locale] parameter is defined, its number format is used for parsing.
190-
* If [locale] parameter is null, the current system locale is used. If column can not be parsed, then POSIX format is used.
191+
* If [locale] parameter is null, the current system locale is used.
192+
* If the column cannot be parsed, then the POSIX format is used.
191193
*/
194+
@ExcludeFromSources
195+
private interface DataColumnStringConvertToDoubleDoc
196+
197+
/** @include [DataColumnStringConvertToDoubleDoc] */
192198
@JvmName("convertToDoubleFromString")
193199
public fun DataColumn<String>.convertToDouble(locale: Locale? = null): DataColumn<Double> =
194-
this.castToNullable().convertToDouble(locale).castToNotNullable()
200+
convertToDouble(locale = locale, useFastDoubleParser = false)
195201

196202
/**
197-
* Parse String column to Double considering locale (number format).
198-
* If [locale] parameter is defined, it's number format is used for parsing.
199-
* If [locale] parameter is null, the current system locale is used. If column can not be parsed, then POSIX format is used.
203+
* @include [DataColumnStringConvertToDoubleDoc]
204+
* @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser. It must be passed explicitly here; the overload without this parameter uses `false` for now.
205+
*/
206+
@JvmName("convertToDoubleFromString")
207+
public fun DataColumn<String>.convertToDouble(
208+
locale: Locale? = null,
209+
useFastDoubleParser: Boolean,
210+
): DataColumn<Double> = this.castToNullable().convertToDouble(locale, useFastDoubleParser).castToNotNullable()
211+
212+
/** @include [DataColumnStringConvertToDoubleDoc] */
213+
@JvmName("convertToDoubleFromStringNullable")
214+
public fun DataColumn<String?>.convertToDouble(locale: Locale? = null): DataColumn<Double?> =
215+
convertToDouble(locale = locale, useFastDoubleParser = false)
216+
217+
/**
218+
* @include [DataColumnStringConvertToDoubleDoc]
219+
* @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser. It must be passed explicitly here; the overload without this parameter uses `false` for now.
200220
*/
201221
@JvmName("convertToDoubleFromStringNullable")
202-
public fun DataColumn<String?>.convertToDouble(locale: Locale? = null): DataColumn<Double?> {
222+
public fun DataColumn<String?>.convertToDouble(
223+
locale: Locale? = null,
224+
useFastDoubleParser: Boolean,
225+
): DataColumn<Double?> {
203226
fun applyParser(parser: (String) -> Double?): DataColumn<Double?> {
204227
var currentRow = 0
205228
try {
@@ -220,14 +243,14 @@ public fun DataColumn<String?>.convertToDouble(locale: Locale? = null): DataColu
220243
}
221244

222245
return if (locale != null) {
223-
val explicitParser = Parsers.getDoubleParser(locale)
246+
val explicitParser = Parsers.getDoubleParser(locale, useFastDoubleParser)
224247
applyParser(explicitParser)
225248
} else {
226249
try {
227-
val defaultParser = Parsers.getDoubleParser()
250+
val defaultParser = Parsers.getDoubleParser(useFastDoubleParser = useFastDoubleParser)
228251
applyParser(defaultParser)
229252
} catch (e: TypeConversionException) {
230-
val posixParser = Parsers.getDoubleParser(Locale.forLanguageTag("C.UTF-8"))
253+
val posixParser = Parsers.getDoubleParser(Locale.forLanguageTag("C.UTF-8"), useFastDoubleParser)
231254
applyParser(posixParser)
232255
}
233256
}

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt

+57
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import org.jetbrains.kotlinx.dataframe.impl.api.StringParser
1111
import org.jetbrains.kotlinx.dataframe.impl.api.parseImpl
1212
import org.jetbrains.kotlinx.dataframe.impl.api.tryParseImpl
1313
import org.jetbrains.kotlinx.dataframe.typeClass
14+
import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS
15+
import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS_COPY
1416
import java.time.format.DateTimeFormatter
1517
import java.util.Locale
1618
import kotlin.reflect.KProperty
@@ -40,13 +42,68 @@ public interface GlobalParserOptions {
4042
public var locale: Locale
4143
}
4244

45+
/**
46+
* ### Options for parsing [String]`?` columns
47+
*
48+
* @param locale locale to use for parsing dates and numbers, defaults to the System default locale.
49+
* If specified instead of [dateTimeFormatter], it will be used in combination with [dateTimePattern]
50+
* to create a [DateTimeFormatter]. Just providing [locale] will not allow you to parse
51+
* locale-specific dates!
52+
* @param dateTimeFormatter a [DateTimeFormatter] to use for parsing dates, if not specified, it will be created
53+
* from [dateTimePattern] and [locale]. If neither [dateTimeFormatter] nor [dateTimePattern] is specified,
54+
* [DateTimeFormatter.ISO_LOCAL_DATE_TIME] will be used.
55+
* @param dateTimePattern a pattern to use for parsing dates. If specified instead of [dateTimeFormatter],
56+
* it will be used to create a [DateTimeFormatter].
57+
* @param nullStrings a set of strings that should be treated as `null` values. By default, it's
58+
* ["null", "NULL", "NA", "N/A"].
59+
* @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now.
60+
*/
4361
public data class ParserOptions(
4462
val locale: Locale? = null,
4563
// TODO, migrate to kotlinx.datetime.format.DateTimeFormat? https://github.com/Kotlin/dataframe/issues/876
4664
val dateTimeFormatter: DateTimeFormatter? = null,
4765
val dateTimePattern: String? = null,
4866
val nullStrings: Set<String>? = null,
67+
val useFastDoubleParser: Boolean = false,
4968
) {
69+
70+
/** For binary compatibility. */
71+
@Deprecated(
72+
message = PARSER_OPTIONS,
73+
level = DeprecationLevel.HIDDEN,
74+
)
75+
public constructor(
76+
locale: Locale? = null,
77+
dateTimeFormatter: DateTimeFormatter? = null,
78+
dateTimePattern: String? = null,
79+
nullStrings: Set<String>? = null,
80+
) : this(
81+
locale = locale,
82+
dateTimeFormatter = dateTimeFormatter,
83+
dateTimePattern = dateTimePattern,
84+
nullStrings = nullStrings,
85+
useFastDoubleParser = false,
86+
)
87+
88+
/** For binary compatibility. */
89+
@Deprecated(
90+
message = PARSER_OPTIONS_COPY,
91+
level = DeprecationLevel.HIDDEN,
92+
)
93+
public fun copy(
94+
locale: Locale? = this.locale,
95+
dateTimeFormatter: DateTimeFormatter? = this.dateTimeFormatter,
96+
dateTimePattern: String? = this.dateTimePattern,
97+
nullStrings: Set<String>? = this.nullStrings,
98+
): ParserOptions =
99+
ParserOptions(
100+
locale = locale,
101+
dateTimeFormatter = dateTimeFormatter,
102+
dateTimePattern = dateTimePattern,
103+
nullStrings = nullStrings,
104+
useFastDoubleParser = useFastDoubleParser,
105+
)
106+
50107
internal fun getDateTimeFormatter(): DateTimeFormatter? =
51108
when {
52109
dateTimeFormatter != null -> dateTimeFormatter

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/utils.kt

+7
Original file line numberDiff line numberDiff line change
@@ -19,24 +19,31 @@ import kotlin.annotation.AnnotationTarget.VALUE_PARAMETER
1919
* {@include [Indent]}
2020
*
2121
*/
22+
@ExcludeFromSources
2223
internal interface LineBreak
2324

2425
/** &nbsp; */
26+
@ExcludeFromSources
2527
internal interface QuarterIndent
2628

2729
/** &nbsp;&nbsp; */
30+
@ExcludeFromSources
2831
internal interface HalfIndent
2932

3033
/** &nbsp;&nbsp;&nbsp;&nbsp; */
34+
@ExcludeFromSources
3135
internal interface Indent
3236

3337
/** &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; */
38+
@ExcludeFromSources
3439
internal interface DoubleIndent
3540

3641
/** &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; */
42+
@ExcludeFromSources
3743
internal interface TripleIndent
3844

3945
/** &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; */
46+
@ExcludeFromSources
4047
internal interface QuadrupleIndent
4148

4249
/**

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt

+10-29
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,13 @@ import org.jetbrains.kotlinx.dataframe.hasNulls
3434
import org.jetbrains.kotlinx.dataframe.impl.canParse
3535
import org.jetbrains.kotlinx.dataframe.impl.catchSilent
3636
import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType
37+
import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser
3738
import org.jetbrains.kotlinx.dataframe.impl.javaDurationCanParse
3839
import org.jetbrains.kotlinx.dataframe.io.isURL
3940
import org.jetbrains.kotlinx.dataframe.io.readJsonStr
4041
import org.jetbrains.kotlinx.dataframe.values
4142
import java.math.BigDecimal
4243
import java.net.URL
43-
import java.text.NumberFormat
4444
import java.text.ParsePosition
4545
import java.time.format.DateTimeFormatter
4646
import java.time.format.DateTimeFormatterBuilder
@@ -275,29 +275,6 @@ internal object Parsers : GlobalParserOptions {
275275
null
276276
}
277277

278-
private fun String.parseDouble(format: NumberFormat) =
279-
when (uppercase(Locale.getDefault())) {
280-
"NAN" -> Double.NaN
281-
282-
"INF" -> Double.POSITIVE_INFINITY
283-
284-
"-INF" -> Double.NEGATIVE_INFINITY
285-
286-
"INFINITY" -> Double.POSITIVE_INFINITY
287-
288-
"-INFINITY" -> Double.NEGATIVE_INFINITY
289-
290-
else -> {
291-
val parsePosition = ParsePosition(0)
292-
val result: Double? = format.parse(this, parsePosition)?.toDouble()
293-
if (parsePosition.index != this.length) {
294-
null
295-
} else {
296-
result
297-
}
298-
}
299-
}
300-
301278
inline fun <reified T : Any> stringParser(
302279
catch: Boolean = false,
303280
coveredBy: Set<KType> = emptySet(),
@@ -317,11 +294,15 @@ internal object Parsers : GlobalParserOptions {
317294
): StringParserWithFormat<T> = StringParserWithFormat(typeOf<T>(), coveredBy, body)
318295

319296
private val parserToDoubleWithOptions = stringParserWithOptions { options ->
320-
val numberFormat = NumberFormat.getInstance(options?.locale ?: Locale.getDefault())
321-
val parser = { it: String -> it.parseDouble(numberFormat) }
297+
val fastDoubleParser = FastDoubleParser(options ?: ParserOptions())
298+
val parser = { it: String -> fastDoubleParser.parseOrNull(it) }
322299
parser
323300
}
324301

302+
private val posixDoubleParser = FastDoubleParser(
303+
ParserOptions(locale = Locale.forLanguageTag("C.UTF-8")),
304+
)
305+
325306
internal val parsersOrder = listOf(
326307
// Int
327308
stringParser<Int> { it.toIntOrNull() },
@@ -384,7 +365,7 @@ internal object Parsers : GlobalParserOptions {
384365
// Double, with explicit number format or taken from current locale
385366
parserToDoubleWithOptions,
386367
// Double, with POSIX format
387-
stringParser<Double> { it.parseDouble(NumberFormat.getInstance(Locale.forLanguageTag("C.UTF-8"))) },
368+
stringParser<Double> { posixDoubleParser.parseOrNull(it) },
388369
// Boolean
389370
stringParser<Boolean> { it.toBooleanOrNull() },
390371
// BigDecimal
@@ -449,9 +430,9 @@ internal object Parsers : GlobalParserOptions {
449430
return parser.applyOptions(options)
450431
}
451432

452-
internal fun getDoubleParser(locale: Locale? = null): (String) -> Double? {
433+
internal fun getDoubleParser(locale: Locale? = null, useFastDoubleParser: Boolean): (String) -> Double? {
453434
val options = if (locale != null) {
454-
ParserOptions(locale = locale)
435+
ParserOptions(locale = locale, useFastDoubleParser = useFastDoubleParser)
455436
} else {
456437
null
457438
}

0 commit comments

Comments
 (0)