Skip to content

Commit 117e958

Browse files
Automated commit of generated code
1 parent a8cee48 commit 117e958

File tree

4 files changed

+219
-65
lines changed

4 files changed

+219
-65
lines changed

core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt

+27
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import org.jetbrains.kotlinx.dataframe.DataFrame
77
import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
88
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
99
import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
10+
import org.jetbrains.kotlinx.dataframe.impl.api.StringParser
1011
import org.jetbrains.kotlinx.dataframe.impl.api.parseImpl
1112
import org.jetbrains.kotlinx.dataframe.impl.api.tryParseImpl
1213
import org.jetbrains.kotlinx.dataframe.typeClass
@@ -55,13 +56,39 @@ public data class ParserOptions(
5556
}
5657
}
5758

59+
/** Tries to parse a column of strings into a column of a different type.
60+
* Each parser in [Parsers][org.jetbrains.kotlinx.dataframe.impl.api.Parsers] is run in order until a valid parser is found,
61+
* a.k.a. that parser was able to parse all values in the column successfully. If a parser
62+
* fails to parse any value, the next parser is tried. If all the others fail, the final parser
63+
* simply returns the original string, leaving the column unchanged.
64+
*
65+
* Parsers that are [covered by][org.jetbrains.kotlinx.dataframe.impl.api.StringParser.coveredBy] other parsers are skipped.
66+
*
67+
* @param options options for parsing, like providing a locale or a custom date-time formatter
68+
* @throws IllegalStateException if no valid parser is found (unlikely, unless the `String` parser is disabled)
69+
* @return a new column with parsed values */
5870
public fun DataColumn<String?>.tryParse(options: ParserOptions? = null): DataColumn<*> = tryParseImpl(options)
5971

6072
public fun <T> DataFrame<T>.parse(options: ParserOptions? = null): DataFrame<T> =
6173
parse(options) {
6274
colsAtAnyDepth { !it.isColumnGroup() }
6375
}
6476

77+
/**
78+
* Tries to parse a column of strings into a column of a different type.
79+
* Each parser in [Parsers] is run in order until a valid parser is found,
80+
* a.k.a. that parser was able to parse all values in the column successfully. If a parser
81+
* fails to parse any value, the next parser is tried.
82+
*
83+
* If all fail [IllegalStateException] is thrown. If you don't want this exception to be thrown,
84+
* use [tryParse] instead.
85+
*
86+
* Parsers that are [covered by][StringParser.coveredBy] other parsers are skipped.
87+
*
88+
* @param options options for parsing, like providing a locale or a custom date-time formatter
89+
* @throws IllegalStateException if no valid parser is found
90+
* @return a new column with parsed values
91+
*/
6592
public fun DataColumn<String?>.parse(options: ParserOptions? = null): DataColumn<*> =
6693
tryParse(options).also { if (it.typeClass == String::class) error("Can't guess column type") }
6794

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
package org.jetbrains.kotlinx.dataframe.impl
2+
3+
import kotlin.time.Duration
4+
import kotlin.time.DurationUnit
5+
6+
/**
7+
* Checks if the string can be parsed as a duration without throwing an exception.
8+
*
9+
* The logic is taken from [Duration.parse] (Kotlin version 2.0.20),
10+
* so it should return the same result.
11+
*
12+
* @param value the string to check
13+
*/
14+
internal fun Duration.Companion.canParse(value: String): Boolean {
15+
var length = value.length
16+
if (length == 0) return false
17+
var index = 0
18+
val infinityString = "Infinity"
19+
when (value[index]) {
20+
'+', '-' -> index++
21+
}
22+
val hasSign = index > 0
23+
when {
24+
length <= index -> return false
25+
26+
value[index] == 'P' -> {
27+
if (++index == length) return false
28+
val nonDigitSymbols = "+-."
29+
var isTimeComponent = false
30+
var prevUnit: DurationUnit? = null
31+
while (index < length) {
32+
if (value[index] == 'T') {
33+
if (isTimeComponent || ++index == length) return false
34+
isTimeComponent = true
35+
continue
36+
}
37+
val component = value.substringWhile(index) { it in '0'..'9' || it in nonDigitSymbols }
38+
if (component.isEmpty()) return false
39+
index += component.length
40+
val unitChar = value.getOrElse(index) { return false }
41+
index++
42+
val unit = durationUnitByIsoCharOrNull(unitChar, isTimeComponent) ?: return false
43+
if (prevUnit != null && prevUnit <= unit) return false
44+
prevUnit = unit
45+
}
46+
}
47+
48+
value.regionMatches(
49+
thisOffset = index,
50+
other = infinityString,
51+
otherOffset = 0,
52+
length = maxOf(length - index, infinityString.length),
53+
ignoreCase = true,
54+
) -> return true
55+
56+
else -> {
57+
// parse default string format
58+
var prevUnit: DurationUnit? = null
59+
var afterFirst = false
60+
var allowSpaces = !hasSign
61+
if (hasSign && value[index] == '(' && value.last() == ')') {
62+
allowSpaces = true
63+
if (++index == --length) return false
64+
}
65+
while (index < length) {
66+
if (afterFirst && allowSpaces) {
67+
index = value.skipWhile(index) { it == ' ' }
68+
}
69+
afterFirst = true
70+
val component = value.substringWhile(index) { it in '0'..'9' || it == '.' }
71+
if (component.isEmpty()) return false
72+
index += component.length
73+
val unitName = value.substringWhile(index) { it in 'a'..'z' }
74+
index += unitName.length
75+
val unit = durationUnitByShortNameOrNull(unitName) ?: return false
76+
if (prevUnit != null && prevUnit <= unit) return false
77+
prevUnit = unit
78+
val dotIndex = component.indexOf('.')
79+
if (dotIndex > 0) {
80+
if (index < length) return false
81+
}
82+
}
83+
}
84+
}
85+
return true
86+
}
87+
88+
/**
89+
* Checks if the string can be parsed as a java duration without throwing an exception.
90+
*/
91+
internal fun javaDurationCanParse(value: String): Boolean = isoDurationRegex.matches(value)
92+
93+
/**
94+
* regex from [java.time.Duration.Lazy.PATTERN], it represents the ISO-8601 duration format.
95+
*/
96+
private val isoDurationRegex = Regex(
97+
"""([-+]?)P(?:([-+]?[0-9]+)D)?(T(?:([-+]?[0-9]+)H)?(?:([-+]?[0-9]+)M)?(?:([-+]?[0-9]+)(?:[.,]([0-9]{0,9}))?S)?)?""",
98+
RegexOption.IGNORE_CASE,
99+
)
100+
101+
/**
102+
* Copy of [kotlin.time.substringWhile] (Kotlin version 2.0.20).
103+
*/
104+
private inline fun String.substringWhile(startIndex: Int, predicate: (Char) -> Boolean): String =
105+
substring(startIndex, skipWhile(startIndex, predicate))
106+
107+
/**
108+
* Copy of [kotlin.time.skipWhile] (Kotlin version 2.0.20).
109+
*/
110+
private inline fun String.skipWhile(startIndex: Int, predicate: (Char) -> Boolean): Int {
111+
var i = startIndex
112+
while (i < length && predicate(this[i])) i++
113+
return i
114+
}
115+
116+
/**
117+
* Copy of [kotlin.time.durationUnitByIsoChar] (Kotlin version 2.0.20).
118+
*/
119+
private fun durationUnitByIsoCharOrNull(isoChar: Char, isTimeComponent: Boolean): DurationUnit? =
120+
when {
121+
!isTimeComponent -> {
122+
when (isoChar) {
123+
'D' -> DurationUnit.DAYS
124+
else -> null
125+
}
126+
}
127+
128+
else -> {
129+
when (isoChar) {
130+
'H' -> DurationUnit.HOURS
131+
'M' -> DurationUnit.MINUTES
132+
'S' -> DurationUnit.SECONDS
133+
else -> null
134+
}
135+
}
136+
}
137+
138+
/**
139+
* Copy of [kotlin.time.durationUnitByShortName] (Kotlin version 2.0.20).
140+
*/
141+
private fun durationUnitByShortNameOrNull(shortName: String): DurationUnit? =
142+
when (shortName) {
143+
"ns" -> DurationUnit.NANOSECONDS
144+
"us" -> DurationUnit.MICROSECONDS
145+
"ms" -> DurationUnit.MILLISECONDS
146+
"s" -> DurationUnit.SECONDS
147+
"m" -> DurationUnit.MINUTES
148+
"h" -> DurationUnit.HOURS
149+
"d" -> DurationUnit.DAYS
150+
else -> null
151+
}

core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt

+39-58
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,5 @@
11
package org.jetbrains.kotlinx.dataframe.impl.api
22

3-
import kotlinx.coroutines.async
4-
import kotlinx.coroutines.awaitAll
5-
import kotlinx.coroutines.coroutineScope
6-
import kotlinx.coroutines.runBlocking
73
import kotlinx.datetime.Instant
84
import kotlinx.datetime.LocalDate
95
import kotlinx.datetime.LocalDateTime
@@ -325,7 +321,7 @@ internal object Parsers : GlobalParserOptions {
325321
parser
326322
}
327323

328-
private val parsersOrder = listOf(
324+
internal val parsersOrder = listOf(
329325
// Int
330326
stringParser<Int> { it.toIntOrNull() },
331327
// Long
@@ -415,7 +411,7 @@ internal object Parsers : GlobalParserOptions {
415411
stringParser<String> { it },
416412
)
417413

418-
internal val parsersMap = parsersOrder.associateBy { it.type }
414+
private val parsersMap = parsersOrder.associateBy { it.type }
419415

420416
val size: Int = parsersOrder.size
421417

@@ -478,16 +474,16 @@ internal object Parsers : GlobalParserOptions {
478474
internal fun DataColumn<String?>.tryParseImpl(options: ParserOptions?): DataColumn<*> {
479475
val columnSize = size
480476
val parsedValues = ArrayList<Any?>(columnSize)
481-
var hasNulls: Boolean = false
482-
var hasNotNulls: Boolean = false
483-
var nullStringParsed: Boolean = false
477+
var hasNulls = false
478+
var hasNotNulls = false
479+
var nullStringParsed = false
484480
val nulls = options?.nullStrings ?: Parsers.nulls
485481

486-
val parsersToCheck = Parsers.parsersMap
487-
val parserTypesToCheck = parsersToCheck.keys
482+
val parsersToCheck = Parsers.parsersOrder
483+
val parserTypesToCheck = parsersToCheck.map { it.type }.toSet()
488484

489485
var correctParser: StringParser<*>? = null
490-
for ((_, parser) in parsersToCheck) {
486+
for (parser in parsersToCheck) {
491487
if (parser.coveredBy.any { it in parserTypesToCheck }) continue
492488

493489
val parserWithOptions = parser.applyOptions(options)
@@ -496,24 +492,21 @@ internal fun DataColumn<String?>.tryParseImpl(options: ParserOptions?): DataColu
496492
hasNotNulls = false
497493
nullStringParsed = false
498494
for (str in this) {
499-
when {
500-
str == null -> {
495+
when (str) {
496+
null -> {
501497
parsedValues += null
502498
hasNulls = true
503499
}
504500

505-
str in nulls -> {
501+
in nulls -> {
506502
parsedValues += null
507503
hasNulls = true
508504
nullStringParsed = true
509505
}
510506

511507
else -> {
512508
val trimmed = str.trim()
513-
val res = parserWithOptions(trimmed)
514-
if (res == null) {
515-
continue
516-
}
509+
val res = parserWithOptions(trimmed) ?: continue
517510
parsedValues += res
518511
hasNotNulls = true
519512
}
@@ -545,44 +538,32 @@ internal fun <T> DataColumn<String?>.parse(parser: StringParser<T>, options: Par
545538
return DataColumn.createValueColumn(name(), parsedValues, parser.type.withNullability(hasNulls)) as DataColumn<T?>
546539
}
547540

548-
internal fun <T> DataFrame<T>.parseImpl(options: ParserOptions?, columns: ColumnsSelector<T, Any?>): DataFrame<T> =
549-
runBlocking { parseParallel(options, columns) }
550-
551-
private suspend fun <T> DataFrame<T>.parseParallel(
552-
options: ParserOptions?,
553-
columns: ColumnsSelector<T, Any?>,
554-
): DataFrame<T> =
555-
coroutineScope {
556-
val convertedCols = getColumnsWithPaths(columns).map { col ->
557-
async {
558-
when {
559-
// when a frame column is requested to be parsed,
560-
// parse each value/frame column at any depth inside each DataFrame in the frame column
561-
col.isFrameColumn() ->
562-
col.values.map {
563-
async {
564-
it.parseParallel(options) {
565-
colsAtAnyDepth { !it.isColumnGroup() }
566-
}
567-
}
568-
}.awaitAll()
569-
.toColumn(col.name)
570-
571-
// when a column group is requested to be parsed,
572-
// parse each column in the group
573-
col.isColumnGroup() ->
574-
col.parseParallel(options) { all() }
575-
.asColumnGroup(col.name())
576-
.asDataColumn()
577-
578-
// Base case, parse the column if it's a `String?` column
579-
col.isSubtypeOf<String?>() ->
580-
col.cast<String?>().tryParse(options)
581-
582-
else -> col
583-
}.let { ColumnToInsert(col.path, it) }
584-
}
585-
}.awaitAll()
541+
internal fun <T> DataFrame<T>.parseImpl(options: ParserOptions?, columns: ColumnsSelector<T, Any?>): DataFrame<T> {
542+
val convertedCols = getColumnsWithPaths(columns).map { col ->
543+
when {
544+
// when a frame column is requested to be parsed,
545+
// parse each value/frame column at any depth inside each DataFrame in the frame column
546+
col.isFrameColumn() ->
547+
col.values.map {
548+
it.parseImpl(options) {
549+
colsAtAnyDepth { !it.isColumnGroup() }
550+
}
551+
}.toColumn(col.name)
552+
553+
// when a column group is requested to be parsed,
554+
// parse each column in the group
555+
col.isColumnGroup() ->
556+
col.parseImpl(options) { all() }
557+
.asColumnGroup(col.name())
558+
.asDataColumn()
586559

587-
emptyDataFrame<T>().insertImpl(convertedCols)
560+
// Base case, parse the column if it's a `String?` column
561+
col.isSubtypeOf<String?>() ->
562+
col.cast<String?>().tryParse(options)
563+
564+
else -> col
565+
}.let { ColumnToInsert(col.path, it) }
588566
}
567+
568+
return emptyDataFrame<T>().insertImpl(convertedCols)
569+
}

docs/StardustDocs/snippets/org.jetbrains.kotlinx.dataframe.samples.api.Modify.parseSome.html

+2-7
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@
183183
<p class="dataframe_description"></p>
184184
</details>
185185
<details>
186-
<summary>Output DataFrame: rowsCount = 7, columnsCount = 5</summary>
186+
<summary>Output DataFrame: rowsCount = 7, columnsCount = 2</summary>
187187
<table class="dataframe" id="df_1"></table>
188188

189189
<p class="dataframe_description"></p>
@@ -478,13 +478,8 @@
478478
call_DataFrame(function() { DataFrame.renderTable(0) });
479479

480480
/*<!--*/
481-
call_DataFrame(function() { DataFrame.addTable({ cols: [{ name: "<span title=\"firstName: String\">firstName</span>", children: [], rightAlign: false, values: ["Alice","Bob","Charlie","Charlie","Bob","Alice","Charlie"] },
482-
{ name: "<span title=\"lastName: String\">lastName</span>", children: [], rightAlign: false, values: ["Cooper","Dylan","Daniels","Chaplin","Marley","Wolf","Byrd"] },
483-
{ name: "<span title=\"name: DataRow<*>\">name</span>", children: [0, 1], rightAlign: false, values: ["<span class=\"formatted\" title=\"firstName: Alice\nlastName: Cooper\"><span class=\"structural\">{ </span><span class=\"structural\">firstName: </span>Alice<span class=\"structural\">, </span><span class=\"structural\">lastName: </span>Cooper<span class=\"structural\"> }</span></span>","<span class=\"formatted\" title=\"firstName: Bob\nlastName: Dylan\"><span class=\"structural\">{ </span><span class=\"structural\">firstName: </span>Bob<span class=\"structural\">, </span><span class=\"structural\">lastName: </span>Dylan<span class=\"structural\"> }</span></span>","<span class=\"formatted\" title=\"firstName: Charlie\nlastName: Daniels\"><span class=\"structural\">{ </span><span class=\"structural\">firstName: </span>Charlie<span class=\"structural\">, </span><span class=\"structural\">lastName: </span>Dan<span class=\"structural\">...</span><span class=\"structural\"> }</span></span>","<span class=\"formatted\" title=\"firstName: Charlie\nlastName: Chaplin\"><span class=\"structural\">{ </span><span class=\"structural\">firstName: </span>Charlie<span class=\"structural\">, </span><span class=\"structural\">lastName: </span>Cha<span class=\"structural\">...</span><span class=\"structural\"> }</span></span>","<span class=\"formatted\" title=\"firstName: Bob\nlastName: Marley\"><span class=\"structural\">{ </span><span class=\"structural\">firstName: </span>Bob<span class=\"structural\">, </span><span class=\"structural\">lastName: </span>Marley<span class=\"structural\"> }</span></span>","<span class=\"formatted\" title=\"firstName: Alice\nlastName: Wolf\"><span class=\"structural\">{ </span><span class=\"structural\">firstName: </span>Alice<span class=\"structural\">, </span><span class=\"structural\">lastName: </span>Wolf<span class=\"structural\"> }</span></span>","<span class=\"formatted\" title=\"firstName: Charlie\nlastName: Byrd\"><span class=\"structural\">{ </span><span class=\"structural\">firstName: </span>Charlie<span class=\"structural\">, </span><span class=\"structural\">lastName: </span>Byrd<span class=\"structural\"> }</span></span>"] },
484-
{ name: "<span title=\"age: Int\">age</span>", children: [], rightAlign: true, values: ["<span class=\"formatted\" title=\"\"><span class=\"numbers\">15</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">45</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">20</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">40</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">30</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">20</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">30</span></span>"] },
485-
{ name: "<span title=\"city: String?\">city</span>", children: [], rightAlign: false, values: ["London","Dubai","Moscow","Milan","Tokyo","<span class=\"formatted\" title=\"\"><span class=\"null\">null</span></span>","Moscow"] },
481+
call_DataFrame(function() { DataFrame.addTable({ cols: [{ name: "<span title=\"age: Int\">age</span>", children: [], rightAlign: true, values: ["<span class=\"formatted\" title=\"\"><span class=\"numbers\">15</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">45</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">20</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">40</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">30</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">20</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">30</span></span>"] },
486482
{ name: "<span title=\"weight: Int?\">weight</span>", children: [], rightAlign: true, values: ["<span class=\"formatted\" title=\"\"><span class=\"numbers\">54</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">87</span></span>","<span class=\"formatted\" title=\"\"><span class=\"null\">null</span></span>","<span class=\"formatted\" title=\"\"><span class=\"null\">null</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">68</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">55</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">90</span></span>"] },
487-
{ name: "<span title=\"isHappy: Boolean\">isHappy</span>", children: [], rightAlign: false, values: ["true","true","false","true","true","false","true"] },
488483
], id: 1, rootId: 1, totalRows: 7 } ) });
489484
/*-->*/
490485

0 commit comments

Comments
 (0)