Skip to content

Commit 8d8dad7

Browse files
committed
Parser can now parse Chars. Parser can now skip types via the new skipTypes argument in ParserOptions. Fixed nullStrings not being recognized in the old csv implementation by using tryParse() instead of a specific StringParser.parse(). Preparing :core for the new csv implementation: added extra supported ColTypes, made ColumnNameGenerator public, and added TODOs in places where the new csv implementation will eventually need to replace the old one.
1 parent 439f65d commit 8d8dad7

File tree

11 files changed

+95
-26
lines changed

11 files changed

+95
-26
lines changed

core/api/core.api

+29-5
Original file line numberDiff line numberDiff line change
@@ -6485,30 +6485,37 @@ public final class org/jetbrains/kotlinx/dataframe/api/ParseKt {
64856485
}
64866486

64876487
public final class org/jetbrains/kotlinx/dataframe/api/ParserOptions {
6488+
public static final field Companion Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions$Companion;
64886489
public fun <init> ()V
64896490
public synthetic fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;)V
64906491
public synthetic fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ILkotlin/jvm/internal/DefaultConstructorMarker;)V
6491-
public fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Z)V
6492-
public synthetic fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ZILkotlin/jvm/internal/DefaultConstructorMarker;)V
6492+
public fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Z)V
6493+
public synthetic fun <init> (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;ZILkotlin/jvm/internal/DefaultConstructorMarker;)V
64936494
public final fun component1 ()Ljava/util/Locale;
64946495
public final fun component2 ()Ljava/time/format/DateTimeFormatter;
64956496
public final fun component3 ()Ljava/lang/String;
64966497
public final fun component4 ()Ljava/util/Set;
6497-
public final fun component5 ()Z
6498+
public final fun component5 ()Ljava/util/Set;
6499+
public final fun component6 ()Z
64986500
public final synthetic fun copy (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
6499-
public final fun copy (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Z)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
6501+
public final fun copy (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Z)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
65006502
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
6501-
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
6503+
public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;
65026504
public fun equals (Ljava/lang/Object;)Z
65036505
public final fun getDateTimeFormatter ()Ljava/time/format/DateTimeFormatter;
65046506
public final fun getDateTimePattern ()Ljava/lang/String;
65056507
public final fun getLocale ()Ljava/util/Locale;
65066508
public final fun getNullStrings ()Ljava/util/Set;
6509+
public final fun getSkipTypes ()Ljava/util/Set;
65076510
public final fun getUseFastDoubleParser ()Z
65086511
public fun hashCode ()I
65096512
public fun toString ()Ljava/lang/String;
65106513
}
65116514

6515+
public final class org/jetbrains/kotlinx/dataframe/api/ParserOptions$Companion {
6516+
public final fun allTypesExcept ([Lkotlin/reflect/KType;)Ljava/util/Set;
6517+
}
6518+
65126519
public abstract interface class org/jetbrains/kotlinx/dataframe/api/Pivot : org/jetbrains/kotlinx/dataframe/aggregation/Aggregatable {
65136520
}
65146521

@@ -9946,6 +9953,16 @@ public final class org/jetbrains/kotlinx/dataframe/impl/ColumnAccessTrackerKt {
99469953
public static final fun trackColumnAccess (Lkotlin/jvm/functions/Function0;)Ljava/util/List;
99479954
}
99489955

9956+
public final class org/jetbrains/kotlinx/dataframe/impl/ColumnNameGenerator {
9957+
public fun <init> ()V
9958+
public fun <init> (Ljava/util/List;)V
9959+
public synthetic fun <init> (Ljava/util/List;ILkotlin/jvm/internal/DefaultConstructorMarker;)V
9960+
public final fun addIfAbsent (Ljava/lang/String;)V
9961+
public final fun addUnique (Ljava/lang/String;)Ljava/lang/String;
9962+
public final fun contains (Ljava/lang/String;)Z
9963+
public final fun getNames ()Ljava/util/List;
9964+
}
9965+
99499966
public final class org/jetbrains/kotlinx/dataframe/impl/DataFrameSize {
99509967
public fun <init> (II)V
99519968
public final fun component1 ()I
@@ -10271,13 +10288,19 @@ public final class org/jetbrains/kotlinx/dataframe/io/CSVType : java/lang/Enum {
1027110288
public final class org/jetbrains/kotlinx/dataframe/io/ColType : java/lang/Enum {
1027210289
public static final field BigDecimal Lorg/jetbrains/kotlinx/dataframe/io/ColType;
1027310290
public static final field Boolean Lorg/jetbrains/kotlinx/dataframe/io/ColType;
10291+
public static final field Char Lorg/jetbrains/kotlinx/dataframe/io/ColType;
1027410292
public static final field Double Lorg/jetbrains/kotlinx/dataframe/io/ColType;
10293+
public static final field Duration Lorg/jetbrains/kotlinx/dataframe/io/ColType;
10294+
public static final field Instant Lorg/jetbrains/kotlinx/dataframe/io/ColType;
1027510295
public static final field Int Lorg/jetbrains/kotlinx/dataframe/io/ColType;
10296+
public static final field JsonArray Lorg/jetbrains/kotlinx/dataframe/io/ColType;
10297+
public static final field JsonObject Lorg/jetbrains/kotlinx/dataframe/io/ColType;
1027610298
public static final field LocalDate Lorg/jetbrains/kotlinx/dataframe/io/ColType;
1027710299
public static final field LocalDateTime Lorg/jetbrains/kotlinx/dataframe/io/ColType;
1027810300
public static final field LocalTime Lorg/jetbrains/kotlinx/dataframe/io/ColType;
1027910301
public static final field Long Lorg/jetbrains/kotlinx/dataframe/io/ColType;
1028010302
public static final field String Lorg/jetbrains/kotlinx/dataframe/io/ColType;
10303+
public static final field Url Lorg/jetbrains/kotlinx/dataframe/io/ColType;
1028110304
public static fun getEntries ()Lkotlin/enums/EnumEntries;
1028210305
public static fun valueOf (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/io/ColType;
1028310306
public static fun values ()[Lorg/jetbrains/kotlinx/dataframe/io/ColType;
@@ -10313,6 +10336,7 @@ public final class org/jetbrains/kotlinx/dataframe/io/CsvKt {
1031310336
public static synthetic fun readDelimStr$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/String;CLjava/util/Map;ILjava/lang/Integer;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
1031410337
public static final fun toCsv (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lorg/apache/commons/csv/CSVFormat;)Ljava/lang/String;
1031510338
public static synthetic fun toCsv$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Lorg/apache/commons/csv/CSVFormat;ILjava/lang/Object;)Ljava/lang/String;
10339+
public static final fun toKType (Lorg/jetbrains/kotlinx/dataframe/io/ColType;)Lkotlin/reflect/KType;
1031610340
public static final fun toType (Lorg/jetbrains/kotlinx/dataframe/io/ColType;)Lkotlin/reflect/KClass;
1031710341
public static final fun writeCSV (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/io/File;Lorg/apache/commons/csv/CSVFormat;)V
1031810342
public static final fun writeCSV (Lorg/jetbrains/kotlinx/dataframe/DataFrame;Ljava/lang/Appendable;Lorg/apache/commons/csv/CSVFormat;)V

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/annotations/ImportDataSchema.kt

+1
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ public enum class DataSchemaVisibility {
4747
EXPLICIT_PUBLIC,
4848
}
4949

50+
// TODO add more options
5051
public annotation class CsvOptions(public val delimiter: Char)
5152

5253
/**

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt

+14
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS_COPY
1616
import java.time.format.DateTimeFormatter
1717
import java.util.Locale
1818
import kotlin.reflect.KProperty
19+
import kotlin.reflect.KType
1920

2021
public val DataFrame.Companion.parser: GlobalParserOptions get() = Parsers
2122

@@ -56,6 +57,8 @@ public interface GlobalParserOptions {
5657
* it will be used to create a [DateTimeFormatter].
5758
* @param nullStrings a set of strings that should be treated as `null` values. By default, it's
5859
* ["null", "NULL", "NA", "N/A"].
60+
* @param skipTypes a set of types that should be skipped during parsing. Parsing will be attempted for all other types.
61+
* By default, it's an empty set. To skip all types except some specified ones, use [allTypesExcept].
5962
* @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now.
6063
*/
6164
public data class ParserOptions(
@@ -64,8 +67,17 @@ public data class ParserOptions(
6467
val dateTimeFormatter: DateTimeFormatter? = null,
6568
val dateTimePattern: String? = null,
6669
val nullStrings: Set<String>? = null,
70+
val skipTypes: Set<KType> = emptySet(),
6771
val useFastDoubleParser: Boolean = false,
6872
) {
73+
public companion object {
74+
/**
75+
* Small helper function that returns the types of all available parsers except the ones specified.
76+
* Useful in combination with the [skipTypes] parameter.
77+
*/
78+
public fun allTypesExcept(vararg types: KType): Set<KType> =
79+
Parsers.parsersOrder.map { it.type }.toSet() - types.toSet()
80+
}
6981

7082
/** For binary compatibility. */
7183
@Deprecated(
@@ -82,6 +94,7 @@ public data class ParserOptions(
8294
dateTimeFormatter = dateTimeFormatter,
8395
dateTimePattern = dateTimePattern,
8496
nullStrings = nullStrings,
97+
skipTypes = emptySet(),
8598
useFastDoubleParser = false,
8699
)
87100

@@ -101,6 +114,7 @@ public data class ParserOptions(
101114
dateTimeFormatter = dateTimeFormatter,
102115
dateTimePattern = dateTimePattern,
103116
nullStrings = nullStrings,
117+
skipTypes = skipTypes,
104118
useFastDoubleParser = useFastDoubleParser,
105119
)
106120

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/ColumnNameGenerator.kt

+5-5
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@ package org.jetbrains.kotlinx.dataframe.impl
22

33
import org.jetbrains.kotlinx.dataframe.AnyFrame
44

5-
internal class ColumnNameGenerator(columnNames: List<String> = emptyList()) {
5+
public class ColumnNameGenerator(columnNames: List<String> = emptyList()) {
66

77
private val usedNames = columnNames.toMutableSet()
88

99
private val colNames = columnNames.toMutableList()
1010

11-
fun addUnique(preferredName: String): String {
11+
public fun addUnique(preferredName: String): String {
1212
var name = preferredName
1313
var k = 1
1414
while (usedNames.contains(name)) {
@@ -19,17 +19,17 @@ internal class ColumnNameGenerator(columnNames: List<String> = emptyList()) {
1919
return name
2020
}
2121

22-
fun addIfAbsent(name: String) {
22+
public fun addIfAbsent(name: String) {
2323
if (!usedNames.contains(name)) {
2424
usedNames.add(name)
2525
colNames.add(name)
2626
}
2727
}
2828

29-
val names: List<String>
29+
public val names: List<String>
3030
get() = colNames
3131

32-
fun contains(name: String) = usedNames.contains(name)
32+
public operator fun contains(name: String): Boolean = usedNames.contains(name)
3333
}
3434

3535
internal fun AnyFrame.nameGenerator() = ColumnNameGenerator(columnNames())

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt

+4
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,8 @@ internal object Parsers : GlobalParserOptions {
388388
null
389389
}
390390
},
391+
// Char
392+
stringParser<Char> { it.singleOrNull() },
391393
// No parser found, return as String
392394
// must be last in the list of parsers to return original unparsed string
393395
stringParser<String> { it },
@@ -461,7 +463,9 @@ internal fun DataColumn<String?>.tryParseImpl(options: ParserOptions?): DataColu
461463
var nullStringParsed = false
462464
val nulls = options?.nullStrings ?: Parsers.nulls
463465

466+
val parserTypesToSkip = options?.skipTypes ?: emptySet()
464467
val parsersToCheck = Parsers.parsersOrder
468+
.filterNot { it.type in parserTypesToSkip }
465469
val parserTypesToCheck = parsersToCheck.map { it.type }.toSet()
466470

467471
var correctParser: StringParser<*>? = null

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt

+36-13
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package org.jetbrains.kotlinx.dataframe.io
22

3+
import kotlinx.datetime.Instant
34
import kotlinx.datetime.LocalDate
45
import kotlinx.datetime.LocalDateTime
56
import kotlinx.datetime.LocalTime
@@ -10,6 +11,7 @@ import org.jetbrains.kotlinx.dataframe.AnyFrame
1011
import org.jetbrains.kotlinx.dataframe.AnyRow
1112
import org.jetbrains.kotlinx.dataframe.DataColumn
1213
import org.jetbrains.kotlinx.dataframe.DataFrame
14+
import org.jetbrains.kotlinx.dataframe.DataRow
1315
import org.jetbrains.kotlinx.dataframe.annotations.Interpretable
1416
import org.jetbrains.kotlinx.dataframe.annotations.OptInRefine
1517
import org.jetbrains.kotlinx.dataframe.annotations.Refine
@@ -20,7 +22,6 @@ import org.jetbrains.kotlinx.dataframe.api.tryParse
2022
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadCsvMethod
2123
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod
2224
import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator
23-
import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
2425
import org.jetbrains.kotlinx.dataframe.impl.api.parse
2526
import org.jetbrains.kotlinx.dataframe.util.DF_READ_NO_CSV
2627
import org.jetbrains.kotlinx.dataframe.util.DF_READ_NO_CSV_REPLACE
@@ -41,8 +42,10 @@ import java.net.URL
4142
import java.nio.charset.Charset
4243
import java.util.zip.GZIPInputStream
4344
import kotlin.reflect.KClass
45+
import kotlin.reflect.KType
4446
import kotlin.reflect.full.withNullability
4547
import kotlin.reflect.typeOf
48+
import kotlin.time.Duration
4649

4750
public class CSV(private val delimiter: Char = ',') : SupportedDataFrameFormat {
4851
override fun readDataFrame(stream: InputStream, header: List<String>): AnyFrame =
@@ -296,6 +299,9 @@ public fun DataFrame.Companion.readDelim(
296299
)
297300
}
298301

302+
/**
303+
* Column types that DataFrame can [parse] from a [String].
304+
*/
299305
public enum class ColType {
300306
Int,
301307
Long,
@@ -306,19 +312,33 @@ public enum class ColType {
306312
LocalTime,
307313
LocalDateTime,
308314
String,
315+
Instant,
316+
Duration,
317+
Url,
318+
JsonArray,
319+
JsonObject,
320+
Char,
309321
}
310322

311-
public fun ColType.toType(): KClass<out Any> =
323+
public fun ColType.toType(): KClass<*> = toKType().classifier as KClass<*>
324+
325+
public fun ColType.toKType(): KType =
312326
when (this) {
313-
ColType.Int -> Int::class
314-
ColType.Long -> Long::class
315-
ColType.Double -> Double::class
316-
ColType.Boolean -> Boolean::class
317-
ColType.BigDecimal -> BigDecimal::class
318-
ColType.LocalDate -> LocalDate::class
319-
ColType.LocalTime -> LocalTime::class
320-
ColType.LocalDateTime -> LocalDateTime::class
321-
ColType.String -> String::class
327+
ColType.Int -> typeOf<Int>()
328+
ColType.Long -> typeOf<Long>()
329+
ColType.Double -> typeOf<Double>()
330+
ColType.Boolean -> typeOf<Boolean>()
331+
ColType.BigDecimal -> typeOf<BigDecimal>()
332+
ColType.LocalDate -> typeOf<LocalDate>()
333+
ColType.LocalTime -> typeOf<LocalTime>()
334+
ColType.LocalDateTime -> typeOf<LocalDateTime>()
335+
ColType.String -> typeOf<String>()
336+
ColType.Instant -> typeOf<Instant>()
337+
ColType.Duration -> typeOf<Duration>()
338+
ColType.Url -> typeOf<URL>()
339+
ColType.JsonArray -> typeOf<DataFrame<*>>()
340+
ColType.JsonObject -> typeOf<DataRow<*>>()
341+
ColType.Char -> typeOf<Char>()
322342
}
323343

324344
public fun DataFrame.Companion.readDelim(
@@ -377,8 +397,11 @@ public fun DataFrame.Companion.readDelim(
377397
null -> column.tryParse(parserOptions)
378398

379399
else -> {
380-
val parser = Parsers[colType.toType()]!!
381-
column.parse(parser, parserOptions)
400+
column.tryParse(
401+
(parserOptions ?: ParserOptions()).copy(
402+
skipTypes = ParserOptions.allTypesExcept(colType.toKType()),
403+
),
404+
)
382405
}
383406
}
384407
}

core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvTests.kt

+2-2
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ class CsvTests {
7070
df.nrow shouldBe 5
7171
df.columnNames()[5] shouldBe "duplicate1"
7272
df.columnNames()[6] shouldBe "duplicate11"
73-
df["duplicate1"].type() shouldBe typeOf<String?>()
73+
df["duplicate1"].type() shouldBe typeOf<Char?>()
7474
df["double"].type() shouldBe typeOf<Double?>()
7575
df["time"].type() shouldBe typeOf<LocalDateTime>()
7676

@@ -89,7 +89,7 @@ class CsvTests {
8989
df.nrow shouldBe 5
9090
df.columnNames()[5] shouldBe "duplicate1"
9191
df.columnNames()[6] shouldBe "duplicate11"
92-
df["duplicate1"].type() shouldBe typeOf<String?>()
92+
df["duplicate1"].type() shouldBe typeOf<Char?>()
9393
df["double"].type() shouldBe typeOf<Double?>()
9494
df["number"].type() shouldBe typeOf<Double>()
9595
df["time"].type() shouldBe typeOf<LocalDateTime>()

core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt

+1-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ class ParserTests {
4848

4949
@Test(expected = IllegalStateException::class)
5050
fun `parse should throw`() {
51-
val col by columnOf("a", "b")
51+
val col by columnOf("a", "bc")
5252
col.parse()
5353
}
5454

plugins/dataframe-gradle-plugin/src/main/kotlin/org/jetbrains/dataframe/gradle/GenerateDataSchemaTask.kt

+1
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ abstract class GenerateDataSchemaTask : DefaultTask() {
116116
val url = urlOf(data.get())
117117

118118
val formats = listOf(
119+
// TODO replace with the new Csv() and Tsv() once the new csv implementation is in place
119120
CSV(delimiter = csvOptions.delimiter),
120121
JSON(typeClashTactic = jsonOptions.typeClashTactic, keyValuePaths = jsonOptions.keyValuePaths),
121122
Excel(),

plugins/dataframe-gradle-plugin/src/main/kotlin/org/jetbrains/dataframe/gradle/SchemaGeneratorExtension.kt

+1
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ class Schema(
123123
}
124124

125125
// Without Serializable GradleRunner tests fail
126+
// TODO add more options
126127
data class CsvOptionsDsl(var delimiter: Char = ',') : Serializable
127128

128129
data class JsonOptionsDsl(

plugins/symbol-processor/src/main/kotlin/org/jetbrains/dataframe/ksp/DataSchemaGenerator.kt

+1
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ class DataSchemaGenerator(
155155
codeGenerator.createNewFile(Dependencies(true, importStatement.origin), packageName, "$name.Generated")
156156

157157
val formats = listOf(
158+
// TODO replace with the new Csv() and Tsv() once the new csv implementation is in place
158159
CSV(delimiter = importStatement.csvOptions.delimiter),
159160
JSON(
160161
typeClashTactic = importStatement.jsonOptions.typeClashTactic,

0 commit comments

Comments
 (0)