-
Notifications
You must be signed in to change notification settings - Fork 73
New CSV implementation #903
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
e26e51a
21028f3
16c60b5
d436086
7feac5a
8c9c56f
6e34b73
766cad1
f751b0d
50f4f33
1cfd7cd
6bb3502
4ebb5a0
53e2f64
cdef312
860a5c2
a02b9e6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS_COPY | |
import java.time.format.DateTimeFormatter | ||
import java.util.Locale | ||
import kotlin.reflect.KProperty | ||
import kotlin.reflect.KType | ||
|
||
public val DataFrame.Companion.parser: GlobalParserOptions get() = Parsers | ||
|
||
|
@@ -56,6 +57,8 @@ public interface GlobalParserOptions { | |
* it will be used to create a [DateTimeFormatter]. | ||
* @param nullStrings a set of strings that should be treated as `null` values. By default, it's | ||
* ["null", "NULL", "NA", "N/A"]. | ||
* @param skipTypes a set of types that should be skipped during parsing. Parsing will be attempted for all other types. | ||
* By default, it's an empty set. To skip all types except some specified ones, use [allTypesExcept]. | ||
* @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now. | ||
*/ | ||
public data class ParserOptions( | ||
|
@@ -64,8 +67,17 @@ public data class ParserOptions( | |
val dateTimeFormatter: DateTimeFormatter? = null, | ||
val dateTimePattern: String? = null, | ||
val nullStrings: Set<String>? = null, | ||
val skipTypes: Set<KType> = emptySet(), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Really weird feature, let's live a little bit with that solution to reflect later in code There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's mostly a performance thing, but the problem is that we don't have a user-facing way to refer to parsers, so we cannot say: "please parse this string column and try this type but not this type". I'm not sure how the API should look here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I managed to leverage |
||
val useFastDoubleParser: Boolean = false, | ||
) { | ||
public companion object { | ||
/** | ||
* Small helper function to get all types except the ones specified. | ||
* Useful in combination with the [skipTypes] parameter. | ||
*/ | ||
public fun allTypesExcept(vararg types: KType): Set<KType> = | ||
Parsers.parsersOrder.map { it.type }.toSet() - types.toSet() | ||
} | ||
|
||
/** For binary compatibility. */ | ||
@Deprecated( | ||
|
@@ -82,6 +94,7 @@ public data class ParserOptions( | |
dateTimeFormatter = dateTimeFormatter, | ||
dateTimePattern = dateTimePattern, | ||
nullStrings = nullStrings, | ||
skipTypes = emptySet(), | ||
useFastDoubleParser = false, | ||
) | ||
|
||
|
@@ -101,6 +114,7 @@ public data class ParserOptions( | |
dateTimeFormatter = dateTimeFormatter, | ||
dateTimePattern = dateTimePattern, | ||
nullStrings = nullStrings, | ||
skipTypes = skipTypes, | ||
useFastDoubleParser = useFastDoubleParser, | ||
) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,7 +35,7 @@ import org.jetbrains.kotlinx.dataframe.impl.catchSilent | |
import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType | ||
import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser | ||
import org.jetbrains.kotlinx.dataframe.impl.javaDurationCanParse | ||
import org.jetbrains.kotlinx.dataframe.io.isURL | ||
import org.jetbrains.kotlinx.dataframe.io.isUrl | ||
import org.jetbrains.kotlinx.dataframe.io.readJsonStr | ||
import org.jetbrains.kotlinx.dataframe.values | ||
import java.math.BigDecimal | ||
|
@@ -209,7 +209,7 @@ internal object Parsers : GlobalParserOptions { | |
toJavaLocalDateTimeOrNull(formatter) // since we accept a Java DateTimeFormatter | ||
?.toKotlinLocalDateTime() | ||
|
||
private fun String.toUrlOrNull(): URL? = if (isURL(this)) catchSilent { URL(this) } else null | ||
private fun String.toUrlOrNull(): URL? = if (isUrl(this)) catchSilent { URL(this) } else null | ||
|
||
private fun String.toBooleanOrNull() = | ||
when (uppercase(Locale.getDefault())) { | ||
|
@@ -387,6 +387,8 @@ internal object Parsers : GlobalParserOptions { | |
null | ||
} | ||
}, | ||
// Char | ||
stringParser<Char> { it.singleOrNull() }, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it's a bit too specific to be useful, wouldn't you agree? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not necessarily, columns with single characters can be quite common, especially when reading pivoted data, plus it could save memory, which is always a good thing, right? :) |
||
// No parser found, return as String | ||
// must be last in the list of parsers to return original unparsed string | ||
stringParser<String> { it }, | ||
|
@@ -460,7 +462,9 @@ internal fun DataColumn<String?>.tryParseImpl(options: ParserOptions?): DataColu | |
var nullStringParsed = false | ||
val nulls = options?.nullStrings ?: Parsers.nulls | ||
|
||
val parserTypesToSkip = options?.skipTypes ?: emptySet() | ||
val parsersToCheck = Parsers.parsersOrder | ||
.filterNot { it.type in parserTypesToSkip } | ||
val parserTypesToCheck = parsersToCheck.map { it.type }.toSet() | ||
|
||
var correctParser: StringParser<*>? = null | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TODO: convert to issues or fix
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's already part of the umbrella issue todo-list: #827