Skip to content

Deephaven csv as default #1057

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Feb 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
f63c633
including dataframe-csv by default in the dataframe dependency
Jolanrensen Feb 11, 2025
4004b65
deprecating apache-based csv implementation and adding dataframe-csv …
Jolanrensen Feb 11, 2025
5037053
fixed csv tests. Needs dataframe bootstrap bump to function correctly
Jolanrensen Feb 12, 2025
124efd7
Merge branch 'master' into deephaven-csv-default
Jolanrensen Feb 12, 2025
7c86fb3
Merge branch 'master' into deephaven-csv-default
Jolanrensen Feb 12, 2025
d6fe324
Merge branch 'master' into deephaven-csv-default
Jolanrensen Feb 12, 2025
adbd6c2
removing some usages of readCSV
Jolanrensen Feb 13, 2025
187993c
removing some usages of writeCSV and bumping bootstrap version
Jolanrensen Feb 13, 2025
546f021
apidump
Jolanrensen Feb 13, 2025
9a87d0e
fixed tests csv, fixed gradle plugin delimiter for csv
Jolanrensen Feb 13, 2025
40b0206
renaming CSV to Csv in the docs
Jolanrensen Feb 13, 2025
f42bd58
dropping the deephaven datetime parser for issue #1047
Jolanrensen Feb 13, 2025
a6f9758
removed failing compiler plugin readDelimStr test
Jolanrensen Feb 13, 2025
b57ec5b
updating csv docs and examples
Jolanrensen Feb 14, 2025
b24db62
moved Compression to :core and simplified it to a fun interface
Jolanrensen Feb 14, 2025
252aaa3
disable csv test with arabic on windows
Jolanrensen Feb 14, 2025
6055995
fixed dependency on dataframe-csv from gradle plugin integration test
Jolanrensen Feb 14, 2025
9e91046
fixed kdocs
Jolanrensen Feb 17, 2025
ce5d170
fixed ksp test
Jolanrensen Feb 17, 2025
1e2bb06
updating bootstrap version
Jolanrensen Feb 17, 2025
be88531
updating some more usages of readCSV to readCsv
Jolanrensen Feb 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,7 @@ dependencies {
api(project(":dataframe-excel"))
api(project(":dataframe-openapi"))
api(project(":dataframe-jdbc"))
// TODO enable when it leaves the experimental phase
// api(project(":dataframe-csv"))
api(project(":dataframe-csv"))

kover(project(":core"))
kover(project(":dataframe-arrow"))
Expand Down
137 changes: 98 additions & 39 deletions core/api/core.api

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions core/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ dependencies {
testImplementation(libs.kotlin.scriptingJvm)
testImplementation(libs.jsoup)
testImplementation(libs.sl4jsimple)

// for JupyterCodegenTests and samples.api
testImplementation(project(":dataframe-csv"))
}

val samplesImplementation by configurations.getting {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ import org.jetbrains.kotlinx.dataframe.impl.api.StringParser
import org.jetbrains.kotlinx.dataframe.impl.api.parseImpl
import org.jetbrains.kotlinx.dataframe.impl.api.tryParseImpl
import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser
import org.jetbrains.kotlinx.dataframe.io.readCSV
import org.jetbrains.kotlinx.dataframe.typeClass
import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS
import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS_COPY
Expand All @@ -27,7 +26,7 @@ import kotlin.reflect.KType
* These options are used to configure how [DataColumns][DataColumn] of type [String] or [String?][String]
* should be parsed.
* You can always pass a [ParserOptions] object to functions that perform parsing, like [tryParse], [parse],
* or even [DataFrame.readCSV][DataFrame.Companion.readCSV] to override these options.
* or even [DataFrame.readCsv][DataFrame.Companion.readCsv] to override these options.
*/
public val DataFrame.Companion.parser: GlobalParserOptions
get() = Parsers
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,35 @@ import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadCsvMethod
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod
import org.jetbrains.kotlinx.dataframe.impl.api.parse
import org.jetbrains.kotlinx.dataframe.impl.io.readDelimImpl
import org.jetbrains.kotlinx.dataframe.util.APACHE_CSV
import org.jetbrains.kotlinx.dataframe.util.AS_URL
import org.jetbrains.kotlinx.dataframe.util.AS_URL_IMPORT
import org.jetbrains.kotlinx.dataframe.util.AS_URL_REPLACE
import org.jetbrains.kotlinx.dataframe.util.DF_READ_NO_CSV
import org.jetbrains.kotlinx.dataframe.util.DF_READ_NO_CSV_REPLACE
import org.jetbrains.kotlinx.dataframe.util.READ_CSV
import org.jetbrains.kotlinx.dataframe.util.READ_CSV_FILE_OR_URL_REPLACE
import org.jetbrains.kotlinx.dataframe.util.READ_CSV_FILE_REPLACE
import org.jetbrains.kotlinx.dataframe.util.READ_CSV_IMPORT
import org.jetbrains.kotlinx.dataframe.util.READ_CSV_STREAM_REPLACE
import org.jetbrains.kotlinx.dataframe.util.READ_CSV_URL_REPLACE
import org.jetbrains.kotlinx.dataframe.util.READ_DELIM
import org.jetbrains.kotlinx.dataframe.util.READ_DELIM_READER_REPLACE
import org.jetbrains.kotlinx.dataframe.util.READ_DELIM_STREAM_REPLACE
import org.jetbrains.kotlinx.dataframe.util.TO_CSV
import org.jetbrains.kotlinx.dataframe.util.TO_CSV_IMPORT
import org.jetbrains.kotlinx.dataframe.util.TO_CSV_REPLACE
import org.jetbrains.kotlinx.dataframe.util.WRITE_CSV
import org.jetbrains.kotlinx.dataframe.util.WRITE_CSV_FILE_REPLACE
import org.jetbrains.kotlinx.dataframe.util.WRITE_CSV_IMPORT
import org.jetbrains.kotlinx.dataframe.util.WRITE_CSV_PATH_REPLACE
import org.jetbrains.kotlinx.dataframe.util.WRITE_CSV_WRITER_REPLACE
import org.jetbrains.kotlinx.dataframe.values
import java.io.BufferedInputStream
import java.io.BufferedReader
import java.io.File
import java.io.FileInputStream
import java.io.FileWriter
import java.io.IOException
import java.io.InputStream
import java.io.InputStreamReader
import java.io.Reader
Expand All @@ -46,6 +63,7 @@ import kotlin.reflect.KType
import kotlin.reflect.typeOf
import kotlin.time.Duration

@Deprecated(message = APACHE_CSV, level = DeprecationLevel.WARNING)
public class CSV(private val delimiter: Char = ',') : SupportedDataFrameFormat {
override fun readDataFrame(stream: InputStream, header: List<String>): AnyFrame =
DataFrame.readCSV(stream = stream, delimiter = delimiter, header = header)
Expand All @@ -57,14 +75,18 @@ public class CSV(private val delimiter: Char = ',') : SupportedDataFrameFormat {

override fun acceptsSample(sample: SupportedFormatSample): Boolean = true // Extension is enough

override val testOrder: Int = 20000
override val testOrder: Int = 20_001

override fun createDefaultReadMethod(pathRepresentation: String?): DefaultReadDfMethod {
val arguments = MethodArguments().add("delimiter", typeOf<Char>(), "'%L'", delimiter)
return DefaultReadCsvMethod(pathRepresentation, arguments)
}
}

@Deprecated(
message = APACHE_CSV,
level = DeprecationLevel.WARNING,
)
public enum class CSVType(public val format: CSVFormat) {
DEFAULT(
CSVFormat.DEFAULT.builder()
Expand All @@ -81,12 +103,19 @@ public enum class CSVType(public val format: CSVFormat) {

private val defaultCharset = Charsets.UTF_8

/**
 * Tells whether [fileOrUrl] carries a compressed-file extension.
 *
 * The text after the final `.` is compared, case-sensitively, against `gz` and `zip`.
 * When [fileOrUrl] contains no `.` at all, the entire string is compared instead.
 */
@Deprecated("", level = DeprecationLevel.WARNING)
internal fun isCompressed(fileOrUrl: String): Boolean {
    val extension = fileOrUrl.substringAfterLast('.')
    return extension == "gz" || extension == "zip"
}

/**
 * Tells whether [file] has a compressed-file extension, i.e. whether its
 * [extension][File.extension] is exactly `gz` or `zip` (case-sensitive).
 */
@Deprecated("", level = DeprecationLevel.WARNING)
internal fun isCompressed(file: File): Boolean =
    when (file.extension) {
        "gz", "zip" -> true
        else -> false
    }

/**
 * Tells whether [url] points at a compressed resource by applying the
 * String-based `isCompressed` check to the URL's path component.
 */
@Deprecated("", level = DeprecationLevel.WARNING)
internal fun isCompressed(url: URL): Boolean {
    val path = url.path
    return isCompressed(path)
}

@Deprecated(
message = APACHE_CSV,
level = DeprecationLevel.HIDDEN, // clashes with the new readDelim
)
@Refine
@Interpretable("ReadDelimStr")
public fun DataFrame.Companion.readDelimStr(
Expand All @@ -106,7 +135,7 @@ public fun DataFrame.Companion.readDelimStr(

@Deprecated(
message = DF_READ_NO_CSV,
replaceWith = ReplaceWith(DF_READ_NO_CSV_REPLACE),
replaceWith = ReplaceWith(DF_READ_NO_CSV_REPLACE, READ_CSV_IMPORT),
level = DeprecationLevel.ERROR,
)
public fun DataFrame.Companion.read(
Expand All @@ -118,22 +147,13 @@ public fun DataFrame.Companion.read(
readLines: Int? = null,
duplicate: Boolean = true,
charset: Charset = Charsets.UTF_8,
): DataFrame<*> =
catchHttpResponse(asUrl(fileOrUrl)) {
readDelim(
it,
delimiter,
header,
isCompressed(fileOrUrl),
getCSVType(fileOrUrl),
colTypes,
skipLines,
readLines,
duplicate,
charset,
)
}
): DataFrame<*> = error(DF_READ_NO_CSV)

@Deprecated(
message = READ_CSV,
replaceWith = ReplaceWith(READ_CSV_FILE_OR_URL_REPLACE, READ_CSV_IMPORT),
level = DeprecationLevel.WARNING,
)
@OptInRefine
@Interpretable("ReadCSV0")
public fun DataFrame.Companion.readCSV(
Expand Down Expand Up @@ -163,6 +183,11 @@ public fun DataFrame.Companion.readCSV(
)
}

@Deprecated(
message = READ_CSV,
replaceWith = ReplaceWith(READ_CSV_FILE_REPLACE, READ_CSV_IMPORT),
level = DeprecationLevel.WARNING,
)
public fun DataFrame.Companion.readCSV(
file: File,
delimiter: Char = ',',
Expand All @@ -188,6 +213,11 @@ public fun DataFrame.Companion.readCSV(
parserOptions,
)

@Deprecated(
message = READ_CSV,
replaceWith = ReplaceWith(READ_CSV_URL_REPLACE, READ_CSV_IMPORT),
level = DeprecationLevel.WARNING,
)
public fun DataFrame.Companion.readCSV(
url: URL,
delimiter: Char = ',',
Expand All @@ -212,6 +242,11 @@ public fun DataFrame.Companion.readCSV(
parserOptions,
)

@Deprecated(
message = READ_CSV,
replaceWith = ReplaceWith(READ_CSV_STREAM_REPLACE, READ_CSV_IMPORT),
level = DeprecationLevel.WARNING,
)
public fun DataFrame.Companion.readCSV(
stream: InputStream,
delimiter: Char = ',',
Expand All @@ -238,13 +273,6 @@ public fun DataFrame.Companion.readCSV(
parserOptions,
)

/**
 * Picks the [CSVType] matching the extension of [path].
 *
 * The extension is the text after the last `.`, lower-cased before comparison:
 * `csv` maps to [CSVType.DEFAULT] and `tdf` maps to [CSVType.TDF].
 *
 * @throws IOException if the extension is neither `csv` nor `tdf`.
 */
private fun getCSVType(path: String): CSVType {
    val extension = path.substringAfterLast('.').lowercase()
    if (extension == "csv") return CSVType.DEFAULT
    if (extension == "tdf") return CSVType.TDF
    throw IOException("Unknown file format")
}

@Deprecated(
message = AS_URL,
replaceWith = ReplaceWith(AS_URL_REPLACE, AS_URL_IMPORT),
Expand All @@ -264,6 +292,11 @@ private fun getFormat(
.setAllowMissingColumnNames(duplicate)
.build()

@Deprecated(
message = READ_DELIM,
replaceWith = ReplaceWith(READ_DELIM_STREAM_REPLACE),
level = DeprecationLevel.WARNING,
)
public fun DataFrame.Companion.readDelim(
inStream: InputStream,
delimiter: Char = ',',
Expand Down Expand Up @@ -343,6 +376,11 @@ public fun ColType.toKType(): KType =
ColType.Char -> typeOf<Char>()
}

@Deprecated(
message = READ_DELIM,
replaceWith = ReplaceWith(READ_DELIM_READER_REPLACE),
level = DeprecationLevel.WARNING,
)
public fun DataFrame.Companion.readDelim(
reader: Reader,
format: CSVFormat = CSVFormat.DEFAULT.builder()
Expand Down Expand Up @@ -371,12 +409,27 @@ public fun DataFrame.Companion.readDelim(
)
}

/**
 * Writes this frame as CSV into [file] using the given Apache Commons CSV [format].
 *
 * Opens a [FileWriter] on [file] and hands it to the `Appendable`-based `writeCSV` overload.
 * NOTE(review): `FileWriter(File)` encodes with the JVM's default charset, not an explicit one.
 */
@Deprecated(
    message = WRITE_CSV,
    replaceWith = ReplaceWith(WRITE_CSV_FILE_REPLACE, WRITE_CSV_IMPORT),
    level = DeprecationLevel.WARNING,
)
public fun AnyFrame.writeCSV(file: File, format: CSVFormat = CSVFormat.DEFAULT) {
    val fileWriter = FileWriter(file)
    writeCSV(fileWriter, format)
}

/**
 * Writes this frame as CSV into the file located at [path] using the given Apache Commons CSV [format].
 *
 * Opens a [FileWriter] on [path] and hands it to the `Appendable`-based `writeCSV` overload.
 * NOTE(review): `FileWriter(String)` encodes with the JVM's default charset, not an explicit one.
 */
@Deprecated(
    message = WRITE_CSV,
    replaceWith = ReplaceWith(WRITE_CSV_PATH_REPLACE, WRITE_CSV_IMPORT),
    level = DeprecationLevel.WARNING,
)
public fun AnyFrame.writeCSV(path: String, format: CSVFormat = CSVFormat.DEFAULT) {
    val fileWriter = FileWriter(path)
    writeCSV(fileWriter, format)
}

@Deprecated(
message = WRITE_CSV,
replaceWith = ReplaceWith(WRITE_CSV_WRITER_REPLACE, WRITE_CSV_IMPORT),
level = DeprecationLevel.WARNING,
)
public fun AnyFrame.writeCSV(writer: Appendable, format: CSVFormat = CSVFormat.DEFAULT) {
format.print(writer).use { printer ->
if (!format.skipHeaderRecord) {
Expand All @@ -395,6 +448,11 @@ public fun AnyFrame.writeCSV(writer: Appendable, format: CSVFormat = CSVFormat.D
}
}

@Deprecated(
message = TO_CSV,
replaceWith = ReplaceWith(TO_CSV_REPLACE, TO_CSV_IMPORT),
level = DeprecationLevel.WARNING,
)
public fun AnyFrame.toCsv(format: CSVFormat = CSVFormat.DEFAULT): String =
StringWriter().use {
this.writeCSV(it, format)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,23 @@ import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod
import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadTsvMethod
import org.jetbrains.kotlinx.dataframe.util.APACHE_CSV
import org.jetbrains.kotlinx.dataframe.util.READ_TSV
import org.jetbrains.kotlinx.dataframe.util.READ_TSV_FILE_OR_URL_REPLACE
import org.jetbrains.kotlinx.dataframe.util.READ_TSV_FILE_REPLACE
import org.jetbrains.kotlinx.dataframe.util.READ_TSV_IMPORT
import org.jetbrains.kotlinx.dataframe.util.READ_TSV_STREAM_REPLACE
import org.jetbrains.kotlinx.dataframe.util.READ_TSV_URL_REPLACE
import java.io.File
import java.io.FileInputStream
import java.io.InputStream
import java.net.URL
import java.nio.charset.Charset

@Deprecated(
message = APACHE_CSV,
level = DeprecationLevel.WARNING,
)
public class TSV : SupportedDataFrameFormat {
override fun readDataFrame(stream: InputStream, header: List<String>): AnyFrame =
DataFrame.readTSV(stream, header = header)
Expand All @@ -21,14 +32,19 @@ public class TSV : SupportedDataFrameFormat {

override fun acceptsSample(sample: SupportedFormatSample): Boolean = true // Extension is enough

override val testOrder: Int = 30000
override val testOrder: Int = 30_001

override fun createDefaultReadMethod(pathRepresentation: String?): DefaultReadDfMethod =
DefaultReadTsvMethod(pathRepresentation)
}

private const val TAB_CHAR = '\t'

@Deprecated(
message = READ_TSV,
replaceWith = ReplaceWith(READ_TSV_FILE_OR_URL_REPLACE, READ_TSV_IMPORT),
level = DeprecationLevel.WARNING,
)
public fun DataFrame.Companion.readTSV(
fileOrUrl: String,
header: List<String> = listOf(),
Expand All @@ -55,6 +71,11 @@ public fun DataFrame.Companion.readTSV(
)
}

@Deprecated(
message = READ_TSV,
replaceWith = ReplaceWith(READ_TSV_FILE_REPLACE, READ_TSV_IMPORT),
level = DeprecationLevel.WARNING,
)
public fun DataFrame.Companion.readTSV(
file: File,
header: List<String> = listOf(),
Expand All @@ -77,6 +98,11 @@ public fun DataFrame.Companion.readTSV(
charset,
)

@Deprecated(
message = READ_TSV,
replaceWith = ReplaceWith(READ_TSV_URL_REPLACE, READ_TSV_IMPORT),
level = DeprecationLevel.WARNING,
)
public fun DataFrame.Companion.readTSV(
url: URL,
header: List<String> = listOf(),
Expand All @@ -99,6 +125,11 @@ public fun DataFrame.Companion.readTSV(
parserOptions,
)

@Deprecated(
message = READ_TSV,
replaceWith = ReplaceWith(READ_TSV_STREAM_REPLACE, READ_TSV_IMPORT),
level = DeprecationLevel.WARNING,
)
public fun DataFrame.Companion.readTSV(
stream: InputStream,
header: List<String> = listOf(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -157,15 +157,15 @@ internal class Integration(private val notebook: Notebook, private val options:
override fun Builder.onLoaded() {
if (version != null) {
if (enableExperimentalCsv?.toBoolean() == true) {
println("Enabling experimental CSV module: dataframe-csv")
dependencies("org.jetbrains.kotlinx:dataframe-csv:$version")
println("CSV module is already enabled by default now.")
}
if (enableExperimentalGeo?.toBoolean() == true) {
println("Enabling experimental Geo module: dataframe-geo")
repositories("https://repo.osgeo.org/repository/release")
dependencies("org.jetbrains.kotlinx:dataframe-geo:$version")
}
dependencies(
"org.jetbrains.kotlinx:dataframe-csv:$version",
"org.jetbrains.kotlinx:dataframe-excel:$version",
"org.jetbrains.kotlinx:dataframe-jdbc:$version",
"org.jetbrains.kotlinx:dataframe-arrow:$version",
Expand Down
Loading