Skip to content

Commit 258f5b7

Browse files
committed
Introduces the new dataframe-csv module with KDoc preprocessing enabled. Adds read(Csv|Tsv|Delim)(Str) based on Deephaven CSV, as well as write(Csv|Tsv|Delim) and to(Csv|Tsv|Delim)Str based on Apache Commons CSV. Fixes almost all cases of the umbrella issue and includes many tests.
1 parent 8d8dad7 commit 258f5b7

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+98178
-7
lines changed

.github/workflows/generated-sources-master.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ jobs:
2626
run: |
2727
git config --global user.name 'github-actions[bot]'
2828
git config --global user.email 'github-actions[bot]@users.noreply.github.com'
29-
git add './core/generated-sources' './docs/StardustDocs/snippets' './docs/StardustDocs/topics'
29+
git add './core/generated-sources' './dataframe-csv/generated-sources' './docs/StardustDocs/snippets' './docs/StardustDocs/topics'
3030
git diff --staged --quiet || git commit -m "Automated commit of generated code"
3131
git push
3232
env:

.github/workflows/generated-sources.yml

+3-3
Original file line numberDiff line numberDiff line change
@@ -38,18 +38,18 @@ jobs:
3838
git config --global user.name "GitHub Actions"
3939
4040
- name: Run Gradle task
41-
run: ./gradlew :core:processKDocsMain korro
41+
run: ./gradlew processKDocsMain korro
4242

4343
- name: Check for changes in generated sources
4444
id: git-diff
45-
run: echo "changed=$(if git diff --quiet './core/generated-sources' './docs/StardustDocs/snippets' './docs/StardustDocs/topics'; then echo 'false'; else echo 'true'; fi)" >> $GITHUB_OUTPUT
45+
run: echo "changed=$(if git diff --quiet './core/generated-sources' './dataframe-csv/generated-sources' './docs/StardustDocs/snippets' './docs/StardustDocs/topics'; then echo 'false'; else echo 'true'; fi)" >> $GITHUB_OUTPUT
4646

4747
- name: Commit and push if changes
4848
id: git-commit
4949
if: steps.git-diff.outputs.changed == 'true'
5050
run: |
5151
git checkout -b generated-sources/docs-update-${{ github.run_number }}
52-
git add './core/generated-sources' './docs/StardustDocs/snippets' './docs/StardustDocs/topics'
52+
git add './core/generated-sources' './dataframe-csv/generated-sources' './docs/StardustDocs/snippets' './docs/StardustDocs/topics'
5353
git commit -m "Update generated sources with recent changes"
5454
git push origin generated-sources/docs-update-${{ github.run_number }}
5555
echo "commit=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT

build.gradle.kts

+3
Original file line numberDiff line numberDiff line change
@@ -55,12 +55,15 @@ dependencies {
5555
api(project(":dataframe-excel"))
5656
api(project(":dataframe-openapi"))
5757
api(project(":dataframe-jdbc"))
58+
// TODO enable when it leaves the experimental phase
59+
// api(project(":dataframe-csv"))
5860

5961
kover(project(":core"))
6062
kover(project(":dataframe-arrow"))
6163
kover(project(":dataframe-excel"))
6264
kover(project(":dataframe-openapi"))
6365
kover(project(":dataframe-jdbc"))
66+
kover(project(":dataframe-csv"))
6467
kover(project(":plugins:kotlin-dataframe"))
6568
}
6669

dataframe-csv/api/dataframe-csv.api

+181
Large diffs are not rendered by default.

dataframe-csv/build.gradle.kts

+170
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
import nl.jolanrensen.docProcessor.defaultProcessors.ARG_DOC_PROCESSOR_LOG_NOT_FOUND
2+
import nl.jolanrensen.docProcessor.gradle.creatingProcessDocTask
3+
import org.gradle.jvm.tasks.Jar
4+
5+
plugins {
6+
with(libs.plugins) {
7+
alias(kotlin.jvm)
8+
alias(publisher)
9+
alias(serialization)
10+
alias(kover)
11+
alias(ktlint)
12+
alias(jupyter.api)
13+
alias(docProcessor)
14+
alias(binary.compatibility.validator)
15+
alias(kotlinx.benchmark)
16+
}
17+
idea
18+
}
19+
20+
group = "org.jetbrains.kotlinx"
21+
22+
val jupyterApiTCRepo: String by project
23+
24+
repositories {
25+
mavenLocal()
26+
mavenCentral()
27+
maven(jupyterApiTCRepo)
28+
}
29+
30+
dependencies {
31+
implementation(project(":core"))
32+
33+
// for csv reading
34+
implementation(libs.deephavenCsv)
35+
// for csv writing
36+
implementation(libs.commonsCsv)
37+
implementation(libs.commonsIo)
38+
implementation(libs.sl4j)
39+
implementation(libs.kotlinLogging)
40+
implementation(libs.kotlin.reflect)
41+
42+
testApi(project(":core"))
43+
testImplementation(libs.kotlinx.benchmark.runtime)
44+
testImplementation(libs.junit)
45+
testImplementation(libs.kotestAssertions) {
46+
exclude("org.jetbrains.kotlin", "kotlin-stdlib-jdk8")
47+
}
48+
}
49+
50+
benchmark {
51+
targets {
52+
register("test")
53+
}
54+
}
55+
56+
val generatedSourcesFolderName = "generated-sources"
57+
58+
// Backup the kotlin source files location
59+
val kotlinMainSources = kotlin.sourceSets.main
60+
.get()
61+
.kotlin.sourceDirectories
62+
.toList()
63+
val kotlinTestSources = kotlin.sourceSets.test
64+
.get()
65+
.kotlin.sourceDirectories
66+
.toList()
67+
68+
fun pathOf(vararg parts: String) = parts.joinToString(File.separator)
69+
70+
// Include both test and main sources for cross-referencing; exclude generated sources
71+
val processKDocsMainSources = (kotlinMainSources + kotlinTestSources)
72+
.filterNot { pathOf("build", "generated") in it.path }
73+
74+
// sourceset of the generated sources as a result of `processKDocsMain`, this will create linter tasks
75+
val generatedSources by kotlin.sourceSets.creating {
76+
kotlin {
77+
setSrcDirs(
78+
listOf(
79+
"$generatedSourcesFolderName/src/main/kotlin",
80+
"$generatedSourcesFolderName/src/main/java",
81+
),
82+
)
83+
}
84+
}
85+
86+
// Task to generate the processed documentation
87+
val processKDocsMain by creatingProcessDocTask(processKDocsMainSources) {
88+
target = file(generatedSourcesFolderName)
89+
arguments += ARG_DOC_PROCESSOR_LOG_NOT_FOUND to false
90+
91+
// false, so `runKtlintFormatOverGeneratedSourcesSourceSet` can format the output
92+
outputReadOnly = false
93+
94+
exportAsHtml {
95+
dir = file("../docs/StardustDocs/snippets/kdocs")
96+
}
97+
task {
98+
group = "KDocs"
99+
finalizedBy("runKtlintFormatOverGeneratedSourcesSourceSet")
100+
}
101+
}
102+
103+
tasks.named("ktlintGeneratedSourcesSourceSetCheck") {
104+
onlyIf { false }
105+
}
106+
tasks.named("runKtlintCheckOverGeneratedSourcesSourceSet") {
107+
onlyIf { false }
108+
}
109+
110+
// If `changeJarTask` is run, modify all Jar tasks such that before running the Kotlin sources are set to
111+
// the target of `processKDocsMain`, and they are returned to normal afterward.
112+
// This is usually only done when publishing
113+
val changeJarTask by tasks.creating {
114+
outputs.upToDateWhen { false }
115+
doFirst {
116+
tasks.withType<Jar> {
117+
doFirst {
118+
require(generatedSources.kotlin.srcDirs.toList().isNotEmpty()) {
119+
// `require` uses the lambda's result as the exception message, so yield the text itself (it is still logged as an error)
"`processKDocsMain`'s outputs are empty, did `processKDocsMain` run before this task?".also { logger.error(it) }
120+
}
121+
kotlin.sourceSets.main {
122+
kotlin.setSrcDirs(generatedSources.kotlin.srcDirs)
123+
}
124+
logger.lifecycle("$this is run with modified sources: \"$generatedSourcesFolderName\"")
125+
}
126+
127+
doLast {
128+
kotlin.sourceSets.main {
129+
kotlin.setSrcDirs(kotlinMainSources)
130+
}
131+
}
132+
}
133+
}
134+
}
135+
136+
// if `processKDocsMain` runs, the Jar tasks must run after it so the generated-sources are there
137+
tasks.withType<Jar> {
138+
mustRunAfter(changeJarTask, processKDocsMain)
139+
}
140+
141+
// modify all publishing tasks to depend on `changeJarTask` so the sources are swapped out with generated sources
142+
tasks.configureEach {
143+
if (name.startsWith("publish")) {
144+
dependsOn(processKDocsMain, changeJarTask)
145+
}
146+
}
147+
148+
// Exclude the generated/processed sources from the IDE
149+
idea {
150+
module {
151+
excludeDirs.add(file(generatedSourcesFolderName))
152+
}
153+
}
154+
155+
kotlinPublications {
156+
publication {
157+
publicationName = "dataframeCsv"
158+
artifactId = project.name
159+
description = "CSV support for Kotlin Dataframe"
160+
packageName = artifactId
161+
}
162+
}
163+
164+
kotlin {
165+
explicitApi()
166+
sourceSets.all {
167+
languageSettings {
168+
}
169+
}
170+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
@file:ExcludeFromSources
2+
3+
package org.jetbrains.kotlinx.dataframe.documentation
4+
5+
import org.jetbrains.kotlinx.dataframe.DataFrame
6+
import org.jetbrains.kotlinx.dataframe.documentation.CommonReadDelimDocs.DataTitleArg
7+
import org.jetbrains.kotlinx.dataframe.documentation.CommonReadDelimDocs.FileTypeArg
8+
import org.jetbrains.kotlinx.dataframe.documentation.CommonReadDelimDocs.FileTypeTitleArg
9+
import org.jetbrains.kotlinx.dataframe.documentation.CommonReadDelimDocs.FunctionLinkArg
10+
import org.jetbrains.kotlinx.dataframe.documentation.CommonReadDelimDocs.OldFunctionLinkArg
11+
import org.jetbrains.kotlinx.dataframe.io.ColType
12+
import org.jetbrains.kotlinx.dataframe.io.DEFAULT_COL_TYPE
13+
import org.jetbrains.kotlinx.dataframe.io.DEFAULT_PARSER_OPTIONS
14+
import org.jetbrains.kotlinx.dataframe.io.ExperimentalCsv
15+
import java.io.File
16+
import java.io.InputStream
17+
import java.net.URL
18+
import java.util.Locale
19+
20+
/**
21+
* ### Read $[FileTypeTitleArg] $[DataTitleArg] to [DataFrame]
22+
*
23+
* Reads any $[FileTypeArg] $[DataArg] to a [DataFrame][DataFrame].
24+
*
25+
* Parameters you can use to customize the reading process include, for instance, \[delimiter\],
26+
* \[header\], \[colTypes\], \[readLines\], and \[parserOptions\].
27+
* See the param list below for all settings.
28+
*
29+
* The integration is built upon {@include [DocumentationUrls.Deephaven]}.
30+
*
31+
* ##### Similar Functions
32+
* With the overloads of $[FunctionLinkArg]`()`, you can read any $[FileTypeArg] by [File][File],
33+
* [URL][URL], or [InputStream][InputStream].
34+
* Reading by file path or URL can also be done by passing a [String].
35+
*
36+
* For example, $[FunctionLinkArg]`("input.$[CommonReadDelimDocs.FileExtensionArg]")` or with some options:
37+
*
38+
* $[FunctionLinkArg]`(`
39+
*
40+
* {@include [Indent]}`file = `[File][File]`("input.$[CommonReadDelimDocs.FileExtensionArg]"),`
41+
*
42+
* {@include [Indent]}`parserOptions = `[DEFAULT_PARSER_OPTIONS][DEFAULT_PARSER_OPTIONS]`.copy(locale = `[Locale][Locale]`.`[US][Locale.US]`),`
43+
*
44+
* {@include [Indent]}`colTypes = `[mapOf][mapOf]`("a" `[to][to]` `[ColType][ColType]`.`[Int][ColType.Int]`, `[DEFAULT_COL_TYPE][DEFAULT_COL_TYPE]` `[to][to]` `[ColType][ColType]`.`[String][ColType.String]`),`
45+
*
46+
* {@include [Indent]}`readLines = 1000L,`
47+
*
48+
* `)`
49+
*
50+
* ZIP (.zip) or GZIP (.gz) files are supported by default. \[compression\] is automatically detected.
51+
*
52+
* You can also read "raw" $[FileTypeArg] data from a [String] like this:
53+
*
54+
* $[StrFunctionLinkArg]`("a,b,c", delimiter = ",")`
55+
*
56+
* _**NOTE EXPERIMENTAL**: This is a new set of functions, replacing the old $[OldFunctionLinkArg]`()` functions.
57+
* They'll hopefully be faster and better. Until they are proven to be so,
58+
* you'll need to [opt in][OptIn] to [ExperimentalCsv][ExperimentalCsv] to be able to use them._
59+
*
60+
* @comment Some helper arguments for the function links
61+
* @set [FunctionLinkArg] \[DataFrame.${[FunctionNameArg]}\]\[${[FunctionNameArg]}\]
62+
* @set [StrFunctionLinkArg] \[DataFrame.${[FunctionNameArg]}Str\]\[${[FunctionNameArg]}Str\]
63+
* @set [OldFunctionLinkArg] \[DataFrame.${[OldFunctionNameArg]}\]\[org.jetbrains.kotlinx.dataframe.io.${[OldFunctionNameArg]}\]
64+
*/
65+
internal interface CommonReadDelimDocs {
66+
67+
/**
68+
* @include [CommonReadDelimDocs]
69+
* @set [FileTypeTitleArg] CSV
70+
* @set [FileTypeArg] CSV
71+
* @set [FileExtensionArg] csv
72+
* @set [FunctionNameArg] readCsv
73+
* @set [OldFunctionNameArg] readCSV
74+
*/
75+
interface CsvDocs
76+
77+
/**
78+
* @include [CommonReadDelimDocs]
79+
* @set [FileTypeTitleArg] TSV
80+
* @set [FileTypeArg] TSV
81+
* @set [FileExtensionArg] tsv
82+
* @set [FunctionNameArg] readTsv
83+
* @set [OldFunctionNameArg] readTSV
84+
*/
85+
interface TsvDocs
86+
87+
/**
88+
* @include [CommonReadDelimDocs]
89+
* @set [FileTypeTitleArg] Delimiter-Separated Text
90+
* @set [FileTypeArg] delimiter-separated text
91+
* @set [FileExtensionArg] txt
92+
* @set [FunctionNameArg] readDelim
93+
* @set [OldFunctionNameArg] readDelim{@comment cannot differentiate between old and new}
94+
*/
95+
interface DelimDocs
96+
97+
/**
98+
* @include [DelimParams.HEADER]
99+
* @include [DelimParams.COL_TYPES]
100+
* @include [DelimParams.SKIP_LINES]
101+
* @include [DelimParams.READ_LINES]
102+
* @include [DelimParams.PARSER_OPTIONS]
103+
* @include [DelimParams.IGNORE_EMPTY_LINES]
104+
* @include [DelimParams.ALLOW_MISSING_COLUMNS]
105+
* @include [DelimParams.IGNORE_EXCESS_COLUMNS]
106+
* @include [DelimParams.QUOTE]
107+
* @include [DelimParams.IGNORE_SURROUNDING_SPACES]
108+
* @include [DelimParams.TRIM_INSIDE_QUOTED]
109+
* @include [DelimParams.PARSE_PARALLEL]
110+
*/
111+
interface CommonReadParams
112+
113+
// something like "File" or "File/URL"
114+
interface DataTitleArg
115+
116+
// something like "file" or "file or url"
117+
interface DataArg
118+
119+
// Like "CSV" or "TSV", capitalized
120+
interface FileTypeTitleArg
121+
122+
// Like "CSV" or "TSV"
123+
interface FileTypeArg
124+
125+
// like "csv" or "txt"
126+
interface FileExtensionArg
127+
128+
// Function name, like "readCsv"
129+
interface FunctionNameArg
130+
131+
// Old function name, like "readCSV"
132+
interface OldFunctionNameArg
133+
134+
// A link to the main function, set by the KDoc of CommonReadDelimDocs itself
135+
interface FunctionLinkArg
136+
137+
// A link to the str function, set by the KDoc of CommonReadDelimDocs itself
138+
interface StrFunctionLinkArg
139+
140+
// A link to the old function, set by the KDoc of CommonReadDelimDocs itself
141+
interface OldFunctionLinkArg
142+
}

0 commit comments

Comments
 (0)