Skip to content

Commit 86dc97f

Browse files
committed
some implementation refactoring of csv
1 parent c811d70 commit 86dc97f

File tree

469 files changed

+93927
-105
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

469 files changed

+93927
-105
lines changed

core/build.gradle.kts

+1-1
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ dependencies {
7070
implementation(libs.serialization.core)
7171
implementation(libs.serialization.json)
7272
// implementation(libs.fastDoubleParser) TODO temp
73-
implementation(files("../libs/fastdoubleparser-2024-10-20_114324cc.jar"))
73+
api(files("../libs/fastdoubleparser-2024-10-20_114324cc.jar"))
7474

7575
implementation(libs.fuel)
7676

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
package org.jetbrains.kotlinx.dataframe
2+
3+
import org.jetbrains.kotlinx.dataframe.api.ColumnSelectionDsl
4+
import org.jetbrains.kotlinx.dataframe.api.asColumnGroup
5+
import org.jetbrains.kotlinx.dataframe.api.cast
6+
import org.jetbrains.kotlinx.dataframe.api.castFrameColumn
7+
import org.jetbrains.kotlinx.dataframe.api.getColumn
8+
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
9+
import org.jetbrains.kotlinx.dataframe.columns.ColumnPath
10+
import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
11+
import org.jetbrains.kotlinx.dataframe.columns.FrameColumn
12+
import org.jetbrains.kotlinx.dataframe.impl.columnName
13+
import org.jetbrains.kotlinx.dataframe.impl.columns.asAnyFrameColumn
14+
import kotlin.reflect.KProperty
15+
16+
/**
17+
* Provides access to [columns][DataColumn].
18+
*
19+
* Base interface for [DataFrame] and [ColumnSelectionDsl].
20+
*
21+
* @param T Schema marker. Used to generate extension properties for typed column access.
22+
*/
23+
public interface ColumnsContainer<out T> {
24+
25+
// region columns
26+
27+
public fun columns(): List<AnyCol>
28+
29+
public fun columnsCount(): Int
30+
31+
public fun containsColumn(name: String): Boolean
32+
33+
public fun containsColumn(path: ColumnPath): Boolean
34+
35+
public fun getColumnIndex(name: String): Int
36+
37+
// endregion
38+
39+
// region getColumnOrNull
40+
41+
public fun getColumnOrNull(name: String): AnyCol?
42+
43+
public fun getColumnOrNull(index: Int): AnyCol?
44+
45+
public fun <R> getColumnOrNull(column: ColumnReference<R>): DataColumn<R>?
46+
47+
public fun <R> getColumnOrNull(column: KProperty<R>): DataColumn<R>?
48+
49+
public fun getColumnOrNull(path: ColumnPath): AnyCol?
50+
51+
public fun <R> getColumnOrNull(column: ColumnSelector<T, R>): DataColumn<R>?
52+
53+
// endregion
54+
55+
// region get
56+
57+
public operator fun get(columnName: String): AnyCol = getColumn(columnName)
58+
59+
public operator fun get(columnPath: ColumnPath): AnyCol = getColumn(columnPath)
60+
61+
public operator fun <R> get(column: DataColumn<R>): DataColumn<R> = getColumn(column.name()).cast()
62+
63+
public operator fun <R> get(column: DataColumn<DataRow<R>>): ColumnGroup<R> = getColumn(column)
64+
65+
public operator fun <R> get(column: DataColumn<DataFrame<R>>): FrameColumn<R> = getColumn(column)
66+
67+
public operator fun <R> get(column: ColumnReference<R>): DataColumn<R> = getColumn(column)
68+
69+
public operator fun <R> get(column: ColumnReference<DataRow<R>>): ColumnGroup<R> = getColumn(column)
70+
71+
public operator fun <R> get(column: ColumnReference<DataFrame<R>>): FrameColumn<R> = getColumn(column)
72+
73+
public operator fun <R> get(column: KProperty<R>): DataColumn<R> = get(column.columnName).cast()
74+
75+
public operator fun <R> get(column: KProperty<DataRow<R>>): ColumnGroup<R> =
76+
get(column.columnName).asColumnGroup().cast()
77+
78+
public operator fun <R> get(column: KProperty<DataFrame<R>>): FrameColumn<R> =
79+
get(column.columnName).asAnyFrameColumn().castFrameColumn()
80+
81+
public fun <C> get(columns: ColumnsSelector<T, C>): List<DataColumn<C>>
82+
83+
public fun <C> get(column: ColumnSelector<T, C>): DataColumn<C> = get(column as ColumnsSelector<T, C>).single()
84+
85+
// endregion
86+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
package org.jetbrains.kotlinx.dataframe
2+
3+
import org.jetbrains.kotlinx.dataframe.api.Infer
4+
import org.jetbrains.kotlinx.dataframe.api.asDataColumn
5+
import org.jetbrains.kotlinx.dataframe.api.cast
6+
import org.jetbrains.kotlinx.dataframe.api.concat
7+
import org.jetbrains.kotlinx.dataframe.api.filter
8+
import org.jetbrains.kotlinx.dataframe.api.map
9+
import org.jetbrains.kotlinx.dataframe.api.schema
10+
import org.jetbrains.kotlinx.dataframe.api.take
11+
import org.jetbrains.kotlinx.dataframe.columns.BaseColumn
12+
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
13+
import org.jetbrains.kotlinx.dataframe.columns.ColumnKind
14+
import org.jetbrains.kotlinx.dataframe.columns.ColumnPath
15+
import org.jetbrains.kotlinx.dataframe.columns.ColumnResolutionContext
16+
import org.jetbrains.kotlinx.dataframe.columns.ColumnWithPath
17+
import org.jetbrains.kotlinx.dataframe.columns.FrameColumn
18+
import org.jetbrains.kotlinx.dataframe.columns.ValueColumn
19+
import org.jetbrains.kotlinx.dataframe.impl.columns.ColumnGroupImpl
20+
import org.jetbrains.kotlinx.dataframe.impl.columns.FrameColumnImpl
21+
import org.jetbrains.kotlinx.dataframe.impl.columns.ValueColumnImpl
22+
import org.jetbrains.kotlinx.dataframe.impl.columns.addPath
23+
import org.jetbrains.kotlinx.dataframe.impl.columns.guessColumnType
24+
import org.jetbrains.kotlinx.dataframe.impl.columns.toColumnKind
25+
import org.jetbrains.kotlinx.dataframe.impl.getValuesType
26+
import org.jetbrains.kotlinx.dataframe.impl.splitByIndices
27+
import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema
28+
import kotlin.reflect.KClass
29+
import kotlin.reflect.KProperty
30+
import kotlin.reflect.KType
31+
import kotlin.reflect.typeOf
32+
33+
/**
34+
* Column with [name] and [values] of specific [type].
35+
*
36+
* Base interface for [ValueColumn] and [FrameColumn], but not for [ColumnGroup]. However, implementations of all three [column kinds][ColumnKind] derive from [DataColumn] and can be cast to it safely.
37+
* Column operations whose signatures clash with the [DataFrame] API ([filter], [take], [map], etc.) are defined for [DataColumn] and not for [BaseColumn].
38+
*
39+
* @param T type of values in the column.
40+
*/
41+
public interface DataColumn<out T> : BaseColumn<T> {
42+
43+
public companion object {
44+
45+
/**
46+
* Creates [ValueColumn] using given [name], [values] and [type].
47+
*
48+
* @param name name of the column
49+
* @param values list of column values
50+
* @param type type of the column
51+
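* @param infer column type inference mode
* @param defaultValue optional value forwarded unchanged to the underlying [ValueColumnImpl]; `null` by default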
52+
*/
53+
public fun <T> createValueColumn(
54+
name: String,
55+
values: List<T>,
56+
type: KType,
57+
infer: Infer = Infer.None,
58+
defaultValue: T? = null,
59+
): ValueColumn<T> = ValueColumnImpl(values, name, getValuesType(values, type, infer), defaultValue)
60+
61+
/**
62+
* Creates [ValueColumn] using given [name], [values] and reified column [type].
63+
*
64+
* Note that the column [type] is determined at compile time from the [T] type argument.
65+
*
66+
* @param T type of the column
67+
* @param name name of the column
68+
* @param values list of column values
69+
* @param infer column type inference mode
70+
*/
71+
public inline fun <reified T> createValueColumn(
72+
name: String,
73+
values: List<T>,
74+
infer: Infer = Infer.None,
75+
): ValueColumn<T> =
76+
createValueColumn(
77+
name,
78+
values,
79+
getValuesType(
80+
values,
81+
typeOf<T>(),
82+
infer,
83+
),
84+
)
85+
86+
public fun <T> createColumnGroup(name: String, df: DataFrame<T>): ColumnGroup<T> = ColumnGroupImpl(name, df)
87+
88+
public fun <T> createFrameColumn(name: String, df: DataFrame<T>, startIndices: Iterable<Int>): FrameColumn<T> =
89+
FrameColumnImpl(name, df.splitByIndices(startIndices.asSequence()).toList(), lazy { df.schema() })
90+
91+
public fun <T> createFrameColumn(
92+
name: String,
93+
groups: List<DataFrame<T>>,
94+
schema: Lazy<DataFrameSchema>? = null,
95+
): FrameColumn<T> = FrameColumnImpl(name, groups, schema)
96+
97+
public fun <T> createWithTypeInference(
98+
name: String,
99+
values: List<T>,
100+
nullable: Boolean? = null,
101+
): DataColumn<T> = guessColumnType(name, values, nullable = nullable)
102+
103+
public fun <T> create(
104+
name: String,
105+
values: List<T>,
106+
type: KType,
107+
infer: Infer = Infer.None,
108+
): DataColumn<T> =
109+
when (type.toColumnKind()) {
110+
ColumnKind.Value -> createValueColumn(name, values, type, infer)
111+
ColumnKind.Group -> createColumnGroup(name, (values as List<AnyRow?>).concat()).asDataColumn().cast()
112+
ColumnKind.Frame -> createFrameColumn(name, values as List<AnyFrame>).asDataColumn().cast()
113+
}
114+
115+
public inline fun <reified T> create(name: String, values: List<T>, infer: Infer = Infer.None): DataColumn<T> =
116+
create(name, values, typeOf<T>(), infer)
117+
118+
public fun empty(name: String = ""): AnyCol = createValueColumn(name, emptyList<Unit>(), typeOf<Unit>())
119+
}
120+
121+
public fun hasNulls(): Boolean = type().isMarkedNullable
122+
123+
override fun distinct(): DataColumn<T>
124+
125+
override fun get(indices: Iterable<Int>): DataColumn<T>
126+
127+
override fun rename(newName: String): DataColumn<T>
128+
129+
override fun resolveSingle(context: ColumnResolutionContext): ColumnWithPath<T>? = this.addPath()
130+
131+
override operator fun getValue(thisRef: Any?, property: KProperty<*>): DataColumn<T> =
132+
super.getValue(thisRef, property) as DataColumn<T>
133+
134+
public operator fun iterator(): Iterator<T> = values().iterator()
135+
136+
public override operator fun get(range: IntRange): DataColumn<T>
137+
}
138+
139+
public val AnyCol.name: String get() = name()
140+
public val AnyCol.path: ColumnPath get() = path()
141+
142+
public val <T> DataColumn<T>.values: Iterable<T> get() = values()
143+
public val AnyCol.hasNulls: Boolean get() = hasNulls()
144+
public val AnyCol.size: Int get() = size()
145+
public val AnyCol.indices: IntRange get() = indices()
146+
147+
public val AnyCol.type: KType get() = type()
148+
public val AnyCol.kind: ColumnKind get() = kind()
149+
public val AnyCol.typeClass: KClass<*>
150+
get() = type.classifier as? KClass<*>
151+
?: error("Cannot cast ${type.classifier?.javaClass} to a ${KClass::class}. Column $name: $type")
152+
153+
public fun AnyBaseCol.indices(): IntRange = 0 until size()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
package org.jetbrains.kotlinx.dataframe
2+
3+
import org.jetbrains.kotlinx.dataframe.aggregation.Aggregatable
4+
import org.jetbrains.kotlinx.dataframe.aggregation.AggregateGroupedBody
5+
import org.jetbrains.kotlinx.dataframe.annotations.HasSchema
6+
import org.jetbrains.kotlinx.dataframe.api.ColumnsSelectionDsl
7+
import org.jetbrains.kotlinx.dataframe.api.add
8+
import org.jetbrains.kotlinx.dataframe.api.cast
9+
import org.jetbrains.kotlinx.dataframe.api.getRows
10+
import org.jetbrains.kotlinx.dataframe.api.indices
11+
import org.jetbrains.kotlinx.dataframe.api.rows
12+
import org.jetbrains.kotlinx.dataframe.api.select
13+
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
14+
import org.jetbrains.kotlinx.dataframe.columns.UnresolvedColumnsPolicy
15+
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
16+
import org.jetbrains.kotlinx.dataframe.impl.DataFrameImpl
17+
import org.jetbrains.kotlinx.dataframe.impl.DataFrameSize
18+
import org.jetbrains.kotlinx.dataframe.impl.getColumnsImpl
19+
import org.jetbrains.kotlinx.dataframe.impl.headPlusArray
20+
import org.jetbrains.kotlinx.dataframe.impl.headPlusIterable
21+
import org.jetbrains.kotlinx.dataframe.impl.schema.createEmptyDataFrame
22+
import org.jetbrains.kotlinx.dataframe.impl.schema.createEmptyDataFrameOf
23+
import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema
24+
import kotlin.reflect.KType
25+
26+
/**
27+
* Read-only interface for an ordered list of [columns][DataColumn].
28+
*
29+
* Columns in `DataFrame` have distinct non-empty [names][DataColumn.name] and equal [sizes][DataColumn.size].
30+
*
31+
* @param T Schema marker. It identifies the column schema and is used to generate schema-specific extension properties for typed data access. It is covariant, so `DataFrame<A>` is assignable to a variable of type `DataFrame<B>` if `A` is a subtype of `B`.
32+
*/
33+
@HasSchema(schemaArg = 0)
34+
public interface DataFrame<out T> :
35+
Aggregatable<T>,
36+
ColumnsContainer<T> {
37+
38+
public companion object {
39+
public val Empty: AnyFrame = DataFrameImpl<Unit>(emptyList(), 0)
40+
41+
public fun empty(nrow: Int = 0): AnyFrame = if (nrow == 0) Empty else DataFrameImpl<Unit>(emptyList(), nrow)
42+
43+
/**
44+
* Creates a DataFrame with empty columns (rows = 0).
45+
* Can be used as a "null object" in aggregation operations and in operations that work on columns (select, reorder, ...).
46+
*
47+
*/
48+
public inline fun <reified T> emptyOf(): DataFrame<T> = createEmptyDataFrameOf(T::class).cast()
49+
50+
/**
51+
* Creates a DataFrame with empty columns (rows = 0).
52+
* Can be used as a "null object" in aggregation operations and in operations that work on columns (select, reorder, ...).
53+
*/
54+
public fun empty(schema: DataFrameSchema): AnyFrame = schema.createEmptyDataFrame()
55+
}
56+
57+
// region columns
58+
59+
public fun columnNames(): List<String>
60+
61+
public fun columnTypes(): List<KType>
62+
63+
// endregion
64+
65+
// region rows
66+
67+
public fun rowsCount(): Int
68+
69+
public operator fun iterator(): Iterator<DataRow<T>> = rows().iterator()
70+
71+
// endregion
72+
73+
public fun <R> aggregate(body: AggregateGroupedBody<T, R>): DataRow<T>
74+
75+
// region get columns
76+
77+
/**
78+
* Returns a list of columns selected by [columns], a [ColumnsSelectionDsl].
79+
*
80+
* NOTE: This doesn't work in [ColumnsSelectionDsl]; use [ColumnsSelectionDsl.cols] to select columns by predicate.
81+
*/
82+
override fun <C> get(columns: ColumnsSelector<T, C>): List<DataColumn<C>> =
83+
getColumnsImpl(UnresolvedColumnsPolicy.Fail, columns)
84+
85+
// endregion
86+
87+
// region get rows
88+
89+
public operator fun get(index: Int): DataRow<T>
90+
91+
public operator fun get(indices: Iterable<Int>): DataFrame<T> = getRows(indices)
92+
93+
public operator fun get(range: IntRange): DataFrame<T> = getRows(range)
94+
95+
public operator fun get(first: IntRange, vararg ranges: IntRange): DataFrame<T> =
96+
getRows(headPlusArray(first, ranges).asSequence().flatMap { it.asSequence() }.asIterable())
97+
98+
public operator fun get(firstIndex: Int, vararg otherIndices: Int): DataFrame<T> =
99+
get(headPlusIterable(firstIndex, otherIndices.asIterable()))
100+
101+
// endregion
102+
103+
// region plus columns
104+
105+
public operator fun plus(col: AnyBaseCol): DataFrame<T> = add(col)
106+
107+
public operator fun plus(cols: Iterable<AnyBaseCol>): DataFrame<T> = (columns() + cols).toDataFrame().cast()
108+
109+
// endregion
110+
}
111+
112+
// region get columns
113+
114+
/**
115+
* Returns a list of columns selected by [columns], a [ColumnsSelectionDsl].
116+
*/
117+
public operator fun <T, C> DataFrame<T>.get(columns: ColumnsSelector<T, C>): List<DataColumn<C>> = this.get(columns)
118+
119+
public operator fun <T> DataFrame<T>.get(first: AnyColumnReference, vararg other: AnyColumnReference): DataFrame<T> =
120+
select { (listOf(first) + other).toColumnSet() }
121+
122+
public operator fun <T> DataFrame<T>.get(first: String, vararg other: String): DataFrame<T> =
123+
select { (listOf(first) + other).toColumnSet() }
124+
125+
public operator fun <T> DataFrame<T>.get(columnRange: ClosedRange<String>): DataFrame<T> =
126+
select { columnRange.start..columnRange.endInclusive }
127+
128+
// endregion
129+
130+
internal val ColumnsContainer<*>.ncol get() = columnsCount()
131+
internal val AnyFrame.nrow get() = rowsCount()
132+
internal val AnyFrame.indices get() = indices()
133+
internal val AnyFrame.size: DataFrameSize get() = size()
134+
135+
public fun AnyFrame.size(): DataFrameSize = DataFrameSize(ncol, nrow)

0 commit comments

Comments
 (0)