Skip to content

Commit 8b8f706

Browse files
committed
Add support for reading parquet file thanks to arrow-dataset #576
1 parent 7de6022 commit 8b8f706

File tree

7 files changed

+73
-7
lines changed

7 files changed

+73
-7
lines changed

dataframe-arrow/build.gradle.kts

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ dependencies {
1515
implementation(libs.arrow.vector)
1616
implementation(libs.arrow.format)
1717
implementation(libs.arrow.memory)
18+
implementation(libs.arrow.dataset)
1819
implementation(libs.commonsCompress)
1920
implementation(libs.kotlin.reflect)
2021
implementation(libs.kotlin.datetimeJvm)

dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt

+9
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package org.jetbrains.kotlinx.dataframe.io
22

3+
import org.apache.arrow.dataset.file.FileFormat
34
import org.apache.arrow.memory.RootAllocator
45
import org.apache.arrow.vector.ipc.ArrowReader
56
import org.apache.commons.compress.utils.SeekableInMemoryByteChannel
@@ -186,3 +187,11 @@ public fun DataFrame.Companion.readArrow(
186187
public fun ArrowReader.toDataFrame(
187188
nullability: NullabilityOptions = NullabilityOptions.Infer
188189
): AnyFrame = DataFrame.Companion.readArrowImpl(this, nullability)
190+
191+
/**
192+
* Reads [Parquet](https://parquet.apache.org/) data from the given [url] using [Arrow Dataset](https://arrow.apache.org/docs/java/dataset.html).
193+
*/
194+
public fun DataFrame.Companion.readParquet(
195+
url: URL,
196+
nullability: NullabilityOptions = NullabilityOptions.Infer
197+
): AnyFrame = readArrowDataset(url.toString(), FileFormat.PARQUET, nullability)

dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReadingImpl.kt

+29
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
package org.jetbrains.kotlinx.dataframe.io
22

3+
import org.apache.arrow.dataset.file.FileFormat
4+
import org.apache.arrow.dataset.file.FileSystemDatasetFactory
5+
import org.apache.arrow.dataset.jni.DirectReservationListener
6+
import org.apache.arrow.dataset.jni.NativeMemoryPool
7+
import org.apache.arrow.dataset.scanner.ScanOptions
38
import org.apache.arrow.memory.RootAllocator
49
import org.apache.arrow.vector.BigIntVector
510
import org.apache.arrow.vector.BitVector
@@ -330,3 +335,27 @@ internal fun DataFrame.Companion.readArrowImpl(
330335
return flattened.concatKeepingSchema()
331336
}
332337
}
338+
339+
internal fun DataFrame.Companion.readArrowDataset(
340+
fileUri: String,
341+
fileFormat: FileFormat,
342+
nullability: NullabilityOptions = NullabilityOptions.Infer,
343+
): AnyFrame {
344+
val scanOptions = ScanOptions(32768)
345+
RootAllocator().use { allocator ->
346+
FileSystemDatasetFactory(
347+
allocator,
348+
NativeMemoryPool.createListenable(DirectReservationListener.instance()),
349+
fileFormat,
350+
fileUri
351+
).use { datasetFactory ->
352+
datasetFactory.finish().use { dataset ->
353+
dataset.newScan(scanOptions).use { scanner ->
354+
scanner.scanBatches().use { reader ->
355+
return readArrow(reader, nullability)
356+
}
357+
}
358+
}
359+
}
360+
}
361+
}

dataframe-arrow/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ArrowKtTest.kt

+13-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ import org.jetbrains.kotlinx.dataframe.api.columnOf
3333
import org.jetbrains.kotlinx.dataframe.api.convertToBoolean
3434
import org.jetbrains.kotlinx.dataframe.api.copy
3535
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
36-
import org.jetbrains.kotlinx.dataframe.api.describe
3736
import org.jetbrains.kotlinx.dataframe.api.map
3837
import org.jetbrains.kotlinx.dataframe.api.pathOf
3938
import org.jetbrains.kotlinx.dataframe.api.remove
@@ -613,4 +612,17 @@ internal class ArrowKtTest {
613612
DataFrame.readArrow(dbArrowReader) shouldBe expected
614613
}
615614
}
615+
616+
@Test
617+
fun testReadParquet(){
618+
val path = testResource("test.arrow.parquet").path
619+
val dataFrame = DataFrame.readParquet(URL("file:$path"))
620+
dataFrame.rowsCount() shouldBe 300
621+
assertEstimations(
622+
exampleFrame = dataFrame,
623+
expectedNullable = false,
624+
hasNulls = false,
625+
fromParquet = true
626+
)
627+
}
616628
}

dataframe-arrow/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/exampleEstimatesAssertions.kt

+19-5
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,12 @@ import kotlin.reflect.typeOf
1818
* Assert that we have got the same data that was originally saved on example creation.
1919
* Example generation project is currently located at https://github.com/Kopilov/arrow_example
2020
*/
21-
internal fun assertEstimations(exampleFrame: AnyFrame, expectedNullable: Boolean, hasNulls: Boolean) {
21+
internal fun assertEstimations(
22+
exampleFrame: AnyFrame,
23+
expectedNullable: Boolean,
24+
hasNulls: Boolean,
25+
fromParquet: Boolean = false
26+
) {
2227
/**
2328
* In [exampleFrame] we get two concatenated batches. To assert the estimations, we should transform frame row number to batch row number
2429
*/
@@ -129,10 +134,19 @@ internal fun assertEstimations(exampleFrame: AnyFrame, expectedNullable: Boolean
129134
assertValueOrNull(iBatch(i), element, LocalDate.ofEpochDay(iBatch(i).toLong() * 30))
130135
}
131136

132-
val datetimeCol = exampleFrame["date64"] as DataColumn<LocalDateTime?>
133-
datetimeCol.type() shouldBe typeOf<LocalDateTime>().withNullability(expectedNullable)
134-
datetimeCol.forEachIndexed { i, element ->
135-
assertValueOrNull(iBatch(i), element, LocalDateTime.ofEpochSecond(iBatch(i).toLong() * 60 * 60 * 24 * 30, 0, ZoneOffset.UTC))
137+
if (fromParquet){
138+
// The Parquet format has only one date type, which carries no time of day, so the "date64" column is read back as LocalDate: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#date
139+
val datetimeCol = exampleFrame["date64"] as DataColumn<LocalDate?>
140+
datetimeCol.type() shouldBe typeOf<LocalDate>().withNullability(expectedNullable)
141+
datetimeCol.forEachIndexed { i, element ->
142+
assertValueOrNull(iBatch(i), element, LocalDate.ofEpochDay(iBatch(i).toLong() * 30))
143+
}
144+
}else {
145+
val datetimeCol = exampleFrame["date64"] as DataColumn<LocalDateTime?>
146+
datetimeCol.type() shouldBe typeOf<LocalDateTime>().withNullability(expectedNullable)
147+
datetimeCol.forEachIndexed { i, element ->
148+
assertValueOrNull(iBatch(i), element, LocalDateTime.ofEpochSecond(iBatch(i).toLong() * 60 * 60 * 24 * 30, 0, ZoneOffset.UTC))
149+
}
136150
}
137151

138152
val timeSecCol = exampleFrame["time32_seconds"] as DataColumn<LocalTime?>
Binary file not shown.

gradle/libs.versions.toml

+2-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ junit-platform = "1.10.2"
4747
kotestAsserions = "5.5.4"
4848

4949
jsoup = "1.17.2"
50-
arrow = "15.0.0"
50+
arrow = "16.0.0"
5151
docProcessor = "0.3.5"
5252
simpleGit = "2.0.3"
5353
dependencyVersions = "0.51.0"
@@ -98,6 +98,7 @@ jsoup = { group = "org.jsoup", name = "jsoup", version.ref = "jsoup" }
9898
arrow-format = { group = "org.apache.arrow", name = "arrow-format", version.ref = "arrow" }
9999
arrow-vector = { group = "org.apache.arrow", name = "arrow-vector", version.ref = "arrow" }
100100
arrow-memory = { group = "org.apache.arrow", name = "arrow-memory-unsafe", version.ref = "arrow" }
101+
arrow-dataset = { group = "org.apache.arrow", name = "arrow-dataset", version.ref = "arrow" }
101102
arrow-c-data = { group = "org.apache.arrow", name = "arrow-c-data", version.ref = "arrow" }
102103

103104

0 commit comments

Comments
 (0)