This section provides instructions on how to configure Apache Spark to use the Spark Dialect Extension, enabling custom handling of JDBC data types.

### Using onETL with PySpark

See [onETL documentation](https://onetl.readthedocs.io) for installation instructions.

```python
from pyspark.sql import SparkSession
from onetl.connection import Clickhouse

# Maven packages to be loaded by Spark
maven_packages = [
    "io.github.mtsongithub.doetl:spark-dialect-extension_2.12:0.0.1",
    *Clickhouse.get_packages(),
]

# Create Spark session
spark = (
    SparkSession.builder
    .appName("My Spark App")
    .config("spark.jars.packages", ",".join(maven_packages))
    .getOrCreate()
)

# Register custom Clickhouse dialect
ClickhouseDialectRegistry = spark._jvm.io.github.mtsongithub.doetl.sparkdialectextensions.clickhouse.ClickhouseDialectRegistry
ClickhouseDialectRegistry.register()


# use onETL to interact with Clickhouse
clickhouse = Clickhouse(
    host="my.clickhouse.hostname.or.ip",
    port=9000,
    user="someuser",
    password="******",
    spark=spark,
)

from onetl.db import DBReader, DBWriter

# onETL can now properly read some Clickhouse types
reader = DBReader(connection=clickhouse, source="mytable")
df = reader.run()

# onETL can now properly write some Clickhouse types
writer = DBWriter(connection=clickhouse, target="anothertable")
writer.run(df)
```
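
After `reader.run()` returns, you can check that the dialect's type mapping was applied by inspecting the schema of the resulting DataFrame. A minimal sketch reusing the `df` from the example above (column names depend on your table):

```python
# Print the Spark schema produced through the registered dialect
df.printSchema()

# Or inspect the mapping programmatically
for field in df.schema.fields:
    print(field.name, "->", field.dataType)
```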

### Using Spark on Scala

```scala
import org.apache.spark.sql.SparkSession

// Maven packages to be loaded by Spark
val maven_packages = Array(
  "io.github.mtsongithub.doetl:spark-dialect-extension_2.12:0.0.1",
  "com.clickhouse:clickhouse-jdbc:0.6.5",
  "com.clickhouse:clickhouse-http-client:0.6.5",
  "org.apache.httpcomponents.client5:httpclient5:5.3.1"
)

val spark = SparkSession.builder()
  .appName("My Spark App")
  .config("spark.jars.packages", maven_packages.mkString(","))
  .getOrCreate()

// Register custom Clickhouse dialect
import io.github.mtsongithub.doetl.sparkdialectextensions.clickhouse.ClickhouseDialectRegistry

ClickhouseDialectRegistry.register()

// now Spark can properly handle some Clickhouse types during read & write
// (the JDBC url, credentials and table names below are placeholders - replace with your own)
val jdbcOptions = Map(
  "url" -> "jdbc:clickhouse://my.clickhouse.hostname.or.ip:8123/default",
  "user" -> "someuser",
  "password" -> "******"
)

val df = spark.read.format("jdbc").options(jdbcOptions + ("dbtable" -> "mytable")).load()
df.write.format("jdbc").options(jdbcOptions + ("dbtable" -> "anothertable")).mode("append").save()
```

### Using Spark Submit

Start a Spark session with the required packages:

```bash
spark-submit --conf spark.jars.packages=io.github.mtsongithub.doetl:spark-dialect-extension_2.12:0.0.1,com.clickhouse:clickhouse-jdbc:0.6.5,com.clickhouse:clickhouse-http-client:0.6.5,org.apache.httpcomponents.client5:httpclient5:5.3.1 ...
```

Then register the custom dialect in the started session. The extension provides `<DBMS>DialectRegistry` classes which dynamically detect the Spark version and register the corresponding dialect.

For PySpark:
```python
# Register custom Clickhouse dialect
ClickhouseDialectRegistry = spark._jvm.io.github.mtsongithub.doetl.sparkdialectextensions.clickhouse.ClickhouseDialectRegistry
ClickhouseDialectRegistry.register()
```

For Scala:
```scala
// Register custom Clickhouse dialect
import io.github.mtsongithub.doetl.sparkdialectextensions.clickhouse.ClickhouseDialectRegistry

ClickhouseDialectRegistry.register()
```
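
Once the dialect is registered, plain Spark JDBC reads and writes pick up the custom type handling automatically. A minimal PySpark sketch, with placeholder connection options to replace with your own:

```python
# Example JDBC options - url, credentials and table names are placeholders
jdbc_options = {
    "url": "jdbc:clickhouse://my.clickhouse.hostname.or.ip:8123/default",
    "user": "someuser",
    "password": "******",
}

df = spark.read.format("jdbc").options(**jdbc_options, dbtable="mytable").load()
df.write.format("jdbc").options(**jdbc_options, dbtable="anothertable").mode("append").save()
```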
|