-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathSetupTestTables.scala
39 lines (34 loc) · 1.13 KB
/
SetupTestTables.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/*
 * Setup Test Data
 *
 * Load the csv file of dates (dates.csv) from the data/ directory under the
 * current working directory and save it as a Parquet table in Hive; then do
 * the same for the salaries dataset (salaries.csv).
 *
 * Intended to be run in a spark-shell, which provides `spark`, `$` and
 * `to_date` in scope.
 */
import org.apache.spark.sql.types._

// Build "file://" URIs for the local CSV inputs (i.e. not HDFS).
// java.nio.file.Paths handles the platform separator, so no manual
// string concatenation with File.separator is needed; immutable vals
// replace the reused mutable `var fileName`.
val dataDir      = java.nio.file.Paths.get(System.getProperty("user.dir"), "data")
val datesCsv     = dataDir.resolve("dates.csv").toUri.toString
val salariesCsv  = dataDir.resolve("salaries.csv").toUri.toString

/*
 * Read dates.csv into a DataFrame with a single date column called 'date1'
 * and save it as a gzip-compressed Parquet table called 'foo'.
 */
( spark.read.csv(datesCsv)
       .withColumnRenamed("_c0", "date1")       // headerless file: Spark's default column name is _c0
       .withColumn("date1", to_date($"date1"))  // parse the string column into a DateType
       .write.format("parquet")
       .option("compression", "gzip")
       .mode("overwrite").saveAsTable("foo") )

/*
 * Read salaries.csv (which has a header row), cast the columns to proper
 * integer/date types, and save as a gzip-compressed Parquet table called
 * 'salaries_t'.
 */
( spark.read.option("header", "true")
       .csv(salariesCsv)
       .select($"emp_no".cast(IntegerType),
               $"salary".cast(IntegerType),
               $"from_date".cast(DateType),
               $"to_date".cast(DateType))
       .write.format("parquet")
       .option("compression", "gzip")
       .mode("overwrite").saveAsTable("salaries_t") )