#!/usr/bin/env bash
# vim:ts=4:sts=4:sw=4:et
#
# Author: Hari Sekhon
# Date: 2015-11-05 23:29:15 +0000 (Thu, 05 Nov 2015)
#
# https://github.com/HariSekhon/DevOps-Python-tools
#
# License: see accompanying Hari Sekhon LICENSE file
#
# If you're using my code you're welcome to connect with me on LinkedIn and optionally send me feedback to help improve or steer this or other code I publish
#
# https://www.linkedin.com/in/HariSekhon
#
set -eu
[ -n "${DEBUG:-}" ] && set -x
srcdir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
cd "$srcdir"
# shellcheck disable=SC1091
. ./utils.sh
section "Spark JSON => Avro"
if is_inside_docker; then
    echo "detected running inside docker, skipping test..."
    # 'return' works if this script is sourced - when executed it errors instead (error suppressed), falling through to exit
    return 0 &>/dev/null || :
    exit 0
fi
# don't support Spark <= 1.3 due to difference in databricks avro dependency
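# versions to test can be overridden by passing them as arguments to this script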
export SPARK_VERSIONS="${*:-1.4.0 1.5.1 1.6.2}"
# requires upgrade to spark-avro 3.0.0
#export SPARK_VERSIONS="${*:-2.0.0}"
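# hedged sketch if/when enabling Spark 2.x - spark-avro 3.0.0 is built for Scala 2.11, so the package
# coordinate would be along the lines of com.databricks:spark-avro_2.11:3.0.0 (untested here)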
for SPARK_VERSION in $SPARK_VERSIONS; do
    dir="spark-$SPARK_VERSION-bin-hadoop2.6"
    tar="$dir.tgz"
    if ! [ -d "$dir" ]; then
        if ! [ -f "$tar" ]; then
            echo "fetching $tar"
            # NB: this old CloudFront mirror has since been retired - the Apache archive at
            # https://archive.apache.org/dist/spark/ still hosts these releases if the URL stops working
            # some systems don't have wget
            if type -P wget &>/dev/null; then
                wget "http://d3kbcqa49mib13.cloudfront.net/$tar"
            else
                curl -L "http://d3kbcqa49mib13.cloudfront.net/$tar" > "$tar"
            fi
        fi
        echo "untarring $tar"
        # clean up the tarball and any partial extraction on failure (-r needed since "$dir" is a directory)
        tar zxf "$tar" || rm -rf "$tar" "$dir"
    fi
    echo
    export SPARK_HOME="$dir"
    # this works for both Spark 1.3.1 and 1.4.0 but calling from within spark-csv-to-avro.py doesn't like it
    #spark-submit --packages com.databricks:spark-csv_2.10:1.3.0 ../spark-csv-to-avro.py -c data/test.csv -a "test-$dir.avro" --has-header $@ &&
    # resolved, was due to Spark 1.4+ requiring pyspark-shell for PYSPARK_SUBMIT_ARGS
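    # for reference, the fix was to ensure PYSPARK_SUBMIT_ARGS ends in 'pyspark-shell',
    # eg. (illustrative only, package version is an assumption):
    #
    #   export PYSPARK_SUBMIT_ARGS="--packages com.databricks:spark-avro_2.10:2.0.1 pyspark-shell"
    #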
    # remove any output from a previous run as Spark will refuse to write to an existing path by default
    rm -fr "test-$dir.avro"
    if ../spark_json_to_avro.py -j data/multirecord.json -a "test-$dir.avro"; then
        echo "SUCCEEDED with Spark $SPARK_VERSION"
    else
        echo "FAILED with Spark $SPARK_VERSION"
        exit 1
    fi
    #../spark_json_to_avro.py -j data/multirecord.json -a "test-$dir.avro" -s Year:String,Make,Model,Dimension.0.Length:float &&
    #    echo "SUCCEEDED with Spark $SPARK_VERSION" ||
    #    { echo "FAILED with Spark $SPARK_VERSION"; exit 1; }
    # TODO: test using downloaded avro tools getschema or extend validate_avro.py + validate_parquet.py to have schema type checking
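    # minimal sketch of the TODO above, assuming avro-tools has been downloaded locally (jar name/version
    # hypothetical) - dump the schema of one of the part files Spark writes inside the output directory:
    #
    #   java -jar avro-tools-1.8.2.jar getschema "$(ls "test-$dir.avro"/part-* | head -n 1)"
    #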
done
echo "SUCCESS"