Description
I am using pyrasterframes v0.10.1 on Databricks 9.1 LTS ML (includes Apache Spark 3.1.2, Scala 2.12). I get the following error and cannot access the DataFrame.
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-1914649719011683> in <module>
----> 1 df.head()
/databricks/spark/python/pyspark/sql/dataframe.py in head(self, n)
1742 """
1743 if n is None:
-> 1744 rs = self.head(1)
1745 return rs[0] if rs else None
1746 return self.take(n)
/databricks/spark/python/pyspark/sql/dataframe.py in head(self, n)
1744 rs = self.head(1)
1745 return rs[0] if rs else None
-> 1746 return self.take(n)
1747
1748 def first(self):
/databricks/spark/python/pyspark/sql/dataframe.py in take(self, num)
767 [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
768 """
--> 769 return self.limit(num).collect()
770
771 def tail(self, num):
/databricks/spark/python/pyspark/sql/dataframe.py in collect(self)
713 # Default path used in OSS Spark / for non-DF-ACL clusters:
714 with SCCallSiteSync(self._sc) as css:
--> 715 sock_info = self._jdf.collectToPython()
716 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
717
/databricks/spark/python/lib/py4j-0.10.9.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
115 def deco(*a, **kw):
116 try:
--> 117 return f(*a, **kw)
118 except py4j.protocol.Py4JJavaError as e:
119 converted = convert_exception(e.java_exception)
/databricks/spark/python/lib/py4j-0.10.9.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o512.collectToPython.
: java.lang.NoClassDefFoundError: Could not initialize class org.locationtech.rasterframes.ref.RFRasterSource$
at org.locationtech.rasterframes.expressions.transformers.URIToRasterSource$.apply(URIToRasterSource.scala:62)
at org.locationtech.rasterframes.datasource.raster.RasterSourceRelation.$anonfun$buildScan$6(RasterSourceRelation.scala:114)
at scala.collection.TraversableLike$WithFilter.$anonfun$map$2(TraversableLike.scala:935)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at scala.collection.TraversableLike$WithFilter.map(TraversableLike.scala:934)
at org.locationtech.rasterframes.datasource.raster.RasterSourceRelation.buildScan(RasterSourceRelation.scala:113)
at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.apply(DataSourceStrategy.scala:458)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$2(QueryPlanner.scala:69)
at com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:80)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$1(QueryPlanner.scala:69)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:100)
at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:78)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$4(QueryPlanner.scala:85)
at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:196)
at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:194)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$3(QueryPlanner.scala:82)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:100)
at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:78)
at org.apache.spark.sql.execution.QueryExecution$.createSparkPlan(QueryExecution.scala:621)
at org.apache.spark.sql.execution.QueryExecution.$anonfun$sparkPlan$1(QueryExecution.scala:215)
at com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:80)
at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:268)
at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:265)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:968)
at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:265)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:215)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:208)
at org.apache.spark.sql.execution.QueryExecution.$anonfun$executedPlan$1(QueryExecution.scala:227)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:968)
at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:227)
at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:222)
at com.databricks.sql.transaction.tahoe.metering.DeltaMetering$.reportUsage(DeltaMetering.scala:136)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$8(SQLExecution.scala:303)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:386)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:186)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:968)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:141)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:336)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3949)
at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3737)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
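"Could not initialize class" in the NoClassDefFoundError means the static initializer of RFRasterSource$ has already thrown once (as an ExceptionInInitializerError); every later reference gets this terser message. A hedged diagnostic sketch, not part of the original run, assuming the spark session from the Code section below: on a freshly restarted cluster, forcing initialization before any raster action should surface the original error and its cause (often a missing GDAL native binding).

from py4j.protocol import Py4JJavaError

# Force static initialization of the failing class on a fresh JVM; the
# first failure carries the root cause, unlike later NoClassDefFoundErrors.
jvm = spark.sparkContext._jvm
try:
    jvm.java.lang.Class.forName("org.locationtech.rasterframes.ref.RFRasterSource$")
except Py4JJavaError as e:
    print(e.java_exception)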
Code
from pyrasterframes.utils import create_rf_spark_session
spark = create_rf_spark_session()
df = spark.read.raster('https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF')
# no error
df.printSchema()
# error
df.head()
# error
df.count()
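Worth noting: printSchema() succeeds because it only inspects the schema, while head() and count() are actions that trigger RasterSourceRelation.buildScan (visible in the stack trace), the first point at which RFRasterSource$ must be initialized. A minimal sketch of that distinction, assuming the session above:

# Schema-only calls never touch RFRasterSource, so they succeed.
df.printSchema()

# Any action plans the scan, initializing RFRasterSource on the driver
# and raising the Py4JJavaError shown above.
try:
    df.head()
except Exception as e:
    print(type(e).__name__)  # Py4JJavaError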
Prerequisite
I basically followed this guide:
https://www.databricks.com/notebooks/rasterframes-notebook.html
- Setup GDAL Init Script

The following script was created and registered as a cluster-scoped init script (the registration call is sketched below):

#!/bin/bash
sudo add-apt-repository ppa:ubuntugis/ppa
sudo apt-get update
sudo apt-get install -y cmake gdal-bin libgdal-dev python3-gdal
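For reference, the guide writes this script to DBFS from a notebook with dbutils.fs.put; a sketch, where the target path is my assumption for illustration, not taken from the guide:

# Write the init script to DBFS so it can be attached to the cluster as a
# cluster-scoped init script; the path below is assumed, not from the guide.
dbutils.fs.put("/databricks/scripts/gdal-init.sh", """#!/bin/bash
sudo add-apt-repository ppa:ubuntugis/ppa
sudo apt-get update
sudo apt-get install -y cmake gdal-bin libgdal-dev python3-gdal
""", True)  # True = overwrite if the file already exists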
- Install RasterFrames on the Cluster

Databricks can only use Spark 3 or higher, so the latest rasterframes release, v0.10.1, was used.
- Download https://github.com/locationtech/rasterframes/archive/refs/tags/0.10.1.zip
- Unzip and run sbt publishLocal to obtain the pyrasterframes .jar and .whl:
a. pyrasterframes-assembly-0.10.1.jar
b. pyrasterframes-0.10.1-py3-none-any.whl
Register these two files as libraries on the cluster.
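One thing I am unsure about: on Databricks a Spark session already exists, and the linked guide enables RasterFrames on that session rather than creating a new one with create_rf_spark_session. A sketch of the guide's pattern, assuming the two libraries above are attached:

# Enable RasterFrames on the existing Databricks session; pyrasterframes
# patches SparkSession with a withRasterFrames() method on import.
import pyrasterframes
from pyrasterframes.rasterfunctions import *

spark = spark.withRasterFrames()
df = spark.read.raster('https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF')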
Execution conditions
The following was confirmed in the Spark notebook.
- gdalinfo

!gdalinfo --version
GDAL 3.3.2, released 2021/09/01

- gdal_version, build_info
from pyrasterframes.utils import gdal_version, build_info
print(gdal_version())
print(build_info())

GDAL 3.3.2, released 2021/09/01
{'scalaVersion': '2.12.15', 'sbtVersion': '1.5.5', 'name': 'core', 'rfSparkVersion': '3.1.2', 'rfGeoMesaVersion': '3.2.0', 'GDAL': 'GDAL 3.3.2, released 2021/09/01', 'rfGeoTrellisVersion': '3.6.1', 'version': '0.10.1'}
- Java Version

!java -version
openjdk version "1.8.0_345"
OpenJDK Runtime Environment (Zulu 8.64.0.19-CA-linux64) (build 1.8.0_345-b01)
OpenJDK 64-Bit Server VM (Zulu 8.64.0.19-CA-linux64) (build 25.345-b01, mixed mode)
I would like to use pyrasterframes on Databricks, so I would appreciate it if you could investigate.