
Commit cff2f49

renamed data_bricks to databricks_ and also updated python package information
1 parent be7812a commit cff2f49

File tree

6 files changed, +9 -32 lines changed


docs/running/databricks.md (+1 -24)
@@ -3,29 +3,6 @@ title: Running on Databricks
 parent: Running Zingg on Cloud
 nav_order: 6
 ---
-There are several ways to run Zingg on Databricks. All [file formats and data sources and sinks](../dataSourcesAndSinks) are supported within Databricks.
+You can run Zingg on Databricks directly using the Databricks notebook interface. All [file formats and data sources and sinks](../dataSourcesAndSinks) are supported within Databricks.
 
-# Running directly within Databricks using the Databricks notebook interface
 This uses the Zingg Python API, and an [example notebook is available here](https://github.com/zinggAI/zingg/blob/main/examples/databricks/FebrlExample.ipynb).
-
-# Running using Databricks Connect from your local machine
-1. Configure Databricks Connect 11.3 and create the corresponding workspace/cluster as per the [Databricks docs](https://docs.databricks.com/dev-tools/databricks-connect-legacy.html). Please make sure that you run `databricks-connect configure`.
-
-Ensure to run databricks-connect configure.
-
-2. Set the env variable ZINGG_HOME to the path where the latest Zingg release jar is, e.g. the location of zingg-0.4.0.jar.
-
-3. Set the env variable DATA_BRICKS_CONNECT to Y.
-
-4. pip install zingg
-
-5. Now run Zingg using the shell script with the --run-databricks option; the Spark session will be created remotely on Databricks and the job will run in your Databricks environment,
-e.g. ./scripts/zingg.sh --run-databricks test/InMemPipeDataBricks.py
-
-Please refer to the [different options](https://docs.zingg.ai/zingg/stepbystep/zingg-command-line) available on the Zingg command line.
-
-
-# Running on Databricks using Spark Submit Jobs
-Zingg is run as a Spark Submit job along with a Python notebook-based labeler created specially to run within the Databricks cloud, since the cloud environment does not have the system console the labeler needs.
-
-Please refer to the [Databricks Zingg tutorial](https://medium.com/@sonalgoyal/identity-resolution-on-databricks-for-customer-360-591661bcafce) for a detailed tutorial.
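
As orientation for the notebook path this diff keeps, here is a minimal sketch of a Databricks notebook cell using the Zingg Python API, modeled loosely on the linked FebrlExample notebook. The field names, schema, model id, and paths are illustrative only, and exact pipe/argument signatures may vary across Zingg versions:

    from zingg.client import Arguments, ClientOptions, Zingg
    from zingg.pipes import CsvPipe, FieldDefinition, MatchType

    # Declare which attributes to compare and how (names are illustrative).
    args = Arguments()
    fname = FieldDefinition("fname", "string", MatchType.FUZZY)
    lname = FieldDefinition("lname", "string", MatchType.FUZZY)
    args.setFieldDefinition([fname, lname])

    # Where the model lives and how training samples are drawn.
    args.setModelId("100")
    args.setZinggDir("/tmp/zingg/models")
    args.setNumPartitions(4)
    args.setLabelDataSampleSize(0.5)

    # Input and output as CSV pipes; any supported source/sink works.
    schema = "id string, fname string, lname string"
    args.setData(CsvPipe("input", "/tmp/zingg/test.csv", schema))
    args.setOutput(CsvPipe("output", "/tmp/zingg/result"))

    # Run a single phase, e.g. findTrainingData / label / train / match.
    options = ClientOptions([ClientOptions.PHASE, "match"])
    Zingg(args, options).initAndExecute()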

python/PKG-INFO (+1 -1)
@@ -5,7 +5,7 @@ Summary: Zingg.ai Entity Resolution
 Home-page: www.zingg.ai
 Author: Zingg.AI
 Author-email: [email protected]
-License: UNKNOWN
+License: AGPL
 Description: ## About Zingg
 
 Zingg is an ML based entity resolution framework. The Python Package is used for building training data, training Zingg models and running the matching and linking processes.

python/README.md (+1 -1)
@@ -4,7 +4,7 @@ Zingg Python APIs for entity resolution, record linkage, data mastering and dedu
 [Zingg.AI](https://www.zingg.ai)
 
 # requirement
-python 3.6+; spark 3.1.2
+python 3.6+; spark 3.5.0
 
 # Installation
 
python/zingg/client.py (+4 -4)
@@ -46,8 +46,8 @@ def initClient():
     global _sqlContext
     global _spark
     if _spark_ctxt is None:
-        DATA_BRICKS_CONNECT = os.getenv('DATA_BRICKS_CONNECT')
-        if DATA_BRICKS_CONNECT=='Y' or DATA_BRICKS_CONNECT=='y':
+        DATABRICKS_CONNECT = os.getenv('DATABRICKS_CONNECT')
+        if DATABRICKS_CONNECT=='Y' or DATABRICKS_CONNECT=='y':
             return initDataBricksConectClient()
         else:
             return initSparkClient()

@@ -130,8 +130,8 @@ def execute(self):
     def initAndExecute(self):
         """ Method to run both init and execute methods consecutively """
         self.client.init()
-        DATA_BRICKS_CONNECT = os.getenv('DATA_BRICKS_CONNECT')
-        if DATA_BRICKS_CONNECT=='Y' or DATA_BRICKS_CONNECT=='y':
+        DATABRICKS_CONNECT = os.getenv('DATABRICKS_CONNECT')
+        if DATABRICKS_CONNECT=='Y' or DATABRICKS_CONNECT=='y':
             options = self.client.getOptions()
             inpPhase = options.get(ClientOptions.PHASE).getValue()
             if (inpPhase==ZinggOptions.LABEL.getValue()):
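
The same DATABRICKS_CONNECT check now appears in both initClient() and initAndExecute(). A possible follow-up, shown only as a sketch and not part of this commit, is to consolidate the lookup in one helper (the helper name is hypothetical):

    import os

    def _use_databricks_connect() -> bool:
        # Mirrors the check above: treat 'Y' or 'y' as enabled.
        return os.getenv('DATABRICKS_CONNECT', '').upper() == 'Y'

Callers would then set the flag before initializing the client, e.g. os.environ['DATABRICKS_CONNECT'] = 'Y' to route through Databricks Connect, or leave it unset for a local Spark session.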

scripts/zingg.sh (+1 -1)
@@ -59,7 +59,7 @@ fi
 if [[ $RUN_PYTHON_DB_CONNECT_PHASE -eq 1 ]]; then
     unset SPARK_MASTER
     unset SPARK_HOME
-    export DATA_BRICKS_CONNECT=Y
+    export DATABRICKS_CONNECT=Y
     python $EXECUTABLE
 else
     # All the additional options must be added here

test/testFebrl/testArgs.py (+1 -1)
@@ -62,7 +62,7 @@ def test_initClient_spark(self):
     def test_initClient_databricks(self):
        global _spark_ctxt
        _spark_ctxt = None
-        os.environ['DATA_BRICKS_CONNECT'] = 'Y'
+        os.environ['DATABRICKS_CONNECT'] = 'Y'
        result = initClient()
        self.assertEqual(result, 1)
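
One caveat worth noting, as a sketch rather than part of this diff: the test sets a process-wide environment variable, so clearing it between tests keeps test_initClient_spark independent of test ordering. Assuming the unittest-style class these methods belong to, a tearDown like the following would do (hypothetical, not in the repo):

    import os

    def tearDown(self):
        # Remove the flag so subsequent tests fall back to the
        # plain Spark client path in initClient().
        os.environ.pop('DATABRICKS_CONNECT', None)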

0 commit comments
