This repository was archived by the owner on Oct 12, 2023. It is now read-only.

Commit 39d8fe2

Author: Zeqi Cui
Merge pull request #1 from Azure/initial-spark-sql-connector

Azure SQL DB / SQL Server Connector

2 parents 480f931 + 4e9b601; commit 39d8fe2

36 files changed: +2563, −10 lines

.gitignore

Lines changed: 4 additions & 0 deletions

```diff
@@ -1,2 +1,6 @@
 *.class
 *.log
+*.iml
+
+#IDE
+.idea/*
```

README.md

Lines changed: 142 additions & 10 deletions

Removed (the old README consisted only of the standard Microsoft contributing boilerplate; the Code of Conduct paragraph reappears in the "Contributing & Feedback" section below):

> # Contributing
>
> This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.microsoft.com.
>
> When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.
>
> This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [[email protected]](mailto:[email protected]) with any additional questions or comments.

Added:

# Spark connector for Azure SQL Databases and SQL Server

The Spark connector for [Azure SQL Database](https://azure.microsoft.com/en-us/services/sql-database/) and [SQL Server](https://www.microsoft.com/en-us/sql-server/default.aspx) enables SQL databases, including Azure SQL Databases and SQL Server, to act as an input data source or output data sink for Spark jobs. It allows you to use real-time transactional data in big data analytics and to persist results for ad hoc queries or reporting.

Compared to the built-in Spark connector, this connector provides the ability to bulk insert data into SQL databases; it can outperform row-by-row insertion by 10x to 20x. The Spark connector for Azure SQL Databases and SQL Server also supports AAD authentication, allowing you to connect securely to your Azure SQL databases from Azure Databricks with your AAD account. Because it provides interfaces similar to the built-in JDBC connector's, it is easy to migrate your existing Spark jobs to this new connector.

## How to connect to Spark using this library
This connector uses the Microsoft SQL Server JDBC driver to read data from and write data to Azure SQL Database or SQL Server. Read results are returned as a `DataFrame`.

All connection properties in [Microsoft JDBC Driver for SQL Server](https://docs.microsoft.com/en-us/sql/connect/jdbc/setting-the-connection-properties) are supported in this connector. Add connection properties as fields in the `com.microsoft.azure.sqldb.spark.config.Config` object.
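Since any supported JDBC driver property can be passed as a `Config` field, the AAD authentication mentioned above can be configured the same way (the `adal4j` dependency in this commit's pom supports that flow). The following is a minimal, illustrative sketch, assuming the `authentication` and `encrypt` property names from the Microsoft JDBC Driver documentation; the server, database, and account values are placeholders.

```scala
import com.microsoft.azure.sqldb.spark.config.Config
import com.microsoft.azure.sqldb.spark.connect._

// Sketch: AAD password authentication expressed purely as JDBC driver
// connection properties. Property names follow the Microsoft JDBC
// Driver docs; all values are placeholders.
val aadConfig = Config(Map(
  "url"            -> "mysqlserver.database.windows.net",
  "databaseName"   -> "MyDatabase",
  "dbTable"        -> "dbo.Clients",
  "user"           -> "username@contoso.onmicrosoft.com",
  "password"       -> "*********",
  "authentication" -> "ActiveDirectoryPassword",
  "encrypt"        -> "true"
))

val clients = sqlContext.read.sqlDB(aadConfig)
clients.show()
```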
### Reading from Azure SQL Database or SQL Server
```scala
import com.microsoft.azure.sqldb.spark.config.Config
import com.microsoft.azure.sqldb.spark.connect._

val config = Config(Map(
  "url"            -> "mysqlserver.database.windows.net",
  "databaseName"   -> "MyDatabase",
  "dbTable"        -> "dbo.Clients",
  "user"           -> "username",
  "password"       -> "*********",
  "connectTimeout" -> "5", // seconds
  "queryTimeout"   -> "5"  // seconds
))

val collection = sqlContext.read.sqlDB(config)
collection.show()
```
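These snippets assume a Spark 2.x shell or notebook, where `sqlContext` is predefined (it is also reachable as `spark.sqlContext` from a `SparkSession`).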

### Writing to Azure SQL Database or SQL Server
```scala
import com.microsoft.azure.sqldb.spark.config.Config
import com.microsoft.azure.sqldb.spark.connect._
import org.apache.spark.sql.SaveMode

// Acquire a DataFrame collection (val collection) in any way, e.g. from a read.

val config = Config(Map(
  "url"          -> "mysqlserver.database.windows.net",
  "databaseName" -> "MyDatabase",
  "dbTable"      -> "dbo.Clients",
  "user"         -> "username",
  "password"     -> "*********"
))

collection.write.mode(SaveMode.Append).sqlDB(config)
```
### Pushdown query to Azure SQL Database or SQL Server
For SELECT queries with expected return results, please use
[Reading from Azure SQL Database or SQL Server](#reading-from-azure-sql-database-or-sql-server) above.
```scala
import com.microsoft.azure.sqldb.spark.config.Config
import com.microsoft.azure.sqldb.spark.query._

val query =
  """
    |UPDATE Customers
    |SET ContactName = 'Alfred Schmidt', City = 'Frankfurt'
    |WHERE CustomerID = 1;
  """.stripMargin

val config = Config(Map(
  "url"          -> "mysqlserver.database.windows.net",
  "databaseName" -> "MyDatabase",
  "user"         -> "username",
  "password"     -> "*********",
  "queryCustom"  -> query
))

sqlContext.azurePushdownQuery(config)
```
### Bulk Copy to Azure SQL Database or SQL Server
```scala
import com.microsoft.azure.sqldb.spark.bulkcopy.BulkCopyMetadata
import com.microsoft.azure.sqldb.spark.config.Config
import com.microsoft.azure.sqldb.spark.connect._

/*
  Add column metadata. If not specified, metadata is automatically
  read from the destination table, which may hurt performance.
*/
var bulkCopyMetadata = new BulkCopyMetadata
bulkCopyMetadata.addColumnMetadata(1, "Title", java.sql.Types.NVARCHAR, 128, 0)
bulkCopyMetadata.addColumnMetadata(2, "FirstName", java.sql.Types.NVARCHAR, 50, 0)
bulkCopyMetadata.addColumnMetadata(3, "LastName", java.sql.Types.NVARCHAR, 50, 0)

val bulkCopyConfig = Config(Map(
  "url"               -> "mysqlserver.database.windows.net",
  "databaseName"      -> "MyDatabase",
  "user"              -> "username",
  "password"          -> "*********",
  "dbTable"           -> "dbo.Clients",
  "bulkCopyBatchSize" -> "2500",
  "bulkCopyTableLock" -> "true",
  "bulkCopyTimeout"   -> "600"
))

df.bulkCopyToSqlDB(bulkCopyConfig, bulkCopyMetadata)
// df.bulkCopyToSqlDB(bulkCopyConfig) if no metadata is specified.
```
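A note on the bulk copy settings above: `bulkCopyBatchSize` controls how many rows are sent per batch, while `bulkCopyTableLock` requests a table-level lock for the duration of the copy, which SQL Server can exploit for faster, minimally logged inserts at the cost of blocking concurrent writers. `bulkCopyTimeout` is in seconds.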

## Requirements
Officially supported versions:

| Component | Versions Supported |
| --------- | ------------------ |
| Apache Spark | 2.0.2 or later |
| Scala | 2.10 or later |
| Microsoft JDBC Driver for SQL Server | 6.2 or later |
| Microsoft SQL Server | SQL Server 2008 or later |
| Azure SQL Databases | Supported |
## Download
### Download from Maven
*TBD*

### Build this project
Currently, the connector project uses Maven. To build the connector from source, run:
```sh
mvn clean package
```
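Given the `maven-assembly-plugin` configuration in this commit's pom.xml (shown below), `mvn clean package` should also produce an uber jar, `target/azure-sqldb-spark-1.0.0-jar-with-dependencies.jar`, alongside the plain `target/azure-sqldb-spark-1.0.0.jar` (names assume the artifactId and version in the pom). Either jar can then be attached to a Spark session, for example with `spark-shell --jars target/azure-sqldb-spark-1.0.0-jar-with-dependencies.jar`.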

## Contributing & Feedback

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [[email protected]](mailto:[email protected]) with any additional questions or comments.

To give feedback and/or report an issue, open a [GitHub Issue](https://help.github.com/articles/creating-an-issue/).

*Apache®, Apache Spark, and Spark® are either registered trademarks or trademarks of the Apache Software Foundation in the United States and/or other countries.*

docs/images/spark_sqldb_dataflow.png

434 KB

lib/mssql-jdbc-6.2.2.jre8.jar

806 KB
Binary file not shown.

pom.xml

Lines changed: 138 additions & 0 deletions

```xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.microsoft.azure.sqldb.spark</groupId>
    <artifactId>azure-sqldb-spark</artifactId>
    <version>1.0.0</version>

    <licenses>
        <license>
            <name>MIT License</name>
            <url>http://www.opensource.org/licenses/mit-license.php</url>
        </license>
    </licenses>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.8.1</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.2.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.2.1</version>
        </dependency>
        <dependency>
            <groupId>org.scalactic</groupId>
            <artifactId>scalactic_2.11</artifactId>
            <version>3.0.4</version>
        </dependency>
        <dependency>
            <groupId>org.scalatest</groupId>
            <artifactId>scalatest_2.11</artifactId>
            <version>3.0.4</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>com.microsoft.azure</groupId>
            <artifactId>adal4j</artifactId>
            <version>1.2.0</version>
        </dependency>
    </dependencies>
    <developers>
        <developer>
            <name>Azure SQL DB Devs</name>
            <organization>Microsoft</organization>
            <organizationUrl>http://www.microsoft.com/</organizationUrl>
        </developer>
    </developers>
    <build>
        <plugins>
            <plugin>
                <groupId>org.scalastyle</groupId>
                <artifactId>scalastyle-maven-plugin</artifactId>
                <version>1.0.0</version>
                <configuration>
                    <verbose>false</verbose>
                    <failOnViolation>true</failOnViolation>
                    <includeTestSourceDirectory>true</includeTestSourceDirectory>
                    <failOnWarning>false</failOnWarning>
                    <sourceDirectory>${project.basedir}/src/main/scala</sourceDirectory>
                    <testSourceDirectory>${project.basedir}/src/test/scala</testSourceDirectory>
                    <configLocation>${project.basedir}/lib/scalastyle_config.xml</configLocation>
                    <outputFile>${project.basedir}/scalastyle-output.xml</outputFile>
                    <outputEncoding>UTF-8</outputEncoding>
                </configuration>
                <executions>
                    <execution>
                        <goals>
                            <goal>check</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.0.0</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>assemble-all</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.7</version>
                <configuration>
                    <skipTests>true</skipTests>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.scalatest</groupId>
                <artifactId>scalatest-maven-plugin</artifactId>
                <version>1.0</version>
                <configuration>
                    <reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
                    <junitxml>.</junitxml>
                    <filereports>WDF TestSuite.txt</filereports>
                </configuration>
                <executions>
                    <execution>
                        <id>test</id>
                        <goals>
                            <goal>test</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
```

samples/notebooks/Spark Connector for Azure SQL Databases and SQL Server.html

Lines changed: 42 additions & 0 deletions
Large diffs are not rendered by default.

samples/scripts/BulkCopySample.scala

Lines changed: 43 additions & 0 deletions

```scala
// Import libraries
import com.microsoft.azure.sqldb.spark.bulkcopy.BulkCopyMetadata
import com.microsoft.azure.sqldb.spark.config.Config
import com.microsoft.azure.sqldb.spark.connect._

val url = "[Enter your url here]"
val databaseName = "[Enter your database name here]"
val dbTable = "[Enter your database table here]"

val user = "[Enter your username here]"
val password = "[Enter your password here]"

// Acquire data to be written.
// df could be acquired in any way.
val localTable = "[Enter your local persisted table here]"
val df = spark.sql(s"SELECT * FROM $localTable")

val writeConfig = Config(Map(
  "url"               -> url,
  "databaseName"      -> databaseName,
  "dbTable"           -> dbTable,
  "user"              -> user,
  "password"          -> password,
  "connectTimeout"    -> "5",
  "bulkCopyBatchSize" -> "100000",
  "bulkCopyTableLock" -> "true",
  "bulkCopyTimeout"   -> "600"
))

df.bulkCopyToSqlDB(writeConfig)

/*
  For better performance, specify the column metadata of the table:

  var bulkCopyMetadata = new BulkCopyMetadata
  bulkCopyMetadata.addColumnMetadata(1, "Title", java.sql.Types.NVARCHAR, 128, 0)
  bulkCopyMetadata.addColumnMetadata(2, "FirstName", java.sql.Types.NVARCHAR, 128, 0)
  bulkCopyMetadata.addColumnMetadata(3, "MiddleName", java.sql.Types.NVARCHAR, 128, 0)
  bulkCopyMetadata.addColumnMetadata(4, "LastName", java.sql.Types.NVARCHAR, 128, 0)
  ..........

  df.bulkCopyToSqlDB(writeConfig, bulkCopyMetadata)
*/
```

samples/scripts/ReadSample.scala

Lines changed: 40 additions & 0 deletions

```scala
// Import libraries
import com.microsoft.azure.sqldb.spark.config.Config
import com.microsoft.azure.sqldb.spark.connect._

val url = "[Enter your url here]"
val databaseName = "[Enter your database name here]"
val dbTable = "[Enter your database table here]"

val user = "[Enter your username here]"
val password = "[Enter your password here]"

// READ FROM CONFIG
val readConfig = Config(Map(
  "url"            -> url,
  "databaseName"   -> databaseName,
  "user"           -> user,
  "password"       -> password,
  "connectTimeout" -> "5",
  "queryTimeout"   -> "5",
  "dbTable"        -> dbTable
))

val df = sqlContext.read.sqlDB(readConfig)
println("Total rows: " + df.count)
df.show()

// TRADITIONAL SYNTAX
import java.util.Properties

val properties = new Properties()
properties.put("databaseName", databaseName)
properties.put("user", user)
properties.put("password", password)
properties.put("connectTimeout", "5")
properties.put("queryTimeout", "5")

// Named df2 so the script also compiles outside a REPL,
// where `val df` cannot be redefined.
val df2 = sqlContext.read.sqlDB(url, dbTable, properties)
println("Total rows: " + df2.count)
df2.show()
```

0 commit comments