Commit ea9d1a4

Updated Edits
1 parent 39c83b6 commit ea9d1a4

37 files changed: +386 -2042 lines changed

Diff for: spark_streaming_basics/.ipynb_checkpoints/04_Basics of Transformations Exercise-checkpoint.ipynb

+1 -2

@@ -50,8 +50,7 @@
 "source": [
 "lines = sc.textFile(\"greetings.txt\")\n",
 "\n",
-"# TODO: Use any of the functions above to create a script that generates a Wordcount of the file greetings.txt.\n",
-"sorted(lines.flatMap(lambda line: line.split()).map(lambda w: (w,1)).reduceByKey(lambda v1, v2: v1+v2).collect())"
+"# TODO: Use any of the functions above to create a script that generates a Wordcount of the file greetings.txt.\n"
 ]
 },
 {

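For reference, a minimal standalone sketch of the word count the TODO above asks for, in the spirit of the solution line this hunk removes (it assumes a local greetings.txt and creates its own SparkContext, whereas the notebook already provides `sc`):

```python
from pyspark import SparkContext

# Standalone context; in the course notebook `sc` already exists.
sc = SparkContext("local[*]", "WordCountSketch")

lines = sc.textFile("greetings.txt")

# Split lines into words, pair each word with 1, then sum counts per word.
word_counts = (lines.flatMap(lambda line: line.split())
                    .map(lambda w: (w, 1))
                    .reduceByKey(lambda a, b: a + b))

print(sorted(word_counts.collect()))
```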
Diff for: spark_streaming_basics/.ipynb_checkpoints/06_Transformation Operation Exercise-checkpoint.ipynb

+1 -2

@@ -62,8 +62,7 @@
 "rdd2 = rdd1.map(lambda x:(x[1], x[0]))\n",
 "\n",
 "##### TODO: Creat a `newRdd` variable with the elements from RDD2 that have the same second value of RDD1\n",
-"newRdd = rdd2.transform(lambda rdd: rdd.join(rdd.map(lambda x:(x[0],(x[1],x[2])))))\n",
-"newRdd.map(lambda x:(x[1][0], x[0], x[1][1][0], x[1][1][1])).coalesce(1).collect()\n"
+"\n"
 ]
 },
 {

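The removed answer above exercises `transform()`; the full solution survives in the renamed Solution notebook later in this commit. As a rough sketch of the general DStream.transform pattern only (an arbitrary RDD-to-RDD function applied to every batch, here a join against a static lookup RDD fed by a hypothetical in-memory queue stream, not the notebook's exact answer):

```python
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "TransformSketch")
ssc = StreamingContext(sc, 1)  # 1-second batches

# A static lookup RDD and a toy (K, V) DStream built from an in-memory queue.
lookup = sc.parallelize([(u'2', u'b'), (u'3', u'c')])
batches = [sc.parallelize([(u'2', u'1'), (u'3', u'1')])]
pairs = ssc.queueStream(batches)

# transform() applies an arbitrary RDD-to-RDD function to every batch;
# here each batch is joined with the static lookup RDD.
joined = pairs.transform(lambda rdd: rdd.join(lookup))
joined.pprint()

ssc.start()
ssc.awaitTermination(5)
ssc.stop()
```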
Diff for: spark_streaming_basics/.ipynb_checkpoints/08_Window Operations Exercise-checkpoint.ipynb

+3 -5

@@ -71,11 +71,9 @@
 "ip_bytes_request_count_dstream = ip_count.join(ip_bytes_sum_dstream)\n",
 "ip_bytes_request_count_dstream.pprint(num = 30)\n",
 "\n",
-"####### TODO: use window()to count data over a window##########################\n",
-"access_logs_window = access_log_dstream.window(windowDuration = 6, slideDuration=4) \n",
-"window_counts = access_logs_window.count()\n",
-"print( \" Window count: \")\n",
-"window_counts.pprint()\n",
+"####### TODO: use window()to count data over a window ##########################\n",
+"\n",
+"\n",
 "\n",
 "####### Exercise End ##########################################################\n",
 "\n",

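For context, a minimal sketch of the `window()` call the TODO asks for, mirroring the removed lines but against an assumed socket source on localhost:9999 (for example `nc -lk 9999`) instead of the Apache log DStream:

```python
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "WindowSketch")
ssc = StreamingContext(sc, 2)  # 2-second batches, as in the notebook

# Assumed text source on localhost:9999.
lines = ssc.socketTextStream("localhost", 9999)

# Collect the last 6 seconds of data, recomputed every 4 seconds.
# Both durations must be multiples of the batch interval.
windowed = lines.window(windowDuration=6, slideDuration=4)
windowed.count().pprint()

ssc.start()
ssc.awaitTermination()
```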
Diff for: spark_streaming_basics/.ipynb_checkpoints/10_countByWindow transformation Exercise-checkpoint.ipynb

+1 -2

@@ -82,8 +82,7 @@
 "\n",
 "####### TODO: Windowed count operation using countByWindow() ###########\n",
 "\n",
-"request_count = access_log_dstream.countByWindow(windowDuration = 6, slideDuration=4)\n",
-"request_count.pprint()\n",
+"\n",
 "\n",
 "####### Exercise End ##########################################################\n",
 "\n",

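A minimal sketch of `countByWindow()` in the same spirit as the removed solution, again with an assumed socket source; countByWindow needs checkpointing, which the notebook already enables:

```python
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "CountByWindowSketch")
ssc = StreamingContext(sc, 2)
ssc.checkpoint("checkpoint")  # required by countByWindow

# Assumed text source on localhost:9999 (for example `nc -lk 9999`).
lines = ssc.socketTextStream("localhost", 9999)

# One running count of all records seen in the last 6 seconds, every 4 seconds.
lines.countByWindow(windowDuration=6, slideDuration=4).pprint()

ssc.start()
ssc.awaitTermination()
```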
Diff for: spark_streaming_basics/.ipynb_checkpoints/12_reduceByKeyAndWindow transformation Exercise-checkpoint.ipynb

-2

@@ -82,8 +82,6 @@
 "\n",
 "####### TODO: use reduceByKeyAndWindow() to get Ip counts per window ###########\n",
 "\n",
-"ip_count_dstream = ip_dstream.reduceByKeyAndWindow(func = lambda x,y: x+y, invFunc = lambda x,y: x-y, windowDuration = 6, slideDuration=4)\n",
-"ip_count_dstream.pprint(num=30)\n",
 "\n",
 "####### Exercise End ##########################################################\n",
 "\n",

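A minimal sketch of `reduceByKeyAndWindow()` with the same parameters as the removed solution, assuming a socket source that emits one IP address per line; the inverse function makes the windowed count incremental and requires checkpointing:

```python
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "ReduceByKeyAndWindowSketch")
ssc = StreamingContext(sc, 2)
ssc.checkpoint("checkpoint")  # required when an inverse function is supplied

# Assumed text source on localhost:9999 emitting one IP address per line.
ips = ssc.socketTextStream("localhost", 9999).map(lambda ip: (ip, 1))

# Incremental windowed count: add counts entering the window, subtract those leaving it.
ip_counts = ips.reduceByKeyAndWindow(func=lambda x, y: x + y,
                                     invFunc=lambda x, y: x - y,
                                     windowDuration=6, slideDuration=4)
ip_counts.pprint(num=30)

ssc.start()
ssc.awaitTermination()
```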
Diff for: spark_streaming_basics/.ipynb_checkpoints/14_countByValueAndWindow Transformation Exercise-checkpoint.ipynb

+1 -3

@@ -70,9 +70,7 @@
 "ip_bytes_request_count_dstream.pprint(num = 30)\n",
 "\n",
 "####### TODO: Windowed count operation using countByValueAndWindow() ###########\n",
-"ip_dstream = access_log_dstream.map(lambda entry: entry.ip)\n",
-"ip_address_request_count = ip_dstream.countByValueAndWindow(windowDuration = 6, slideDuration=4)\n",
-"ip_address_request_count.pprint()\n",
+"\n",
 "\n",
 "####### Exercise End ##########################################################\n",
 "\n",
Diff for: (new file)

@@ -0,0 +1,109 @@
+{
+"cells": [
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"# foreachRDD Exercise"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Explain foreachRDD and the basic usage about foreachRDD"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"### Exercise"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": true
+},
+"outputs": [],
+"source": [
+"import pyspark\n",
+"import pyspark.streaming\n",
+"from pyspark.streaming import SparkContext\n",
+"from pyspark.streaming import StreamingContext\n",
+"import utils\n",
+"import twitter_app\n",
+"\n",
+"\n",
+"twitter_app()\n",
+"\n",
+"ssc = StreamingContext(\"local[*]\", \"SaveTweets\", Seconds(1))\n",
+"\n",
+"tweets = TwitterUtils.createStream(ssc, None)\n",
+" \n",
+"# Now extract the text of each status update into RDD's using map()\n",
+"statuses = tweets.map(lambda status: status.getText())\n",
+"\n",
+"totalTweets = int(0)\n",
+" \n",
+"def twitterStatus(rdd, time):\n",
+" \n",
+" if rdd.count() > 0:\n",
+" \n",
+" repartitionedRDD = rdd.repartition(1).cache()\n",
+" repartitionedRDD.saveAsTextFile(\"Tweets_\" + time.milliseconds.toString)\n",
+" \n",
+" totalTweets += repartitionedRDD.count()\n",
+" print(\"Tweet count: \" + totalTweets)\n",
+" if totalTweets > 1000:\n",
+" sys.exit(0)\n",
+"\n",
+"# TODO: use ForeachRDD to process the 'twitterStatus()' function\n",
+"\n",
+"\n",
+"###########################\n",
+" \n",
+"ssc.start()\n",
+"ssc.awaitTermination()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## References\n",
+"1. https://spark.apache.org/docs/latest/streaming-programming-guide.html#design-patterns-for-using-foreachrdd"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+" "
+]
+}
+],
+"metadata": {
+"kernelspec": {
+"display_name": "Python 3",
+"language": "python",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.6.3"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 2
+}

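The TODO in this new notebook asks for `foreachRDD`, but the surrounding cell is Scala-flavoured pseudocode (TwitterUtils, `time.milliseconds.toString`). A hedged, PySpark-only sketch of the same pattern, saving each non-empty batch and keeping a running count, with an assumed socket source standing in for the Twitter stream:

```python
import time

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "ForeachRDDSketch")
ssc = StreamingContext(sc, 1)

# Assumed text source on localhost:9999; the notebook's Twitter source is not reproduced here.
statuses = ssc.socketTextStream("localhost", 9999)

total = {"records": 0}  # mutable holder so the driver-side callback can update it

def save_batch(rdd):
    # foreachRDD runs this on the driver for every batch; skip empty batches.
    if not rdd.isEmpty():
        repartitioned = rdd.repartition(1).cache()
        repartitioned.saveAsTextFile("Tweets_{}".format(int(time.time() * 1000)))
        total["records"] += repartitioned.count()
        print("Record count so far: {}".format(total["records"]))

# PySpark also accepts a two-argument (time, rdd) callback if the batch time is needed.
statuses.foreachRDD(save_batch)

ssc.start()
ssc.awaitTermination()
```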
Diff for: spark_streaming_basics/.ipynb_checkpoints/19_SQL Operations Exercise-checkpoint.ipynb

+4 -6

@@ -71,18 +71,16 @@
 "\n",
 " try:\n",
 " # TODO: Get the singleton instance of SparkSession\n",
-" spark = getSparkSessionInstance(rdd.context.getConf())\n",
+" \n",
 "\n",
 " # TODO: Convert RDD[String] to RDD[Row] to DataFrame\n",
-" rowRdd = rdd.map(lambda w: Row(word=w))\n",
-" wordsDataFrame = spark.createDataFrame(rowRdd)\n",
 "\n",
 " # TODO: Creates a temporary view using the DataFrame.\n",
-" wordsDataFrame.createOrReplaceTempView(\"words\")\n",
+" \n",
 "\n",
 " # TODO: Do word count on table using SQL and print it\n",
-" wordCountsDataFrame = spark.sql(\"select word, count(*) as total from words group by word\")\n",
-" wordCountsDataFrame.show()\n",
+" \n",
+" \n",
 " except:\n",
 " pass\n",
 "\n",

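A minimal sketch of the SQL-on-streams pattern whose solution lines this hunk blanks out, using `SparkSession.builder` directly in place of the notebook's `getSparkSessionInstance()` helper and an assumed socket source:

```python
from pyspark import SparkContext
from pyspark.sql import Row, SparkSession
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "SqlOnStreamSketch")
ssc = StreamingContext(sc, 2)

# Assumed text source on localhost:9999 (for example `nc -lk 9999`).
words = ssc.socketTextStream("localhost", 9999).flatMap(lambda line: line.split(" "))

def process(time, rdd):
    try:
        # Singleton-style SparkSession, standing in for getSparkSessionInstance().
        spark = SparkSession.builder.config(conf=rdd.context.getConf()).getOrCreate()

        # RDD[str] -> RDD[Row] -> DataFrame
        words_df = spark.createDataFrame(rdd.map(lambda w: Row(word=w)))

        # Register a temporary view and run SQL over this batch.
        words_df.createOrReplaceTempView("words")
        spark.sql("select word, count(*) as total from words group by word").show()
    except Exception:
        pass

words.foreachRDD(process)

ssc.start()
ssc.awaitTermination()
```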
Diff for: spark_streaming_basics/Basics of Transformations Exercise.ipynb renamed to spark_streaming_basics/04_Basics of Transformations Exercise - Solution.ipynb

+3 -2

@@ -34,8 +34,9 @@
 "| **reduceByKey**(func, [numTasks])\t| When called on a DStream of (K, V) pairs, return a new DStream of (K, V) pairs where the values for each key are aggregated using the given reduce function. Note: By default, this uses Spark's default number of parallel tasks (2 for local mode, and in cluster mode the number is determined by the config property spark.default.parallelism) to do the grouping. You can pass an optional numTasks argument to set a different number of tasks.\n",
 "| **join**(otherStream, [numTasks])\t| When called on two DStreams of (K, V) and (K, W) pairs, return a new DStream of (K, (V, W)) pairs with all pairs of elements for each key.\n",
 "| **cogroup**(otherStream, [numTasks])\t| When called on a DStream of (K, V) and (K, W) pairs, return a new DStream of (K, Seq[V], Seq[W]) tuples.\n",
-"| **transform**(func)\t| Return a new DStream by applying a RDD-to-RDD function to every RDD of the source DStream. This can be used to do arbitrary RDD operations on the DStream.\n",
-"| **updateStateByKey**(func)\t| Return a new \"state\" DStream where the state for each key is updated by applying the given function on the previous state of the key and the new values for the key. This can be used to maintain arbitrary state data for each key.\n",
+"\n",
+"\n",
+"If you look at the spark streaming documentation, you will also find the `transform(func)` and `updateStateByKey(func)`. We will discuss these later in the course.\n",
 "\n"
 ]
 },

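The table kept by this hunk describes `reduceByKey` and `join` on DStreams. As a rough illustration only (not part of the course notebooks), a toy sketch using hypothetical in-memory queue streams:

```python
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "DStreamPairOpsSketch")
ssc = StreamingContext(sc, 1)

# Two toy (K, V) DStreams built from in-memory queues.
left = ssc.queueStream([sc.parallelize([("spark", 1), ("streaming", 1)])])
right = ssc.queueStream([sc.parallelize([("spark", "core"), ("streaming", "dstream")])])

# join() pairs values for matching keys within each batch: (K, (V, W)).
left.join(right).pprint()

# reduceByKey() aggregates values per key within each batch.
left.reduceByKey(lambda a, b: a + b).pprint()

ssc.start()
ssc.awaitTermination(5)
ssc.stop()
```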
Diff for: spark_streaming_basics/04_Basics of Transformations Exercise.ipynb

+1 -2

@@ -50,8 +50,7 @@
 "source": [
 "lines = sc.textFile(\"greetings.txt\")\n",
 "\n",
-"# TODO: Use any of the functions above to create a script that generates a Wordcount of the file greetings.txt.\n",
-"sorted(lines.flatMap(lambda line: line.split()).map(lambda w: (w,1)).reduceByKey(lambda v1, v2: v1+v2).collect())"
+"# TODO: Use any of the functions above to create a script that generates a Wordcount of the file greetings.txt.\n"
 ]
 },
 {

Diff for: spark_streaming_basics/Transformation Operation Demo.ipynb renamed to spark_streaming_basics/06_Transformation Operation Exercise - Solution.ipynb

+11 -4

@@ -4,7 +4,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"# Transformation Operation Demo"
+"# Transformation Operation Exercise"
 ]
 },
 {
@@ -27,8 +27,13 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"### Demo\n",
-"\n",
+"### Exercise"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
 "Suppose we have two rdds that we need to join together. They are RDD1\n",
 "\n",
 "RDD1\n",
@@ -54,7 +59,9 @@
 "source": [
 "rdd = sc.parallelize([(u'2', u'100', 2),(u'1', u'300', 1),(u'1', u'200', 1)])\n",
 "rdd1 = sc.parallelize([(u'1', u'2'), (u'1', u'3')])\n",
-"rdd2 = rdd1.map(lambda x:(x[1],x[0]))\n",
+"rdd2 = rdd1.map(lambda x:(x[1], x[0]))\n",
+"\n",
+"##### TODO: Creat a `newRdd` variable with the elements from RDD2 that have the same second value of RDD1\n",
 "newRdd = rdd2.transform(lambda rdd: rdd.join(rdd.map(lambda x:(x[0],(x[1],x[2])))))\n",
 "newRdd.map(lambda x:(x[1][0], x[0], x[1][1][0], x[1][1][1])).coalesce(1).collect()\n"
 ]

Diff for: spark_streaming_basics/06_Transformation Operation Exercise.ipynb

+1 -2

@@ -62,8 +62,7 @@
 "rdd2 = rdd1.map(lambda x:(x[1], x[0]))\n",
 "\n",
 "##### TODO: Creat a `newRdd` variable with the elements from RDD2 that have the same second value of RDD1\n",
-"newRdd = rdd2.transform(lambda rdd: rdd.join(rdd.map(lambda x:(x[0],(x[1],x[2])))))\n",
-"newRdd.map(lambda x:(x[1][0], x[0], x[1][1][0], x[1][1][1])).coalesce(1).collect()\n"
+"\n"
 ]
 },
 {
Diff for: (new file)

@@ -0,0 +1,123 @@
+{
+"cells": [
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"# Window Operations Exercise"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"1. What is Window Operations(better with some graphs)\n",
+"2. Explain parameters (window length and sliding interval)\n",
+"3. Some of the popular Window operations\n",
+" * Window\n",
+" * countByWindow\n",
+" * reduceByKeyAndWindow\n",
+" * countByValueAndWindow\n"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"### Exercise"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": true
+},
+"outputs": [],
+"source": [
+"from pyspark import SparkConf, SparkContext\n",
+"from pyspark.streaming import StreamingContext\n",
+"import sys\n",
+"import random\n",
+"from apache_log_parser import ApacheAccessLog\n",
+"\n",
+"random.seed(15)\n",
+"\n",
+"if len(sys.argv) != 2:\n",
+" print('Please provide the path to Apache log file')\n",
+" print('10_10.py <path_to_log_directory>')\n",
+" sys.exit(2)\n",
+"\n",
+"conf = (SparkConf().setMaster(\"local[4]\").setAppName(\"log processor\").set(\"spark.executor.memory\", \"2g\"))\n",
+"\n",
+"sc = SparkContext(conf=conf)\n",
+"\n",
+"ssc = StreamingContext(sc, 2)\n",
+"ssc.checkpoint(\"checkpoint\")\n",
+" \n",
+"directory = sys.argv[1]\n",
+"print(directory)\n",
+"\n",
+"# create DStream from text file\n",
+"# Note: the spark streaming checks for any updates to this directory.\n",
+"# So first, start this program, and then copy the log file logs/access_log.log to 'directory' location\n",
+"log_data = ssc.textFileStream(directory)\n",
+"access_log_dstream = log_data.map(ApacheAccessLog.parse_from_log_line).filter(lambda parsed_line: parsed_line is not None)\n",
+"ip_dstream = access_log_dstream.map(lambda parsed_line: (parsed_line.ip, 1)) \n",
+"ip_count = ip_dstream.reduceByKey(lambda x,y: x+y)\n",
+"ip_count.pprint(num = 30)\n",
+"ip_bytes_dstream = access_log_dstream.map(lambda parsed_line: (parsed_line.ip, parsed_line.content_size))\n",
+"ip_bytes_sum_dstream = ip_bytes_dstream.reduceByKey(lambda x,y: x+y)\n",
+"ip_bytes_request_count_dstream = ip_count.join(ip_bytes_sum_dstream)\n",
+"ip_bytes_request_count_dstream.pprint(num = 30)\n",
+"\n",
+"####### TODO: use window()to count data over a window##########################\n",
+"access_logs_window = access_log_dstream.window(windowDuration = 6, slideDuration=4) \n",
+"window_counts = access_logs_window.count()\n",
+"print( \" Window count: \")\n",
+"window_counts.pprint()\n",
+"\n",
+"####### Exercise End ##########################################################\n",
+"\n",
+"ssc.start() \n",
+"ssc.awaitTermination()\n"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## References\n",
+"1. https://spark.apache.org/docs/latest/streaming-programming-guide.html#discretized-streams-dstreams"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+" "
+]
+}
+],
+"metadata": {
+"kernelspec": {
+"display_name": "Python 3",
+"language": "python",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.6.3"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 2
+}
