Commit a76e143
committed
Redshift Blog Post

1 parent 7bc8069 commit a76e143

File tree: 19 files changed, +680 -15 lines changed

_config.yml  (+10 -10)

```diff
@@ -94,39 +94,39 @@ JB :
 # num_posts: 5
 # width: 580
 # colorscheme: light
-
+
 # Settings for analytics helper
 # Set 'provider' to the analytics provider you want to use.
 # Set 'provider' to false to turn analytics off globally.
-#
+#
 analytics :
-  provider : google
-  google :
+  provider : google
+  google :
   tracking_id : 'UA-40495390-2'
 # getclicky :
-#   site_id :
+#   site_id :
 # mixpanel :
 #   token : '_MIXPANEL_TOKEN_'
 # piwik :
 #   baseURL : 'myserver.tld/piwik' # Piwik installation address (without protocol)
 #   idsite : '1' # the id of the site on Piwik

-# Settings for sharing helper.
+# Settings for sharing helper.
 # Sharing is for things like tweet, plusone, like, reddit buttons etc.
 # Set 'provider' to the sharing provider you want to use.
 # Set 'provider' to false to turn sharing off globally.
 #
 sharing :
   provider : false
-
-# Settings for all other include helpers can be defined by creating
+
+# Settings for all other include helpers can be defined by creating
 # a hash with key named for the given helper. ex:
 #
 # pages_list :
-#   provider : "custom"
+#   provider : "custom"
 #
 # Setting any helper's provider to 'custom' will bypass the helper code
 # and include your custom code. Your custom file must be defined at:
 # ./_includes/custom/[HELPER]
 # where [HELPER] is the name of the helper you are overriding.
-
+
```
New file  (+102)

@@ -0,0 +1,102 @@
---
layout: post
title: "Redshift SSD Benchmarks"
description: "Benchmarking Redshift performance across different node types"
category: Redshift, Data Science, Data Warehousing
tags: [Coursera, Analytics]
---
{% include JB/setup %}

Our warehouse runs completely on Redshift, and query performance is extremely important to us. Earlier this year, the AWS team announced SSD instances for Amazon Redshift. Is the extra CPU truly worth the cost? We do a lot of processing with Redshift, so this question matters a great deal to us. To answer it, we benchmarked SSD performance and compared it against our original HDD performance.

Redshift is easy to use because its PostgreSQL-compatible JDBC drivers allow us to work with a range of familiar SQL clients. Its speedy performance comes from columnar storage and data compression.
## Experiment Setup

The Redshift instance specs below are based on on-demand pricing, though reserved instances can be up to 75% cheaper. The reported results are the mean run times over 3 runs of each query.
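The measurement itself is simple enough to sketch: time each query three times and report the mean. Below is a minimal harness along those lines; the lambda is a hypothetical stand-in for executing a real query against the cluster.

```python
import time
from statistics import mean

def benchmark(run_query, runs=3):
    """Time run_query() `runs` times and return the mean wall-clock seconds."""
    samples = []
    for _ in range(runs):
        start = time.perf_counter()
        run_query()
        samples.append(time.perf_counter() - start)
    return mean(samples)

# Hypothetical stand-in for executing a query against the cluster.
elapsed = benchmark(lambda: sum(range(100_000)))
print(f"mean of 3 runs: {elapsed:.6f}s")
```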
18+
<table class="table table-bordered table-striped table-hover">
19+
<colgroup>
20+
<col span="1" style="width: 20%;" />
21+
<col span="1" style="width: 20%;" />
22+
<col span="1" style="width: 20%;" />
23+
<col span="1" style="width: 20%;" />
24+
<col span="1" style="width: 20%;" />
25+
</colgroup>
26+
<thead>
27+
<tr>
28+
<td> </td>
29+
<td><b>HDD Setup 1</b></td>
30+
<td><b>HDD Setup 2</b></td>
31+
<td><b>SSD Setup 1</b></td>
32+
<td><b>SSD Setup 2</b></td>
33+
</tr>
34+
</thead>
35+
<tbody>
36+
<tr>
37+
<td><b>Nodes</b></td>
38+
<td>4 dw1.xlarge</td>
39+
<td>8 dw1.xlarge</td>
40+
<td>32 dw2.large</td>
41+
<td>4 dw2.8xlarge</td>
42+
</tr>
43+
<tr>
44+
<td><b>Storage</b></td>
45+
<td>8 TB</td>
46+
<td>16 TB</td>
47+
<td>5.12 TB</td>
48+
<td>10.24 TB</td>
49+
</tr>
50+
<tr>
51+
<td><b>Memory</b></td>
52+
<td>60 GB</td>
53+
<td>120 GB</td>
54+
<td>480 GB</td>
55+
<td>976 GB</td>
56+
</tr>
57+
<tr>
58+
<td><b>vCPU</b></td>
59+
<td>8</td>
60+
<td>16</td>
61+
<td>64</td>
62+
<td>128</td>
63+
</tr>
64+
<tr>
65+
<td><b>Price</b></td>
66+
<td>$3.4 / hr</td>
67+
<td>$6.8 / hr</td>
68+
<td>$8 / hr</td>
69+
<td>$19.2 / hr</td>
70+
</tr>
71+
</tbody>
72+
</table>
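A useful way to read the table is to normalize each setup by price. Comparing HDD Setup 1 against SSD Setup 1 (the figures below are taken directly from the table) shows how much CPU, memory, and storage each dollar buys:

```python
# Price-normalized comparison of HDD Setup 1 vs SSD Setup 1,
# using the figures from the experiment-setup table.
hdd = {"price": 3.4, "vcpu": 8,  "mem_gb": 60,  "storage_tb": 8.0}
ssd = {"price": 8.0, "vcpu": 64, "mem_gb": 480, "storage_tb": 5.12}

cpu_ratio  = (ssd["vcpu"] / ssd["price"]) / (hdd["vcpu"] / hdd["price"])
mem_ratio  = (ssd["mem_gb"] / ssd["price"]) / (hdd["mem_gb"] / hdd["price"])
disk_ratio = (ssd["storage_tb"] / ssd["price"]) / (hdd["storage_tb"] / hdd["price"])

print(f"vCPU per $:    {cpu_ratio:.2f}x")   # ~3.4x in favour of SSD
print(f"memory per $:  {mem_ratio:.2f}x")   # ~3.4x in favour of SSD
print(f"storage per $: {disk_ratio:.2f}x")  # ~0.27x, i.e. about a quarter
```

These ratios are where the conclusion's "same price, 3.4 times the CPU and memory, but only ~25% of the storage" figures come from.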
### Query 1.

First, we ran a simple join between a table with 1 billion rows and a table with 50 million rows. The total amount of data processed was around 46 GB. The results fell in favour of the SSDs.

<img src="https://dnsta5v53r71w.cloudfront.net/images/redshift-ssd-benchmark/1a.png" alt="Screenshot" style="width: 80%; margin-left:10%; margin-right:10%; margin-top:20px; margin-bottom:20px;"/>
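The actual benchmark query is not shown in the post; the sketch below only illustrates the same shape (a large fact table joined to a much smaller table, with an aggregate), run against a toy in-memory SQLite database. All table and column names are hypothetical.

```python
import sqlite3

# Toy stand-in for the "large table joined to a smaller table" shape.
# Table and column names are hypothetical; the real benchmark ran on Redshift.
conn = sqlite3.connect(":memory:")
conn.executescript("""
    CREATE TABLE events (user_id INTEGER, action TEXT);
    CREATE TABLE users  (user_id INTEGER PRIMARY KEY, country TEXT);
""")
conn.executemany("INSERT INTO users VALUES (?, ?)",
                 [(1, "US"), (2, "CA"), (3, "US")])
conn.executemany("INSERT INTO events VALUES (?, ?)",
                 [(1, "view"), (1, "click"), (2, "view"), (3, "view")])

rows = conn.execute("""
    SELECT u.country, COUNT(*) AS n
    FROM events e
    JOIN users u ON u.user_id = e.user_id
    GROUP BY u.country
    ORDER BY n DESC
""").fetchall()
print(rows)  # [('US', 3), ('CA', 1)]
conn.close()
```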
### Query 2.

This complex query features regex matching and aggregate functions over 1 million rows produced by 4 joins. The total amount of data processed was around 100 GB. The results fell even further in favour of the SSDs, with performance improvements of 5x to 15x.

<img src="https://dnsta5v53r71w.cloudfront.net/images/redshift-ssd-benchmark/2.png" alt="Screenshot" style="width: 80%; margin-left:10%; margin-right:10%; margin-top:20px; margin-bottom:20px;"/>
### Query 3.

A query that runs window functions over a table of 1 billion rows showed surprising results. The table holds about 400 GB of data. Although the SSDs performed better overall, the smaller SSD cluster out-performed the bigger one, despite the bigger cluster having double the memory and CPU power.

<img src="https://dnsta5v53r71w.cloudfront.net/images/redshift-ssd-benchmark/3.png" alt="Screenshot" style="width: 80%; margin-left:10%; margin-right:10%; margin-top:20px; margin-bottom:20px;"/>
### Query 4.

This last query has 4 join statements, plus a subquery that itself includes 2 joins. The amount of data processed is around 107 GB. Since this query is very compute-heavy, it is not surprising that the SSDs performed 10x better. What is surprising is that the smaller SSD cluster once again outperformed the bigger one.

<img src="https://dnsta5v53r71w.cloudfront.net/images/redshift-ssd-benchmark/4a.png" alt="Screenshot" style="width: 80%; margin-left:10%; margin-right:10%; margin-top:20px; margin-bottom:20px;"/>
## Conclusion

We also ran several other queries, and the performance improvement from HDD to SSD was consistently about 5 to 10 times. From these experiments, the DW2 machines are clearly promising in terms of computation time. For the same price, the SSDs provide 3.4 times more CPU power and memory; however, they offer only about 25% of the HDDs' disk storage.

A limitation of the dw2.large SSD instances is that a Redshift cluster can contain at most 32 of them, so dw2.large clusters top out at 5.12 TB of disk storage. The only other option is to upgrade to dw2.8xlarge nodes, but this experiment shows little performance benefit in moving from dw2.large to dw2.8xlarge despite the doubled memory and CPU.

<i><small>PS: This was originally written by Jason Shao on the [Coursera blog](https://tech.coursera.org/blog/2014/12/19/redshift-benchmark/).</small></i>
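The 5.12 TB ceiling follows from arithmetic on the experiment-setup table: 32 dw2.large nodes holding 5.12 TB implies 0.16 TB (160 GB) of SSD per node, versus 2.56 TB per dw2.8xlarge node. A quick check, using only figures from the table:

```python
# Figures taken from the experiment-setup table above.
dw2_large_nodes, dw2_large_cluster_tb = 32, 5.12
dw2_8xlarge_nodes, dw2_8xlarge_cluster_tb = 4, 10.24

per_node_large_tb = dw2_large_cluster_tb / dw2_large_nodes        # 0.16 TB/node
per_node_8xlarge_tb = dw2_8xlarge_cluster_tb / dw2_8xlarge_nodes  # 2.56 TB/node

# With at most 32 dw2.large nodes, the cluster-wide SSD ceiling is:
max_dw2_large_tb = 32 * per_node_large_tb
print(f"{max_dw2_large_tb:.2f} TB")  # 5.12 TB
```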

_site/articles/index.html  (+5)

```diff
@@ -142,6 +142,11 @@ <h1>Articles <br /></h1>
 <div class='container articleList'>
 <table class="table table-responsive post-table">

+<tr>
+<td><h4 class='postTitle'><a href="/redshift,%20data%20science,%20data%20warehousing/2014/12/20/redshift-ssd-benchmarks">Redshift SSD Benchmarks</a></h4></td>
+<td class='postDate' style='vertical-align:middle;'><time datetime="2014-12-20T00:00:00-08:00">December 20, 2014</time></td>
+</tr>
+
 <tr>
 <td><h4 class='postTitle'><a href="/python/2014/04/20/pycon-2014---montreal">Pycon 2014 - Montreal</a></h4></td>
 <td class='postDate' style='vertical-align:middle;'><time datetime="2014-04-20T00:00:00-07:00">April 20, 2014</time></td>
```
Binary image files added (43.9 KB and 40.5 KB benchmark charts; not rendered in this view).

_site/atom.xml  (+99-1)

```diff
@@ -4,14 +4,112 @@
 <title>Sourabh Bajaj</title>
 <link href="http://sourabhbajaj.com/" rel="self"/>
 <link href="http://sourabhbajaj.com"/>
-<updated>2014-11-30T11:35:40-08:00</updated>
+<updated>2014-12-20T02:02:53-08:00</updated>
 <id>http://sourabhbajaj.com</id>
 <author>
 <name>Sourabh Bajaj</name>
 <email>[email protected]</email>
 </author>

+<entry>
+<title>Redshift SSD Benchmarks</title>
+<link href="http://sourabhbajaj.com/redshift,%20data%20science,%20data%20warehousing/2014/12/20/redshift-ssd-benchmarks"/>
+<updated>2014-12-20T00:00:00-08:00</updated>
+<id>http://sourabhbajaj.com/redshift,%20data%20science,%20data%20warehousing/2014/12/20/redshift-ssd-benchmarks</id>
+<content type="html">
+[HTML-escaped copy of the post body, identical to the new post file above]
+</content>
+</entry>
+
 <entry>
 <title>Pycon 2014 - Montreal</title>
 <link href="http://sourabhbajaj.com/python/2014/04/20/pycon-2014---montreal"/>
```

_site/extra/archive.html  (+20-1)

```diff
@@ -151,10 +151,29 @@ <h1>Archive <br /></h1>
 <h2>2014</h2>
-<h3>April</h3>
+<h3>December</h3>
 <ul>
+<li><span>December 20, 2014</span> &raquo; <a href="/redshift,%20data%20science,%20data%20warehousing/2014/12/20/redshift-ssd-benchmarks">Redshift SSD Benchmarks</a></li>
+</ul>
+<h3>April</h3>
+<ul>
 <li><span>April 20, 2014</span> &raquo; <a href="/python/2014/04/20/pycon-2014---montreal">Pycon 2014 - Montreal</a></li>
```
_site/extra/categories.html  (+24)

```diff
@@ -157,6 +157,10 @@ <h1>Categories <br /></h1>
 <li><a href="/categories.html#python-ref">
 python <span>2</span>
 </a></li>
+<li><a href="/categories.html#redshift, data science, data warehousing-ref">
+redshift, data science, data warehousing <span>1</span>
+</a></li>

@@ -239,6 +243,26 @@ <h2 id="python-ref">python</h2>
+</ul>
+
+<h2 id="redshift, data science, data warehousing-ref">redshift, data science, data warehousing</h2>
+<ul>
+<li><a href="/redshift,%20data%20science,%20data%20warehousing/2014/12/20/redshift-ssd-benchmarks">Redshift SSD Benchmarks</a></li>
+
 </ul>
```
_site/extra/sitemap.txt  (+1)

```diff
@@ -13,6 +13,7 @@ http://sourabhbajaj.com/index.html
 http://sourabhbajaj.com/portfolio/index.html
 http://sourabhbajaj.com/rss.xml

+http://sourabhbajaj.com/redshift,%20data%20science,%20data%20warehousing/2014/12/20/redshift-ssd-benchmarks
 http://sourabhbajaj.com/python/2014/04/20/pycon-2014---montreal
 http://sourabhbajaj.com/installation/2014/04/20/mac-os-x-setup-guide
 http://sourabhbajaj.com/python/2014/03/31/fix-valueerror-unknown-locale-utf-8
```