diff --git a/CMakeLists.txt b/CMakeLists.txt
index c473e2c0..e4247748 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -112,10 +112,10 @@ source_group(Headers FILES ${headers})
 source_group(Sources FILES ${sources})
 #add_subdirectory(src/ImGui)
-#add_subdirectory(stream_compaction) # TODO: uncomment if using your stream compaction
+add_subdirectory(stream_compaction) # TODO: uncomment if using your stream compaction
 cuda_add_executable(${CMAKE_PROJECT_NAME} ${sources} ${headers})
 target_link_libraries(${CMAKE_PROJECT_NAME}
     ${LIBRARIES}
-    #stream_compaction # TODO: uncomment if using your stream compaction
+    stream_compaction # TODO: uncomment if using your stream compaction
     )
diff --git a/README.md b/README.md
index 110697ce..a5b0207f 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,69 @@
-CUDA Path Tracer
-================
+CUDA Path Tracer
+================
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3**
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3**
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Ryan Tong
+ * [LinkedIn](https://www.linkedin.com/in/ryanctong/), [personal website](), [twitter](), etc.
+* Tested on: Windows 10, i7-8750H @ 2.20GHz 16GB, GeForce GTX 1060 6144MB (Personal Laptop)
-### (TODO: Your README)
+![Path Tracer](img/title.png)
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+### Project Description
+This project is a path tracer implemented on the GPU using CUDA. The path tracer shoots rays out of the camera, bounces them around the scene to accumulate color, and repeats this over many iterations to converge on an image. Path tracing lets us render images with features such as global illumination, color bleed, and 'free' soft shadows.
+### Project Features
+This project has the following features:
+1. Diffuse surfaces, where rays bounce off the surface randomly
+2. Specular surfaces, where rays bounce off the surface in a predictable (mirror) direction
+3. Imperfect specular surfaces, where rays probabilistically bounce off the surface in either a diffuse or a specular way
+4. Refractive surfaces, where rays are refracted into the object
+5. Physically based depth of field
+6. Stochastic sampled anti-aliasing
+7. Direct Lighting
+
+### Different Materials
+The effect of different materials (diffuse/specular/refractive) is achieved through a shading kernel that evaluates the direction in which the incoming ray should bounce. Specifically, diffuse surfaces reflect rays randomly within the hemisphere around the surface normal, specular surfaces reflect rays about the surface normal in the mirror direction, and refractive surfaces refract rays into the object according to Snell's law. This gives the following effects:
+### Perfectly Diffuse
+![Diffuse](img/diffuse.png)
+### Perfectly Specular
+![Specular](img/specular.png)
+### Imperfect Specular (Diffuse and Specular)
+![Imperfect](img/imperfectspecular.png)
+### Refractive
+![Refractive](img/refractive.png)
+
+### Physically Based Depth of Field
+A depth of field effect can be achieved by simulating a thin lens. To do this, we choose an aperture size and a focal length: each camera ray's origin is jittered across the aperture and the ray is re-aimed at its focal point, so objects at the focal distance stay sharp while the rest of the scene blurs.
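+Below is a minimal sketch of this lens sampling; the real code lives inline in `generateRayFromCamera` in `pathtrace.cu`, and the helper name and signature here are purely illustrative:
+```cuda
+#include <thrust/random.h>
+#include <glm/glm.hpp>
+#include "sceneStructs.h" // for the project's Ray struct (origin + direction)
+
+// Hypothetical helper: jitter a pinhole camera ray across a square aperture and
+// re-aim it at the plane of focus (thin-lens approximation).
+__device__ void applyDepthOfField(Ray& ray, float focalLength, float apertureSize,
+                                  thrust::default_random_engine& rng) {
+    thrust::uniform_real_distribution<float> u01(0, 1);
+    // Point along the original ray that lies on the plane of focus.
+    glm::vec3 focalPoint = ray.origin + focalLength * ray.direction;
+    // Random offset on the aperture, centered on the original camera position.
+    glm::vec3 blur = glm::vec3(u01(rng), u01(rng), u01(rng)) - 0.5f;
+    ray.origin += blur * apertureSize;
+    // Every jittered ray still passes through the focal point, so geometry at the
+    // focal distance stays sharp while the rest of the scene blurs.
+    ray.direction = glm::normalize(focalPoint - ray.origin);
+}
+```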
+### Close Object (Focal Length = 9)
+![Close](img/DOF_close.png)
+### Far Object (Focal Length = 13)
+![Far](img/DOF_far.png)
+
+### Stochastic Sampled Anti-Aliasing
+Anti-aliasing can be achieved by jittering the rays shot from the camera so that they no longer always point at the center of their pixel. Since this jitter is re-randomized at the beginning of each iteration, the first-bounce caching optimization (discussed below) is no longer valid: the rays no longer hit the same spot on every iteration.
+### Anti-Aliasing Comparison
+![Diffuse](img/diffuse.png)
+![AA](img/AA.png)
+### Anti-Aliasing on Left
+![Comparison](img/aa_compare.png)
+
+### Direct Lighting
+A path traced scene can be made brighter by directing each ray's final bounce towards a light source. In my implementation, when a ray is on its last remaining bounce, I randomly select a light source, pick a random point on it, and set the ray direction towards that point.
+### Direct Lighting Comparison (Direct Lighting on Left)
+![Lighting](img/light_compare.jpg)
+
+### Performance Analysis: Stream Compaction
+Stream compaction is used to remove terminated rays. Rays that hit a light or miss the scene entirely are terminated, and removing them improves performance because the number of rays processed at each bounce shrinks over the course of an iteration. A key factor in how many rays survive each bounce is how "open" the scene is and how large or numerous the light sources are: the more open space there is, the more likely rays are to hit nothing and terminate, and similarly for lights. The following chart shows the number of active rays remaining over the course of a single iteration in a closed and an open Cornell box.
+### Stream Compaction Analysis
+![compaction](img/compaction.png)
+
+### Performance Analysis: Material Sorting
+The type of material a ray hits greatly affects how long it takes to compute the new ray direction (rays hitting the same material do the same math). This matters on a GPU because, within a warp, threads executing divergent code (if statements) must wait for the other threads in the warp to finish before continuing. Since the shading kernel branches on the material type, sorting the rays by material puts rays taking the same branch into the same warps, leaving fewer idle threads. Theoretically, this should improve performance. However, the chart below shows that this optimization actually performs worse. The reason is that the BSDFs in our shading kernel are not very complex, so the work per material is similar in run time; threads already finish at roughly the same time despite the divergence, and the cost of sorting dominates any gains. Similarly, my renders use only a few material types, which further reduces the impact of divergence. In a path tracer with more complex BSDFs and more materials, sorting would likely pay off.
+### Material Sorting Analysis
+![sorting](img/materialsort.png)
+
+### Performance Analysis: First Bounce Caching
+When we shoot rays from the camera, the first intersection is the same for every iteration (as long as anti-aliasing is off). Therefore, we can cache these intersections so that we do not need to recompute them at the start of each iteration.
This is especially impactful because the number of rays at the beginning is always the greatest (since no rays have terminated yet) and thus we can save a lot of computation. We can see these performance enhancements in the graph below. One thing to note is that this graph was generated using the Cornell box scene which is very closed off (only one open face). Therefore, the number of rays terminated is lower and thus more computations are needed throughout each iteration. For a more open scene, after the first bounce, most rays may be terminated which means that the first intersection computation becomes more dominant in the run time. +### First Bounce Caching Analysis +![cache](img/cache.png) diff --git a/img/AA.png b/img/AA.png new file mode 100644 index 00000000..669d324b Binary files /dev/null and b/img/AA.png differ diff --git a/img/DOF_close.png b/img/DOF_close.png new file mode 100644 index 00000000..65b5c73a Binary files /dev/null and b/img/DOF_close.png differ diff --git a/img/DOF_far.png b/img/DOF_far.png new file mode 100644 index 00000000..0acba92c Binary files /dev/null and b/img/DOF_far.png differ diff --git a/img/aa_compare.png b/img/aa_compare.png new file mode 100644 index 00000000..dc4c3562 Binary files /dev/null and b/img/aa_compare.png differ diff --git a/img/cache.png b/img/cache.png new file mode 100644 index 00000000..a37a6c66 Binary files /dev/null and b/img/cache.png differ diff --git a/img/compaction.png b/img/compaction.png new file mode 100644 index 00000000..a7566e1d Binary files /dev/null and b/img/compaction.png differ diff --git a/img/diffuse.png b/img/diffuse.png new file mode 100644 index 00000000..ca593ed3 Binary files /dev/null and b/img/diffuse.png differ diff --git a/img/directlighting.png b/img/directlighting.png new file mode 100644 index 00000000..184ccdc0 Binary files /dev/null and b/img/directlighting.png differ diff --git a/img/imperfect specular.png b/img/imperfect specular.png new file mode 100644 index 00000000..1d84835a Binary files /dev/null and b/img/imperfect specular.png differ diff --git a/img/light_compare.jpg b/img/light_compare.jpg new file mode 100644 index 00000000..9805d50c Binary files /dev/null and b/img/light_compare.jpg differ diff --git a/img/materialsort.png b/img/materialsort.png new file mode 100644 index 00000000..c38f97cb Binary files /dev/null and b/img/materialsort.png differ diff --git a/img/refractive.png b/img/refractive.png new file mode 100644 index 00000000..31ae9928 Binary files /dev/null and b/img/refractive.png differ diff --git a/img/specular.png b/img/specular.png new file mode 100644 index 00000000..2a1ff143 Binary files /dev/null and b/img/specular.png differ diff --git a/img/title.png b/img/title.png new file mode 100644 index 00000000..872a8981 Binary files /dev/null and b/img/title.png differ diff --git a/scenes/cornell.txt b/scenes/cornell.txt index 83ff8202..507b32e6 100644 --- a/scenes/cornell.txt +++ b/scenes/cornell.txt @@ -38,7 +38,7 @@ REFR 0 REFRIOR 0 EMITTANCE 0 -// Specular white +// Perfectly Specular MATERIAL 4 RGB .98 .98 .98 SPECEX 0 @@ -48,6 +48,26 @@ REFR 0 REFRIOR 0 EMITTANCE 0 +// Refractive +MATERIAL 5 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB .98 .98 .98 +REFL 0 +REFR 1 +REFRIOR 1.5 +EMITTANCE 0 + +// Imperfect Specular +MATERIAL 6 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB .98 .98 .98 +REFL .1 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + // Camera CAMERA RES 800 800 @@ -111,7 +131,25 @@ SCALE .01 10 10 // Sphere OBJECT 6 sphere -material 4 +material 6 TRANS -1 4 -1 ROTAT 0 0 0 SCALE 3 3 3 + +// Sphere 
+//OBJECT 7 +//sphere +//material 5 +//TRANS 2 4 1 +//ROTAT 0 0 0 +//SCALE 3 3 3 + +// Sphere +//OBJECT 8 +//sphere +//material 4 +//TRANS -3 4 2 +//ROTAT 0 0 0 +//SCALE 3 3 3 + + diff --git a/scenes/enclosed_cornell.txt b/scenes/enclosed_cornell.txt new file mode 100644 index 00000000..2da20996 --- /dev/null +++ b/scenes/enclosed_cornell.txt @@ -0,0 +1,161 @@ +// Emissive material (light) +MATERIAL 0 +RGB 1 1 1 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 5 + +// Diffuse white +MATERIAL 1 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Diffuse red +MATERIAL 2 +RGB .85 .35 .35 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Diffuse green +MATERIAL 3 +RGB .35 .85 .35 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Perfectly Specular +MATERIAL 4 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB .98 .98 .98 +REFL 1 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Refractive +MATERIAL 5 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB .98 .98 .98 +REFL 0 +REFR 1 +REFRIOR 1.5 +EMITTANCE 0 + +// Imperfect Specular +MATERIAL 6 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB .98 .98 .98 +REFL .1 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Camera +CAMERA +RES 800 800 +FOVY 45 +ITERATIONS 1 +DEPTH 8 +FILE cornell +EYE 0.0 5 10.5 +LOOKAT 0 5 0 +UP 0 1 0 + + +// Ceiling light +OBJECT 0 +cube +material 0 +TRANS 0 10 0 +ROTAT 0 0 0 +SCALE 3 .3 3 + +// Floor +OBJECT 1 +cube +material 1 +TRANS 0 0 0 +ROTAT 0 0 0 +SCALE 10 .01 40 + +// Ceiling +OBJECT 2 +cube +material 1 +TRANS 0 10 0 +ROTAT 0 0 90 +SCALE .01 10 40 + +// Back wall +OBJECT 3 +cube +material 1 +TRANS 0 5 -5 +ROTAT 0 90 0 +SCALE .01 10 10 + +// Left wall +OBJECT 4 +cube +material 2 +TRANS -5 5 0 +ROTAT 0 0 0 +SCALE .01 10 40 + +// Right wall +OBJECT 5 +cube +material 3 +TRANS 5 5 0 +ROTAT 0 0 0 +SCALE .01 10 40 + +// Sphere +OBJECT 6 +sphere +material 1 +TRANS -1 4 -2 +ROTAT 0 0 0 +SCALE 3 3 3 + +// Sphere +OBJECT 7 +sphere +material 4 +TRANS 2 4 0 +ROTAT 0 0 0 +SCALE 3 3 3 + +// Sphere +OBJECT 8 +sphere +material 6 +TRANS -3 4 2 +ROTAT 0 0 0 +SCALE 3 3 3 + +// Front wall +OBJECT 9 +cube +material 1 +TRANS 0 5 15 +ROTAT 0 90 0 +SCALE .01 10 10 diff --git a/scenes/title.txt b/scenes/title.txt new file mode 100644 index 00000000..82d696e9 --- /dev/null +++ b/scenes/title.txt @@ -0,0 +1,310 @@ +// Emissive material (light) +MATERIAL 0 +RGB 1 1 .85 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 7.5 + +// Diffuse white +MATERIAL 1 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Diffuse blue +MATERIAL 2 +RGB .35 .35 .85 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Diffuse green +MATERIAL 3 +RGB .35 .85 .35 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Perfectly Specular +MATERIAL 4 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB .98 .98 .98 +REFL 1 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Refractive +MATERIAL 5 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB .98 .98 .98 +REFL 0 +REFR 1 +REFRIOR 1.5 +EMITTANCE 0 + +// Imperfect Specular White +MATERIAL 6 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB .98 .98 .98 +REFL .1 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Imperfect Specular Red +MATERIAL 7 +RGB .85 .35 .35 +SPECEX 0 +SPECRGB .85 .35 .35 +REFL .05 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Imperfect Specular Magenta +MATERIAL 8 +RGB .85 .35 .85 +SPECEX 0 +SPECRGB .85 .35 .85 +REFL .05 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Perfectly Specular Red +MATERIAL 9 +RGB .85 .35 .35 +SPECEX 0 +SPECRGB .85 .35 .35 +REFL 1 +REFR 0 +REFRIOR 0 
+EMITTANCE 0 + +// Perfectly Specular Magenta +MATERIAL 10 +RGB .85 .35 .85 +SPECEX 0 +SPECRGB .85 .35 .85 +REFL 1 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Perfectly Specular Cyan +MATERIAL 11 +RGB .85 .85 .35 +SPECEX 0 +SPECRGB .85 .85 .35 +REFL 1 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Emissive material (yellow) +MATERIAL 12 +RGB 1 1 .35 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 2.5 + +// Refractive +MATERIAL 13 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB .98 .98 .98 +REFL 0 +REFR 1 +REFRIOR 1.9 +EMITTANCE 0 + +// Camera +CAMERA +RES 800 800 +FOVY 45 +ITERATIONS 5000 +DEPTH 8 +FILE cornell +EYE 0.0 5 10.5 +LOOKAT 0 5 0 +UP 0 1 0 + +// Ceiling light left +OBJECT 0 +cube +material 0 +TRANS -9 15 -5 +ROTAT 0 0 0 +SCALE 6 .3 3 + +// Floor +OBJECT 1 +cube +material 1 +TRANS 0 1 0 +ROTAT 0 0 0 +SCALE 100 .1 20 + +// Ceiling +OBJECT 2 +cube +material 1 +TRANS 0 15 0 +ROTAT 0 0 90 +SCALE .1 100 50 + +// Back wall +OBJECT 3 +cube +material 1 +TRANS 0 5 -10 +ROTAT 0 90 0 +SCALE .1 20 100 + +// Ceiling light right +OBJECT 4 +cube +material 0 +TRANS 9 15 -5 +ROTAT 0 0 0 +SCALE 6 .3 3 + +// Left wall +OBJECT 5 +cube +material 1 +TRANS -40 5 0 +ROTAT 0 0 0 +SCALE .1 20 20 + +// Right wall +OBJECT 6 +cube +material 1 +TRANS 40 5 0 +ROTAT 0 0 0 +SCALE .1 20 20 + +// Sphere glass +OBJECT 7 +sphere +material 5 +TRANS 0 2 3 +ROTAT 0 0 0 +SCALE 2 2 2 + +// Sphere light +OBJECT 8 +sphere +material 12 +TRANS 0 2 3 +ROTAT 0 0 0 +SCALE .5 .5 .5 + +// Sphere tip +OBJECT 9 +sphere +material 4 +TRANS 0 2 0 +ROTAT 0 0 0 +SCALE 2 2 2 + +// Sphere layer 1 +OBJECT 10 +sphere +material 2 +TRANS 1.5 2 1.5 +ROTAT 0 0 0 +SCALE 2 2 2 + +// Sphere layer 1 +OBJECT 11 +sphere +material 3 +TRANS -1.5 2 1.5 +ROTAT 0 0 0 +SCALE 2 2 2 + +// Sphere layer 2 +OBJECT 12 +sphere +material 7 +TRANS -3 2 3 +ROTAT 0 0 0 +SCALE 2 2 2 + +// Sphere layer 2 +OBJECT 13 +sphere +material 8 +TRANS 3 2 3 +ROTAT 0 0 0 +SCALE 2 2 2 + +// Cube right +OBJECT 14 +cube +material 5 +TRANS 6 4 -3 +ROTAT 0 45 0 +SCALE 3 4 3 + +// Cube left +OBJECT 15 +cube +material 5 +TRANS -6 4 -3 +ROTAT 0 45 0 +SCALE 3 4 3 + +// Sphere in cube left +//OBJECT 16 +//sphere +//material 12 +//TRANS -6 7 -3 +//ROTAT 0 0 0 +//SCALE .5 .5 .5 + +// Sphere in cube left +OBJECT 16 +sphere +material 4 +TRANS -6 7 -3 +ROTAT 0 0 0 +SCALE 2 2 2 + +// Sphere in cube right +//OBJECT 18 +//sphere +//material 12 +//TRANS 6 7 -3 +//ROTAT 0 0 0 +//SCALE .5 .5 .5 + +// Sphere in cube right +OBJECT 17 +sphere +material 4 +TRANS 6 7 -3 +ROTAT 0 0 0 +SCALE 2 2 2 \ No newline at end of file diff --git a/src/interactions.h b/src/interactions.h index f969e458..a849adb5 100644 --- a/src/interactions.h +++ b/src/interactions.h @@ -76,4 +76,51 @@ void scatterRay( // TODO: implement this. // A basic implementation of pure-diffuse shading will just call the // calculateRandomDirectionInHemisphere defined above. + + thrust::uniform_real_distribution u01(0, 1); + + if(m.hasRefractive) { // Refractive Case, assumes we can only have perfectly refractive objects + //Figure out what side of the object we are on (inside or outside) + float dot = glm::dot(pathSegment.ray.direction, normal); + bool out = dot > 0; + + //Eta is refractive index of the medium of the light, air = 1, glass = 1.5 + //Eta1 == currently in, eta2 == going into + float eta1 = out ? 1.f : m.indexOfRefraction; + float eta2 = out ? 
m.indexOfRefraction : 1.f; + + //Compute schlicks approx to decide whether or not to reflect or refract light + float r_0 = ((eta1 - eta2) / (eta1 + eta2)) * ((eta1 - eta2) / (eta1 + eta2)); + + float R = r_0 + (1 - eta1) * pow(1 - cos(dot), 5); + if (u01(rng) > R) { + pathSegment.ray.direction = glm::normalize(glm::refract(pathSegment.ray.direction, normal, eta2 / eta1)); + pathSegment.ray.origin = intersect + .001f * glm::normalize(pathSegment.ray.direction); + pathSegment.color *= m.color; + } + else { + pathSegment.ray.direction = glm::normalize(glm::reflect(pathSegment.ray.direction, normal)); + //pathSegment.ray.direction = calculateRandomDirectionInHemisphere(normal, rng); + pathSegment.ray.origin = intersect; + pathSegment.color *= m.specular.color; + } + + } + else { + //0 = diffuse 1 = specular + float diffuseSpecularThresh = 1.f - m.hasReflective; + float diffuseSpecularRand = u01(rng); + if (diffuseSpecularRand > diffuseSpecularThresh) { // Specular case + pathSegment.color *= m.specular.color; + pathSegment.ray.direction = glm::normalize(glm::reflect(pathSegment.ray.direction, normal)); + pathSegment.ray.origin = intersect; + //float exp = m.specular.exponent; + } + else { // Diffuse case + pathSegment.color *= m.color; + pathSegment.ray.direction = calculateRandomDirectionInHemisphere(normal, rng); + pathSegment.ray.origin = intersect; + } + } + pathSegment.remainingBounces--; } diff --git a/src/pathtrace.cu b/src/pathtrace.cu index fd2a4641..ea44a750 100644 --- a/src/pathtrace.cu +++ b/src/pathtrace.cu @@ -4,6 +4,9 @@ #include #include #include +#include +#include +#include #include "sceneStructs.h" #include "scene.h" @@ -13,9 +16,16 @@ #include "pathtrace.h" #include "intersections.h" #include "interactions.h" +#include "../stream_compaction/efficient.h" #define ERRORCHECK 1 +#define MATERIALSORT 0 +#define FIRSTBOUNCE 0 +#define AA 0 +#define DOF 0 +#define DIRECTLIGHTING 0 + #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) void checkCUDAErrorFn(const char* msg, const char* file, int line) { @@ -74,8 +84,15 @@ static Geom* dev_geoms = NULL; static Material* dev_materials = NULL; static PathSegment* dev_paths = NULL; static ShadeableIntersection* dev_intersections = NULL; + // TODO: static variables for device memory, any extra info you need, etc -// ... 
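+// Extra device buffers: cached first-bounce paths/intersections, a ping-pong buffer for stream compaction, and indices of emissive geometry for direct lighting (allocated in pathtraceInit)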
+thrust::device_ptr dev_thrust_intersections; +static PathSegment* dev_paths_cache = NULL; +static ShadeableIntersection* dev_intersections_cache = NULL; +static PathSegment* dev_paths_pingpong = NULL; +static PathSegment* temp_paths = NULL; +// Direct Lighting storage of light objects index +static int* dev_lights = NULL; void InitDataContainer(GuiDataContainer* imGuiData) { @@ -103,6 +120,18 @@ void pathtraceInit(Scene* scene) { cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection)); // TODO: initialize any extra device memeory you need + cudaMalloc(&dev_paths_cache, pixelcount * sizeof(PathSegment)); + + cudaMalloc(&dev_intersections_cache, pixelcount * sizeof(ShadeableIntersection)); + cudaMemset(dev_intersections_cache, 0, pixelcount * sizeof(ShadeableIntersection)); + + //Ping pong buffer for dev paths after stream compaction + cudaMalloc(&dev_paths_pingpong, pixelcount * sizeof(PathSegment)); + + cudaMalloc(&dev_lights, scene->lights.size() * sizeof(int)); + cudaMemcpy(dev_lights, scene->lights.data(), scene->lights.size() * sizeof(int), cudaMemcpyHostToDevice); + + temp_paths = (PathSegment*) malloc(pixelcount * sizeof(PathSegment)); checkCUDAError("pathtraceInit"); } @@ -115,7 +144,14 @@ void pathtraceFree() { cudaFree(dev_intersections); // TODO: clean up any extra device memory you created + cudaFree(dev_intersections_cache); + cudaFree(dev_paths_cache); + cudaFree(dev_paths_pingpong); + + free(temp_paths); checkCUDAError("pathtraceFree"); + + cudaFree(dev_lights); } /** @@ -130,20 +166,45 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path { int x = (blockIdx.x * blockDim.x) + threadIdx.x; int y = (blockIdx.y * blockDim.y) + threadIdx.y; + float focalLength = 13.f; // close ball = 9, far ball = 13ish? + float apertureSize = .5; if (x < cam.resolution.x && y < cam.resolution.y) { int index = x + (y * cam.resolution.x); + thrust::default_random_engine rng_x = makeSeededRandomEngine(iter, index, 0); + thrust::default_random_engine rng_y = makeSeededRandomEngine(iter, index, 1); + thrust::default_random_engine rng_z = makeSeededRandomEngine(iter, index, 2); + thrust::uniform_real_distribution u01(0, 1); + thrust::uniform_real_distribution aaOffset(-1, 1); PathSegment& segment = pathSegments[index]; segment.ray.origin = cam.position; segment.color = glm::vec3(1.0f, 1.0f, 1.0f); // TODO: implement antialiasing by jittering the ray + // Offset < / 2 pixel width + float offsetX = AA ? aaOffset(rng_x) * cam.pixelLength.x / 2.f : 0.f; + float offsetY = AA ? 
aaOffset(rng_y) * cam.pixelLength.y / 2.f : 0.f; + segment.ray.direction = glm::normalize(cam.view - cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f) - cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f) + - offsetX * cam.right - offsetY * cam.up ); + // Depth of field + // Implemented following: https://pathtracing.home.blog/depth-of-field/ + if (apertureSize > 0 && DOF) { + glm::vec3 focalPoint = segment.ray.direction * focalLength + segment.ray.origin; + + thrust::uniform_real_distribution u01(0, 1); + glm::vec3 blur = glm::vec3(u01(rng_x), u01(rng_y), u01(rng_z)); + blur -= .5; + + segment.ray.origin += blur * apertureSize; + segment.ray.direction = glm::normalize(focalPoint - segment.ray.origin); + } + segment.pixelIndex = index; segment.remainingBounces = traceDepth; } @@ -273,6 +334,72 @@ __global__ void shadeFakeMaterial( } } +__host__ __device__ glm::vec3 getPointOnGeom(const Geom& g, thrust::default_random_engine& rng) { + thrust::uniform_real_distribution u01(-1, 1); + glm::vec3 point = glm::vec3(u01(rng), u01(rng), u01(rng)); + + if (g.type == SPHERE) { // Normalize to get points on sphere + point = glm::normalize(point); + } + + return glm::vec3(g.transform * glm::vec4(point, 1.f)); +} + +__global__ void shadeBSDF( + int iter + , int depth + , int num_paths + , ShadeableIntersection* shadeableIntersections + , PathSegment* pathSegments + , Material* materials + , Geom* geoms + , int* lights_idx + , int num_lights +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_paths) { + return; + } + + ShadeableIntersection intersection = shadeableIntersections[idx]; + if (pathSegments[idx].remainingBounces == 1) { + thrust::uniform_int_distribution randLight(0, num_lights - 1); + } + if (intersection.t > 0.0f) { // if the intersection exists... 
+ thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, 0); + thrust::uniform_real_distribution u01(0, 1); + + Material material = materials[intersection.materialId]; + glm::vec3 materialColor = material.color; + if (pathSegments[idx].remainingBounces == 1) { + thrust::uniform_int_distribution randLight(0, num_lights - 1); + } + if (material.emittance > 0.0f) { //Is light + pathSegments[idx].color *= (materialColor * material.emittance); + pathSegments[idx].remainingBounces = 0; + } + else { + // Small offset + glm::vec3 intersect = (intersection.t - 1e-5f) * glm::normalize(pathSegments[idx].ray.direction) + pathSegments[idx].ray.origin; + scatterRay(pathSegments[idx], intersect, intersection.surfaceNormal, material, rng); + if (DIRECTLIGHTING && pathSegments[idx].remainingBounces == 1) { // Second to last depth, set to face light depth == 6 + thrust::uniform_int_distribution randLight(0, num_lights - 1); + int light = randLight(rng); + glm::vec3 randPoint = getPointOnGeom(geoms[lights_idx[light]], rng); + + // Looks like the light is around [-2.5, 2.5] X [9.5, 10.5] Y [-2.5, 2.5] + pathSegments[idx].ray.direction = glm::normalize(randPoint - pathSegments[idx].ray.origin); + } + } + } + else { //No intersection + pathSegments[idx].color = glm::vec3(0.f); + pathSegments[idx].remainingBounces = 0; + } +} + + // Add the current iteration's output to the overall image __global__ void finalGather(int nPaths, glm::vec3* image, PathSegment* iterationPaths) { @@ -285,6 +412,7 @@ __global__ void finalGather(int nPaths, glm::vec3* image, PathSegment* iteration } } + /** * Wrapper for the __global__ call that sets up the kernel calls and does a ton * of memory management @@ -302,6 +430,7 @@ void pathtrace(uchar4* pbo, int frame, int iter) { // 1D block for path tracing const int blockSize1d = 128; + int depth = 1; /////////////////////////////////////////////////////////////////////////// @@ -331,57 +460,120 @@ void pathtrace(uchar4* pbo, int frame, int iter) { // since some shaders you write may also cause a path to terminate. // * Finally, add this iteration's results to the image. This has been done // for you. 
+ PathSegment* dev_path_end = dev_paths + pixelcount; + int num_paths = dev_path_end - dev_paths; + dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d; - // TODO: perform one iteration of path tracing + if (AA == 0 && FIRSTBOUNCE && iter == 1) { //Generate Cache + generateRayFromCamera << > > (cam, iter, traceDepth, dev_paths_cache); + checkCUDAError("generate camera ray"); - generateRayFromCamera << > > (cam, iter, traceDepth, dev_paths); - checkCUDAError("generate camera ray"); + cudaMemset(dev_intersections_cache, 0, num_paths * sizeof(ShadeableIntersection)); - int depth = 0; - PathSegment* dev_path_end = dev_paths + pixelcount; - int num_paths = dev_path_end - dev_paths; + computeIntersections << > > (1, pixelcount, dev_paths_cache, dev_geoms, hst_scene->geoms.size(), dev_intersections_cache); + checkCUDAError("compute intersection"); + cudaDeviceSynchronize(); + + if (MATERIALSORT) { + thrust::sort_by_key(thrust::device, dev_intersections_cache, dev_intersections_cache + pixelcount, dev_paths_cache, sortMaterial()); + } + cudaDeviceSynchronize(); + + //cudaMemcpy(temp_paths_cache, dev_paths_cache, sizeof(PathSegment) * pixelcount, cudaMemcpyDeviceToHost); + //cudaMemcpy(temp_intersections_cache, dev_intersections_cache, sizeof(ShadeableIntersection) * pixelcount, cudaMemcpyDeviceToHost); + } + else { + generateRayFromCamera << > > (cam, iter, traceDepth, dev_paths); + checkCUDAError("generate camera ray"); + } // --- PathSegment Tracing Stage --- // Shoot ray into scene, bounce between objects, push shading chunks - bool iterationComplete = false; while (!iterationComplete) { + numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d; + if (depth > 1 || AA || FIRSTBOUNCE == 0) { // Dont use cache + // clean shading chunks + cudaMemset(dev_intersections, 0, num_paths * sizeof(ShadeableIntersection)); + + // tracing + computeIntersections << > > ( + depth + , num_paths + , dev_paths + , dev_geoms + , hst_scene->geoms.size() + , dev_intersections + ); + checkCUDAError("trace one bounce"); + cudaDeviceSynchronize(); + + // Sort intersections & paths together + if (MATERIALSORT) { + thrust::sort_by_key(thrust::device, dev_intersections, dev_intersections + num_paths, dev_paths, sortMaterial()); + } + cudaDeviceSynchronize(); + } + else { // Use cache + cudaMemcpy(dev_paths, dev_paths_cache, sizeof(PathSegment) * pixelcount, cudaMemcpyDeviceToDevice); + cudaMemcpy(dev_intersections, dev_intersections_cache, sizeof(ShadeableIntersection) * pixelcount, cudaMemcpyDeviceToDevice); + cudaDeviceSynchronize(); + } - // clean shading chunks - cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection)); - - // tracing - dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d; - computeIntersections << > > ( - depth - , num_paths - , dev_paths - , dev_geoms - , hst_scene->geoms.size() - , dev_intersections + shadeBSDF << > > ( + iter, + depth, + num_paths, + dev_intersections, + dev_paths, + dev_materials, + dev_geoms, + dev_lights, + hst_scene->lights.size() ); - checkCUDAError("trace one bounce"); + cudaDeviceSynchronize(); + depth++; // TODO: // --- Shading Stage --- // Shade path segments based on intersections and generate new rays by - // evaluating the BSDF. - // Start off with just a big kernel that handles all the different - // materials you have in the scenefile. 
- // TODO: compare between directly shading the path segments and shading - // path segments that have been reshuffled to be contiguous in memory. - - shadeFakeMaterial << > > ( - iter, - num_paths, - dev_intersections, - dev_paths, - dev_materials - ); - iterationComplete = true; // TODO: should be based off stream compaction results. - + // evaluating the BSDF. + // Start off with just a big kernel that handles all the different + // materials you have in the scenefile. + // TODO: compare between directly shading the path segments and shading + // path segments that have been reshuffled to be contiguous in memory. + + // DEBUG STREAM COMPACT + //cudaMemcpy(temp_paths, dev_paths, num_paths * sizeof(PathSegment), cudaMemcpyDeviceToHost); + //cout << "BEFORE COMPACTION: " << num_paths << endl; + //for (int i = 0; i < num_paths; ++i) { + // cout << "ID: " << temp_paths[i].pixelIndex << ", NUM_BOUNCES: " << temp_paths[i].remainingBounces << endl; + //} + + // Thrust stream compact + dev_path_end = thrust::partition(thrust::device, dev_paths, dev_paths + num_paths, isNonZero()); + num_paths = dev_path_end - dev_paths; + + // My stream compact + //num_paths = StreamCompaction::Efficient::compact(num_paths, dev_paths_pingpong, dev_paths); + //dev_path_end = dev_paths + num_paths; + //PathSegment* temp = dev_paths; + //dev_paths = dev_paths_pingpong; + //dev_paths_pingpong = temp; + + // DEBUG STREAM COMPACT + //cudaMemcpy(temp_paths, dev_paths, num_paths * sizeof(PathSegment), cudaMemcpyDeviceToHost); + //cout << "AFTER COMPACTION: " << num_paths << endl; + //for (int i = 0; i < num_paths; ++i) { + // cout << "ID: " << temp_paths[i].pixelIndex << ", NUM_BOUNCES: " << temp_paths[i].remainingBounces << endl; + //} + + + if (depth > traceDepth || dev_path_end == dev_paths) { + iterationComplete = true; + } if (guiData != NULL) { guiData->TracedDepth = depth; @@ -390,7 +582,7 @@ void pathtrace(uchar4* pbo, int frame, int iter) { // Assemble this iteration and apply it to the image dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d; - finalGather << > > (num_paths, dev_image, dev_paths); + finalGather << > > (pixelcount, dev_image, dev_paths); /////////////////////////////////////////////////////////////////////////// diff --git a/src/scene.cpp b/src/scene.cpp index 3fb6239a..224ae7ca 100644 --- a/src/scene.cpp +++ b/src/scene.cpp @@ -30,6 +30,8 @@ Scene::Scene(string filename) { } } } + cout << "Populating Lights" << endl; + getLights(); } int Scene::loadGeom(string objectid) { @@ -186,3 +188,12 @@ int Scene::loadMaterial(string materialid) { return 1; } } + +void Scene::getLights() { + for (int i = 0; i < geoms.size(); ++i) { + Geom g = geoms[i]; + if (materials[g.materialid].emittance > 0) { + lights.push_back(i); + } + } +} \ No newline at end of file diff --git a/src/scene.h b/src/scene.h index f29a9171..726fca67 100644 --- a/src/scene.h +++ b/src/scene.h @@ -16,11 +16,13 @@ class Scene { int loadMaterial(string materialid); int loadGeom(string objectid); int loadCamera(); + void getLights(); public: Scene(string filename); ~Scene(); std::vector geoms; + std::vector lights; std::vector materials; RenderState state; }; diff --git a/src/sceneStructs.h b/src/sceneStructs.h index da4dbf30..594596ce 100644 --- a/src/sceneStructs.h +++ b/src/sceneStructs.h @@ -74,3 +74,17 @@ struct ShadeableIntersection { glm::vec3 surfaceNormal; int materialId; }; + +// For stream compaction +struct isNonZero { + __host__ __device__ bool operator()(const PathSegment& path) { + return 
path.remainingBounces > 0; + } +}; + +// For material sorting +struct sortMaterial { + __host__ __device__ bool operator()(const ShadeableIntersection& a, const ShadeableIntersection& b) { + return a.materialId < b.materialId; + } +}; \ No newline at end of file diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 4538f04e..aff96638 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -1,6 +1,18 @@ -set(SOURCE_FILES +cmake_minimum_required(VERSION 3.1) +set(headers + "common.h" + "efficient.h" ) -cuda_add_library(stream_compaction - ${SOURCE_FILES} +set(sources + "common.cu" + "efficient.cu" ) + +list(SORT headers) +list(SORT sources) + +source_group(Headers FILES ${headers}) +source_group(Sources FILES ${sources}) + +cuda_add_library(stream_compaction ${sources} ${headers}) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu new file mode 100644 index 00000000..321cc61a --- /dev/null +++ b/stream_compaction/common.cu @@ -0,0 +1,80 @@ +#include "common.h" + + +//void checkCUDAErrorFn(const char *msg, const char *file, int line) { +// cudaError_t err = cudaGetLastError(); +// if (cudaSuccess == err) { +// return; +// } +// +// fprintf(stderr, "CUDA error"); +// if (file) { +// fprintf(stderr, " (%s:%d)", file, line); +// } +// fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); +// exit(EXIT_FAILURE); +//} + +namespace StreamCompaction { + namespace Common { + + /** + * Maps an array to an array of 0s and 1s for stream compaction. Elements + * which map to 0 will be removed, and elements which map to 1 will be kept. + */ + __global__ void kernMapToBoolean(int n, int *bools, const int*idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + bools[index] = idata[index] == 0 ? 0 : 1; + } + + /** + * Performs scatter on an array. That is, for each element in idata, + * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. + */ + __global__ void kernScatter(int n, int *odata, + const int*idata, const int *bools, const int *indices) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n || bools[index] == 0) { + return; + } + odata[indices[index]] = idata[index]; + } + + + /** + * Maps an array to an array of 0s and 1s for stream compaction. Elements + * which map to 0 will be removed, and elements which map to 1 will be kept. + */ + __global__ void pathtrace_kernMapToBoolean(int n, int* bools, const PathSegment* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + bools[index] = idata[index].remainingBounces == 0 ? 0 : 1; + } + + /** + * Performs scatter on an array. That is, for each element in idata, + * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. 
+ */ + __global__ void pathtrace_kernScatter(int n, PathSegment* odata, + const PathSegment* idata, const int* bools, const int* indices) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n || bools[index] == 0) { + return; + } + //odata[indices[index]].color = idata[index].color; + //odata[indices[index]].pixelIndex = idata[index].pixelIndex; + //odata[indices[index]].remainingBounces = idata[index].remainingBounces; + //odata[indices[index]].ray.direction = idata[index].ray.direction; + //odata[indices[index]].ray.origin = idata[index].ray.origin; + memcpy(odata + indices[index], idata + index, sizeof(PathSegment)); + //odata[indices[index]] = idata[index]; + } + } +} diff --git a/stream_compaction/common.h b/stream_compaction/common.h new file mode 100644 index 00000000..8ecc3260 --- /dev/null +++ b/stream_compaction/common.h @@ -0,0 +1,142 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include "device_launch_parameters.h" +#include "../src/sceneStructs.h" + +#define blockSize 256 + +#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) +//#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) + +/** + * Check for CUDA errors; print and exit if there was a problem. + */ +//void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1); + +inline int ilog2(int x) { + int lg = 0; + while (x >>= 1) { + ++lg; + } + return lg; +} + +inline int ilog2ceil(int x) { + return x == 1 ? 0 : ilog2(x - 1) + 1; +} + +namespace StreamCompaction { + namespace Common { + + __global__ void kernMapToBoolean(int n, int* bools, const int* idata); + + __global__ void kernScatter(int n, int* odata, + const int* idata, const int* bools, const int* indices); + + __global__ void pathtrace_kernMapToBoolean(int n, int *bools, const PathSegment*idata); + + __global__ void pathtrace_kernScatter(int n, PathSegment*odata, + const PathSegment*idata, const int *bools, const int *indices); + + /** + * This class is used for timing the performance + * Uncopyable and unmovable + * + * Adapted from WindyDarian(https://github.com/WindyDarian) + */ + class PerformanceTimer + { + public: + PerformanceTimer() + { + cudaEventCreate(&event_start); + cudaEventCreate(&event_end); + } + + ~PerformanceTimer() + { + cudaEventDestroy(event_start); + cudaEventDestroy(event_end); + } + + void startCpuTimer() + { + if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); } + cpu_timer_started = true; + + time_start_cpu = std::chrono::high_resolution_clock::now(); + } + + void endCpuTimer() + { + time_end_cpu = std::chrono::high_resolution_clock::now(); + + if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); } + + std::chrono::duration duro = time_end_cpu - time_start_cpu; + prev_elapsed_time_cpu_milliseconds = + static_cast(duro.count()); + + cpu_timer_started = false; + } + + void startGpuTimer() + { + if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); } + gpu_timer_started = true; + + cudaEventRecord(event_start); + } + + void endGpuTimer() + { + cudaEventRecord(event_end); + cudaEventSynchronize(event_end); + + if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); } + + cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end); + gpu_timer_started = false; + } + + float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015 + { + return 
prev_elapsed_time_cpu_milliseconds; + } + + float getGpuElapsedTimeForPreviousOperation() //noexcept + { + return prev_elapsed_time_gpu_milliseconds; + } + + // remove copy and move functions + PerformanceTimer(const PerformanceTimer&) = delete; + PerformanceTimer(PerformanceTimer&&) = delete; + PerformanceTimer& operator=(const PerformanceTimer&) = delete; + PerformanceTimer& operator=(PerformanceTimer&&) = delete; + + private: + cudaEvent_t event_start = nullptr; + cudaEvent_t event_end = nullptr; + + using time_point_t = std::chrono::high_resolution_clock::time_point; + time_point_t time_start_cpu; + time_point_t time_end_cpu; + + bool cpu_timer_started = false; + bool gpu_timer_started = false; + + float prev_elapsed_time_cpu_milliseconds = 0.f; + float prev_elapsed_time_gpu_milliseconds = 0.f; + }; + } +} diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu new file mode 100644 index 00000000..ffdfe06c --- /dev/null +++ b/stream_compaction/efficient.cu @@ -0,0 +1,309 @@ +#include +#include +#include "common.h" +#include "efficient.h" + +namespace StreamCompaction { + namespace Efficient { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + __global__ void kernUpSweep(int n, int d, int* idata) { + // Parallel Reduction + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + int k = index * (1 << (d + 1)); + idata[k + (1 << (d + 1)) - 1] += idata[k + (1 << d) - 1]; + } + + __global__ void kernDownSweep(int n, int d, int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + int k = index * (1 << (d + 1)); + int t = idata[k + (1 << d) - 1]; + idata[k + (1 << d) - 1] = idata[k + (1 << (d + 1)) - 1]; + idata[k + (1 << (d + 1)) - 1] += t; + } + + __host__ __device__ int copyIlog2(int x) { //copied the given functions bc i am lazy + int lg = 0; + while (x >>= 1) { + ++lg; + } + return lg; + } + + __host__ __device__ int copyIlog2ceil(int x) { + return x == 1 ? 0 : copyIlog2(x - 1) + 1; + } + + + // Steps for shared scan + // 1. Launch kernel with N / (blockSize * 2) blocks, blockSize threads per blockSize + // 2. For each block generate a shared mem size 2 * blockSize + // 3. Load values from input array in pairs to shared mem + // 4. Do same indexing upsweep scheme as before on individual blocks + // 5. Get INCLUSIVE element for each block endand add to temp array + // 6. Zero out last element of each block, like with root zeroing + // 7. Down sweep on individual blocks + // OR DO REST ON CPU :) + // 8. Pass the temp array as the new input array and recurse steps 1 - 7 + // 9. 
Recursively add the output of the temp array as an offset to each block + __global__ void kernSharedScan(int n, int* idata, int* temp) { + // Parallel Reduction w/ shared memory + // Shared memory should be 2 * blockSize + __shared__ int partialSum[2 * blockSize]; + // Load input memory into shared memory in pairs + int index = threadIdx.x + (blockIdx.x * blockDim.x); //index of all launched threads (N / 2) + int sharedIdx = threadIdx.x; // per block index + partialSum[sharedIdx * 2] = idata[index * 2]; + partialSum[sharedIdx * 2 + 1] = idata[index * 2 + 1]; + // Per block upsweep + int logBlock = copyIlog2ceil(blockDim.x * 2); //blockSize * 2 since we are doing blockSize*2 elements per block + for (int d = 0; d < logBlock; ++d) { // Runs log2(blockSize) times + __syncthreads(); + if (sharedIdx < (blockDim.x / (1 << d))) { + int k = sharedIdx * (1 << (d + 1)); + partialSum[k + (1 << (d + 1)) - 1] += partialSum[k + (1 << d) - 1]; + } + } + __syncthreads(); + // Save last INCLUSIVE VALUE of block (for recursion and offset) + // Zero out root + if (sharedIdx == blockDim.x - 1) { // Last thread in block + temp[blockIdx.x] = partialSum[2 * blockDim.x - 1]; //+ idata[(2 * blockDim.x - 1) + blockIdx.x * blockDim.x * 2]; // Last element in shared mem + last element of block in idata (last inclusive element) + partialSum[2 * blockDim.x - 1] = 0; + } + __syncthreads(); + // Per block downsweep + for (int d = logBlock - 1; d >= 0; --d) { + if (sharedIdx < (blockDim.x / (1 << d))) { + int k = sharedIdx * (1 << (d + 1)); + int t = partialSum[k + (1 << d) - 1]; + partialSum[k + (1 << d) - 1] = partialSum[k + (1 << (d + 1)) - 1]; + partialSum[k + (1 << (d + 1)) - 1] += t; + } + } + __syncthreads(); + //Write to input array in place + idata[index * 2] = partialSum[sharedIdx * 2]; + idata[index * 2 + 1] = partialSum[sharedIdx * 2 + 1]; + } + + // Function to add offset buffer to each block + // ex. 
offset = [10, 20, 30], add 10 to block 0, add 20 to block 1, add 30 to block 2 + __global__ void addOffsets(int n, int* idata, int* offsets) { + // n is num elements in idata + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index > n) { + return; + } + idata[index] += offsets[(int)index / (blockDim.x * 2)]; + } + + void scan(int n, int* odata, const int* idata) { + int paddedN = (1 << ilog2ceil(n)); + int* dev_idata; + cudaMalloc((void**)&dev_idata, paddedN * sizeof(int)); + //checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyDeviceToDevice); + cudaMemset(dev_idata + n, 0, (paddedN - n) * sizeof(int)); + //cudaDeviceSynchronize(); + + //timer().startGpuTimer(); + //Determine size of temp array after 1 pass + int tempSize = n / (blockSize * 2); + dim3 gridSize(tempSize); + int* dev_temp; + int* temp = (int*) malloc(tempSize * sizeof(int)); + cudaMalloc((void**)& dev_temp, tempSize * sizeof(int)); + cudaDeviceSynchronize(); + kernSharedScan << > > (paddedN, dev_idata, dev_temp); + //checkCUDAError("kernSharedScan failed!"); + cudaDeviceSynchronize(); + cudaMemcpy(temp, dev_temp, tempSize * sizeof(int), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + int prev = 0; + for (int i = 0; i < tempSize; ++i) { // In-place CPU exclusive scan + int tempVal = temp[i]; + temp[i] = prev; + prev += tempVal; + } + + cudaMemcpy(dev_temp, temp, tempSize * sizeof(int), cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + dim3 offsetGridSize(paddedN / blockSize); + addOffsets << > > (paddedN, dev_idata, dev_temp); + //checkCUDAError("addOffsets failed!"); + cudaDeviceSynchronize(); + //timer().endGpuTimer(); + + cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_idata); + cudaFree(dev_temp); + free(temp); + } + + __global__ void kernZeroRoot(int n, int* idata) { + // Root is last element + idata[n - 1] = 0; + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ + void oldScan(int n, int *odata, const int *idata) { + // Account for non-powers of 2 by padding by 0 + int paddedN = (1 << ilog2ceil(n)); + int* dev_idata; + cudaMalloc((void**)&dev_idata, paddedN * sizeof(int)); + //checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemset(dev_idata + n, 0, (paddedN - n) * sizeof(int)); + cudaDeviceSynchronize(); + + timer().startGpuTimer(); + // Upsweep + for (int i = 0; i < ilog2ceil(n); ++i) { + int numThreads = paddedN / (1 << (i + 1)); + dim3 upSweepGridSize((numThreads + blockSize - 1) / blockSize); + kernUpSweep << > > + (numThreads, i, dev_idata); + //checkCUDAError("kernUpSweep failed!"); + cudaDeviceSynchronize(); + } + + // Downsweep + kernZeroRoot << <1, 1 >> > (paddedN, dev_idata); + for (int i = ilog2ceil(n) - 1; i >= 0; --i) { + int numThreads = paddedN / (1 << (i + 1)); + dim3 downSweepGridSize((numThreads + blockSize - 1) / blockSize); + kernDownSweep << > > + (numThreads, i, dev_idata); + //checkCUDAError("kernDownSweep failed!"); + cudaDeviceSynchronize(); + } + timer().endGpuTimer(); + cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_idata); + } + + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. 
+ * @returns The number of elements remaining after compaction. + */ + int compact(int n, PathSegment* dev_odata, const PathSegment* dev_idata) { + // Account for non-powers of 2 by padding by 0 + int paddedN = (1 << ilog2ceil(n)); + //int* dev_idata; + //int* dev_odata; + int* dev_bool; + int* dev_indices; + + //cudaMalloc((void**)&dev_idata, n * sizeof(PathSegment)); + //checkCUDAError("cudaMalloc dev_idata failed!"); + //cudaMemcpy(dev_idata, idata, n * sizeof(PathSegment), cudaMemcpyHostToDevice); + + // Pad bool array instead of idata to save operations in kernMapToBoolean + cudaMalloc((void**)&dev_bool, paddedN * sizeof(int)); + //checkCUDAError("cudaMalloc dev_bool failed!"); + cudaMemset(dev_bool + n, 0, (paddedN - n) * sizeof(int)); + + cudaMalloc((void**)&dev_indices, paddedN * sizeof(int)); + //checkCUDAError("cudaMalloc dev_indices failed!"); + //cudaMalloc((void**)&dev_odata, n * sizeof(int)); + //checkCUDAError("cudaMalloc dev_indices failed!"); + cudaDeviceSynchronize(); + + + //Determine size of temp array after 1 pass + int tempSize = paddedN / (blockSize * 2); + dim3 gridSize(tempSize); + int* dev_temp; + int* temp = (int*)malloc(tempSize * sizeof(int)); + cudaMalloc((void**)&dev_temp, tempSize * sizeof(int)); + //checkCUDAError("cudaMalloc dev_temp failed!"); + + //timer().startGpuTimer(); + // Binarize + dim3 nGridSize((n + blockSize - 1) / blockSize); + StreamCompaction::Common::pathtrace_kernMapToBoolean << < nGridSize, blockSize >> > + (n, dev_bool, dev_idata); + //checkCUDAError("kernMapToBoolean failed!"); + cudaDeviceSynchronize(); + // We need bool array for scatter so copy bool result to indices to be modified in place + cudaMemcpy(dev_indices, dev_bool, paddedN * sizeof(int), cudaMemcpyDeviceToDevice); + //checkCUDAError("cudaMemcpy failed!"); + cudaDeviceSynchronize(); + + // Shared scan copied from above + kernSharedScan << > > (paddedN, dev_indices, dev_temp); + //checkCUDAError("kernSharedScan failed!"); + cudaDeviceSynchronize(); + cudaMemcpy(temp, dev_temp, tempSize * sizeof(int), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + int prev = 0; + for (int i = 0; i < tempSize; ++i) { // In-place CPU exclusive scan + int tempVal = temp[i]; + temp[i] = prev; + prev += tempVal; + } + cudaMemcpy(dev_temp, temp, tempSize * sizeof(int), cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + dim3 offsetGridSize(paddedN / blockSize); + addOffsets << > > (paddedN, dev_indices, dev_temp); + //checkCUDAError("addOffsets failed!"); + cudaDeviceSynchronize(); + //// Copied Scan code from above + //// Upsweep + //for (int i = 0; i < ilog2ceil(n); ++i) { + // int numThreads = paddedN / (1 << (i + 1)); + // dim3 upSweepGridSize((numThreads + blockSize - 1) / blockSize); + // kernUpSweep << > > + // (numThreads, i, dev_indices); + // checkCUDAError("kernUpSweep failed!"); + // cudaDeviceSynchronize(); + //} + + // Downsweep + //kernZeroRoot << <1, 1 >> > (paddedN, dev_indices); + //for (int i = ilog2ceil(n) - 1; i >= 0; --i) { + // int numThreads = paddedN / (1 << (i + 1)); + // dim3 downSweepGridSize((numThreads + blockSize - 1) / blockSize); + // kernDownSweep << > > + // (numThreads, i, dev_indices); + // checkCUDAError("kernDownSweep failed!"); + // cudaDeviceSynchronize(); + //} + + // Scatter + StreamCompaction::Common::pathtrace_kernScatter << > > + (n, dev_odata, dev_idata, dev_bool, dev_indices); + //checkCUDAError("kernScatter failed!"); + cudaDeviceSynchronize(); + //timer().endGpuTimer(); + int* finalNum = (int*) malloc(sizeof(int)); + cudaMemcpy(finalNum, 
dev_indices + paddedN - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            // The exclusive scan's last entry excludes the final flag, so add it back in
+            // (it is only nonzero when n is already a power of two and needed no padding).
+            int lastBool = 0;
+            cudaMemcpy(&lastBool, dev_bool + paddedN - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            //cudaMemcpy(odata, dev_odata, finalNum * sizeof(int), cudaMemcpyDeviceToHost);
+            //cudaFree(dev_idata);
+            cudaFree(dev_bool);
+            cudaFree(dev_indices);
+            //cudaFree(dev_odata);
+            cudaFree(dev_temp);
+            free(temp);
+            int remaining = finalNum[0] + lastBool;
+            free(finalNum); // release the host-side count buffer
+            return remaining;
+        }
+    }
+}
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
new file mode 100644
index 00000000..5b1d5f2c
--- /dev/null
+++ b/stream_compaction/efficient.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "common.h"
+#include "../src/sceneStructs.h"
+
+namespace StreamCompaction {
+    namespace Efficient {
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+        void scan(int n, int *odata, const int *idata);
+
+        int compact(int n, PathSegment* odata, const PathSegment* idata);
+    }
+}