SegmentLinking
diff --git a/‎README.md
Lines changed: 3 additions & 2 deletions b/‎README.md
Lines changed: 3 additions & 2 deletions
diff --git a/‎SDL/Event.cu
Lines changed: 15 additions & 7 deletions b/‎SDL/Event.cu
Lines changed: 15 additions & 7 deletions
diff --git a/‎SDL/Module.cu
Lines changed: 12 additions & 0 deletions b/‎SDL/Module.cu
Lines changed: 12 additions & 0 deletions
diff --git a/‎SDL/Module.cuh
Lines changed: 3 additions & 0 deletions b/‎SDL/Module.cuh
Lines changed: 3 additions & 0 deletions
diff --git a/‎SDL/Quintuplet.cu
Lines changed: 17 additions & 17 deletions b/‎SDL/Quintuplet.cu
Lines changed: 17 additions & 17 deletions
diff --git a/‎SDL/Quintuplet.cuh
Lines changed: 1 addition & 2 deletions b/‎SDL/Quintuplet.cuh
Lines changed: 1 addition & 2 deletions
@@ -102,8 +102,9 @@ Comparing two different runs
     lst_plot_performance.py \
         num_den_hist_1.root \     # Reference
         num_den_hist_2.root \     # New work
-        -l BaseLine,MyNewWork \   # Labeling
-        -t "mywork"
+        -L BaseLine,MyNewWork \   # Labeling
+        -t "mywork" \
+        --compare
 
 ## CMSSW Integration
 This is a complete set of instructions on how the TrackLooper code
 
@@ -687,7 +687,6 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
     if(mdsInGPU == nullptr)
     {
         mdsInGPU = (SDL::miniDoublets*)cms::cuda::allocate_host(sizeof(SDL::miniDoublets), stream);
-        //hardcoded range numbers for this will come from studies!
         unsigned int nTotalMDs;
         createMDArrayRanges(*modulesInGPU, *rangesInGPU, nLowerModules, nTotalMDs, stream, N_MAX_PIXEL_MD_PER_MODULES);
     	createMDsInExplicitMemory(*mdsInGPU, nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES,stream);
@@ -702,7 +701,13 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
         //hardcoded range numbers for this will come from studies!
         // Possible optimization: pixel segments and outer-tracker segments are not distinguished (both are called "segments"), so they share one continuous index range.
         // To study the memory footprint in more detail, the two could be separated and given their own memory allocations.
-        createSegmentArrayRanges(*modulesInGPU, *rangesInGPU, *mdsInGPU, nLowerModules, nTotalSegments, stream, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
+        unsigned int *device_nTotalSegments;
+        cudaMalloc((void **)&device_nTotalSegments, sizeof(unsigned int));
+        createSegmentArrayRanges<<<1,1024,0,stream>>>(*modulesInGPU, *rangesInGPU, *mdsInGPU, device_nTotalSegments);
+        cudaMemcpyAsync(&nTotalSegments,device_nTotalSegments,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
+        cudaStreamSynchronize(stream);
+        nTotalSegments += N_MAX_PIXEL_SEGMENTS_PER_MODULE;
+        cudaFree(device_nTotalSegments);
         createSegmentsInExplicitMemory(*segmentsInGPU, nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE,stream);
 
         cudaMemcpyAsync(segmentsInGPU->nMemoryLocations, &nTotalSegments, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);;
@@ -1101,8 +1106,12 @@ void SDL::Event::createTriplets()
     {
         tripletsInGPU = (SDL::triplets*)cms::cuda::allocate_host(sizeof(SDL::triplets), stream);
         unsigned int maxTriplets;
-        createTripletArrayRanges(*modulesInGPU, *rangesInGPU, *segmentsInGPU, nLowerModules, maxTriplets, stream);
-//        cout<<"nTotalTriplets: "<<maxTriplets<<std::endl; // for memory usage
+        unsigned int *device_maxTriplets;
+        cudaMalloc((void **)&device_maxTriplets, sizeof(unsigned int));
+        createTripletArrayRanges<<<1,1024,0,stream>>>(*modulesInGPU, *rangesInGPU, *segmentsInGPU, device_maxTriplets);
+        cudaMemcpyAsync(&maxTriplets,device_maxTriplets,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
+        cudaStreamSynchronize(stream);
+        cudaFree(device_maxTriplets);
         createTripletsInExplicitMemory(*tripletsInGPU, maxTriplets, nLowerModules,stream);
 
         cudaMemcpyAsync(tripletsInGPU->nMemoryLocations, &maxTriplets, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
@@ -1377,12 +1386,11 @@ cudaStreamSynchronize(stream);
     unsigned int nTotalQuintuplets;
     unsigned int *device_nTotalQuintuplets;
     cudaMalloc((void **)&device_nTotalQuintuplets, sizeof(unsigned int));
-    createEligibleModulesListForQuintupletsGPU<<<1,1024,0,stream>>>(*modulesInGPU, *tripletsInGPU, device_nTotalQuintuplets, stream, *rangesInGPU);
-cudaStreamSynchronize(stream);
+    createEligibleModulesListForQuintupletsGPU<<<1,1024,0,stream>>>(*modulesInGPU, *tripletsInGPU, device_nTotalQuintuplets, *rangesInGPU);
     cudaMemcpyAsync(&nEligibleT5Modules,rangesInGPU->nEligibleT5Modules,sizeof(uint16_t),cudaMemcpyDeviceToHost,stream);
     cudaMemcpyAsync(&nTotalQuintuplets,device_nTotalQuintuplets,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
-    cudaFree(device_nTotalQuintuplets);
 cudaStreamSynchronize(stream);
+    cudaFree(device_nTotalQuintuplets);
 
     if(quintupletsInGPU == nullptr)
     {
 
@@ -29,9 +29,12 @@ void SDL::createRangesInExplicitMemory(struct objectRanges& rangesInGPU,unsigned
     rangesInGPU.nEligibleT5Modules =    (uint16_t*)cms::cuda::allocate_device(dev,sizeof(unsigned int),stream);
 
     rangesInGPU.quintupletModuleIndices = (int*)cms::cuda::allocate_device(dev,nLowerModules * sizeof(int),stream);
+    rangesInGPU.quintupletModuleOccupancy = (int*)cms::cuda::allocate_device(dev,nLowerModules * sizeof(int),stream);
     rangesInGPU.miniDoubletModuleIndices = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) * sizeof(int), stream);
     rangesInGPU.segmentModuleIndices = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) * sizeof(int), stream);
+    rangesInGPU.segmentModuleOccupancy = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) * sizeof(int), stream);
     rangesInGPU.tripletModuleIndices = (int*)cms::cuda::allocate_device(dev, nLowerModules * sizeof(int), stream);
+    rangesInGPU.tripletModuleOccupancy = (int*)cms::cuda::allocate_device(dev, nLowerModules * sizeof(int), stream);
 
 #else
     cudaMalloc(&rangesInGPU.hitRanges,nModules * 2 * sizeof(int));
@@ -47,10 +50,13 @@ void SDL::createRangesInExplicitMemory(struct objectRanges& rangesInGPU,unsigned
     cudaMalloc(&rangesInGPU.quintupletRanges, nModules * 2 * sizeof(int));
     cudaMalloc(&rangesInGPU.nEligibleT5Modules, sizeof(uint16_t));
     cudaMalloc(&rangesInGPU.quintupletModuleIndices, nLowerModules * sizeof(int));
+    cudaMalloc(&rangesInGPU.quintupletModuleOccupancy, nLowerModules * sizeof(int));
 
     cudaMalloc(&rangesInGPU.miniDoubletModuleIndices, (nLowerModules + 1) * sizeof(int));
     cudaMalloc(&rangesInGPU.segmentModuleIndices, (nLowerModules + 1) * sizeof(int));
+    cudaMalloc(&rangesInGPU.segmentModuleOccupancy, (nLowerModules + 1) * sizeof(int));
     cudaMalloc(&rangesInGPU.tripletModuleIndices, nLowerModules * sizeof(int));
+    cudaMalloc(&rangesInGPU.tripletModuleOccupancy, nLowerModules * sizeof(int));
 
 #endif
 }
@@ -101,13 +107,16 @@ void SDL::objectRanges::freeMemoryCache()//struct objectRanges& rangesInGPU)
   cms::cuda::free_device(dev,nEligibleT5Modules);
   cms::cuda::free_device(dev, indicesOfEligibleT5Modules);
   cms::cuda::free_device(dev,quintupletModuleIndices);
+  cms::cuda::free_device(dev,quintupletModuleOccupancy);
   cms::cuda::free_device(dev, hitRangesLower);
   cms::cuda::free_device(dev, hitRangesUpper);
   cms::cuda::free_device(dev, hitRangesnLower);
   cms::cuda::free_device(dev, hitRangesnUpper);
   cms::cuda::free_device(dev, miniDoubletModuleIndices);
   cms::cuda::free_device(dev, segmentModuleIndices);
+  cms::cuda::free_device(dev, segmentModuleOccupancy);
   cms::cuda::free_device(dev, tripletModuleIndices);
+  cms::cuda::free_device(dev, tripletModuleOccupancy);
 }
 void SDL::objectRanges::freeMemory()
 {
@@ -125,9 +134,12 @@ void SDL::objectRanges::freeMemory()
   cudaFree(nEligibleT5Modules);
   cudaFree(indicesOfEligibleT5Modules);
   cudaFree(quintupletModuleIndices);
+  cudaFree(quintupletModuleOccupancy);
   cudaFree(miniDoubletModuleIndices);
   cudaFree(segmentModuleIndices);
+  cudaFree(segmentModuleOccupancy);
   cudaFree(tripletModuleIndices);
+  cudaFree(tripletModuleOccupancy);
 }
 void SDL::freeModulesCache(struct modules& modulesInGPU,struct pixelMap& pixelMapping)
 {
 
@@ -65,9 +65,12 @@ namespace SDL
         uint16_t* indicesOfEligibleT5Modules;// will be allocated in createQuintuplets kernel!!!!
         //to store different starting points for variable occupancy stuff
         int *quintupletModuleIndices;
+        int *quintupletModuleOccupancy;
         int *miniDoubletModuleIndices;
         int *segmentModuleIndices;
+        int *segmentModuleOccupancy;
         int *tripletModuleIndices;
+        int *tripletModuleOccupancy;
 
 //        unsigned int nTotalQuintuplets;
 
 
@@ -83,7 +83,7 @@ void SDL::quintuplets::freeMemory(cudaStream_t stream)
     cudaStreamSynchronize(stream);
 }
 //TODO:Reuse the track candidate one instead of this!
-__global__ void SDL::createEligibleModulesListForQuintupletsGPU(struct modules& modulesInGPU,struct triplets& tripletsInGPU, unsigned int* device_nTotalQuintuplets, cudaStream_t stream,struct objectRanges& rangesInGPU)
+__global__ void SDL::createEligibleModulesListForQuintupletsGPU(struct modules& modulesInGPU,struct triplets& tripletsInGPU, unsigned int* device_nTotalQuintuplets, struct objectRanges& rangesInGPU)
 {
     __shared__ int nEligibleT5Modulesx;
     __shared__ unsigned int nTotalQuintupletsx;
@@ -113,28 +113,28 @@ __global__ void SDL::createEligibleModulesListForQuintupletsGPU(struct modules&
         if (subdets == SDL::Endcap and layers > 1) continue;
 
         int nEligibleT5Modules = atomicAdd(&nEligibleT5Modulesx,1);
-        if (nEligibleT5Modules < 0) printf("%u\n",nEligibleT5Modules);
         if (layers<=3 && subdets==5) category_number = 0;
-        if (layers>=4 && subdets==5) category_number = 1;
-        if (layers<=2 && subdets==4 && rings>=11) category_number = 2;
-        if (layers>=3 && subdets==4 && rings>=8) category_number = 2;
-        if (layers<=2 && subdets==4 && rings<=10) category_number = 3;
-        if (layers>=3 && subdets==4 && rings<=7) category_number = 3;
+        else if (layers>=4 && subdets==5) category_number = 1;
+        else if (layers<=2 && subdets==4 && rings>=11) category_number = 2;
+        else if (layers>=3 && subdets==4 && rings>=8) category_number = 2;
+        else if (layers<=2 && subdets==4 && rings<=10) category_number = 3;
+        else if (layers>=3 && subdets==4 && rings<=7) category_number = 3;
         if (abs(eta)<0.75) eta_number=0;
-        if (abs(eta)>0.75 && abs(eta)<1.5) eta_number=1;
-        if (abs(eta)>1.5 && abs(eta)<2.25) eta_number=2;
-        if (abs(eta)>2.25 && abs(eta)<3) eta_number=3;
+        else if (abs(eta)>0.75 && abs(eta)<1.5) eta_number=1;
+        else if (abs(eta)>1.5 && abs(eta)<2.25) eta_number=2;
+        else if (abs(eta)>2.25 && abs(eta)<3) eta_number=3;
 
         if (category_number == 0 && eta_number == 0) occupancy = 336;
-        if (category_number == 0 && eta_number == 1) occupancy = 414;
-        if (category_number == 0 && eta_number == 2) occupancy = 231;
-        if (category_number == 0 && eta_number == 3) occupancy = 146;
-        if (category_number == 3 && eta_number == 1) occupancy = 0;
-        if (category_number == 3 && eta_number == 2) occupancy = 191;
-        if (category_number == 3 && eta_number == 3) occupancy = 106;
+        else if (category_number == 0 && eta_number == 1) occupancy = 414;
+        else if (category_number == 0 && eta_number == 2) occupancy = 231;
+        else if (category_number == 0 && eta_number == 3) occupancy = 146;
+        else if (category_number == 3 && eta_number == 1) occupancy = 0;
+        else if (category_number == 3 && eta_number == 2) occupancy = 191;
+        else if (category_number == 3 && eta_number == 3) occupancy = 106;
 
         unsigned int nTotQ = atomicAdd(&nTotalQuintupletsx,occupancy);
         rangesInGPU.quintupletModuleIndices[i] = nTotQ;
+        rangesInGPU.quintupletModuleOccupancy[i] = occupancy;
         rangesInGPU.indicesOfEligibleT5Modules[nEligibleT5Modules] = i;
     }
     __syncthreads();
@@ -1301,7 +1301,7 @@ __global__ void SDL::createQuintupletsInGPUv2(struct SDL::modules& modulesInGPU,
                     return;
                 } // ignore anything else TODO: move this to start, before object is made (faster)
                 unsigned int totOccupancyQuintuplets = atomicAdd(&quintupletsInGPU.totOccupancyQuintuplets[lowerModule1], 1);
-                if(totOccupancyQuintuplets >= (rangesInGPU.quintupletModuleIndices[lowerModule1 + 1] - rangesInGPU.quintupletModuleIndices[lowerModule1]))
+                if(totOccupancyQuintuplets >= (rangesInGPU.quintupletModuleOccupancy[lowerModule1]))
                 {
 #ifdef Warnings
                     printf("Quintuplet excess alert! Module index = %d\n", lowerModule1);
 
@@ -60,8 +60,7 @@ namespace SDL
 
     void createQuintupletsInExplicitMemory(struct SDL::quintuplets& quintupletsInGPU, const unsigned int& maxQuintuplets, const uint16_t& nLowerModules, const uint16_t& nEligibleModules,cudaStream_t stream);
 
-//    void createEligibleModulesListForQuintuplets(struct modules& modulesInGPU, struct triplets& tripletsInGPU, uint16_t& nEligibleModules, uint16_t* indicesOfEligibleModules, unsigned int& nTotalQuintuplets, unsigned int& maxTriplets,cudaStream_t stream, struct objectRanges& rangesInGPU);
-    __global__ void createEligibleModulesListForQuintupletsGPU(struct modules& modulesInGPU, struct triplets& tripletsInGPU, unsigned int* nTotalQuintuplets, cudaStream_t stream, struct objectRanges& rangesInGPU);
+    __global__ void createEligibleModulesListForQuintupletsGPU(struct modules& modulesInGPU, struct triplets& tripletsInGPU, unsigned int* nTotalQuintuplets, struct objectRanges& rangesInGPU);
 
 //  CUDA_DEV void rmQuintupletToMemory(struct SDL::quintuplets& quintupletsInGPU, unsigned int quintupletIndex);