Skip to content
This repository was archived by the owner on Dec 9, 2024. It is now read-only.

Commit 58d7293

Browse files
authored
Merge pull request #272 from tresreid/ranges
Move CPU Range functions to GPU
2 parents 398270a + c14adbb commit 58d7293

File tree

10 files changed

+157
-147
lines changed

10 files changed

+157
-147
lines changed

README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,9 @@ Comparing two different runs
102102
lst_plot_performance.py \
103103
num_den_hist_1.root \ # Reference
104104
num_den_hist_2.root \ # New work
105-
-l BaseLine,MyNewWork \ # Labeling
106-
-t "mywork"
105+
-L BaseLine,MyNewWork \ # Labeling
106+
-t "mywork" \
107+
--compare
107108

108109
## CMSSW Integration
109110
This is the a complete set of instruction on how the TrackLooper code

SDL/Event.cu

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -687,7 +687,6 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
687687
if(mdsInGPU == nullptr)
688688
{
689689
mdsInGPU = (SDL::miniDoublets*)cms::cuda::allocate_host(sizeof(SDL::miniDoublets), stream);
690-
//hardcoded range numbers for this will come from studies!
691690
unsigned int nTotalMDs;
692691
createMDArrayRanges(*modulesInGPU, *rangesInGPU, nLowerModules, nTotalMDs, stream, N_MAX_PIXEL_MD_PER_MODULES);
693692
createMDsInExplicitMemory(*mdsInGPU, nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES,stream);
@@ -702,7 +701,13 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
702701
//hardcoded range numbers for this will come from studies!
703702
// can be optimized here: because we didn't distinguish pixel segments and outer-tracker segments and call them both "segments", so they use the index continuously.
704703
// If we want to further study the memory footprint in detail, we can separate the two and allocate different memories to them
705-
createSegmentArrayRanges(*modulesInGPU, *rangesInGPU, *mdsInGPU, nLowerModules, nTotalSegments, stream, N_MAX_PIXEL_SEGMENTS_PER_MODULE);
704+
unsigned int *device_nTotalSegments;
705+
cudaMalloc((void **)&device_nTotalSegments, sizeof(unsigned int));
706+
createSegmentArrayRanges<<<1,1024,0,stream>>>(*modulesInGPU, *rangesInGPU, *mdsInGPU, device_nTotalSegments);
707+
cudaMemcpyAsync(&nTotalSegments,device_nTotalSegments,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
708+
cudaStreamSynchronize(stream);
709+
nTotalSegments += N_MAX_PIXEL_SEGMENTS_PER_MODULE;
710+
cudaFree(device_nTotalSegments);
706711
createSegmentsInExplicitMemory(*segmentsInGPU, nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE,stream);
707712

708713
cudaMemcpyAsync(segmentsInGPU->nMemoryLocations, &nTotalSegments, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);;
@@ -1101,8 +1106,12 @@ void SDL::Event::createTriplets()
11011106
{
11021107
tripletsInGPU = (SDL::triplets*)cms::cuda::allocate_host(sizeof(SDL::triplets), stream);
11031108
unsigned int maxTriplets;
1104-
createTripletArrayRanges(*modulesInGPU, *rangesInGPU, *segmentsInGPU, nLowerModules, maxTriplets, stream);
1105-
// cout<<"nTotalTriplets: "<<maxTriplets<<std::endl; // for memory usage
1109+
unsigned int *device_maxTriplets;
1110+
cudaMalloc((void **)&device_maxTriplets, sizeof(unsigned int));
1111+
createTripletArrayRanges<<<1,1024,0,stream>>>(*modulesInGPU, *rangesInGPU, *segmentsInGPU, device_maxTriplets);
1112+
cudaMemcpyAsync(&maxTriplets,device_maxTriplets,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
1113+
cudaStreamSynchronize(stream);
1114+
cudaFree(device_maxTriplets);
11061115
createTripletsInExplicitMemory(*tripletsInGPU, maxTriplets, nLowerModules,stream);
11071116

11081117
cudaMemcpyAsync(tripletsInGPU->nMemoryLocations, &maxTriplets, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
@@ -1377,12 +1386,11 @@ cudaStreamSynchronize(stream);
13771386
unsigned int nTotalQuintuplets;
13781387
unsigned int *device_nTotalQuintuplets;
13791388
cudaMalloc((void **)&device_nTotalQuintuplets, sizeof(unsigned int));
1380-
createEligibleModulesListForQuintupletsGPU<<<1,1024,0,stream>>>(*modulesInGPU, *tripletsInGPU, device_nTotalQuintuplets, stream, *rangesInGPU);
1381-
cudaStreamSynchronize(stream);
1389+
createEligibleModulesListForQuintupletsGPU<<<1,1024,0,stream>>>(*modulesInGPU, *tripletsInGPU, device_nTotalQuintuplets, *rangesInGPU);
13821390
cudaMemcpyAsync(&nEligibleT5Modules,rangesInGPU->nEligibleT5Modules,sizeof(uint16_t),cudaMemcpyDeviceToHost,stream);
13831391
cudaMemcpyAsync(&nTotalQuintuplets,device_nTotalQuintuplets,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
1384-
cudaFree(device_nTotalQuintuplets);
13851392
cudaStreamSynchronize(stream);
1393+
cudaFree(device_nTotalQuintuplets);
13861394

13871395
if(quintupletsInGPU == nullptr)
13881396
{

SDL/Module.cu

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,12 @@ void SDL::createRangesInExplicitMemory(struct objectRanges& rangesInGPU,unsigned
2929
rangesInGPU.nEligibleT5Modules = (uint16_t*)cms::cuda::allocate_device(dev,sizeof(unsigned int),stream);
3030

3131
rangesInGPU.quintupletModuleIndices = (int*)cms::cuda::allocate_device(dev,nLowerModules * sizeof(int),stream);
32+
rangesInGPU.quintupletModuleOccupancy = (int*)cms::cuda::allocate_device(dev,nLowerModules * sizeof(int),stream);
3233
rangesInGPU.miniDoubletModuleIndices = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) * sizeof(int), stream);
3334
rangesInGPU.segmentModuleIndices = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) * sizeof(int), stream);
35+
rangesInGPU.segmentModuleOccupancy = (int*)cms::cuda::allocate_device(dev, (nLowerModules + 1) * sizeof(int), stream);
3436
rangesInGPU.tripletModuleIndices = (int*)cms::cuda::allocate_device(dev, nLowerModules * sizeof(int), stream);
37+
rangesInGPU.tripletModuleOccupancy = (int*)cms::cuda::allocate_device(dev, nLowerModules * sizeof(int), stream);
3538

3639
#else
3740
cudaMalloc(&rangesInGPU.hitRanges,nModules * 2 * sizeof(int));
@@ -47,10 +50,13 @@ void SDL::createRangesInExplicitMemory(struct objectRanges& rangesInGPU,unsigned
4750
cudaMalloc(&rangesInGPU.quintupletRanges, nModules * 2 * sizeof(int));
4851
cudaMalloc(&rangesInGPU.nEligibleT5Modules, sizeof(uint16_t));
4952
cudaMalloc(&rangesInGPU.quintupletModuleIndices, nLowerModules * sizeof(int));
53+
cudaMalloc(&rangesInGPU.quintupletModuleOccupancy, nLowerModules * sizeof(int));
5054

5155
cudaMalloc(&rangesInGPU.miniDoubletModuleIndices, (nLowerModules + 1) * sizeof(int));
5256
cudaMalloc(&rangesInGPU.segmentModuleIndices, (nLowerModules + 1) * sizeof(int));
57+
cudaMalloc(&rangesInGPU.segmentModuleOccupancy, (nLowerModules + 1) * sizeof(int));
5358
cudaMalloc(&rangesInGPU.tripletModuleIndices, nLowerModules * sizeof(int));
59+
cudaMalloc(&rangesInGPU.tripletModuleOccupancy, nLowerModules * sizeof(int));
5460

5561
#endif
5662
}
@@ -101,13 +107,16 @@ void SDL::objectRanges::freeMemoryCache()//struct objectRanges& rangesInGPU)
101107
cms::cuda::free_device(dev,nEligibleT5Modules);
102108
cms::cuda::free_device(dev, indicesOfEligibleT5Modules);
103109
cms::cuda::free_device(dev,quintupletModuleIndices);
110+
cms::cuda::free_device(dev,quintupletModuleOccupancy);
104111
cms::cuda::free_device(dev, hitRangesLower);
105112
cms::cuda::free_device(dev, hitRangesUpper);
106113
cms::cuda::free_device(dev, hitRangesnLower);
107114
cms::cuda::free_device(dev, hitRangesnUpper);
108115
cms::cuda::free_device(dev, miniDoubletModuleIndices);
109116
cms::cuda::free_device(dev, segmentModuleIndices);
117+
cms::cuda::free_device(dev, segmentModuleOccupancy);
110118
cms::cuda::free_device(dev, tripletModuleIndices);
119+
cms::cuda::free_device(dev, tripletModuleOccupancy);
111120
}
112121
void SDL::objectRanges::freeMemory()
113122
{
@@ -125,9 +134,12 @@ void SDL::objectRanges::freeMemory()
125134
cudaFree(nEligibleT5Modules);
126135
cudaFree(indicesOfEligibleT5Modules);
127136
cudaFree(quintupletModuleIndices);
137+
cudaFree(quintupletModuleOccupancy);
128138
cudaFree(miniDoubletModuleIndices);
129139
cudaFree(segmentModuleIndices);
140+
cudaFree(segmentModuleOccupancy);
130141
cudaFree(tripletModuleIndices);
142+
cudaFree(tripletModuleOccupancy);
131143
}
132144
void SDL::freeModulesCache(struct modules& modulesInGPU,struct pixelMap& pixelMapping)
133145
{

SDL/Module.cuh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,12 @@ namespace SDL
6565
uint16_t* indicesOfEligibleT5Modules;// will be allocated in createQuintuplets kernel!!!!
6666
//to store different starting points for variable occupancy stuff
6767
int *quintupletModuleIndices;
68+
int *quintupletModuleOccupancy;
6869
int *miniDoubletModuleIndices;
6970
int *segmentModuleIndices;
71+
int *segmentModuleOccupancy;
7072
int *tripletModuleIndices;
73+
int *tripletModuleOccupancy;
7174

7275
// unsigned int nTotalQuintuplets;
7376

SDL/Quintuplet.cu

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ void SDL::quintuplets::freeMemory(cudaStream_t stream)
8383
cudaStreamSynchronize(stream);
8484
}
8585
//TODO:Reuse the track candidate one instead of this!
86-
__global__ void SDL::createEligibleModulesListForQuintupletsGPU(struct modules& modulesInGPU,struct triplets& tripletsInGPU, unsigned int* device_nTotalQuintuplets, cudaStream_t stream,struct objectRanges& rangesInGPU)
86+
__global__ void SDL::createEligibleModulesListForQuintupletsGPU(struct modules& modulesInGPU,struct triplets& tripletsInGPU, unsigned int* device_nTotalQuintuplets, struct objectRanges& rangesInGPU)
8787
{
8888
__shared__ int nEligibleT5Modulesx;
8989
__shared__ unsigned int nTotalQuintupletsx;
@@ -113,28 +113,28 @@ __global__ void SDL::createEligibleModulesListForQuintupletsGPU(struct modules&
113113
if (subdets == SDL::Endcap and layers > 1) continue;
114114

115115
int nEligibleT5Modules = atomicAdd(&nEligibleT5Modulesx,1);
116-
if (nEligibleT5Modules < 0) printf("%u\n",nEligibleT5Modules);
117116
if (layers<=3 && subdets==5) category_number = 0;
118-
if (layers>=4 && subdets==5) category_number = 1;
119-
if (layers<=2 && subdets==4 && rings>=11) category_number = 2;
120-
if (layers>=3 && subdets==4 && rings>=8) category_number = 2;
121-
if (layers<=2 && subdets==4 && rings<=10) category_number = 3;
122-
if (layers>=3 && subdets==4 && rings<=7) category_number = 3;
117+
else if (layers>=4 && subdets==5) category_number = 1;
118+
else if (layers<=2 && subdets==4 && rings>=11) category_number = 2;
119+
else if (layers>=3 && subdets==4 && rings>=8) category_number = 2;
120+
else if (layers<=2 && subdets==4 && rings<=10) category_number = 3;
121+
else if (layers>=3 && subdets==4 && rings<=7) category_number = 3;
123122
if (abs(eta)<0.75) eta_number=0;
124-
if (abs(eta)>0.75 && abs(eta)<1.5) eta_number=1;
125-
if (abs(eta)>1.5 && abs(eta)<2.25) eta_number=2;
126-
if (abs(eta)>2.25 && abs(eta)<3) eta_number=3;
123+
else if (abs(eta)>0.75 && abs(eta)<1.5) eta_number=1;
124+
else if (abs(eta)>1.5 && abs(eta)<2.25) eta_number=2;
125+
else if (abs(eta)>2.25 && abs(eta)<3) eta_number=3;
127126

128127
if (category_number == 0 && eta_number == 0) occupancy = 336;
129-
if (category_number == 0 && eta_number == 1) occupancy = 414;
130-
if (category_number == 0 && eta_number == 2) occupancy = 231;
131-
if (category_number == 0 && eta_number == 3) occupancy = 146;
132-
if (category_number == 3 && eta_number == 1) occupancy = 0;
133-
if (category_number == 3 && eta_number == 2) occupancy = 191;
134-
if (category_number == 3 && eta_number == 3) occupancy = 106;
128+
else if (category_number == 0 && eta_number == 1) occupancy = 414;
129+
else if (category_number == 0 && eta_number == 2) occupancy = 231;
130+
else if (category_number == 0 && eta_number == 3) occupancy = 146;
131+
else if (category_number == 3 && eta_number == 1) occupancy = 0;
132+
else if (category_number == 3 && eta_number == 2) occupancy = 191;
133+
else if (category_number == 3 && eta_number == 3) occupancy = 106;
135134

136135
unsigned int nTotQ = atomicAdd(&nTotalQuintupletsx,occupancy);
137136
rangesInGPU.quintupletModuleIndices[i] = nTotQ;
137+
rangesInGPU.quintupletModuleOccupancy[i] = occupancy;
138138
rangesInGPU.indicesOfEligibleT5Modules[nEligibleT5Modules] = i;
139139
}
140140
__syncthreads();
@@ -1301,7 +1301,7 @@ __global__ void SDL::createQuintupletsInGPUv2(struct SDL::modules& modulesInGPU,
13011301
return;
13021302
} // ignore anything else TODO: move this to start, before object is made (faster)
13031303
unsigned int totOccupancyQuintuplets = atomicAdd(&quintupletsInGPU.totOccupancyQuintuplets[lowerModule1], 1);
1304-
if(totOccupancyQuintuplets >= (rangesInGPU.quintupletModuleIndices[lowerModule1 + 1] - rangesInGPU.quintupletModuleIndices[lowerModule1]))
1304+
if(totOccupancyQuintuplets >= (rangesInGPU.quintupletModuleOccupancy[lowerModule1]))
13051305
{
13061306
#ifdef Warnings
13071307
printf("Quintuplet excess alert! Module index = %d\n", lowerModule1);

SDL/Quintuplet.cuh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,7 @@ namespace SDL
6060

6161
void createQuintupletsInExplicitMemory(struct SDL::quintuplets& quintupletsInGPU, const unsigned int& maxQuintuplets, const uint16_t& nLowerModules, const uint16_t& nEligibleModules,cudaStream_t stream);
6262

63-
// void createEligibleModulesListForQuintuplets(struct modules& modulesInGPU, struct triplets& tripletsInGPU, uint16_t& nEligibleModules, uint16_t* indicesOfEligibleModules, unsigned int& nTotalQuintuplets, unsigned int& maxTriplets,cudaStream_t stream, struct objectRanges& rangesInGPU);
64-
__global__ void createEligibleModulesListForQuintupletsGPU(struct modules& modulesInGPU, struct triplets& tripletsInGPU, unsigned int* nTotalQuintuplets, cudaStream_t stream, struct objectRanges& rangesInGPU);
63+
__global__ void createEligibleModulesListForQuintupletsGPU(struct modules& modulesInGPU, struct triplets& tripletsInGPU, unsigned int* nTotalQuintuplets, struct objectRanges& rangesInGPU);
6564

6665
// CUDA_DEV void rmQuintupletToMemory(struct SDL::quintuplets& quintupletsInGPU, unsigned int quintupletIndex);
6766

0 commit comments

Comments
 (0)