Skip to content
This repository was archived by the owner on Dec 9, 2024. It is now read-only.

Commit 84326ad

Browse files
committed
remove cudaMalloc in favor of caching within rangeInGPU
1 parent 0db2cb7 commit 84326ad

File tree

11 files changed

+56
-49
lines changed

11 files changed

+56
-49
lines changed

SDL/Event.cu

Lines changed: 21 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -303,23 +303,23 @@ void SDL::Event::resetEvent()
303303
}
304304
}
305305
if(hitsInGPU){cms::cuda::free_host(hitsInGPU);
306-
hitsInGPU = nullptr;}
306+
hitsInGPU = nullptr;}
307307
if(mdsInGPU){cms::cuda::free_host(mdsInGPU);
308-
mdsInGPU = nullptr;}
308+
mdsInGPU = nullptr;}
309309
if(rangesInGPU){cms::cuda::free_host(rangesInGPU);
310-
rangesInGPU = nullptr;}
310+
rangesInGPU = nullptr;}
311311
if(segmentsInGPU){cms::cuda::free_host(segmentsInGPU);
312-
segmentsInGPU = nullptr;}
312+
segmentsInGPU = nullptr;}
313313
if(tripletsInGPU){cms::cuda::free_host(tripletsInGPU);
314-
tripletsInGPU = nullptr;}
315-
if(quintupletsInGPU){cms::cuda::free_host(quintupletsInGPU);
314+
tripletsInGPU = nullptr;}
315+
if(quintupletsInGPU){cms::cuda::free_host(quintupletsInGPU);
316316
quintupletsInGPU = nullptr;}
317317
if(trackCandidatesInGPU){cms::cuda::free_host(trackCandidatesInGPU);
318-
trackCandidatesInGPU = nullptr;}
318+
trackCandidatesInGPU = nullptr;}
319319
if(pixelTripletsInGPU){cms::cuda::free_host(pixelTripletsInGPU);
320-
pixelTripletsInGPU = nullptr;}
320+
pixelTripletsInGPU = nullptr;}
321321
if(pixelQuintupletsInGPU){cms::cuda::free_host(pixelQuintupletsInGPU);
322-
pixelQuintupletsInGPU = nullptr;}
322+
pixelQuintupletsInGPU = nullptr;}
323323

324324
if(hitsInCPU != nullptr)
325325
{
@@ -679,14 +679,11 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
679679
{
680680
mdsInGPU = (SDL::miniDoublets*)cms::cuda::allocate_host(sizeof(SDL::miniDoublets), stream);
681681
unsigned int nTotalMDs;
682-
unsigned int *device_nTotalMDs;
683-
cudaMalloc((void **)&device_nTotalMDs, sizeof(unsigned int));
684682
cudaMemsetAsync(&rangesInGPU->miniDoubletModuleOccupancy[nLowerModules],N_MAX_PIXEL_MD_PER_MODULES, sizeof(unsigned int),stream);
685-
createMDArrayRangesGPU<<<1,1024,0,stream>>>(*modulesInGPU, *rangesInGPU, device_nTotalMDs);
686-
cudaMemcpyAsync(&nTotalMDs,device_nTotalMDs,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
683+
createMDArrayRangesGPU<<<1,1024,0,stream>>>(*modulesInGPU, *rangesInGPU);
684+
cudaMemcpyAsync(&nTotalMDs,rangesInGPU->device_nTotalMDs,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
687685
cudaStreamSynchronize(stream);
688686
nTotalMDs+= N_MAX_PIXEL_MD_PER_MODULES;
689-
cudaFree(device_nTotalMDs);
690687
createMDsInExplicitMemory(*mdsInGPU, nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES,stream);
691688
cudaMemcpyAsync(mdsInGPU->nMemoryLocations, &nTotalMDs, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
692689
cudaStreamSynchronize(stream);
@@ -698,13 +695,10 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
698695
//hardcoded range numbers for this will come from studies!
699696
// can be optimized here: because we didn't distinguish pixel segments and outer-tracker segments and call them both "segments", so they use the index continuously.
700697
// If we want to further study the memory footprint in detail, we can separate the two and allocate different memories to them
701-
unsigned int *device_nTotalSegments;
702-
cudaMalloc((void **)&device_nTotalSegments, sizeof(unsigned int));
703-
createSegmentArrayRanges<<<1,1024,0,stream>>>(*modulesInGPU, *rangesInGPU, *mdsInGPU, device_nTotalSegments);
704-
cudaMemcpyAsync(&nTotalSegments,device_nTotalSegments,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
698+
createSegmentArrayRanges<<<1,1024,0,stream>>>(*modulesInGPU, *rangesInGPU, *mdsInGPU);
699+
cudaMemcpyAsync(&nTotalSegments,rangesInGPU->device_nTotalSegs,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
705700
cudaStreamSynchronize(stream);
706701
nTotalSegments += N_MAX_PIXEL_SEGMENTS_PER_MODULE;
707-
cudaFree(device_nTotalSegments);
708702
createSegmentsInExplicitMemory(*segmentsInGPU, nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE,stream);
709703

710704
cudaMemcpyAsync(segmentsInGPU->nMemoryLocations, &nTotalSegments, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);;
@@ -901,14 +895,11 @@ void SDL::Event::createMiniDoublets()
901895
{
902896
//hardcoded range numbers for this will come from studies!
903897
unsigned int nTotalMDs;
904-
unsigned int *device_nTotalMDs;
905-
cudaMalloc((void **)&device_nTotalMDs, sizeof(unsigned int));
906898
cudaMemsetAsync(&rangesInGPU->miniDoubletModuleOccupancy[nLowerModules],N_MAX_PIXEL_MD_PER_MODULES, sizeof(unsigned int),stream);
907-
createMDArrayRangesGPU<<<1,1024,0,stream>>>(*modulesInGPU, *rangesInGPU, device_nTotalMDs);
908-
cudaMemcpyAsync(&nTotalMDs,device_nTotalMDs,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
899+
createMDArrayRangesGPU<<<1,1024,0,stream>>>(*modulesInGPU, *rangesInGPU);
900+
cudaMemcpyAsync(&nTotalMDs,rangesInGPU->device_nTotalMDs,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
909901
cudaStreamSynchronize(stream);
910902
nTotalMDs+=N_MAX_PIXEL_MD_PER_MODULES;
911-
cudaFree(device_nTotalMDs);
912903

913904
if(mdsInGPU == nullptr)
914905
{
@@ -1004,12 +995,9 @@ void SDL::Event::createTriplets()
1004995
{
1005996
tripletsInGPU = (SDL::triplets*)cms::cuda::allocate_host(sizeof(SDL::triplets), stream);
1006997
unsigned int maxTriplets;
1007-
unsigned int *device_maxTriplets;
1008-
cudaMalloc((void **)&device_maxTriplets, sizeof(unsigned int));
1009-
createTripletArrayRanges<<<1,1024,0,stream>>>(*modulesInGPU, *rangesInGPU, *segmentsInGPU, device_maxTriplets);
1010-
cudaMemcpyAsync(&maxTriplets,device_maxTriplets,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
998+
createTripletArrayRanges<<<1,1024,0,stream>>>(*modulesInGPU, *rangesInGPU, *segmentsInGPU);
999+
cudaMemcpyAsync(&maxTriplets,rangesInGPU->device_nTotalTrips,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
10111000
cudaStreamSynchronize(stream);
1012-
cudaFree(device_maxTriplets);
10131001
createTripletsInExplicitMemory(*tripletsInGPU, maxTriplets, nLowerModules,stream);
10141002

10151003
cudaMemcpyAsync(tripletsInGPU->nMemoryLocations, &maxTriplets, sizeof(unsigned int), cudaMemcpyHostToDevice, stream);
@@ -1272,15 +1260,12 @@ void SDL::Event::createQuintuplets()
12721260
cudaMalloc(&(rangesInGPU->indicesOfEligibleT5Modules), nLowerModules * sizeof(uint16_t));
12731261
#endif
12741262
cudaMemsetAsync(rangesInGPU->quintupletModuleIndices, -1, sizeof(int) * (nLowerModules),stream);
1275-
cudaStreamSynchronize(stream);
1263+
cudaStreamSynchronize(stream);
12761264
unsigned int nTotalQuintuplets;
1277-
unsigned int *device_nTotalQuintuplets;
1278-
cudaMalloc((void **)&device_nTotalQuintuplets, sizeof(unsigned int));
1279-
createEligibleModulesListForQuintupletsGPU<<<1,1024,0,stream>>>(*modulesInGPU, *tripletsInGPU, device_nTotalQuintuplets, *rangesInGPU);
1265+
createEligibleModulesListForQuintupletsGPU<<<1,1024,0,stream>>>(*modulesInGPU, *tripletsInGPU, *rangesInGPU);
12801266
cudaMemcpyAsync(&nEligibleT5Modules,rangesInGPU->nEligibleT5Modules,sizeof(uint16_t),cudaMemcpyDeviceToHost,stream);
1281-
cudaMemcpyAsync(&nTotalQuintuplets,device_nTotalQuintuplets,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
1282-
cudaStreamSynchronize(stream);
1283-
cudaFree(device_nTotalQuintuplets);
1267+
cudaMemcpyAsync(&nTotalQuintuplets,rangesInGPU->device_nTotalQuints,sizeof(unsigned int),cudaMemcpyDeviceToHost,stream);
1268+
cudaStreamSynchronize(stream);
12841269

12851270
if(quintupletsInGPU == nullptr)
12861271
{

SDL/MiniDoublet.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ void SDL::miniDoublets::resetMemory(unsigned int nMemoryLocationsx, unsigned int
1717
}
1818

1919

20-
__global__ void SDL::createMDArrayRangesGPU(struct modules& modulesInGPU, struct objectRanges& rangesInGPU, unsigned int* nTotalMDsx)
20+
__global__ void SDL::createMDArrayRangesGPU(struct modules& modulesInGPU, struct objectRanges& rangesInGPU)//, unsigned int* nTotalMDsx)
2121
{
2222
short module_subdets;
2323
short module_layers;
@@ -67,7 +67,8 @@ __global__ void SDL::createMDArrayRangesGPU(struct modules& modulesInGPU, struct
6767
__syncthreads();
6868
if(threadIdx.x==0){
6969
rangesInGPU.miniDoubletModuleIndices[*modulesInGPU.nLowerModules] = nTotalMDs;
70-
*nTotalMDsx=nTotalMDs;
70+
//*nTotalMDsx=nTotalMDs;
71+
*rangesInGPU.device_nTotalMDs=nTotalMDs;
7172
}
7273

7374
}

SDL/MiniDoublet.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ namespace SDL
9292
void createMDsInExplicitMemory(struct miniDoublets& mdsInGPU, unsigned int maxMDs,uint16_t nLowerModules, unsigned int maxPixelMDs,cudaStream_t stream);
9393

9494

95-
__global__ void createMDArrayRangesGPU(struct modules& modulesInGPU, struct objectRanges& rangesInGPU, unsigned int* nTotalMDs);
95+
__global__ void createMDArrayRangesGPU(struct modules& modulesInGPU, struct objectRanges& rangesInGPU);//, unsigned int* nTotalMDs);
9696

9797
__global__ void addMiniDoubletRangesToEventExplicit(struct modules& modulesInGPU, struct miniDoublets& mdsInGPU, struct objectRanges& rangesInGPU, struct hits& hitsInGPU);
9898

SDL/Module.cu

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ void SDL::createRangesInExplicitMemory(struct objectRanges& rangesInGPU,unsigned
3737
rangesInGPU.tripletModuleIndices = (int*)cms::cuda::allocate_device(dev, nLowerModules * sizeof(int), stream);
3838
rangesInGPU.tripletModuleOccupancy = (int*)cms::cuda::allocate_device(dev, nLowerModules * sizeof(int), stream);
3939

40+
rangesInGPU.device_nTotalMDs = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream);
41+
rangesInGPU.device_nTotalSegs = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream);
42+
rangesInGPU.device_nTotalTrips = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream);
43+
rangesInGPU.device_nTotalQuints = (unsigned int*)cms::cuda::allocate_device(dev, sizeof(unsigned int), stream);
44+
4045
#else
4146
cudaMalloc(&rangesInGPU.hitRanges,nModules * 2 * sizeof(int));
4247
cudaMalloc(&rangesInGPU.hitRangesLower,nModules * sizeof(int));
@@ -59,6 +64,11 @@ void SDL::createRangesInExplicitMemory(struct objectRanges& rangesInGPU,unsigned
5964
cudaMalloc(&rangesInGPU.segmentModuleOccupancy, (nLowerModules + 1) * sizeof(int));
6065
cudaMalloc(&rangesInGPU.tripletModuleIndices, nLowerModules * sizeof(int));
6166
cudaMalloc(&rangesInGPU.tripletModuleOccupancy, nLowerModules * sizeof(int));
67+
68+
cudaMalloc(&rangesInGPU.device_nTotalMDs, sizeof(unsigned int));
69+
cudaMalloc(&rangesInGPU.device_nTotalSegs, sizeof(unsigned int));
70+
cudaMalloc(&rangesInGPU.device_nTotalTrips, sizeof(unsigned int));
71+
cudaMalloc(&rangesInGPU.device_nTotalQuints, sizeof(unsigned int));
6272

6373
#endif
6474
}
@@ -120,6 +130,10 @@ void SDL::objectRanges::freeMemoryCache()//struct objectRanges& rangesInGPU)
120130
cms::cuda::free_device(dev, segmentModuleOccupancy);
121131
cms::cuda::free_device(dev, tripletModuleIndices);
122132
cms::cuda::free_device(dev, tripletModuleOccupancy);
133+
cms::cuda::free_device(dev, device_nTotalMDs);
134+
cms::cuda::free_device(dev, device_nTotalSegs);
135+
cms::cuda::free_device(dev, device_nTotalTrips);
136+
cms::cuda::free_device(dev, device_nTotalQuints);
123137
}
124138
void SDL::objectRanges::freeMemory()
125139
{
@@ -144,6 +158,10 @@ void SDL::objectRanges::freeMemory()
144158
cudaFree(segmentModuleOccupancy);
145159
cudaFree(tripletModuleIndices);
146160
cudaFree(tripletModuleOccupancy);
161+
cudaFree(device_nTotalMDs);
162+
cudaFree(device_nTotalSegs);
163+
cudaFree(device_nTotalTrips);
164+
cudaFree(device_nTotalQuints);
147165
}
148166
void SDL::freeModulesCache(struct modules& modulesInGPU,struct pixelMap& pixelMapping)
149167
{

SDL/Module.cuh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,10 @@ namespace SDL
7373
int *tripletModuleIndices;
7474
int *tripletModuleOccupancy;
7575

76-
// unsigned int nTotalQuintuplets;
76+
unsigned int *device_nTotalMDs;
77+
unsigned int *device_nTotalSegs;
78+
unsigned int *device_nTotalTrips;
79+
unsigned int *device_nTotalQuints;
7780

7881
void freeMemoryCache();
7982
void freeMemory();

SDL/Quintuplet.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ void SDL::quintuplets::freeMemory(cudaStream_t stream)
8383
cudaStreamSynchronize(stream);
8484
}
8585
//TODO:Reuse the track candidate one instead of this!
86-
__global__ void SDL::createEligibleModulesListForQuintupletsGPU(struct modules& modulesInGPU,struct triplets& tripletsInGPU, unsigned int* device_nTotalQuintuplets, struct objectRanges& rangesInGPU)
86+
__global__ void SDL::createEligibleModulesListForQuintupletsGPU(struct modules& modulesInGPU,struct triplets& tripletsInGPU, struct objectRanges& rangesInGPU)
8787
{
8888
__shared__ int nEligibleT5Modulesx;
8989
__shared__ unsigned int nTotalQuintupletsx;
@@ -140,7 +140,7 @@ __global__ void SDL::createEligibleModulesListForQuintupletsGPU(struct modules&
140140
__syncthreads();
141141
if(threadIdx.x==0){
142142
*rangesInGPU.nEligibleT5Modules = static_cast<uint16_t>(nEligibleT5Modulesx);
143-
*device_nTotalQuintuplets = nTotalQuintupletsx;
143+
*rangesInGPU.device_nTotalQuints = nTotalQuintupletsx;
144144
}
145145
}
146146

SDL/Quintuplet.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ namespace SDL
6060

6161
void createQuintupletsInExplicitMemory(struct SDL::quintuplets& quintupletsInGPU, const unsigned int& maxQuintuplets, const uint16_t& nLowerModules, const uint16_t& nEligibleModules,cudaStream_t stream);
6262

63-
__global__ void createEligibleModulesListForQuintupletsGPU(struct modules& modulesInGPU, struct triplets& tripletsInGPU, unsigned int* nTotalQuintuplets, struct objectRanges& rangesInGPU);
63+
__global__ void createEligibleModulesListForQuintupletsGPU(struct modules& modulesInGPU, struct triplets& tripletsInGPU, struct objectRanges& rangesInGPU);
6464
__global__ void addQuintupletRangesToEventExplicit(struct modules& modulesInGPU, struct quintuplets& quintupletsInGPU, struct objectRanges& rangesInGPU);
6565

6666
// CUDA_DEV void rmQuintupletToMemory(struct SDL::quintuplets& quintupletsInGPU, unsigned int quintupletIndex);

SDL/Segment.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ void SDL::segments::resetMemory(unsigned int nMemoryLocationsx, unsigned int nLo
3030
}
3131

3232

33-
__global__ void SDL::createSegmentArrayRanges(struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU, unsigned int* nTotalSegmentsx)
33+
__global__ void SDL::createSegmentArrayRanges(struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsInGPU)
3434
{
3535
short module_subdets;
3636
short module_layers;
@@ -88,7 +88,7 @@ __global__ void SDL::createSegmentArrayRanges(struct modules& modulesInGPU, stru
8888
__syncthreads();
8989
if(threadIdx.x==0){
9090
rangesInGPU.segmentModuleIndices[*modulesInGPU.nLowerModules] = nTotalSegments;
91-
*nTotalSegmentsx = nTotalSegments;
91+
*rangesInGPU.device_nTotalSegs = nTotalSegments;
9292
}
9393
}
9494

SDL/Segment.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ namespace SDL
7474

7575
void createSegmentsInExplicitMemory(struct segments& segmentsInGPU, unsigned int maxSegments, uint16_t nLowerModules, unsigned int maxPixelSegments,cudaStream_t stream);
7676

77-
__global__ void createSegmentArrayRanges(struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsinGPU, unsigned int* nSegments);
77+
__global__ void createSegmentArrayRanges(struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct miniDoublets& mdsinGPU);
7878

7979

8080
__global__ void addSegmentRangesToEventExplicit(struct modules& modulesInGPU, struct segments& segmentsInGPU, struct objectRanges& rangesInGPU);

SDL/Triplet.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ void SDL::triplets::resetMemory(unsigned int maxTriplets, unsigned int nLowerMod
1616
cudaMemsetAsync(partOfPT3, 0, maxTriplets * sizeof(bool), stream);
1717
}
1818

19-
__global__ void SDL::createTripletArrayRanges(struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct segments& segmentsInGPU, unsigned int* nTotalTripletsx)
19+
__global__ void SDL::createTripletArrayRanges(struct modules& modulesInGPU, struct objectRanges& rangesInGPU, struct segments& segmentsInGPU)
2020
{
2121

2222
short module_subdets;
@@ -71,7 +71,7 @@ __global__ void SDL::createTripletArrayRanges(struct modules& modulesInGPU, stru
7171
}
7272
__syncthreads();
7373
if(threadIdx.x==0){
74-
*nTotalTripletsx = nTotalTriplets;
74+
*rangesInGPU.device_nTotalTrips = nTotalTriplets;
7575
}
7676
}
7777

0 commit comments

Comments
 (0)