@@ -303,23 +303,23 @@ void SDL::Event::resetEvent()
303
303
}
304
304
}
305
305
if (hitsInGPU){cms::cuda::free_host (hitsInGPU);
306
- hitsInGPU = nullptr ;}
306
+ hitsInGPU = nullptr ;}
307
307
if (mdsInGPU){cms::cuda::free_host (mdsInGPU);
308
- mdsInGPU = nullptr ;}
308
+ mdsInGPU = nullptr ;}
309
309
if (rangesInGPU){cms::cuda::free_host (rangesInGPU);
310
- rangesInGPU = nullptr ;}
310
+ rangesInGPU = nullptr ;}
311
311
if (segmentsInGPU){cms::cuda::free_host (segmentsInGPU);
312
- segmentsInGPU = nullptr ;}
312
+ segmentsInGPU = nullptr ;}
313
313
if (tripletsInGPU){cms::cuda::free_host (tripletsInGPU);
314
- tripletsInGPU = nullptr ;}
315
- if (quintupletsInGPU){cms::cuda::free_host (quintupletsInGPU);
314
+ tripletsInGPU = nullptr ;}
315
+ if (quintupletsInGPU){cms::cuda::free_host (quintupletsInGPU);
316
316
quintupletsInGPU = nullptr ;}
317
317
if (trackCandidatesInGPU){cms::cuda::free_host (trackCandidatesInGPU);
318
- trackCandidatesInGPU = nullptr ;}
318
+ trackCandidatesInGPU = nullptr ;}
319
319
if (pixelTripletsInGPU){cms::cuda::free_host (pixelTripletsInGPU);
320
- pixelTripletsInGPU = nullptr ;}
320
+ pixelTripletsInGPU = nullptr ;}
321
321
if (pixelQuintupletsInGPU){cms::cuda::free_host (pixelQuintupletsInGPU);
322
- pixelQuintupletsInGPU = nullptr ;}
322
+ pixelQuintupletsInGPU = nullptr ;}
323
323
324
324
if (hitsInCPU != nullptr )
325
325
{
@@ -679,9 +679,12 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
679
679
{
680
680
mdsInGPU = (SDL::miniDoublets*)cms::cuda::allocate_host (sizeof (SDL::miniDoublets), stream);
681
681
unsigned int nTotalMDs;
682
- createMDArrayRanges (*modulesInGPU, *rangesInGPU, nLowerModules, nTotalMDs, stream, N_MAX_PIXEL_MD_PER_MODULES);
683
- createMDsInExplicitMemory (*mdsInGPU, nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES,stream);
684
-
682
+ cudaMemsetAsync (&rangesInGPU->miniDoubletModuleOccupancy [nLowerModules],N_MAX_PIXEL_MD_PER_MODULES, sizeof (unsigned int ),stream);
683
+ createMDArrayRangesGPU<<<1 ,1024 ,0 ,stream>>> (*modulesInGPU, *rangesInGPU);
684
+ cudaMemcpyAsync (&nTotalMDs,rangesInGPU->device_nTotalMDs ,sizeof (unsigned int ),cudaMemcpyDeviceToHost,stream);
685
+ cudaStreamSynchronize (stream);
686
+ nTotalMDs+= N_MAX_PIXEL_MD_PER_MODULES;
687
+ createMDsInExplicitMemory (*mdsInGPU, nTotalMDs, nLowerModules, N_MAX_PIXEL_MD_PER_MODULES,stream);
685
688
cudaMemcpyAsync (mdsInGPU->nMemoryLocations , &nTotalMDs, sizeof (unsigned int ), cudaMemcpyHostToDevice, stream);
686
689
cudaStreamSynchronize (stream);
687
690
@@ -692,13 +695,10 @@ void SDL::Event::addPixelSegmentToEvent(std::vector<unsigned int> hitIndices0,st
692
695
// hardcoded range numbers for this will come from studies!
693
696
// can be optimized here: because we didn't distinguish pixel segments and outer-tracker segments and call them both "segments", so they use the index continuously.
694
697
// If we want to further study the memory footprint in detail, we can separate the two and allocate different memories to them
695
- unsigned int *device_nTotalSegments;
696
- cudaMalloc ((void **)&device_nTotalSegments, sizeof (unsigned int ));
697
- createSegmentArrayRanges<<<1 ,1024 ,0 ,stream>>> (*modulesInGPU, *rangesInGPU, *mdsInGPU, device_nTotalSegments);
698
- cudaMemcpyAsync (&nTotalSegments,device_nTotalSegments,sizeof (unsigned int ),cudaMemcpyDeviceToHost,stream);
698
+ createSegmentArrayRanges<<<1 ,1024 ,0 ,stream>>> (*modulesInGPU, *rangesInGPU, *mdsInGPU);
699
+ cudaMemcpyAsync (&nTotalSegments,rangesInGPU->device_nTotalSegs ,sizeof (unsigned int ),cudaMemcpyDeviceToHost,stream);
699
700
cudaStreamSynchronize (stream);
700
701
nTotalSegments += N_MAX_PIXEL_SEGMENTS_PER_MODULE;
701
- cudaFree (device_nTotalSegments);
702
702
createSegmentsInExplicitMemory (*segmentsInGPU, nTotalSegments, nLowerModules, N_MAX_PIXEL_SEGMENTS_PER_MODULE,stream);
703
703
704
704
cudaMemcpyAsync (segmentsInGPU->nMemoryLocations , &nTotalSegments, sizeof (unsigned int ), cudaMemcpyHostToDevice, stream);;
@@ -895,7 +895,11 @@ void SDL::Event::createMiniDoublets()
895
895
{
896
896
// hardcoded range numbers for this will come from studies!
897
897
unsigned int nTotalMDs;
898
- createMDArrayRanges (*modulesInGPU, *rangesInGPU, nLowerModules, nTotalMDs, stream, N_MAX_PIXEL_MD_PER_MODULES);
898
+ cudaMemsetAsync (&rangesInGPU->miniDoubletModuleOccupancy [nLowerModules],N_MAX_PIXEL_MD_PER_MODULES, sizeof (unsigned int ),stream);
899
+ createMDArrayRangesGPU<<<1 ,1024 ,0 ,stream>>> (*modulesInGPU, *rangesInGPU);
900
+ cudaMemcpyAsync (&nTotalMDs,rangesInGPU->device_nTotalMDs ,sizeof (unsigned int ),cudaMemcpyDeviceToHost,stream);
901
+ cudaStreamSynchronize (stream);
902
+ nTotalMDs+=N_MAX_PIXEL_MD_PER_MODULES;
899
903
900
904
if (mdsInGPU == nullptr )
901
905
{
@@ -991,12 +995,9 @@ void SDL::Event::createTriplets()
991
995
{
992
996
tripletsInGPU = (SDL::triplets*)cms::cuda::allocate_host (sizeof (SDL::triplets), stream);
993
997
unsigned int maxTriplets;
994
- unsigned int *device_maxTriplets;
995
- cudaMalloc ((void **)&device_maxTriplets, sizeof (unsigned int ));
996
- createTripletArrayRanges<<<1 ,1024 ,0 ,stream>>> (*modulesInGPU, *rangesInGPU, *segmentsInGPU, device_maxTriplets);
997
- cudaMemcpyAsync (&maxTriplets,device_maxTriplets,sizeof (unsigned int ),cudaMemcpyDeviceToHost,stream);
998
+ createTripletArrayRanges<<<1 ,1024 ,0 ,stream>>> (*modulesInGPU, *rangesInGPU, *segmentsInGPU);
999
+ cudaMemcpyAsync (&maxTriplets,rangesInGPU->device_nTotalTrips ,sizeof (unsigned int ),cudaMemcpyDeviceToHost,stream);
998
1000
cudaStreamSynchronize (stream);
999
- cudaFree (device_maxTriplets);
1000
1001
createTripletsInExplicitMemory (*tripletsInGPU, maxTriplets, nLowerModules,stream);
1001
1002
1002
1003
cudaMemcpyAsync (tripletsInGPU->nMemoryLocations , &maxTriplets, sizeof (unsigned int ), cudaMemcpyHostToDevice, stream);
@@ -1010,7 +1011,7 @@ void SDL::Event::createTriplets()
1010
1011
uint16_t *index_gpu;
1011
1012
index_gpu = (uint16_t *)cms::cuda::allocate_device (dev, nLowerModules*sizeof (uint16_t ), stream);
1012
1013
unsigned int *nSegments = (unsigned int *)malloc (nLowerModules*sizeof (unsigned int ));
1013
- cudaMemcpyAsync ((void *)nSegments, segmentsInGPU->nSegments , nLowerModules*sizeof (unsigned int ), cudaMemcpyDeviceToHost,stream);
1014
+ cudaMemcpyAsync ((void *)nSegments, segmentsInGPU->nSegments , nLowerModules*sizeof (unsigned int ), cudaMemcpyDeviceToHost,stream);
1014
1015
cudaStreamSynchronize (stream);
1015
1016
1016
1017
uint16_t * module_nConnectedModules;
@@ -1259,15 +1260,12 @@ void SDL::Event::createQuintuplets()
1259
1260
cudaMalloc (&(rangesInGPU->indicesOfEligibleT5Modules ), nLowerModules * sizeof (uint16_t ));
1260
1261
#endif
1261
1262
cudaMemsetAsync (rangesInGPU->quintupletModuleIndices , -1 , sizeof (int ) * (nLowerModules),stream);
1262
- cudaStreamSynchronize (stream);
1263
+ cudaStreamSynchronize (stream);
1263
1264
unsigned int nTotalQuintuplets;
1264
- unsigned int *device_nTotalQuintuplets;
1265
- cudaMalloc ((void **)&device_nTotalQuintuplets, sizeof (unsigned int ));
1266
- createEligibleModulesListForQuintupletsGPU<<<1 ,1024 ,0 ,stream>>> (*modulesInGPU, *tripletsInGPU, device_nTotalQuintuplets, *rangesInGPU);
1265
+ createEligibleModulesListForQuintupletsGPU<<<1 ,1024 ,0 ,stream>>> (*modulesInGPU, *tripletsInGPU, *rangesInGPU);
1267
1266
cudaMemcpyAsync (&nEligibleT5Modules,rangesInGPU->nEligibleT5Modules ,sizeof (uint16_t ),cudaMemcpyDeviceToHost,stream);
1268
- cudaMemcpyAsync (&nTotalQuintuplets,device_nTotalQuintuplets,sizeof (unsigned int ),cudaMemcpyDeviceToHost,stream);
1269
- cudaStreamSynchronize (stream);
1270
- cudaFree (device_nTotalQuintuplets);
1267
+ cudaMemcpyAsync (&nTotalQuintuplets,rangesInGPU->device_nTotalQuints ,sizeof (unsigned int ),cudaMemcpyDeviceToHost,stream);
1268
+ cudaStreamSynchronize (stream);
1271
1269
1272
1270
if (quintupletsInGPU == nullptr )
1273
1271
{
0 commit comments