Skip to content

Commit 8863f37

Browse files
committed
Fix CPU and memory affinity under external resource management
- Fixes CPU affinity when running inference on CPU, and when CPUs are externally managed using, for instance, taskset, numactl, cgroups, the Kubernetes CPU manager, or NRI resource policy plugins.
- Detect external CPU management and trust the external CPU manager completely. It is more likely that the external manager has the big picture of all other tasks running on the system, their QoS, hardware characteristics, etc.
- For instance, do not modify even memory affinity, because the external manager may know better which NUMA node has the fastest memory, or which NUMA nodes have enough free memory for this inference.

Fixes: huggingface#3011

Signed-off-by: Antti Kervinen <[email protected]>
1 parent 571ac9b commit 8863f37

File tree

1 file changed

+33
-0
lines changed

1 file changed

+33
-0
lines changed

server/text_generation_server/models/flash_causal_lm.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,39 @@ def get_sliding_windows() -> int:
102102

103103

104104
def init_cpu_threads_env(rank_id: int, world_size: int):
    """Set up CPU affinity and thread count for one inference rank.

    If the process is allowed to run on fewer CPUs than the system reports,
    the CPU set is treated as externally managed (taskset, numactl, cgroups,
    ...) and only the allowed CPUs are divided between ranks. Otherwise the
    rank is configured from the full set of CPUs.

    Args:
        rank_id: Zero-based index of this rank.
        world_size: Total number of ranks sharing the CPUs.
    """
    import psutil

    # Process.cpu_affinity is not provided by psutil on every platform;
    # without it external CPU management cannot be detected.
    get_affinity = getattr(psutil.Process(), "cpu_affinity", None)
    allowed_cpus = get_affinity() if get_affinity is not None else None

    # psutil.cpu_count() returns None when the count cannot be determined;
    # comparing an int against None would raise TypeError.
    total_cpus = psutil.cpu_count(logical=True)

    externally_restricted = (
        allowed_cpus is not None
        and total_cpus is not None
        and len(allowed_cpus) < total_cpus
    )
    if externally_restricted:
        _init_cpu_threads_env_use_allowed(rank_id, world_size, allowed_cpus)
    else:
        _init_cpu_threads_env_use_all(rank_id, world_size)
111+
112+
def _init_cpu_threads_env_use_allowed(rank_id: int, world_size: int, allowed_cpus: list):
    """Pin this rank to its share of an externally restricted CPU set.

    The number of CPUs (and torch threads) per rank is taken from the
    ``OMP_NUM_THREADS`` environment variable when it is set, capped to the
    number of allowed CPUs; otherwise the allowed CPUs are divided evenly
    between ranks. In both cases the result is at least 1.

    When the ``numa`` module is importable, the process affinity is narrowed
    to this rank's slice of ``allowed_cpus``. Memory binding is only read for
    logging and is never modified. Without ``numa`` the affinity is left
    untouched and only the torch thread count is set.

    Args:
        rank_id: Zero-based index of this rank.
        world_size: Total number of ranks sharing ``allowed_cpus``.
        allowed_cpus: CPU ids the process is currently allowed to run on.

    Raises:
        ValueError: If ``allowed_cpus`` is empty, or ``OMP_NUM_THREADS`` is
            set to something that is not an integer.
    """
    import importlib.util

    num_allowed = len(allowed_cpus)
    if num_allowed == 0:
        raise ValueError("allowed_cpus must contain at least one CPU")

    omp_num_threads = os.getenv("OMP_NUM_THREADS")
    if omp_num_threads is None:
        num_cpus_per_rank = max(num_allowed // world_size, 1)
    else:
        # Clamp to [1, num_allowed]: a zero or negative setting would give an
        # empty CPU slice and an invalid torch thread count.
        num_cpus_per_rank = max(min(int(omp_num_threads), num_allowed), 1)

    if importlib.util.find_spec("numa") is not None:
        import numa

        slice_info = f"slice {rank_id+1}/{world_size} of externally allowed {len(allowed_cpus)} CPUs"
        allowed_mems = numa.memory.get_membind_nodes()
        # With OMP_NUM_THREADS set, or more ranks than allowed CPUs, the
        # per-rank slices do not all fit in allowed_cpus. Wrap the start index
        # and pull it back from the end so every rank gets exactly
        # num_cpus_per_rank CPUs (overlapping with other ranks if needed)
        # instead of an empty or truncated slice.
        cpu_start = (num_cpus_per_rank * rank_id) % num_allowed
        cpu_start = min(cpu_start, num_allowed - num_cpus_per_rank)
        allowed_cpus_for_rank = allowed_cpus[cpu_start : cpu_start + num_cpus_per_rank]
        numa.schedule.run_on_cpus(0, *allowed_cpus_for_rank)
        effective_allowed_cpus = numa.schedule.get_affinitive_cpus(0)
    else:
        slice_info = "externally allowed, cannot import numa for slicing"
        allowed_mems = "n/a"
        effective_allowed_cpus = allowed_cpus

    num_threads = num_cpus_per_rank
    torch.set_num_threads(num_threads)
    logger.info(f"affinity={effective_allowed_cpus} ({slice_info}), membind={allowed_mems}, threads={num_threads}")
136+
137+
def _init_cpu_threads_env_use_all(rank_id: int, world_size: int):
105138
import importlib.util
106139

107140
if importlib.util.find_spec("numa") is not None:

0 commit comments

Comments
 (0)