(KT) lmr@lmr-Rack-Server:~/AI/ktransformers/ktransformers$ python -m sglang.launch_server \
--host 0.0.0.0 \
--port 8080 \
--model /home/lmr/AI/models/glm4.7/config \
--trust-remote-code \
--mem-fraction-static 0.92 \
--chunked-prefill-size 4096 \
--served-model-name glm4.7 \
--enable-mixed-chunk \
--kt-method LLAMAFILE \
--kt-weight-path /home/lmr/AI/models/glm4.7/glm4.7-gguf/UD-Q4_K_XL \
--kt-cpuinfer 56 \
--kt-threadpool-count 2 \
--kt-num-gpu-experts 2 \
--kt-max-deferred-experts-per-token 2 \
--max-total-tokens 8192
[2026-06-09 20:27:20] WARNING model_config.py:1107: Transformers version 5.10.2 is used for model type glm4_moe. If you experience issues related to RoPE parameters, they may be due to incompatibilities between Transformers >=5.0.0 and some models. You can try downgrading to transformers==4.57.1 as a workaround.
[2026-06-09 20:27:20] INFO server_args.py:2123: Attention backend not specified. Use flashinfer backend by default.
[transformers] `BaseImageProcessorFast` is deprecated. The `Fast` suffix for image processors has been removed; use `BaseImageProcessor` instead.
[2026-06-09 20:27:21] server_args=ServerArgs(model_path='/home/lmr/AI/models/glm4.7/config', tokenizer_path='/home/lmr/AI/models/glm4.7/config', tokenizer_mode='auto', tokenizer_worker_num=1, skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=True, context_length=None, is_embedding=False, enable_multimodal=None, revision=None, model_impl='auto', host='0.0.0.0', port=8080, fastapi_root_path='', grpc_mode=False, skip_server_warmup=False, warmups=None, nccl_port=None, checkpoint_engine_wait_weights_before_ready=False, dtype='auto', quantization=None, quantization_param_path=None, kv_cache_dtype='auto', enable_fp32_lm_head=False, modelopt_quant=None, modelopt_checkpoint_restore_path=None, modelopt_checkpoint_save_path=None, modelopt_export_path=None, quantize_and_serve=False, rl_quant_profile=None, mem_fraction_static=0.92, max_running_requests=None, max_queued_requests=None, max_total_tokens=8192, chunked_prefill_size=4096, enable_dynamic_chunking=False, max_prefill_tokens=16384, prefill_max_requests=None, schedule_policy='fcfs', enable_priority_scheduling=False, abort_on_priority_when_disabled=False, schedule_low_priority_values_first=False, priority_scheduling_preemption_threshold=10, schedule_conservativeness=1.0, page_size=1, swa_full_tokens_ratio=0.8, disable_hybrid_swa_memory=False, radix_eviction_policy='lru', enable_prefill_delayer=False, prefill_delayer_max_delay_passes=30, prefill_delayer_token_usage_low_watermark=None, prefill_delayer_forward_passes_buckets=None, prefill_delayer_wait_seconds_buckets=None, device='cuda', tp_size=1, pp_size=1, pp_max_micro_batch_size=None, pp_async_batch_depth=0, stream_interval=1, stream_output=False, random_seed=893093240, constrained_json_whitespace_pattern=None, constrained_json_disable_any_whitespace=False, watchdog_timeout=300, soft_watchdog_timeout=None, dist_timeout=None, download_dir=None, model_checksum=None, base_gpu_id=0, gpu_id_step=1, sleep_on_idle=False, 
custom_sigquit_handler=None, log_level='info', log_level_http=None, log_requests=False, log_requests_level=2, log_requests_format='text', log_requests_target=None, uvicorn_access_log_exclude_prefixes=[], crash_dump_folder=None, show_time_cost=False, enable_metrics=False, enable_metrics_for_all_schedulers=False, tokenizer_metrics_custom_labels_header='x-custom-labels', tokenizer_metrics_allowed_custom_labels=None, extra_metric_labels=None, bucket_time_to_first_token=None, bucket_inter_token_latency=None, bucket_e2e_request_latency=None, collect_tokens_histogram=False, prompt_tokens_buckets=None, generation_tokens_buckets=None, gc_warning_threshold_secs=0.0, decode_log_interval=40, enable_request_time_stats_logging=False, kv_events_config=None, enable_trace=False, otlp_traces_endpoint='localhost:4317', export_metrics_to_file=False, export_metrics_to_file_dir=None, api_key=None, admin_api_key=None, served_model_name='glm4.7', weight_version='default', chat_template=None, hf_chat_template_name=None, completion_template=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, tool_call_parser=None, tool_server=None, sampling_defaults='model', dp_size=1, load_balance_method='round_robin', attn_cp_size=1, moe_dp_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, enable_lora=None, enable_lora_overlap_loading=None, max_lora_rank=None, lora_target_modules=None, lora_paths=None, max_loaded_loras=None, max_loras_per_batch=8, lora_eviction_policy='lru', lora_backend='csgmv', max_lora_chunk_size=16, attention_backend='flashinfer', decode_attention_backend=None, prefill_attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', mm_attention_backend=None, fp8_gemm_runner_backend='auto', fp4_gemm_runner_backend='flashinfer_cutlass', nsa_prefill_backend=None, nsa_decode_backend=None, disable_flashinfer_autotune=False, mamba_backend='triton', 
speculative_algorithm=None, speculative_draft_model_path=None, speculative_draft_model_revision=None, speculative_draft_load_format=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, speculative_attention_mode='prefill', speculative_draft_attention_backend=None, speculative_moe_runner_backend='auto', speculative_moe_a2a_backend=None, speculative_draft_model_quantization=None, speculative_ngram_min_match_window_size=1, speculative_ngram_max_match_window_size=12, speculative_ngram_min_bfs_breadth=1, speculative_ngram_max_bfs_breadth=10, speculative_ngram_match_type='BFS', speculative_ngram_branch_length=18, speculative_ngram_capacity=10000000, enable_multi_layer_eagle=False, ep_size=1, moe_a2a_backend='none', moe_runner_backend='auto', flashinfer_mxfp4_moe_precision='default', enable_flashinfer_allreduce_fusion=False, enable_aiter_allreduce_fusion=False, deepep_mode='auto', ep_num_redundant_experts=0, ep_dispatch_algorithm=None, init_expert_location='trivial', enable_eplb=False, eplb_algorithm='auto', eplb_rebalance_num_iterations=1000, eplb_rebalance_layers_per_chunk=None, eplb_min_rebalancing_utilization_threshold=1.0, expert_distribution_recorder_mode=None, expert_distribution_recorder_buffer_size=1000, enable_expert_distribution_metrics=False, deepep_config=None, moe_dense_tp_size=None, elastic_ep_backend=None, mooncake_ib_device=None, max_mamba_cache_size=None, mamba_ssm_dtype=None, mamba_full_memory_ratio=0.9, mamba_scheduler_strategy='no_buffer', mamba_track_interval=256, linear_attn_backend='triton', linear_attn_decode_backend=None, linear_attn_prefill_backend=None, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through', hicache_io_backend='kernel', hicache_mem_layout='layer_first', disable_hicache_numa_detect=False, hicache_storage_backend=None, 
hicache_storage_prefetch_policy='best_effort', hicache_storage_backend_extra_config=None, enable_hisparse=False, hierarchical_sparse_attention_extra_config=None, enable_lmcache=False, kt_weight_path='/home/lmr/AI/models/glm4.7/glm4.7-gguf/UD-Q4_K_XL', kt_method='LLAMAFILE', kt_cpuinfer=56, kt_threadpool_count=2, kt_numa_nodes=None, kt_num_gpu_experts=2, kt_gpu_experts_ratio=None, kt_max_deferred_experts_per_token=2, kt_gpu_prefill_token_threshold=None, record_kt_gpu_expert_distribution=False, kt_enable_dynamic_expert_update=False, kt_expert_placement_strategy='uniform', kt_lora_path=None, kt_expert_lora_path=None, dllm_algorithm=None, dllm_algorithm_config=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, cpu_offload_gb=0, offload_group_size=-1, offload_num_in_group=1, offload_prefetch_step=1, offload_mode='cpu', multi_item_scoring_delimiter=None, disable_radix_cache=False, cuda_graph_max_bs=24, cuda_graph_bs=[1, 2, 4, 8, 12, 16, 24], disable_cuda_graph=False, disable_cuda_graph_padding=False, enable_profile_cuda_graph=False, enable_cudagraph_gc=False, enable_layerwise_nvtx_marker=False, enable_nccl_nvls=False, enable_symm_mem=False, disable_flashinfer_cutlass_moe_fp4_allgather=False, enable_tokenizer_batch_encode=False, disable_tokenizer_batch_decode=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_mscclpp=False, enable_torch_symm_mem=False, disable_overlap_schedule=False, enable_mixed_chunk=True, enable_dp_attention=False, enable_dp_lm_head=False, enable_two_batch_overlap=False, enable_single_batch_overlap=False, tbo_token_distribution_threshold=0.48, enable_torch_compile=False, enable_piecewise_cuda_graph=False, enable_torch_compile_debug_mode=False, torch_compile_max_bs=32, piecewise_cuda_graph_max_tokens=4096, piecewise_cuda_graph_tokens=[4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 
176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048, 2304, 2560, 2816, 3072, 3328, 3584, 3840, 4096], piecewise_cuda_graph_compiler='eager', torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, triton_attention_split_tile_size=None, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, enable_weights_cpu_backup=False, enable_draft_weights_cpu_backup=False, allow_auto_truncate=False, enable_custom_logit_processor=False, flashinfer_mla_disable_ragged=False, disable_shared_experts_fusion=False, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, keep_mm_feature_on_device=False, enable_return_hidden_states=False, enable_return_routed_experts=False, enable_return_indexer_topk=False, scheduler_recv_interval=1, numa_node=None, enable_deterministic_inference=False, rl_on_policy_target=None, enable_attn_tp_input_scattered=False, enable_nsa_prefill_context_parallel=False, nsa_prefill_cp_mode='round-robin-split', enable_fused_qk_norm_rope=False, enable_precise_embedding_interpolation=False, enable_dynamic_batch_tokenizer=False, dynamic_batch_tokenizer_batch_size=32, dynamic_batch_tokenizer_batch_timeout=0.002, debug_tensor_dump_output_folder=None, debug_tensor_dump_layers=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, disaggregation_mode='null', disaggregation_transfer_backend='mooncake', disaggregation_bootstrap_port=8998, disaggregation_decode_tp=None, disaggregation_decode_dp=None, disaggregation_prefill_pp=1, disaggregation_ib_device=None, disaggregation_decode_enable_offload_kvcache=False, num_reserved_decode_tokens=512, disaggregation_decode_polling_interval=1, encoder_only=False, language_only=False, encoder_transfer_backend='zmq_to_scheduler', encoder_urls=[], custom_weight_loader=[], 
weight_loader_disable_mmap=False, remote_instance_weight_loader_seed_instance_ip=None, remote_instance_weight_loader_seed_instance_service_port=None, remote_instance_weight_loader_send_weights_group_ports=None, remote_instance_weight_loader_backend='nccl', remote_instance_weight_loader_start_seed_via_transfer_engine=False, enable_pdmux=False, pdmux_config_path=None, sm_group_num=8, mm_max_concurrent_calls=32, mm_per_request_timeout=10.0, enable_broadcast_mm_inputs_process=False, enable_prefix_mm_cache=False, mm_enable_dp_encoder=False, mm_process_config={}, limit_mm_data_per_request=None, enable_mm_global_cache=False, decrypted_config_file=None, decrypted_draft_config_file=None, forward_hooks=None)
[2026-06-09 20:27:21] Transformers version 5.10.2 is used for model type glm4_moe. If you experience issues related to RoPE parameters, they may be due to incompatibilities between Transformers >=5.0.0 and some models. You can try downgrading to transformers==4.57.1 as a workaround.
[2026-06-09 20:27:23] Using default HuggingFace chat template with detected content format: openai
[transformers] `BaseImageProcessorFast` is deprecated. The `Fast` suffix for image processors has been removed; use `BaseImageProcessor` instead.
[transformers] `BaseImageProcessorFast` is deprecated. The `Fast` suffix for image processors has been removed; use `BaseImageProcessor` instead.
[2026-06-09 20:27:28] Transformers version 5.10.2 is used for model type glm4_moe. If you experience issues related to RoPE parameters, they may be due to incompatibilities between Transformers >=5.0.0 and some models. You can try downgrading to transformers==4.57.1 as a workaround.
[2026-06-09 20:27:30] Mamba selective_state_update backend initialized: triton
[2026-06-09 20:27:30] Transformers version 5.10.2 is used for model type glm4_moe. If you experience issues related to RoPE parameters, they may be due to incompatibilities between Transformers >=5.0.0 and some models. You can try downgrading to transformers==4.57.1 as a workaround.
[2026-06-09 20:27:30] Init torch distributed begin.
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[2026-06-09 20:27:30] Init torch distributed ends. elapsed=0.18 s, mem usage=0.06 GB
[2026-06-09 20:27:30] In import_model_classes: Ignore import error when loading sglang.srt.models.deepseek_v4: No module named 'tilelang'
[2026-06-09 20:27:30] In import_model_classes: Ignore import error when loading sglang.srt.models.deepseek_v4_nextn: No module named 'tilelang'
[2026-06-09 20:27:31] Load weight begin. avail mem=22.47 GB
[2026-06-09 20:27:31] Shared experts fusion optimization enabled.
[2026-06-09 20:27:31] Using kt_num_gpu_experts=2 per layer, total GPU experts: 178 (= 2 × 89 MoE layers)
[2026-06-09 20:27:31] Using uniform strategy for GPU expert placement
[2026-06-09 20:27:31] KT GPU experts: layer 3 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 4 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 5 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 6 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 7 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 8 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 9 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 10 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 11 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 12 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 13 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 14 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 15 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 16 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 17 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 18 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 19 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 20 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 21 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 22 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 23 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 24 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 25 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 26 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 27 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 28 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 29 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 30 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 31 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 32 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 33 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 34 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 35 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 36 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 37 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 38 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 39 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 40 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 41 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 42 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 43 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 44 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 45 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 46 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 47 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 48 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 49 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 50 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 51 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 52 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 53 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 54 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 55 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 56 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 57 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 58 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 59 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 60 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 61 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 62 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 63 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 64 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 65 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 66 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 67 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 68 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 69 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 70 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 71 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 72 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 73 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 74 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 75 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 76 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 77 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 78 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 79 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 80 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 81 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 82 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 83 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 84 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 85 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 86 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 87 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 88 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 89 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 90 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] KT GPU experts: layer 91 (MoE) has 2 GPU experts
[2026-06-09 20:27:31] Generated KT GPU experts masks using 'uniform' strategy: 89 MoE layers (out of 92 total layers) x 160 experts, total GPU experts in MoE layers = 178
[2026-06-09 20:27:31] [KT] Created shared staging buffer: 40.0 MiB (shape=torch.Size([4096, 5120]), dtype=torch.bfloat16)
[GGUFLoader] Loading GGUF files from directory: /home/lmr/AI/models/glm4.7/glm4.7-gguf/UD-Q4_K_XL
Loading: GLM-4.7-UD-Q4_K_XL-00001-of-00005.gguf
Loading: GLM-4.7-UD-Q4_K_XL-00002-of-00005.gguf
Loading: GLM-4.7-UD-Q4_K_XL-00003-of-00005.gguf
Loading: GLM-4.7-UD-Q4_K_XL-00004-of-00005.gguf
Loading: GLM-4.7-UD-Q4_K_XL-00005-of-00005.gguf
[GGUFLoader] Summary:
Files loaded: 5
Total tensors: 1761
Metadata keys: 63
[LlamafileMoEWrapper] Layer 3 TP configuration:
intermediate_size: 1536
threadpool_count: 2
QK_K: 256
Total blocks: 6
Base blocks per TP: 3
Extra blocks (distributed to first TPs): 0
TP 0: size=768, offset=0, blocks=3
TP 1: size=768, offset=768, blocks=3
[2026-06-09 20:27:43] Scheduler hit an exception: Traceback (most recent call last):
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/managers/scheduler.py", line 3207, in run_scheduler_process
scheduler = Scheduler(
^^^^^^^^^^
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/managers/scheduler.py", line 367, in __init__
self.init_model_worker()
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/managers/scheduler.py", line 563, in init_model_worker
self.init_tp_model_worker()
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/managers/scheduler.py", line 521, in init_tp_model_worker
self.tp_worker = TpModelWorker(
^^^^^^^^^^^^^^
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/managers/tp_worker.py", line 247, in __init__
self._init_model_runner()
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/managers/tp_worker.py", line 330, in _init_model_runner
self._model_runner = ModelRunner(
^^^^^^^^^^^^
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/model_executor/model_runner.py", line 418, in __init__
self.initialize()
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/model_executor/model_runner.py", line 501, in initialize
self.load_model()
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/model_executor/model_runner.py", line 1038, in load_model
self.model = self.loader.load_model(
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/model_loader/loader.py", line 671, in load_model
model = _initialize_model(
^^^^^^^^^^^^^^^^^^
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/model_loader/loader.py", line 277, in _initialize_model
return model_class(**kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/models/glm4_moe.py", line 1031, in __init__
self.model = Glm4MoeModel(
^^^^^^^^^^^^^
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/models/glm4_moe.py", line 920, in __init__
self.layers, self.start_layer, self.end_layer = make_layers(
^^^^^^^^^^^^
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/utils/common.py", line 648, in make_layers
+ get_offloader().wrap_modules(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/utils/offloader.py", line 36, in wrap_modules
return list(all_modules_generator)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/utils/common.py", line 650, in <genexpr>
layer_fn(idx=idx, prefix=add_prefix(idx, prefix))
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/models/glm4_moe.py", line 922, in <lambda>
lambda idx, prefix: Glm4MoeDecoderLayer(
^^^^^^^^^^^^^^^^^^^^
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/models/glm4_moe.py", line 741, in __init__
self.mlp = Glm4MoeSparseMoeBlock(
^^^^^^^^^^^^^^^^^^^^^^
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/models/glm4_moe.py", line 375, in __init__
self.experts = get_moe_impl_class(quant_config)(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/layers/moe/fused_moe_triton/layer.py", line 291, in __init__
self.quant_method.create_weights(
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/sglang/srt/layers/moe/kt_ep_wrapper.py", line 2525, in create_weights
self.wrapper = KTMoEWrapper(
^^^^^^^^^^^^^
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/kt_kernel/experts.py", line 207, in __new__
return _create_inference_wrapper(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/kt_kernel/experts.py", line 360, in _create_inference_wrapper
return backend_cls(
^^^^^^^^^^^^
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/kt_kernel/utils/llamafile.py", line 123, in __init__
super().__init__(
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/kt_kernel/experts_base.py", line 290, in __init__
self.gpu_experts_mask.copy_(gpu_experts_mask)
File "/home/lmr/miniconda3/envs/KT/lib/python3.11/site-packages/torch/utils/_device.py", line 103, in __torch_function__
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
RuntimeError: The size of tensor a (161) must match the size of tensor b (160) at non-singleton dimension 0
[2026-06-09 20:27:43] Received sigquit from a child process. It usually means the child failed.
Reminder
System Info
1× RTX 4090 + 2× AMD EPYC 7532 + 256 GB DDR4
run command:
python -m sglang.launch_server \
--host 0.0.0.0 \
--port 8080 \
--model /home/lmr/AI/models/glm4.7/config \
--trust-remote-code \
--mem-fraction-static 0.92 \
--chunked-prefill-size 4096 \
--served-model-name glm4.7 \
--enable-mixed-chunk \
--kt-method LLAMAFILE \
--kt-weight-path /home/lmr/AI/models/glm4.7/glm4.7-gguf/UD-Q4_K_XL \
--kt-cpuinfer 56 \
--kt-threadpool-count 2 \
--kt-num-gpu-experts 2 \
--kt-max-deferred-experts-per-token 2 \
--max-total-tokens 8192
Reproduction
Others
No response