     xgr_installed = False
     pass
 
+from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding.utils import (convert_lark_to_gbnf,
                                                        grammar_is_likely_lark)
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
@@ -29,6 +30,8 @@
     from vllm.config import ModelConfig
     from vllm.sampling_params import GuidedDecodingParams
 
+logger = init_logger(__name__)
+
 
 # TODO: passing batch size to max threads here
 def get_local_xgrammar_guided_decoding_logits_processor(
@@ -161,6 +164,7 @@ class GrammarConfig:
     json_str: str | None = None
     grammar_str: str | None = None
     json_object: bool | None = None
+    any_whitespace: bool = True
     max_threads: int = 8
     tokenizer_data: TokenizerData | None = None
 
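Note on the new `any_whitespace` field: it lives on the serializable `GrammarConfig` (not only on the frontend params) so the lazily compiled grammar in the worker-side `_ensure_ctx` below sees the same setting. A minimal sketch of constructing the config directly, assuming `GrammarConfig` is a plain dataclass and using only fields visible in this diff (values are placeholders):

    # Hypothetical values; tokenizer_hash and vocab_size are the required
    # fields seen in the return cls(...) call below, the rest default.
    config = GrammarConfig(tokenizer_hash=hash("tokenizer"),
                           vocab_size=32000,
                           json_str='{"type": "object"}',
                           any_whitespace=False)
    assert config.any_whitespace is False
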
@@ -180,19 +184,42 @@ def from_guided_params(cls,
             else:
                 json_str = guided_params.json
 
+            any_whitespace = 'disable-any-whitespace' not in \
+                guided_params.backend_options()
+
+            # Check and log if the model is known to have a history of
+            # runaway whitespace generation with xgrammar.
+            # References:
+            # https://github.com/vllm-project/vllm/pull/12744
+            # https://github.com/mlc-ai/xgrammar/issues/212
+            model_with_warn = None
+
+            if 'Mistral' in model_config.model:
+                model_with_warn = 'Mistral'
+            elif 'Qwen' in model_config.model:
+                model_with_warn = 'Qwen'
+
+            if model_with_warn is not None and any_whitespace:
+                msg = (f"{model_with_warn} "
+                       f"model detected, consider setting "
+                       f"`guided_backend=xgrammar:disable-any-whitespace` "
+                       f"to prevent runaway generation of whitespaces.")
+                logger.info_once(msg)
             # Validate the schema and raise ValueError here if it is invalid.
             # This is to avoid exceptions in model execution, which will crash
             # the engine worker process.
             try:
-                xgr.Grammar.from_json_schema(json_str)
+                xgr.Grammar.from_json_schema(json_str,
+                                             any_whitespace=any_whitespace)
             except RuntimeError as err:
                 raise ValueError(str(err)) from err
 
             return cls(json_str=json_str,
                        vocab_size=model_config.hf_text_config.vocab_size,
                        tokenizer_hash=tokenizer_hash,
                        max_threads=max_threads,
-                       tokenizer_data=tokenizer_data)
+                       tokenizer_data=tokenizer_data,
+                       any_whitespace=any_whitespace)
         elif guided_params.grammar:
             # XGrammar only supports GBNF grammars, so we must convert Lark
             if grammar_is_likely_lark(guided_params.grammar):
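For context, not part of the diff: the `disable-any-whitespace` option reaches this code through the backend string on the guided decoding params. A minimal sketch, assuming `GuidedDecodingParams.backend_options()` returns the option tokens after the `:` in the backend string (that helper is used but not defined in this diff):

    from vllm.sampling_params import GuidedDecodingParams

    params = GuidedDecodingParams(
        json='{"type": "object"}',
        backend="xgrammar:disable-any-whitespace",
    )
    # from_guided_params then disables flexible whitespace:
    any_whitespace = 'disable-any-whitespace' not in params.backend_options()
    assert any_whitespace is False
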
@@ -290,7 +317,10 @@ def _ensure_ctx(self):
         if self.ctx is None:
             compiler = GrammarCompilerCache.get_compiler(self.config)
             if self.config.json_str is not None:
-                self.ctx = compiler.compile_json_schema(self.config.json_str)
+                any_whitespace = self.config.any_whitespace
+                self.ctx = compiler \
+                    .compile_json_schema(self.config.json_str,
+                                         any_whitespace=any_whitespace)
             elif self.config.grammar_str is not None:
                 self.ctx = compiler.compile_grammar(self.config.grammar_str)
             elif self.config.json_object:
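To illustrate what the flag changes at the xgrammar level, here is a small sketch mirroring the validation call above: `any_whitespace=True` (the new default) compiles a grammar that accepts arbitrary whitespace between JSON tokens, while `False` pins whitespace to a canonical layout, avoiding the runaway-whitespace failure mode referenced in the comments.

    import xgrammar as xgr

    schema = '{"type": "object", "properties": {"a": {"type": "integer"}}}'

    # Both calls validate the schema; they differ only in how much
    # whitespace the resulting grammar tolerates between JSON tokens.
    flexible = xgr.Grammar.from_json_schema(schema, any_whitespace=True)
    strict = xgr.Grammar.from_json_schema(schema, any_whitespace=False)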