@@ -158,7 +158,7 @@ def _sanitize_parameters(
158
158
max_length = None ,
159
159
continue_final_message = None ,
160
160
skip_special_tokens = None ,
161
- tokenizer_kwargs = None ,
161
+ tokenizer_encode_kwargs = None ,
162
162
** generate_kwargs ,
163
163
):
164
164
# preprocess kwargs
@@ -196,8 +196,8 @@ def _sanitize_parameters(
196
196
if continue_final_message is not None :
197
197
preprocess_params ["continue_final_message" ] = continue_final_message
198
198
199
- if tokenizer_kwargs is not None :
200
- preprocess_params ["tokenizer_kwargs " ] = tokenizer_kwargs
199
+ if tokenizer_encode_kwargs is not None :
200
+ preprocess_params ["tokenizer_encode_kwargs " ] = tokenizer_encode_kwargs
201
201
202
202
preprocess_params .update (generate_kwargs )
203
203
@@ -293,9 +293,9 @@ def __call__(self, text_inputs, **kwargs):
293
293
- `None` : default strategy where nothing in particular happens
294
294
- `"hole"`: Truncates left of input, and leaves a gap wide enough to let generation happen (might
295
295
truncate a lot of the prompt and is not suitable when generation exceeds the model capacity)
296
- tokenizer_kwargs (`dict`, *optional*):
297
- Additional keyword arguments to pass along to the tokenizer. If the text input is a chat, it is passed
298
- to `apply_chat_template`. Otherwise, it is passed to `__call__`.
296
+ tokenizer_encode_kwargs (`dict`, *optional*):
297
+ Additional keyword arguments to pass along to the encoding step of the tokenizer. If the text input is a
298
+ chat, it is passed to `apply_chat_template`. Otherwise, it is passed to `__call__`.
299
299
generate_kwargs (`dict`, *optional*):
300
300
Additional keyword arguments to pass along to the generate method of the model (see the generate method
301
301
corresponding to your framework [here](./text_generation)).
@@ -341,18 +341,18 @@ def preprocess(
341
341
padding = None ,
342
342
max_length = None ,
343
343
continue_final_message = None ,
344
- tokenizer_kwargs = None ,
344
+ tokenizer_encode_kwargs = None ,
345
345
** generate_kwargs ,
346
346
):
347
347
# Only set non-None tokenizer kwargs, so as to rely on the tokenizer's defaults
348
- base_tokenizer_kwargs = {
348
+ tokenizer_kwargs = {
349
349
"add_special_tokens" : add_special_tokens ,
350
350
"truncation" : truncation ,
351
351
"padding" : padding ,
352
352
"max_length" : max_length , # NOTE: `max_length` is also a `generate` arg. Use `tokenizer_encode_kwargs` to avoid a name clash
353
353
}
354
- base_tokenizer_kwargs = {key : value for key , value in base_tokenizer_kwargs .items () if value is not None }
355
- tokenizer_kwargs = { ** base_tokenizer_kwargs , ** ( tokenizer_kwargs or {})}
354
+ tokenizer_kwargs = {key : value for key , value in tokenizer_kwargs .items () if value is not None }
355
+ tokenizer_kwargs . update ( tokenizer_encode_kwargs or {})
356
356
357
357
if isinstance (prompt_text , Chat ):
358
358
tokenizer_kwargs .pop ("add_special_tokens" , None ) # ignore add_special_tokens on chats
0 commit comments