Skip to content

vllm.tool_parsers.apertus_tool_parser

Tool call parser for Apertus models.

Extracts tool calls from the format: <|tools_prefix|>[{"function_name": {"arg1": "value1", ...}}, ...]<|tools_suffix|>

Used when --enable-auto-tool-choice --tool-call-parser apertus are set.

ApertusToolParser

Bases: ToolParser

Tool call parser for Apertus models.

Handles the extraction of tool calls from text in both non-streaming (complete string) and streaming (chunked token) environments.

The expected Apertus function call format is a JSON array of single-key dictionaries sandwiched between special tokens: <|tools_prefix|>[{"function_name": {"arg1": "value1"}}, ...]<|tools_suffix|>

Examples:

>>> tokenizer = ...  # Mock tokenizer
>>> parser = ApertusToolParser(tokenizer)
>>> output = 'I will check. <|tools_prefix|>[{"get_weather": {"city": "Paris"}}]<|tools_suffix|>'
>>> request = ChatCompletionRequest(...)
>>> info = parser.extract_tool_calls(output, request)
>>> info.content
"I will check."
>>> info.tool_calls[0].function.name
"get_weather"
>>> info.tool_calls[0].function.arguments
'{"city": "Paris"}'
Source code in vllm/tool_parsers/apertus_tool_parser.py
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
class ApertusToolParser(ToolParser):
    """
    Tool call parser for Apertus models.

    Handles the extraction of tool calls from text in both non-streaming
    (complete string) and streaming (chunked token) environments.

    The expected Apertus function call format is a JSON array of single-key dictionaries
    sandwiched between special tokens:
    `<|tools_prefix|>[{"function_name": {"arg1": "value1"}}, ...]<|tools_suffix|>`

    Examples:
        >>> tokenizer = ...  # Mock tokenizer
        >>> parser = ApertusToolParser(tokenizer)
        >>> output = (
        ...     'I will check. <|tools_prefix|>[{"get_weather": '
        ...     '{"city": "Paris"}}]<|tools_suffix|>'
        ... )
        >>> request = ChatCompletionRequest(...)
        >>> info = parser.extract_tool_calls(output, request)
        >>> info.content
        'I will check.'
        >>> info.tool_calls[0].function.name
        'get_weather'
        >>> info.tool_calls[0].function.arguments
        '{"city": "Paris"}'
    """

    def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
        """
        Initializes the ApertusToolParser.

        Args:
            tokenizer: The model's tokenizer.
                Must be provided to interact with special tokens.
            tools: Optional list of tools available for the current request.

        Raises:
            ValueError: If `self.model_tokenizer` is falsy after the base
                class constructor has run.
        """
        super().__init__(tokenizer, tools)

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ToolParser "
                "constructor during construction."
            )
        # Regex to extract tool calls block (suffix is optional for incomplete outputs)
        self.tool_call_regex = re.compile(
            rf"{re.escape(TOOL_CALLS_PREFIX)}"
            rf"(.*?)"
            rf"(?:{re.escape(TOOL_CALLS_SUFFIX)}|$)",
            re.DOTALL,
        )

        self._reset_streaming_state()

    def _reset_streaming_state(self) -> None:
        """
        Resets all streaming state variables for a new completion request.

        This clears the delta text buffer and resets the pointers used to
        track the currently streaming tool index and arguments. Called
        during initialization and should be called between separate streams.
        """
        # Tail of the stream that might be the start of a special tag.
        self.buffered_delta_text = ""
        # Index of the tool currently being streamed; -1 means "none yet".
        self.current_tool_id = -1
        self.current_tool_name_sent = False
        # Per-tool record of the argument text already sent to the client.
        self.streamed_args_for_tool: list[str] = []

    def adjust_request(
        self, request: ChatCompletionRequest | ResponsesRequest
    ) -> ChatCompletionRequest | ResponsesRequest:
        """
        Adjusts the generation request to ensure special tool tokens are not skipped.

        Forces `skip_special_tokens=False` when the request carries tools and
        `tool_choice` is not `"none"`, so the tool tags reach this parser.

        Args:
            request: The incoming OpenAI-compatible request.

        Returns:
            The potentially modified request.
        """
        request = super().adjust_request(request)
        if request.tools and request.tool_choice != "none":
            request.skip_special_tokens = False
        return request

    def _buffer_delta_text(self, delta_text: str) -> str:
        """
        Buffers incoming delta chunks to prevent
        fragmentation of multi-token special tags.

        If the accumulated text ends with a partial match of
        `<|tools_prefix|>` or `<|tools_suffix|>`,
        it holds that part back until the next chunk clarifies if it's the actual tag
        or just normal text.

        Args:
            delta_text: The newly generated text chunk

        Returns:
            The accumulated text minus any held-back partial tag.

        Examples:
            >>> parser = ApertusToolParser(...)
            >>> parser._buffer_delta_text("Let me check <|tool")
            'Let me check '
            >>> # "<|tool" is now buffered internally
            >>> parser._buffer_delta_text("s_prefix|>")
            '<|tools_prefix|>'
            >>> # Buffer released once the tag is complete
        """
        self.buffered_delta_text += delta_text
        text = self.buffered_delta_text

        for tag in (TOOL_CALLS_PREFIX, TOOL_CALLS_SUFFIX):
            if text.endswith(tag):
                self.buffered_delta_text = ""
                return text

            # Evaluate longest possible partial match first
            for i in range(len(tag) - 1, 0, -1):
                if text.endswith(tag[:i]):
                    self.buffered_delta_text = text[-i:]
                    return text[:-i]

        self.buffered_delta_text = ""
        return text

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """
        Extracts tool calls from a completely generated model response (Non-Streaming).

        Args:
            model_output: The full completion string generated by the model.
            request: The current chat completion request (not read here).

        Returns:
            An `ExtractedToolCallInformation` object containing normal text content
            and a list of fully formatted `ToolCall` objects. If no tool block is
            found, or parsing raises, the whole `model_output` is returned as
            content with `tools_called=False`.

        Examples:
            >>> output = (
            ...     'Let me see. <|tools_prefix|>[{"get_weather":'
            ...     '{"loc": "Paris"}}]<|tools_suffix|>'
            ... )
            >>> info = parser.extract_tool_calls(output, request)
            >>> info.tools_called
            True
            >>> info.content
            'Let me see.'
            >>> info.tool_calls[0].function.name
            'get_weather'
        """
        match = self.tool_call_regex.search(model_output)
        if not match:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        try:
            # group(1) might contain trailing text if the suffix is missing
            matched_text = match.group(1)
            stripped_text = matched_text.lstrip()

            try:
                # Use raw_decode to robustly isolate
                # the valid JSON array from any trailing garbage
                parsed_json, idx = json.JSONDecoder().raw_decode(stripped_text)
                trailing_in_group = stripped_text[idx:]
            except json.JSONDecodeError:
                # Fallback sequentially to partial parser for token-truncated requests
                parsed_json, _ = partial_json_loads(matched_text, Allow.ALL)
                trailing_in_group = ""

            # Tolerate a bare object instead of the expected array.
            if not isinstance(parsed_json, list):
                parsed_json = [parsed_json] if parsed_json else []

            tool_calls: list[ToolCall] = []
            for obj in parsed_json:
                if isinstance(obj, dict) and obj:
                    # Each entry is {"function_name": {...args}}; only the
                    # first key/value pair is used.
                    name, args = next(iter(obj.items()))
                    tool_calls.append(
                        ToolCall(
                            type="function",
                            id=make_tool_call_id(),
                            function=FunctionCall(
                                name=name,
                                arguments=json.dumps(args, ensure_ascii=False),
                            ),
                        )
                    )

            # Content combines any generated text
            # prior to and safely after the tool block
            content_str = model_output[: match.start()].strip()

            # Surface any hallucinated text inside
            # the regex group (due to missing suffix)
            if trailing_in_group.strip():
                trailing = trailing_in_group.replace(TOOL_CALLS_SUFFIX, "").strip()
                if trailing:
                    content_str = (content_str + "\n" + trailing).strip()

            # Surface text natively generated after the explicit suffix
            after_suffix = (
                model_output[match.end() :].replace(TOOL_CALLS_SUFFIX, "").strip()
            )
            if after_suffix:
                content_str = (content_str + "\n" + after_suffix).strip()

            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=tool_calls,
                content=content_str if content_str else None,
            )

        except Exception:
            # Best effort: on any parsing failure, hand the raw text back as content.
            logger.exception("Error extracting tool calls from Apertus response")
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """
        Handles streaming chunks

        Only `current_text` and `delta_text` are read; the remaining
        parameters are accepted but unused by this method.

        Args:
            previous_text: The complete model text generated prior to this chunk.
            current_text: The complete model text including this chunk.
            delta_text: The incremental text addition.
            previous_token_ids: Tokens generated prior to this chunk.
            current_token_ids: Total tokens generated.
            delta_token_ids: Incremental token additions.
            request: The chat completion request.

        Returns:
            A `DeltaMessage` with updated content or tool argument diffs, or `None` if
            the chunk shouldn't emit visible changes yet (e.g. it was purely buffered)
            or if extraction raised (the error is logged).

        Examples:
            Illustrative only: argument diffs are computed on the re-serialised
            (`json.dumps`) arguments, not on the raw delta text, and the exact
            split depends on what the partial JSON parser returns for
            truncated input.

            >>> prev = '<|tools_prefix|>[{"get_weather": {"location": "Par'
            >>> cur = '<|tools_prefix|>[{"get_weather": {"location": "Paris"}}'
            >>> delta = 'is"}}'
            >>> msg = parser.extract_tool_calls_streaming(
            ...     prev, cur, delta, ..., request
            ... )
            >>> msg.tool_calls[0].function.arguments  # closing chars held back
            'is'
        """
        delta_text = self._buffer_delta_text(delta_text)
        if not delta_text:
            return None

        # Fast path: normal text generation before any tools are invoked
        if TOOL_CALLS_PREFIX not in current_text:
            return DeltaMessage(content=delta_text)

        try:
            return self._extract_streaming(current_text, delta_text)
        except Exception:
            logger.exception("Error in Apertus streaming tool call extraction")
            return None

    def _extract_streaming(
        self, current_text: str, delta_text: str
    ) -> DeltaMessage | None:
        """
        Core streaming logic.
        Separates visible chat text from JSON blocks and computes diffs.

        Args:
            current_text: The full generated output string so far.
            delta_text: The latest chunk of text to surface. When called from
                `extract_tool_calls_streaming` this is the buffered delta, which
                may end before `current_text` does (a partial tag is held back
                in `self.buffered_delta_text`).

        Returns:
            A `DeltaMessage` containing the `content` delta and/or `tool_calls` delta,
            or `None` when there is nothing to emit.
        """
        prefix_idx = current_text.rfind(TOOL_CALLS_PREFIX)
        suffix_idx = current_text.rfind(TOOL_CALLS_SUFFIX)

        # Inside a tool block when the last prefix comes after the last suffix.
        is_inside_tools = prefix_idx > suffix_idx

        json_completed = False
        json_end_idx: int | None = None

        # Check if the JSON array successfully closed implicitly
        if is_inside_tools:
            json_start = prefix_idx + len(TOOL_CALLS_PREFIX)
            s = current_text[json_start:].lstrip()
            try:
                # If raw_decode succeeds,
                # the JSON array is fully formed and implicitly closed
                _, idx = json.JSONDecoder().raw_decode(s)
                json_end_idx = len(current_text) - len(s) + idx
                json_completed, is_inside_tools = True, False
            except Exception:
                # Deliberate: a decode failure just means the array is still open.
                pass

        just_finished = (TOOL_CALLS_SUFFIX in delta_text) or json_completed

        # 1. Fast path: Output normal text immediately
        # if we are completely outside tool block constraints
        if not is_inside_tools and not just_finished:
            text = delta_text.replace(TOOL_CALLS_PREFIX, "").replace(
                TOOL_CALLS_SUFFIX, ""
            )
            return DeltaMessage(content=text) if text else None

        # 2. Extract leading and trailing normal text directly adjacent to tool blocks
        content_str = ""
        if TOOL_CALLS_PREFIX in delta_text:
            content_str += delta_text.split(TOOL_CALLS_PREFIX)[0].replace(
                TOOL_CALLS_SUFFIX, ""
            )

        if just_finished:
            if json_completed and json_end_idx is not None:
                # The tool block finished in this chunk via implicit JSON completion
                # Ensure we strictly isolate
                # and extract only trailing text that is part of `delta_text`.
                #
                # `current_text` may end with a partial tag that
                # `_buffer_delta_text` is still holding back; that tail is not
                # part of `delta_text` and must not be surfaced as content
                # (it would leak tag fragments or be emitted twice).
                visible_end = len(current_text)
                held_back = self.buffered_delta_text
                if held_back and current_text.endswith(held_back):
                    visible_end -= len(held_back)

                delta_start_idx = visible_end - len(delta_text)
                content_start = max(json_end_idx, delta_start_idx)
                if content_start < visible_end:
                    content_str += current_text[content_start:visible_end].replace(
                        TOOL_CALLS_SUFFIX, ""
                    )
            else:
                content_str += delta_text.split(TOOL_CALLS_SUFFIX)[-1]

        # 3. Extract the isolated JSON array string for the active block
        json_start = prefix_idx + len(TOOL_CALLS_PREFIX)
        # json_end is None while the block is still open, i.e. slice to the end.
        json_end = suffix_idx if suffix_idx > prefix_idx else json_end_idx
        json_str = current_text[json_start:json_end]

        tool_calls = self._parse_and_diff_json(json_str, is_final=not is_inside_tools)

        if tool_calls or content_str:
            return DeltaMessage(
                content=content_str if content_str else None,
                tool_calls=tool_calls if tool_calls else None,
            )

        return None

    def _parse_and_diff_json(
        self, json_str: str, is_final: bool
    ) -> list[DeltaToolCall]:
        """
        Parses an isolated, potentially incomplete streaming JSON array and returns
        newly accumulated tool call diffs.

        Args:
            json_str: The extracted JSON array string so far
                (e.g. `[{"weather": {"city": "Par`).
            is_final: True once the tool block is closed, either by
                `<|tools_suffix|>` or because the JSON array decoded completely.

        Returns:
            A list of `DeltaToolCall`
            items representing string diffs in function arguments
            to stream back to the client. Empty if the text cannot be parsed
            yet or contains no entries.
        """
        try:
            parsed, _ = partial_json_loads(json_str, Allow.ALL)
            if not isinstance(parsed, list):
                parsed = [parsed] if parsed else []
        except Exception:
            # Not parseable yet; wait for more text.
            return []

        if not parsed:
            return []

        tool_calls: list[DeltaToolCall] = []
        latest_index = len(parsed) - 1

        # Catch up and finalize any tools we fully skipped over in one large text delta
        while self.current_tool_id < latest_index:
            if self.current_tool_id >= 0:
                if not self.current_tool_name_sent:
                    self._emit_tool_name(parsed, self.current_tool_id, tool_calls)

                # A newer entry exists, so this one is complete: flush it fully.
                delta = self._get_tool_diff(parsed, self.current_tool_id, is_final=True)
                if delta:
                    tool_calls.append(delta)

            self.current_tool_id += 1
            self.current_tool_name_sent = False
            while len(self.streamed_args_for_tool) <= self.current_tool_id:
                self.streamed_args_for_tool.append("")

        # Stream the currently active tool
        if self.current_tool_id >= 0:
            if not self.current_tool_name_sent:
                self._emit_tool_name(parsed, self.current_tool_id, tool_calls)

            delta = self._get_tool_diff(parsed, self.current_tool_id, is_final)
            if delta:
                tool_calls.append(delta)

        return tool_calls

    def _emit_tool_name(
        self, parsed: list, index: int, tool_calls: list[DeltaToolCall]
    ) -> None:
        """
        Extracts and emits the function name mapped to a new tool call ID.

        Does nothing when `parsed[index]` is not a non-empty dict.

        Args:
            parsed: The partially parsed JSON list containing tool dictionaries.
            index: The active index within the JSON list.
            tool_calls: The running list of delta chunks to mutate.

        Examples:
            Appends `DeltaToolCall(index=0,
                function=DeltaFunctionCall(name="get_weather", ...))`
            to the `tool_calls` list and marks the name as sent.
        """
        obj = parsed[index]
        if isinstance(obj, dict) and obj:
            # The first key of the entry is the function name.
            name = next(iter(obj))
            self.current_tool_name_sent = True
            tool_calls.append(
                DeltaToolCall(
                    index=index,
                    type="function",
                    id=make_tool_call_id(),
                    function=DeltaFunctionCall(name=name, arguments="").model_dump(
                        exclude_none=True
                    ),
                )
            )

    def _get_tool_diff(
        self, parsed: list, index: int, is_final: bool
    ) -> DeltaToolCall | None:
        """
        Calculates the exact string difference to safely append new tool parameters.

        The arguments are re-serialised with `json.dumps` and compared with
        what was already streamed for this tool. While the block is open,
        trailing closing characters are withheld because the partial parser
        may have added them speculatively.

        Args:
            parsed: The latest list of parsed JSON objects.
            index: The active tool's array index.
            is_final: Whether to emit
                trailing structural brackets (True if block is done).

        Returns:
            A `DeltaToolCall` mapping to the arguments diff,
                or None if no text was appended.

        Examples:
            >>> # Previous streamed state: '{"city": "Pari'
            >>> # Current full parse state: '{"city": "Paris"}'
            >>> # Returns diff (closing bracket suppressed until final):
            >>> parser._get_tool_diff(parsed, index=0, is_final=False)
            DeltaToolCall(index=0, function=DeltaFunctionCall(arguments='s'))
        """
        obj = parsed[index]
        if not isinstance(obj, dict) or not obj:
            return None

        # Only the arguments (first value) matter here; the name is emitted
        # separately by `_emit_tool_name`.
        args = next(iter(obj.values()))
        if args is None:
            return None

        args_json = json.dumps(args, ensure_ascii=False)

        # Suppress trailing structural characters
        # during stream (looks cleaner in frontends)
        if not is_final:
            while args_json and args_json[-1] in ("}", '"', "]", " ", ","):
                args_json = args_json[:-1]

        prev_sent = self.streamed_args_for_tool[index]
        if args_json == prev_sent:
            return None

        prefix = find_common_prefix(prev_sent, args_json)
        if len(prefix) < len(prev_sent):
            # Backtrack state if partial parser structurally updates a past assumption
            self.streamed_args_for_tool[index] = prefix
            return None

        diff = args_json[len(prev_sent) :]
        if diff:
            self.streamed_args_for_tool[index] = args_json
            return DeltaToolCall(
                index=index,
                function=DeltaFunctionCall(arguments=diff).model_dump(
                    exclude_none=True
                ),
            )

        return None

__init__

__init__(
    tokenizer: TokenizerLike,
    tools: list[Tool] | None = None,
)

Initializes the ApertusToolParser.

Parameters:

Name Type Description Default
tokenizer TokenizerLike

The model's tokenizer. Must be provided to interact with special tokens.

required
tools list[Tool] | None

Optional list of tools available for the current request.

None

Raises:

Type Description
ValueError

If the model_tokenizer is not successfully passed to the base class.

Source code in vllm/tool_parsers/apertus_tool_parser.py
def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
    """
    Set up the parser: base-class state, the block-matching regex and
    the streaming bookkeeping.

    Args:
        tokenizer: The model's tokenizer, forwarded to the base class.
        tools: Optional list of tools available for the current request.

    Raises:
        ValueError: If `self.model_tokenizer` is falsy once the base
            class constructor has run.
    """
    super().__init__(tokenizer, tools)

    tokenizer_missing = not self.model_tokenizer
    if tokenizer_missing:
        raise ValueError(
            "The model tokenizer must be passed to the ToolParser "
            "constructor during construction."
        )

    # One tool-call block: prefix, lazily captured payload, then either the
    # suffix or end-of-text (the suffix is optional for incomplete outputs).
    opening = re.escape(TOOL_CALLS_PREFIX)
    closing = re.escape(TOOL_CALLS_SUFFIX)
    block_pattern = rf"{opening}" + rf"(.*?)" + rf"(?:{closing}|$)"
    self.tool_call_regex = re.compile(block_pattern, re.DOTALL)

    self._reset_streaming_state()

_buffer_delta_text

_buffer_delta_text(delta_text: str) -> str

Buffers incoming delta chunks to prevent fragmentation of multi-token special tags.

If a chunk ends with a partial match of <|tools_prefix|> or <|tools_suffix|>, it holds that part back until the next chunk clarifies if it's the actual tag or just normal text.

Parameters:

Name Type Description Default
delta_text str

The newly generated text chunk

required

Returns:

Type Description
str

The safe, verified text chunk free of partial tag collisions.

Examples:

>>> parser = ApertusToolParser(...)
>>> parser._buffer_delta_text("Let me check <|tool")
'Let me check '  # "<|tool" is buffered internally
>>> parser._buffer_delta_text("s_prefix|>")
'<|tools_prefix|>'  # Buffer released on completion
Source code in vllm/tool_parsers/apertus_tool_parser.py
def _buffer_delta_text(self, delta_text: str) -> str:
    """
    Accumulate streamed text and withhold a trailing partial special tag.

    The new chunk is appended to whatever was held back earlier. If the
    combined text ends with a complete `<|tools_prefix|>` or
    `<|tools_suffix|>`, everything is released. If it ends with a proper
    prefix of one of those tags, that tail stays in the buffer until a
    later chunk shows whether it is a tag or ordinary text.

    Args:
        delta_text: The newly generated text chunk.

    Returns:
        The combined text minus any withheld tail.

    Examples:
        >>> parser = ApertusToolParser(...)
        >>> parser._buffer_delta_text("Let me check <|tool")
        'Let me check '
        >>> # "<|tool" is now buffered internally
        >>> parser._buffer_delta_text("s_prefix|>")
        '<|tools_prefix|>'
        >>> # Buffer released once the tag is complete
    """
    pending = self.buffered_delta_text + delta_text

    withheld = 0  # number of trailing characters to keep in the buffer
    for marker in (TOOL_CALLS_PREFIX, TOOL_CALLS_SUFFIX):
        if pending.endswith(marker):
            # A complete tag: flush everything.
            break

        # Longest proper prefix of the tag that the text ends with (0 if none).
        withheld = next(
            (
                size
                for size in range(len(marker) - 1, 0, -1)
                if pending.endswith(marker[:size])
            ),
            0,
        )
        if withheld:
            break

    cut = len(pending) - withheld
    self.buffered_delta_text = pending[cut:]
    return pending[:cut]

_emit_tool_name

_emit_tool_name(
    parsed: list,
    index: int,
    tool_calls: list[DeltaToolCall],
) -> None

Extracts and emits the function name mapped to a new tool call ID.

Parameters:

Name Type Description Default
parsed list

The partially parsed JSON list containing tool dictionaries.

required
index int

The active index within the JSON list.

required
tool_calls list[DeltaToolCall]

The running list of delta chunks to mutate.

required

Examples:

Appends DeltaToolCall(index=0, function=DeltaFunctionCall(name="get_weather", ...)) to the tool_calls list and marks the name as sent.

Source code in vllm/tool_parsers/apertus_tool_parser.py
def _emit_tool_name(
    self, parsed: list, index: int, tool_calls: list[DeltaToolCall]
) -> None:
    """
    Announce a tool call by emitting its function name with a fresh call ID.

    Nothing happens unless `parsed[index]` is a non-empty dict. Otherwise
    the first key is taken as the function name, `current_tool_name_sent`
    is set, and a name-only `DeltaToolCall` (empty arguments) is appended
    to `tool_calls`.

    Args:
        parsed: The partially parsed JSON list containing tool dictionaries.
        index: The active index within the JSON list.
        tool_calls: The running list of delta chunks; mutated in place.
    """
    entry = parsed[index]
    if not (isinstance(entry, dict) and entry):
        return

    function_name = next(iter(entry))
    self.current_tool_name_sent = True

    call_id = make_tool_call_id()
    name_payload = DeltaFunctionCall(name=function_name, arguments="").model_dump(
        exclude_none=True
    )
    tool_calls.append(
        DeltaToolCall(
            index=index,
            type="function",
            id=call_id,
            function=name_payload,
        )
    )

_extract_streaming

_extract_streaming(
    current_text: str, delta_text: str
) -> DeltaMessage | None

Core streaming logic. Separates visible chat text from JSON blocks and computes diffs.

Parameters:

Name Type Description Default
current_text str

The full generated output string so far.

required
delta_text str

The latest chunk of text added.

required

Returns:

Type Description
DeltaMessage | None

A DeltaMessage containing the content delta and/or tool_calls delta.

Source code in vllm/tool_parsers/apertus_tool_parser.py
def _extract_streaming(
    self, current_text: str, delta_text: str
) -> DeltaMessage | None:
    """
    Core streaming logic.
    Separates visible chat text from JSON blocks and computes diffs.

    Args:
        current_text: The full generated output string so far.
        delta_text: The latest chunk of text to surface. This may end before
            `current_text` does when a partial special tag is still held
            back in `self.buffered_delta_text`.

    Returns:
        A `DeltaMessage` containing the `content` delta and/or `tool_calls` delta,
        or `None` when there is nothing to emit.
    """
    prefix_idx = current_text.rfind(TOOL_CALLS_PREFIX)
    suffix_idx = current_text.rfind(TOOL_CALLS_SUFFIX)

    # Inside a tool block when the last prefix comes after the last suffix.
    is_inside_tools = prefix_idx > suffix_idx

    json_completed = False
    json_end_idx: int | None = None

    # Check if the JSON array successfully closed implicitly
    if is_inside_tools:
        json_start = prefix_idx + len(TOOL_CALLS_PREFIX)
        s = current_text[json_start:].lstrip()
        try:
            # If raw_decode succeeds,
            # the JSON array is fully formed and implicitly closed
            _, idx = json.JSONDecoder().raw_decode(s)
            json_end_idx = len(current_text) - len(s) + idx
            json_completed, is_inside_tools = True, False
        except Exception:
            # Deliberate: a decode failure just means the array is still open.
            pass

    just_finished = (TOOL_CALLS_SUFFIX in delta_text) or json_completed

    # 1. Fast path: Output normal text immediately
    # if we are completely outside tool block constraints
    if not is_inside_tools and not just_finished:
        text = delta_text.replace(TOOL_CALLS_PREFIX, "").replace(
            TOOL_CALLS_SUFFIX, ""
        )
        return DeltaMessage(content=text) if text else None

    # 2. Extract leading and trailing normal text directly adjacent to tool blocks
    content_str = ""
    if TOOL_CALLS_PREFIX in delta_text:
        content_str += delta_text.split(TOOL_CALLS_PREFIX)[0].replace(
            TOOL_CALLS_SUFFIX, ""
        )

    if just_finished:
        if json_completed and json_end_idx is not None:
            # The tool block finished in this chunk via implicit JSON completion
            # Ensure we strictly isolate
            # and extract only trailing text that is part of `delta_text`.
            #
            # `current_text` may end with a partial tag that is still held
            # back in the delta buffer; that tail is not part of `delta_text`
            # and must not be surfaced as content (it would leak tag
            # fragments or be emitted twice).
            visible_end = len(current_text)
            held_back = self.buffered_delta_text
            if held_back and current_text.endswith(held_back):
                visible_end -= len(held_back)

            delta_start_idx = visible_end - len(delta_text)
            content_start = max(json_end_idx, delta_start_idx)
            if content_start < visible_end:
                content_str += current_text[content_start:visible_end].replace(
                    TOOL_CALLS_SUFFIX, ""
                )
        else:
            content_str += delta_text.split(TOOL_CALLS_SUFFIX)[-1]

    # 3. Extract the isolated JSON array string for the active block
    json_start = prefix_idx + len(TOOL_CALLS_PREFIX)
    # json_end is None while the block is still open, i.e. slice to the end.
    json_end = suffix_idx if suffix_idx > prefix_idx else json_end_idx
    json_str = current_text[json_start:json_end]

    tool_calls = self._parse_and_diff_json(json_str, is_final=not is_inside_tools)

    if tool_calls or content_str:
        return DeltaMessage(
            content=content_str if content_str else None,
            tool_calls=tool_calls if tool_calls else None,
        )

    return None

_get_tool_diff

_get_tool_diff(
    parsed: list, index: int, is_final: bool
) -> DeltaToolCall | None

Calculates the exact string difference to safely append new tool parameters.

This ensures characters like {, }, and " don't jump around unevenly in the UI frontend while streaming incomplete JSON arguments.

Parameters:

Name Type Description Default
parsed list

The latest list of parsed JSON objects.

required
index int

The active tool's array index.

required
is_final bool

Whether to emit trailing structural brackets (True if block is done).

required

Returns:

Type Description
DeltaToolCall | None

A DeltaToolCall mapping to the arguments diff, or None if no text was appended.

Examples:

>>> # Previous streamed state: '{"city": "Pari'
>>> # Current full parse state: '{"city": "Paris"}'
>>> # Returns diff (closing bracket suppressed until final):
>>> parser._get_tool_diff(parsed, index=0, is_final=False)
DeltaToolCall(index=0, function=DeltaFunctionCall(arguments='s'))
Source code in vllm/tool_parsers/apertus_tool_parser.py
def _get_tool_diff(
    self, parsed: list, index: int, is_final: bool
) -> DeltaToolCall | None:
    """
    Compute the not-yet-streamed tail of one tool call's JSON arguments.

    The arguments of ``parsed[index]`` are serialized with ``json.dumps`` and
    compared with what was already sent for that tool
    (``self.streamed_args_for_tool[index]``). Only the newly appended suffix
    is returned, so a client can concatenate the pieces it receives.

    Args:
        parsed: The latest list of parsed JSON objects.
        index: Position in ``parsed`` of the tool call to diff.
        is_final: When False, trailing ``}``, ``"``, ``]``, space and ``,``
            characters are withheld from the serialized arguments; when
            True the serialized arguments are used unchanged.

    Returns:
        A `DeltaToolCall` carrying the appended argument text, or None when
        the entry is not a non-empty dict, its arguments are None, nothing
        new was appended, or the stored state had to be rolled back.

    Examples:
        >>> # Previously streamed: '{"city": "Pari'
        >>> # Current arguments:   '{"city": "Paris"}'
        >>> # With is_final=False the closing '"}' is withheld:
        >>> parser._get_tool_diff(parsed, index=0, is_final=False)
        DeltaToolCall(index=0, function=DeltaFunctionCall(arguments='s'))
    """
    entry = parsed[index]
    if not (isinstance(entry, dict) and entry):
        return None

    # Each call is a single-key dict {function_name: arguments}; only the
    # arguments (the first value) are needed here.
    arguments = next(iter(entry.values()))
    if arguments is None:
        return None

    serialized = json.dumps(arguments, ensure_ascii=False)
    if not is_final:
        # Hold back closing punctuation while the block is still open.
        serialized = serialized.rstrip('}"] ,')

    already_sent = self.streamed_args_for_tool[index]
    if serialized == already_sent:
        return None

    shared = find_common_prefix(already_sent, serialized)
    if len(shared) < len(already_sent):
        # The new serialization diverges from what was recorded as sent:
        # roll the record back to the common part and emit nothing now.
        self.streamed_args_for_tool[index] = shared
        return None

    fresh = serialized[len(already_sent) :]
    if not fresh:
        return None

    self.streamed_args_for_tool[index] = serialized
    function_payload = DeltaFunctionCall(arguments=fresh).model_dump(
        exclude_none=True
    )
    return DeltaToolCall(index=index, function=function_payload)

_parse_and_diff_json

_parse_and_diff_json(
    json_str: str, is_final: bool
) -> list[DeltaToolCall]

Parses an isolated, potentially incomplete streaming JSON array and returns newly accumulated tool call diffs.

Parameters:

Name Type Description Default
json_str str

The extracted JSON array string so far (e.g. [{"weather": {"city": "Par"}]).

required
is_final bool

True if the tool block has received its closing <|tools_suffix|> token.

required

Returns:

Type Description
list[DeltaToolCall]

A list of DeltaToolCall items representing string diffs in function arguments to stream back to the client.

Source code in vllm/tool_parsers/apertus_tool_parser.py
def _parse_and_diff_json(
    self, json_str: str, is_final: bool
) -> list[DeltaToolCall]:
    """
    Parse the (possibly incomplete) tool-call JSON array and collect deltas.

    The text is decoded with the partial JSON parser. ``self.current_tool_id``
    is then advanced to the last decoded entry; every entry passed over on
    the way is flushed as final, and the last entry is diffed with the
    caller-supplied ``is_final`` flag.

    Args:
        json_str: The extracted JSON array string so far
            (e.g. `[{"weather": {"city": "Par"}]`).
        is_final: True if the tool block has received its closing
            `<|tools_suffix|>`.

    Returns:
        The `DeltaToolCall` items produced for this chunk. The list is empty
        when parsing fails or nothing has been decoded yet.
    """
    try:
        payload, _consumed = partial_json_loads(json_str, Allow.ALL)
        if isinstance(payload, list):
            entries = payload
        elif payload:
            # A lone truthy value is treated as a one-element array.
            entries = [payload]
        else:
            entries = []
    except Exception:
        return []

    if not entries:
        return []

    deltas: list[DeltaToolCall] = []

    def _flush_active(finalize: bool) -> None:
        # Emit the name (if still pending) and the argument diff for the
        # tool at self.current_tool_id; a no-op before any tool has started.
        if self.current_tool_id < 0:
            return
        if not self.current_tool_name_sent:
            # NOTE(review): _emit_tool_name is defined elsewhere; presumably
            # it appends to `deltas` and updates current_tool_name_sent.
            self._emit_tool_name(entries, self.current_tool_id, deltas)
        arg_delta = self._get_tool_diff(
            entries, self.current_tool_id, is_final=finalize
        )
        if arg_delta:
            deltas.append(arg_delta)

    newest = len(entries) - 1

    # A single large delta can contain several tools: close out each one we
    # are moving past before switching to the next index.
    while self.current_tool_id < newest:
        _flush_active(True)

        self.current_tool_id += 1
        self.current_tool_name_sent = False
        shortfall = self.current_tool_id + 1 - len(self.streamed_args_for_tool)
        if shortfall > 0:
            self.streamed_args_for_tool.extend([""] * shortfall)

    # Finally stream whatever is new for the tool currently being generated.
    _flush_active(is_final)
    return deltas

_reset_streaming_state

_reset_streaming_state() -> None

Resets all streaming state variables for a new completion request.

This clears the delta text buffer and resets the pointers used to track the currently streaming tool index and arguments. Called implicitly during initialization and should be called between separate streams.

Source code in vllm/tool_parsers/apertus_tool_parser.py
def _reset_streaming_state(self) -> None:
    """
    Put every piece of per-stream bookkeeping back to its initial value.

    After this call the buffered delta text is empty, no tool is active
    (``current_tool_id == -1``), no tool name is marked as sent, and the
    per-tool record of streamed argument text is an empty list. It should be
    invoked between separate streams so state does not leak across requests.
    """
    # No tool has been started yet, hence no name has been emitted either.
    self.current_tool_id, self.current_tool_name_sent = -1, False
    # Argument text already streamed, one entry per tool index.
    self.streamed_args_for_tool: list[str] = []
    self.buffered_delta_text = ""

adjust_request

Adjusts the generation request to ensure special tool tokens are not skipped.

Forces skip_special_tokens=False if tools are actively being evaluated, ensuring the tools special tokens are surfaced to the engine for parsing.

Parameters:

Name Type Description Default
request ChatCompletionRequest | ResponsesRequest

The incoming OpenAI-compatible chat completion request.

required

Returns:

Type Description
ChatCompletionRequest | ResponsesRequest

The potentially modified chat completion request.

Source code in vllm/tool_parsers/apertus_tool_parser.py
def adjust_request(
    self, request: ChatCompletionRequest | ResponsesRequest
) -> ChatCompletionRequest | ResponsesRequest:
    """
    Keep special tokens in the decoded output whenever tools may be called.

    The parent class's adjustment runs first. If the resulting request has
    tools and its ``tool_choice`` is not ``"none"``, ``skip_special_tokens``
    is forced to False so the tool prefix/suffix tokens remain visible in
    the text handed to this parser.

    Args:
        request: The incoming chat completion or responses request.

    Returns:
        The request returned by the parent adjustment, possibly with
        ``skip_special_tokens`` set to False.
    """
    adjusted = super().adjust_request(request)

    tools_inactive = not adjusted.tools or adjusted.tool_choice == "none"
    if tools_inactive:
        return adjusted

    adjusted.skip_special_tokens = False
    return adjusted

extract_tool_calls

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Extracts tool calls from a completely generated model response (Non-Streaming).

Parameters:

Name Type Description Default
model_output str

The full completion string generated by the model.

required
request ChatCompletionRequest

The current chat completion request context containing tool schemas.

required

Returns:

Type Description
ExtractedToolCallInformation

An ExtractedToolCallInformation object containing normal text content and a list of fully formatted ToolCall objects.

Examples:

>>> output = 'Let me see. <|tools_prefix|>[{"get_weather":'                 '{"loc": "Paris"}}]<|tools_suffix|>'
>>> info = parser.extract_tool_calls(output, request)
>>> info.tools_called
True
>>> info.content
'Let me see.'
>>> info.tool_calls[0].function.name
'get_weather'
Source code in vllm/tool_parsers/apertus_tool_parser.py
def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """
    Parse tool calls out of a fully generated response (non-streaming).

    The first tool block found by ``self.tool_call_regex`` is decoded as
    JSON. Each non-empty dict in it becomes one `ToolCall` whose name is the
    dict's first key and whose arguments are the JSON-serialized first
    value. Text before the block, text left over inside the matched group,
    and text after the match are joined with newlines and returned as
    content.

    Args:
        model_output: The full completion string generated by the model.
        request: The current chat completion request (not read here).

    Returns:
        An `ExtractedToolCallInformation`. If no block is found, or any
        error occurs while extracting, ``tools_called`` is False and
        ``content`` is the unmodified ``model_output``.

    Examples:
        >>> output = 'Let me see. <|tools_prefix|>[{"get_weather":' \
            '{"loc": "Paris"}}]<|tools_suffix|>'
        >>> info = parser.extract_tool_calls(output, request)
        >>> info.tools_called
        True
        >>> info.content
        'Let me see.'
        >>> info.tool_calls[0].function.name
        'get_weather'
    """
    found = self.tool_call_regex.search(model_output)
    if not found:
        return ExtractedToolCallInformation(
            tools_called=False, tool_calls=[], content=model_output
        )

    try:
        # Without a closing suffix the group may also hold trailing text.
        block_text = found.group(1)
        candidate = block_text.lstrip()

        try:
            # raw_decode stops at the end of the first JSON value, which
            # separates the array from anything that follows it.
            decoded, end_pos = json.JSONDecoder().raw_decode(candidate)
        except json.JSONDecodeError:
            # Not complete JSON (e.g. truncated output): fall back to the
            # partial parser and treat the whole group as JSON.
            decoded, _ = partial_json_loads(block_text, Allow.ALL)
            leftover = ""
        else:
            leftover = candidate[end_pos:]

        if not isinstance(decoded, list):
            decoded = [decoded] if decoded else []

        # Every usable entry is a non-empty dict; its first item is the
        # (function name, arguments) pair.
        first_items = (
            next(iter(entry.items()))
            for entry in decoded
            if isinstance(entry, dict) and entry
        )
        tool_calls: list[ToolCall] = [
            ToolCall(
                type="function",
                id=make_tool_call_id(),
                function=FunctionCall(
                    name=fn_name,
                    arguments=json.dumps(fn_args, ensure_ascii=False),
                ),
            )
            for fn_name, fn_args in first_items
        ]

        # Content = text before the block, leftover text inside the group,
        # and text after the match; stray suffix tokens are dropped and
        # empty pieces are skipped.
        segments = (
            model_output[: found.start()].strip(),
            leftover.replace(TOOL_CALLS_SUFFIX, "").strip(),
            model_output[found.end() :].replace(TOOL_CALLS_SUFFIX, "").strip(),
        )
        content_str = "\n".join(piece for piece in segments if piece)

        return ExtractedToolCallInformation(
            tools_called=True,
            tool_calls=tool_calls,
            content=content_str or None,
        )

    except Exception:
        logger.exception("Error extracting tool calls from Apertus response")
        return ExtractedToolCallInformation(
            tools_called=False, tool_calls=[], content=model_output
        )

extract_tool_calls_streaming

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> DeltaMessage | None

Handles one streaming chunk: buffers the delta text and returns the incremental content or tool-call delta for it.

Parameters:

Name Type Description Default
previous_text str

The complete model text generated prior to this chunk.

required
current_text str

The complete model text including this chunk.

required
delta_text str

The incremental text addition.

required
previous_token_ids Sequence[int]

Tokens generated prior to this chunk.

required
current_token_ids Sequence[int]

Total tokens generated.

required
delta_token_ids Sequence[int]

Incremental token additions.

required
request ChatCompletionRequest

The chat completion request.

required

Returns:

Type Description
DeltaMessage | None

A DeltaMessage with updated content or tool argument diffs, or None if the chunk shouldn't emit visible changes yet (e.g. it was purely buffered).

Examples:

>>> prev = '<|tools_prefix|>[{"get_weather": {"loc'
>>> cur = '<|tools_prefix|>[{"get_weather": {"location": "Paris"}}'
>>> delta = 'ation": "Paris"}}'
>>> msg = parser.extract_tool_calls_streaming(
...     prev, cur, delta, ..., request
... )
>>> msg.tool_calls[0].function.arguments
'ation": "Paris"}'
Source code in vllm/tool_parsers/apertus_tool_parser.py
def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> DeltaMessage | None:
    """
    Turn one streamed chunk into a content and/or tool-call delta.

    The incoming delta first passes through ``_buffer_delta_text``. If that
    yields nothing, no message is produced. While the tool prefix has not
    appeared in ``current_text`` the buffered text is forwarded as plain
    content; afterwards the work is delegated to ``_extract_streaming``.

    Args:
        previous_text: The complete model text generated prior to this chunk.
        current_text: The complete model text including this chunk.
        delta_text: The incremental text addition.
        previous_token_ids: Tokens generated prior to this chunk.
        current_token_ids: Total tokens generated.
        delta_token_ids: Incremental token additions.
        request: The chat completion request.

    Returns:
        A `DeltaMessage` with content or tool argument diffs, or `None` if
        the chunk produced nothing to emit (for example it was only
        buffered) or the streaming extraction raised an error.

    Examples:
        >>> prev = '<|tools_prefix|>[{"get_weather": {"loc'
        >>> cur = '<|tools_prefix|>[{"get_weather": {"location": "Paris"}}'
        >>> delta = 'ation": "Paris"}}'
        >>> msg = parser.extract_tool_calls_streaming(
        ...     prev, cur, delta, ..., request
        ... )
        >>> # msg.tool_calls[0].function.arguments holds the newly streamed
        >>> # argument text; NOTE(review): closing punctuation may be held
        >>> # back until the block is final - confirm the exact value.
    """
    pending = self._buffer_delta_text(delta_text)
    if not pending:
        return None

    if TOOL_CALLS_PREFIX in current_text:
        try:
            return self._extract_streaming(current_text, pending)
        except Exception:
            logger.exception("Error in Apertus streaming tool call extraction")
            return None

    # No tool block has started yet: pass the text straight through.
    return DeltaMessage(content=pending)