Upload streaming tool call parser python file for vLLM

Browse files

Files changed (1) hide show

nemotron_toolcall_parser_streaming.py +236 -0

nemotron_toolcall_parser_streaming.py ADDED Viewed

	@@ -0,0 +1,236 @@

+import json
+import re
+from collections.abc import Sequence
+from typing import Union, Optional
+import partial_json_parser
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser,
+    ToolParserManager,
+)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import random_uuid
+logger = init_logger(__name__)
+@ToolParserManager.register_module("nemotron_json")
+class NemotronJSONToolParser(ToolParser):
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+        # Streaming state tracking
+        self.current_tool_name_sent: bool = False
+        self.prev_tool_call_arr: list[dict] = []
+        self.current_tool_id: int = -1
+        self.streamed_args_for_tool: list[str] = []
+        self.tool_call_ids: list[str] = []  # Track IDs for each tool call
+        # Track what we've sent so far in streaming
+        self.sent_tool_calls_count: int = 0
+        self.sent_args_length: dict[int, int] = {}  # tool_idx -> length of args sent
+        self.tool_call_start_token: str = "<TOOLCALL>"
+        self.tool_call_end_token: str = "</TOOLCALL>"
+        self.tool_call_regex = re.compile(r"<TOOLCALL>(.*?)</TOOLCALL>", re.DOTALL)
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        """Extract tool calls from non-streaming (complete) output."""
+        if self.tool_call_start_token not in model_output:
+            return ExtractedToolCallInformation(
+                tools_called=False,
+                tool_calls=[],
+                content=model_output,
+            )
+        try:
+            # Try to extract complete <TOOLCALL>...</TOOLCALL> blocks
+            tool_call_matches = self.tool_call_regex.findall(model_output)
+            if tool_call_matches:
+                # Complete tool call block found
+                str_tool_calls = tool_call_matches[0].strip()
+            else:
+                # Incomplete - extract everything after <TOOLCALL>
+                start_idx = model_output.find(self.tool_call_start_token) + len(self.tool_call_start_token)
+                str_tool_calls = model_output[start_idx:].strip()
+            # Ensure array brackets
+            if not str_tool_calls.startswith("["):
+                str_tool_calls = "[" + str_tool_calls
+            if not str_tool_calls.endswith("]"):
+                str_tool_calls = str_tool_calls + "]"
+            # Use partial JSON parser for incomplete JSON
+            json_tool_calls = partial_json_parser.loads(str_tool_calls)
+            if not isinstance(json_tool_calls, list):
+                raise ValueError("Tool calls must be a list")
+            tool_calls = []
+            for tool_call in json_tool_calls:
+                if not isinstance(tool_call, dict):
+                    continue
+                try:
+                    tool_calls.append(ToolCall(
+                        type="function",
+                        function=FunctionCall(
+                            name=tool_call.get("name", ""),
+                            arguments=json.dumps(tool_call.get("arguments", {}), ensure_ascii=False) \
+                                if isinstance(tool_call.get("arguments"), dict) else str(tool_call.get("arguments", "")),
+                        ),
+                    ))
+                except Exception as e:
+                    logger.warning(f"Failed to parse tool call: {e}")
+                    continue
+            content = model_output[:model_output.find(self.tool_call_start_token)].strip()
+            return ExtractedToolCallInformation(
+                tools_called=True if tool_calls else False,
+                tool_calls=tool_calls,
+                content=content if content else None,
+            )
+        except Exception as e:
+            logger.exception(f"Error extracting tool calls. Response: {model_output}")
+            return ExtractedToolCallInformation(
+                tools_called=False,
+                tool_calls=[],
+                content=model_output,
+            )
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+        """Extract tool calls from streaming output.
+        This incrementally parses the <TOOLCALL> JSON as it streams in,
+        sending delta updates for each tool call and its arguments.
+        """
+        # Check if we just started tool calling
+        if self.tool_call_start_token in delta_text and self.tool_call_start_token not in previous_text:
+            # First time seeing <TOOLCALL>, return content before it
+            content_before = delta_text.split(self.tool_call_start_token)[0]
+            if content_before:
+                return DeltaMessage(content=content_before)
+            # Start of tool call section - no delta yet
+            return None
+        # Check if we're not in tool call mode yet
+        if self.tool_call_start_token not in current_text:
+            # Regular content, no tool calls
+            return DeltaMessage(content=delta_text) if delta_text else None
+        # We're inside <TOOLCALL>...</TOOLCALL>
+        # For Nemotron, the entire TOOLCALL block is generated at once
+        # So we should only parse when we have the complete </TOOLCALL>
+        # Check if we have the complete tool call block yet
+        if self.tool_call_end_token not in current_text:
+            # Incomplete tool call, don't send deltas yet
+            return None
+        # We have the complete tool call block, parse it
+        start_idx = current_text.find(self.tool_call_start_token) + len(self.tool_call_start_token)
+        end_idx = current_text.find(self.tool_call_end_token)
+        json_str = current_text[start_idx:end_idx].strip()
+        # Parse the complete JSON
+        try:
+            # Ensure we have array brackets
+            if not json_str.startswith("["):
+                json_str = "[" + json_str
+            if not json_str.endswith("]"):
+                json_str = json_str + "]"
+            # Parse complete JSON
+            tool_calls_arr = json.loads(json_str)
+            if not isinstance(tool_calls_arr, list):
+                return None
+            # Generate delta updates for new/updated tool calls
+            delta_tool_calls = []
+            for idx, tool_call in enumerate(tool_calls_arr):
+                if not isinstance(tool_call, dict):
+                    continue
+                # Ensure we have a tool ID for this call
+                while len(self.tool_call_ids) <= idx:
+                    self.tool_call_ids.append(random_uuid())
+                tool_id = self.tool_call_ids[idx]
+                tool_name = tool_call.get("name", "")
+                tool_args = tool_call.get("arguments", {})
+                # Convert arguments to JSON string
+                if isinstance(tool_args, dict):
+                    args_str = json.dumps(tool_args, ensure_ascii=False)
+                else:
+                    args_str = str(tool_args)
+                # Check if this is a new tool call
+                if idx >= self.sent_tool_calls_count:
+                    # New tool call - send ID, name, and complete arguments all at once
+                    # This matches how other models (Llama, etc.) send tool calls
+                    delta_tool_calls.append(DeltaToolCall(
+                        index=idx,
+                        id=tool_id,
+                        type="function",
+                        function=DeltaFunctionCall(
+                            name=tool_name,
+                            arguments=args_str  # Send complete JSON string
+                        )
+                    ))
+                    self.sent_tool_calls_count = idx + 1
+                    self.sent_args_length[idx] = len(args_str)
+                # NOTE: We don't send incremental updates for arguments
+                # because Nemotron generates complete tool calls in one shot
+                # Unlike thinking models that stream arguments token-by-token
+            if delta_tool_calls:
+                return DeltaMessage(tool_calls=delta_tool_calls)
+        except Exception as e:
+            # JSON parsing failed (expected for incomplete JSON)
+            logger.debug(f"Partial JSON parse failed (expected during streaming): {e}")
+            pass
+        # Check if we just completed the tool calls (end tag in this delta)
+        if self.tool_call_end_token in delta_text and self.tool_call_end_token not in previous_text:
+            # We just completed - reset state for next potential tool call
+            self.sent_tool_calls_count = 0
+            self.sent_args_length = {}
+            self.tool_call_ids = []
+        return None