Spaces:
Paused
Paused
| # | |
| # SPDX-FileCopyrightText: Hadad <[email protected]> | |
| # SPDX-License-Identifier: Apache-2.0 | |
| # | |
| import json # Import JSON module for encoding and decoding JSON data | |
| import uuid # Import UUID module to generate unique session identifiers | |
| from typing import Any, List # Import typing annotations for type hinting | |
| from config import model # Import model configuration dictionary from config module | |
| from src.core.server import jarvis # Import the async function to interact with AI backend | |
| from src.core.parameter import parameters # Import parameters (not used directly here but imported for completeness) | |
| from src.core.session import session # Import session dictionary to store conversation histories | |
| from src.tools.audio import AudioGeneration # Import AudioGeneration class to handle audio creation | |
| from src.tools.image import ImageGeneration # Import ImageGeneration class to handle image creation | |
| from src.tools.deep_search import SearchTools # Import SearchTools class for deep search functionality | |
| import gradio as gr # Import Gradio library for UI and request handling | |
| # Define an asynchronous function 'respond' to process user messages and generate AI responses | |
| # This function handles various types of user inputs including text, commands, and file uploads | |
| # It supports multiple AI models and generation modes with customizable parameters | |
| async def respond( | |
| message, # Incoming user message, can be a string or a dictionary containing text and files | |
| history: List[Any], # List containing conversation history as pairs of user and assistant messages | |
| model_label, # Label/key to select the specific AI model from available models configuration | |
| temperature, # Sampling temperature parameter controlling randomness of AI response generation (0.0 to 2.0) | |
| top_k, # Number of highest probability tokens to keep for sampling during text generation | |
| min_p, # Minimum probability threshold for token sampling to filter low probability tokens | |
| top_p, # Cumulative probability threshold for nucleus sampling technique | |
| repetition_penalty, # Penalty factor to reduce repetitive tokens in generated text output | |
| thinking, # Boolean flag indicating if AI should operate in "thinking" mode with deeper reasoning | |
| image_gen, # Boolean flag to enable image generation commands using /image prefix | |
| audio_gen, # Boolean flag to enable audio generation commands using /audio prefix | |
| search_gen, # Boolean flag to enable deep search commands using /dp prefix | |
| request: gr.Request # Gradio request object to access session information such as session hash | |
| ): | |
| # Select the AI model based on the provided label, fallback to first model if label not found | |
| selected_model = model.get(model_label, list(model.values())[0]) | |
| # Instantiate SearchTools class to enable deep search capabilities when requested by user | |
| search_tools = SearchTools() | |
| # Retrieve session ID from the Gradio request's session hash, generate new UUID if none exists | |
| session_id = request.session_hash or str(uuid.uuid4()) | |
| # Initialize an empty conversation history list for this session if it does not already exist | |
| if session_id not in session: | |
| session[session_id] = [] | |
| # Determine the mode string based on the 'thinking' flag, affects AI response generation behavior | |
| mode = "/think" if thinking else "/no_think" | |
| # Initialize variables for storing user input text and any attached files | |
| input = "" | |
| files = None | |
| # Check if the incoming message is a dictionary which may contain both text and file attachments | |
| if isinstance(message, dict): | |
| # Extract the text content from the message dictionary, default to empty string if missing | |
| input = message.get("text", "") | |
| # Extract the first file from the files list if present, otherwise set files to None | |
| files = message.get("files")[0] if message.get("files") else None | |
| else: | |
| # If the message is a simple string, assign it directly to input variable | |
| input = message | |
| # Strip leading and trailing whitespace from the input for clean processing | |
| stripped_input = input.strip() | |
| # Convert the stripped input to lowercase for case-insensitive command detection | |
| lowered_input = stripped_input.lower() | |
| # If the input is empty after stripping whitespace, yield an empty list and exit function early | |
| if not stripped_input: | |
| yield [] | |
| return | |
| # If the input is exactly one of the command keywords without parameters, yield empty and exit early | |
| if lowered_input in ["/audio", "/image", "/dp"]: | |
| yield [] | |
| return | |
| # Convert conversation history from tuples style to messages style format for AI model consumption | |
| # Transform list of [user_msg, assistant_msg] pairs into flat list of role-content dictionaries | |
| new_history = [] | |
| for entry in history: | |
| # Ensure the entry is a list with exactly two elements: user message and assistant message | |
| if isinstance(entry, list) and len(entry) == 2: | |
| user_msg, assistant_msg = entry | |
| # Append the user message with role 'user' to the new history if message is not None | |
| if user_msg is not None: | |
| new_history.append({"role": "user", "content": user_msg}) | |
| # Append the assistant message with role 'assistant' if it exists and is not None | |
| if assistant_msg is not None: | |
| new_history.append({"role": "assistant", "content": assistant_msg}) | |
| # Update the global session dictionary with the newly formatted conversation history for this session | |
| session[session_id] = new_history | |
| # Handle audio generation command if enabled and input starts with '/audio' prefix | |
| if audio_gen and lowered_input.startswith("/audio"): | |
| # Extract the audio instruction text after the '/audio' command prefix and strip whitespace | |
| audio_instruction = input[6:].strip() | |
| # If no instruction text is provided after the command, yield empty and exit early | |
| if not audio_instruction: | |
| yield [] | |
| return | |
| try: | |
| # Asynchronously create audio content based on the instruction using AudioGeneration class | |
| audio = await AudioGeneration.create_audio(audio_instruction) | |
| # Serialize the audio data and instruction into a JSON formatted string for processing | |
| audio_generation_content = json.dumps({ | |
| "audio": audio, | |
| "audio_instruction": audio_instruction | |
| }) | |
| # Construct the conversation history including the audio generation result and formatting instructions | |
| audio_generation_result = ( | |
| new_history | |
| + [ | |
| { | |
| "role": "system", | |
| "content": ( | |
| "Audio generation result:\n\n" + audio_generation_content + "\n\n\n" | |
| "Show the audio using the following HTML audio tag format, where '{audio_link}' is the URL of the generated audio:\n\n" | |
| "<audio controls src='{audio_link}' style='width:100%; max-width:100%;'></audio>\n\n" | |
| "Please replace '{audio_link}' with the actual audio URL provided in the context.\n\n" | |
| "Then, describe the generated audio based on the above information.\n\n\n" | |
| "Use the same language as the previous user input or user request.\n" | |
| "For example, if the previous user input or user request is in Indonesian, explain in Indonesian.\n" | |
| "If it is in English, explain in English. This also applies to other languages.\n\n\n" | |
| ) | |
| } | |
| ] | |
| ) | |
| # Use async generator to get descriptive text about the generated audio from AI | |
| async for audio_description in jarvis( | |
| session_id=session_id, | |
| model=selected_model, | |
| history=audio_generation_result, | |
| user_message=input, | |
| mode="/no_think", # Use non-reasoning mode to avoid extra processing overhead | |
| temperature=0.7, # Fixed temperature for consistent audio description generation | |
| top_k=20, # Limit token sampling to top 20 most probable tokens | |
| min_p=0, # Minimum probability threshold set to zero | |
| top_p=0.8, # Nucleus sampling threshold for quality control | |
| repetition_penalty=1 # No repetition penalty for this step | |
| ): | |
| # Yield the audio description wrapped in a tool role for proper UI display | |
| yield [{"role": "tool", "content": audio_description}] | |
| return | |
| except Exception: | |
| # If audio generation fails, let AI generate a contextual error message | |
| generation_failed = ( | |
| new_history | |
| + [ | |
| { | |
| "role": "system", | |
| "content": ( | |
| "Audio generation failed for the user's request. The user tried to generate audio with the instruction: '" | |
| + audio_instruction + "'\n\n\n" | |
| "Please explain to the user that audio generation failed and suggest they wait 15 seconds before trying again.\n" | |
| "Be helpful and empathetic in your response.\n\n\n" | |
| "Use the same language as the previous user input or user request.\n" | |
| "For example, if the previous user input or user request is in Indonesian, explain in Indonesian.\n" | |
| "If it is in English, explain in English. This also applies to other languages.\n\n\n" | |
| ) | |
| } | |
| ] | |
| ) | |
| # Use AI to generate a contextual error message | |
| async for error_response in jarvis( | |
| session_id=session_id, | |
| model=selected_model, | |
| history=generation_failed, | |
| user_message=input, | |
| mode="/no_think", # Use non-reasoning mode for error handling | |
| temperature=0.7, # Fixed temperature for more consistent error messages | |
| top_k=20, # Limit token sampling | |
| min_p=0, # Minimum probability threshold | |
| top_p=0.8, # Nucleus sampling threshold | |
| repetition_penalty=1 # No repetition penalty | |
| ): | |
| # Yield the AI-generated error response wrapped in tool role | |
| yield [{"role": "tool", "content": error_response}] | |
| return | |
| # Handle image generation command if enabled and input starts with '/image' prefix | |
| if image_gen and lowered_input.startswith("/image"): | |
| # Extract the image generation instruction after the '/image' command prefix and strip whitespace | |
| generate_image_instruction = input[6:].strip() | |
| # If no instruction text is provided after the command, yield empty and exit early | |
| if not generate_image_instruction: | |
| yield [] | |
| return | |
| try: | |
| # Asynchronously create image content based on the instruction using ImageGeneration class | |
| image = await ImageGeneration.create_image(generate_image_instruction) | |
| # Serialize the image data and instruction into a JSON formatted string for processing | |
| image_generation_content = json.dumps({ | |
| "image": image, | |
| "generate_image_instruction": generate_image_instruction | |
| }) | |
| # Construct the conversation history including the image generation result and formatting instructions | |
| image_generation_result = ( | |
| new_history | |
| + [ | |
| { | |
| "role": "system", | |
| "content": ( | |
| "Image generation result:\n\n" + image_generation_content + "\n\n\n" | |
| "Show the generated image using the following markdown syntax format, where '{image_link}' is the URL of the image:\n\n" | |
| "\n\n" | |
| "Please replace '{image_link}' with the actual image URL provided in the context.\n\n" | |
| "Then, describe the generated image based on the above information.\n\n\n" | |
| "Use the same language as the previous user input or user request.\n" | |
| "For example, if the previous user input or user request is in Indonesian, explain in Indonesian.\n" | |
| "If it is in English, explain in English. This also applies to other languages.\n\n\n" | |
| ) | |
| } | |
| ] | |
| ) | |
| # Use async generator to get descriptive text about the generated image from AI | |
| async for image_description in jarvis( | |
| session_id=session_id, | |
| model=selected_model, | |
| history=image_generation_result, | |
| user_message=input, | |
| mode="/no_think", # Use non-reasoning mode to avoid extra processing overhead | |
| temperature=0.7, # Fixed temperature for consistent image description generation | |
| top_k=20, # Limit token sampling to top 20 most probable tokens | |
| min_p=0, # Minimum probability threshold set to zero | |
| top_p=0.8, # Nucleus sampling threshold for quality control | |
| repetition_penalty=1 # No repetition penalty for this step | |
| ): | |
| # Yield the image description wrapped in a tool role for proper UI display | |
| yield [{"role": "tool", "content": image_description}] | |
| return | |
| except Exception: | |
| # If image generation fails, let AI generate a contextual error message | |
| generation_failed = ( | |
| new_history | |
| + [ | |
| { | |
| "role": "system", | |
| "content": ( | |
| "Image generation failed for the user's request. The user tried to generate an image with the instruction: '" | |
| + generate_image_instruction + "'\n\n\n" | |
| "Please explain to the user that image generation failed and suggest they wait 15 seconds before trying again.\n" | |
| "Be helpful and empathetic in your response.\n\n\n" | |
| "Use the same language as the previous user input or user request.\n" | |
| "For example, if the previous user input or user request is in Indonesian, explain in Indonesian.\n" | |
| "If it is in English, explain in English. This also applies to other languages.\n\n\n" | |
| ) | |
| } | |
| ] | |
| ) | |
| # Use AI to generate a contextual error message | |
| async for error_response in jarvis( | |
| session_id=session_id, | |
| model=selected_model, | |
| history=generation_failed, | |
| user_message=input, | |
| mode="/no_think", # Use non-reasoning mode for error handling | |
| temperature=0.7, # Fixed temperature for more consistent error messages | |
| top_k=20, # Limit token sampling | |
| min_p=0, # Minimum probability threshold | |
| top_p=0.8, # Nucleus sampling threshold | |
| repetition_penalty=1 # No repetition penalty | |
| ): | |
| # Yield the AI-generated error response wrapped in tool role | |
| yield [{"role": "tool", "content": error_response}] | |
| return | |
| # Handle deep search command if enabled and input starts with '/dp' prefix | |
| if search_gen and lowered_input.startswith("/dp"): | |
| # Extract the search query after the '/dp' command prefix and strip whitespace | |
| search_query = input[3:].strip() | |
| # If no search query is provided after the command, yield empty and exit early | |
| if not search_query: | |
| yield [] | |
| return | |
| try: | |
| # Perform an asynchronous deep search using SearchTools with the given query | |
| search_results = await search_tools.search(search_query) | |
| # Serialize the search query and results (limited to first 5000 characters) into JSON string | |
| search_content = json.dumps({ | |
| "query": search_query, | |
| "search_results": search_results[:5000] | |
| }) | |
| # Construct conversation history including deep search results and detailed instructions for summarization | |
| search_instructions = ( | |
| new_history | |
| + [ | |
| { | |
| "role": "system", | |
| "content": ( | |
| "Deep search results for query: '" + search_query + "':\n\n\n" + search_content + "\n\n\n" | |
| "Please analyze these search results and provide a comprehensive summary of the information.\n" | |
| "Identify the most relevant information related to the query.\n" | |
| "Format your response in a clear, structured way with appropriate headings and bullet points if needed.\n" | |
| "If the search results don't provide sufficient information, acknowledge this limitation.\n" | |
| "Please provide links or URLs from each of your search results.\n\n\n" | |
| "Use the same language as the previous user input or user request.\n" | |
| "For example, if the previous user input or user request is in Indonesian, explain in Indonesian.\n" | |
| "If it is in English, explain in English. This also applies to other languages.\n\n\n" | |
| ) | |
| } | |
| ] | |
| ) | |
| # Use async generator to process the deep search results and generate a summary response | |
| async for search_response in jarvis( | |
| session_id=session_id, | |
| model=selected_model, | |
| history=search_instructions, | |
| user_message=input, | |
| mode=mode, # Use the mode determined by the thinking flag | |
| temperature=temperature, | |
| top_k=top_k, | |
| min_p=min_p, | |
| top_p=top_p, | |
| repetition_penalty=repetition_penalty | |
| ): | |
| # Yield the search summary wrapped in a tool role for proper UI display | |
| yield [{"role": "tool", "content": search_response}] | |
| return | |
| except Exception as e: | |
| # If deep search fails, let AI generate a contextual error message | |
| generation_failed = ( | |
| new_history | |
| + [ | |
| { | |
| "role": "system", | |
| "content": ( | |
| "Deep search failed for the user's query: '" + search_query + "'\n\n\n" | |
| "Please explain to the user that the search operation failed and suggest they try again later.\n" | |
| "Be helpful and empathetic in your response. You can also suggest alternative approaches or workarounds.\n\n\n" | |
| "Use the same language as the previous user input or user request.\n" | |
| "For example, if the previous user input or user request is in Indonesian, explain in Indonesian.\n" | |
| "If it is in English, explain in English. This also applies to other languages.\n\n\n" | |
| ) | |
| } | |
| ] | |
| ) | |
| # Use AI to generate a contextual error message | |
| async for error_response in jarvis( | |
| session_id=session_id, | |
| model=selected_model, | |
| history=generation_failed, | |
| user_message=input, | |
| mode="/no_think", # Use non-reasoning mode for error handling | |
| temperature=0.7, # Fixed temperature for more consistent error messages | |
| top_k=20, # Limit token sampling | |
| min_p=0, # Minimum probability threshold | |
| top_p=0.8, # Nucleus sampling threshold | |
| repetition_penalty=1 # No repetition penalty | |
| ): | |
| # Yield the AI-generated error response wrapped in tool role | |
| yield [{"role": "tool", "content": error_response}] | |
| return | |
| # For all other inputs that do not match special commands, use the jarvis function to generate a normal response | |
| async for response in jarvis( | |
| session_id=session_id, | |
| model=selected_model, | |
| history=new_history, # Pass the conversation history | |
| user_message=input, | |
| mode=mode, # Use the mode determined by the thinking flag | |
| files=files, # Pass any attached files along with the message | |
| temperature=temperature, | |
| top_k=top_k, | |
| min_p=min_p, | |
| top_p=top_p, | |
| repetition_penalty=repetition_penalty | |
| ): | |
| # Yield each chunk of the response as it is generated by the AI model | |
| yield response |