Spaces:

mgokg
/

MultimodalLiveApi

Sleeping

App Files Files Community

mgokg committed on Mar 9

Commit

3117482

verified ·

1 Parent(s): a52af47

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -298

app.py CHANGED Viewed

@@ -1,300 +1,24 @@
-import os
-import base64
-import json
-import numpy as np
 import gradio as gr
-# import websockets.sync.client  # No longer needed with FastRTC
-from fastrtc import (
-    #PeerConnection,
-    DataChannel,
-    MediaStreamTrack,
-    AudioFrame,
-    VideoFrame,
-)  # Import FastRTC components
-from aiortc.contrib.media import MediaPlayer, MediaRelay
-import asyncio
-__version__ = "0.0.3"
-# KEY_NAME = "<REDACTED>"  # Best practice: keep API keys out of the main code; use environment variables.
-# Configuration and Utilities
-class GeminiConfig:
-    """Configuration settings for Gemini API."""
-    def __init__(self):
-        self.api_key = os.environ.get("KEY_NAME")  # Use a more descriptive name
-        if not self.api_key:
-            raise ValueError("GEMINI_API_KEY environment variable is not set.")
-        self.host = "generativelanguage.googleapis.com"
-        self.model = "models/gemini-2.0-flash-exp"
-        #  FastRTC doesn't use WebSockets directly in the same way.  We'll handle the API calls differently.
-        self.base_url = f"https://{self.host}/v1alpha/{self.model}:streamGenerateContent?key={self.api_key}"
-class AudioProcessor:
-    """Handles encoding and decoding of audio data."""
-    @staticmethod
-    def encode_audio(data: np.ndarray, sample_rate: int) -> str:
-        """Encodes audio data to base64."""
-        # Ensure data is in the correct format (int16)
-        if data.dtype != np.int16:
-            data = data.astype(np.int16)
-        encoded = base64.b64encode(data.tobytes()).decode("UTF-8")
-        return encoded
-    @staticmethod
-    def process_audio_response(data: str) -> np.ndarray:
-        """Decodes audio data from base64."""
-        audio_data = base64.b64decode(data)
-        return np.frombuffer(audio_data, dtype=np.int16)
-#  We don't need a StreamHandler in the same way with FastRTC.  We'll handle streaming directly.
-class GeminiHandler:
-    """Handles interactions with the Gemini API."""
-    def __init__(self, output_sample_rate=24000, output_frame_size=480):
-        self.config = GeminiConfig()
-        self.audio_processor = AudioProcessor()
-        self.output_sample_rate = output_sample_rate
-        self.output_frame_size = output_frame_size
-        self.all_output_data = None
-        self.pc = None  # PeerConnection
-        self.dc = None  # DataChannel
-        self.audio_track = None
-        self._audio_buffer = []
-        self.relay = MediaRelay()
-    async def _send_audio_to_gemini(self, encoded_audio: str):
-        """Sends audio data to the Gemini API and processes the response."""
-        headers = {"Content-Type": "application/json"}
-        payload = {
-            "contents": [
-                {
-                    "parts": [
-                        {
-                            "text": "Respond to the audio with audio."
-                        },  #  Initial prompt, can be adjusted
-                        {"inline_data": {"mime_type": "audio/pcm;rate=24000", "data": encoded_audio}},
-                    ]
-                }
-            ]
-        }
-        # Use aiohttp for asynchronous HTTP requests
-        import aiohttp
-        async with aiohttp.ClientSession() as session:
-            async with session.post(
-                self.config.base_url, headers=headers, data=json.dumps(payload)
-            ) as response:
-                if response.status != 200:
-                    print(f"Error: Gemini API returned status {response.status}")
-                    print(await response.text())
-                    return
-                async for line in response.content:
-                    try:
-                        line = line.strip()
-                        if not line:
-                            continue
-                        #  Responses are chunked, often with multiple JSON objects per chunk.  Handle that.
-                        for chunk in line.decode("utf-8").split("\n"):
-                            if not chunk.strip():
-                                continue
-                            try:
-                                data = json.loads(chunk)
-                            except json.JSONDecodeError:
-                                print(f"JSONDecodeError: {chunk}")
-                                continue
-                            if "candidates" in data:
-                                for candidate in data["candidates"]:
-                                    for part in candidate.get("content", {}).get("parts", []):
-                                        if "inlineData" in part:
-                                            audio_data = part["inlineData"].get("data", "")
-                                            if audio_data:
-                                                await self._process_server_audio(audio_data)
-                    except Exception as e:
-                        print(f"Error processing response chunk: {e}")
-    async def _process_server_audio(self, audio_data: str):
-        """Processes and buffers audio data received from the server."""
-        audio_array = self.audio_processor.process_audio_response(audio_data)
-        if self.all_output_data is None:
-            self.all_output_data = audio_array
-        else:
-            self.all_output_data = np.concatenate((self.all_output_data, audio_array))
-        while self.all_output_data.shape[-1] >= self.output_frame_size:
-            frame = AudioFrame(
-                samples=self.output_frame_size,
-                sample_rate=self.output_sample_rate,
-                layout="mono", # mono channel
-                data=self.all_output_data[: self.output_frame_size].tobytes()
-            )
-            self.all_output_data = self.all_output_data[self.output_frame_size:]
-            if self.audio_track:
-                await self.audio_track.emit(frame)
-    async def on_track(self, track):
-        """Handles incoming media tracks."""
-        print(f"Track received: {track.kind}")
-        if track.kind == "audio":
-            self.audio_track = track  # Store the audio track
-            @track.on("frame")
-            async def on_frame(frame):
-                # Process received audio frames
-                if isinstance(frame, AudioFrame):
-                    try:
-                        # Convert the frame data to a NumPy array
-                        audio_data = np.frombuffer(frame.data, dtype=np.int16)
-                        # Encode the audio and send it to Gemini
-                        encoded_audio = self.audio_processor.encode_audio(
-                            audio_data, frame.sample_rate
-                        )  # Pass sample rate
-                        await self._send_audio_to_gemini(encoded_audio)
-                    except Exception as e:
-                        print(f"Error processing audio frame: {e}")
-    async def on_datachannel(self, channel):
-        """Handles data channel events (not used in this example, but good practice)."""
-        self.dc = channel
-        print("Data channel created")
-        @channel.on("message")
-        async def on_message(message):
-            print(f"Received message: {message}")
-    async def connect(self):
-        """Establishes the PeerConnection."""
-        self.pc = PeerConnection()
-        self.pc.on("track", self.on_track)
-        self.pc.on("datachannel", self.on_datachannel)
-        #  Create a local audio track to send data
-        self.local_audio_player = MediaPlayer("default", format="avfoundation", options={"channels": "1", "sample_rate": str(self.output_sample_rate)})
-        self.local_audio = self.relay.subscribe(self.local_audio_player.audio)
-        self.pc.addTrack(self.local_audio)
-        #  Add a data channel (optional, but good practice)
-        self.dc = self.pc.createDataChannel("data")
-        #  Create an offer and set local description
-        offer = await self.pc.createOffer()
-        await self.pc.setLocalDescription(offer)
-        print("PeerConnection established")
-        return self.pc.localDescription
-    async def set_remote_description(self, sdp, type):
-        """Sets the remote description."""
-        from aiortc import RTCSessionDescription
-        await self.pc.setRemoteDescription(RTCSessionDescription(sdp=sdp, type=type))
-        print("Remote description set")
-        if self.pc.remoteDescription.type == "offer":
-            answer = await self.pc.createAnswer()
-            await self.pc.setLocalDescription(answer)
-            return self.pc.localDescription
-    async def add_ice_candidate(self, candidate, sdpMid, sdpMLineIndex):
-        """Adds an ICE candidate."""
-        from aiortc import RTCIceCandidate
-        if candidate:
-            try:
-                ice_candidate = RTCIceCandidate(
-                    candidate=candidate, sdpMid=sdpMid, sdpMLineIndex=sdpMLineIndex
-                )
-                await self.pc.addIceCandidate(ice_candidate)
-                print("ICE candidate added")
-            except Exception as e:
-                print(f"Error adding ICE candidate: {e}")
-    def shutdown(self):
-        """Closes the PeerConnection."""
-        if self.pc:
-            asyncio.create_task(self.pc.close())  # Close in the background
-            self.pc = None
-        print("PeerConnection closed")
-# Gradio Interface
-async def registry(
-    name: str,
-    token: str | None = None,
-    **kwargs,
-):
-    """Sets up and returns the Gradio interface."""
-    gemini_handler = GeminiHandler()
-    async def connect_webrtc(sdp, type, candidates):
-        """Connects to the WebRTC client and handles ICE candidates."""
-        if gemini_handler.pc is None:
-            local_description = await gemini_handler.connect()
-            if local_description:
-                yield json.dumps(
-                    {
-                        "sdp": local_description.sdp,
-                        "type": local_description.type,
-                        "candidates": [],
-                    }
-                )  # Return initial SDP
-        if sdp and type:
-            answer = await gemini_handler.set_remote_description(sdp, type)
-            if answer:
-                yield json.dumps({"sdp": answer.sdp, "type": answer.type, "candidates": []})
-        for candidate in candidates:
-            if candidate and candidate.get("candidate"):
-                await gemini_handler.add_ice_candidate(
-                    candidate["candidate"], candidate.get("sdpMid"), candidate.get("sdpMLineIndex")
-                )
-        yield json.dumps({"sdp": "", "type": "", "candidates": []})  #  Signal completion
-    interface = gr.Blocks()
-    with interface:
-        with gr.Tabs():
-            with gr.TabItem("Voice Chat"):
-                gr.HTML(
-                    """
-                    <div style='text-align: left'>
-                        <h1>Gemini API Voice Chat</h1>
-                    </div>
-                    """
-                )
-                with gr.Row():
-                    webrtc_out = gr.JSON(label="WebRTC JSON")
-                # Use the built-in WebRTC component, but without automatic streaming.
-                webrtc = gr.WebRTC(
-                    value={"sdp": "", "type": "", "candidates": []},
-                    interactive=True,
-                    label="Voice Chat",
-                )
-                connect_button = gr.Button("Connect")
-                connect_button.click(
-                    connect_webrtc,
-                    inputs=[
-                        webrtc
-                    ],  # Pass the WebRTC component's value (SDP, type, candidates)
-                    outputs=[webrtc_out], # show the webrtc connection data
-                )
-    return interface
-# Launch the Gradio interface
-async def main():
-    interface = await registry(name="gemini-2.0-flash-exp")
-    interface.queue()  # Enable queuing for better concurrency
-    await interface.launch()
-if __name__ == "__main__":
-    asyncio.run(main())

 import gradio as gr
+from PyPDF2 import PdfReader
+import os
+def process_pdf(file):
+    """Extract the text of every page of an uploaded PDF.
+
+    Args:
+        file: The value Gradio passes from the ``gr.File`` input. With
+            ``type="filepath"`` this is a path string; an object exposing
+            the path as ``.name`` is accepted as well. ``None`` means
+            nothing was uploaded.
+
+    Returns:
+        The concatenated text of all pages, or an empty string when no
+        file was provided.
+    """
+    if file is None:
+        return ""
+    # gr.File(type="filepath") hands over a plain str, which has no .name.
+    path = getattr(file, "name", file)
+    pdf_reader = PdfReader(path)
+    # A page without extractable text must not break the concatenation.
+    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
+# Build the UI: a file picker, a text box for the extracted text, and an
+# upload button that runs process_pdf on the selected file.
+with gr.Blocks() as demo:
+    gr.Markdown("### File upload", elem_classes="tab-header")
+    with gr.Row():
+        text_output = gr.Textbox(label="text")
+        file_input = gr.File(label="Wähle eine PDF-Datei aus", type="filepath")
+        # Status box is displayed but not wired to any event.
+        upload_output = gr.Textbox(label="Upload Status")
+    with gr.Row():
+        submit_button = gr.Button("upload")
+    submit_button.click(process_pdf, inputs=file_input, outputs=text_output)
+# Launch only after the layout block has been fully defined.
+demo.launch()