Polarisailabs committed
Commit e185f0f · verified · 1 parent: be5d7c3

Upload app.py

Files changed (1)
  1. app.py +59 -256
app.py CHANGED
@@ -1,269 +1,72 @@
- # -*- coding: utf-8 -*-
- # Install required libraries if running outside Colab
- # !pip install gradio yt-dlp moviepy pillow speechrecognition llama-index lancedb google-generativeai
-
  import gradio as gr
  from moviepy import VideoFileClip
  from pathlib import Path
  import speech_recognition as sr
  from PIL import Image
- import os
- import shutil
- import json
- import matplotlib.pyplot as plt
- import yt_dlp
- import requests
- import base64
  from io import BytesIO
-
- # Add your existing methods here (download_video, video_to_images, video_to_audio, audio_to_text, prepare_video...)
-
  def plot_images(image_paths):
-     images_shown = 0
-     plt.figure(figsize=(16, 9))
-     img_files = []
-     for img_path in image_paths:
-         if os.path.isfile(img_path):
-             img_files.append(img_path)
-             images_shown += 1
-             if images_shown >= 7:
-                 break
-     return img_files
-
- def download_video(video_url, output_video_path="./video_data/"):
-     ydl_opts = {
-         "format": "bestvideo+bestaudio/best",
-         "merge_output_format": "mp4",
-         "outtmpl": f"{output_video_path}/input_vid.mp4",
-         "noplaylist": True,
-         "quiet": False,
-         # Uncomment and set your cookie file path if required
-         # "cookiefile": "cookies.txt",
-     }
-     Path(output_video_path).mkdir(parents=True, exist_ok=True)
-     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-         info = ydl.extract_info(video_url, download=True)
-         info = ydl.sanitize_info(info)
-         return {
-             "title": info.get("title"),
-             "uploader": info.get("uploader"),
-             "views": info.get("view_count"),
-         }
-
- def video_to_images(video_path, output_folder):
-     Path(output_folder).mkdir(parents=True, exist_ok=True)
-     clip = VideoFileClip(video_path)
-     clip.write_images_sequence(
-         os.path.join(output_folder, "frame%04d.png"), fps=0.2
-     )
-
- def video_to_audio(video_path, output_audio_path):
-     clip = VideoFileClip(video_path)
-     audio = clip.audio
-     audio.write_audiofile(output_audio_path)
-
  def audio_to_text(audio_path):
-     recognizer = sr.Recognizer()
-     try:
-         with sr.AudioFile(audio_path) as source:
-             audio_data = recognizer.record(source)
-             text = recognizer.recognize_google(audio_data)
-             return text
-     except sr.UnknownValueError:
-         print("Google Speech Recognition could not understand the audio.")
-     except sr.RequestError as e:
-         print(f"Could not request results: {e}")
-     return None
-
- def prepare_all_videos(
-     video_folder="./video_data/",
-     output_folder="./mixed_data/"
- ):
-     """
-     Processes all video files in video_folder, extracting images and text for each,
-     and stores them in unique subfolders under output_folder.
-     Returns a list of metadata dicts for all videos.
-     """
-     Path(output_folder).mkdir(parents=True, exist_ok=True)
-     video_files = [f for f in os.listdir(video_folder) if f.lower().endswith(('.mp4', '.mov', '.avi', '.mkv'))]
-     all_metadata = []
-     for video_file in video_files:
-         video_path = os.path.join(video_folder, video_file)
-         video_name = Path(video_file).stem
-         video_output_folder = os.path.join(output_folder, video_name)
-         Path(video_output_folder).mkdir(parents=True, exist_ok=True)
-         audio_path = os.path.join(video_output_folder, "output_audio.wav")
-         # Extract images and audio
-         video_to_images(video_path, video_output_folder)
-         video_to_audio(video_path, audio_path)
-         # Transcribe audio
-         text_data = audio_to_text(audio_path)
-         text_path = os.path.join(video_output_folder, "output_text.txt")
-         with open(text_path, "w") as file:
-             file.write(text_data if text_data else "")
-         os.remove(audio_path)
-         # Dummy metadata; enhance as needed
-         meta = {
-             "title": video_name,
-             "uploader": "unknown",
-             "views": "unknown",
-             "file": video_file
-         }
-         all_metadata.append({"meta": meta, "text": text_data, "folder": video_output_folder})
-     return all_metadata
-
  from llama_index.core.indices import MultiModalVectorStoreIndex
- from llama_index.core import SimpleDirectoryReader, StorageContext
  from llama_index.vector_stores.lancedb import LanceDBVectorStore
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
  from llama_index.core import Settings
-
- def create_vector_db_for_all(image_txt_root_folder: str):
-     """
-     Loads all subfolders in image_txt_root_folder as documents for the vector DB.
-     """
-     text_store = LanceDBVectorStore(uri="lancedb", table_name="text_collection")
-     image_store = LanceDBVectorStore(uri="lancedb", table_name="image_collection")
-     storage_context = StorageContext.from_defaults(
-         vector_store=text_store, image_store=image_store
-     )
-     Settings.embed_model = HuggingFaceEmbedding(
-         model_name="sentence-transformers/all-MiniLM-L6-v2"
-     )
-     # Load all subfolders as documents
-     documents = []
-     for subfolder in Path(image_txt_root_folder).iterdir():
-         if subfolder.is_dir():
-             documents.extend(SimpleDirectoryReader(str(subfolder)).load_data())
-     index = MultiModalVectorStoreIndex.from_documents(
-         documents,
-         storage_context=storage_context,
-     )
-     retriever_engine = index.as_retriever(
-         similarity_top_k=2, image_similarity_top_k=3
-     )
-     return retriever_engine
-
  from llama_index.core.schema import ImageNode
-
- def retrieve(retriever_engine, query_str):
-     retrieval_results = retriever_engine.retrieve(query_str)
-     retrieved_image = []
-     retrieved_text = []
-     for res_node in retrieval_results:
-         if isinstance(res_node.node, ImageNode):
-             retrieved_image.append(res_node.node.metadata["file_path"])
-         else:
-             retrieved_text.append(res_node.text)
-     return retrieved_image, retrieved_text
-
- qa_tmpl_str = (
-     "Given the provided information, including relevant images and retrieved context from the video, \
- accurately and precisely answer the query without any additional prior knowledge.\n"
-     "Please ensure honesty and responsibility, refraining from any racist or sexist remarks.\n"
-     "---------------------\n"
-     "Context: {context_str}\n"
-     "Metadata for video: {metadata_str} \n"
-     "---------------------\n"
-     "Query: {query_str}\n"
-     "Answer: "
- )
-
- # Define model values and their corresponding labels
- available_models = [
-     {"value": "meta-llama/llama-4-maverick:free", "label": "Llama"},
-     {"value": "qwen/qwen2.5-vl-72b-instruct:free", "label": "Qwen"},
-     {"value": "google/gemma-3-27b-it:free", "label": "Gemma"},
-     {"value": "moonshotai/kimi-vl-a3b-thinking:free", "label": "Kimi"},
-     {"value": "google/gemini-2.0-flash-exp:free", "label": "Gemini"},
-     # Add more models here if needed
- ]
-
- # Helpers to map between model labels and values
- model_value_to_label = {item["value"]: item["label"] for item in available_models}
- model_label_to_value = {item["label"]: item["value"] for item in available_models}
-
- # Gradio interface function
- def gradio_chat(query, model_label):
-     output_video_path = "./video_data/"
-     output_folder = "./mixed_data/"
-
-     try:
-         # Process all videos
-         all_metadata = prepare_all_videos(output_video_path, output_folder)
-         # Combine metadata for all videos
-         metadata_str = json.dumps([item["meta"] for item in all_metadata])
-         retriever_engine = create_vector_db_for_all(output_folder)
-
-         img, txt = retrieve(retriever_engine=retriever_engine, query_str=query)
-         context_str = "".join(txt)
-         prompt = qa_tmpl_str.format(
-             context_str=context_str, query_str=query, metadata_str=metadata_str
-         )
-
-         OPENROUTER_API_KEY = os.environ['OPENROUTER_API_KEY']
-         headers = {
-             "Authorization": f"Bearer {OPENROUTER_API_KEY}",
-             "Content-Type": "application/json",
-             "HTTP-Referer": "<YOUR_SITE_URL>",
-             "X-Title": "<YOUR_SITE_NAME>",
-         }
-
-         model_name = model_label_to_value.get(model_label, available_models[0]["value"])
-
-         messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
-         image_paths = []
-         for img_path in img:
-             try:
-                 image = Image.open(img_path)
-                 buffered = BytesIO()
-                 image.save(buffered, format="JPEG")
-                 img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
-                 messages[0]["content"].append({
-                     "type": "image_url",
-                     "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
-                 })
-                 image_paths.append(img_path)
-             except Exception as e:
-                 print(f"Error loading image {img_path}: {e}")
-
-         data = {
-             "model": model_name,
-             "messages": messages,
-         }
-
-         response = requests.post(
-             url="https://openrouter.ai/api/v1/chat/completions",
-             headers=headers,
-             data=json.dumps(data)
-         )
-         response.raise_for_status()
-         result_text = response.json()['choices'][0]['message']['content']
-
-         return result_text, image_paths
-     except Exception as e:
-         return f"Error: {str(e)}", []
-
- # Gradio UI
-
- gradio_ui = gr.Interface(
-     fn=gradio_chat,
-     inputs=[
-         gr.Textbox(label="", placeholder="Try: Best island in Maldives"),
-         gr.Dropdown(
-             choices=[item["label"] for item in available_models],
-             value=available_models[0]["label"],
-             label="Select Model:"
-         )
-     ],
-     outputs=[
-         gr.Textbox(label="Vega Response:"),
-         gr.Gallery(label="Relevant Images", allow_preview=True),
-     ],
-     title="",
-     description="",
-     theme=gr.themes.Default(primary_hue="sky")
- )
-
- if __name__ == "__main__":
-     gradio_ui.launch(share=True)
 
+ _H='./mixed_data/'
+ _G='text'
+ _F='uploader'
+ _E='title'
+ _D='./video_data/'
+ _C='value'
+ _B='label'
+ _A=True
  import gradio as gr
  from moviepy import VideoFileClip
  from pathlib import Path
  import speech_recognition as sr
  from PIL import Image
+ import os,shutil,json,matplotlib.pyplot as plt,yt_dlp,requests,base64
  from io import BytesIO
  def plot_images(image_paths):
+     A=0;plt.figure(figsize=(16,9));B=[]
+     for C in image_paths:
+         if os.path.isfile(C):
+             B.append(C);A+=1
+             if A>=7:break
+     return B
+ def download_video(video_url,output_video_path=_D):
+     B=output_video_path;D={'format':'bestvideo+bestaudio/best','merge_output_format':'mp4','outtmpl':f"{B}/input_vid.mp4",'noplaylist':_A,'quiet':False};Path(B).mkdir(parents=_A,exist_ok=_A)
+     with yt_dlp.YoutubeDL(D)as C:A=C.extract_info(video_url,download=_A);A=C.sanitize_info(A);return{_E:A.get(_E),_F:A.get(_F),'views':A.get('view_count')}
+ def video_to_images(video_path,output_folder):A=output_folder;Path(A).mkdir(parents=_A,exist_ok=_A);B=VideoFileClip(video_path);B.write_images_sequence(os.path.join(A,'frame%04d.png'),fps=.2)
+ def video_to_audio(video_path,output_audio_path):A=VideoFileClip(video_path);B=A.audio;B.write_audiofile(output_audio_path)
  def audio_to_text(audio_path):
+     A=sr.Recognizer()
+     try:
+         with sr.AudioFile(audio_path)as B:C=A.record(B);D=A.recognize_google(C);return D
+     except sr.UnknownValueError:print('Google Speech Recognition could not understand the audio.')
+     except sr.RequestError as E:print(f"Could not request results: {E}")
+ def prepare_all_videos(video_folder=_D,output_folder=_H):
+     '\n Processes all video files in video_folder, extracting images and text for each,\n and stores them in unique subfolders under output_folder.\n Returns a list of metadata dicts for all videos.\n ';J='unknown';F=output_folder;E=video_folder;Path(F).mkdir(parents=_A,exist_ok=_A);K=[A for A in os.listdir(E)if A.lower().endswith(('.mp4','.mov','.avi','.mkv'))];G=[]
+     for B in K:
+         H=os.path.join(E,B);I=Path(B).stem;A=os.path.join(F,I);Path(A).mkdir(parents=_A,exist_ok=_A);C=os.path.join(A,'output_audio.wav');video_to_images(H,A);video_to_audio(H,C);D=audio_to_text(C);L=os.path.join(A,'output_text.txt')
+         with open(L,'w')as M:M.write(D if D else'')
+         os.remove(C);N={_E:I,_F:J,'views':J,'file':B};G.append({'meta':N,_G:D,'folder':A})
+     return G
  from llama_index.core.indices import MultiModalVectorStoreIndex
+ from llama_index.core import SimpleDirectoryReader,StorageContext
  from llama_index.vector_stores.lancedb import LanceDBVectorStore
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
  from llama_index.core import Settings
+ def create_vector_db_for_all(image_txt_root_folder):
+     '\n Loads all subfolders in image_txt_root_folder as documents for the vector DB.\n ';C='lancedb';D=LanceDBVectorStore(uri=C,table_name='text_collection');E=LanceDBVectorStore(uri=C,table_name='image_collection');F=StorageContext.from_defaults(vector_store=D,image_store=E);Settings.embed_model=HuggingFaceEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2');A=[]
+     for B in Path(image_txt_root_folder).iterdir():
+         if B.is_dir():A.extend(SimpleDirectoryReader(str(B)).load_data())
+     G=MultiModalVectorStoreIndex.from_documents(A,storage_context=F);H=G.as_retriever(similarity_top_k=2,image_similarity_top_k=3);return H
  from llama_index.core.schema import ImageNode
+ def retrieve(retriever_engine,query_str):
+     D=retriever_engine.retrieve(query_str);B=[];C=[]
+     for A in D:
+         if isinstance(A.node,ImageNode):B.append(A.node.metadata['file_path'])
+         else:C.append(A.text)
+     return B,C
+ qa_tmpl_str='Given the provided information, including relevant images and retrieved context from the video, accurately and precisely answer the query without any additional prior knowledge.\nPlease ensure honesty and responsibility, refraining from any racist or sexist remarks.\n---------------------\nContext: {context_str}\nMetadata for video: {metadata_str} \n---------------------\nQuery: {query_str}\nAnswer: '
+ available_models=[{_C:'meta-llama/llama-4-maverick:free',_B:'Llama'},{_C:'qwen/qwen2.5-vl-72b-instruct:free',_B:'Qwen'},{_C:'google/gemma-3-27b-it:free',_B:'Gemma'},{_C:'moonshotai/kimi-vl-a3b-thinking:free',_B:'Kimi'},{_C:'google/gemini-2.0-flash-exp:free',_B:'Gemini'}]
+ model_value_to_label={A[_C]:A[_B]for A in available_models}
+ model_label_to_value={A[_B]:A[_C]for A in available_models}
+ def gradio_chat(query,model_label):
+     K='image_url';J='type';D=query;C='content';L=_D;E=_H
+     try:
+         M=prepare_all_videos(L,E);N=json.dumps([A['meta']for A in M]);O=create_vector_db_for_all(E);P,Q=retrieve(retriever_engine=O,query_str=D);R=''.join(Q);S=qa_tmpl_str.format(context_str=R,query_str=D,metadata_str=N);T=os.environ['OPENROUTER_API_KEY'];U={'Authorization':f"Bearer {T}",'Content-Type':'application/json','HTTP-Referer':'<YOUR_SITE_URL>','X-Title':'<YOUR_SITE_NAME>'};V=model_label_to_value.get(model_label,available_models[0][_C]);F=[{'role':'user',C:[{J:_G,_G:S}]}];G=[]
+         for A in P:
+             try:W=Image.open(A);H=BytesIO();W.save(H,format='JPEG');X=base64.b64encode(H.getvalue()).decode('utf-8');F[0][C].append({J:K,K:{'url':f"data:image/jpeg;base64,{X}"}});G.append(A)
+             except Exception as B:print(f"Error loading image {A}: {B}")
+         Y={'model':V,'messages':F};I=requests.post(url='https://openrouter.ai/api/v1/chat/completions',headers=U,data=json.dumps(Y));I.raise_for_status();Z=I.json()['choices'][0]['message'][C];return Z,G
+     except Exception as B:return f"Error: {str(B)}",[]
+ gradio_ui=gr.Interface(fn=gradio_chat,inputs=[gr.Textbox(label='',placeholder='Try: Best island in Maldives'),gr.Dropdown(choices=[A[_B]for A in available_models],value=available_models[0][_B],label='Select Model:')],outputs=[gr.Textbox(label='Vega Response:'),gr.Gallery(label='Relevant Images',allow_preview=_A)],title='',description='',theme=gr.themes.Default(primary_hue='sky'))
+ if __name__=='__main__':gradio_ui.launch(share=_A)
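
Note on the new version: the rewritten app.py looks machine-minified rather than hand-edited. Repeated literals are hoisted into the module-level aliases _A through _H, locals are renamed to single letters, imports are combined onto one line, and statement groups are collapsed with semicolons, while public names such as gradio_chat survive. This output style matches the python-minifier package; that attribution is an assumption, since the commit does not name a tool. A minimal sketch of reproducing it, with app_source.py as a hypothetical path to the readable version:

    # Sketch (assumption): regenerate a minified app.py in the style seen in this diff.
    # Requires the python-minifier package (pip install python-minifier); the commit
    # does not state which tool actually produced the new file.
    import python_minifier

    with open("app_source.py") as f:  # hypothetical path to the readable version
        source = f.read()

    minified = python_minifier.minify(
        source,
        rename_locals=True,    # single-letter local names: A, B, C, ...
        hoist_literals=True,   # repeated literals become _A, _B, ... aliases
        rename_globals=False,  # keep public names like gradio_chat intact
    )

    with open("app.py", "w") as f:
        f.write(minified)

If the file was indeed generated this way, rename_globals=False would explain why top-level names (plot_images, download_video, qa_tmpl_str) stay readable in the diff while everything inside them is renamed, and the default handling of docstrings would explain why they survive as single-line string literals.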