Bman21 commited on
Commit
a03bd33
·
verified ·
1 Parent(s): 64e3bee

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -159
app.py DELETED
@@ -1,159 +0,0 @@
1
- import gradio as gr
2
- from huggingface_hub import InferenceClient
3
- import os
4
- import faiss
5
- import numpy as np
6
- from sentence_transformers import SentenceTransformer
7
- import PyPDF2
8
- import pytesseract
9
- from pdf2image import convert_from_path
10
- import gdown
11
- import pickle
12
-
13
# --- Configuration ---
MODEL_NAME = "openai/gpt-oss-20b"
SECURE_HF_TOKEN = os.environ.get("HF_TOKEN")

# Fail fast: without an inference token the Space cannot answer anything.
if not SECURE_HF_TOKEN:
    raise ValueError("HF_TOKEN environment variable not set. Add a Secret in Space settings.")

# Remote LLM client and local sentence embedder used for retrieval.
client = InferenceClient(token=SECURE_HF_TOKEN, model=MODEL_NAME)
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Parallel lists: chunks[i] is a text piece, sources[i] the subject it came from.
chunks, sources = [], []
23
-
24
# --- Google Drive PDFs: FILE_ID → Subject ---
# Each key is a Google Drive file id; each value becomes both the local
# PDF filename (<subject>.pdf) and the source label shown to the user.
drive_files = {
    "1wP4uK_GA6rjg_4maZpmbDNSynhvZnCNo": "PSM",
    "1LtwDGeWLF357elmtbA-R_uujHqmM2jC_": "PSM2",
    "1wnqsdt0st5wy60zAg7DKZFeAl_vMgu4T": "FMT concise reddy",
    "1BhySXMqZxcnLSccq-D0UB9-rGb1YIsN1": "FMT",
    "1sNoc8qLR5VznT28MIrJ0CvCgW6OunF_v": "Pediatrics",
    "1s9772ypXMudsLSdHn1xhtGcZCFFghYB0": "Medicine1",
    "1sAAwpNCqfbjB-d5GqImj50qZzUDopvtR": "Gynae",
    "1rvgHxpzvE7v4Ed13UXBTaxn3GXljfccE": "ENT",
    "1vd7wg3HlQanVl8Nk-W90wJOyYRzunu0g": "Ophthalmology",
    # Add remaining FILE_IDs here
}
41
-
42
# --- Local storage for downloaded notes and the embeddings cache ---
notes_folder = "notes"
os.makedirs(notes_folder, exist_ok=True)
cache_file = os.path.join(notes_folder, "embeddings_cache.pkl")


def _extract_pdf_text(pdf_path):
    """Return the text of *pdf_path*, falling back to OCR for scanned PDFs.

    Tries the embedded text layer first (fast); if that yields nothing,
    rasterizes the pages and runs Tesseract. Returns "" when both fail,
    so the caller simply produces no chunks for this file.
    """
    try:
        reader = PyPDF2.PdfReader(pdf_path)
        # extract_text() may return None for image-only pages; call it
        # once per page (the original called it twice) and keep non-empty results.
        page_texts = []
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
        text = " ".join(page_texts)
    except Exception as e:  # was a bare `except:` — narrow and report instead of hiding
        print(f"Warning: text extraction failed for {pdf_path}: {e}")
        text = ""

    # OCR fallback only when no usable text layer exists.
    if not text.strip():
        try:
            for img in convert_from_path(pdf_path):
                text += pytesseract.image_to_string(img) + " "
        except Exception as e:
            print(f"Warning: OCR failed for {pdf_path}: {e}")
            return ""
    return text


if os.path.exists(cache_file):
    # Reuse previously computed chunks/embeddings so restarts don't
    # re-download and re-embed everything.
    # NOTE(review): pickle is only safe because this cache is written by
    # this app itself, never by untrusted input.
    with open(cache_file, "rb") as f:
        chunks, sources, embeddings = pickle.load(f)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.asarray(embeddings, dtype="float32"))
else:
    # --- Download PDFs and extract text safely ---
    for file_id, subject in drive_files.items():
        output_path = os.path.join(notes_folder, f"{subject}.pdf")
        if not os.path.exists(output_path):
            url = f"https://drive.google.com/uc?id={file_id}"
            try:
                gdown.download(url, output_path, quiet=False)
            except Exception as e:
                print(f"Warning: Could not download {subject}: {e}")
                continue  # skip this PDF

        text = _extract_pdf_text(output_path)

        # Fixed-size 500-character chunks, no overlap; empty text yields no chunks.
        file_chunks = [text[i:i + 500] for i in range(0, len(text), 500)]
        chunks.extend(file_chunks)
        sources.extend([subject] * len(file_chunks))

    # Build FAISS index over all collected chunks, then persist to cache.
    if chunks:
        embeddings = embedder.encode(chunks)
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(np.asarray(embeddings, dtype="float32"))
        with open(cache_file, "wb") as f:
            pickle.dump((chunks, sources, embeddings), f)
    else:
        # No usable PDFs at all — respond() checks for this and skips retrieval.
        index = None
99
-
100
# --- Chat respond function with source display ---
def respond(message, history: list, system_message, max_tokens, temperature, top_p):
    """Stream a chat completion grounded in the retrieved lecture notes.

    Looks up the 3 nearest chunks in the FAISS index (when one was built),
    prefixes the user prompt with their text and subject names, then
    streams the model's reply, yielding the accumulated response so
    Gradio can render it incrementally.
    """
    context = ""
    source_names = set()
    if index is not None and len(chunks) > 0:
        query_emb = np.asarray(embedder.encode([message]), dtype="float32")
        k = min(3, len(chunks))
        D, I = index.search(query_emb, k=k)
        # faiss pads results with -1 when fewer than k neighbours exist;
        # walk I[0] once instead of twice for chunks and sources.
        retrieved = [(chunks[i], sources[i]) for i in I[0] if i != -1]
        if retrieved:
            # was "\n".join([lst[j] for j in range(len(lst))]) — join directly
            context = "\n".join(chunk for chunk, _ in retrieved)
            source_names.update(subject for _, subject in retrieved)

    messages = [{"role": "system", "content": system_message}]
    messages.extend(history)

    source_text = ""
    if source_names:
        source_text = "Sources: " + ", ".join(sorted(source_names)) + "\n\n"

    prompt_content = f"{source_text}Answer using the following notes if relevant:\n{context}\n\nQuestion: {message}"
    messages.append({"role": "user", "content": prompt_content})

    response = ""
    for message_chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        choices = message_chunk.choices
        # Some stream chunks carry no content (e.g. role-only deltas) — skip them.
        if choices and choices[0].delta.content:
            response += choices[0].delta.content
        yield response
139
-
140
# --- Gradio interface ---
# Extra controls exposed under the chat box; values are passed to respond()
# in this order after (message, history).
system_box = gr.Textbox(value="Hey, need help?", label="System message")
tokens_slider = gr.Slider(1, 5000, value=3000, step=1, label="Max new tokens")
temp_slider = gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature")
top_p_slider = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")

chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=[system_box, tokens_slider, temp_slider, top_p_slider],
)

with gr.Blocks() as demo:
    with gr.Sidebar():
        gr.Markdown("# AI Chatbot with Google Drive PDFs & Source Display")
        gr.Markdown("Handles failed downloads, OCR, caching, and shows sources automatically")
    chatbot.render()

if __name__ == "__main__":
    demo.launch()