Bellok committed on
Commit
1635a41
·
1 Parent(s): 81c2b00

feat(rag): enable additional datasets and add hybrid search fallback

Browse files

- Uncommented and enabled 'arxiv', 'edustories', 'manuals', 'enterprise', and 'prompt-report' dataset packs for broader knowledge coverage; removed the 'npc-dialogue' pack and the commented-out 'portuguese-edu' entry
- Added hybrid fallback mechanism in query_warbler() to perform pure semantic search (with the confidence threshold lowered to 0.2) if the FractalStat hybrid search yields no results, improving retrieval reliability
- Updated document count in Gradio UI from '2.6M+' to '100k+' for accuracy
- Removed auto-sharing from demo launch for controlled access

This expands the RAG system's data sources, enhances query robustness with fallback logic, corrects the overstated document count shown in the UI, and removes auto-sharing from the demo launch.

Files changed (1) hide show
  1. app.py +28 -9
app.py CHANGED
@@ -82,14 +82,12 @@ if len(documents) == 0:
82
 
83
  # Enable all available HF dataset packs for maximum knowledge diversity
84
  datasets_to_download = [
85
- #"arxiv", # Physics and mathematics papers
86
- #"edustories", # Educational narratives and stories
87
  "novels", # Fiction literature
88
- #"manuals", # Technical documentation
89
- #"enterprise", # Business and corporate content
90
- "npc-dialogue", # Game character conversations
91
- #"portuguese-edu", # Portuguese educational content
92
- #"prompt-report" # AI prompt engineering reports
93
  ]
94
 
95
  total_docs = 0
@@ -225,6 +223,27 @@ def query_warbler(query_text: str, max_results: int = 5, use_hybrid: bool = True
225
  else:
226
  print(f"DEBUG: No results above threshold: {query.confidence_threshold}")
227
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  # Format results
229
  output = f"## Query Results\n\n"
230
  output += f"**Query:** {query_text}\n\n"
@@ -267,7 +286,7 @@ with gr.Blocks(title="Warbler CDA - FractalStat RAG") as demo:
267
  Semantic retrieval with 8D FractalStat multi-dimensional addressing.
268
 
269
  **Features:**
270
- - 2.6M+ documents from arXiv, education, fiction, and more
271
  - Hybrid semantic + FractalStat scoring
272
  - Bob the Skeptic bias detection
273
  - Narrative coherence analysis
@@ -344,4 +363,4 @@ with gr.Blocks(title="Warbler CDA - FractalStat RAG") as demo:
344
  """)
345
 
346
  if __name__ == "__main__":
347
- demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
 
82
 
83
  # Enable all available HF dataset packs for maximum knowledge diversity
84
  datasets_to_download = [
85
+ "arxiv", # Physics and mathematics papers
86
+ "edustories", # Educational narratives and stories
87
  "novels", # Fiction literature
88
+ "manuals", # Technical documentation
89
+ "enterprise", # Business and corporate content
90
+ "prompt-report" # AI prompt engineering reports
 
 
91
  ]
92
 
93
  total_docs = 0
 
223
  else:
224
  print(f"DEBUG: No results above threshold: {query.confidence_threshold}")
225
 
226
+ # Hybrid Fallback: If hybrid mode and no results, fall back to pure semantic search
227
+ if use_hybrid and len(assembly.results) == 0:
228
+ print(f"DEBUG: Hybrid returned 0 results, falling back to pure semantic search")
229
+ # Reset timer for fallback query
230
+ fallback_start = time.time()
231
+ query.confidence_threshold = 0.2 # Lower threshold for semantic fallback
232
+ query.fractalstat_hybrid = False # Disable hybrid for this query
233
+
234
+ # Re-execute query with semantic-only mode
235
+ assembly = api.retrieve_context(query)
236
+ fallback_ms = (time.time() - fallback_start) * 1000
237
+ elapsed_ms = (time.time() - start_time) * 1000 # Update total time
238
+
239
+ print(f"DEBUG: Semantic fallback completed in {fallback_ms:.0f}ms, found {len(assembly.results)} results")
240
+ if assembly.results:
241
+ print(f"DEBUG: Top 3 relevance scores from semantic: {[r.relevance_score for r in assembly.results[:3]]}")
242
+ print(f"DEBUG: Hybrid fallback successful - results returned via semantic search")
243
+
244
+ hybrid_fallback_used = use_hybrid and len(assembly.results) > 0 and not query.fractalstat_hybrid
245
+ query_mode_display = f"{query_mode} (+ Semantic Fallback)" if hybrid_fallback_used else query_mode
246
+
247
  # Format results
248
  output = f"## Query Results\n\n"
249
  output += f"**Query:** {query_text}\n\n"
 
286
  Semantic retrieval with 8D FractalStat multi-dimensional addressing.
287
 
288
  **Features:**
289
+ - 100k+ documents from arXiv, education, fiction, and more
290
  - Hybrid semantic + FractalStat scoring
291
  - Bob the Skeptic bias detection
292
  - Narrative coherence analysis
 
363
  """)
364
 
365
  if __name__ == "__main__":
366
+ demo.launch(server_name="0.0.0.0", server_port=7860)