Spaces:

Bellok
/

warbler-cda

Running on Zero

Bellok committed 27 days ago

Commit

1635a41

1 Parent(s): 81c2b00

feat(rag): enable additional datasets and add hybrid search fallback

- Uncommented and enabled 'arxiv', 'edustories', 'manuals', 'enterprise', and 'prompt-report' dataset packs for broader knowledge coverage; removed the 'npc-dialogue' pack and the commented-out 'portuguese-edu' entry
- Added hybrid fallback mechanism in query_warbler(): if the FractalStat hybrid search yields no results, the query is re-run as a pure semantic search with the confidence threshold lowered to 0.2, improving retrieval reliability
- Updated document count in Gradio UI from '2.6M+' to '100k+' for accuracy
- Removed auto-sharing from demo launch for controlled access

This expands the RAG system's data sources, makes queries more robust through the semantic fallback, corrects the overstated document count shown in the UI, and stops the demo from creating a public share link on launch.

Files changed (1) hide show

app.py +28 -9

app.py CHANGED Viewed

@@ -82,14 +82,12 @@ if len(documents) == 0:
             # Enable all available HF dataset packs for maximum knowledge diversity
             datasets_to_download = [
-                #"arxiv",      # Physics and mathematics papers
-                #"edustories", # Educational narratives and stories
                 "novels",     # Fiction literature
-                #"manuals",    # Technical documentation
-                #"enterprise", # Business and corporate content
-                "npc-dialogue", # Game character conversations
-                #"portuguese-edu", # Portuguese educational content
-                #"prompt-report"   # AI prompt engineering reports
             ]
             total_docs = 0
@@ -225,6 +223,27 @@ def query_warbler(query_text: str, max_results: int = 5, use_hybrid: bool = True
     else:
         print(f"DEBUG: No results above threshold: {query.confidence_threshold}")
     # Format results
     output = f"## Query Results\n\n"
     output += f"**Query:** {query_text}\n\n"
@@ -267,7 +286,7 @@ with gr.Blocks(title="Warbler CDA - FractalStat RAG") as demo:
     Semantic retrieval with 8D FractalStat multi-dimensional addressing.
     **Features:**
-    - 2.6M+ documents from arXiv, education, fiction, and more
     - Hybrid semantic + FractalStat scoring
     - Bob the Skeptic bias detection
     - Narrative coherence analysis
@@ -344,4 +363,4 @@ with gr.Blocks(title="Warbler CDA - FractalStat RAG") as demo:
         """)
 if __name__ == "__main__":
-    demo.launch(share=True, server_name="0.0.0.0", server_port=7860)

             # Enable all available HF dataset packs for maximum knowledge diversity
             datasets_to_download = [
+                "arxiv",      # Physics and mathematics papers
+                "edustories", # Educational narratives and stories
                 "novels",     # Fiction literature
+                "manuals",    # Technical documentation
+                "enterprise", # Business and corporate content
+                "prompt-report"   # AI prompt engineering reports
             ]
             total_docs = 0
     else:
         print(f"DEBUG: No results above threshold: {query.confidence_threshold}")
+        # Hybrid Fallback: If hybrid mode and no results, fall back to pure semantic search
+        if use_hybrid and len(assembly.results) == 0:
+            print(f"DEBUG: Hybrid returned 0 results, falling back to pure semantic search")
+            # Reset timer for fallback query
+            fallback_start = time.time()
+            query.confidence_threshold = 0.2  # Lower threshold for semantic fallback
+            query.fractalstat_hybrid = False  # Disable hybrid for this query
+            # Re-execute query with semantic-only mode
+            assembly = api.retrieve_context(query)
+            fallback_ms = (time.time() - fallback_start) * 1000
+            elapsed_ms = (time.time() - start_time) * 1000  # Update total time
+            print(f"DEBUG: Semantic fallback completed in {fallback_ms:.0f}ms, found {len(assembly.results)} results")
+            if assembly.results:
+                print(f"DEBUG: Top 3 relevance scores from semantic: {[r.relevance_score for r in assembly.results[:3]]}")
+                print(f"DEBUG: Hybrid fallback successful - results returned via semantic search")
+    hybrid_fallback_used = use_hybrid and len(assembly.results) > 0 and not query.fractalstat_hybrid
+    query_mode_display = f"{query_mode} (+ Semantic Fallback)" if hybrid_fallback_used else query_mode
     # Format results
     output = f"## Query Results\n\n"
     output += f"**Query:** {query_text}\n\n"
     Semantic retrieval with 8D FractalStat multi-dimensional addressing.
     **Features:**
+    - 100k+ documents from arXiv, education, fiction, and more
     - Hybrid semantic + FractalStat scoring
     - Bob the Skeptic bias detection
     - Narrative coherence analysis
         """)
 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)