Spaces:

Omartificial-Intelligence-Space
/

context-caching-gemini-pdf-qa

Sleeping

App Files Files Community

Omartificial-Intelligence-Space committed on Jul 12

Commit

1512726

verified ·

1 Parent(s): 9021fab

Create check_tokens.py

Browse files

Files changed (1) hide show

check_tokens.py +177 -0

check_tokens.py ADDED Viewed

	@@ -0,0 +1,177 @@

+#!/usr/bin/env python3
+"""
+Utility script to check if documents meet token requirements for Gemini API caching
+"""
+import os
+import io
+import httpx
+from google import genai
+from dotenv import load_dotenv
+# Load environment variables at import time so that os.getenv('GOOGLE_API_KEY')
+# below can see them (presumably read from a local .env file — see python-dotenv).
+load_dotenv()
+def check_document_tokens(file_path=None, url=None):
+    """Check if a document meets the minimum token requirements for caching"""
+    # Initialize client
+    client = genai.Client(api_key=os.getenv('GOOGLE_API_KEY'))
+    print("🔍 Document Token Checker")
+    print("=" * 50)
+    try:
+        if file_path:
+            print(f"📄 Checking local file: {file_path}")
+            with open(file_path, 'rb') as f:
+                file_content = f.read()
+            file_io = io.BytesIO(file_content)
+            document_name = file_path
+        elif url:
+            print(f"📄 Checking URL: {url}")
+            response = httpx.get(url)
+            response.raise_for_status()
+            file_io = io.BytesIO(response.content)
+            document_name = url
+        else:
+            print("❌ Error: Please provide either file_path or url")
+            return
+        print("📤 Uploading to Gemini File API...")
+        # Upload to Gemini File API
+        document = client.files.upload(
+            file=file_io,
+            config=dict(mime_type='application/pdf')
+        )
+        print("✅ File uploaded successfully!")
+        # Try to create a cache to check token count
+        print("💾 Attempting to create cache to check token count...")
+        try:
+            cache = client.caches.create(
+                model="gemini-2.0-flash-001",
+                config=genai.types.CreateCachedContentConfig(
+                    system_instruction="Test system instruction for token counting.",
+                    contents=[document],
+                )
+            )
+            token_count = getattr(cache.usage_metadata, 'cached_token_count', 0)
+            print(f"📊 Token count: {token_count:,}")
+            print(f"📏 Minimum required: 4,096")
+            if token_count >= 4096:
+                print("✅ Document meets caching requirements!")
+                print("💡 This document is suitable for caching.")
+                # Calculate cost benefits
+                questions = [5, 10, 20, 50]
+                print("\n💰 Cost-Benefit Analysis:")
+                print("Questions | Without Cache | With Cache | Savings")
+                print("-" * 50)
+                for q in questions:
+                    without_cache = token_count * q
+                    with_cache = token_count + (50 * q)  # Assuming 50 tokens per question
+                    savings = ((without_cache - with_cache) / without_cache) * 100
+                    print(f"{q:9d} | {without_cache:12,} | {with_cache:10,} | {savings:6.1f}%")
+            else:
+                print("❌ Document does not meet caching requirements")
+                print(f"📝 Need {4096 - token_count:,} more tokens")
+                print("💡 Consider:")
+                print("   • Uploading a longer document")
+                print("   • Combining multiple documents")
+                print("   • Using regular analysis (without caching)")
+            # Clean up
+            print(f"\n🗑️ Cleaning up test cache...")
+            client.caches.delete(cache.name)
+            print("✅ Test cache deleted!")
+        except Exception as e:
+            if "Cached content is too small" in str(e):
+                print("❌ Document is too small for caching")
+                print("💡 This document has fewer than 4,096 tokens")
+                print("📝 Recommendations:")
+                print("   • Upload a longer document")
+                print("   • Combine multiple small documents")
+                print("   • Use regular analysis without caching")
+            else:
+                print(f"❌ Error creating cache: {e}")
+    except Exception as e:
+        print(f"❌ Error: {e}")
+def estimate_tokens_from_file_size(file_path):
+    """Rough estimation of tokens based on file size"""
+    try:
+        file_size = os.path.getsize(file_path)
+        # Rough estimation: 1 token ≈ 4 characters, 1 character ≈ 1 byte for text
+        # For PDFs, this is very rough as they contain formatting, images, etc.
+        estimated_tokens = file_size // 4
+        print(f"📏 File size: {file_size:,} bytes")
+        print(f"📊 Estimated tokens: {estimated_tokens:,}")
+        if estimated_tokens >= 4096:
+            print("✅ Likely meets caching requirements")
+        else:
+            print("❌ Likely too small for caching")
+    except Exception as e:
+        print(f"❌ Error estimating tokens: {e}")
+def main():
+    """Main function with interactive menu"""
+    print("🎯 Gemini API Document Token Checker")
+    print("=" * 60)
+    # Check if API key is set
+    if not os.getenv('GOOGLE_API_KEY'):
+        print("❌ Error: GOOGLE_API_KEY not found in environment variables")
+        print("Please set your API key in the .env file")
+        return
+    while True:
+        print("\n📋 Options:")
+        print("1. Check local PDF file")
+        print("2. Check PDF from URL")
+        print("3. Estimate tokens from file size")
+        print("4. Exit")
+        choice = input("\nEnter your choice (1-4): ").strip()
+        if choice == '1':
+            file_path = input("Enter the path to your PDF file: ").strip()
+            if os.path.exists(file_path):
+                check_document_tokens(file_path=file_path)
+            else:
+                print("❌ File not found!")
+        elif choice == '2':
+            url = input("Enter the URL to your PDF: ").strip()
+            check_document_tokens(url=url)
+        elif choice == '3':
+            file_path = input("Enter the path to your PDF file: ").strip()
+            if os.path.exists(file_path):
+                estimate_tokens_from_file_size(file_path)
+            else:
+                print("❌ File not found!")
+        elif choice == '4':
+            print("👋 Goodbye!")
+            break
+        else:
+            print("❌ Invalid choice. Please enter 1-4.")
+# Start the interactive menu only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()