Spaces:

MGZON
/

api-mg

Runtime error

App Files Community

MGZON committed on Aug 26

Commit

10775fb

verified ·

1 Parent(s): 0b445a6

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -14

app.py CHANGED Viewed

@@ -43,15 +43,8 @@ async def root():
 @app.get("/health")
 async def health_check():
     logger.info(f"Health check endpoint called at {time.time()}")
-    if not models_loaded:
-        logger.info("Returning 'loading' status")
-        return JSONResponse(
-            content={"status": "loading", "message": "Models are still loading"},
-            headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
-        )
-    logger.info("Returning 'healthy' status")
     return JSONResponse(
-        content={"status": "healthy"},
         headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
     )
@@ -88,9 +81,9 @@ async def load_models():
         logger.info(f"Loading Mistral model from {gguf_path}")
         mistral = Llama(
             model_path=gguf_path,
-            n_ctx=1024,  # قللنا n_ctx عشان نقلل استهلاك الذاكرة
-            n_threads=1,  # قللنا الـ threads إلى 1 عشان نقلل الحمل
-            n_batch=256,  # قللنا n_batch عشان نقلل الحمل
             verbose=True
         )
         logger.info(f"Successfully loaded Mistral model from {gguf_path} in {time.time() - start_time} seconds")
@@ -154,7 +147,7 @@ if __name__ == "__main__":
         port=8080,
         log_level="info",
         workers=1,
-        timeout_keep_alive=30,
-        limit_concurrency=10,  # تقليل الاتصالات المتزامنة
-        limit_max_requests=100  # تقليل عدد الطلبات القصوى
     )

 @app.get("/health")
 async def health_check():
     logger.info(f"Health check endpoint called at {time.time()}")
     return JSONResponse(
+        content={"status": "healthy" if models_loaded else "loading"},
         headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
     )
         logger.info(f"Loading Mistral model from {gguf_path}")
         mistral = Llama(
             model_path=gguf_path,
+            n_ctx=512,  # قللنا n_ctx أكتر عشان نقلل الذاكرة
+            n_threads=1,  # thread واحد بس
+            n_batch=128,  # قللنا n_batch أكتر
             verbose=True
         )
         logger.info(f"Successfully loaded Mistral model from {gguf_path} in {time.time() - start_time} seconds")
         port=8080,
         log_level="info",
         workers=1,
+        timeout_keep_alive=15,  # تقليل وقت keep-alive
+        limit_concurrency=5,   # تقليل الاتصالات المتزامنة أكتر
+        limit_max_requests=50  # تقليل عدد الطلبات القصوى أكتر
     )