MGZON committed on
Commit
10775fb
·
verified ·
1 Parent(s): 0b445a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -14
app.py CHANGED
@@ -43,15 +43,8 @@ async def root():
43
  @app.get("/health")
44
  async def health_check():
45
  logger.info(f"Health check endpoint called at {time.time()}")
46
- if not models_loaded:
47
- logger.info("Returning 'loading' status")
48
- return JSONResponse(
49
- content={"status": "loading", "message": "Models are still loading"},
50
- headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
51
- )
52
- logger.info("Returning 'healthy' status")
53
  return JSONResponse(
54
- content={"status": "healthy"},
55
  headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
56
  )
57
 
@@ -88,9 +81,9 @@ async def load_models():
88
  logger.info(f"Loading Mistral model from {gguf_path}")
89
  mistral = Llama(
90
  model_path=gguf_path,
91
- n_ctx=1024, # قللنا n_ctx عشان نقلل استهلاك الذاكرة
92
- n_threads=1, # قللنا الـ threads إلى 1 عشان نقلل الحمل
93
- n_batch=256, # قللنا n_batch عشان نقلل الحمل
94
  verbose=True
95
  )
96
  logger.info(f"Successfully loaded Mistral model from {gguf_path} in {time.time() - start_time} seconds")
@@ -154,7 +147,7 @@ if __name__ == "__main__":
154
  port=8080,
155
  log_level="info",
156
  workers=1,
157
- timeout_keep_alive=30,
158
- limit_concurrency=10, # تقليل الاتصالات المتزامنة
159
- limit_max_requests=100 # تقليل عدد الطلبات القصوى
160
  )
 
43
  @app.get("/health")
44
  async def health_check():
45
  logger.info(f"Health check endpoint called at {time.time()}")
 
 
 
 
 
 
 
46
  return JSONResponse(
47
+ content={"status": "healthy" if models_loaded else "loading"},
48
  headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
49
  )
50
 
 
81
  logger.info(f"Loading Mistral model from {gguf_path}")
82
  mistral = Llama(
83
  model_path=gguf_path,
84
+ n_ctx=512, # قللنا n_ctx أكتر عشان نقلل الذاكرة
85
+ n_threads=1, # thread واحد بس
86
+ n_batch=128, # قللنا n_batch أكتر
87
  verbose=True
88
  )
89
  logger.info(f"Successfully loaded Mistral model from {gguf_path} in {time.time() - start_time} seconds")
 
147
  port=8080,
148
  log_level="info",
149
  workers=1,
150
+ timeout_keep_alive=15, # تقليل وقت keep-alive
151
+ limit_concurrency=5, # تقليل الاتصالات المتزامنة أكتر
152
+ limit_max_requests=50 # تقليل عدد الطلبات القصوى أكتر
153
  )