import logging
import os

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
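
# This service exposes a single streaming chat endpoint backed by a quantized
# GGUF build of Qwen2.5-3B-Instruct, served locally with llama-cpp-python.
# The model file is fetched and loaded at import time, so the first start of
# the process can take a while.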

MODEL_REPO = "Qwen/Qwen2.5-3B-Instruct-GGUF"
MODEL_FILE = "qwen2.5-3b-instruct-q2_k.gguf"

logger.info(f"Downloading {MODEL_FILE} from the Hugging Face Hub...")
try:
    model_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=MODEL_FILE
    )
    logger.info(f"Model downloaded successfully to {model_path}.")
except Exception as e:
    logger.error(f"Failed to download the model: {e}")
    raise

logger.info("Loading the GGUF model...")
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=4096,       # context window size in tokens
        n_gpu_layers=0,   # 0 = run entirely on the CPU
        verbose=True
    )
    logger.info("Model loaded successfully.")
except Exception as e:
    logger.error(f"Error while loading the model: {e}")
    raise

app = FastAPI(
    title="Qwen 2.5 API (Streaming - No RAG)",
    description="Streams responses from the Qwen 2.5 model only."
)

origins = [
    "https://deede.tr",
    "http://deede.tr",
    "*"
]
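
# NOTE: the "*" entry allows every origin and makes the explicit deede.tr entries
# above redundant; browsers also refuse a wildcard origin when credentials are
# included, so listing only the real origins is the safer configuration.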

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class ChatRequest(BaseModel):
    prompt: str


async def stream_generator(messages):
    """
    Streams only the tokens produced by the LLM.
    """
    try:
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=1024,
            temperature=0.7,
            stream=True
        )

        for chunk in stream:
            # Some chunks (e.g. the initial role-only delta) carry no content.
            content = chunk['choices'][0]['delta'].get('content', None)
            if content:
                yield content

    except Exception as e:
        logger.error(f"LLM stream error: {e}")
        yield f" [LLM STREAM ERROR: {e}]"
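
# NOTE: stream_generator iterates llama-cpp-python's synchronous stream inside an
# async generator, so the event loop is blocked while each chunk is computed. This
# is fine for a single-user demo; under concurrent load one option (not part of the
# original code) would be to run the generation in a worker thread.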


@app.get("/")
def read_root():
    # Simple health-check endpoint.
    return {"status": "Streaming API is running (No RAG)", "model_repo": MODEL_REPO}


@app.post("/api/chat")
async def chat(request: ChatRequest):
    user_prompt = request.prompt
    logger.info(f"Incoming prompt: {user_prompt}")

    messages = [
        {
            "role": "system",
            "content": (
                "You are Qwen, an AI assistant created by Alibaba Cloud. "
                "Answer the questions you are asked in a helpful way."
            )
        },
        {
            "role": "user",
            "content": user_prompt
        }
    ]

    return StreamingResponse(
        stream_generator(messages),
        media_type="text/plain"
    )
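
# NOTE: each request sends only the system prompt plus the latest user prompt, so
# the API is stateless and keeps no chat history; any multi-turn memory would have
# to be managed by the client.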


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
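
# Example of consuming the streaming endpoint from Python (an illustrative sketch,
# assuming the server is reachable on localhost:7860 and `requests` is installed):
#
#   import requests
#
#   with requests.post(
#       "http://localhost:7860/api/chat",
#       json={"prompt": "Hello, who are you?"},
#       stream=True,
#   ) as resp:
#       for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
#           print(chunk, end="", flush=True)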