from pydantic import BaseModel, Field, ConfigDict
from typing import List, Dict, Optional
from enum import Enum


class MetricType(str, Enum):
    """Evaluation metrics supported by the judge."""

    ACCURACY = "accuracy"
    FAITHFULNESS = "faithfulness"
    RELEVANCE = "relevance"
    TOXICITY = "toxicity"
    CONTEXT_PRECISION = "context_precision"
    CONTEXT_RECALL = "context_recall"


class APIProvider(str, Enum):
    """API providers that can host the judge model."""

    GROQ = "groq"
    OPENAI = "openai"


class EvaluationRequest(BaseModel):
    """Input payload: questions, reference answers, and the metrics to score them with."""

    # Allow field names starting with "model_" (e.g. model_responses).
    model_config = ConfigDict(protected_namespaces=())

    questions: List[str] = Field(..., description="Questions to evaluate")
    ground_truths: List[str] = Field(..., description="Ground truth answers")
    model_responses: Optional[List[str]] = Field(None, description="Model responses")
    contexts: Optional[List[str]] = Field(None, description="Contexts for evaluation")
    metrics: List[MetricType] = Field(
        default_factory=lambda: [
            MetricType.ACCURACY,
            MetricType.FAITHFULNESS,
            MetricType.RELEVANCE,
        ],
        description="Metrics to compute for each question",
    )
    judge_model: str = Field(default="openai/gpt-oss-20b")
    max_concurrent: int = Field(default=5, description="Max concurrent evaluations")
    api_provider: APIProvider = Field(default=APIProvider.GROQ, description="API provider for evaluation")


class EvaluationResult(BaseModel):
    """Scores and explanations for a single evaluated question."""

    model_config = ConfigDict(protected_namespaces=())

    question: str
    ground_truth: str
    model_response: str
    metrics: Dict[MetricType, float]
    explanations: Dict[MetricType, str]
    processing_time: float
    overall_score: float = Field(..., description="Overall weighted score (0-100)")


class EvaluationSummary(BaseModel):
    """Aggregate results across all evaluated questions."""

    model_config = ConfigDict(protected_namespaces=())

    total_questions: int
    average_scores: Dict[MetricType, float]
    individual_results: List[EvaluationResult]
    total_processing_time: float
    model_used: str
    api_provider: str
    overall_score: float = Field(..., description="Overall weighted score across all questions")
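

# A minimal usage sketch: build an EvaluationRequest, then shape a per-question
# EvaluationResult and the EvaluationSummary an evaluation run would produce.
# The question/answer strings and scores below are invented for illustration
# and are not part of any real dataset or evaluation output.
if __name__ == "__main__":
    request = EvaluationRequest(
        questions=["What is the capital of France?"],
        ground_truths=["Paris"],
        model_responses=["Paris is the capital of France."],
        metrics=[MetricType.ACCURACY, MetricType.FAITHFULNESS],
    )

    result = EvaluationResult(
        question=request.questions[0],
        ground_truth=request.ground_truths[0],
        model_response=request.model_responses[0],
        metrics={MetricType.ACCURACY: 1.0, MetricType.FAITHFULNESS: 0.9},
        explanations={
            MetricType.ACCURACY: "Matches the ground truth.",
            MetricType.FAITHFULNESS: "Consistent with the provided response.",
        },
        processing_time=0.42,
        overall_score=95.0,
    )

    summary = EvaluationSummary(
        total_questions=1,
        average_scores=result.metrics,
        individual_results=[result],
        total_processing_time=result.processing_time,
        model_used=request.judge_model,
        api_provider=request.api_provider.value,
        overall_score=result.overall_score,
    )

    print(summary.model_dump_json(indent=2))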