|
|
"""Pydantic models for Pinecone-processed standard records.""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
from typing import Any |
|
|
|
|
|
from pydantic import BaseModel, ConfigDict, Field, field_validator |
|
|
|
|
|
|
|
|
class PineconeRecord(BaseModel):
    """A single standard record ready for Pinecone upsert.

    ``education_levels`` is normalized before field validation runs:
    comma-separated entries are split, surrounding whitespace is stripped,
    empty pieces are discarded and duplicates are removed while keeping
    first-seen order (see ``process_education_levels``).
    """

    model_config = ConfigDict(
        # NOTE(review): this encoder maps None to None, which looks like a
        # no-op, and `json_encoders` is documented as deprecated in
        # Pydantic v2 -- confirm whether it can be dropped.
        json_encoders={
            type(None): lambda v: None,
        },
        # Accept the field name (`id`) as well as its alias (`_id`) on input.
        populate_by_name=True,
    )

    # Record identifier: read from "_id" and written back as "_id" when the
    # model is serialized by alias.
    id: str = Field(alias="_id", serialization_alias="_id")

    # Text content of the record.
    content: str

    # Standard set, document and jurisdiction fields.
    standard_set_id: str
    standard_set_title: str
    subject: str
    normalized_subject: str | None = None
    education_levels: list[str]
    document_id: str | None = None
    document_valid: str | None = None
    publication_status: str | None = None
    jurisdiction_id: str
    jurisdiction_title: str

    # Statement-level fields.
    asn_identifier: str | None = None
    statement_notation: str | None = None
    statement_label: str | None = None
    depth: int
    is_leaf: bool
    is_root: bool

    # Hierarchy links to related records.
    parent_id: str | None = None
    root_id: str
    ancestor_ids: list[str]
    child_ids: list[str]
    sibling_count: int

    @field_validator("education_levels", mode="before")
    @classmethod
    def process_education_levels(cls, v: Any) -> list[str]:
        """Split comma-separated grade levels, flatten and deduplicate.

        Handles source data that packs several values into one element
        (e.g. ``["01,02"]`` instead of ``["01", "02"]``). A bare string such
        as ``"01,02"`` is treated as a one-element list, and tuples are
        accepted like lists, so that such input is not silently discarded.

        Args:
            v: Raw input value. Expected to be a list (or tuple) of strings,
                each possibly comma-separated, or a single such string.

        Returns:
            Grade level strings with whitespace stripped, empty pieces
            removed and duplicates dropped, in first-seen order. Non-string
            items are ignored; any other input type yields an empty list.
        """
        if isinstance(v, str):
            # A bare "01,02" carries the same information as ["01,02"].
            v = [v]
        elif not isinstance(v, (list, tuple)):
            return []

        pieces = (
            piece.strip()
            for item in v
            if isinstance(item, str)
            for piece in item.split(",")
        )
        # dict.fromkeys removes duplicates while preserving insertion order.
        return list(dict.fromkeys(piece for piece in pieces if piece))
|
|
|
|
|
|
|
|
class ProcessedStandardSet(BaseModel):
    """Wrapper model holding the processed records of one standard set.

    Each entry in ``records`` is a ``PineconeRecord`` that is ready to be
    sent to Pinecone.
    """

    # Processed records, one PineconeRecord per entry.
    records: list[PineconeRecord]
|
|
|