File size: 2,352 Bytes
7602502
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""Manages local data storage and metadata tracking."""

from __future__ import annotations

import json
from dataclasses import dataclass

from loguru import logger

from tools.config import get_settings
from tools.models import StandardSetResponse

# Application settings, obtained when this module is imported.
settings = get_settings()

# Data directories (from config). These names are bound once at import time,
# so they keep whatever values the settings object held at that moment.
RAW_DATA_DIR = settings.raw_data_dir
# Scanned by list_downloaded_standard_sets() for per-set subdirectories.
STANDARD_SETS_DIR = settings.standard_sets_dir
PROCESSED_DATA_DIR = settings.processed_data_dir


@dataclass
class StandardSetInfo:
    """Information about a downloaded standard set with processing status.

    Instances are built by ``list_downloaded_standard_sets`` from the
    ``data.json`` file found in each standard set's directory.
    """

    # Identifier of the standard set (the ``id`` field of the parsed set).
    set_id: str
    title: str
    subject: str
    education_levels: list[str]
    # Title of the set's jurisdiction, not the full jurisdiction record.
    jurisdiction: str
    # Publication status of the source document; the listing function stores
    # "Unknown" when the parsed value is missing or empty.
    publication_status: str
    # Copied from the document's ``valid`` field.
    # NOTE(review): presumably the year the document became valid — confirm.
    valid_year: str
    # Whether the set has been processed. The listing function currently
    # always sets this to False (the real check is still a TODO there).
    processed: bool


def list_downloaded_standard_sets() -> list[StandardSetInfo]:
    """
    List all downloaded standard sets from the standardSets directory.

    Every immediate subdirectory of ``STANDARD_SETS_DIR`` that contains a
    ``data.json`` file is parsed as a ``StandardSetResponse``. Entries that
    cannot be read or parsed are logged as warnings and skipped, so a single
    bad file never aborts the whole listing.

    Returns:
        List of StandardSetInfo with standard set info and processing status,
        ordered by directory path. Empty when the directory does not exist.
    """
    if not STANDARD_SETS_DIR.exists():
        return []

    datasets: list[StandardSetInfo] = []
    # Path.iterdir() yields entries in arbitrary, filesystem-dependent order;
    # sort so the listing is stable across runs and platforms.
    for set_dir in sorted(STANDARD_SETS_DIR.iterdir()):
        if not set_dir.is_dir():
            continue

        data_file = set_dir / "data.json"
        if not data_file.exists():
            continue

        try:
            with open(data_file, encoding="utf-8") as f:
                raw_data = json.load(f)

            # Parse the API response wrapper
            response = StandardSetResponse(**raw_data)
            standard_set = response.data

            # Build the dataset info
            dataset_info = StandardSetInfo(
                set_id=standard_set.id,
                title=standard_set.title,
                subject=standard_set.subject,
                education_levels=standard_set.educationLevels,
                jurisdiction=standard_set.jurisdiction.title,
                publication_status=standard_set.document.publicationStatus or "Unknown",
                valid_year=standard_set.document.valid,
                processed=False,  # TODO: Check against processed directory
            )
        except Exception as e:
            # Deliberately broad: this is a best-effort scan. Malformed JSON,
            # I/O errors, a non-mapping payload passed to ``**``, and whatever
            # StandardSetResponse raises on bad data are all reported the same
            # way, and the offending directory is skipped.
            logger.warning(f"Failed to read {data_file}: {e}")
        else:
            datasets.append(dataset_info)

    logger.debug(f"Found {len(datasets)} downloaded standard sets")
    return datasets