"""
Nutrition Dataset - Download & Process
Downloads and processes dietary recommendation data into ChromaDB
Dataset: issai/LLM_for_Dietary_Recommendation_System (50 patient profiles)
"""

from datasets import load_dataset
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
import os

def download_nutrition():
    """Fetch the Dietary Recommendation dataset from HuggingFace and export it as CSV.

    Saves the 'train' split to data_mining/datasets/nutrition_diet.csv.

    Returns:
        bool: True when download and CSV export succeed, False on any error.
    """
    print("πŸ“₯ Downloading Dietary Recommendation dataset...")
    print("   Source: issai/LLM_for_Dietary_Recommendation_System")

    try:
        raw = load_dataset("issai/LLM_for_Dietary_Recommendation_System")

        os.makedirs("data_mining/datasets", exist_ok=True)

        frame = raw['train'].to_pandas()
        output_path = "data_mining/datasets/nutrition_diet.csv"
        frame.to_csv(output_path, index=False)

        size_mb = os.path.getsize(output_path) / (1024 * 1024)

        print(f"βœ… Downloaded: {output_path}")
        print(f"πŸ“Š Records: {len(frame)}")
        print(f"πŸ“Š File size: {size_mb:.2f} MB")
        return True

    except Exception as e:
        # Best-effort: report the failure and let the caller abort cleanly.
        print(f"❌ Download failed: {e}")
        return False

def process_nutrition():
    """Process the nutrition CSV into a persistent ChromaDB collection.

    Reads data_mining/datasets/nutrition_diet.csv, builds one text document per
    row from the available text columns, embeds all documents in a single batch
    with the Vietnamese SBERT model, and inserts them into the 'nutrition'
    collection in one call.

    Returns:
        bool: True on success, False if the source CSV is missing.
    """
    print("\nπŸ”¨ Processing Nutrition dataset...")

    csv_path = "data_mining/datasets/nutrition_diet.csv"
    if not os.path.exists(csv_path):
        print(f"❌ Dataset not found: {csv_path}")
        return False

    df = pd.read_csv(csv_path)
    print(f"πŸ“Š Loaded {len(df)} records")

    print("πŸ€– Loading embedding model...")
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

    print("πŸ’Ύ Initializing ChromaDB...")
    os.makedirs("data_mining/output", exist_ok=True)
    client = chromadb.PersistentClient(path="data_mining/output/nutrition_chroma")

    collection = client.get_or_create_collection(
        name="nutrition",
        metadata={"hnsw:space": "cosine"}
    )

    print("πŸ“ Processing nutrition data...")

    # Prefer known text-bearing columns; fall back to every column.
    text_columns = [col for col in ['profile', 'recommendation', 'diet_plan', 'text', 'content']
                    if col in df.columns]
    if not text_columns:
        text_columns = df.columns.tolist()

    print(f"   Using columns: {text_columns}")

    # Build all documents first so the embeddings can be computed in one batch.
    documents = []
    for _, row in df.iterrows():
        text_parts = []
        for col in text_columns:
            raw = row[col]
            # pd.isna catches genuine NaN/None; the string guard below keeps
            # the original behavior for literal 'nan' text and trivial values.
            if pd.isna(raw):
                continue
            value = str(raw)
            if value and value != 'nan' and len(value) > 5:
                text_parts.append(f"{col}: {value}")

        text = "\n".join(text_parts)
        # Skip rows whose combined text is too short to be a useful document.
        if len(text) >= 20:
            documents.append(text)

    processed = len(documents)

    if documents:
        # Batch-encode: one model call instead of one per record.
        embeddings = embedder.encode(documents)

        # Batch-insert: one DB call instead of N round trips; ids and
        # metadata are identical to the previous per-row inserts.
        collection.add(
            ids=[f"nutrition_{i:05d}" for i in range(processed)],
            embeddings=[emb.tolist() for emb in embeddings],
            documents=documents,
            metadatas=[{
                'domain': 'nutrition',
                'agent': 'NutritionAgent',
                'source': 'LLM_Dietary_Recommendation',
                'index': i
            } for i in range(processed)]
        )

    print(f"βœ… Processed {processed} nutrition records")
    print(f"πŸ’Ύ Database saved to: data_mining/output/nutrition_chroma/")

    # Report the on-disk footprint of the persisted database.
    db_path = "data_mining/output/nutrition_chroma"
    total_size = sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _, filenames in os.walk(db_path)
        for filename in filenames
    )

    print(f"πŸ“Š Database size: {total_size / (1024 * 1024):.2f} MB")

    return True

def main():
    """Entry point: download the dataset, then build the vector database.

    Returns:
        bool: True when both steps succeed, False as soon as one fails.
    """
    banner = "=" * 60
    print(banner)
    print("Nutrition Dataset - Download & Process")
    print(banner)

    # Run the pipeline stages in order; stop at the first failure.
    for step in (download_nutrition, process_nutrition):
        if not step():
            return False

    print("\n" + banner)
    print("βœ… Nutrition dataset ready!")
    print(banner)
    return True

if __name__ == "__main__":
    # Exit with 0 on success, 1 on failure (SystemExit is what exit() raises).
    raise SystemExit(0 if main() else 1)