# NOTE(review): removed copy/paste viewer artifacts ("Spaces:" / "Runtime error")
# that were not part of the original script.
| """ | |
| Nutrition Dataset - Download & Process | |
| Downloads and processes dietary recommendation data into ChromaDB | |
| Dataset: issai/LLM_for_Dietary_Recommendation_System (50 patient profiles) | |
| """ | |
import os
import sys

import chromadb
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
def download_nutrition():
    """Download the Dietary Recommendation dataset from HuggingFace.

    Saves the 'train' split as a CSV under data_mining/datasets/ and
    prints basic stats (record count, file size).

    Returns:
        bool: True on success, False if the download or save failed.
    """
    print("[DOWNLOAD] Downloading Dietary Recommendation dataset...")
    print("   Source: issai/LLM_for_Dietary_Recommendation_System")
    try:
        dataset = load_dataset("issai/LLM_for_Dietary_Recommendation_System")
        os.makedirs("data_mining/datasets", exist_ok=True)
        df = dataset['train'].to_pandas()
        output_path = "data_mining/datasets/nutrition_diet.csv"
        df.to_csv(output_path, index=False)
        file_size = os.path.getsize(output_path) / (1024 * 1024)  # bytes -> MB
        print(f"[OK] Downloaded: {output_path}")
        print(f"   Records: {len(df)}")
        print(f"   File size: {file_size:.2f} MB")
        return True
    except Exception as e:
        # Broad catch is deliberate: any network/HF-hub failure should be
        # reported via the boolean return, not crash the whole pipeline.
        print(f"[ERROR] Download failed: {e}")
        return False
def _select_text_columns(df):
    """Return the preferred text columns present in *df*, else all columns."""
    preferred = ['profile', 'recommendation', 'diet_plan', 'text', 'content']
    cols = [c for c in preferred if c in df.columns]
    return cols if cols else df.columns.tolist()


def _build_text(row, text_columns):
    """Join non-empty column values of *row* into one document string.

    Values that stringify to 'nan' or are 5 characters or shorter are
    treated as empty and skipped.
    """
    parts = []
    for col in text_columns:
        value = str(row[col])
        if value and value != 'nan' and len(value) > 5:
            parts.append(f"{col}: {value}")
    return "\n".join(parts)


def process_nutrition():
    """Process the nutrition CSV and build a persistent ChromaDB index.

    Reads data_mining/datasets/nutrition_diet.csv, embeds each record with
    a Vietnamese sentence-transformer, and stores the vectors in the
    'nutrition' collection under data_mining/output/nutrition_chroma.

    Returns:
        bool: True on success, False if the CSV has not been downloaded.
    """
    print("\n[PROCESS] Processing Nutrition dataset...")
    csv_path = "data_mining/datasets/nutrition_diet.csv"
    if not os.path.exists(csv_path):
        print(f"[ERROR] Dataset not found: {csv_path}")
        return False
    df = pd.read_csv(csv_path)
    print(f"   Loaded {len(df)} records")

    print("[MODEL] Loading embedding model...")
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

    print("[DB] Initializing ChromaDB...")
    os.makedirs("data_mining/output", exist_ok=True)
    client = chromadb.PersistentClient(path="data_mining/output/nutrition_chroma")
    collection = client.get_or_create_collection(
        name="nutrition",
        # Cosine distance is the conventional metric for sentence embeddings.
        metadata={"hnsw:space": "cosine"}
    )

    print("[PROCESS] Processing nutrition data...")
    text_columns = _select_text_columns(df)
    print(f"   Using columns: {text_columns}")

    processed = 0
    for _, row in df.iterrows():
        text = _build_text(row, text_columns)
        # Skip records with no meaningful content.
        if len(text) < 20:
            continue
        embedding = embedder.encode(text)
        collection.add(
            ids=[f"nutrition_{processed:05d}"],
            embeddings=[embedding.tolist()],
            documents=[text],
            metadatas=[{
                'domain': 'nutrition',
                'agent': 'NutritionAgent',
                'source': 'LLM_Dietary_Recommendation',
                'index': processed
            }]
        )
        processed += 1
        if (processed % 10) == 0:
            print(f"   Processed {processed}/{len(df)} records...")

    print(f"[OK] Processed {processed} nutrition records")
    print("[DB] Database saved to: data_mining/output/nutrition_chroma/")
    db_path = "data_mining/output/nutrition_chroma"
    total_size = 0
    # Sum every file under the DB directory to report the on-disk footprint.
    for dirpath, _dirnames, filenames in os.walk(db_path):
        for filename in filenames:
            total_size += os.path.getsize(os.path.join(dirpath, filename))
    print(f"   Database size: {total_size / (1024 * 1024):.2f} MB")
    return True
def main():
    """Download the nutrition dataset, then build its ChromaDB index.

    Returns:
        bool: True only if both the download and processing steps succeed.
    """
    banner = "=" * 60
    print(banner)
    print("Nutrition Dataset - Download & Process")
    print(banner)
    if not download_nutrition():
        return False
    if not process_nutrition():
        return False
    print("\n" + banner)
    print("[OK] Nutrition dataset ready!")
    print(banner)
    return True
| if __name__ == "__main__": | |
| success = main() | |
| exit(0 if success else 1) | |