# NOTE(review): the lines that stood here were web-scrape residue from the
# HuggingFace Spaces page (site chrome, runtime-error banner, line-number
# gutter, file-size note) — not part of the Python source.
"""
Nutrition Dataset - Download & Process
Downloads and processes dietary recommendation data into ChromaDB
Dataset: issai/LLM_for_Dietary_Recommendation_System (50 patient profiles)
"""
from datasets import load_dataset
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
import os
def download_nutrition():
    """Download the Dietary Recommendation dataset from HuggingFace.

    Fetches issai/LLM_for_Dietary_Recommendation_System and saves its
    'train' split as CSV under data_mining/datasets/.

    Returns:
        bool: True on success, False if the download or save failed.
    """
    print("Downloading Dietary Recommendation dataset...")
    print("   Source: issai/LLM_for_Dietary_Recommendation_System")
    try:
        dataset = load_dataset("issai/LLM_for_Dietary_Recommendation_System")
        os.makedirs("data_mining/datasets", exist_ok=True)
        df = dataset['train'].to_pandas()
        output_path = "data_mining/datasets/nutrition_diet.csv"
        df.to_csv(output_path, index=False)
        file_size = os.path.getsize(output_path) / (1024 * 1024)  # bytes -> MB
        print(f"Downloaded: {output_path}")
        print(f"Records: {len(df)}")
        print(f"File size: {file_size:.2f} MB")
        return True
    except Exception as e:
        # Broad catch is deliberate: any network/IO failure is reported and
        # signalled via the return value rather than crashing the script.
        print(f"Download failed: {e}")
        return False
def process_nutrition():
    """Process the Nutrition CSV and build a persistent ChromaDB store.

    Reads data_mining/datasets/nutrition_diet.csv, concatenates the text
    columns of each record into one document, embeds it with a Vietnamese
    sentence-transformer, and stores vectors in the "nutrition" collection.

    Returns:
        bool: True on success, False if the source CSV is missing.
    """
    print("\nProcessing Nutrition dataset...")
    csv_path = "data_mining/datasets/nutrition_diet.csv"
    if not os.path.exists(csv_path):
        print(f"Dataset not found: {csv_path}")
        return False
    df = pd.read_csv(csv_path)
    print(f"Loaded {len(df)} records")

    print("Loading embedding model...")
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

    print("Initializing ChromaDB...")
    os.makedirs("data_mining/output", exist_ok=True)
    db_path = "data_mining/output/nutrition_chroma"
    client = chromadb.PersistentClient(path=db_path)
    collection = client.get_or_create_collection(
        name="nutrition",
        metadata={"hnsw:space": "cosine"}  # cosine distance suits sentence embeddings
    )

    print("Processing nutrition data...")
    # Prefer the known text-bearing columns; fall back to all columns.
    text_columns = [col for col in ('profile', 'recommendation', 'diet_plan', 'text', 'content')
                    if col in df.columns]
    if not text_columns:
        text_columns = df.columns.tolist()
    print(f"   Using columns: {text_columns}")

    processed = 0
    for _, row in df.iterrows():
        # Build a "col: value" document, skipping NaN and trivially short values.
        text_parts = []
        for col in text_columns:
            value = str(row[col])
            if value != 'nan' and len(value) > 5:
                text_parts.append(f"{col}: {value}")
        text = "\n".join(text_parts)
        if len(text) < 20:
            continue  # too little content to make a useful document
        embedding = embedder.encode(text)
        collection.add(
            ids=[f"nutrition_{processed:05d}"],
            embeddings=[embedding.tolist()],
            documents=[text],
            metadatas=[{
                'domain': 'nutrition',
                'agent': 'NutritionAgent',
                'source': 'LLM_Dietary_Recommendation',
                'index': processed
            }]
        )
        processed += 1
        if processed % 10 == 0:
            print(f"   Processed {processed}/{len(df)} records...")

    print(f"Processed {processed} nutrition records")
    print("Database saved to: data_mining/output/nutrition_chroma/")
    # Report the on-disk footprint of the persisted database.
    total_size = sum(
        os.path.getsize(os.path.join(dirpath, fname))
        for dirpath, _, filenames in os.walk(db_path)
        for fname in filenames
    )
    print(f"Database size: {total_size / (1024 * 1024):.2f} MB")
    return True
def main():
    """Run the download then the processing step.

    Returns:
        bool: True only if both stages succeed; stops at the first failure.
    """
    print("=" * 60)
    print("Nutrition Dataset - Download & Process")
    print("=" * 60)
    if not download_nutrition():
        return False
    if not process_nutrition():
        return False
    print("\n" + "=" * 60)
    print("Nutrition dataset ready!")
    print("=" * 60)
    return True
if __name__ == "__main__":
    # Exit code 0 on success, 1 on failure, so shells/CI can chain on it.
    # raise SystemExit instead of exit(): exit() is a site-module convenience
    # and is not guaranteed to exist (e.g. under python -S).
    success = main()
    raise SystemExit(0 if success else 1)