# [Hosting-page status banner captured with the script; not part of the program]
# Spaces: Runtime error / Runtime error
# Setup RAG system - One command to rule them all
# Usage: bash scripts/setup_rag.sh
#
# Cleans stale artifacts, checks for Python and pip, installs dependencies,
# runs the data_mining/* scripts and moves their output under
# rag/vector_store/.

set -e # Exit on error

# Colors (ANSI escape sequences; they are interpreted by `echo -e` when printed)
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Banner
echo -e "${BLUE}"
echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
echo "β π₯ HeoCare RAG System Setup (HuggingFace) β"
echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
echo -e "${NC}"
# 0. Cleanup old files and databases
echo -e "${BLUE}π§Ή Cleaning up old files and databases...${NC}"

#######################################
# Remove old PDF/MD files from data_mining (if any).
# Deletes every regular file below data_mining/ named *.pdf, and every
# regular file named *.md except README.md. Prints progress only when at
# least one such file exists.
# Globals:   YELLOW, GREEN, NC (read)
# Arguments: none
# Returns:   0 always (deletion is best effort)
#######################################
cleanup_old_documents() {
  # One predicate for both the probe and the delete, so the message is only
  # printed when there is really something to remove. `-type f` keeps a
  # directory that happens to be called "*.md" from triggering the message.
  local -a stale=(
    -type f '(' -name '*.pdf' -o '(' -name '*.md' '!' -name 'README.md' ')' ')'
  )

  if find data_mining "${stale[@]}" 2>/dev/null | grep -q .; then
    echo -e "${YELLOW} Removing old PDF/MD files...${NC}"
    # Best effort: a file that cannot be deleted must not abort the setup.
    find data_mining "${stale[@]}" -delete 2>/dev/null || true
    echo -e "${GREEN} β Old documents removed${NC}"
  fi
}

cleanup_old_documents
#######################################
# Clear temporary datasets and output folders.
# Removes data_mining/datasets and data_mining/output when either exists.
# Globals:   YELLOW, GREEN, NC (read)
# Arguments: none
# Returns:   0 always (removal is best effort)
#######################################
clear_temp_folders() {
  if [ -d "data_mining/datasets" ] || [ -d "data_mining/output" ]; then
    echo -e "${YELLOW} Clearing temporary folders...${NC}"
    # Best effort: a removal failure is deliberately ignored.
    rm -rf data_mining/datasets data_mining/output 2>/dev/null || true
    echo -e "${GREEN} β Temporary folders cleared${NC}"
  fi
}

clear_temp_folders
#######################################
# Clear old vector stores (will be regenerated).
# Empties rag/vector_store/ but keeps the directory itself. The `*` glob
# does not match names starting with a dot, so hidden entries are left alone.
# NOTE(review): this runs on every invocation, so the later
# "rag/vector_store/<name> already exists, skipping" checks can presumably
# never fire - confirm whether a full rebuild on each run is intended.
# Globals:   YELLOW, GREEN, NC (read)
# Arguments: none
# Returns:   0 always (removal is best effort)
#######################################
clear_vector_stores() {
  if [ -d "rag/vector_store" ]; then
    echo -e "${YELLOW} Clearing old vector stores...${NC}"
    rm -rf rag/vector_store/* 2>/dev/null || true
    echo -e "${GREEN} β Old vector stores cleared${NC}"
  fi
}

clear_vector_stores
#######################################
# Clear Python cache.
# Removes every __pycache__ directory and every *.pyc file below the current
# directory. Prints progress only when something of that kind exists.
# Globals:   YELLOW, GREEN, NC (read)
# Arguments: none
# Returns:   0 always (removal is best effort)
#######################################
clear_python_cache() {
  # The probe looks for cache directories and stray *.pyc files alike, so
  # *.pyc files are also cleaned when no __pycache__ directory is present.
  if find . '(' -type d -name "__pycache__" -prune -o -type f -name "*.pyc" ')' \
    -print 2>/dev/null | grep -q .; then
    echo -e "${YELLOW} Clearing Python cache...${NC}"
    # -prune keeps find from descending into a directory that rm is about to
    # delete, which would otherwise produce "No such file" errors.
    find . -type d -name "__pycache__" -prune -exec rm -rf {} + 2>/dev/null || true
    find . -type f -name "*.pyc" -delete 2>/dev/null || true
    echo -e "${GREEN} β Python cache cleared${NC}"
  fi
}

clear_python_cache
echo -e "${GREEN}β Cleanup complete!${NC}"
# 1. Check Python
# Stops the script with status 1 when no `python3` executable is on PATH;
# otherwise records and prints what `python3 --version` reports.
echo -e "${BLUE}π Checking Python...${NC}"
if ! command -v python3 &> /dev/null; then
  echo -e "${RED}β Python3 not found!${NC}"
  echo "Please install Python 3.8 or higher"
  exit 1
fi

PYTHON_VERSION=$(python3 --version)
echo -e "${GREEN}β ${PYTHON_VERSION}${NC}"
# 2. Check pip
# Either `pip3` or `pip` on PATH is enough; the script stops with status 1
# only when both are missing.
echo -e "\n${BLUE}π¦ Checking pip...${NC}"
if ! command -v pip3 &> /dev/null && ! command -v pip &> /dev/null; then
  echo -e "${RED}β pip not found!${NC}"
  exit 1
fi
echo -e "${GREEN}β pip found${NC}"
# 3. Install dependencies
echo -e "\n${BLUE}π¦ Installing dependencies...${NC}"
echo -e "${YELLOW}This may take a few minutes...${NC}"

#######################################
# Run `<pip> install -q ARGS...`, trying pip3 first and then pip.
# An installer that is not on PATH is skipped instead of being invoked, so a
# machine with only `pip` no longer prints "pip3: command not found".
# Arguments: everything is passed through to `install -q`
# Returns:   0 as soon as one installer succeeds, 1 when none did
#######################################
pip_install() {
  local installer
  for installer in pip3 pip; do
    if command -v "${installer}" > /dev/null 2>&1 \
      && "${installer}" install -q "$@"; then
      return 0
    fi
  done
  return 1
}

# Check if requirements.txt exists
if [ -f "requirements.txt" ]; then
  if ! pip_install -r requirements.txt; then
    # The later steps need these packages, so stop here explicitly rather
    # than relying on `set -e` being in effect.
    echo -e "${RED}Dependency installation failed${NC}" >&2
    exit 1
  fi
  echo -e "${GREEN}β Dependencies installed from requirements.txt${NC}"
else
  echo -e "${YELLOW}β οΈ requirements.txt not found, installing core packages...${NC}"
  if ! pip_install langchain langchain-chroma langchain-huggingface chromadb \
    tqdm beautifulsoup4 requests; then
    echo -e "${RED}Dependency installation failed${NC}" >&2
    exit 1
  fi
  echo -e "${GREEN}β Core dependencies installed${NC}"
fi
# 4. Create directories
echo -e "\n${BLUE}π Creating directories...${NC}"

#######################################
# Create the working directories used by the following steps, relative to
# the current directory. Existing directories are left untouched (mkdir -p).
# Arguments: none
# Returns:   mkdir's exit status
#######################################
create_directories() {
  mkdir -p \
    rag/vector_store \
    data_mining/datasets \
    data_mining/output \
    chroma_db
}

create_directories
echo -e "${GREEN}β Directories created${NC}"
#######################################
# Build one dataset's vector store(s) with a data_mining script.
#
# Skips the work when every destination already exists. Otherwise it creates
# the temporary data_mining/datasets and data_mining/output folders, runs the
# script (python3 first, python as fallback), moves each produced directory
# from data_mining/output/ to rag/vector_store/, and finally removes the
# temporary folders again - whether the script succeeded or not.
#
# The interpreter call sits in the `if` condition on purpose: as a plain
# command followed by `[ $? -eq 0 ]`, a failing script would end the whole
# setup under `set -e` before the "failed, continuing" branch or the
# temporary-folder cleanup could run.
#
# Globals:   YELLOW, GREEN, NC (read)
# Arguments: $1 - script file name inside data_mining/
#            $2 - message printed when the store(s) already exist
#            $3 - message printed on success
#            $4 - message printed when the script fails
#            $5... - SRC:DEST pairs; data_mining/output/SRC is moved to
#                    rag/vector_store/DEST
# Returns:   0 on skip, success, or script failure (setup continues)
#######################################
build_vector_store() {
  local script=$1 skip_msg=$2 ready_msg=$3 failed_msg=$4
  shift 4

  local pair dest missing=0
  for pair in "$@"; do
    [ -d "rag/vector_store/${pair#*:}" ] || missing=1
  done
  if [ "${missing}" -eq 0 ]; then
    echo -e "${YELLOW}${skip_msg}${NC}"
    return 0
  fi

  # Create temp directories
  mkdir -p data_mining/datasets data_mining/output

  if python3 "data_mining/${script}" || python "data_mining/${script}"; then
    # Move to RAG directory
    mkdir -p rag/vector_store
    for pair in "$@"; do
      dest="rag/vector_store/${pair#*:}"
      # A destination that already exists is kept as is: `mv` would otherwise
      # place the new directory *inside* it instead of replacing it.
      if [ -d "${dest}" ]; then
        continue
      fi
      mv "data_mining/output/${pair%%:*}" "${dest}"
    done
    echo -e "${GREEN}${ready_msg}${NC}"
  else
    echo -e "${YELLOW}${failed_msg}${NC}"
  fi

  # Cleanup
  rm -rf data_mining/datasets data_mining/output
}

# 5. Setup ViMedical Vietnamese Disease Dataset
echo -e "\n${BLUE}π₯ Setting up ViMedical Vietnamese Disease Dataset...${NC}"
echo -e "${YELLOW}This will download and process 603 Vietnamese diseases...${NC}"
build_vector_store mining_vimedical.py \
  "β οΈ ViMedical database already exists, skipping..." \
  "β ViMedical dataset ready (603 diseases)" \
  "β οΈ ViMedical setup failed, continuing..." \
  medical_chroma:medical_diseases

# 6. Setup MentalChat16K Mental Health Dataset
echo -e "\n${BLUE}π§ Setting up MentalChat16K Mental Health Dataset...${NC}"
echo -e "${YELLOW}This will download and process 16K mental health conversations...${NC}"
build_vector_store mining_mentalchat.py \
  "β οΈ Mental Health database already exists, skipping..." \
  "β Mental Health dataset ready (16K conversations)" \
  "β οΈ Mental Health setup failed, continuing..." \
  mental_health_chroma:mental_health

# 7. Setup Nutrition Dataset (Dietary Profiles)
echo -e "\n${BLUE}π₯ Setting up Nutrition Dataset (Dietary Profiles)...${NC}"
echo -e "${YELLOW}This will download 50 dietary profiles...${NC}"
build_vector_store mining_nutrition.py \
  "β οΈ Nutrition database already exists, skipping..." \
  "β Nutrition profiles ready (50 profiles)" \
  "β οΈ Nutrition setup failed, continuing..." \
  nutrition_chroma:nutrition

# 7b. Setup Vietnamese Food Nutrition Database
echo -e "\n${BLUE}π Setting up Vietnamese Food Nutrition Database...${NC}"
echo -e "${YELLOW}This will create 73 Vietnamese foods with nutrition facts...${NC}"
build_vector_store mining_vietnamese_nutrition.py \
  "β οΈ Vietnamese nutrition database already exists, skipping..." \
  "β Vietnamese food nutrition ready (73 foods)" \
  "β οΈ Vietnamese nutrition setup failed, continuing..." \
  vietnamese_nutrition_chroma:vietnamese_nutrition

# 8. Setup Fitness Dataset
echo -e "\n${BLUE}πͺ Setting up Fitness Dataset...${NC}"
echo -e "${YELLOW}This will download and process gym exercises...${NC}"
build_vector_store mining_fitness.py \
  "β οΈ Fitness database already exists, skipping..." \
  "β Fitness dataset ready" \
  "β οΈ Fitness setup failed, continuing..." \
  fitness_chroma:fitness

# 9. Setup COVID-19 Dataset (DEPRECATED - Skipped)
echo -e "\n${BLUE}π¦ COVID-19 Dataset...${NC}"
echo -e "${YELLOW}βοΈ Skipping (dataset deprecated, already have Medical Q&A)${NC}"

# 10. Setup Vietnamese Medical Q&A Dataset
# One script produces two stores; the step is skipped only when both exist.
echo -e "\n${BLUE}π¬ Setting up Vietnamese Medical Q&A Dataset...${NC}"
echo -e "${YELLOW}This will download and process 9.3K medical Q&A pairs from HuggingFace...${NC}"
build_vector_store mining_medical_qa.py \
  "β οΈ Medical Q&A databases already exist, skipping..." \
  "β Medical Q&A datasets ready (Symptom + General Health)" \
  "β οΈ Medical Q&A setup failed, continuing..." \
  symptom_qa_chroma:symptom_qa \
  general_health_qa_chroma:general_health_qa
# 11. Verify RAG
# Best effort: tries python3, then python; stderr of both is discarded and a
# failure of both only prints a notice, so this step never stops the script.
echo -e "\n${BLUE}β Verifying RAG system...${NC}"
python3 scripts/check_rag_status.py 2>/dev/null \
  || python scripts/check_rag_status.py 2>/dev/null \
  || echo "β οΈ Verification skipped"
# 12. Generate Training Data (DISABLED - Not needed without fine-tuning)
# The commented-out code of steps 12 and 13 is kept for reference; none of
# it runs.
# echo -e "\n${BLUE}π€ Generating synthetic training data...${NC}"
# echo -e "${YELLOW}This will create ~200 conversations for fine-tuning...${NC}"
#
# if [ -d "fine_tuning/training_data" ] && [ "$(ls -A fine_tuning/training_data 2>/dev/null)" ]; then
#   echo -e "${YELLOW}β οΈ Training data already exists, skipping generation...${NC}"
# else
#   python3 scripts/generate_training_data.py || python scripts/generate_training_data.py
#   if [ $? -eq 0 ]; then
#     echo -e "${GREEN}β Training data generated!${NC}"
#   else
#     echo -e "${YELLOW}β οΈ Training data generation failed, continuing...${NC}"
#   fi
# fi

# 13. Fine-tune Models (DISABLED - Custom API doesn't support fine-tuning)
# Fine-tuning requires OpenAI official API, which costs money and is not necessary
# The app works well with base model + RAG without fine-tuning
#
# echo -e "\n${BLUE}π Fine-tuning agents...${NC}"
# echo -e "${YELLOW}This will fine-tune all agents with synthetic data (takes 30-60 min, costs ~\$2)${NC}"
# echo -e "${YELLOW}Do you want to fine-tune now? (y/N)${NC}"
# read -t 10 -n 1 -r FINETUNE_CHOICE || FINETUNE_CHOICE="n"
# echo
#
# if [[ $FINETUNE_CHOICE =~ ^[Yy]$ ]]; then
#   echo -e "${BLUE}π Starting fine-tuning...${NC}"
#   python3 scripts/auto_finetune.py || python scripts/auto_finetune.py
#   if [ $? -eq 0 ]; then
#     echo -e "${GREEN}β Fine-tuning complete!${NC}"
#   else
#     echo -e "${YELLOW}β οΈ Fine-tuning failed, check errors above${NC}"
#   fi
# else
#   echo -e "${YELLOW}βοΈ Skipping fine-tuning (you can run it later with: python scripts/auto_finetune.py)${NC}"
# fi

# Tell the user why the two steps above were not run.
echo -e "\n${YELLOW}βΉοΈ Training data generation and fine-tuning are disabled${NC}"
echo -e "${YELLOW} Reason: Custom API doesn't support fine-tuning (404 error)${NC}"
echo -e "${YELLOW} App works well with base model + RAG without fine-tuning${NC}"
# Done
# Closing banner followed by a static summary. The list below is printed
# unconditionally - it is not derived from which steps actually succeeded.
echo -e "\n${GREEN}"
echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
echo "β π Setup Complete! β"
echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
echo -e "${NC}"

echo -e "${BLUE}π What was set up:${NC}"
echo " β RAG databases (6 specialized databases, ~160 MB)"
echo " - ViMedical Diseases (603 diseases)"
echo " - Mental Health (16K conversations)"
echo " - Nutrition Plans"
echo " - Vietnamese Food (73 items)"
echo " - Fitness Exercises (1.66K)"
echo " - Medical Q&A (9.3K pairs)"
echo ""

echo -e "${BLUE}π Next steps:${NC}"
echo " 1. python app.py"
echo " 2. Open http://localhost:7860 in your browser"
echo ""

echo -e "${BLUE}π‘ Tips:${NC}"
echo " - Check RAG status: python scripts/check_rag_status.py"
echo " - App works with base model + RAG (no fine-tuning needed)"
echo ""