my-gradio-app / scripts /setup_rag.sh
Nguyen Trong Lap
Recreate history without binary blobs
eeb0f9c
raw
history blame
12.9 kB
#!/bin/bash
# Setup RAG system - One command to rule them all
# Usage: bash scripts/setup_rag.sh

# Stop at the first failing command (long-form spelling of `set -e`).
set -o errexit

# ANSI colour codes. They are stored as literal backslash sequences and only
# turn into escape characters when printed through `echo -e`.
NC='\033[0m' # No Color (reset)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'

# Title banner: switch to blue, print the three box lines verbatim, reset.
echo -e "${BLUE}"
printf '%s\n' \
  "╔════════════════════════════════════════════════════════════╗" \
  "β•‘ πŸ₯ HeoCare RAG System Setup (HuggingFace) β•‘" \
  "β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•"
echo -e "${NC}"
# 0. Cleanup old files and databases
# Every removal below is best-effort: stderr is discarded and a failure never
# aborts the script.
# NOTE(review): the vector-store wipe in this step empties rag/vector_store on
# every run, so the "already exists, skipping" checks in the dataset steps
# further down presumably never trigger - confirm a full rebuild is intended.
echo -e "${BLUE}🧹 Cleaning up old files and databases...${NC}"

# Stale documents under data_mining: any *.pdf, plus any *.md that is not
# README.md. The parentheses only spell out find's default precedence.
if find data_mining \( -name '*.pdf' -o \( -name '*.md' ! -name 'README.md' \) \) 2>/dev/null | grep -q .; then
  echo -e "${YELLOW} Removing old PDF/MD files...${NC}"
  find data_mining -type f -name '*.pdf' -delete 2>/dev/null || true
  find data_mining -type f -name '*.md' ! -name 'README.md' -delete 2>/dev/null || true
  echo -e "${GREEN} βœ… Old documents removed${NC}"
fi

# Scratch folders used by the mining scripts.
if [[ -d data_mining/datasets || -d data_mining/output ]]; then
  echo -e "${YELLOW} Clearing temporary folders...${NC}"
  rm -rf data_mining/datasets data_mining/output 2>/dev/null || true
  echo -e "${GREEN} βœ… Temporary folders cleared${NC}"
fi

# Existing vector stores: the folder itself is kept, its (non-hidden)
# contents are removed.
if [[ -d rag/vector_store ]]; then
  echo -e "${YELLOW} Clearing old vector stores...${NC}"
  rm -rf rag/vector_store/* 2>/dev/null || true
  echo -e "${GREEN} βœ… Old vector stores cleared${NC}"
fi

# Python bytecode caches anywhere below the current directory.
if [[ -d __pycache__ ]] || find . -type d -name '__pycache__' 2>/dev/null | grep -q .; then
  echo -e "${YELLOW} Clearing Python cache...${NC}"
  find . -type d -name '__pycache__' -exec rm -rf {} + 2>/dev/null || true
  find . -type f -name '*.pyc' -delete 2>/dev/null || true
  echo -e "${GREEN} βœ… Python cache cleared${NC}"
fi

echo -e "${GREEN}βœ… Cleanup complete!${NC}"
# 1. Check Python
echo -e "${BLUE}🐍 Checking Python...${NC}"

# Guard clause: stop immediately when no python3 command can be resolved.
command -v python3 >/dev/null 2>&1 || {
  echo -e "${RED}❌ Python3 not found!${NC}"
  echo "Please install Python 3.8 or higher"
  exit 1
}

# Show whatever version string the interpreter reports about itself.
PYTHON_VERSION="$(python3 --version)"
echo -e "${GREEN}βœ… ${PYTHON_VERSION}${NC}"
# 2. Check pip
echo -e "\n${BLUE}πŸ“¦ Checking pip...${NC}"

# Either launcher name is acceptable; abort only when both are missing.
if ! { command -v pip3 >/dev/null 2>&1 || command -v pip >/dev/null 2>&1; }; then
  echo -e "${RED}❌ pip not found!${NC}"
  exit 1
fi

echo -e "${GREEN}βœ… pip found${NC}"
# 3. Install dependencies
echo -e "\n${BLUE}πŸ“¦ Installing dependencies...${NC}"
echo -e "${YELLOW}This may take a few minutes...${NC}"

# Choose ONE installer up front. `python3 -m pip` is preferred because it
# installs into the python3 interpreter itself, whereas a bare pip3/pip found
# on PATH may belong to a different Python installation. The bare launchers
# remain as fallbacks for hosts where python3 cannot import the pip module.
if python3 -m pip --version >/dev/null 2>&1; then
  pip_cmd=(python3 -m pip)
elif command -v pip3 >/dev/null 2>&1; then
  pip_cmd=(pip3)
else
  pip_cmd=(pip)
fi

# A failed install stays fatal (the script runs under `set -e`). It is not
# retried with a second pip, which could scatter packages across interpreters.
if [[ -f "requirements.txt" ]]; then
  "${pip_cmd[@]}" install -q -r requirements.txt
  echo -e "${GREEN}βœ… Dependencies installed from requirements.txt${NC}"
else
  echo -e "${YELLOW}⚠️ requirements.txt not found, installing core packages...${NC}"
  "${pip_cmd[@]}" install -q \
    langchain langchain-chroma langchain-huggingface chromadb tqdm beautifulsoup4 requests
  echo -e "${GREEN}βœ… Core dependencies installed${NC}"
fi
# 4. Create directories
echo -e "\n${BLUE}πŸ“ Creating directories...${NC}"

# Working folders the rest of the script relies on; `mkdir -p` makes this a
# no-op for folders that already exist.
for required_dir in \
  rag/vector_store \
  data_mining/datasets \
  data_mining/output \
  chroma_db; do
  mkdir -p "${required_dir}"
done

echo -e "${GREEN}βœ… Directories created${NC}"
#######################################
# Build one group of vector stores by running a data_mining script, then
# move its output folder(s) into rag/vector_store.
#
# The miner runs inside an `if` on purpose: the script uses `set -e`, so a
# bare `python3 X || python X` followed by a `$?` check would abort the whole
# script on failure and the "failed, continuing..." branch could never run.
#
# Globals:
#   YELLOW, GREEN, NC (read) - colour escapes used for the status lines
# Arguments:
#   $1  - path of the mining script to run
#   $2  - message printed when every target store already exists
#   $3  - message printed when all stores were built and moved
#   $4  - message printed when the miner or a move failed
#   $5+ - one or more OUTPUT:TARGET pairs; data_mining/output/OUTPUT is moved
#         to rag/vector_store/TARGET
# Outputs:
#   exactly one status line on stdout (plus whatever the miner prints)
# Returns:
#   0 even when the build fails, so the caller carries on with the next dataset
#######################################
setup_dataset() {
  local miner=$1 skip_msg=$2 ok_msg=$3 fail_msg=$4
  shift 4
  local store_root="rag/vector_store"
  local pair staged final
  local missing=0 built=1

  # Skip only when every requested store is already present.
  for pair in "$@"; do
    [[ -d "${store_root}/${pair#*:}" ]] || missing=1
  done
  if [[ "${missing}" -eq 0 ]]; then
    echo -e "${YELLOW}${skip_msg}${NC}"
    return 0
  fi

  # Scratch folders for the miner.
  mkdir -p data_mining/datasets data_mining/output

  # NOTE(review): the `python` fallback also re-runs the miner when python3
  # itself started fine but the script failed - confirm that retry is wanted.
  if python3 "${miner}" || python "${miner}"; then
    mkdir -p "${store_root}"
    for pair in "$@"; do
      staged="data_mining/output/${pair%%:*}"
      final="${store_root}/${pair#*:}"
      # Drop a stale target first: `mv` onto an existing directory would nest
      # the fresh store inside the old one instead of replacing it.
      if ! { [[ -d "${staged}" ]] && rm -rf -- "${final}" && mv -- "${staged}" "${final}"; }; then
        built=0
      fi
    done
  else
    built=0
  fi

  if [[ "${built}" -eq 1 ]]; then
    echo -e "${GREEN}${ok_msg}${NC}"
  else
    echo -e "${YELLOW}${fail_msg}${NC}"
  fi

  # Always remove the scratch folders, on success and on failure.
  rm -rf data_mining/datasets data_mining/output
  return 0
}

# 5. Setup ViMedical Vietnamese Disease Dataset
echo -e "\n${BLUE}πŸ₯ Setting up ViMedical Vietnamese Disease Dataset...${NC}"
echo -e "${YELLOW}This will download and process 603 Vietnamese diseases...${NC}"
setup_dataset "data_mining/mining_vimedical.py" \
  "⚠️ ViMedical database already exists, skipping..." \
  "βœ… ViMedical dataset ready (603 diseases)" \
  "⚠️ ViMedical setup failed, continuing..." \
  "medical_chroma:medical_diseases"

# 6. Setup MentalChat16K Mental Health Dataset
echo -e "\n${BLUE}🧠 Setting up MentalChat16K Mental Health Dataset...${NC}"
echo -e "${YELLOW}This will download and process 16K mental health conversations...${NC}"
setup_dataset "data_mining/mining_mentalchat.py" \
  "⚠️ Mental Health database already exists, skipping..." \
  "βœ… Mental Health dataset ready (16K conversations)" \
  "⚠️ Mental Health setup failed, continuing..." \
  "mental_health_chroma:mental_health"

# 7. Setup Nutrition Dataset (Dietary Profiles)
echo -e "\n${BLUE}πŸ₯— Setting up Nutrition Dataset (Dietary Profiles)...${NC}"
echo -e "${YELLOW}This will download 50 dietary profiles...${NC}"
setup_dataset "data_mining/mining_nutrition.py" \
  "⚠️ Nutrition database already exists, skipping..." \
  "βœ… Nutrition profiles ready (50 profiles)" \
  "⚠️ Nutrition setup failed, continuing..." \
  "nutrition_chroma:nutrition"

# 7b. Setup Vietnamese Food Nutrition Database
echo -e "\n${BLUE}🍜 Setting up Vietnamese Food Nutrition Database...${NC}"
echo -e "${YELLOW}This will create 73 Vietnamese foods with nutrition facts...${NC}"
setup_dataset "data_mining/mining_vietnamese_nutrition.py" \
  "⚠️ Vietnamese nutrition database already exists, skipping..." \
  "βœ… Vietnamese food nutrition ready (73 foods)" \
  "⚠️ Vietnamese nutrition setup failed, continuing..." \
  "vietnamese_nutrition_chroma:vietnamese_nutrition"

# 8. Setup Fitness Dataset
echo -e "\n${BLUE}πŸ’ͺ Setting up Fitness Dataset...${NC}"
echo -e "${YELLOW}This will download and process gym exercises...${NC}"
setup_dataset "data_mining/mining_fitness.py" \
  "⚠️ Fitness database already exists, skipping..." \
  "βœ… Fitness dataset ready" \
  "⚠️ Fitness setup failed, continuing..." \
  "fitness_chroma:fitness"

# 9. Setup COVID-19 Dataset (DEPRECATED - Skipped)
echo -e "\n${BLUE}🦠 COVID-19 Dataset...${NC}"
echo -e "${YELLOW}⏭️ Skipping (dataset deprecated, already have Medical Q&A)${NC}"

# 10. Setup Vietnamese Medical Q&A Dataset
# One miner run produces two stores; it is skipped only when both exist.
echo -e "\n${BLUE}πŸ’¬ Setting up Vietnamese Medical Q&A Dataset...${NC}"
echo -e "${YELLOW}This will download and process 9.3K medical Q&A pairs from HuggingFace...${NC}"
setup_dataset "data_mining/mining_medical_qa.py" \
  "⚠️ Medical Q&A databases already exist, skipping..." \
  "βœ… Medical Q&A datasets ready (Symptom + General Health)" \
  "⚠️ Medical Q&A setup failed, continuing..." \
  "symptom_qa_chroma:symptom_qa" \
  "general_health_qa_chroma:general_health_qa"
# 11. Verify RAG
echo -e "\n${BLUE}βœ… Verifying RAG system...${NC}"

# Try the status checker with python3 first, then with python. Its error
# output is discarded, and if both attempts fail only a notice is printed -
# the script carries on either way.
if ! python3 scripts/check_rag_status.py 2>/dev/null; then
  if ! python scripts/check_rag_status.py 2>/dev/null; then
    echo "⚠️ Verification skipped"
  fi
fi
# 12. Generate Training Data (DISABLED - Not needed without fine-tuning)
# echo -e "\n${BLUE}πŸ€– Generating synthetic training data...${NC}"
# echo -e "${YELLOW}This will create ~200 conversations for fine-tuning...${NC}"
#
# if [ -d "fine_tuning/training_data" ] && [ "$(ls -A fine_tuning/training_data 2>/dev/null)" ]; then
# echo -e "${YELLOW}⚠️ Training data already exists, skipping generation...${NC}"
# else
# python3 scripts/generate_training_data.py || python scripts/generate_training_data.py
# if [ $? -eq 0 ]; then
# echo -e "${GREEN}βœ… Training data generated!${NC}"
# else
# echo -e "${YELLOW}⚠️ Training data generation failed, continuing...${NC}"
# fi
# fi
# 13. Fine-tune Models (DISABLED - Custom API doesn't support fine-tuning)
# Fine-tuning requires OpenAI official API, which costs money and is not necessary
# The app works well with base model + RAG without fine-tuning
#
# echo -e "\n${BLUE}πŸŽ“ Fine-tuning agents...${NC}"
# echo -e "${YELLOW}This will fine-tune all agents with synthetic data (takes 30-60 min, costs ~\$2)${NC}"
# echo -e "${YELLOW}Do you want to fine-tune now? (y/N)${NC}"
# read -t 10 -n 1 -r FINETUNE_CHOICE || FINETUNE_CHOICE="n"
# echo
#
# if [[ $FINETUNE_CHOICE =~ ^[Yy]$ ]]; then
# echo -e "${BLUE}πŸš€ Starting fine-tuning...${NC}"
# python3 scripts/auto_finetune.py || python scripts/auto_finetune.py
# if [ $? -eq 0 ]; then
# echo -e "${GREEN}βœ… Fine-tuning complete!${NC}"
# else
# echo -e "${YELLOW}⚠️ Fine-tuning failed, check errors above${NC}"
# fi
# else
# echo -e "${YELLOW}⏭️ Skipping fine-tuning (you can run it later with: python scripts/auto_finetune.py)${NC}"
# fi
# Explain why the training-data and fine-tuning steps are switched off.
echo -e "\n${YELLOW}ℹ️ Training data generation and fine-tuning are disabled${NC}"
echo -e "${YELLOW} Reason: Custom API doesn't support fine-tuning (404 error)${NC}"
echo -e "${YELLOW} App works well with base model + RAG without fine-tuning${NC}"

# Done
# Closing banner in green.
echo -e "\n${GREEN}"
printf '%s\n' \
  "╔════════════════════════════════════════════════════════════╗" \
  "β•‘ πŸŽ‰ Setup Complete! β•‘" \
  "β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•"
echo -e "${NC}"

# Static overview of the databases this script targets. It is printed
# unconditionally and does not inspect what was actually built.
echo -e "${BLUE}πŸ“Š What was set up:${NC}"
printf '%s\n' \
  " βœ… RAG databases (6 specialized databases, ~160 MB)" \
  " - ViMedical Diseases (603 diseases)" \
  " - Mental Health (16K conversations)" \
  " - Nutrition Plans" \
  " - Vietnamese Food (73 items)" \
  " - Fitness Exercises (1.66K)" \
  " - Medical Q&A (9.3K pairs)" \
  ""

# How to launch the app.
echo -e "${BLUE}πŸš€ Next steps:${NC}"
printf '%s\n' \
  " 1. python app.py" \
  " 2. Open http://localhost:7860 in your browser" \
  ""

# Follow-up hints.
echo -e "${BLUE}πŸ’‘ Tips:${NC}"
printf '%s\n' \
  " - Check RAG status: python scripts/check_rag_status.py" \
  " - App works with base model + RAG (no fine-tuning needed)" \
  ""