import os
import json

import gradio as gr
import numpy as np
import pandas as pd
import torch

from langchain_core.vectorstores import InMemoryVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import hf_hub_download, InferenceClient


quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

EMBEDDING_NAME = "Qwen/Qwen3-Embedding-0.6B"
#EMBEDDING_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# HISTORY = [
#         {"role":"system", "content":"You are a helpful assistant."}
#     ]
MODEL_NAME = "Qwen/Qwen1.5-0.5B-Chat"
#MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True, padding_side="left")
MODEL = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=quantization_config,
)

# if TOKENIZER.pad_token is None:
#     TOKENIZER.pad_token = TOKENIZER.eos_token

# client = InferenceClient(
#     model=MODEL_NAME,
#     token=os.environ.get("DATASET_TOK")
# )


def get_vecstore_from_json(embedding_name):
    json_path = hf_hub_download(
        repo_id="yucxy/cv",
        filename="cv_vectors_20250814-1613.json",
        repo_type="dataset",
        token=os.environ.get("DATASET_TOK")
    )
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    embeddings = HuggingFaceEmbeddings(model_name=embedding_name)
    vector_store = InMemoryVectorStore(embeddings)

    texts = [item["content"] for item in data]
    metadatas = [item["metadata"] for item in data]
    embeddings_array = [np.array(item["embedding"], dtype=np.float32) for item in data]

    vector_store.add_texts(texts=texts, metadatas=metadatas, embeddings=embeddings_array)

    return vector_store

VECSTORE = get_vecstore_from_json(EMBEDDING_NAME)


def ask_question(message, history):
    print("test3...")
    results = VECSTORE.similarity_search(message, k=1)
    top_0 = results[0].page_content

    prompt = f"""
    You must use ONLY provided references when answering.

    Reference:
    \"\"\"{top_0}\"\"\"

    Question:
    {message}

    Answer:
    """

    #prompt = "You are a helpful chatbot"

    history.append({"role":"user", "content":prompt})

    text = TOKENIZER.apply_chat_template(
        history,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = TOKENIZER([text], return_tensors="pt").to(MODEL.device)

    with torch.no_grad():
        generated_ids = MODEL.generate(
            **model_inputs,
            max_new_tokens=128,
            pad_token_id=TOKENIZER.eos_token_id,
        )
    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

    try:
        index = len(output_ids) - output_ids[::-1].index(151668)
    except ValueError:
        index = 0

    content = TOKENIZER.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
    print("test4...")
    # response = client.text_generation(
    #     prompt,
    #     max_new_tokens=100,
    #     temperature=0.3,
    #     do_sample=True,
    #     repetition_penalty=1.1
    # )

    # print("test5...")

    # response = response.strip()
    # if len(response) > 200:
    #     response = response[:200] + "..."

    # print("test6...")

    return content


if __name__ == "__main__":
    print("test...")


    print("test2...")


    gr.ChatInterface(
        fn=ask_question,
        type="messages",
        chatbot=gr.Chatbot([], type="messages", height=300),
        textbox=gr.Textbox(placeholder="Ask me a question about my CV", container=False, scale=7),
        title="CV Semantic Search Chatbot Demo",
        description="Ask the chatbot a question about my CV",
        theme="ocean",
    ).launch()


# import os
# import json
# import numpy as np
# from numpy.linalg import norm
# import gradio as gr
# from huggingface_hub import InferenceClient
# from sentence_transformers import SentenceTransformer

# # Configuration
# EMBEDDING_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# MODEL_NAME = "Qwen/Qwen1.5-0.5B-Chat"
# JSON_FILE_PATH = "cv_vectors_20250814-1613.json"  # Local file path

# # Initialize models
# try:
#     embedding_model = SentenceTransformer(EMBEDDING_NAME)
#     client = InferenceClient(model=MODEL_NAME, token=os.environ.get("DATASET_TOK"))
#     print("Models initialized successfully")
# except Exception as e:
#     print(f"Error initializing models: {e}")
#     embedding_model = None
#     client = None

# def load_document_chunks():
#     """Load document chunks from local JSON file"""
#     try:
#         # Check if file exists locally
#         if not os.path.exists(JSON_FILE_PATH):
#             print(f"File {JSON_FILE_PATH} not found locally")
#             return [], []
        
#         with open(JSON_FILE_PATH, "r", encoding="utf-8") as f:
#             data = json.load(f)
        
#         # Extract chunks and embeddings
#         chunks = [item["content"] for item in data]
#         embeddings = [np.array(item["embedding"], dtype=np.float32) for item in data]
        
#         print(f"Loaded {len(chunks)} document chunks from local file")
#         return chunks, embeddings
        
#     except Exception as e:
#         print(f"Error loading local document: {e}")
#         return [], []

# def cosine_similarity(vec_a, vec_b):
#     """Compute cosine similarity between two vectors"""
#     return np.dot(vec_a, vec_b) / (norm(vec_a) * norm(vec_b))

# def find_most_relevant_chunk(query, chunks, embeddings):
#     """Find the most relevant chunk using cosine similarity"""
#     if not chunks or not embeddings:
#         return "No document content available."
    
#     # Embed the query
#     query_embedding = embedding_model.encode(query)
    
#     # Compute similarities
#     similarities = []
#     for chunk_embedding in embeddings:
#         similarity = cosine_similarity(query_embedding, chunk_embedding)
#         similarities.append(similarity)
    
#     # Get the most relevant chunk
#     most_relevant_idx = np.argmax(similarities)
#     return chunks[most_relevant_idx]

# # Pre-load document data when the app starts
# DOC_CHUNKS, DOC_EMBEDDINGS = load_document_chunks()

# def ask_question(message, history):
#     print("Processing question...")
    
#     if client is None:
#         return "Error: Model not initialized"
    
#     try:
#         # Find relevant context using pre-loaded data
#         relevant_context = find_most_relevant_chunk(message, DOC_CHUNKS, DOC_EMBEDDINGS)
        
#         # Create RAG prompt
#         prompt = f"""Based on the following reference information, please answer the question. If the reference doesn't contain the answer, say you don't know.

# REFERENCE:
# \"\"\"{relevant_context}\"\"\"

# QUESTION: {message}

# ANSWER:"""
        
#         # Generate response
#         response = client.text_generation(
#             prompt,
#             max_new_tokens=100,
#             temperature=0.3,
#             do_sample=True,
#             repetition_penalty=1.1
#         )

#         return response.strip()
        
#     except Exception as e:
#         print(f"Error: {e}")
#         return f"Sorry, I encountered an error: {str(e)}"

# if __name__ == "__main__":
#     print("Starting application...")
    
#     gr.ChatInterface(
#         fn=ask_question,
#         title="CV RAG Chatbot",
#         description="Ask questions about my CV",
#         theme="soft"
#     ).launch(share=False)