import os import json import gradio as gr import numpy as np import pandas as pd import torch from langchain_core.vectorstores import InMemoryVectorStore from langchain_huggingface import HuggingFaceEmbeddings from huggingface_hub import hf_hub_download from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig from huggingface_hub import hf_hub_download, InferenceClient quantization_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, bnb_4bit_quant_type="nf4" ) EMBEDDING_NAME = "Qwen/Qwen3-Embedding-0.6B" #EMBEDDING_NAME = "sentence-transformers/all-MiniLM-L6-v2" # HISTORY = [ # {"role":"system", "content":"You are a helpful assistant."} # ] MODEL_NAME = "Qwen/Qwen1.5-0.5B-Chat" #MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True, padding_side="left") MODEL = AutoModelForCausalLM.from_pretrained( MODEL_NAME, trust_remote_code=True, torch_dtype="auto", device_map="auto", quantization_config=quantization_config, ) # if TOKENIZER.pad_token is None: # TOKENIZER.pad_token = TOKENIZER.eos_token # client = InferenceClient( # model=MODEL_NAME, # token=os.environ.get("DATASET_TOK") # ) def get_vecstore_from_json(embedding_name): json_path = hf_hub_download( repo_id="yucxy/cv", filename="cv_vectors_20250814-1613.json", repo_type="dataset", token=os.environ.get("DATASET_TOK") ) with open(json_path, "r", encoding="utf-8") as f: data = json.load(f) embeddings = HuggingFaceEmbeddings(model_name=embedding_name) vector_store = InMemoryVectorStore(embeddings) texts = [item["content"] for item in data] metadatas = [item["metadata"] for item in data] embeddings_array = [np.array(item["embedding"], dtype=np.float32) for item in data] vector_store.add_texts(texts=texts, metadatas=metadatas, embeddings=embeddings_array) return vector_store VECSTORE = get_vecstore_from_json(EMBEDDING_NAME) def ask_question(message, history): print("test3...") results = VECSTORE.similarity_search(message, k=1) top_0 = results[0].page_content prompt = f""" You must use ONLY provided references when answering. Reference: \"\"\"{top_0}\"\"\" Question: {message} Answer: """ #prompt = "You are a helpful chatbot" history.append({"role":"user", "content":prompt}) text = TOKENIZER.apply_chat_template( history, tokenize=False, add_generation_prompt=True, ) model_inputs = TOKENIZER([text], return_tensors="pt").to(MODEL.device) with torch.no_grad(): generated_ids = MODEL.generate( **model_inputs, max_new_tokens=128, pad_token_id=TOKENIZER.eos_token_id, ) output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() try: index = len(output_ids) - output_ids[::-1].index(151668) except ValueError: index = 0 content = TOKENIZER.decode(output_ids[index:], skip_special_tokens=True).strip("\n") print("test4...") # response = client.text_generation( # prompt, # max_new_tokens=100, # temperature=0.3, # do_sample=True, # repetition_penalty=1.1 # ) # print("test5...") # response = response.strip() # if len(response) > 200: # response = response[:200] + "..." # print("test6...") return content if __name__ == "__main__": print("test...") print("test2...") gr.ChatInterface( fn=ask_question, type="messages", chatbot=gr.Chatbot([], type="messages", height=300), textbox=gr.Textbox(placeholder="Ask me a question about my CV", container=False, scale=7), title="CV Semantic Search Chatbot Demo", description="Ask the chatbot a question about my CV", theme="ocean", ).launch() # import os # import json # import numpy as np # from numpy.linalg import norm # import gradio as gr # from huggingface_hub import InferenceClient # from sentence_transformers import SentenceTransformer # # Configuration # EMBEDDING_NAME = "sentence-transformers/all-MiniLM-L6-v2" # MODEL_NAME = "Qwen/Qwen1.5-0.5B-Chat" # JSON_FILE_PATH = "cv_vectors_20250814-1613.json" # Local file path # # Initialize models # try: # embedding_model = SentenceTransformer(EMBEDDING_NAME) # client = InferenceClient(model=MODEL_NAME, token=os.environ.get("DATASET_TOK")) # print("Models initialized successfully") # except Exception as e: # print(f"Error initializing models: {e}") # embedding_model = None # client = None # def load_document_chunks(): # """Load document chunks from local JSON file""" # try: # # Check if file exists locally # if not os.path.exists(JSON_FILE_PATH): # print(f"File {JSON_FILE_PATH} not found locally") # return [], [] # with open(JSON_FILE_PATH, "r", encoding="utf-8") as f: # data = json.load(f) # # Extract chunks and embeddings # chunks = [item["content"] for item in data] # embeddings = [np.array(item["embedding"], dtype=np.float32) for item in data] # print(f"Loaded {len(chunks)} document chunks from local file") # return chunks, embeddings # except Exception as e: # print(f"Error loading local document: {e}") # return [], [] # def cosine_similarity(vec_a, vec_b): # """Compute cosine similarity between two vectors""" # return np.dot(vec_a, vec_b) / (norm(vec_a) * norm(vec_b)) # def find_most_relevant_chunk(query, chunks, embeddings): # """Find the most relevant chunk using cosine similarity""" # if not chunks or not embeddings: # return "No document content available." # # Embed the query # query_embedding = embedding_model.encode(query) # # Compute similarities # similarities = [] # for chunk_embedding in embeddings: # similarity = cosine_similarity(query_embedding, chunk_embedding) # similarities.append(similarity) # # Get the most relevant chunk # most_relevant_idx = np.argmax(similarities) # return chunks[most_relevant_idx] # # Pre-load document data when the app starts # DOC_CHUNKS, DOC_EMBEDDINGS = load_document_chunks() # def ask_question(message, history): # print("Processing question...") # if client is None: # return "Error: Model not initialized" # try: # # Find relevant context using pre-loaded data # relevant_context = find_most_relevant_chunk(message, DOC_CHUNKS, DOC_EMBEDDINGS) # # Create RAG prompt # prompt = f"""Based on the following reference information, please answer the question. If the reference doesn't contain the answer, say you don't know. # REFERENCE: # \"\"\"{relevant_context}\"\"\" # QUESTION: {message} # ANSWER:""" # # Generate response # response = client.text_generation( # prompt, # max_new_tokens=100, # temperature=0.3, # do_sample=True, # repetition_penalty=1.1 # ) # return response.strip() # except Exception as e: # print(f"Error: {e}") # return f"Sorry, I encountered an error: {str(e)}" # if __name__ == "__main__": # print("Starting application...") # gr.ChatInterface( # fn=ask_question, # title="CV RAG Chatbot", # description="Ask questions about my CV", # theme="soft" # ).launch(share=False)