Spaces:
Runtime error
Runtime error
| from langchain_community.chat_models import ChatOpenAI | |
| from langchain.prompts import PromptTemplate | |
| from langchain.chains import LLMChain | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| import tiktoken | |
| from typing import Union | |
| from config import OPENAI_API_KEY, SERP_API_KEY | |
| import os | |
| from serpapi import GoogleSearch | |
def get_google_scrape(query, api_key=SERP_API_KEY, num=25):
    """Run a Google search through SerpApi and return the raw response.

    Args:
        query: Search string, sent as the ``q`` request parameter.
        api_key: SerpApi key; defaults to ``SERP_API_KEY`` from ``config``.
        num: Value sent as the ``num`` request parameter.

    Returns:
        Whatever ``GoogleSearch.get_dict()`` returns for this request.
    """
    # NOTE(review): "start" and "end" are sent as fixed strings on every
    # call -- confirm against the SerpApi docs that both are honoured and
    # that a start offset of "1" is intended.
    request_params = dict(
        q=query,
        api_key=api_key,
        start="1",
        end="10",
        num=num,
    )
    return GoogleSearch(request_params).get_dict()
def init_extractor(
    template: str,
    openai_api_key: Union[str, None] = None,
    max_tokens: int = 1000,
    chunk_size: int = 300,
    chunk_overlap: int = 40
):
    """Build the LLM extraction chain and the text splitter used with it.

    Args:
        template: Prompt template text. It is formatted with a single input
            variable named ``refs``.
        openai_api_key: OpenAI key to use. When omitted or empty, the
            ``OPENAI_API_KEY`` environment variable is tried, then the
            ``OPENAI_API_KEY`` constant imported from ``config``.
        max_tokens: Passed to ``ChatOpenAI`` as ``max_tokens``.
        chunk_size: Passed to ``tiktoken_splitter`` as ``chunk_size``.
        chunk_overlap: Passed to ``tiktoken_splitter`` as ``chunk_overlap``.

    Returns:
        A ``(extractor, text_splitter)`` tuple: the ``LLMChain`` and the
        splitter returned by ``tiktoken_splitter``.

    Raises:
        Exception: If no API key is available from any of the three sources.
    """
    # Resolve the key once and actually use it below. Previously the resolved
    # value was discarded and the config constant was always sent, so a key
    # passed by the caller had no effect.
    api_key = (
        openai_api_key
        or os.environ.get('OPENAI_API_KEY')
        or OPENAI_API_KEY
    )
    if not api_key:
        raise Exception('No OpenAI API key provided')

    # instantiate the OpenAI API wrapper
    llm = ChatOpenAI(
        model_name='gpt-3.5-turbo-16k',
        openai_api_key=api_key,
        max_tokens=max_tokens,
        temperature=0.0
    )
    # initialize prompt template
    prompt = PromptTemplate(
        template=template,
        input_variables=['refs']
    )
    # instantiate the LLMChain extractor model
    extractor = LLMChain(
        prompt=prompt,
        llm=llm
    )
    text_splitter = tiktoken_splitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return extractor, text_splitter
def tiktoken_splitter(chunk_size=300, chunk_overlap=40, encoding_name='p50k_base'):
    """Create a recursive text splitter whose lengths are counted in tokens.

    Args:
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``; measured with the token-count function below.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.
        encoding_name: Name of the tiktoken encoding used to count tokens.
            Defaults to ``'p50k_base'``, the value this function has always
            used. NOTE(review): gpt-3.5-turbo models are tokenized with
            ``'cl100k_base'``; pass that to make chunk sizes match the model
            -- confirm against the tiktoken model table.

    Returns:
        The configured ``RecursiveCharacterTextSplitter``.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)

    def len_fn(text):
        # disallowed_special=() is forwarded as-is; per tiktoken's docs it
        # lets special-token text be encoded as ordinary text.
        return len(tokenizer.encode(text, disallowed_special=()))

    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len_fn,
        separators=["\n\n", "\n", " ", ""]
    )