Spaces:
Sleeping
Sleeping
| import os | |
| import time | |
| import asyncio | |
| from llama_index.core.query_engine import CitationQueryEngine | |
| from llama_index.core import VectorStoreIndex | |
| from llama_index.core import Settings | |
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
| from llama_index.llms.gemini import Gemini | |
| from llama_index.core.postprocessor import SimilarityPostprocessor | |
| from llama_index.core.storage.docstore import SimpleDocumentStore | |
| from llama_index.core import StorageContext, load_index_from_storage | |
| from llama_index.core.data_structs import Node | |
| from llama_index.core.schema import NodeWithScore | |
| import re | |
| import pandas as pd | |
| import gradio as gr | |
| import logging | |
| #Enable logging to see what's happening behind the scenes | |
| logging.basicConfig(level=logging.INFO) | |
| token_w = os.environ['token_w'] | |
| HF_TOKEN=os.environ['token_r'] | |
| API_KEY=os.environ["GOOGLE_API_KEY"] | |
| generation_config = { | |
| "temperature": 0, | |
| # "top_p": 1, | |
| # "top_k": 1, | |
| "max_output_tokens":8192, | |
| } | |
| safety_settings = [ | |
| { | |
| "category": "HARM_CATEGORY_HARASSMENT", | |
| "threshold": "BLOCK_NONE" | |
| }, | |
| { | |
| "category": "HARM_CATEGORY_HATE_SPEECH", | |
| "threshold": "BLOCK_NONE" | |
| }, | |
| { | |
| "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", | |
| "threshold": "BLOCK_NONE" | |
| }, | |
| { | |
| "category": "HARM_CATEGORY_DANGEROUS_CONTENT", | |
| "threshold": "BLOCK_NONE" | |
| }, | |
| ] | |
| llm = Gemini( | |
| model="models/gemini-1.5-flash-002", | |
| generation_config=generation_config, | |
| safety_settings=safety_settings, | |
| ) | |
| # Setup embedder | |
| embed_model_name = "BAAI/bge-small-en-v1.5" | |
| embed_model = HuggingFaceEmbedding(model_name=embed_model_name) | |
| Settings.llm = llm | |
| Settings.embed_model = embed_model | |
| # rebuild storage context | |
| storage_context = StorageContext.from_defaults(persist_dir="VectorStore") | |
| # load index | |
| index_persisted = load_index_from_storage(storage_context, index_id="vector_index") | |
| async def remove_ref(text): | |
| """Removes content after 'Reference Papers' (case-insensitive).""" | |
| split_text = re.split(r'\bReference Papers\b', text, flags=re.IGNORECASE) | |
| return split_text[0].strip() if len(split_text) > 1 else text.strip() | |
| async def clean_trial_text(text): | |
| """Removes intro text from references if present.""" | |
| sections, cleaned_sections, in_references = text.split('\n'), [], False | |
| has_intro_text, found_numbers, reference_title_index = False, False, -1 | |
| for i, line in enumerate(sections): | |
| if re.match(r'Reference Papers\s*$', line, re.IGNORECASE): | |
| in_references, reference_title_index = True, len(cleaned_sections) | |
| cleaned_sections.append(line) | |
| continue | |
| if in_references and not found_numbers: | |
| if re.match(r'\d+\.', line.strip()): | |
| found_numbers = True | |
| else: | |
| if line.strip(): | |
| has_intro_text = True | |
| cleaned_sections.append(line) | |
| continue | |
| if not in_references: | |
| cleaned_sections.append(line) | |
| if in_references and not has_intro_text and reference_title_index != -1: | |
| cleaned_sections.pop(reference_title_index) | |
| return '\n'.join(cleaned_sections).strip() | |
| async def get_criteria(study_information, top_k): | |
| """Fetches eligibility criteria and metadata for a study.""" | |
| criteria_response = await query_engine_get_study.aquery(f""" | |
| Based on the provided instructions and clinical trial information, generate the new eligibility criteria specific for clinical trial information. | |
| ### Instruction: | |
| Find suitable papers that are relevant or similar to the provided clinical trial information (### Clinical Trial Information). | |
| Prioritize the following topics when finding related studies: | |
| 1. Study Objectives | |
| 2. Study Design and Phases | |
| 3. Conditions | |
| 4. Intervention/Treatment | |
| Criteria Generation: | |
| As a clinical researcher, generate new eligibility criteria for the given clinical trial information. | |
| Analyze the information from all {top_k} related studies to generate new precise eligibility criteria. | |
| Ensure that the criteria are specific for the given clinical trial information (### Clinical Trial Information). | |
| Please follow the pattern of the output (### Pattern of the output). | |
| -------------------------------------------------- | |
| ### Clinical Trial Information | |
| {study_information} | |
| -------------------------------------------------- | |
| ### Pattern of the Output | |
| Inclusion Criteria | |
| 1. | |
| 2. | |
| ... | |
| Exclusion Criteria | |
| 1. | |
| 2. | |
| ... | |
| """) | |
| metadata_list = [source.node.get_metadata_str() for source in criteria_response.source_nodes] | |
| return criteria_response.response, metadata_list | |
| async def process_reference(metadata_list): | |
| """Formats metadata list into a numbered string.""" | |
| return "\n".join([f"{i + 1}. {meta}" for i, meta in enumerate(metadata_list)]) | |
| async def get_response(criteria, reference): | |
| """Processes eligibility criteria and updates references to match new numbering.""" | |
| response = await llm.acomplete(f""" | |
| ### Task Description: | |
| You are tasked with processing clinical trial metadata and eligibility criteria. The goal is to clean, reorder, and maintain consistency between the metadata and references used in eligibility criteria. | |
| ### Instructions: | |
| 1. Review the eligibility criteria provided, which include references to metadata numbers (e.g., [1], [2], etc.). Identify all reference numbers that are actually used in the criteria. | |
| 2. Remove metadata of reference papers (### Metadata of Reference Papers) that does not have a corresponding reference in the eligibility criteria. This will ensure only relevant references are kept. | |
| 3. Reorder the remaining metadata so that they are numbered sequentially, starting from 1. | |
| 4. Update the reference numbers in the eligibility criteria accordingly to reflect the new order. | |
| 5. Maintain Criteria Consistency: Ensure that the eligibility criteria remain exactly the same in terms of content, but the reference numbers are updated to match the new numbering of metadata. | |
| -------------------------------------------------- | |
| ### Eligibility Criteria | |
| {criteria} | |
| -------------------------------------------------- | |
| ### Metadata of Reference Papers | |
| {reference} | |
| -------------------------------------------------- | |
| ### Pattern of the Output | |
| Inclusion Criteria | |
| 1. | |
| 2. | |
| ... | |
| Exclusion Criteria | |
| 1. | |
| 2. | |
| ... | |
| Reference Papers | |
| 1.NCT ID: | |
| Study Name: | |
| Condition: | |
| Intervention/Treatment: | |
| 2.NCT ID: | |
| Study Name: | |
| Condition: | |
| Intervention/Treatment: | |
| . | |
| . | |
| .""") | |
| response_text = response.text | |
| return response_text | |
| async def extract_criteria(text): | |
| """Extracts inclusion and exclusion criteria from text.""" | |
| patterns = { | |
| "inclusion": r'Inclusion Criteria:?(.*?)(?=Exclusion Criteria)', | |
| "exclusion": r'Exclusion Criteria:?(.*?)(?=Reference Papers|\n\n\n)' | |
| } | |
| inclusion = re.search(patterns["inclusion"], text, re.DOTALL | re.IGNORECASE) | |
| exclusion = re.search(patterns["exclusion"], text, re.DOTALL | re.IGNORECASE) | |
| return ( | |
| "Inclusion Criteria:\n" + (inclusion.group(1).strip() if inclusion else "Not found") + "\n\n" + | |
| "Exclusion Criteria:\n" + (exclusion.group(1).strip() if exclusion else "Not found") | |
| ) | |
| async def run_function_on_text(top_k, study_obj, study_type, phase, purpose, allocation, intervention_model, Masking, conditions, interventions, location_countries, removed_location_countries): | |
| """Runs the main function to process study information and generate formatted output.""" | |
| query_engine_get_study = CitationQueryEngine.from_args( | |
| index_persisted, | |
| similarity_top_k=top_k, | |
| citation_chunk_size=2048, | |
| verbose=True, | |
| node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.8)], | |
| use_async=True | |
| ) | |
| study_information = f""" | |
| # Study Objectives/Description | |
| {study_obj} | |
| # Intervention | |
| {interventions} | |
| # Location | |
| - Location_Countries: {location_countries} | |
| - Removed Location: {removed_location_countries} | |
| # Conditions | |
| Cancer {conditions} | |
| # Study Design | |
| - Study Type: {study_type} | |
| - Phase: {phase} | |
| - Primary Purpose: {purpose} | |
| - Allocation: {allocation} | |
| - Interventional Model: {intervention_model} | |
| - Masking: None {Masking} | |
| """ | |
| criteria, metadata_list = await get_criteria(study_information, top_k) | |
| if criteria != "Empty Response": | |
| processed_ref = await process_reference(metadata_list) | |
| response = await get_response(criteria, processed_ref) | |
| combine_criteria = await extract_criteria(response) | |
| # Extract and format references | |
| pattern = r'Reference Papers\s*(.+)$' | |
| match = re.search(pattern, response, re.DOTALL | re.IGNORECASE) | |
| ext_ref = match.group(1) if match else "" | |
| split_ref = re.split(r'\n*\d+\.\s+', ext_ref)[1:] | |
| formatted_ref = [] | |
| for i, ref in enumerate(split_ref, 1): | |
| nct_id = re.search(r'NCT[_ ]ID: (NCT\d+)', ref) | |
| if not nct_id: | |
| nct_id = re.search(r'(NCT\d+)', ref) | |
| if not nct_id: | |
| continue | |
| study_name = re.search(r'Study[_ ]Name:?\s*(.*?)(?=\n|;|Condition:|Intervention/Treatment:|$)', ref, re.DOTALL) | |
| condition = re.search(r'Condition:?\s*(.*?)(?=\n|;|Intervention/Treatment:|$)', ref, re.DOTALL) | |
| intervention = re.search(r'Intervention/Treatment:?\s*(.*?)(?=\n|$)', ref, re.DOTALL) | |
| formatted_ref.append([ | |
| i, | |
| f'<a href="https://clinicaltrials.gov/study/{nct_id.group(1)}"><u>{nct_id.group(1)}</u></a>', | |
| study_name.group(1).strip() if study_name else "", | |
| condition.group(1).strip() if condition else "", | |
| intervention.group(1).strip() if intervention else "" | |
| ]) | |
| else: | |
| combine_criteria, formatted_ref = "Empty Response", [] | |
| return combine_criteria, formatted_ref | |
| # # LLM.complete | |
| # complete_response = await llm.acomplete(f""" | |
| # Based on the provided instructions and clinical trial information, generate the new eligibility criteria by analyzing clinical trial information(### Clinical Trial Information). | |
| # ### Instruction: | |
| # Criteria generation: | |
| # As a clinical researcher, generate new eligibility criteria for given clinical trial information. | |
| # Ensure the criteria are clear, specific, and reasonable for a clinical research information. | |
| # Prioritize the following topics in clinical trial information.: | |
| # 1. Study Objectives | |
| # 2. Study Design and Phases | |
| # 3. Conditions | |
| # 4. Intervention/Treatment | |
| # Please follow the pattern of the output(### Pattern of the output). | |
| # -------------------------------------------------- | |
| # ### Clinical Trial Information | |
| # {study_information} | |
| # -------------------------------------------------- | |
| # ### Pattern of the output | |
| # Inclusion Criteria | |
| # 1. | |
| # 2. | |
| # . | |
| # . | |
| # . | |
| # Exclusion Criteria | |
| # 1. | |
| # 2. | |
| # . | |
| # . | |
| # . | |
| # """ | |
| # ) | |
| # combine_response = await llm.acomplete(f""" | |
| # Based on the provided instructions clinical, clinical trial information, and criteria information, generate the appropriate eligibility criteria for ### Clinical Trial Information by analyze clinical trial information(### Clinical Trial Information), criteria 1 (### Criteria 1) and criteria 2 (### Criteria 2). | |
| # ### Instruction: | |
| # Criteria generation: | |
| # As a clinical researcher, generate appropriate eligibility criteria by analyzing given information. | |
| # Ensure the criteria are clear, specific, and reasonable for a clinical research information(### Clinical Trial Information). | |
| # Prioritize the following topics in clinical trial information.: | |
| # 1. Study Objectives | |
| # 2. Study Design and Phases | |
| # 3. Conditions | |
| # 4. Intervention/Treatment | |
| # Do not generate redundant inclusion and exclusion criteria. For example, if a criterion is included in one set of inclusion or exclusion criteria, do not include it again. | |
| # Reference Papers generation: | |
| # Please give us NCT IDs and study names from the references list in ### Criteria 1. | |
| # Please follow the pattern of the output(### Pattern of the output). | |
| # -------------------------------------------------- | |
| # ### Clinical Trial Information | |
| # {study_information} | |
| # -------------------------------------------------- | |
| # ### Criteria 1 | |
| # {query_response} | |
| # -------------------------------------------------- | |
| # ### Criteria 2 | |
| # {complete_response} | |
| # -------------------------------------------------- | |
| # ### Pattern of the output | |
| # Inclusion Criteria | |
| # 1. | |
| # 2. | |
| # . | |
| # . | |
| # . | |
| # Exclusion Criteria | |
| # 1. | |
| # 2. | |
| # . | |
| # . | |
| # . | |
| # Reference Papers | |
| # 1.NCT ID: | |
| # Study Name: | |
| # Condition: | |
| # Intervention/Treatment: | |
| # 2.NCT ID: | |
| # Study Name: | |
| # Condition: | |
| # Intervention/Treatment: | |
| # . | |
| # . | |
| # . | |
| # """ | |
| # ) | |
| # return query_response | |
| # return query_response,complete_response,combine_response | |
| # Place holder | |
| place_holder = f"""Study Objectives | |
| The purpose of this study is to evaluate the safety, tolerance and efficacy of Liposomal Paclitaxel With Nedaplatin as First-line in patients with Advanced or Recurrent Esophageal Carcinoma | |
| Conditions: Esophageal Carcinoma | |
| Intervention / Treatment: | |
| DRUG: Liposomal Paclitaxel, | |
| DRUG: Nedaplatin | |
| Location: China | |
| Study Design and Phases | |
| Study Type: INTERVENTIONAL | |
| Phase: PHASE2 Primary Purpose: | |
| TREATMENT Allocation: NA | |
| Interventional Model: SINGLE_GROUP Masking: NONE | |
| """ | |
| objective_place_holder = f"""Example: The purpose of this study is to evaluate the safety, tolerance and efficacy of Liposomal Paclitaxel With Nedaplatin as First-line in patients with Advanced or Recurrent Esophageal Carcinoma | |
| """ | |
| conditions_place_holder = f"""Example: Esophageal Carcinoma | |
| """ | |
| interventions_place_holder = f"""Example: | |
| - Drug: irinotecan hydrochloride | |
| - Given IV | |
| - Other Names: | |
| - Campto | |
| - Camptosar | |
| - CPT-11 | |
| - irinotecan | |
| - U-101440E | |
| - Drug: Amoxicillin hydrate | |
| - Amoxicillin hydrate (potency) | |
| - Biological: Pneumococcal Vaccine | |
| - Subcutaneously on Day 0 | |
| - Other Names: | |
| - Prevnar | |
| - Drug: Doxorubicin, Cotrimoxazole, Carboplatin, Ifosfamide | |
| - Drug: Irinotecan | |
| - Irinotecan will be administered at a dose of 180mg/m2 IV over 90 minutes on day 21 every 42 days. | |
| - Other Names: | |
| - CAMPTOSAR™ | |
| - Drug: Placeblo | |
| - Placebo tablet | |
| """ | |
| with gr.Blocks() as demo: | |
| # Study description | |
| with gr.Row(): | |
| gr.Markdown("# Research Information"), | |
| with gr.Row(): | |
| study_obj_box = gr.Textbox( | |
| label="Study Objective / Study Description", | |
| placeholder=objective_place_holder, | |
| lines=10) | |
| # Conditions | |
| with gr.Row(): | |
| gr.Markdown("# Conditions"), | |
| with gr.Row(): | |
| conditions_box = gr.Textbox( | |
| label="Conditions / Disease", | |
| info="Primary Disease or Condition of Cancer Being Studied in the Trial, or the Focus of the Study", | |
| placeholder=conditions_place_holder, | |
| ) | |
| #Interventions | |
| with gr.Row(): | |
| gr.Markdown("# Interventions / Drugs"), | |
| with gr.Row(): | |
| intervention_box = gr.Textbox( | |
| label="Intervention type", | |
| info="A process or action studied in a clinical trial, including drugs, devices, procedures, vaccines, or noninvasive approaches.", | |
| placeholder=interventions_place_holder, | |
| # lines=5, | |
| ) | |
| # Study Design | |
| with gr.Row(): | |
| gr.Markdown("# Study Design"), | |
| with gr.Column(): | |
| study_type_box = gr.Radio( | |
| ["Expanded Access", "Interventional", "Observational"], | |
| label="Study Type", | |
| ) | |
| phase_box= gr.CheckboxGroup( | |
| ["Not Applicable", "Early Phase 1", "Phase 1", "Phase 2", "Phase 3", "Phase 4"], | |
| label="Phase" | |
| ) | |
| purpose_box = gr.Radio( | |
| ["Treatment", "Prevention", "Diagnostic", "Educational/Counseling/Training", "Supportive Care", "Screening", "Health Services Research", "Basic Science", "Device Feasibility", "Other"], | |
| label="Primary Purpose" | |
| ) | |
| allocation_box = gr.Radio( | |
| ["Randomized", "Non-Randomized", "N/A"], | |
| label="Allocation" | |
| ) | |
| intervention_model_box = gr.Radio( | |
| ["Parallel", "Single-Group", "Crossover", "Factorial", "Sequential"], | |
| label="Interventional Model" | |
| ) | |
| masking_box = gr.Radio( | |
| ["None (Open Label)", "Single", "Double", "Triple", "Quadruple"], | |
| label="Masking" | |
| ) | |
| #Location | |
| with gr.Row(): | |
| gr.Markdown("# Location"), | |
| with gr.Column(): | |
| location_box = gr.Textbox( | |
| label="Location (Countries)", | |
| ) | |
| removed_location_box = gr.Textbox( | |
| label="Removed Location (Countries)", | |
| ) | |
| # Reference paper | |
| with gr.Row(): | |
| gr.Markdown("# Reference paper"), | |
| with gr.Row(): | |
| top_k_box = gr.Slider( | |
| label="Amount of reference paper", | |
| value=10, | |
| minimum=0, | |
| maximum=30, | |
| step=1, | |
| ) | |
| # Submit & Clear | |
| with gr.Row(): | |
| submit_button = gr.Button("Submit") | |
| clear_button = gr.Button("Clear") | |
| # Output | |
| with gr.Row(): | |
| gr.Markdown("# Eligibility Criteria Generation"), | |
| with gr.Row(): | |
| with gr.Column(): | |
| base_box = gr.Textbox( | |
| label="Response", | |
| lines=15, | |
| interactive=False) | |
| with gr.Row(): | |
| ref_table = gr.Dataframe( | |
| label="Reference", | |
| headers=["No.",'Link', 'Study name', 'Intervention', 'Condition'], | |
| datatype=["markdown","html","markdown", "markdown","markdown"], | |
| wrap=True, | |
| interactive=False) | |
| # with gr.Column(): | |
| # rag_box = gr.Textbox( | |
| # label="Response 2", | |
| # lines=15, | |
| # interactive=False) | |
| # with gr.Column(): | |
| # combine_box = gr.Textbox( | |
| # label="Response 3", | |
| # lines=15, | |
| # interactive=False) | |
| with gr.Row(): | |
| regenerate_button = gr.Button("Regenerate") | |
| inputs_information = [top_k_box, study_obj_box, study_type_box, phase_box, purpose_box, allocation_box, intervention_model_box, masking_box, conditions_box, intervention_box, location_box, removed_location_box] | |
| outputs_information = [base_box,ref_table] | |
| # outputs_information = [base_box, rag_box,combine_box] | |
| submit_button.click( | |
| run_function_on_text, | |
| inputs=inputs_information, | |
| outputs=outputs_information | |
| ) | |
| regenerate_button.click( | |
| run_function_on_text, | |
| inputs=inputs_information, | |
| outputs=outputs_information | |
| ) | |
| clear_button.click(lambda : [None] * len(inputs_information), outputs=inputs_information) | |
| # with gr.Row(): | |
| # selected_response = gr.Radio( | |
| # choices=[ | |
| # "Response 1", | |
| # "Response 2", | |
| # "Response 3", | |
| # "All responses are equally good", | |
| # "Neither response is satisfactory" | |
| # ], | |
| # label="Select the best response" | |
| # ) | |
| # with gr.Row(): | |
| # flag_button = gr.Button("Flag Selected Response") | |
| # #Flagging | |
| # dataset_name = "ravistech/feedback-demo-space" | |
| # hf_writer = gr.HuggingFaceDatasetSaver(hf_token=token_w, dataset_name=dataset_name, private=True) | |
| # hf_writer.setup([selected_response, study_obj_box, study_type_box, phase_box, purpose_box, allocation_box, intervention_model_box, masking_box, conditions_box, intervention_box, location_box, removed_location_box, top_k_box, base_box, rag_box, combine_box],dataset_name) | |
| # flag_button.click(lambda *args: hf_writer.flag(list(args)), [selected_response, study_obj_box, study_type_box, phase_box, purpose_box, allocation_box, intervention_model_box, masking_box, conditions_box, intervention_box, location_box, removed_location_box, top_k_box, base_box, rag_box, combine_box], None, preprocess=False) | |
| #Clear all | |
| with gr.Row(): | |
| clear_all_button = gr.Button("Clear All") | |
| # flag_response = [selected_response] | |
| all_information = inputs_information + outputs_information #+ flag_response | |
| clear_all_button.click(lambda : [None] * len(all_information), outputs=all_information) | |
| if __name__ == "__main__": | |
| demo.launch(debug=True) |