Spaces:

ravistech
/

Ravis-gemini

Sleeping

App Files Files

buildinqq committed on Nov 5, 2024

Commit

2281b4b

verified ·

1 Parent(s): 061497c

Update app.py

Browse files

2 steps to improve reference accuracy

Files changed (1) hide show

app.py +148 -89

app.py CHANGED Viewed

@@ -9,6 +9,8 @@ from llama_index.llms.gemini import Gemini
 from llama_index.core.postprocessor import SimilarityPostprocessor
 from llama_index.core.storage.docstore import SimpleDocumentStore
 from llama_index.core import StorageContext, load_index_from_storage
 import re
 import pandas as pd
 import gradio as gr
@@ -101,9 +103,118 @@ async def clean_trial_text(text):
         cleaned_sections.pop(reference_title_index)
     return '\n'.join(cleaned_sections).strip()
-async def run_function_on_text(top_k,study_obj,study_type,phase,purpose,allocation,intervention_model,Masking,conditions,interventions,location_countries,removed_location_countries):
   # Set up query engine
   query_engine_get_study = CitationQueryEngine.from_args(
     index_persisted,
@@ -114,7 +225,7 @@ async def run_function_on_text(top_k,study_obj,study_type,phase,purpose,allocati
     use_async=True
   )
-  #Build prompt
   study_information = f"""
   #Study Objectives/Study Description
   {study_obj}
@@ -138,88 +249,44 @@ async def run_function_on_text(top_k,study_obj,study_type,phase,purpose,allocati
     - Masking: None {Masking}
   """
-  # Query
-  query_response = await query_engine_get_study.aquery(f"""
-      Based on the provided instructions and clinical trial information, generate the new eligibility criteria by analyzing the related studies and clinical trial information.
-      ### Instruction:
-      Find suitable papers that have relevant or similar to the clinical trial information(### Clinical Trial Information).
-      Prioritize the following topics when finding related studies:
-      1. Study Objectives
-      2. Study Design and Phases
-      3. Conditions
-      4. Intervention/Treatment
-      Criteria generation:
-      As a clinical researcher, generate new eligibility criteria for given clinical trial information.
-      Analyze the information from related studies for more precise new eligibility criteria generation.
-      Ensure the criteria are clear, specific, and reasonable for a clinical research information.
-      Please generate list of Reference Papers
-      Reference Papers generation:
-      Please give us NCT IDs and study names for {top_k} used papers.
-      Please follows the pattern of the output(### Pattern of the output).
-      --------------------------------------------------
-      ### Clinical Trial Information
-      {study_information}
-      --------------------------------------------------
-      ### Pattern of the output
-      Exclusion Criteria
-      1.
-      2.
-      .
-      .
-      .
-      Inclusion Criteria
-      1.
-      2.
-      .
-      .
-      .
-      Reference Papers
-      1.NCT ID:
-        Study Name:
-        Condition:
-        Intervention/Treatment:
-      2.NCT ID:
-        Study Name:
-        Condition:
-        Intervention/Treatment:
-      .
-      .
-      .
-      """
-    )
-  #Extract ref
-  if query_response.response != "Empty Response":
-      # pattern = r'Reference Papers:?\s*(.+)$'
-      pattern = r'(?:Reference Papers\n)(.*?)(?:\n\n[A-Za-z]|$)'
-      match = re.search(pattern, query_response.response, re.DOTALL | re.IGNORECASE)
-      ext_ref = match.group(1) if match and match.group(1) else ''
-      split_ref = re.split(r'\n*\d+\.\s+', ext_ref)[1:]
-      # print(split_ref)
-      formatted_ref = []
-      n=0
-      for ref in split_ref:
-        n+=1
-        nct_match = re.search(r'(NCT\d+)', ref)
         if nct_match:
-            # nct_id = nct_match.group(1)
-            # study_name = re.search(r'Study Name:?\s*(.*?)(?=\n|Condition:|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
-            # condition = re.search(r'Condition:?\s*(.*?)(?=\n|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
-            # intervention = re.search(r'Intervention/Treatment:?\s*(.*?)(?=\n|$)', ref, re.DOTALL).group(1).strip()
             nct_id = nct_match.group(1)
-            study_name = ref
-            condition = ""
-            intervention = ""
         else:
-            continue
         formatted_trial = [
             n,
@@ -229,20 +296,12 @@ async def run_function_on_text(top_k,study_obj,study_type,phase,purpose,allocati
             intervention
         ]
         formatted_ref.append(formatted_trial)
   else:
     formatted_ref = []
-  # return query_response,formatted_ref
-  #Extract criteria
-  if query_response.response == "Empty Response":
-    return query_response
-  else:
-    combine_criteira = re.sub(r'#+\s*', '', query_response.response).strip()
-    combine_criteira = re.sub(r'\*\*', '', combine_criteira).strip()
-    combine_criteira = re.sub(r'(Criteria)\n\s*\n(\d+\.)', r'\1\n\2', combine_criteira).strip()
-    combine_criteira = await clean_trial_text(combine_criteira)
-    return query_response,formatted_ref
   # # LLM.complete
   # complete_response  = await llm.acomplete(f"""

 from llama_index.core.postprocessor import SimilarityPostprocessor
 from llama_index.core.storage.docstore import SimpleDocumentStore
 from llama_index.core import StorageContext, load_index_from_storage
+from llama_index.core.data_structs import Node
+from llama_index.core.schema import NodeWithScore
 import re
 import pandas as pd
 import gradio as gr
         cleaned_sections.pop(reference_title_index)
     return '\n'.join(cleaned_sections).strip()
async def get_criteria(study_information, top_k, query_engine=None):
    """Step 1: ask the citation query engine for draft eligibility criteria.

    Args:
        study_information: Formatted clinical-trial description that is
            interpolated into the prompt.
        top_k: Number of related studies the prompt tells the model to
            analyze.
        query_engine: Optional object exposing an awaitable
            ``aquery(prompt)`` method. When omitted, the module-level name
            ``query_engine_get_study`` is used, which is the only behavior
            this function had before the parameter was added.

    Returns:
        A ``(response_text, metadata_list)`` tuple, where ``response_text``
        is the ``response`` attribute of the query result and
        ``metadata_list`` holds ``get_metadata_str()`` of every source node,
        in the order the engine returned them.
    """
    # NOTE(review): the only assignment of `query_engine_get_study` visible in
    # this file is a local variable inside `run_function_on_text`, so the
    # fallback below raises NameError unless a module-level engine with that
    # name also exists. Prefer passing `query_engine=` explicitly.
    if query_engine is None:
        query_engine = query_engine_get_study

    criteria_response = await query_engine.aquery(f"""
      Based on the provided instructions and clinical trial information, generate the new eligibility criteria specific for clinical trial information.
      ### Instruction:
      Find suitable papers that are relevant or similar to the provided clinical trial information (### Clinical Trial Information).
      Prioritize the following topics when finding related studies:
      1. Study Objectives
      2. Study Design and Phases
      3. Conditions
      4. Intervention/Treatment
      Criteria Generation:
      As a clinical researcher, generate new eligibility criteria for the given clinical trial information.
      Analyze the information from all {top_k} related studies to generate new precise eligibility criteria.
      Ensure that the criteria are specific for the given clinical trial information (### Clinical Trial Information).
      Please follow the pattern of the output (### Pattern of the output).
      --------------------------------------------------
      ### Clinical Trial Information
      {study_information}
      --------------------------------------------------
      ### Pattern of the Output
      Inclusion Criteria
      1.
      2.
      ...
      Exclusion Criteria
      1.
      2.
      ...
      """
    )

    # Keep the metadata in source-node order: the bracketed citation numbers
    # in the generated criteria are matched against this list by position.
    metadata_list = [
        source.node.get_metadata_str()
        for source in criteria_response.source_nodes
    ]
    return criteria_response.response, metadata_list
async def process_reference(metadata_list):
    """Render the metadata entries as a 1-based numbered list.

    Each entry becomes ``"<position>. <entry>"`` and the entries are joined
    with a single newline; an empty input yields an empty string.
    """
    numbered_lines = []
    for position, meta in enumerate(metadata_list, start=1):
        numbered_lines.append(f"{position}. {meta}")
    return "\n".join(numbered_lines)
async def get_response(criteria, reference):
    """Step 2: have the LLM prune and renumber references against the criteria.

    Builds a prompt from the draft eligibility criteria and the numbered
    reference metadata, sends it through the module-level ``llm`` via
    ``acomplete``, and returns the ``text`` attribute of the completion.

    Args:
        criteria: Eligibility-criteria text interpolated into the prompt.
        reference: Numbered reference metadata interpolated into the prompt.

    Returns:
        The completion text produced by ``llm``.
    """
    prompt = f"""
    ### Task Description:
    You are tasked with processing clinical trial metadata and eligibility criteria. The goal is to clean, reorder, and maintain consistency between the metadata and references used in eligibility criteria.
    ### Instructions:
    1. Review the eligibility criteria provided, which include references to metadata numbers (e.g., [1], [2], etc.). Identify all reference numbers that are actually used in the criteria.
    2. Remove metadata of reference papers (### Metadata of Reference Papers) that does not have a corresponding reference in the eligibility criteria. This will ensure only relevant references are kept.
    3. Reorder the remaining metadata so that they are numbered sequentially, starting from 1.
    4. Update the reference numbers in the eligibility criteria accordingly to reflect the new order.
    5. Maintain Criteria Consistency: Ensure that the eligibility criteria remain exactly the same in terms of content, but the reference numbers are updated to match the new numbering of metadata.
    --------------------------------------------------
    ### Eligibility Criteria
    {criteria}
    --------------------------------------------------
    ### Metadata of Reference Papers
    {reference}
    --------------------------------------------------
    ### Pattern of the Output
    Inclusion Criteria
    1.
    2.
    ...
    Exclusion Criteria
    1.
    2.
    ...
    Reference Papers
    1.NCT ID:
      Study Name:
      Condition:
      Intervention/Treatment:
    2.NCT ID:
      Study Name:
      Condition:
      Intervention/Treatment:
    .
    .
    ."""
    completion = await llm.acomplete(prompt)
    return completion.text
# Section headings may be followed by a colon and/or Markdown emphasis markers
# ("Inclusion Criteria", "Inclusion Criteria:", "**Inclusion Criteria:**").
# Each section runs until the next heading (ignoring Markdown markers and
# whitespace in front of it) or the end of the text.
_INCLUSION_RE = re.compile(
    r'Inclusion Criteria[:*]*(.*?)'
    r'(?=[#*\s]*(?:Exclusion Criteria|Reference Papers)|\s*\Z)',
    re.DOTALL,
)
_EXCLUSION_RE = re.compile(
    r'Exclusion Criteria[:*]*(.*?)'
    r'(?=[#*\s]*Reference Papers|\n\n\n|\s*\Z)',
    re.DOTALL,
)


async def extract_criteria(text):
    """Pull the inclusion and exclusion sections out of an LLM response.

    This is a coroutine function, so callers must ``await`` it to obtain the
    string result.

    The headings are matched case-sensitively so that a sentence mentioning
    "exclusion criteria" inside a criterion does not end a section early.
    The colon after a heading is optional because the output pattern requested
    from the model lists the headings without one.

    Args:
        text: Raw response text expected to contain an "Inclusion Criteria"
            section followed by an "Exclusion Criteria" section, optionally
            followed by "Reference Papers".

    Returns:
        ``"Inclusion Criteria:\\n<inclusion>\\n\\nExclusion Criteria:\\n<exclusion>"``
        where a section that cannot be located is replaced by ``"Not found"``.
    """
    inclusion_match = _INCLUSION_RE.search(text)
    cleaned_inclusion = (
        inclusion_match.group(1).strip() if inclusion_match else "Not found"
    )

    exclusion_match = _EXCLUSION_RE.search(text)
    cleaned_exclusion = (
        exclusion_match.group(1).strip() if exclusion_match else "Not found"
    )

    return (
        "Inclusion Criteria:\n" + cleaned_inclusion + "\n\n"
        + "Exclusion Criteria:\n" + cleaned_exclusion
    )
+async def run_function_on_text(top_k,study_obj,study_type,phase,purpose,allocation,intervention_model,Masking,conditions,interventions,location_countries,removed_location_countries):
   # Set up query engine
   query_engine_get_study = CitationQueryEngine.from_args(
     index_persisted,
     use_async=True
   )
+  # Build prompt
   study_information = f"""
   #Study Objectives/Study Description
   {study_obj}
     - Masking: None {Masking}
   """
+    # Call step 1
+  criteria, metadata_list = await get_criteria(study_information, top_k)
+  if criteria != "Empty Response":
+    processed_ref = await process_reference(metadata_list)
+    # Call step 2
+    response = await get_response(criteria, processed_ref)
+    # Extract Criteria
+    combine_criteria = extract_criteria(response)
+    # Extract Ref
+    pattern = r'Reference Papers\s*(.+)$'
+    # pattern = r'Reference Papers:?\s*(.*?)(?:\n\n.*$|$)'
+    match = re.search(pattern, response, re.DOTALL | re.IGNORECASE)
+    ext_ref = match.group(1) if match and match.group(1) else ''
+    split_ref = re.split(r'\n*\d+\.\s+', ext_ref)[1:]
+    formatted_ref = []
+    n=0
+    for ref in split_ref:
+        nct_match = re.search(r'NCT[_ ]ID: (NCT\d+)', ref)
         if nct_match:
             nct_id = nct_match.group(1)
         else:
+            nct_match = re.search(r'(NCT\d+)', ref)
+            if nct_match:
+                nct_id = nct_match.group(1)
+            else:
+                continue
+        n+=1
+        study_name = re.search(r'Study[_ ]Name:?\s*(.*?)(?=\n|Condition:|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
+        condition = re.search(r'Condition:?\s*(.*?)(?=\n|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
+        intervention = re.search(r'Intervention/Treatment:?\s*(.*?)(?=\n|$)', ref, re.DOTALL).group(1).strip()
+        study_name = re.sub(r'\*+', '', study_name).strip()
+        condition = re.sub(r'\*+', '', condition).strip()
+        intervention = re.sub(r'\*+', '', intervention).strip()
         formatted_trial = [
             n,
             intervention
         ]
         formatted_ref.append(formatted_trial)
   else:
+    combine_criteria = "Empty Response"
     formatted_ref = []
+  return combine_criteria, formatted_ref
   # # LLM.complete
   # complete_response  = await llm.acomplete(f"""