Update app.py

app.py CHANGED
@@ -68,27 +68,21 @@ storage_context = StorageContext.from_defaults(persist_dir="VectorStore")
 index_persisted = load_index_from_storage(storage_context, index_id="vector_index")

 async def remove_ref(text):
+    """Removes content after 'Reference Papers' (case-insensitive)."""
     split_text = re.split(r'\bReference Papers\b', text, flags=re.IGNORECASE)
-
-    if len(split_text) > 1:
-        return split_text[0].strip()
-
-    return text.strip()
+    return split_text[0].strip() if len(split_text) > 1 else text.strip()

 async def clean_trial_text(text):
-
-    cleaned_sections = []
-
-    found_numbers = False
-    has_intro_text = False
-    reference_title_index = -1
+    """Removes intro text from references if present."""
+    sections, cleaned_sections, in_references = text.split('\n'), [], False
+    has_intro_text, found_numbers, reference_title_index = False, False, -1

     for i, line in enumerate(sections):
         if re.match(r'Reference Papers\s*$', line, re.IGNORECASE):
-            in_references = True
-
-            cleaned_sections.append(line)
+            in_references, reference_title_index = True, len(cleaned_sections)
+            cleaned_sections.append(line)
             continue
+
         if in_references and not found_numbers:
             if re.match(r'\d+\.', line.strip()):
                 found_numbers = True
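As a quick check of the refactored remove_ref, a minimal standalone sketch: the function body is copied from the diff, while the sample text and expected output are made up for illustration.

    import asyncio
    import re

    async def remove_ref(text):
        """Removes content after 'Reference Papers' (case-insensitive)."""
        split_text = re.split(r'\bReference Papers\b', text, flags=re.IGNORECASE)
        return split_text[0].strip() if len(split_text) > 1 else text.strip()

    sample = "Inclusion Criteria:\n1. Adults 18+\n\nReference Papers\n1. NCT00000000"
    print(asyncio.run(remove_ref(sample)))  # prints everything before "Reference Papers"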
@@ -97,13 +91,17 @@ async def clean_trial_text(text):
                 has_intro_text = True
             cleaned_sections.append(line)
             continue
+
         if not in_references:
             cleaned_sections.append(line)
+
     if in_references and not has_intro_text and reference_title_index != -1:
         cleaned_sections.pop(reference_title_index)
+
     return '\n'.join(cleaned_sections).strip()

 async def get_criteria(study_information, top_k):
+    """Fetches eligibility criteria and metadata for a study."""
     criteria_response = await query_engine_get_study.aquery(f"""
     Based on the provided instructions and clinical trial information, generate the new eligibility criteria specific for clinical trial information.

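The dangling-heading cleanup at the end of clean_trial_text can be seen in isolation; the condition below is verbatim from the diff, while the list contents and index values are hypothetical.

    cleaned_sections = ["1. Must be 18 or older", "Reference Papers"]
    in_references, has_intro_text, reference_title_index = True, False, 1

    # Drop a "Reference Papers" title that has no intro text beneath it.
    if in_references and not has_intro_text and reference_title_index != -1:
        cleaned_sections.pop(reference_title_index)

    print(cleaned_sections)  # ['1. Must be 18 or older']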
@@ -135,23 +133,16 @@ async def get_criteria(study_information, top_k):
     1.
     2.
     ...
-    """
-    )
-    metadata_list = []
-
-    for source in criteria_response.source_nodes:
-        meta_data = source.node.get_metadata_str()
-        metadata_list.append(meta_data)
-
+    """)
+    metadata_list = [source.node.get_metadata_str() for source in criteria_response.source_nodes]
     return criteria_response.response, metadata_list

 async def process_reference(metadata_list):
-
-
-
-    return joined_str
+    """Formats metadata list into a numbered string."""
+    return "\n".join([f"{i + 1}. {meta}" for i, meta in enumerate(metadata_list)])

 async def get_response(criteria, reference):
+    """Processes eligibility criteria and updates references to match new numbering."""
     response = await llm.acomplete(f"""
     ### Task Description:
     You are tasked with processing clinical trial metadata and eligibility criteria. The goal is to clean, reorder, and maintain consistency between the metadata and references used in eligibility criteria.
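The rewritten process_reference builds the numbered reference block with a single comprehension; on hypothetical metadata strings the join behaves like this.

    metadata_list = [
        "NCT_ID: NCT01234567; Study_Name: Trial A",
        "NCT_ID: NCT07654321; Study_Name: Trial B",
    ]
    print("\n".join([f"{i + 1}. {meta}" for i, meta in enumerate(metadata_list)]))
    # 1. NCT_ID: NCT01234567; Study_Name: Trial A
    # 2. NCT_ID: NCT07654321; Study_Name: Trial B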
@@ -196,112 +187,89 @@ async def get_response(criteria, reference):
     return response_text

 async def extract_criteria(text):
-
-
-
-
-
-
-
-
-
-
-
-
-    # Format and return results
-    return (
-        "Inclusion Criteria:\n" + cleaned_inclusion + "\n\n" +
-        "Exclusion Criteria:\n" + cleaned_exclusion
-    )
-
-async def run_function_on_text(top_k,study_obj,study_type,phase,purpose,allocation,intervention_model,Masking,conditions,interventions,location_countries,removed_location_countries):
-    # Set up query engine
-    query_engine_get_study = CitationQueryEngine.from_args(
-        index_persisted,
-        similarity_top_k=top_k,
-        citation_chunk_size=2048,
-        verbose=True,
-        node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.8)],
-        use_async=True
-    )
-
-    # Build prompt
-    study_information = f"""
-    #Study Objectives/Study Description
-    {study_obj}
-
-    #Intervention
-    {interventions}
+    """Extracts inclusion and exclusion criteria from text."""
+    patterns = {
+        "inclusion": r'Inclusion Criteria:?(.*?)(?=Exclusion Criteria)',
+        "exclusion": r'Exclusion Criteria:?(.*?)(?=Reference Papers|\n\n\n)'
+    }
+    inclusion = re.search(patterns["inclusion"], text, re.DOTALL | re.IGNORECASE)
+    exclusion = re.search(patterns["exclusion"], text, re.DOTALL | re.IGNORECASE)
+
+    return (
+        "Inclusion Criteria:\n" + (inclusion.group(1).strip() if inclusion else "Not found") + "\n\n" +
+        "Exclusion Criteria:\n" + (exclusion.group(1).strip() if exclusion else "Not found")
+    )

-
+async def run_function_on_text(top_k, study_obj, study_type, phase, purpose, allocation, intervention_model, Masking, conditions, interventions, location_countries, removed_location_countries):
+    """Runs the main function to process study information and generate formatted output."""
+    query_engine_get_study = CitationQueryEngine.from_args(
+        index_persisted,
+        similarity_top_k=top_k,
+        citation_chunk_size=2048,
+        verbose=True,
+        node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.8)],
+        use_async=True
+    )
+
+    study_information = f"""
+    # Study Objectives/Description
+    {study_obj}
+
+    # Intervention
+    {interventions}
+
+    # Location
     - Location_Countries: {location_countries}
     - Removed Location: {removed_location_countries}

-
-
+    # Conditions
+    Cancer {conditions}

-
+    # Study Design
     - Study Type: {study_type}
     - Phase: {phase}
     - Primary Purpose: {purpose}
     - Allocation: {allocation}
     - Interventional Model: {intervention_model}
     - Masking: None {Masking}
-
+    """
+
+    criteria, metadata_list = await get_criteria(study_information, top_k)
+    if criteria != "Empty Response":
+        processed_ref = await process_reference(metadata_list)
+        response = await get_response(criteria, processed_ref)
+        combine_criteria = await extract_criteria(response)
+
+        # Extract and format references
+        pattern = r'Reference Papers\s*(.+)$'
+        match = re.search(pattern, response, re.DOTALL | re.IGNORECASE)
+        ext_ref = match.group(1) if match else ""
+        split_ref = re.split(r'\n*\d+\.\s+', ext_ref)[1:]
+
+        formatted_ref = []
+        for i, ref in enumerate(split_ref, 1):
+            nct_id = re.search(r'NCT[_ ]ID: (NCT\d+)', ref)
+            if not nct_id:
+                nct_id = re.search(r'(NCT\d+)', ref)
+                if not nct_id:
+                    continue

-
-
-
-    processed_ref = await process_reference(metadata_list)
+            study_name = re.search(r'Study[_ ]Name:?\s*(.*?)(?=\n|;|Condition:|Intervention/Treatment:|$)', ref, re.DOTALL)
+            condition = re.search(r'Condition:?\s*(.*?)(?=\n|;|Intervention/Treatment:|$)', ref, re.DOTALL)
+            intervention = re.search(r'Intervention/Treatment:?\s*(.*?)(?=\n|$)', ref, re.DOTALL)

-
-
-
-
-
-
-
-
-
-
-
-
-
-        formatted_ref = []
-        n=0
-        for ref in split_ref:
-            nct_match = re.search(r'NCT[_ ]ID: (NCT\d+)', ref)
-            if nct_match:
-                nct_id = nct_match.group(1)
-            else:
-                nct_match = re.search(r'(NCT\d+)', ref)
-                if nct_match:
-                    nct_id = nct_match.group(1)
-                else:
-                    continue
-            n+=1
-            study_name = re.search(r'Study[_ ]Name:?\s*(.*?)(?=\n|Condition:|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
-            condition = re.search(r'Condition:?\s*(.*?)(?=\n|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
-            intervention = re.search(r'Intervention/Treatment:?\s*(.*?)(?=\n|$)', ref, re.DOTALL).group(1).strip()
-
-            study_name = re.sub(r'\*+', '', study_name).strip()
-            condition = re.sub(r'\*+', '', condition).strip()
-            intervention = re.sub(r'\*+', '', intervention).strip()
-
-            formatted_trial = [
-                n,
-                f'<a href="https://clinicaltrials.gov/study/{nct_id}"><u>{nct_id}</u></a>',
-                study_name,
-                condition,
-                intervention
-            ]
-            formatted_ref.append(formatted_trial)
-
-        else:
-            combine_criteria = "Empty Response"
-            formatted_ref = []
-
-    return combine_criteria, formatted_ref
+            formatted_ref.append([
+                i,
+                f'<a href="https://clinicaltrials.gov/study/{nct_id.group(1)}"><u>{nct_id.group(1)}</u></a>',
+                study_name.group(1).strip() if study_name else "",
+                condition.group(1).strip() if condition else "",
+                intervention.group(1).strip() if intervention else ""
+            ])
+
+    else:
+        combine_criteria, formatted_ref = "Empty Response", []
+
+    return combine_criteria, formatted_ref

 # # LLM.complete
 # complete_response = await llm.acomplete(f"""
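The new reference-parsing loop splits the numbered "Reference Papers" block and falls back to a bare NCT pattern when the labeled one misses. A self-contained sketch on a fabricated reference block; the `or` chaining condenses the diff's nested fallback, and the `;` added to the lookaheads lets several fields sit on one line.

    import re

    # Hypothetical block in the shape the parsing code expects.
    ext_ref = ("1. NCT_ID: NCT01234567; Study_Name: Example Trial; Condition: NSCLC; Intervention/Treatment: Drug X\n"
               "2. See NCT07654321 for details\n")

    split_ref = re.split(r'\n*\d+\.\s+', ext_ref)[1:]
    for i, ref in enumerate(split_ref, 1):
        nct_id = re.search(r'NCT[_ ]ID: (NCT\d+)', ref) or re.search(r'(NCT\d+)', ref)
        if not nct_id:
            continue
        study_name = re.search(r'Study[_ ]Name:?\s*(.*?)(?=\n|;|Condition:|Intervention/Treatment:|$)', ref, re.DOTALL)
        print(i, nct_id.group(1), study_name.group(1).strip() if study_name else "")
    # 1 NCT01234567 Example Trial
    # 2 NCT07654321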
@@ -530,7 +498,7 @@ with gr.Blocks() as demo:
         with gr.Row():
             top_k_box = gr.Slider(
                 label="Amount of reference paper",
-                value=
+                value=10,
                 minimum=0,
                 maximum=30,
                 step=1,
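The slider fix supplies the default that was missing after value= in the old revision (a syntax error). A minimal sketch of how such a slider typically feeds a handler; the echo handler and output box are assumptions, since the real app wires run_function_on_text with many more inputs.

    import gradio as gr

    def echo_top_k(top_k):
        # Placeholder for run_function_on_text, which receives the slider value as top_k.
        return f"Will retrieve up to {top_k} reference papers"

    with gr.Blocks() as demo:
        with gr.Row():
            top_k_box = gr.Slider(
                label="Amount of reference paper",
                value=10,
                minimum=0,
                maximum=30,
                step=1,
            )
        out = gr.Textbox(label="Status")
        top_k_box.change(echo_top_k, inputs=top_k_box, outputs=out)

    # demo.launch()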