Spaces:
Sleeping
Sleeping
Update app.py
Browse files: 2 steps to improve reference accuracy
app.py
CHANGED
|
@@ -9,6 +9,8 @@ from llama_index.llms.gemini import Gemini
|
|
| 9 |
from llama_index.core.postprocessor import SimilarityPostprocessor
|
| 10 |
from llama_index.core.storage.docstore import SimpleDocumentStore
|
| 11 |
from llama_index.core import StorageContext, load_index_from_storage
|
|
|
|
|
|
|
| 12 |
import re
|
| 13 |
import pandas as pd
|
| 14 |
import gradio as gr
|
|
@@ -101,9 +103,118 @@ async def clean_trial_text(text):
|
|
| 101 |
cleaned_sections.pop(reference_title_index)
|
| 102 |
return '\n'.join(cleaned_sections).strip()
|
| 103 |
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
# Set up query engine
|
| 108 |
query_engine_get_study = CitationQueryEngine.from_args(
|
| 109 |
index_persisted,
|
|
@@ -114,7 +225,7 @@ async def run_function_on_text(top_k,study_obj,study_type,phase,purpose,allocati
|
|
| 114 |
use_async=True
|
| 115 |
)
|
| 116 |
|
| 117 |
-
#Build prompt
|
| 118 |
study_information = f"""
|
| 119 |
#Study Objectives/Study Description
|
| 120 |
{study_obj}
|
|
@@ -138,88 +249,44 @@ async def run_function_on_text(top_k,study_obj,study_type,phase,purpose,allocati
|
|
| 138 |
- Masking: None {Masking}
|
| 139 |
"""
|
| 140 |
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
Find suitable papers that have relevant or similar to the clinical trial information(### Clinical Trial Information).
|
| 146 |
-
Prioritize the following topics when finding related studies:
|
| 147 |
-
1. Study Objectives
|
| 148 |
-
2. Study Design and Phases
|
| 149 |
-
3. Conditions
|
| 150 |
-
4. Intervention/Treatment
|
| 151 |
-
Criteria generation:
|
| 152 |
-
As a clinical researcher, generate new eligibility criteria for given clinical trial information.
|
| 153 |
-
Analyze the information from related studies for more precise new eligibility criteria generation.
|
| 154 |
-
Ensure the criteria are clear, specific, and reasonable for a clinical research information.
|
| 155 |
-
Please generate list of Reference Papers
|
| 156 |
-
|
| 157 |
-
Reference Papers generation:
|
| 158 |
-
Please give us NCT IDs and study names for {top_k} used papers.
|
| 159 |
-
|
| 160 |
-
Please follows the pattern of the output(### Pattern of the output).
|
| 161 |
-
--------------------------------------------------
|
| 162 |
-
### Clinical Trial Information
|
| 163 |
-
{study_information}
|
| 164 |
-
--------------------------------------------------
|
| 165 |
-
### Pattern of the output
|
| 166 |
-
|
| 167 |
-
Exclusion Criteria
|
| 168 |
-
1.
|
| 169 |
-
2.
|
| 170 |
-
.
|
| 171 |
-
.
|
| 172 |
-
.
|
| 173 |
-
|
| 174 |
-
Inclusion Criteria
|
| 175 |
-
1.
|
| 176 |
-
2.
|
| 177 |
-
.
|
| 178 |
-
.
|
| 179 |
-
.
|
| 180 |
-
|
| 181 |
-
Reference Papers
|
| 182 |
-
1.NCT ID:
|
| 183 |
-
Study Name:
|
| 184 |
-
Condition:
|
| 185 |
-
Intervention/Treatment:
|
| 186 |
-
2.NCT ID:
|
| 187 |
-
Study Name:
|
| 188 |
-
Condition:
|
| 189 |
-
Intervention/Treatment:
|
| 190 |
-
.
|
| 191 |
-
.
|
| 192 |
-
.
|
| 193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
-
|
| 196 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
match = re.search(pattern, query_response.response, re.DOTALL | re.IGNORECASE)
|
| 203 |
-
ext_ref = match.group(1) if match and match.group(1) else ''
|
| 204 |
-
split_ref = re.split(r'\n*\d+\.\s+', ext_ref)[1:]
|
| 205 |
-
# print(split_ref)
|
| 206 |
-
|
| 207 |
-
formatted_ref = []
|
| 208 |
-
n=0
|
| 209 |
-
for ref in split_ref:
|
| 210 |
-
n+=1
|
| 211 |
-
nct_match = re.search(r'(NCT\d+)', ref)
|
| 212 |
if nct_match:
|
| 213 |
-
# nct_id = nct_match.group(1)
|
| 214 |
-
# study_name = re.search(r'Study Name:?\s*(.*?)(?=\n|Condition:|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
|
| 215 |
-
# condition = re.search(r'Condition:?\s*(.*?)(?=\n|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
|
| 216 |
-
# intervention = re.search(r'Intervention/Treatment:?\s*(.*?)(?=\n|$)', ref, re.DOTALL).group(1).strip()
|
| 217 |
nct_id = nct_match.group(1)
|
| 218 |
-
study_name = ref
|
| 219 |
-
condition = ""
|
| 220 |
-
intervention = ""
|
| 221 |
else:
|
| 222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
formatted_trial = [
|
| 225 |
n,
|
|
@@ -229,20 +296,12 @@ async def run_function_on_text(top_k,study_obj,study_type,phase,purpose,allocati
|
|
| 229 |
intervention
|
| 230 |
]
|
| 231 |
formatted_ref.append(formatted_trial)
|
| 232 |
-
|
| 233 |
else:
|
|
|
|
| 234 |
formatted_ref = []
|
| 235 |
|
| 236 |
-
|
| 237 |
-
#Extract criteria
|
| 238 |
-
if query_response.response == "Empty Response":
|
| 239 |
-
return query_response
|
| 240 |
-
else:
|
| 241 |
-
combine_criteira = re.sub(r'#+\s*', '', query_response.response).strip()
|
| 242 |
-
combine_criteira = re.sub(r'\*\*', '', combine_criteira).strip()
|
| 243 |
-
combine_criteira = re.sub(r'(Criteria)\n\s*\n(\d+\.)', r'\1\n\2', combine_criteira).strip()
|
| 244 |
-
combine_criteira = await clean_trial_text(combine_criteira)
|
| 245 |
-
return query_response,formatted_ref
|
| 246 |
|
| 247 |
# # LLM.complete
|
| 248 |
# complete_response = await llm.acomplete(f"""
|
|
|
|
| 9 |
from llama_index.core.postprocessor import SimilarityPostprocessor
|
| 10 |
from llama_index.core.storage.docstore import SimpleDocumentStore
|
| 11 |
from llama_index.core import StorageContext, load_index_from_storage
|
| 12 |
+
from llama_index.core.data_structs import Node
|
| 13 |
+
from llama_index.core.schema import NodeWithScore
|
| 14 |
import re
|
| 15 |
import pandas as pd
|
| 16 |
import gradio as gr
|
|
|
|
| 103 |
cleaned_sections.pop(reference_title_index)
|
| 104 |
return '\n'.join(cleaned_sections).strip()
|
| 105 |
|
| 106 |
+
async def get_criteria(study_information, top_k):
    """Ask the citation query engine for eligibility criteria for one trial.

    Args:
        study_information: Text describing the clinical trial; interpolated
            into the prompt under "### Clinical Trial Information".
        top_k: Number of related studies; only interpolated into the prompt
            text here.

    Returns:
        A ``(response_text, metadata_list)`` tuple, where ``metadata_list``
        holds ``get_metadata_str()`` of every source node of the response,
        in the order the engine returned them.
    """
    # NOTE(review): `query_engine_get_study` is not defined in this function;
    # the only visible definition is a local inside `run_function_on_text`.
    # Confirm a module-level engine exists, otherwise this raises NameError.
    prompt = f"""
Based on the provided instructions and clinical trial information, generate the new eligibility criteria specific for clinical trial information.

### Instruction:
Find suitable papers that are relevant or similar to the provided clinical trial information (### Clinical Trial Information).
Prioritize the following topics when finding related studies:
1. Study Objectives
2. Study Design and Phases
3. Conditions
4. Intervention/Treatment

Criteria Generation:
As a clinical researcher, generate new eligibility criteria for the given clinical trial information.
Analyze the information from all {top_k} related studies to generate new precise eligibility criteria.
Ensure that the criteria are specific for the given clinical trial information (### Clinical Trial Information).

Please follow the pattern of the output (### Pattern of the output).
--------------------------------------------------
### Clinical Trial Information
{study_information}
--------------------------------------------------
### Pattern of the Output
Inclusion Criteria
1.
2.
...

Exclusion Criteria
1.
2.
...
"""
    engine_reply = await query_engine_get_study.aquery(prompt)

    # One metadata string per retrieved source node, order preserved.
    source_metadata = [
        hit.node.get_metadata_str() for hit in engine_reply.source_nodes
    ]
    return engine_reply.response, source_metadata
|
| 147 |
+
|
| 148 |
+
async def process_reference(metadata_list):
    """Render reference metadata as a numbered, newline-separated string.

    Args:
        metadata_list: Iterable of metadata entries; each is formatted with
            its 1-based position as ``"<n>. <entry>"``.

    Returns:
        The numbered entries joined by ``"\\n"``; an empty string when
        ``metadata_list`` is empty.
    """
    numbered_lines = []
    for position, entry in enumerate(metadata_list, start=1):
        numbered_lines.append(f"{position}. {entry}")
    return "\n".join(numbered_lines)
|
| 153 |
+
|
| 154 |
+
async def get_response(criteria, reference):
    """Ask the LLM to reconcile criteria citations with the reference list.

    The prompt instructs the model to drop reference metadata that the
    criteria never cite, renumber the remainder from 1, and update the
    citation numbers in the criteria to match.

    Args:
        criteria: Eligibility-criteria text, interpolated under
            "### Eligibility Criteria".
        reference: Numbered reference metadata, interpolated under
            "### Metadata of Reference Papers".

    Returns:
        The ``text`` attribute of the completion returned by
        ``llm.acomplete``.
    """
    prompt = f"""
### Task Description:
You are tasked with processing clinical trial metadata and eligibility criteria. The goal is to clean, reorder, and maintain consistency between the metadata and references used in eligibility criteria.

### Instructions:
1. Review the eligibility criteria provided, which include references to metadata numbers (e.g., [1], [2], etc.). Identify all reference numbers that are actually used in the criteria.
2. Remove metadata of reference papers (### Metadata of Reference Papers) that does not have a corresponding reference in the eligibility criteria. This will ensure only relevant references are kept.
3. Reorder the remaining metadata so that they are numbered sequentially, starting from 1.
4. Update the reference numbers in the eligibility criteria accordingly to reflect the new order.
5. Maintain Criteria Consistency: Ensure that the eligibility criteria remain exactly the same in terms of content, but the reference numbers are updated to match the new numbering of metadata.
--------------------------------------------------
### Eligibility Criteria
{criteria}
--------------------------------------------------
### Metadata of Reference Papers
{reference}
--------------------------------------------------
### Pattern of the Output
Inclusion Criteria
1.
2.
...

Exclusion Criteria
1.
2.
...

Reference Papers
1.NCT ID:
Study Name:
Condition:
Intervention/Treatment:
2.NCT ID:
Study Name:
Condition:
Intervention/Treatment:
.
.
."""
    # `llm` is resolved from the surrounding module; it is not defined here.
    completion = await llm.acomplete(prompt)
    return completion.text
|
| 197 |
+
|
| 198 |
+
async def extract_criteria(text):
    """Extract the inclusion and exclusion sections from an LLM response.

    Args:
        text: Response text expected to contain an "Inclusion Criteria"
            heading followed by an "Exclusion Criteria" heading, optionally
            followed by a "Reference Papers" section. ``None`` is treated
            as an empty string.

    Returns:
        A string of the form
        ``"Inclusion Criteria:\\n<inclusion>\\n\\nExclusion Criteria:\\n<exclusion>"``.
        A section that cannot be located is replaced by ``"Not found"``.
    """
    text = text or ""

    # The colon after each heading is optional: the output pattern given to
    # the LLM shows the headings without one, so a response that follows the
    # pattern verbatim must still match.
    inclusion_pattern = r'Inclusion Criteria:?(.*?)(?=Exclusion Criteria)'
    # The exclusion section ends at the reference list or a triple newline;
    # fall back to end of text so a response without either is not dropped.
    exclusion_pattern = r'Exclusion Criteria:?(.*?)(?=Reference Papers|\n\n\n|\Z)'

    def _section(pattern):
        # DOTALL lets the lazy group span the multi-line numbered list.
        found = re.search(pattern, text, re.DOTALL)
        return found.group(1).strip() if found else "Not found"

    cleaned_inclusion = _section(inclusion_pattern)
    cleaned_exclusion = _section(exclusion_pattern)

    return (
        "Inclusion Criteria:\n" + cleaned_inclusion + "\n\n" +
        "Exclusion Criteria:\n" + cleaned_exclusion
    )
|
| 216 |
+
|
| 217 |
+
async def run_function_on_text(top_k,study_obj,study_type,phase,purpose,allocation,intervention_model,Masking,conditions,interventions,location_countries,removed_location_countries):
|
| 218 |
# Set up query engine
|
| 219 |
query_engine_get_study = CitationQueryEngine.from_args(
|
| 220 |
index_persisted,
|
|
|
|
| 225 |
use_async=True
|
| 226 |
)
|
| 227 |
|
| 228 |
+
# Build prompt
|
| 229 |
study_information = f"""
|
| 230 |
#Study Objectives/Study Description
|
| 231 |
{study_obj}
|
|
|
|
| 249 |
- Masking: None {Masking}
|
| 250 |
"""
|
| 251 |
|
| 252 |
+
# Call step 1
|
| 253 |
+
criteria, metadata_list = await get_criteria(study_information, top_k)
|
| 254 |
+
if criteria != "Empty Response":
|
| 255 |
+
processed_ref = await process_reference(metadata_list)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
+
# Call step 2
|
| 258 |
+
response = await get_response(criteria, processed_ref)
|
| 259 |
+
|
| 260 |
+
# Extract Criteria
|
| 261 |
+
combine_criteria = extract_criteria(response)
|
| 262 |
|
| 263 |
+
# Extract Ref
|
| 264 |
+
pattern = r'Reference Papers\s*(.+)$'
|
| 265 |
+
# pattern = r'Reference Papers:?\s*(.*?)(?:\n\n.*$|$)'
|
| 266 |
+
match = re.search(pattern, response, re.DOTALL | re.IGNORECASE)
|
| 267 |
+
ext_ref = match.group(1) if match and match.group(1) else ''
|
| 268 |
+
split_ref = re.split(r'\n*\d+\.\s+', ext_ref)[1:]
|
| 269 |
|
| 270 |
+
formatted_ref = []
|
| 271 |
+
n=0
|
| 272 |
+
for ref in split_ref:
|
| 273 |
+
nct_match = re.search(r'NCT[_ ]ID: (NCT\d+)', ref)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
if nct_match:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
nct_id = nct_match.group(1)
|
|
|
|
|
|
|
|
|
|
| 276 |
else:
|
| 277 |
+
nct_match = re.search(r'(NCT\d+)', ref)
|
| 278 |
+
if nct_match:
|
| 279 |
+
nct_id = nct_match.group(1)
|
| 280 |
+
else:
|
| 281 |
+
continue
|
| 282 |
+
n+=1
|
| 283 |
+
study_name = re.search(r'Study[_ ]Name:?\s*(.*?)(?=\n|Condition:|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
|
| 284 |
+
condition = re.search(r'Condition:?\s*(.*?)(?=\n|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
|
| 285 |
+
intervention = re.search(r'Intervention/Treatment:?\s*(.*?)(?=\n|$)', ref, re.DOTALL).group(1).strip()
|
| 286 |
+
|
| 287 |
+
study_name = re.sub(r'\*+', '', study_name).strip()
|
| 288 |
+
condition = re.sub(r'\*+', '', condition).strip()
|
| 289 |
+
intervention = re.sub(r'\*+', '', intervention).strip()
|
| 290 |
|
| 291 |
formatted_trial = [
|
| 292 |
n,
|
|
|
|
| 296 |
intervention
|
| 297 |
]
|
| 298 |
formatted_ref.append(formatted_trial)
|
| 299 |
+
|
| 300 |
else:
|
| 301 |
+
combine_criteria = "Empty Response"
|
| 302 |
formatted_ref = []
|
| 303 |
|
| 304 |
+
return combine_criteria, formatted_ref
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
|
| 306 |
# # LLM.complete
|
| 307 |
# complete_response = await llm.acomplete(f"""
|