Update app.py

app.py CHANGED
@@ -68,27 +68,21 @@ storage_context = StorageContext.from_defaults(persist_dir="VectorStore")
 index_persisted = load_index_from_storage(storage_context, index_id="vector_index")

 async def remove_ref(text):
+    """Removes content after 'Reference Papers' (case-insensitive)."""
     split_text = re.split(r'\bReference Papers\b', text, flags=re.IGNORECASE)
-
-    if len(split_text) > 1:
-        return split_text[0].strip()
-
-    return text.strip()
+    return split_text[0].strip() if len(split_text) > 1 else text.strip()

 async def clean_trial_text(text):
-
-    cleaned_sections = []
-
-    found_numbers = False
-    has_intro_text = False
-    reference_title_index = -1
+    """Removes intro text from references if present."""
+    sections, cleaned_sections, in_references = text.split('\n'), [], False
+    has_intro_text, found_numbers, reference_title_index = False, False, -1

     for i, line in enumerate(sections):
         if re.match(r'Reference Papers\s*$', line, re.IGNORECASE):
-            in_references = True
-
-            cleaned_sections.append(line)
+            in_references, reference_title_index = True, len(cleaned_sections)
+            cleaned_sections.append(line)
             continue
+
         if in_references and not found_numbers:
             if re.match(r'\d+\.', line.strip()):
                 found_numbers = True
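As a quick check of the refactored remove_ref, a minimal standalone sketch: the function body is copied from the diff, while the sample text and expected output are made up for illustration.

    import asyncio
    import re

    async def remove_ref(text):
        """Removes content after 'Reference Papers' (case-insensitive)."""
        split_text = re.split(r'\bReference Papers\b', text, flags=re.IGNORECASE)
        return split_text[0].strip() if len(split_text) > 1 else text.strip()

    sample = "Inclusion Criteria:\n1. Adults 18+\n\nReference Papers\n1. NCT00000000"
    print(asyncio.run(remove_ref(sample)))  # prints everything before "Reference Papers"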
@@ -97,13 +91,17 @@ async def clean_trial_text(text):
                 has_intro_text = True
             cleaned_sections.append(line)
             continue
+
         if not in_references:
             cleaned_sections.append(line)
+
     if in_references and not has_intro_text and reference_title_index != -1:
         cleaned_sections.pop(reference_title_index)
+
     return '\n'.join(cleaned_sections).strip()

 async def get_criteria(study_information, top_k):
+    """Fetches eligibility criteria and metadata for a study."""
     criteria_response = await query_engine_get_study.aquery(f"""
     Based on the provided instructions and clinical trial information, generate the new eligibility criteria specific for clinical trial information.

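The dangling-heading cleanup at the end of clean_trial_text can be seen in isolation; the condition below is verbatim from the diff, while the list contents and index values are hypothetical.

    cleaned_sections = ["1. Must be 18 or older", "Reference Papers"]
    in_references, has_intro_text, reference_title_index = True, False, 1

    # Drop a "Reference Papers" title that has no intro text beneath it.
    if in_references and not has_intro_text and reference_title_index != -1:
        cleaned_sections.pop(reference_title_index)

    print(cleaned_sections)  # ['1. Must be 18 or older']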
@@ -135,23 +133,16 @@ async def get_criteria(study_information, top_k):
     1.
     2.
     ...
-    """
-    )
-    metadata_list = []
-
-    for source in criteria_response.source_nodes:
-        meta_data = source.node.get_metadata_str()
-        metadata_list.append(meta_data)
-
+    """)
+    metadata_list = [source.node.get_metadata_str() for source in criteria_response.source_nodes]
     return criteria_response.response, metadata_list

 async def process_reference(metadata_list):
-
-
-
-    return joined_str
+    """Formats metadata list into a numbered string."""
+    return "\n".join([f"{i + 1}. {meta}" for i, meta in enumerate(metadata_list)])

 async def get_response(criteria, reference):
+    """Processes eligibility criteria and updates references to match new numbering."""
     response = await llm.acomplete(f"""
     ### Task Description:
     You are tasked with processing clinical trial metadata and eligibility criteria. The goal is to clean, reorder, and maintain consistency between the metadata and references used in eligibility criteria.
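The rewritten process_reference builds the numbered reference block with a single comprehension; on hypothetical metadata strings the join behaves like this.

    metadata_list = [
        "NCT_ID: NCT01234567; Study_Name: Trial A",
        "NCT_ID: NCT07654321; Study_Name: Trial B",
    ]
    print("\n".join([f"{i + 1}. {meta}" for i, meta in enumerate(metadata_list)]))
    # 1. NCT_ID: NCT01234567; Study_Name: Trial A
    # 2. NCT_ID: NCT07654321; Study_Name: Trial B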
@@ -196,112 +187,89 @@ async def get_response(criteria, reference):
     return response_text

 async def extract_criteria(text):
-
-
-
-
-
-
-
-
-
-
-
-
-    # Format and return results
-    return (
-        "Inclusion Criteria:\n" + cleaned_inclusion + "\n\n" +
-        "Exclusion Criteria:\n" + cleaned_exclusion
-    )
-
-async def run_function_on_text(top_k,study_obj,study_type,phase,purpose,allocation,intervention_model,Masking,conditions,interventions,location_countries,removed_location_countries):
-    # Set up query engine
-    query_engine_get_study = CitationQueryEngine.from_args(
-        index_persisted,
-        similarity_top_k=top_k,
-        citation_chunk_size=2048,
-        verbose=True,
-        node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.8)],
-        use_async=True
-    )
-
-    # Build prompt
-    study_information = f"""
-    #Study Objectives/Study Description
-    {study_obj}
-
-    #Intervention
-    {interventions}
+    """Extracts inclusion and exclusion criteria from text."""
+    patterns = {
+        "inclusion": r'Inclusion Criteria:?(.*?)(?=Exclusion Criteria)',
+        "exclusion": r'Exclusion Criteria:?(.*?)(?=Reference Papers|\n\n\n)'
+    }
+    inclusion = re.search(patterns["inclusion"], text, re.DOTALL | re.IGNORECASE)
+    exclusion = re.search(patterns["exclusion"], text, re.DOTALL | re.IGNORECASE)
+
+    return (
+        "Inclusion Criteria:\n" + (inclusion.group(1).strip() if inclusion else "Not found") + "\n\n" +
+        "Exclusion Criteria:\n" + (exclusion.group(1).strip() if exclusion else "Not found")
+    )

-
+async def run_function_on_text(top_k, study_obj, study_type, phase, purpose, allocation, intervention_model, Masking, conditions, interventions, location_countries, removed_location_countries):
+    """Runs the main function to process study information and generate formatted output."""
+    query_engine_get_study = CitationQueryEngine.from_args(
+        index_persisted,
+        similarity_top_k=top_k,
+        citation_chunk_size=2048,
+        verbose=True,
+        node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.8)],
+        use_async=True
+    )
+
+    study_information = f"""
+    # Study Objectives/Description
+    {study_obj}
+
+    # Intervention
+    {interventions}
+
+    # Location
     - Location_Countries: {location_countries}
     - Removed Location: {removed_location_countries}

-
-
+    # Conditions
+    Cancer {conditions}

-
+    # Study Design
     - Study Type: {study_type}
     - Phase: {phase}
     - Primary Purpose: {purpose}
     - Allocation: {allocation}
     - Interventional Model: {intervention_model}
     - Masking: None {Masking}
-
+    """
+
+    criteria, metadata_list = await get_criteria(study_information, top_k)
+    if criteria != "Empty Response":
+        processed_ref = await process_reference(metadata_list)
+        response = await get_response(criteria, processed_ref)
+        combine_criteria = await extract_criteria(response)
+
+        # Extract and format references
+        pattern = r'Reference Papers\s*(.+)$'
+        match = re.search(pattern, response, re.DOTALL | re.IGNORECASE)
+        ext_ref = match.group(1) if match else ""
+        split_ref = re.split(r'\n*\d+\.\s+', ext_ref)[1:]
+
+        formatted_ref = []
+        for i, ref in enumerate(split_ref, 1):
+            nct_id = re.search(r'NCT[_ ]ID: (NCT\d+)', ref)
+            if not nct_id:
+                nct_id = re.search(r'(NCT\d+)', ref)
+                if not nct_id:
+                    continue

-
-
-
-    processed_ref = await process_reference(metadata_list)
+            study_name = re.search(r'Study[_ ]Name:?\s*(.*?)(?=\n|;|Condition:|Intervention/Treatment:|$)', ref, re.DOTALL)
+            condition = re.search(r'Condition:?\s*(.*?)(?=\n|;|Intervention/Treatment:|$)', ref, re.DOTALL)
+            intervention = re.search(r'Intervention/Treatment:?\s*(.*?)(?=\n|$)', ref, re.DOTALL)

-
-
-
-
-
-
-
-
-
-
-
-
-
-        formatted_ref = []
-        n=0
-        for ref in split_ref:
-            nct_match = re.search(r'NCT[_ ]ID: (NCT\d+)', ref)
-            if nct_match:
-                nct_id = nct_match.group(1)
-            else:
-                nct_match = re.search(r'(NCT\d+)', ref)
-                if nct_match:
-                    nct_id = nct_match.group(1)
-                else:
-                    continue
-            n+=1
-            study_name = re.search(r'Study[_ ]Name:?\s*(.*?)(?=\n|Condition:|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
-            condition = re.search(r'Condition:?\s*(.*?)(?=\n|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
-            intervention = re.search(r'Intervention/Treatment:?\s*(.*?)(?=\n|$)', ref, re.DOTALL).group(1).strip()
-
-            study_name = re.sub(r'\*+', '', study_name).strip()
-            condition = re.sub(r'\*+', '', condition).strip()
-            intervention = re.sub(r'\*+', '', intervention).strip()
-
-            formatted_trial = [
-                n,
-                f'<a href="https://clinicaltrials.gov/study/{nct_id}"><u>{nct_id}</u></a>',
-                study_name,
-                condition,
-                intervention
-            ]
-            formatted_ref.append(formatted_trial)
-
-        else:
-            combine_criteria = "Empty Response"
-            formatted_ref = []
-
-    return combine_criteria, formatted_ref
+            formatted_ref.append([
+                i,
+                f'<a href="https://clinicaltrials.gov/study/{nct_id.group(1)}"><u>{nct_id.group(1)}</u></a>',
+                study_name.group(1).strip() if study_name else "",
+                condition.group(1).strip() if condition else "",
+                intervention.group(1).strip() if intervention else ""
+            ])
+
+    else:
+        combine_criteria, formatted_ref = "Empty Response", []
+
+    return combine_criteria, formatted_ref

 # # LLM.complete
 # complete_response = await llm.acomplete(f"""
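The new reference-parsing loop splits the numbered "Reference Papers" block and falls back to a bare NCT pattern when the labeled one misses. A self-contained sketch on a fabricated reference block; the `or` chaining condenses the diff's nested fallback, and the `;` added to the lookaheads lets several fields sit on one line.

    import re

    # Hypothetical block in the shape the parsing code expects.
    ext_ref = ("1. NCT_ID: NCT01234567; Study_Name: Example Trial; Condition: NSCLC; Intervention/Treatment: Drug X\n"
               "2. See NCT07654321 for details\n")

    split_ref = re.split(r'\n*\d+\.\s+', ext_ref)[1:]
    for i, ref in enumerate(split_ref, 1):
        nct_id = re.search(r'NCT[_ ]ID: (NCT\d+)', ref) or re.search(r'(NCT\d+)', ref)
        if not nct_id:
            continue
        study_name = re.search(r'Study[_ ]Name:?\s*(.*?)(?=\n|;|Condition:|Intervention/Treatment:|$)', ref, re.DOTALL)
        print(i, nct_id.group(1), study_name.group(1).strip() if study_name else "")
    # 1 NCT01234567 Example Trial
    # 2 NCT07654321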
@@ -530,7 +498,7 @@ with gr.Blocks() as demo:
         with gr.Row():
             top_k_box = gr.Slider(
                 label="Amount of reference paper",
-                value=
+                value=10,
                 minimum=0,
                 maximum=30,
                 step=1,
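The slider fix supplies the default that was missing after value= in the old revision (a syntax error). A minimal sketch of how such a slider typically feeds a handler; the echo handler and output box are assumptions, since the real app wires run_function_on_text with many more inputs.

    import gradio as gr

    def echo_top_k(top_k):
        # Placeholder for run_function_on_text, which receives the slider value as top_k.
        return f"Will retrieve up to {top_k} reference papers"

    with gr.Blocks() as demo:
        with gr.Row():
            top_k_box = gr.Slider(
                label="Amount of reference paper",
                value=10,
                minimum=0,
                maximum=30,
                step=1,
            )
        out = gr.Textbox(label="Status")
        top_k_box.change(echo_top_k, inputs=top_k_box, outputs=out)

    # demo.launch()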