buildinqq committed on
Commit
9f028e1
·
verified ·
1 Parent(s): 8d03c60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -120
app.py CHANGED
@@ -68,27 +68,21 @@ storage_context = StorageContext.from_defaults(persist_dir="VectorStore")
68
  index_persisted = load_index_from_storage(storage_context, index_id="vector_index")
69
 
70
  async def remove_ref(text):
 
71
  split_text = re.split(r'\bReference Papers\b', text, flags=re.IGNORECASE)
72
-
73
- if len(split_text) > 1:
74
- return split_text[0].strip()
75
-
76
- return text.strip()
77
 
78
  async def clean_trial_text(text):
79
- sections = text.split('\n')
80
- cleaned_sections = []
81
- in_references = False
82
- found_numbers = False
83
- has_intro_text = False
84
- reference_title_index = -1
85
 
86
  for i, line in enumerate(sections):
87
  if re.match(r'Reference Papers\s*$', line, re.IGNORECASE):
88
- in_references = True
89
- reference_title_index = len(cleaned_sections)
90
- cleaned_sections.append(line)
91
  continue
 
92
  if in_references and not found_numbers:
93
  if re.match(r'\d+\.', line.strip()):
94
  found_numbers = True
@@ -97,13 +91,17 @@ async def clean_trial_text(text):
97
  has_intro_text = True
98
  cleaned_sections.append(line)
99
  continue
 
100
  if not in_references:
101
  cleaned_sections.append(line)
 
102
  if in_references and not has_intro_text and reference_title_index != -1:
103
  cleaned_sections.pop(reference_title_index)
 
104
  return '\n'.join(cleaned_sections).strip()
105
 
106
  async def get_criteria(study_information, top_k):
 
107
  criteria_response = await query_engine_get_study.aquery(f"""
108
  Based on the provided instructions and clinical trial information, generate the new eligibility criteria specific for clinical trial information.
109
 
@@ -135,23 +133,16 @@ async def get_criteria(study_information, top_k):
135
  1.
136
  2.
137
  ...
138
- """
139
- )
140
- metadata_list = []
141
-
142
- for source in criteria_response.source_nodes:
143
- meta_data = source.node.get_metadata_str()
144
- metadata_list.append(meta_data)
145
-
146
  return criteria_response.response, metadata_list
147
 
148
  async def process_reference(metadata_list):
149
- # Join the metadata elements with numbering and format as a string separated by newline
150
- joined_str = "\n".join([f"{i + 1}. {meta}" for i, meta in enumerate(metadata_list)])
151
-
152
- return joined_str
153
 
154
  async def get_response(criteria, reference):
 
155
  response = await llm.acomplete(f"""
156
  ### Task Description:
157
  You are tasked with processing clinical trial metadata and eligibility criteria. The goal is to clean, reorder, and maintain consistency between the metadata and references used in eligibility criteria.
@@ -196,112 +187,89 @@ async def get_response(criteria, reference):
196
  return response_text
197
 
198
  async def extract_criteria(text):
199
- # Define patterns for inclusion and exclusion criteria
200
- inclusion_pattern = r'Inclusion Criteria:(.*?)(?=Exclusion Criteria)'
201
- exclusion_pattern = r'Exclusion Criteria:(.*?)(?=Reference Papers|\n\n\n)'
202
-
203
- # Search and clean inclusion criteria
204
- inclusion_match = re.search(inclusion_pattern, text, re.DOTALL)
205
- cleaned_inclusion = inclusion_match.group(1).strip() if inclusion_match else "Not found"
206
-
207
- # Search and clean exclusion criteria
208
- exclusion_match = re.search(exclusion_pattern, text, re.DOTALL)
209
- cleaned_exclusion = exclusion_match.group(1).strip() if exclusion_match else "Not found"
210
-
211
- # Format and return results
212
- return (
213
- "Inclusion Criteria:\n" + cleaned_inclusion + "\n\n" +
214
- "Exclusion Criteria:\n" + cleaned_exclusion
215
- )
216
-
217
- async def run_function_on_text(top_k,study_obj,study_type,phase,purpose,allocation,intervention_model,Masking,conditions,interventions,location_countries,removed_location_countries):
218
- # Set up query engine
219
- query_engine_get_study = CitationQueryEngine.from_args(
220
- index_persisted,
221
- similarity_top_k=top_k,
222
- citation_chunk_size=2048,
223
- verbose=True,
224
- node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.8)],
225
- use_async=True
226
- )
227
-
228
- # Build prompt
229
- study_information = f"""
230
- #Study Objectives/Study Description
231
- {study_obj}
232
-
233
- #Intervention
234
- {interventions}
235
 
236
- #Location
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  - Location_Countries: {location_countries}
238
  - Removed Location: {removed_location_countries}
239
 
240
- #Conditions
241
- Cancer {conditions}
242
 
243
- #Study Design
244
  - Study Type: {study_type}
245
  - Phase: {phase}
246
  - Primary Purpose: {purpose}
247
  - Allocation: {allocation}
248
  - Interventional Model: {intervention_model}
249
  - Masking: None {Masking}
250
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
- # Call step 1
253
- criteria, metadata_list = await get_criteria(study_information, top_k)
254
- if criteria != "Empty Response":
255
- processed_ref = await process_reference(metadata_list)
256
 
257
- # Call stpe 2
258
- response = await get_response(criteria, processed_ref)
259
-
260
- # Extract Criteria
261
- combine_criteria = extract_criteria(response)
262
-
263
- # Extract Ref
264
- pattern = r'Reference Papers\s*(.+)$'
265
- # pattern = r'Reference Papers:?\s*(.*?)(?:\n\n.*$|$)'
266
- match = re.search(pattern, response, re.DOTALL | re.IGNORECASE)
267
- ext_ref = match.group(1) if match and match.group(1) else ''
268
- split_ref = re.split(r'\n*\d+\.\s+', ext_ref)[1:]
269
-
270
- formatted_ref = []
271
- n=0
272
- for ref in split_ref:
273
- nct_match = re.search(r'NCT[_ ]ID: (NCT\d+)', ref)
274
- if nct_match:
275
- nct_id = nct_match.group(1)
276
- else:
277
- nct_match = re.search(r'(NCT\d+)', ref)
278
- if nct_match:
279
- nct_id = nct_match.group(1)
280
- else:
281
- continue
282
- n+=1
283
- study_name = re.search(r'Study[_ ]Name:?\s*(.*?)(?=\n|Condition:|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
284
- condition = re.search(r'Condition:?\s*(.*?)(?=\n|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
285
- intervention = re.search(r'Intervention/Treatment:?\s*(.*?)(?=\n|$)', ref, re.DOTALL).group(1).strip()
286
-
287
- study_name = re.sub(r'\*+', '', study_name).strip()
288
- condition = re.sub(r'\*+', '', condition).strip()
289
- intervention = re.sub(r'\*+', '', intervention).strip()
290
-
291
- formatted_trial = [
292
- n,
293
- f'<a href="https://clinicaltrials.gov/study/{nct_id}"><u>{nct_id}</u></a>',
294
- study_name,
295
- condition,
296
- intervention
297
- ]
298
- formatted_ref.append(formatted_trial)
299
-
300
- else:
301
- combine_criteria = "Empty Response"
302
- formatted_ref = []
303
-
304
- return combine_criteria, formatted_ref
305
 
306
  # # LLM.complete
307
  # complete_response = await llm.acomplete(f"""
@@ -530,7 +498,7 @@ with gr.Blocks() as demo:
530
  with gr.Row():
531
  top_k_box = gr.Slider(
532
  label="Amount of reference paper",
533
- value=5,
534
  minimum=0,
535
  maximum=30,
536
  step=1,
 
68
  index_persisted = load_index_from_storage(storage_context, index_id="vector_index")
69
 
70
async def remove_ref(text):
    """Return *text* truncated before the first 'Reference Papers' marker.

    The marker match is case-insensitive; when it is absent, the whole
    text is returned.  The result is stripped of surrounding whitespace.
    """
    head, *rest = re.split(r'\bReference Papers\b', text, flags=re.IGNORECASE)
    return (head if rest else text).strip()
74
 
75
  async def clean_trial_text(text):
76
+ """Removes intro text from references if present."""
77
+ sections, cleaned_sections, in_references = text.split('\n'), [], False
78
+ has_intro_text, found_numbers, reference_title_index = False, False, -1
 
 
 
79
 
80
  for i, line in enumerate(sections):
81
  if re.match(r'Reference Papers\s*$', line, re.IGNORECASE):
82
+ in_references, reference_title_index = True, len(cleaned_sections)
83
+ cleaned_sections.append(line)
 
84
  continue
85
+
86
  if in_references and not found_numbers:
87
  if re.match(r'\d+\.', line.strip()):
88
  found_numbers = True
 
91
  has_intro_text = True
92
  cleaned_sections.append(line)
93
  continue
94
+
95
  if not in_references:
96
  cleaned_sections.append(line)
97
+
98
  if in_references and not has_intro_text and reference_title_index != -1:
99
  cleaned_sections.pop(reference_title_index)
100
+
101
  return '\n'.join(cleaned_sections).strip()
102
 
103
  async def get_criteria(study_information, top_k):
104
+ """Fetches eligibility criteria and metadata for a study."""
105
  criteria_response = await query_engine_get_study.aquery(f"""
106
  Based on the provided instructions and clinical trial information, generate the new eligibility criteria specific for clinical trial information.
107
 
 
133
  1.
134
  2.
135
  ...
136
+ """)
137
+ metadata_list = [source.node.get_metadata_str() for source in criteria_response.source_nodes]
 
 
 
 
 
 
138
  return criteria_response.response, metadata_list
139
 
140
async def process_reference(metadata_list):
    """Render the metadata entries as a newline-separated, 1-based numbered list."""
    numbered = (f"{num}. {entry}" for num, entry in enumerate(metadata_list, start=1))
    return "\n".join(numbered)
143
 
144
  async def get_response(criteria, reference):
145
+ """Processes eligibility criteria and updates references to match new numbering."""
146
  response = await llm.acomplete(f"""
147
  ### Task Description:
148
  You are tasked with processing clinical trial metadata and eligibility criteria. The goal is to clean, reorder, and maintain consistency between the metadata and references used in eligibility criteria.
 
187
  return response_text
188
 
189
async def extract_criteria(text):
    """Extract the inclusion and exclusion criteria sections from *text*.

    Returns a single formatted string containing both sections; a section
    that cannot be located is reported as "Not found".
    """
    # Lazy captures stop at the next section marker.  The added \Z
    # alternative lets the exclusion block be recognised even when it is
    # the last thing in the text (no "Reference Papers" section and no
    # trailing blank lines) — previously that case yielded "Not found".
    patterns = {
        "inclusion": r'Inclusion Criteria:?(.*?)(?=Exclusion Criteria)',
        "exclusion": r'Exclusion Criteria:?(.*?)(?=Reference Papers|\n\n\n|\Z)'
    }
    inclusion = re.search(patterns["inclusion"], text, re.DOTALL | re.IGNORECASE)
    exclusion = re.search(patterns["exclusion"], text, re.DOTALL | re.IGNORECASE)

    return (
        "Inclusion Criteria:\n" + (inclusion.group(1).strip() if inclusion else "Not found") + "\n\n" +
        "Exclusion Criteria:\n" + (exclusion.group(1).strip() if exclusion else "Not found")
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
async def run_function_on_text(top_k, study_obj, study_type, phase, purpose, allocation, intervention_model, Masking, conditions, interventions, location_countries, removed_location_countries):
    """Generate eligibility criteria and a formatted reference table for a study.

    Returns a tuple ``(combine_criteria, formatted_ref)`` where
    ``combine_criteria`` is the extracted inclusion/exclusion text (or
    "Empty Response") and ``formatted_ref`` is a list of
    ``[number, nct_link, study_name, condition, intervention]`` rows.
    """
    # NOTE(review): this engine is built locally but never handed to
    # get_criteria(), which appears to rely on a module-level
    # `query_engine_get_study` — confirm the intended wiring.
    query_engine_get_study = CitationQueryEngine.from_args(
        index_persisted,
        similarity_top_k=top_k,
        citation_chunk_size=2048,
        verbose=True,
        node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.8)],
        use_async=True
    )

    study_information = f"""
    # Study Objectives/Description
    {study_obj}

    # Intervention
    {interventions}

    # Location
    - Location_Countries: {location_countries}
    - Removed Location: {removed_location_countries}

    # Conditions
    Cancer {conditions}

    # Study Design
    - Study Type: {study_type}
    - Phase: {phase}
    - Primary Purpose: {purpose}
    - Allocation: {allocation}
    - Interventional Model: {intervention_model}
    - Masking: None {Masking}
    """

    criteria, metadata_list = await get_criteria(study_information, top_k)
    if criteria != "Empty Response":
        processed_ref = await process_reference(metadata_list)
        response = await get_response(criteria, processed_ref)
        combine_criteria = await extract_criteria(response)

        # Grab everything after the "Reference Papers" heading, then split it
        # into individual "<n>. ..." entries (the chunk before the first
        # number is discarded).
        match = re.search(r'Reference Papers\s*(.+)$', response, re.DOTALL | re.IGNORECASE)
        ext_ref = match.group(1) if match else ""
        split_ref = re.split(r'\n*\d+\.\s+', ext_ref)[1:]

        formatted_ref = []
        # Count only the references that yield an NCT id, so the displayed
        # numbering stays consecutive even when entries are skipped
        # (enumerate over split_ref would leave gaps after a `continue`).
        kept = 0
        for ref in split_ref:
            nct_id = re.search(r'NCT[_ ]ID: (NCT\d+)', ref)
            if not nct_id:
                nct_id = re.search(r'(NCT\d+)', ref)
            if not nct_id:
                # No NCT identifier at all: drop the entry entirely.
                continue
            kept += 1

            study_name = re.search(r'Study[_ ]Name:?\s*(.*?)(?=\n|;|Condition:|Intervention/Treatment:|$)', ref, re.DOTALL)
            condition = re.search(r'Condition:?\s*(.*?)(?=\n|;|Intervention/Treatment:|$)', ref, re.DOTALL)
            intervention = re.search(r'Intervention/Treatment:?\s*(.*?)(?=\n|$)', ref, re.DOTALL)

            formatted_ref.append([
                kept,
                f'<a href="https://clinicaltrials.gov/study/{nct_id.group(1)}"><u>{nct_id.group(1)}</u></a>',
                study_name.group(1).strip() if study_name else "",
                condition.group(1).strip() if condition else "",
                intervention.group(1).strip() if intervention else ""
            ])
    else:
        combine_criteria, formatted_ref = "Empty Response", []

    return combine_criteria, formatted_ref
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
 
274
  # # LLM.complete
275
  # complete_response = await llm.acomplete(f"""
 
498
  with gr.Row():
499
  top_k_box = gr.Slider(
500
  label="Amount of reference paper",
501
+ value=10,
502
  minimum=0,
503
  maximum=30,
504
  step=1,