buildinqq committed on
Commit
2281b4b
·
verified ·
1 Parent(s): 061497c

Update app.py

Browse files

2 steps to improve reference accuracy

Files changed (1) hide show
  1. app.py +148 -89
app.py CHANGED
@@ -9,6 +9,8 @@ from llama_index.llms.gemini import Gemini
9
  from llama_index.core.postprocessor import SimilarityPostprocessor
10
  from llama_index.core.storage.docstore import SimpleDocumentStore
11
  from llama_index.core import StorageContext, load_index_from_storage
 
 
12
  import re
13
  import pandas as pd
14
  import gradio as gr
@@ -101,9 +103,118 @@ async def clean_trial_text(text):
101
  cleaned_sections.pop(reference_title_index)
102
  return '\n'.join(cleaned_sections).strip()
103
 
 
 
 
104
 
105
- async def run_function_on_text(top_k,study_obj,study_type,phase,purpose,allocation,intervention_model,Masking,conditions,interventions,location_countries,removed_location_countries):
 
 
 
 
 
 
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  # Set up query engine
108
  query_engine_get_study = CitationQueryEngine.from_args(
109
  index_persisted,
@@ -114,7 +225,7 @@ async def run_function_on_text(top_k,study_obj,study_type,phase,purpose,allocati
114
  use_async=True
115
  )
116
 
117
- #Build prompt
118
  study_information = f"""
119
  #Study Objectives/Study Description
120
  {study_obj}
@@ -138,88 +249,44 @@ async def run_function_on_text(top_k,study_obj,study_type,phase,purpose,allocati
138
  - Masking: None {Masking}
139
  """
140
 
141
- # Query
142
- query_response = await query_engine_get_study.aquery(f"""
143
- Based on the provided instructions and clinical trial information, generate the new eligibility criteria by analyzing the related studies and clinical trial information.
144
- ### Instruction:
145
- Find suitable papers that have relevant or similar to the clinical trial information(### Clinical Trial Information).
146
- Prioritize the following topics when finding related studies:
147
- 1. Study Objectives
148
- 2. Study Design and Phases
149
- 3. Conditions
150
- 4. Intervention/Treatment
151
- Criteria generation:
152
- As a clinical researcher, generate new eligibility criteria for given clinical trial information.
153
- Analyze the information from related studies for more precise new eligibility criteria generation.
154
- Ensure the criteria are clear, specific, and reasonable for a clinical research information.
155
- Please generate list of Reference Papers
156
-
157
- Reference Papers generation:
158
- Please give us NCT IDs and study names for {top_k} used papers.
159
-
160
- Please follows the pattern of the output(### Pattern of the output).
161
- --------------------------------------------------
162
- ### Clinical Trial Information
163
- {study_information}
164
- --------------------------------------------------
165
- ### Pattern of the output
166
-
167
- Exclusion Criteria
168
- 1.
169
- 2.
170
- .
171
- .
172
- .
173
-
174
- Inclusion Criteria
175
- 1.
176
- 2.
177
- .
178
- .
179
- .
180
-
181
- Reference Papers
182
- 1.NCT ID:
183
- Study Name:
184
- Condition:
185
- Intervention/Treatment:
186
- 2.NCT ID:
187
- Study Name:
188
- Condition:
189
- Intervention/Treatment:
190
- .
191
- .
192
- .
193
 
 
 
 
 
 
194
 
195
- """
196
- )
 
 
 
 
197
 
198
- #Extract ref
199
- if query_response.response != "Empty Response":
200
- # pattern = r'Reference Papers:?\s*(.+)$'
201
- pattern = r'(?:Reference Papers\n)(.*?)(?:\n\n[A-Za-z]|$)'
202
- match = re.search(pattern, query_response.response, re.DOTALL | re.IGNORECASE)
203
- ext_ref = match.group(1) if match and match.group(1) else ''
204
- split_ref = re.split(r'\n*\d+\.\s+', ext_ref)[1:]
205
- # print(split_ref)
206
-
207
- formatted_ref = []
208
- n=0
209
- for ref in split_ref:
210
- n+=1
211
- nct_match = re.search(r'(NCT\d+)', ref)
212
  if nct_match:
213
- # nct_id = nct_match.group(1)
214
- # study_name = re.search(r'Study Name:?\s*(.*?)(?=\n|Condition:|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
215
- # condition = re.search(r'Condition:?\s*(.*?)(?=\n|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
216
- # intervention = re.search(r'Intervention/Treatment:?\s*(.*?)(?=\n|$)', ref, re.DOTALL).group(1).strip()
217
  nct_id = nct_match.group(1)
218
- study_name = ref
219
- condition = ""
220
- intervention = ""
221
  else:
222
- continue
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
  formatted_trial = [
225
  n,
@@ -229,20 +296,12 @@ async def run_function_on_text(top_k,study_obj,study_type,phase,purpose,allocati
229
  intervention
230
  ]
231
  formatted_ref.append(formatted_trial)
232
-
233
  else:
 
234
  formatted_ref = []
235
 
236
- # return query_response,formatted_ref
237
- #Extract criteria
238
- if query_response.response == "Empty Response":
239
- return query_response
240
- else:
241
- combine_criteira = re.sub(r'#+\s*', '', query_response.response).strip()
242
- combine_criteira = re.sub(r'\*\*', '', combine_criteira).strip()
243
- combine_criteira = re.sub(r'(Criteria)\n\s*\n(\d+\.)', r'\1\n\2', combine_criteira).strip()
244
- combine_criteira = await clean_trial_text(combine_criteira)
245
- return query_response,formatted_ref
246
 
247
  # # LLM.complete
248
  # complete_response = await llm.acomplete(f"""
 
9
  from llama_index.core.postprocessor import SimilarityPostprocessor
10
  from llama_index.core.storage.docstore import SimpleDocumentStore
11
  from llama_index.core import StorageContext, load_index_from_storage
12
+ from llama_index.core.data_structs import Node
13
+ from llama_index.core.schema import NodeWithScore
14
  import re
15
  import pandas as pd
16
  import gradio as gr
 
103
  cleaned_sections.pop(reference_title_index)
104
  return '\n'.join(cleaned_sections).strip()
105
 
106
async def get_criteria(study_information, top_k, query_engine=None):
    """Generate eligibility criteria and collect metadata of the cited sources.

    This is the first step of the two-step flow: the query engine is asked
    to draft inclusion/exclusion criteria for the given trial, and the
    metadata string of every source node attached to the answer is gathered
    so it can be used as the reference list later.

    Args:
        study_information: Text describing the clinical trial; interpolated
            into the prompt under "### Clinical Trial Information".
        top_k: Number of related studies mentioned in the prompt text.
        query_engine: Optional object exposing an awaitable
            ``aquery(prompt)``. When omitted, the name
            ``query_engine_get_study`` is looked up instead, as before.

    Returns:
        A ``(response_text, metadata_list)`` tuple, where ``metadata_list``
        holds ``source.node.get_metadata_str()`` for each source node, in
        the order the engine returned them.

    Raises:
        NameError: If ``query_engine`` is omitted and no
            ``query_engine_get_study`` is reachable from this function.
    """
    # NOTE(review): in the visible part of this file, query_engine_get_study
    # is only assigned as a local inside run_function_on_text, which would
    # not be reachable from here. Confirm a module-level engine exists, or
    # pass the engine explicitly through ``query_engine``.
    engine = query_engine if query_engine is not None else query_engine_get_study

    criteria_response = await engine.aquery(f"""
    Based on the provided instructions and clinical trial information, generate the new eligibility criteria specific for clinical trial information.

    ### Instruction:
    Find suitable papers that are relevant or similar to the provided clinical trial information (### Clinical Trial Information).
    Prioritize the following topics when finding related studies:
    1. Study Objectives
    2. Study Design and Phases
    3. Conditions
    4. Intervention/Treatment

    Criteria Generation:
    As a clinical researcher, generate new eligibility criteria for the given clinical trial information.
    Analyze the information from all {top_k} related studies to generate new precise eligibility criteria.
    Ensure that the criteria are specific for the given clinical trial information (### Clinical Trial Information).

    Please follow the pattern of the output (### Pattern of the output).
    --------------------------------------------------
    ### Clinical Trial Information
    {study_information}
    --------------------------------------------------
    ### Pattern of the Output
    Inclusion Criteria
    1.
    2.
    ...

    Exclusion Criteria
    1.
    2.
    ...
    """
    )

    # One metadata string per retrieved source, kept in retrieval order so
    # the numbering matches the citation markers in the response.
    metadata_list = [
        source.node.get_metadata_str()
        for source in criteria_response.source_nodes
    ]

    return criteria_response.response, metadata_list
147
+
148
async def process_reference(metadata_list):
    """Render metadata entries as one numbered, newline-separated string.

    Each entry becomes a line of the form ``"<n>. <entry>"`` with numbering
    starting at 1; an empty input yields an empty string.
    """
    numbered_lines = (
        f"{position}. {entry}"
        for position, entry in enumerate(metadata_list, start=1)
    )
    return "\n".join(numbered_lines)
153
+
154
async def get_response(criteria, reference):
    """Ask the LLM to reconcile criteria citations with the reference list.

    The prompt embeds the generated eligibility criteria and the numbered
    reference metadata, and instructs the model to drop unused references,
    renumber the rest from 1, and update the citation numbers accordingly.

    Args:
        criteria: Eligibility criteria text, inserted under
            "### Eligibility Criteria".
        reference: Numbered metadata text, inserted under
            "### Metadata of Reference Papers".

    Returns:
        The ``text`` attribute of the completion returned by
        ``llm.acomplete``.
    """
    prompt = f"""
    ### Task Description:
    You are tasked with processing clinical trial metadata and eligibility criteria. The goal is to clean, reorder, and maintain consistency between the metadata and references used in eligibility criteria.

    ### Instructions:
    1. Review the eligibility criteria provided, which include references to metadata numbers (e.g., [1], [2], etc.). Identify all reference numbers that are actually used in the criteria.
    2. Remove metadata of reference papers (### Metadata of Reference Papers) that does not have a corresponding reference in the eligibility criteria. This will ensure only relevant references are kept.
    3. Reorder the remaining metadata so that they are numbered sequentially, starting from 1.
    4. Update the reference numbers in the eligibility criteria accordingly to reflect the new order.
    5. Maintain Criteria Consistency: Ensure that the eligibility criteria remain exactly the same in terms of content, but the reference numbers are updated to match the new numbering of metadata.
    --------------------------------------------------
    ### Eligibility Criteria
    {criteria}
    --------------------------------------------------
    ### Metadata of Reference Papers
    {reference}
    --------------------------------------------------
    ### Pattern of the Output
    Inclusion Criteria
    1.
    2.
    ...

    Exclusion Criteria
    1.
    2.
    ...

    Reference Papers
    1.NCT ID:
    Study Name:
    Condition:
    Intervention/Treatment:
    2.NCT ID:
    Study Name:
    Condition:
    Intervention/Treatment:
    .
    .
    ."""
    completion = await llm.acomplete(prompt)
    return completion.text
197
+
198
async def extract_criteria(text):
    """Pull the inclusion and exclusion sections out of an LLM response.

    The section headings are matched with or without a trailing colon,
    because the output pattern requested in the prompts writes them without
    one ("Inclusion Criteria" / "Exclusion Criteria"). The exclusion section
    ends at "Reference Papers", at three consecutive newlines, or at the end
    of the text, whichever comes first.

    This is a coroutine function: callers must ``await`` it to obtain the
    string.

    Args:
        text: Full response text containing the criteria sections.

    Returns:
        A string of the form
        ``"Inclusion Criteria:\\n<inclusion>\\n\\nExclusion Criteria:\\n<exclusion>"``
        where a section that cannot be located is replaced by "Not found".
    """
    # Colon is optional so both "Inclusion Criteria:" and the colon-less
    # heading from the prompt's output pattern are recognised.
    inclusion_pattern = r'Inclusion Criteria:?(.*?)(?=Exclusion Criteria)'
    # "$" lets the exclusion section run to the end of the text when no
    # reference list (or blank-line gap) follows it.
    exclusion_pattern = r'Exclusion Criteria:?(.*?)(?=Reference Papers|\n\n\n|$)'

    inclusion_match = re.search(inclusion_pattern, text, re.DOTALL)
    cleaned_inclusion = inclusion_match.group(1).strip() if inclusion_match else "Not found"

    exclusion_match = re.search(exclusion_pattern, text, re.DOTALL)
    cleaned_exclusion = exclusion_match.group(1).strip() if exclusion_match else "Not found"

    return (
        "Inclusion Criteria:\n" + cleaned_inclusion + "\n\n" +
        "Exclusion Criteria:\n" + cleaned_exclusion
    )
216
+
217
+ async def run_function_on_text(top_k,study_obj,study_type,phase,purpose,allocation,intervention_model,Masking,conditions,interventions,location_countries,removed_location_countries):
218
  # Set up query engine
219
  query_engine_get_study = CitationQueryEngine.from_args(
220
  index_persisted,
 
225
  use_async=True
226
  )
227
 
228
+ # Build prompt
229
  study_information = f"""
230
  #Study Objectives/Study Description
231
  {study_obj}
 
249
  - Masking: None {Masking}
250
  """
251
 
252
+ # Call step 1
253
+ criteria, metadata_list = await get_criteria(study_information, top_k)
254
+ if criteria != "Empty Response":
255
+ processed_ref = await process_reference(metadata_list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
+ # Call step 2
258
+ response = await get_response(criteria, processed_ref)
259
+
260
+ # Extract Criteria
261
+ combine_criteria = extract_criteria(response)
262
 
263
+ # Extract Ref
264
+ pattern = r'Reference Papers\s*(.+)$'
265
+ # pattern = r'Reference Papers:?\s*(.*?)(?:\n\n.*$|$)'
266
+ match = re.search(pattern, response, re.DOTALL | re.IGNORECASE)
267
+ ext_ref = match.group(1) if match and match.group(1) else ''
268
+ split_ref = re.split(r'\n*\d+\.\s+', ext_ref)[1:]
269
 
270
+ formatted_ref = []
271
+ n=0
272
+ for ref in split_ref:
273
+ nct_match = re.search(r'NCT[_ ]ID: (NCT\d+)', ref)
 
 
 
 
 
 
 
 
 
 
274
  if nct_match:
 
 
 
 
275
  nct_id = nct_match.group(1)
 
 
 
276
  else:
277
+ nct_match = re.search(r'(NCT\d+)', ref)
278
+ if nct_match:
279
+ nct_id = nct_match.group(1)
280
+ else:
281
+ continue
282
+ n+=1
283
+ study_name = re.search(r'Study[_ ]Name:?\s*(.*?)(?=\n|Condition:|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
284
+ condition = re.search(r'Condition:?\s*(.*?)(?=\n|Intervention/Treatment:|$)', ref, re.DOTALL).group(1).strip()
285
+ intervention = re.search(r'Intervention/Treatment:?\s*(.*?)(?=\n|$)', ref, re.DOTALL).group(1).strip()
286
+
287
+ study_name = re.sub(r'\*+', '', study_name).strip()
288
+ condition = re.sub(r'\*+', '', condition).strip()
289
+ intervention = re.sub(r'\*+', '', intervention).strip()
290
 
291
  formatted_trial = [
292
  n,
 
296
  intervention
297
  ]
298
  formatted_ref.append(formatted_trial)
299
+
300
  else:
301
+ combine_criteria = "Empty Response"
302
  formatted_ref = []
303
 
304
+ return combine_criteria, formatted_ref
 
 
 
 
 
 
 
 
 
305
 
306
  # # LLM.complete
307
  # complete_response = await llm.acomplete(f"""