Spaces:
Running
Running
Commit
·
b826a4f
1
Parent(s):
cba901f
v1.0 - Stable
Browse files
__pycache__/script_search_api.cpython-310.pyc
ADDED
|
Binary file (8.43 kB). View file
|
|
|
model/__pycache__/analyzer.cpython-310.pyc
ADDED
|
Binary file (7.51 kB). View file
|
|
|
model/analyzer.py
CHANGED
|
@@ -24,10 +24,10 @@ class ContentAnalyzer:
|
|
| 24 |
]
|
| 25 |
self.pattern = re.compile(r'\b(' + '|'.join(self.categories) + r')\b', re.IGNORECASE)
|
| 26 |
logger.info(f"Initialized analyzer with device: {self.device}")
|
| 27 |
-
self._load_model()
|
| 28 |
|
| 29 |
def _load_model(self) -> None:
|
| 30 |
-
"""Load model and tokenizer
|
| 31 |
try:
|
| 32 |
logger.info("Loading model components...")
|
| 33 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
|
@@ -45,95 +45,193 @@ class ContentAnalyzer:
|
|
| 45 |
logger.error(f"Model loading failed: {str(e)}")
|
| 46 |
raise
|
| 47 |
|
| 48 |
-
def _chunk_text(self, text: str,
|
| 49 |
-
"""
|
| 50 |
-
paragraphs = text.split('\n\n')
|
| 51 |
chunks = []
|
| 52 |
-
current_chunk =
|
|
|
|
| 53 |
|
| 54 |
for para in paragraphs:
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
else:
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
current_chunk = para + "\n\n"
|
| 61 |
|
| 62 |
if current_chunk:
|
| 63 |
-
|
|
|
|
| 64 |
|
| 65 |
-
logger.info(f"Split text into {len(chunks)} chunks")
|
| 66 |
return chunks
|
| 67 |
|
| 68 |
-
async def _analyze_chunk(self, chunk: str) -> List[str]:
|
| 69 |
-
"""
|
| 70 |
-
prompt = f"""
|
| 71 |
-
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
with torch.no_grad():
|
| 79 |
-
outputs = self.model.generate(
|
| 80 |
-
**inputs,
|
| 81 |
-
max_new_tokens=50,
|
| 82 |
-
do_sample=False,
|
| 83 |
-
pad_token_id=self.tokenizer.eos_token_id
|
| 84 |
-
)
|
| 85 |
-
|
| 86 |
-
response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 87 |
-
return [m.capitalize() for m in self.pattern.findall(response)]
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
identified_triggers = set()
|
|
|
|
| 92 |
chunks = self._chunk_text(script)
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
for idx, chunk in enumerate(chunks):
|
| 95 |
if progress:
|
| 96 |
-
progress((idx/
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
-
|
| 99 |
-
identified_triggers.update(triggers)
|
| 100 |
|
| 101 |
if progress:
|
| 102 |
progress((1.0, "Analysis complete"))
|
| 103 |
|
| 104 |
-
|
|
|
|
|
|
|
| 105 |
|
| 106 |
async def analyze_content(
|
| 107 |
script: str,
|
| 108 |
progress: Optional[gr.Progress] = None
|
| 109 |
) -> Dict[str, Union[List[str], str]]:
|
| 110 |
-
"""
|
| 111 |
try:
|
| 112 |
analyzer = ContentAnalyzer()
|
| 113 |
-
triggers = await analyzer.analyze_script(script, progress)
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
"model": "DeepSeek-R1-Distill-Qwen-1.5B",
|
| 119 |
-
"analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
|
|
| 120 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
except Exception as e:
|
| 122 |
logger.error(f"Analysis error: {str(e)}")
|
| 123 |
return {
|
| 124 |
-
"detected_triggers": ["
|
| 125 |
"confidence": "Error",
|
| 126 |
"model": "DeepSeek-R1-Distill-Qwen-1.5B",
|
| 127 |
"analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
|
|
|
| 128 |
"error": str(e)
|
| 129 |
}
|
| 130 |
|
| 131 |
if __name__ == "__main__":
|
| 132 |
iface = gr.Interface(
|
| 133 |
fn=analyze_content,
|
| 134 |
-
inputs=gr.Textbox(lines=
|
| 135 |
-
outputs=
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
)
|
| 139 |
-
iface.launch()
|
|
|
|
| 24 |
]
|
| 25 |
self.pattern = re.compile(r'\b(' + '|'.join(self.categories) + r')\b', re.IGNORECASE)
|
| 26 |
logger.info(f"Initialized analyzer with device: {self.device}")
|
| 27 |
+
self._load_model()
|
| 28 |
|
| 29 |
def _load_model(self) -> None:
|
| 30 |
+
"""Load model and tokenizer with CPU optimization"""
|
| 31 |
try:
|
| 32 |
logger.info("Loading model components...")
|
| 33 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
|
|
|
| 45 |
logger.error(f"Model loading failed: {str(e)}")
|
| 46 |
raise
|
| 47 |
|
| 48 |
+
def _chunk_text(self, text: str, max_tokens: int = 512) -> List[str]:
    """Split ``text`` into paragraph-aligned chunks bounded by a token budget.

    Paragraphs (separated by blank lines) are packed greedily, in order,
    into chunks whose summed token count stays within ``max_tokens``. A
    single paragraph that is larger than the budget on its own is cut into
    consecutive windows of ``max_tokens`` tokens, so that text pasted
    without blank lines does not end up as one oversized chunk.

    Args:
        text: Raw text to split.
        max_tokens: Maximum number of tokens (as counted by
            ``self.tokenizer``) per chunk. Must be positive.

    Returns:
        The chunks in document order; an empty list when ``text`` holds
        no non-blank paragraph.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")

    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    chunks: List[str] = []
    current_chunk: List[str] = []
    current_length = 0

    for para in paragraphs:
        para_tokens = self.tokenizer.encode(para, add_special_tokens=False)
        para_length = len(para_tokens)

        if para_length > max_tokens:
            # Emit whatever is pending first so document order is preserved.
            if current_chunk:
                chunks.append("\n\n".join(current_chunk))
                current_chunk = []
                current_length = 0

            # Token-window split. Decoding a slice may not reproduce the
            # source text byte-for-byte at the cut points, so the budget
            # is respected approximately rather than exactly.
            for start in range(0, para_length, max_tokens):
                window = para_tokens[start:start + max_tokens]
                piece = self.tokenizer.decode(window, skip_special_tokens=True).strip()
                if piece:
                    chunks.append(piece)
            continue

        if current_chunk and current_length + para_length > max_tokens:
            chunks.append("\n\n".join(current_chunk))
            current_chunk = []
            current_length = 0

        current_chunk.append(para)
        current_length += para_length

    if current_chunk:
        chunks.append("\n\n".join(current_chunk))

    logger.info(f"Split text into {len(chunks)} chunks (max_tokens={max_tokens})")
    return chunks
|
| 74 |
|
| 75 |
+
async def _analyze_chunk(self, chunk: str) -> tuple[List[str], str]:
    """Ask the model to analyse one chunk and extract the categories it names.

    Only the text generated *after* the prompt is inspected. The prompt
    itself lists every category and embeds the raw chunk, so scanning the
    echoed prompt would report every category for every chunk.

    Args:
        chunk: Text of a single chunk to analyse.

    Returns:
        A pair ``(categories, reasoning)``: the sorted category names from
        ``self.categories`` mentioned in the model's answer, and the
        answer text (cut at a ``"Categories found:"`` trailer if present).
        On any failure the pair is ``([], "Analysis error: ...")``; errors
        are logged, not raised, so one bad chunk does not abort the run.
    """
    prompt = f"""As a deep-thinking content analyzer, carefully evaluate this text for sensitive content.
Input text: {chunk}

Think through each step:
1. What is happening in the text?
2. What potentially sensitive themes or elements are present?
3. For each category below, is there clear evidence?

Categories: {", ".join(self.categories)}

Detailed analysis:
"""

    try:
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True).to(self.device)
        prompt_length = inputs["input_ids"].shape[-1]

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                max_length=8192
            )

        # NOTE(review): assumes generate() returns the prompt tokens
        # followed by the new tokens (decoder-only model) - confirm
        # against the model class loaded in _load_model.
        generated_ids = outputs[0][prompt_length:]
        full_response = self.tokenizer.decode(generated_ids, skip_special_tokens=True)

        # Map each case-folded name back to its canonical spelling once,
        # instead of comparing every match against every category.
        canonical = {category.lower(): category for category in self.categories}
        categories_found = {
            canonical[match.lower()]
            for match in self.pattern.findall(full_response)
            if match.lower() in canonical
        }
        matched_categories = sorted(categories_found)

        # Keep only the reasoning that precedes an explicit summary trailer.
        reasoning = full_response.partition("\n\nCategories found:")[0].strip()

        if not matched_categories:
            lowered = full_response.lower()
            hint_words = ("concerning", "warning", "caution", "trigger", "sensitive")
            if any(word in lowered for word in hint_words):
                logger.warning("Potential triggers found but no categories matched in chunk")

        logger.info(f"Chunk analysis complete - Categories found: {matched_categories}")
        return matched_categories, reasoning

    except Exception as e:
        # Deliberate best-effort: report the failure in the result and
        # keep the traceback in the log.
        logger.exception(f"Chunk analysis error: {str(e)}")
        return [], f"Analysis error: {str(e)}"
|
| 133 |
+
|
| 134 |
+
async def analyze_script(self, script: str, progress: Optional[gr.Progress] = None) -> tuple[List[str], List[str]]:
    """Analyse a whole script chunk by chunk and merge the findings.

    Args:
        script: Full text to analyse.
        progress: Optional Gradio progress tracker; when given it is
            updated before each chunk and once on completion.

    Returns:
        A pair ``(triggers, reasoning)``. ``triggers`` is the sorted list
        of categories found across all chunks, or ``["None"]`` when no
        category was found. ``reasoning`` has one entry per chunk. For
        blank input, or input that yields no chunks, both elements are
        single-item lists holding a status message instead.
    """
    if not script or not script.strip():
        return ["No content provided"], ["No analysis performed"]

    identified_triggers = set()
    reasoning_outputs = []
    chunks = self._chunk_text(script)

    if not chunks:
        return ["Empty text after chunking"], ["No analysis performed"]

    total_chunks = len(chunks)

    for idx, chunk in enumerate(chunks):
        # Compare against None explicitly instead of relying on the
        # truthiness of the tracker object.
        if progress is not None:
            # The fraction and the description are separate arguments; a
            # tuple would be read as (steps_done, total_steps).
            progress(idx / total_chunks, desc=f"Deep analysis of chunk {idx+1}/{total_chunks}")

        chunk_triggers, chunk_reasoning = await self._analyze_chunk(chunk)
        identified_triggers.update(chunk_triggers)
        reasoning_outputs.append(f"Chunk {idx + 1} Analysis:\n{chunk_reasoning}")

        logger.info(f"Processed chunk {idx+1}/{total_chunks}, found triggers: {chunk_triggers}")

    if progress is not None:
        progress(1.0, desc="Analysis complete")

    final_triggers = sorted(identified_triggers) if identified_triggers else ["None"]
    logger.info(f"Final triggers identified: {final_triggers}")
    return final_triggers, reasoning_outputs
|
| 164 |
|
| 165 |
async def analyze_content(
    script: str,
    progress: Optional[gr.Progress] = None
) -> Dict[str, Union[List[str], str]]:
    """Run the analyzer on ``script`` and package the outcome as a dict.

    The per-chunk reasoning returned by ``ContentAnalyzer.analyze_script``
    is scanned with a few textual heuristics for categories the model
    marked as present; if none of them fire, the triggers reported by the
    analyzer itself are used.

    Args:
        script: Text to analyse.
        progress: Optional Gradio progress tracker, forwarded unchanged.

    Returns:
        A dict with the keys ``detected_triggers``, ``confidence``,
        ``model``, ``analysis_timestamp`` and ``analysis_reasoning``.
        When anything raises, the same keys are returned with
        ``confidence`` set to ``"Error"`` plus an ``error`` key holding
        the exception text; the exception is logged, not propagated.
    """
    try:
        # NOTE(review): a new ContentAnalyzer (whose __init__ calls
        # _load_model) is built on every call; consider reusing one
        # instance if it holds no per-request state.
        analyzer = ContentAnalyzer()
        triggers, reasoning_output = await analyzer.analyze_script(script, progress)

        full_reasoning = "\n\n".join(reasoning_output)
        detected_triggers = set()

        # (regex, capture group) pairs for explicit "category is present" markers.
        category_markers = [
            (r'\b(\w+):\s*\+', 1),  # Matches "Category: +"
            (r'\*\*(\w+(?:\s+\w+)?):\*\*[^\n]*?\bMarked with "\+"', 1),  # Matches "**Category:** ... Marked with "+"
            (r'(\w+(?:\s+\w+)?)\s*is clearly present', 1),  # Matches "Category is clearly present"
        ]

        for pattern, group in category_markers:
            for match in re.finditer(pattern, full_reasoning, re.IGNORECASE):
                captured = match.group(group).strip().lower()
                # NOTE(review): substring comparison is intentionally kept
                # as-is; it is loose and can map short words onto
                # unrelated categories.
                detected_triggers.update(
                    predefined_category
                    for predefined_category in analyzer.categories
                    if captured in predefined_category.lower()
                )

        # A category name followed on the same line by an affirmative word.
        for category in analyzer.categories:
            pattern = fr'\b{re.escape(category)}\b.*?(present|evident|indicated|clear|obvious)'
            if re.search(pattern, full_reasoning, re.IGNORECASE):
                detected_triggers.add(category)

        # Fall back to the analyzer's own triggers when the heuristics found nothing.
        final_triggers = sorted(detected_triggers) if detected_triggers else triggers
        if not final_triggers:
            final_triggers = ["None"]

        # Only real category names count as findings; status placeholders
        # coming back from analyze_script (e.g. for blank input) must not
        # be reported as high-confidence triggers.
        known_categories = set(analyzer.categories)
        has_triggers = any(trigger in known_categories for trigger in final_triggers)

        result = {
            "detected_triggers": final_triggers,
            "confidence": "High confidence" if has_triggers else "No triggers found",
            "model": "DeepSeek-R1-Distill-Qwen-1.5B",
            "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "analysis_reasoning": full_reasoning
        }

        logger.info(f"Enhanced analysis complete. Results: {result}")
        return result

    except Exception as e:
        # Top-level boundary for the UI: keep the traceback in the log and
        # hand back a well-formed error payload.
        logger.exception(f"Analysis error: {str(e)}")
        return {
            "detected_triggers": ["Analysis error"],
            "confidence": "Error",
            "model": "DeepSeek-R1-Distill-Qwen-1.5B",
            "analysis_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "analysis_reasoning": str(e),
            "error": str(e)
        }
|
| 224 |
|
| 225 |
if __name__ == "__main__":
    async def _run_interface(script: str):
        """Adapt analyze_content's single dict to the two output components.

        The interface below declares a JSON panel and a reasoning textbox,
        so the handler has to return one value per component.
        """
        result = await analyze_content(script)
        reasoning = str(result.get("analysis_reasoning", ""))
        # The reasoning is shown in its own textbox; keep the JSON panel compact.
        summary = {key: value for key, value in result.items() if key != "analysis_reasoning"}
        return summary, reasoning

    iface = gr.Interface(
        fn=_run_interface,
        inputs=gr.Textbox(lines=12, label="Paste Script Here", placeholder="Enter text to analyze..."),
        outputs=[
            gr.JSON(label="Analysis Results"),
            gr.Textbox(label="Analysis Reasoning", lines=10)
        ],
        title="TREAT - Trigger Analysis for Entertainment Texts",
        description="Deep analysis of scripts for sensitive content using AI",
        allow_flagging="never"
    )
    iface.launch(show_error=True)
|