Spaces:

ErzhanAb
/

Russian_Language_Toxic_Comments

Sleeping

App Files Files Community

ErzhanAb committed on Aug 24

Commit

6fb2a39

verified ·

1 Parent(s): 4686fa6

Update app.py

Browse files

Files changed (1) hide show

app.py +123 -81

app.py CHANGED Viewed

@@ -2,36 +2,57 @@ import os, json, re
 from html import unescape
 import gradio as gr
-import torch
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from nltk.stem.snowball import RussianStemmer
-# =========================
-# 1) Константы/пути
-# =========================
-MODEL_DIR = "."  # файлы (config.json, model.safetensors, tokenizer.*) лежат в корне
-INFER_CFG = os.path.join(MODEL_DIR, "inference_config.json")
-# Порог по умолчанию (если не нашли inference_config.json)
-DEFAULT_THRESHOLD = 0.40
-if os.path.exists(INFER_CFG):
-    try:
-        with open(INFER_CFG, "r", encoding="utf-8") as f:
-            DEFAULT_THRESHOLD = float(json.load(f).get("threshold_val", DEFAULT_THRESHOLD))
-    except Exception:
-        pass
-# =========================
-# 2) Предобработка (та же, что при обучении!)
-# =========================
 _URL_RE   = re.compile(r'https?://\S+|www\.\S+')
 _TAG_RE   = re.compile(r'[@#]\w+')
 _NUM_RE   = re.compile(r'\d+')
 _PUNCT_RE = re.compile(r"[^\w\s]+", flags=re.UNICODE)
 _WS_RE    = re.compile(r"\s+")
-stemmer = RussianStemmer(ignore_stopwords=False)
 def clean_and_stem(s: str) -> str:
     if not isinstance(s, str):
         s = str(s)
@@ -48,85 +69,106 @@ def clean_and_stem(s: str) -> str:
         out.append(t if t in {"url", "tag", "num"} else stemmer.stem(t))
     return " ".join(out)
-# =========================
-# 3) Загрузка модели
-# =========================
-# Читаем локальные файлы — без скачивания с интернета
-tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
-model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only=True)
-model.eval()
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(DEVICE)
-@torch.inference_mode()
-def infer_proba(text: str) -> float:
-    text = clean_and_stem(text)
-    if not text:
         return 0.0
-    enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
-    enc = {k: v.to(DEVICE) for k, v in enc.items()}
-    logits = model(**enc).logits
-    probs = torch.softmax(logits, dim=1).detach().cpu().numpy()[0]
-    return float(probs[1])  # P(toxic)
-# =========================
-# 4) Gradio UI
-# =========================
-TITLE = "Анализатор токсичности (ruBERT-tiny2)"
-DESCRIPTION = (
-    "Введите комментарий на русском языке. Модель вернёт вероятности классов и метку по выбранному порогу."
-)
-CUSTOM_CSS = """
-@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');
-:root { --font: 'Inter', system-ui, -apple-system, Segoe UI, Roboto, sans-serif; }
-"""
-def predict(comment: str, threshold: float):
     comment = (comment or "").strip()
     if not comment:
         return {"Токсичный": 0.0, "Не токсичный": 1.0}, "—"
-    p_toxic = infer_proba(comment)
     pred = "Токсичный" if p_toxic >= threshold else "Не токсичный"
     dist = {"Токсичный": p_toxic, "Не токсичный": 1 - p_toxic}
-    expl = f"Порог: {threshold:.2f} • Вероятность токсичности: {p_toxic:.3f} → Предсказание: **{pred}**"
     return dist, expl
-with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"),
-               css=CUSTOM_CSS) as demo:
     gr.Markdown(f"# {TITLE}")
     gr.Markdown(DESCRIPTION)
     with gr.Row():
         with gr.Column(scale=2):
-            inp = gr.Textbox(label="Текст комментария", lines=6, placeholder="Напишите что-нибудь…")
-            thr = gr.Slider(label="Порог классификации", minimum=0.0, maximum=1.0,
-                            step=0.01, value=DEFAULT_THRESHOLD)
             with gr.Row():
-                btn = gr.Button("Анализ", variant="primary")
-                clr = gr.Button("Очистить", variant="secondary")
         with gr.Column(scale=1):
-            out_label = gr.Label(label="Распределение по классам", num_top_classes=2)
-            out_txt = gr.Markdown()
-    examples = gr.Examples(
-        examples=[
-            ["да ты что, совсем с ума сошёл? это полный бред!", DEFAULT_THRESHOLD],
-            ["спасибо за помощь, очень полезный совет!", DEFAULT_THRESHOLD],
-        ],
-        inputs=[inp, thr],
-        label="Примеры"
-    )
-    btn.click(predict, [inp, thr], [out_label, out_txt])
-    inp.submit(predict, [inp, thr], [out_label, out_txt])
-    def _clear():
-        return "", DEFAULT_THRESHOLD, {"Токсичный": 0.0, "Не токсичный": 1.0}, "—"
-    clr.click(_clear, [], [inp, thr, out_label, out_txt])
 if __name__ == "__main__":
-    # SSR по умолчанию у новых версий Gradio; дополнительных флагов не нужно
     demo.launch()

 from html import unescape
 import gradio as gr
+import numpy as np
+# ====== TF-IDF + LR (joblib / sklearn) ======
+# Best-effort load of the serialized pipeline: PIPE stays None when the import
+# or the load fails, which infer_tfidf() checks before predicting.
+# NOTE(review): joblib.load unpickles the file, so model.joblib must come from
+# a trusted source.
+PIPE = None
+try:
+    import joblib
+    PIPE = joblib.load("model.joblib")  # saved TF-IDF+LR pipeline
+except Exception as e:
+    PIPE = None
+    print(f"[WARN] Не удалось загрузить model.joblib: {e}")
+# ====== Transformer (ruBERT-tiny2) ======
+# Holder for the transformer objects; "model" and "tokenizer" remain None when
+# loading fails, which infer_transformer() checks before predicting.
+TRANSFORMER = {"model": None, "tokenizer": None, "device": "cpu"}
+try:
+    import torch
+    from transformers import AutoTokenizer, AutoModelForSequenceClassification
+    MODEL_DIR = "."  # config.json, model.safetensors, tokenizer.* are in the root directory
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    # local_files_only=True: read the checkpoint from MODEL_DIR instead of downloading it
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only=True)
+    model.to(device).eval()
+    TRANSFORMER["model"] = model
+    TRANSFORMER["tokenizer"] = tokenizer
+    TRANSFORMER["device"] = device
+except Exception as e:
+    # Any import or load failure is only logged; the holder keeps its None entries.
+    print(f"[WARN] Не удалось загрузить ruBERT: {e}")
+# ====== Default threshold ======
+DEFAULT_THRESHOLD = 0.70  # as requested
+# if inference_config.json from the transformer training exists, pick up the recommended value
+try:
+    if os.path.exists("inference_config.json"):
+        with open("inference_config.json", "r", encoding="utf-8") as f:
+            cfg = json.load(f)
+            # "threshold_val" overrides the default; a missing key keeps the current value
+            DEFAULT_THRESHOLD = float(cfg.get("threshold_val", DEFAULT_THRESHOLD))
+except Exception:
+    # NOTE(review): an unreadable or malformed config is ignored silently and
+    # the value assigned above is kept; consider logging the failure.
+    pass
+# ====== Preprocessing for the transformer (same as in training) ======
 from nltk.stem.snowball import RussianStemmer
+# Snowball stemmer for Russian; clean_and_stem() applies it to every token
+# except the placeholders "url", "tag" and "num".
+stemmer = RussianStemmer(ignore_stopwords=False)
+# http:// or https:// URLs and "www."-prefixed addresses
 _URL_RE   = re.compile(r'https?://\S+|www\.\S+')
+# "@" or "#" followed by word characters
 _TAG_RE   = re.compile(r'[@#]\w+')
+# runs of digits
 _NUM_RE   = re.compile(r'\d+')
+# runs of characters that are neither word characters nor whitespace
 _PUNCT_RE = re.compile(r"[^\w\s]+", flags=re.UNICODE)
+# runs of whitespace
 _WS_RE    = re.compile(r"\s+")
 def clean_and_stem(s: str) -> str:
     if not isinstance(s, str):
         s = str(s)
         out.append(t if t in {"url", "tag", "num"} else stemmer.stem(t))
     return " ".join(out)
+# ====== Inference ======
+def infer_tfidf(text: str) -> float:
+    """Return P(toxic) from the TF-IDF+LR pipeline.
+
+    The pipeline already has its own preprocessor, so ``text`` is passed as is.
+    Returns 0.0 when the pipeline was not loaded (``PIPE is None``).
+    """
+    if PIPE is None:
         return 0.0
+    # Predict on a one-element batch and take row 0, column 1 as P(toxic)
+    proba = PIPE.predict_proba([text])[0, 1]
+    return float(proba)
+def infer_transformer(text: str) -> float:
+    """Return P(toxic) from ruBERT-tiny2 (local checkpoint).
+
+    Returns 0.0 when the model was not loaded, or when ``text`` is empty after
+    ``clean_and_stem``.
+    """
+    if TRANSFORMER["model"] is None:
+        return 0.0
+    # Local import: the module-level import of torch sits inside a try block
+    import torch
+    text = clean_and_stem(text)
+    if not text:
+        return 0.0
+    # Tokenize into PyTorch tensors, truncating to at most 256 tokens
+    tok = TRANSFORMER["tokenizer"](text, return_tensors="pt", truncation=True, max_length=256)
+    # Move every input tensor to the device recorded when the model was loaded
+    tok = {k: v.to(TRANSFORMER["device"]) for k, v in tok.items()}
+    with torch.inference_mode():
+        logits = TRANSFORMER["model"](**tok).logits
+        # Softmax over dim 1; entry [0, 1] is reported as the toxic probability
+        p = torch.softmax(logits, dim=1)[0, 1].detach().cpu().item()
+    return float(p)
+def predict(model_name: str, comment: str, threshold: float):
+    """Score ``comment`` with the selected model and label it by ``threshold``.
+
+    Returns a ``(dist, expl)`` pair: ``dist`` maps the two class labels to
+    probabilities and ``expl`` is a Markdown summary of the model, threshold,
+    probability and prediction. A blank comment returns a fixed distribution
+    and "—" without calling any model.
+    """
     comment = (comment or "").strip()
     if not comment:
         return {"Токсичный": 0.0, "Не токсичный": 1.0}, "—"
+    # Any model_name other than the ruBERT label falls through to TF-IDF.
+    # NOTE(review): both infer_* helpers return 0.0 when their model is not
+    # loaded, so a missing model is displayed as a toxic probability of 0.0.
+    if model_name == "ruBERT-tiny2 (трансформер)":
+        p_toxic = infer_transformer(comment)
+    else:  # TF-IDF + logistic regression
+        p_toxic = infer_tfidf(comment)
     pred = "Токсичный" if p_toxic >= threshold else "Не токсичный"
     dist = {"Токсичный": p_toxic, "Не токсичный": 1 - p_toxic}
+    expl = (
+        f"Модель: **{model_name}**  \n"
+        f"Порог: **{threshold:.2f}**  \n"
+        f"Вероятность токсичности: **{p_toxic:.3f}**  \n"
+        f"Предсказание: **{pred}**"
+    )
     return dist, expl
+def clear_all():
+    """Return reset values for the form.
+
+    The tuple order matches the outputs wired to the clear button: model
+    dropdown, comment textbox, threshold slider, result label, result Markdown.
+    """
+    return "ruBERT-tiny2 (трансформер)", "", DEFAULT_THRESHOLD, {"Токсичный": 0.0, "Не токсичный": 1.0}, "—"
+# ====== UI ======
+# Page heading and short instruction rendered at the top of the app.
+# NOTE(review): DESCRIPTION hard-codes 0.70, while DEFAULT_THRESHOLD can be
+# overridden from inference_config.json; the two may disagree.
+TITLE = "Анализатор токсичности (две модели)"
+DESCRIPTION = "Выберите модель, задайте порог (по умолчанию 0.70) и введите комментарий."
+# Custom CSS passed to gr.Blocks: imports the Inter font and sets the --font variable.
+CUSTOM_CSS = """
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');
+:root { --font: 'Inter', system-ui, -apple-system, Segoe UI, Roboto, sans-serif; }
+"""
+# Markdown describing both models; rendered below the form.
+ABOUT_MD = """
+### Параметры и описание моделей
+**1) ruBERT-tiny2 (трансформер)**
+- База: `cointegrated/rubert-tiny2` (BERT-tiny для русского).
+- Токенизация: BERT WordPiece.
+- Предобработка: удаление пунктуации, нормализация спец-токенов (`url`, `tag`, `num`), стемминг Snowball.
+- Обучение: 10 эпох с early stopping (по macro-F1), class weights (balanced).
+- Рекомендованный порог по валидации: ~**0.70**.
+**2) TF-IDF + Логистическая регрессия**
+- Векторизация: `TfidfVectorizer(analyzer="char_wb", ngram_range=(4,5), max_features=200k, min_df≈1.75e-4, max_df≈0.96)`.
+- Классификатор: `LogisticRegression(penalty="l1", solver="liblinear", C≈5.52, class_weight="balanced", max_iter=5000, tol≈2.4e-4)`.
+- Рекомендованный порог (по ранее полученным метрикам): ~**0.40**.
+**Порог** можно свободно менять слайдером — выберите баланс precision/recall под задачу.
+"""
+# Build the Gradio page: inputs in the wider left column, results in the right one.
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css=CUSTOM_CSS) as demo:
     gr.Markdown(f"# {TITLE}")
     gr.Markdown(DESCRIPTION)
     with gr.Row():
         with gr.Column(scale=2):
+            # The choice strings must match the label compared in predict()
+            model_sel = gr.Dropdown(
+                ["ruBERT-tiny2 (трансформер)", "TF-IDF + Логистическая регрессия"],
+                value="ruBERT-tiny2 (трансформер)",
+                label="Модель"
+            )
+            comment_input = gr.Textbox(label="Текст комментария", lines=6, placeholder="Напишите что-нибудь…")
+            thr = gr.Slider(label="Порог классификации", minimum=0.0, maximum=1.0, value=DEFAULT_THRESHOLD, step=0.01)
             with gr.Row():
+                analyze_btn = gr.Button("Анализ", variant="primary")
+                clear_btn = gr.Button("Очистить", variant="secondary")
         with gr.Column(scale=1):
+            result_label = gr.Label(label="Распределение по классам", num_top_classes=2)
+            result_md = gr.Markdown()
+    gr.Markdown(ABOUT_MD)
+    # The button click and the textbox submit event both run predict()
+    analyze_btn.click(predict, [model_sel, comment_input, thr], [result_label, result_md])
+    comment_input.submit(predict, [model_sel, comment_input, thr], [result_label, result_md])
+    # clear_all() takes no inputs and returns one value per listed output component
+    clear_btn.click(clear_all, [], [model_sel, comment_input, thr, result_label, result_md])
 if __name__ == "__main__":
     demo.launch()