init

2025-09-30 23:43:51 +04:00
commit 6f744b8066
7 changed files with 2327 additions and 0 deletions
--- a/nlp_processor.py
+++ b/nlp_processor.py
@@ -0,0 +1,59 @@
+import logging
+
+import joblib
+import nltk
+import numpy as np
+from nltk import word_tokenize
+from nltk.corpus import stopwords
+
+
+nltk.download('stopwords')  # runs at import time; corpus read by stopwords.words() in preprocess_comment
+nltk.download('punkt_tab')  # runs at import time; presumably the tokenizer data word_tokenize needs - confirm
+
+model, vectorizer = joblib.load('model.pkl')  # model.pkl must unpack into a (model, vectorizer) pair
+
+
+def model_reload():
+    print('Reloading the model')
+    global model, vectorizer
+    model, vectorizer = joblib.load('model.pkl')
+
+def preprocess_comment(comment):
+    tokens = word_tokenize(comment)
+    stop_words = set(stopwords.words('russian'))
+    filtered_tokens = [t for t in tokens if t.lower() not in stop_words]
+    return ' '.join(filtered_tokens)
+
+
+def comment_to_vector(comment):
+    vector = vectorizer.transform([comment])
+    return vector
+
+
+# Определение категории по комменту
+def predict_category(comment):
+    # Преобразуем текст в вектор
+    vector = vectorizer.transform([comment])
+
+    # Получаем "вероятности" через decision function
+    decision_scores = model.decision_function(vector)
+
+    # Преобразуем scores в "псевдо-вероятности" через softmax
+    exp_scores = np.exp(decision_scores - np.max(decision_scores))
+    probabilities = exp_scores / np.sum(exp_scores)
+
+    # Получаем топ-3 категорий
+    top_3_indices = np.argsort(probabilities[0])[::-1][:3]
+    top_3_categories = [
+        {'category': model.classes_[i].encode('latin1').decode('utf-8'), 'weight': float(probabilities[0][i])}
+        for i in top_3_indices
+    ]
+
+    return top_3_categories
+
+
+# Тестирование
+comment = "ремешок часы"  # Пример комментария
+result = predict_category(comment)
+for item in result:
+    print(f"Категория: {item['category']}, Вес: {item['weight']:.4f}")