This commit is contained in:
xds
2025-09-30 23:43:51 +04:00
commit 6f744b8066
7 changed files with 2327 additions and 0 deletions

59
nlp_processor.py Normal file
View File

@@ -0,0 +1,59 @@
import logging
import joblib
import nltk
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK data packages this module relies on. This runs on every
# import of the module. The stop-word corpus is read by preprocess_comment;
# 'punkt_tab' is presumably what word_tokenize needs -- verify against NLTK.
for _nltk_package in ('stopwords', 'punkt_tab'):
    nltk.download(_nltk_package)

# 'model.pkl' is expected to unpack into a (model, vectorizer) pair.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load a model.pkl that comes from a trusted source.
model, vectorizer = joblib.load('model.pkl')
def model_reload():
    """Reload ``model.pkl`` and rebind the module-level model and vectorizer.

    Prints a short notice to stdout, then loads the file with ``joblib.load``
    and replaces the globals ``model`` and ``vectorizer`` with the two
    objects it contains. Returns ``None``.
    """
    global model, vectorizer
    print('Reloading the model')
    fresh_model, fresh_vectorizer = joblib.load('model.pkl')
    model, vectorizer = fresh_model, fresh_vectorizer
_RUSSIAN_STOP_WORDS = None


def _russian_stop_words():
    """Return the NLTK Russian stop words as a frozenset, built on first use."""
    global _RUSSIAN_STOP_WORDS
    if _RUSSIAN_STOP_WORDS is None:
        _RUSSIAN_STOP_WORDS = frozenset(stopwords.words('russian'))
    return _RUSSIAN_STOP_WORDS


def preprocess_comment(comment):
    """Tokenize *comment* and remove Russian stop words.

    Args:
        comment: Text to clean; passed unchanged to ``word_tokenize``.

    Returns:
        The remaining tokens joined with single spaces. Tokens keep their
        original casing; only the stop-word comparison is case-insensitive.
    """
    tokens = word_tokenize(comment)
    # The stop-word set is the same for every comment, so it is cached
    # instead of being rebuilt from the corpus on each call.
    stop_words = _russian_stop_words()
    kept_tokens = [token for token in tokens if token.lower() not in stop_words]
    return ' '.join(kept_tokens)
def comment_to_vector(comment):
    """Vectorize a single comment with the module-level ``vectorizer``.

    The comment is wrapped in a one-element list before being handed to
    ``vectorizer.transform``; the result of that call is returned as is.
    """
    single_document_batch = [comment]
    return vectorizer.transform(single_document_batch)
def _repair_label(label):
    """Return *label* as ``str``, undoing UTF-8-read-as-Latin-1 mojibake if present."""
    try:
        return label.encode('latin1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # The round trip only works for mojibake. A label that is already
        # correct text (e.g. real Cyrillic) cannot be encoded as Latin-1 or
        # re-decoded as UTF-8, so it is returned unchanged instead of raising.
        return str(label)


# Determine the category of a comment
def predict_category(comment, top_n=3):
    """Return the highest-weighted categories for *comment*.

    Args:
        comment: Text to classify; it is vectorized with the module-level
            ``vectorizer`` and scored with the module-level ``model``.
        top_n: Maximum number of categories to return (default 3).

    Returns:
        A list of ``{'category': str, 'weight': float}`` dicts ordered from
        the highest weight to the lowest. The weights are a softmax over the
        model's decision scores, i.e. pseudo-probabilities rather than
        calibrated probabilities.
    """
    # Turn the text into a feature vector (a batch holding one document).
    vector = vectorizer.transform([comment])

    # Raw scores from the decision function; row 0 is the single input comment.
    # NOTE(review): assumes one score per class (a multiclass model) -- confirm.
    scores = model.decision_function(vector)[0]

    # Softmax over the scores. Subtracting the maximum first keeps np.exp from
    # overflowing and does not change the result.
    exp_scores = np.exp(scores - np.max(scores))
    weights = exp_scores / np.sum(exp_scores)

    # Indices of the top_n largest weights, best first.
    best_indices = np.argsort(weights)[::-1][:top_n]
    return [
        {'category': _repair_label(model.classes_[i]), 'weight': float(weights[i])}
        for i in best_indices
    ]
# Smoke test. Guarded so that it runs only when the file is executed as a
# script; importing the module does not run a prediction or print anything.
if __name__ == '__main__':
    comment = "ремешок часы"  # sample comment
    result = predict_category(comment)
    for item in result:
        print(f"Категория: {item['category']}, Вес: {item['weight']:.4f}")