Files
nlp/nlp_processor.py
2025-09-30 23:43:51 +04:00

59 lines
1.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import logging
import joblib
import nltk
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK resources this module relies on: the stop-word corpus read
# by preprocess_comment and the 'punkt_tab' data (presumably the tokenizer
# models behind word_tokenize — confirm against the installed NLTK version).
# NOTE(review): these calls run on every import of the module.
nltk.download('stopwords')
nltk.download('punkt_tab')
# Load the (model, vectorizer) pair at import time.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — model.pkl must come from a trusted source.
model, vectorizer = joblib.load('model.pkl')
def model_reload():
    """Reload the model and vectorizer from ``model.pkl``.

    Prints a notice, then rebinds the module-level ``model`` and
    ``vectorizer`` globals to the pair returned by ``joblib.load``.
    """
    global model, vectorizer
    print('Reloading the model')
    fresh_model, fresh_vectorizer = joblib.load('model.pkl')
    model, vectorizer = fresh_model, fresh_vectorizer
# Lazily built cache of the Russian stop-word set, shared by all calls.
_RUSSIAN_STOP_WORDS = None


def _russian_stop_words():
    """Return the Russian stop-word set, reading the corpus on first use only."""
    global _RUSSIAN_STOP_WORDS
    if _RUSSIAN_STOP_WORDS is None:
        _RUSSIAN_STOP_WORDS = frozenset(stopwords.words('russian'))
    return _RUSSIAN_STOP_WORDS


def preprocess_comment(comment):
    """Tokenize ``comment`` and drop Russian stop words.

    Args:
        comment: Raw comment text.

    Returns:
        The remaining tokens, in their original case and order, joined by
        single spaces. Stop-word matching is done on the lower-cased token.
    """
    # NOTE(review): word_tokenize runs with its default language setting here;
    # before switching it, confirm the choice matches how the model's training
    # text was tokenized.
    tokens = word_tokenize(comment)
    # The stop-word set never changes between calls, so it is built once
    # instead of being re-read from the corpus for every comment.
    stop_words = _russian_stop_words()
    kept_tokens = [
        token for token in tokens if token.lower() not in stop_words
    ]
    return ' '.join(kept_tokens)
def comment_to_vector(comment):
    """Vectorize a single comment with the module-level ``vectorizer``.

    The comment is wrapped in a one-element list before being handed to
    ``vectorizer.transform``; the transform result is returned unchanged.
    """
    return vectorizer.transform([comment])
def _repair_label(label):
    """Undo UTF-8-read-as-Latin-1 mojibake in a class label, if present."""
    try:
        return label.encode('latin1').decode('utf-8')
    except UnicodeError:
        # The label is not mojibake (e.g. it is already proper Cyrillic),
        # so it is returned untouched instead of raising.
        return label


# Determine the category of a comment.
def predict_category(comment):
    """Return the three most likely categories for ``comment``.

    Args:
        comment: Comment text; it is vectorized as-is.

    Returns:
        A list of at most three dicts, best match first, each with the keys
        ``'category'`` (class label) and ``'weight'`` (softmax
        pseudo-probability as a ``float``).
    """
    # Turn the text into a feature vector.
    vector = comment_to_vector(comment)
    # Raw per-class confidence scores from the decision function.
    scores = np.asarray(model.decision_function(vector))
    if scores.ndim == 1:
        # Assumes a scikit-learn style model: in the binary case the decision
        # function yields one score per sample, positive meaning classes_[1].
        # Expand it to one column per class so the ranking below still works.
        scores = np.column_stack((-scores, scores))
    row = scores[0]  # a single comment was vectorized, so there is one row
    # Softmax turns the scores into pseudo-probabilities; subtracting the
    # maximum first keeps np.exp from overflowing.
    exp_scores = np.exp(row - np.max(row))
    probabilities = exp_scores / np.sum(exp_scores)
    # Indices of the three highest-weighted classes, best first.
    top_3_indices = np.argsort(probabilities)[::-1][:3]
    return [
        {
            'category': _repair_label(model.classes_[i]),
            'weight': float(probabilities[i]),
        }
        for i in top_3_indices
    ]
# Manual smoke test. Kept behind a ``__main__`` guard so that importing this
# module does not run a prediction and print to stdout.
if __name__ == "__main__":
    comment = "ремешок часы"  # sample comment
    result = predict_category(comment)
    for item in result:
        print(f"Категория: {item['category']}, Вес: {item['weight']:.4f}")