import logging

import joblib
import nltk
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords

logging.basicConfig(level=logging.INFO)

nltk.download('stopwords')
nltk.download('punkt_tab')

model, vectorizer = joblib.load('model.pkl')
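
# Assumption (not stated in this file): 'model.pkl' holds a
# (classifier, vectorizer) tuple dumped with joblib, e.g. roughly
#   joblib.dump((LinearSVC().fit(X, y), tfidf), 'model.pkl')
# The decision_function() call below points to a margin-based classifier
# such as LinearSVC, which exposes no predict_proba.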


def model_reload():
    """Re-read the (model, vectorizer) pair from disk, e.g. after retraining."""
    logging.info('Reloading the model')
    global model, vectorizer
    model, vectorizer = joblib.load('model.pkl')


def preprocess_comment(comment):
    """Tokenize a comment and drop Russian stopwords."""
    tokens = word_tokenize(comment)
    stop_words = set(stopwords.words('russian'))
    filtered_tokens = [t for t in tokens if t.lower() not in stop_words]
    return ' '.join(filtered_tokens)


def comment_to_vector(comment):
    """Map a comment to a feature vector using the loaded vectorizer."""
    vector = vectorizer.transform([comment])
    return vector


# Determine the category of a comment
def predict_category(comment):
    # Convert the text into a feature vector
    vector = vectorizer.transform([comment])

    # Get raw per-class scores via the decision function
    decision_scores = model.decision_function(vector)

    # Turn the scores into pseudo-probabilities with a numerically stable softmax
    exp_scores = np.exp(decision_scores - np.max(decision_scores))
    probabilities = exp_scores / np.sum(exp_scores)

    # Take the top-3 categories; decision_scores has shape (1, n_classes)
    top_3_indices = np.argsort(probabilities[0])[::-1][:3]
    top_3_categories = [
        # the encode/decode round-trip repairs class labels stored with a broken encoding
        {'category': model.classes_[i].encode('latin1').decode('utf-8'),
         'weight': float(probabilities[0][i])}
        for i in top_3_indices
    ]

    return top_3_categories
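
# Note: a softmax over decision_function margins is only a heuristic confidence
# score, not a calibrated probability; scikit-learn's CalibratedClassifierCV
# would be the usual route to real probabilities (an option, not what this
# file does).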


# Testing
comment = "ремешок часы"  # sample comment
result = predict_category(comment)
for item in result:
    print(f"Category: {item['category']}, Weight: {item['weight']:.4f}")