# nlp/nlp_processor.py

import logging

import joblib
import nltk
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords


# Fetch the NLTK resources this module relies on: the stop-word corpus read by
# preprocess_comment and the 'punkt_tab' data (presumably what word_tokenize
# needs — TODO confirm).
# NOTE(review): these calls execute every time the module is imported.
nltk.download('stopwords')
nltk.download('punkt_tab')

# Load the persisted (model, vectorizer) pair from 'model.pkl'. The path is
# relative, so it is resolved against the process's current working directory.
# NOTE(review): joblib persistence is pickle-based — only load files that come
# from a trusted source.
model, vectorizer = joblib.load('model.pkl')


def model_reload():
    """Re-read 'model.pkl' and rebind the module-level model and vectorizer.

    Prints a short notice to stdout before loading. The file is expected to
    contain a two-item sequence that unpacks into (model, vectorizer).
    """
    global model, vectorizer
    print('Reloading the model')
    reloaded_pair = joblib.load('model.pkl')
    # Unpack in one step so both globals are replaced together.
    model, vectorizer = reloaded_pair

# Lazily populated cache of the Russian stop-word set (see _russian_stop_words).
_RUSSIAN_STOP_WORDS = None


def _russian_stop_words():
    """Return the Russian stop-word set, reading the NLTK corpus on first use only."""
    global _RUSSIAN_STOP_WORDS
    if _RUSSIAN_STOP_WORDS is None:
        # The corpus is only read on the first successful call; if loading
        # raises, the cache stays empty and the next call retries.
        _RUSSIAN_STOP_WORDS = frozenset(stopwords.words('russian'))
    return _RUSSIAN_STOP_WORDS


def preprocess_comment(comment):
    """Remove Russian stop words from a comment.

    Args:
        comment: Raw comment text.

    Returns:
        The remaining tokens joined with single spaces. Tokens keep their
        original casing; only the stop-word comparison is case-insensitive.
    """
    stop_words = _russian_stop_words()
    tokens = word_tokenize(comment)
    filtered_tokens = [t for t in tokens if t.lower() not in stop_words]
    return ' '.join(filtered_tokens)


def comment_to_vector(comment):
    """Vectorize a single comment with the module-level vectorizer.

    The comment is wrapped in a one-element list before being passed to
    ``vectorizer.transform``; the transform result is returned unchanged.
    """
    return vectorizer.transform([comment])


def _repair_label_encoding(label):
    """Undo UTF-8-read-as-Latin-1 mojibake in a class label when possible.

    Labels that cannot make the latin1 -> utf-8 round trip (for example
    because they are already correct Unicode text) are returned unchanged
    instead of raising.
    """
    text = str(label)
    try:
        return text.encode('latin1').decode('utf-8')
    except UnicodeError:
        # Covers both UnicodeEncodeError (characters outside Latin-1) and
        # UnicodeDecodeError (bytes that are not valid UTF-8).
        return text


# Predict the category of a comment
def predict_category(comment):
    """Return the three highest-scoring categories for a comment.

    Args:
        comment: Comment text to classify.

    Returns:
        A list of up to three dicts, ordered from highest to lowest weight,
        each with a 'category' (label string) and a 'weight' (float).
        Weights are softmax-normalised decision scores, i.e.
        pseudo-probabilities rather than calibrated probabilities.
    """
    # Turn the text into a feature vector
    vector = comment_to_vector(comment)

    # Raw per-class scores from the model's decision function.
    # NOTE(review): the indexing below assumes a 2-D (samples, classes) result,
    # which is what a multiclass model is expected to return — confirm that the
    # persisted model is never binary.
    decision_scores = model.decision_function(vector)

    # Softmax; subtracting the maximum keeps np.exp from overflowing
    exp_scores = np.exp(decision_scores - np.max(decision_scores))
    probabilities = exp_scores / np.sum(exp_scores)

    # Indices of the three largest weights, best first
    top_3_indices = np.argsort(probabilities[0])[::-1][:3]
    top_3_categories = [
        {'category': _repair_label_encoding(model.classes_[i]), 'weight': float(probabilities[0][i])}
        for i in top_3_indices
    ]

    return top_3_categories


# Smoke test: runs only when this file is executed as a script, so importing
# the module no longer triggers a prediction and console output.
if __name__ == '__main__':
    comment = "ремешок часы"  # Example comment
    result = predict_category(comment)
    for item in result:
        print(f"Категория: {item['category']}, Вес: {item['weight']:.4f}")