nlp/nlp_teacher.py
import re
import sys
from io import StringIO
import joblib
import nltk
import pandas as pd
import requests
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
from requests.compat import chardet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
import nlp_processor
nltk.download('stopwords')
nltk.download('punkt_tab')
morph = MorphAnalyzer()
def fix_encoding(text):
    if isinstance(text, str):
        try:
            # Re-decode strings that were read with the wrong codec (latin1 instead of UTF-8)
            return text.encode('latin1').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            return text
    return text
# Data loading
def reteach() -> float:
    # URL of the API that returns the transaction history as CSV
    api_url = "https://luminic.space/api/spaces/67af3c0f652da946a7dd9931/transactions/csv"
    headers = {
        "Authorization": "Bearer eyJhbGciOiJIUzM4NCJ9.eyJzdWIiOiJ2b3JvbmludiIsImlhdCI6MTc0NDAzMDk5MywiZXhwIjoxNzQ0ODk0OTkzfQ.mybRBMsjPfJ8_kbDfJiPSr8UOIWS6WYddSFptplBTIfnPQm9SsdH3ixlEM-UY0CQ",
    }
    # Send the GET request
    response = requests.get(api_url, headers=headers)
    print(response)
    # Check that the request succeeded
    if response.status_code == 200:
        encoding = chardet.detect(response.content)['encoding']
        print(f"Detected encoding: {encoding}")
        # Use StringIO to turn the response text into a file-like object
        csv_data = StringIO(response.text)
        # Read the CSV into a DataFrame
        data = pd.read_csv(csv_data)
        # result = chardet.detect(data["comment"])
        # print(result)
        print(data.head(3))
        # Tokenization and stop-word removal
        stop_words = set(stopwords.words('russian'))

        def tokenize_text(text):
            # Strip amounts such as "100 руб" or "15,50 коп" before tokenizing
            text = re.sub(r'\d+([.,]\d+)?\s*(р|руб|коп)?', '', text)
            # Lemmatization with handling of monetary units
            tokens = []
            for token in word_tokenize(text.lower()):
                if token in {'$', '₽', '€'}:  # currency symbols (assumed; the original characters were stripped by the page's Unicode filter)
                    continue
                parsed = morph.parse(token)[0]
                if parsed.tag.POS in {'NOUN', 'VERB', 'ADJ', 'ADV'}:
                    tokens.append(parsed.normal_form)
            return ' '.join(t for t in tokens if t not in stop_words and len(t) > 2)

        data['comment'] = data['comment'].apply(fix_encoding)
        print(data.head(3))
        data['processed_comment'] = data['comment'].apply(tokenize_text)
        # Normalization: lowercase and remove stray punctuation
        data['processed_comment'] = data['processed_comment'].apply(lambda x: x.lower())
        data['processed_comment'] = data['processed_comment'].apply(lambda x: x.replace(',', ''))
        data['processed_comment'] = data['processed_comment'].apply(lambda x: x.replace('.', ''))
        # Convert the text into numeric TF-IDF vectors
        vectorizer = TfidfVectorizer(
            ngram_range=(1, 3),
            max_features=15000,
            min_df=2,
            max_df=0.8,
            analyzer='char_wb',  # character n-grams within word boundaries: important for typo robustness
            sublinear_tf=True
        )
        X = vectorizer.fit_transform(data['processed_comment'])
        # Split the data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, data['category'], test_size=0.2, random_state=42)
        # Train the classification model
        model = LinearSVC()
        model.fit(X_train, y_train)
        # Evaluate the model
        y_pred = model.predict(X_test)
        print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
        # Save the model and vectorizer to a file
        joblib.dump((model, vectorizer), 'model.pkl')
        nlp_processor.model_reload()
        return accuracy_score(y_test, y_pred)
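

# A minimal usage sketch, not part of the original module: retrain on the latest
# transactions and then load the freshly saved (model, vectorizer) pair back, matching
# the 'model.pkl' layout that reteach() writes above. It assumes the API request
# succeeds (so reteach() returns an accuracy) and passes a raw comment string to the
# vectorizer only to illustrate the transform/predict call shape; the real pipeline in
# nlp_processor presumably applies the same preprocessing as tokenize_text first.
if __name__ == '__main__':
    accuracy = reteach()
    print(f"Retrained model accuracy: {accuracy:.2f}")

    # Reload the saved pair and classify a hypothetical transaction comment
    loaded_model, loaded_vectorizer = joblib.load('model.pkl')
    sample_comment = "оплата кофе 250 руб"  # hypothetical example
    features = loaded_vectorizer.transform([sample_comment])
    print(loaded_model.predict(features)[0])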