init
119
nlp_teacher.py
Normal file
@@ -0,0 +1,119 @@
import re
import sys
from io import StringIO

import joblib
import nltk
import pandas as pd
import requests
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from pymorphy2 import MorphAnalyzer
from requests.compat import chardet

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

import nlp_processor

nltk.download('stopwords')
nltk.download('punkt_tab')
morph = MorphAnalyzer()


def fix_encoding(text):
    if isinstance(text, str):
        try:
            # Re-decode strings that were decoded with the wrong codec
            # (UTF-8 bytes read back as latin-1)
            return text.encode('latin1').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Leave the text as-is if it does not round-trip
            return text
    return text
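
# A quick illustration (hypothetical inputs) of the mojibake fix_encoding
# repairs: the Cyrillic string 'вова' encoded as UTF-8 but mis-decoded as
# latin-1 renders as 'Ð²Ð¾Ð²Ð°'; re-encoding and decoding restores it.
#
#   >>> fix_encoding('Ð²Ð¾Ð²Ð°')
#   'вова'
#   >>> fix_encoding('plain ascii')  # text that round-trips cleanly is unchanged
#   'plain ascii'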


# Load the data and retrain the model

def reteach() -> float:
    # URL of your API that returns the transactions as CSV
    api_url = "https://luminic.space/api/spaces/67af3c0f652da946a7dd9931/transactions/csv"

    headers = {
        "Authorization": "Bearer eyJhbGciOiJIUzM4NCJ9.eyJzdWIiOiJ2b3JvbmludiIsImlhdCI6MTc0NDAzMDk5MywiZXhwIjoxNzQ0ODk0OTkzfQ.mybRBMsjPfJ8_kbDfJiPSr8UOIWS6WYddSFptplBTIfnPQm9SsdH3ixlEM-UY0CQ",
    }
    # Send the GET request
    response = requests.get(api_url, headers=headers)

    print(response)
    # Fail loudly instead of silently returning None on an HTTP error
    if response.status_code != 200:
        raise RuntimeError(f"CSV download failed: HTTP {response.status_code}")

    # Detect the payload's encoding and apply it before reading .text,
    # so requests decodes the body with the detected codec
    encoding = chardet.detect(response.content)['encoding']
    print(f"Detected encoding: {encoding}")
    response.encoding = encoding
    # Wrap the decoded text in a file-like object for pandas
    csv_data = StringIO(response.text)

    # Read the CSV into a DataFrame
    data = pd.read_csv(csv_data)
    print(data.head(3))

    # Tokenisation and stop-word removal
    stop_words = set(stopwords.words('russian'))

    def tokenize_text(text):
        # Strip amounts such as "250 руб"; list 'руб'/'коп' before 'р' so the
        # longer alternatives match first
        text = re.sub(r'\d+([.,]\d+)?\s*(руб|коп|р)?', '', text)

        # Lemmatise, skipping currency symbols and keeping only content
        # parts of speech
        tokens = []
        for token in word_tokenize(text.lower()):
            if token in {'$', '€', '₽'}:
                continue
            parsed = morph.parse(token)[0]
            if parsed.tag.POS in {'NOUN', 'VERB', 'ADJ', 'ADV'}:
                tokens.append(parsed.normal_form)

        return ' '.join([t for t in tokens if t not in stop_words and len(t) > 2])
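
    # For example (hypothetical input): 'Кофе в кафе 250 руб' comes out as
    # 'кофе кафе'. The amount and currency word are stripped by the regex,
    # the preposition is dropped by the part-of-speech filter, and stop words
    # and tokens shorter than three characters are discarded.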

    # Repair mojibake in the raw comments, then tokenise
    data['comment'] = data['comment'].apply(fix_encoding)
    print(data.head(3))
    data['processed_comment'] = data['comment'].apply(tokenize_text)

    # Normalise and strip leftover punctuation
    data['processed_comment'] = (
        data['processed_comment']
        .str.lower()
        .str.replace(',', '', regex=False)
        .str.replace('.', '', regex=False)
    )

    # Turn the processed text into numeric feature vectors
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 3),
        max_features=15000,
        min_df=2,
        max_df=0.8,
        analyzer='char_wb',  # character n-grams: important for typo tolerance
        sublinear_tf=True,
    )
    X = vectorizer.fit_transform(data['processed_comment'])
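
    # For intuition: with analyzer='char_wb', the 1-3-character n-grams are
    # built within word boundaries, so a misspelling such as 'кофэ' still
    # shares many of its n-grams ('ко', 'оф', 'коф', ...) with 'кофе'; that
    # overlap is what makes the features tolerant of typos.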

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, data['category'], test_size=0.2, random_state=42
    )

    # Train the classifier
    model = LinearSVC()
    model.fit(X_train, y_train)

    # Evaluate on the held-out test set
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    # Save the model and its vectorizer together, then hot-reload the
    # serving side so it picks up the new artifacts
    joblib.dump((model, vectorizer), 'model.pkl')
    nlp_processor.model_reload()
    return accuracy
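
For reference, a minimal sketch of how the (model, vectorizer) pair saved above might be consumed on the prediction side. The predict_category helper is an assumption for illustration; the actual loading logic lives in nlp_processor.model_reload(), whose internals are not part of this commit.

import joblib

# Hypothetical consumer of model.pkl; reteach() dumps the artifacts as a
# (model, vectorizer) tuple, so both come back from a single load
model, vectorizer = joblib.load('model.pkl')

def predict_category(comment: str) -> str:
    # The same preprocessing used at training time (fix_encoding followed
    # by tokenize_text) should be applied here as well; omitted for brevity
    features = vectorizer.transform([comment])
    return model.predict(features)[0]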