diff --git a/ai/preprocess.py b/ai/preprocess.py
deleted file mode 100644
index afd8dbd8..00000000
--- a/ai/preprocess.py
+++ /dev/null
@@ -1,75 +0,0 @@
-import re
-import nltk
-from bs4 import BeautifulSoup
-from nltk.corpus import stopwords
-from pymystem3 import Mystem
-from string import punctuation
-from transformers import BertTokenizer
-
-nltk.download("stopwords")
-
-
-def get_clear_text(text):
-    soup = BeautifulSoup(text, 'html.parser')
-
-    # extract the plain text from the HTML document without tags
-    clear_text = ''
-    for tag in soup.find_all():
-        clear_text += tag.string or ''
-
-    clear_text = re.sub(pattern='[\u202F\u00A0\n]+', repl=' ', string=clear_text)
-
-    # only words
-    clear_text = re.sub(pattern='[^A-ZА-ЯЁ -]', repl='', string=clear_text, flags=re.IGNORECASE)
-
-    clear_text = re.sub(pattern='\s+', repl=' ', string=clear_text)
-
-    clear_text = clear_text.lower()
-
-    mystem = Mystem()
-    russian_stopwords = stopwords.words("russian")
-
-    tokens = mystem.lemmatize(clear_text)
-    tokens = [token for token in tokens if token not in russian_stopwords \
-              and token != " " \
-              and token.strip() not in punctuation]
-
-    clear_text = " ".join(tokens)
-
-    return clear_text
-
-
-# if __name__ == '__main__':
-#
-#     # initialize the tokenizer with the pre-trained BERT model and vocabulary
-#     tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
-#
-#     # split each text into smaller segments of maximum length 512
-#     max_length = 512
-#     segmented_texts = []
-#     for text in [clear_text1, clear_text2]:
-#         segmented_text = []
-#         for i in range(0, len(text), max_length):
-#             segment = text[i:i+max_length]
-#             segmented_text.append(segment)
-#         segmented_texts.append(segmented_text)
-#
-#     # tokenize each segment using the BERT tokenizer
-#     tokenized_texts = []
-#     for segmented_text in segmented_texts:
-#         tokenized_text = []
-#         for segment in segmented_text:
-#             segment_tokens = tokenizer.tokenize(segment)
-#             segment_tokens = ['[CLS]'] + segment_tokens + ['[SEP]']
-#             tokenized_text.append(segment_tokens)
-#         tokenized_texts.append(tokenized_text)
-#
-#     input_ids = []
-#     for tokenized_text in tokenized_texts:
-#         input_id = []
-#         for segment_tokens in tokenized_text:
-#             segment_id = tokenizer.convert_tokens_to_ids(segment_tokens)
-#             input_id.append(segment_id)
-#         input_ids.append(input_id)
-#
-#     print(input_ids)
diff --git a/auth/email.py b/auth/email.py
index 7290c518..a42cf1f7 100644
--- a/auth/email.py
+++ b/auth/email.py
@@ -1,4 +1,4 @@
-from httpx import AsyncClient
+import requests
 
 from settings import MAILGUN_API_KEY, MAILGUN_DOMAIN
 
@@ -24,13 +24,7 @@ async def send_auth_email(user, token, lang="ru", template="email_confirmation")
         print("[auth.email] payload: %r" % payload)  # debug
         # print('http://localhost:3000/?modal=auth&mode=confirm-email&token=%s' % token)
-        async with AsyncClient() as client:
-            response = await client.post(api_url, headers=headers, data=gql)
-            if response.status_code != 200:
-                return False, None
-            r = response.json()
-            api_url, auth=("api", MAILGUN_API_KEY), data=payload
-        )
-        response.raise_for_status()
+        response = requests.post(api_url, auth=("api", MAILGUN_API_KEY), data=payload)
+        response.raise_for_status()
     except Exception as e:
         print(e)
diff --git a/requirements.txt b/requirements.txt
index 7d58a3f2..659f7c2f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,9 +24,6 @@ python-dateutil~=2.8.2
 beautifulsoup4~=4.11.1
 lxml
 sentry-sdk>=1.14.0
-nltk~=3.8.1
-pymystem3~=0.2.0
-transformers
 boto3~=1.28.2
 botocore~=1.31.2
 python-multipart~=0.0.6