core/ai/preprocess.py

import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
from transformers import BertTokenizer
nltk.download("stopwords")


def get_clear_text(text):
    soup = BeautifulSoup(text, 'html.parser')
    # extract the plain text from the HTML document without tags
    # (get_text also keeps text that sits directly inside mixed-content tags)
    clear_text = soup.get_text(separator=' ')
    # collapse narrow/non-breaking spaces and newlines into ordinary spaces
    clear_text = re.sub(pattern=r'[\u202F\u00A0\n]+', repl=' ', string=clear_text)
    # keep only Latin/Cyrillic letters, spaces and hyphens
    clear_text = re.sub(pattern=r'[^A-ZА-ЯЁ -]', repl='', string=clear_text, flags=re.IGNORECASE)
    clear_text = re.sub(pattern=r'\s+', repl=' ', string=clear_text)
    clear_text = clear_text.lower()
    # lemmatize with Mystem, then drop Russian stopwords, bare spaces and punctuation
    mystem = Mystem()
    russian_stopwords = stopwords.words("russian")
    tokens = mystem.lemmatize(clear_text)
    tokens = [token for token in tokens
              if token not in russian_stopwords
              and token != " "
              and token.strip() not in punctuation]
    clear_text = " ".join(tokens)
    return clear_text
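
# Example usage (hypothetical; the sample string below is not from this project):
#
#     raw_html = "<p>Пример текста с <b>тегами</b> и знаками препинания!</p>"
#     cleaned = get_clear_text(raw_html)
#
# The result is expected to be a lowercase, lemmatized string with Russian
# stopwords and punctuation removed, ready for BERT tokenization as in the
# commented-out block below.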
# if __name__ == '__main__':
#
#     # initialize the tokenizer with the pre-trained BERT model and vocabulary
#     tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
#
#     # split each text into smaller segments of maximum length 512
#     # (clear_text1 and clear_text2 are presumably outputs of get_clear_text)
#     max_length = 512
#     segmented_texts = []
#     for text in [clear_text1, clear_text2]:
#         segmented_text = []
#         for i in range(0, len(text), max_length):
#             segment = text[i:i + max_length]
#             segmented_text.append(segment)
#         segmented_texts.append(segmented_text)
#
#     # tokenize each segment using the BERT tokenizer
#     tokenized_texts = []
#     for segmented_text in segmented_texts:
#         tokenized_text = []
#         for segment in segmented_text:
#             segment_tokens = tokenizer.tokenize(segment)
#             segment_tokens = ['[CLS]'] + segment_tokens + ['[SEP]']
#             tokenized_text.append(segment_tokens)
#         tokenized_texts.append(tokenized_text)
#
#     # convert each segment's tokens to vocabulary ids
#     input_ids = []
#     for tokenized_text in tokenized_texts:
#         input_id = []
#         for segment_tokens in tokenized_text:
#             segment_id = tokenizer.convert_tokens_to_ids(segment_tokens)
#             input_id.append(segment_id)
#         input_ids.append(input_id)
#
#     print(input_ids)
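
# A minimal runnable sketch of the same tokenization flow, not part of the
# original module. tokenizer.encode() combines tokenize() and
# convert_tokens_to_ids() and inserts [CLS]/[SEP] automatically. The sample
# HTML strings are made-up placeholders; note that slicing the cleaned text
# into 512-character chunks is only a rough proxy, since 512 characters do
# not correspond to 512 BERT tokens.
if __name__ == '__main__':
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    max_length = 512
    sample_texts = [
        get_clear_text('<p>Первый пример текста для предобработки.</p>'),
        get_clear_text('<p>Второй пример текста для предобработки.</p>'),
    ]
    input_ids = []
    for text in sample_texts:
        segments = [text[i:i + max_length] for i in range(0, len(text), max_length)]
        input_ids.append([tokenizer.encode(segment, add_special_tokens=True)
                          for segment in segments])
    print(input_ids)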