# core/ai/preprocess.py

import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
from transformers import BertTokenizer

# Import-time side effect: ask NLTK to fetch the "stopwords" corpus, which
# get_clear_text() reads through stopwords.words("russian").
nltk.download("stopwords")


def get_clear_text(text):
    """Turn an HTML document into a space-separated string of lemmas.

    The markup is stripped, everything except Latin/Cyrillic letters, spaces
    and hyphens is removed, the text is lowercased and lemmatized with
    Mystem, and finally Russian stopwords, whitespace-only tokens and
    punctuation tokens are dropped.

    Args:
        text: HTML (or plain text) to clean. Empty or ``None`` input yields
            an empty string without invoking the parser or the lemmatizer.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    if not text:
        return ''

    soup = BeautifulSoup(text, 'html.parser')

    # Extract the plain text from the HTML document without tags.
    # Reading ``tag.string`` for every tag from ``find_all()`` lost the text
    # of tags with mixed content, repeated the text of singly nested tags and
    # glued words of adjacent tags together; ``get_text`` visits each text
    # node once and the separator keeps neighbouring words apart.
    clear_text = soup.get_text(separator=' ')

    # narrow/regular non-breaking spaces and newlines become plain spaces
    clear_text = re.sub(pattern='[\u202F\u00A0\n]+', repl=' ', string=clear_text)

    # only words
    clear_text = re.sub(pattern='[^A-ZА-ЯЁ -]', repl='', string=clear_text, flags=re.IGNORECASE)

    # collapse the whitespace runs left behind by the removals above
    clear_text = re.sub(pattern=r'\s+', repl=' ', string=clear_text)

    clear_text = clear_text.lower()

    mystem = Mystem()
    # a set gives constant-time membership checks in the filter below
    russian_stopwords = set(stopwords.words("russian"))

    tokens = []
    for token in mystem.lemmatize(clear_text):
        stripped = token.strip()
        # whitespace-only tokens (e.g. separators emitted by the lemmatizer)
        if not stripped:
            continue
        if token in russian_stopwords:
            continue
        # substring test against string.punctuation, e.g. a lone hyphen
        if stripped in punctuation:
            continue
        tokens.append(token)

    return " ".join(tokens)


# if __name__ == '__main__':
#
#     # initialize the tokenizer with the pre-trained BERT model and vocabulary
#     tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
#
#     # split each text into smaller segments of maximum length 512
#     max_length = 512
#     segmented_texts = []
#     for text in [clear_text1, clear_text2]:
#         segmented_text = []
#         for i in range(0, len(text), max_length):
#             segment = text[i:i+max_length]
#             segmented_text.append(segment)
#         segmented_texts.append(segmented_text)
#
#     # tokenize each segment using the BERT tokenizer
#     tokenized_texts = []
#     for segmented_text in segmented_texts:
#         tokenized_text = []
#         for segment in segmented_text:
#             segment_tokens = tokenizer.tokenize(segment)
#             segment_tokens = ['[CLS]'] + segment_tokens + ['[SEP]']
#             tokenized_text.append(segment_tokens)
#         tokenized_texts.append(tokenized_text)
#
#     input_ids = []
#     for tokenized_text in tokenized_texts:
#         input_id = []
#         for segment_tokens in tokenized_text:
#             segment_id = tokenizer.convert_tokens_to_ids(segment_tokens)
#             input_id.append(segment_id)
#         input_ids.append(input_id)
#
#     print(input_ids)
-												Revert "Feature/lint"

											
										
										
											2023-10-26 21:07:35 +00:00
+								import re
 								import nltk
-												ai test.py, sqllite removed

											
										
										
											2023-05-09 21:41:13 +00:00
+								from bs4 import BeautifulSoup
 								from nltk.corpus import stopwords
 								from pymystem3 import Mystem
-												lint

											
										
										
											2023-10-26 20:38:31 +00:00
+								from string import punctuation
-												Revert "Feature/lint"

											
										
										
											2023-10-26 21:07:35 +00:00
+								from transformers import BertTokenizer
-												ai test.py, sqllite removed

											
										
										
											2023-05-09 21:41:13 +00:00
 								nltk.download("stopwords")
 								def get_clear_text(text):
-												Revert "Feature/lint"

											
										
										
											2023-10-26 21:07:35 +00:00
+								    soup = BeautifulSoup(text, 'html.parser')
-												ai test.py, sqllite removed

											
										
										
											2023-05-09 21:41:13 +00:00
 								    # extract the plain text from the HTML document without tags
-												Revert "Feature/lint"

											
										
										
											2023-10-26 21:07:35 +00:00
+								    clear_text = ''
-												ai test.py, sqllite removed

											
										
										
											2023-05-09 21:41:13 +00:00
+								    for tag in soup.find_all():
-												Revert "Feature/lint"

											
										
										
											2023-10-26 21:07:35 +00:00
+								        clear_text += tag.string or ''
-												ai test.py, sqllite removed

											
										
										
											2023-05-09 21:41:13 +00:00
-												Revert "Feature/lint"

											
										
										
											2023-10-26 21:07:35 +00:00
+								    clear_text = re.sub(pattern='[\u202F\u00A0\n]+', repl=' ', string=clear_text)
-												ai test.py, sqllite removed

											
										
										
											2023-05-09 21:41:13 +00:00
 								    # only words
-												Revert "Feature/lint"

											
										
										
											2023-10-26 21:07:35 +00:00
+								    clear_text = re.sub(pattern='[^A-ZА-ЯЁ -]', repl='', string=clear_text, flags=re.IGNORECASE)
-												ai test.py, sqllite removed

											
										
										
											2023-05-09 21:41:13 +00:00
-												Revert "Feature/lint"

											
										
										
											2023-10-26 21:07:35 +00:00
+								    clear_text = re.sub(pattern='\s+', repl=' ', string=clear_text)
-												ai test.py, sqllite removed

											
										
										
											2023-05-09 21:41:13 +00:00
 								    clear_text = clear_text.lower()
 								    mystem = Mystem()
 								    russian_stopwords = stopwords.words("russian")
 								    tokens = mystem.lemmatize(clear_text)
-												Revert "Feature/lint"

											
										
										
											2023-10-26 21:07:35 +00:00
+								    tokens = [token for token in tokens if token not in russian_stopwords \
 								              and token != " " \
 								              and token.strip() not in punctuation]
-												ai test.py, sqllite removed

											
										
										
											2023-05-09 21:41:13 +00:00
 								    clear_text = " ".join(tokens)
 								    return clear_text
 								# if __name__ == '__main__':
 								#
 								#     # initialize the tokenizer with the pre-trained BERT model and vocabulary
 								#     tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
 								#
 								#     # split each text into smaller segments of maximum length 512
 								#     max_length = 512
 								#     segmented_texts = []
 								#     for text in [clear_text1, clear_text2]:
 								#         segmented_text = []
 								#         for i in range(0, len(text), max_length):
 								#             segment = text[i:i+max_length]
 								#             segmented_text.append(segment)
 								#         segmented_texts.append(segmented_text)
 								#
 								#     # tokenize each segment using the BERT tokenizer
 								#     tokenized_texts = []
 								#     for segmented_text in segmented_texts:
 								#         tokenized_text = []
 								#         for segment in segmented_text:
 								#             segment_tokens = tokenizer.tokenize(segment)
 								#             segment_tokens = ['[CLS]'] + segment_tokens + ['[SEP]']
 								#             tokenized_text.append(segment_tokens)
 								#         tokenized_texts.append(tokenized_text)
 								#
 								#     input_ids = []
 								#     for tokenized_text in tokenized_texts:
 								#         input_id = []
 								#         for segment_tokens in tokenized_text:
 								#             segment_id = tokenizer.convert_tokens_to_ids(segment_tokens)
 								#             input_id.append(segment_id)
 								#         input_ids.append(input_id)
 								#
 								#     print(input_ids)