Revert "Feature/lint"

2023-10-27 00:07:35 +03:00
parent 05136699ee
commit b142949805
70 changed files with 1465 additions and 1223 deletions
--- a/ai/preprocess.py
+++ b/ai/preprocess.py
@@ -1,28 +1,28 @@
+import re
+import nltk
 from bs4 import BeautifulSoup
 from nltk.corpus import stopwords
 from pymystem3 import Mystem
 from string import punctuation
-
-import nltk
-import re
+from transformers import BertTokenizer

 nltk.download("stopwords")


 def get_clear_text(text):
-    soup = BeautifulSoup(text, "html.parser")
+    soup = BeautifulSoup(text, 'html.parser')

    # extract the plain text from the HTML document without tags
-    clear_text = ""
+    clear_text = ''
    for tag in soup.find_all():
-        clear_text += tag.string or ""
+        clear_text += tag.string or ''

-    clear_text = re.sub(pattern="[\u202F\u00A0\n]+", repl=" ", string=clear_text)
+    clear_text = re.sub(pattern='[\u202F\u00A0\n]+', repl=' ', string=clear_text)

    # only words
-    clear_text = re.sub(pattern="[^A-ZА-ЯЁ -]", repl="", string=clear_text, flags=re.IGNORECASE)
+    clear_text = re.sub(pattern='[^A-ZА-ЯЁ -]', repl='', string=clear_text, flags=re.IGNORECASE)

-    clear_text = re.sub(pattern=r"\s+", repl=" ", string=clear_text)
+    clear_text = re.sub(pattern='\s+', repl=' ', string=clear_text)

    clear_text = clear_text.lower()

@@ -30,11 +30,9 @@ def get_clear_text(text):
    russian_stopwords = stopwords.words("russian")

    tokens = mystem.lemmatize(clear_text)
-    tokens = [
-        token
-        for token in tokens
-        if token not in russian_stopwords and token != " " and token.strip() not in punctuation
-    ]
+    tokens = [token for token in tokens if token not in russian_stopwords \
+              and token != " " \
+              and token.strip() not in punctuation]

    clear_text = " ".join(tokens)