2025-04-27 12:53:49 +03:00
|
|
|
|
"""
|
|
|
|
|
Модуль для обработки HTML-фрагментов
|
|
|
|
|
"""
|
|
|
|
|
|
2025-07-31 18:55:59 +03:00
|
|
|
|
import re
|
|
|
|
|
from typing import Optional
|
2025-05-16 09:23:48 +03:00
|
|
|
|
|
2025-06-02 02:56:11 +03:00
|
|
|
|
|
2025-07-31 18:55:59 +03:00
|
|
|
|
def extract_text(html_content: Optional[str]) -> str:
    """Extract plain text from an HTML string using regular expressions.

    Tags are replaced by spaces, HTML character references (named and
    numeric) are decoded to the characters they stand for, and runs of
    whitespace are collapsed to a single space.

    Args:
        html_content: HTML string to extract text from; may be None or empty.

    Returns:
        The extracted text, or an empty string when there is no input.
    """
    import html  # function-scope import keeps this block self-contained

    if not html_content:
        return ""

    # Replace each tag with a space so text of adjacent elements does not fuse.
    text = re.sub(r"<[^>]+>", " ", html_content)

    # Decode entities only after tags are stripped, so escaped markup such as
    # "&lt;b&gt;" survives as literal text instead of being removed as a tag.
    text = html.unescape(text)

    # \s also matches the non-breaking space that "&nbsp;" decodes to.
    return re.sub(r"\s+", " ", text).strip()
|
|
|
|
|
|
2025-05-16 09:23:48 +03:00
|
|
|
|
|
2025-04-27 12:53:49 +03:00
|
|
|
|
def wrap_html_fragment(fragment: str) -> str:
    """Wrap an HTML fragment in a complete HTML document.

    A fragment that already looks like a full document (it contains both a
    doctype declaration and an ``<html>`` opening tag) is returned unchanged.
    Detection is case-insensitive and tolerates attributes on the ``<html>``
    tag, so inputs such as ``<!doctype html><html lang="en">`` are not
    wrapped a second time.

    Args:
        fragment: HTML fragment, or an already complete HTML document.

    Returns:
        A complete HTML document containing the fragment in its body.
    """
    has_doctype = re.search(r"<!doctype\s+html\b", fragment, re.IGNORECASE)
    # "[\s>]" accepts both "<html>" and "<html lang=...>" while rejecting
    # unrelated tag names that merely start with "html".
    has_html_tag = re.search(r"<html[\s>]", fragment, re.IGNORECASE)
    if has_doctype and has_html_tag:
        return fragment

    return f"""<!DOCTYPE html>
<html>
<head>
<title></title>
</head>
<body>
{fragment}
</body>
</html>"""
|