Files
core/utils/validators.py
Untone 00a866876c
Some checks failed
Deploy on push / deploy (push) Failing after 4m31s
search-wrapper
2025-08-23 14:08:34 +03:00

36 lines
1.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from utils.extract_text import extract_text
from utils.logger import root_logger as logger
def validate_html_content(html_content: str) -> tuple[bool, str]:
"""
Проверяет валидность HTML контента через trafilatura.
Args:
html_content: HTML строка для проверки
Returns:
tuple[bool, str]: (валидность, сообщение об ошибке)
Example:
>>> is_valid, error = validate_html_content("<p>Valid HTML</p>")
>>> is_valid
True
>>> error
''
>>> is_valid, error = validate_html_content("Invalid < HTML")
>>> is_valid
False
>>> 'Invalid HTML' in error
True
"""
if not html_content or not html_content.strip():
return False, ""
try:
extracted = extract_text(html_content)
return bool(extracted), extracted or ""
except Exception as e:
logger.error(f"HTML validation error: {e}", exc_info=True)
return False, f"Invalid HTML content: {e!s}"