34 lines
1.0 KiB
Python
34 lines
1.0 KiB
Python
|
|
from utils.extract_text import extract_text
|
|||
|
|
from utils.logger import root_logger as logger
|
|||
|
|
|
|||
|
|
def validate_html_content(html_content: str) -> tuple[bool, str]:
    """
    Check whether HTML content yields extractable text.

    The content is passed to ``extract_text`` (per the original docstring
    this is backed by trafilatura -- confirm in ``utils.extract_text``).
    The content is considered valid when that call returns a non-empty
    result without raising.

    Args:
        html_content: HTML string to check.

    Returns:
        A ``(is_valid, error_message)`` pair. ``error_message`` is an empty
        string when ``is_valid`` is True; otherwise it starts with
        ``"Invalid HTML content: "`` followed by the reason.

    Example:
        >>> validate_html_content("   ")
        (False, 'Invalid HTML content: empty input')

        For non-empty input the outcome depends on what ``extract_text``
        returns, e.g. ``(True, '')`` when text is extracted.
    """
    # Reject missing / whitespace-only input before invoking the extractor.
    if not html_content or not html_content.strip():
        return False, "Invalid HTML content: empty input"

    # Keep the try body minimal: only the extractor call is expected to raise.
    try:
        extracted = extract_text(html_content)
    except Exception as e:
        # Validation boundary: any extractor failure is logged with its
        # traceback and reported to the caller instead of propagating.
        logger.error(f"HTML validation error: {e}", exc_info=True)
        return False, f"Invalid HTML content: {e!s}"

    if not extracted:
        return False, "Invalid HTML content: no text could be extracted"

    # Success: the second element is the error message, so it must be empty
    # (the extracted text itself is not part of this function's contract).
    return True, ""
|