Improve topic sorting: add popular sorting by publications and authors count

This commit is contained in:
2025-06-02 02:56:11 +03:00
parent baca19a4d5
commit 3327976586
113 changed files with 7238 additions and 3739 deletions

View File

@@ -4,24 +4,31 @@
import trafilatura
from utils.logger import root_logger as logger
def extract_text(html: str) -> str:
    """
    Extract clean plain text from an HTML string.

    Args:
        html: HTML string to extract text from.

    Returns:
        str: The extracted text, or an empty string when nothing was
        extracted or the extraction raised an error.
    """
    try:
        result = trafilatura.extract(
            html,
            include_comments=False,
            include_tables=True,
            include_formatting=False,
            favor_precision=True,
        )
        # A falsy result (nothing extracted) is normalised to "" so the
        # function always honours its ``-> str`` annotation.
        return result or ""
    except Exception as e:
        # Deliberate best-effort: log the failure and fall back to an
        # empty string instead of propagating the error to the caller.
        logger.error(f"Error extracting text: {e}")
        return ""
def wrap_html_fragment(fragment: str) -> str: