This commit is contained in:
parent
b735bf8cab
commit
bcbfdd76e9
|
@ -21,6 +21,7 @@ from services.db import local_session
|
|||
from services.notify import notify_shout
|
||||
from services.schema import mutation, query
|
||||
from services.search import search_service
|
||||
from utils.html_wrapper import wrap_html_fragment
|
||||
from utils.logger import root_logger as logger
|
||||
|
||||
def create_shout_from_draft(session, draft, author_id):
|
||||
|
@ -183,7 +184,8 @@ async def create_draft(_, info, draft_input):
|
|||
return {"error": f"Failed to create draft: {str(e)}"}
|
||||
|
||||
def generate_teaser(body, limit=300):
    """
    Build a short plain-text teaser from an HTML body.

    The body is wrapped into a complete HTML document via
    ``wrap_html_fragment`` and passed to ``trafilatura.extract`` exactly once
    (comments and tables excluded). The extracted text is cut to ``limit``
    characters and everything after the last ``". "`` separator is dropped,
    so the teaser never ends in the middle of a sentence.

    Args:
        body: HTML fragment or full HTML document to summarise.
        limit: Maximum number of characters of extracted text to consider.

    Returns:
        str: The teaser text. An empty string is returned when no text can
        be extracted, or when the truncated text contains no ``". "``
        separator.
    """
    body_html = wrap_html_fragment(body)
    body_text = trafilatura.extract(body_html, include_comments=False, include_tables=False)
    if not body_text:
        # trafilatura.extract yields None when it finds no usable text;
        # slicing None would raise TypeError, so bail out with an empty teaser.
        return ""

    # Drop the trailing piece after the last ". " since it is likely a
    # sentence cut off by the character limit.
    sentences = body_text[:limit].split(". ")
    return ". ".join(sentences[:-1])
|
||||
|
||||
|
@ -270,10 +272,12 @@ async def update_draft(_, info, draft_id: int, draft_input):
|
|||
if "seo" not in filtered_input and not draft.seo:
|
||||
body_src = filtered_input.get("body", draft.body)
|
||||
lead_src = filtered_input.get("lead", draft.lead)
|
||||
body_html = wrap_html_fragment(body_src)
|
||||
lead_html = wrap_html_fragment(lead_src)
|
||||
|
||||
try:
|
||||
body_text = trafilatura.extract(body_src, include_comments=False, include_tables=False) if body_src else None
|
||||
lead_text = trafilatura.extract(lead_src, include_comments=False, include_tables=False) if lead_src else None
|
||||
body_text = trafilatura.extract(body_html, include_comments=False, include_tables=False) if body_src else None
|
||||
lead_text = trafilatura.extract(lead_html, include_comments=False, include_tables=False) if lead_src else None
|
||||
|
||||
body_teaser = generate_teaser(body_text, 300) if body_text else ""
|
||||
filtered_input["seo"] = lead_text if lead_text else body_teaser
|
||||
|
@ -347,6 +351,7 @@ def validate_html_content(html_content: str) -> tuple[bool, str]:
|
|||
return False, "Content is empty"
|
||||
|
||||
try:
|
||||
html_content = wrap_html_fragment(html_content)
|
||||
extracted = trafilatura.extract(html_content)
|
||||
if not extracted:
|
||||
return False, "Invalid HTML structure or empty content"
|
||||
|
|
|
@ -23,6 +23,7 @@ from services.db import local_session
|
|||
from services.notify import notify_shout
|
||||
from services.schema import mutation, query
|
||||
from services.search import search_service
|
||||
from utils.html_wrapper import wrap_html_fragment
|
||||
from utils.logger import root_logger as logger
|
||||
|
||||
|
||||
|
@ -180,9 +181,11 @@ async def create_shout(_, info, inp):
|
|||
# Создаем публикацию без topics
|
||||
body = inp.get("body", "")
|
||||
lead = inp.get("lead", "")
|
||||
body_text = trafilatura.extract(body)
|
||||
lead_text = trafilatura.extract(lead)
|
||||
seo = inp.get("seo", lead_text or body_text[:300].split(". ")[:-1].join(". "))
|
||||
body_html = wrap_html_fragment(body)
|
||||
lead_html = wrap_html_fragment(lead)
|
||||
body_text = trafilatura.extract(body_html)
|
||||
lead_text = trafilatura.extract(lead_html)
|
||||
seo = inp.get("seo", lead_text.strip() or body_text.strip()[:300].split(". ")[:-1].join(". "))
|
||||
new_shout = Shout(
|
||||
slug=slug,
|
||||
body=body,
|
||||
|
|
1
utils/__init__.py
Normal file
1
utils/__init__.py
Normal file
|
@ -0,0 +1 @@
|
|||
|
38
utils/html_wrapper.py
Normal file
38
utils/html_wrapper.py
Normal file
|
@ -0,0 +1,38 @@
|
|||
"""
|
||||
Модуль для обработки HTML-фрагментов
|
||||
"""
|
||||
|
||||
def wrap_html_fragment(fragment: str) -> str:
    """
    Wrap an HTML fragment in a complete HTML document skeleton.

    Input that already looks like a full document (it starts, ignoring
    leading whitespace and letter case, with ``<!DOCTYPE`` or ``<html``) is
    returned unchanged, as is empty or whitespace-only input.

    Args:
        fragment: HTML fragment (or full document) to process.

    Returns:
        str: A full HTML document containing the fragment inside ``<body>``,
        or the original value when no wrapping is needed.

    Example:
        >>> print(wrap_html_fragment("<p>Paragraph text</p>"))
        <!DOCTYPE html>
        <html>
        <head>
        <meta charset="utf-8">
        <title></title>
        </head>
        <body>
        <p>Paragraph text</p>
        </body>
        </html>
    """
    # Nothing to wrap: None, "" and whitespace-only input pass through as-is.
    if not fragment or not fragment.strip():
        return fragment

    # The doctype declaration and tag names are case-insensitive in HTML, so
    # compare a lower-cased prefix; otherwise "<!doctype html>" or "<HTML>"
    # documents would be wrapped a second time.
    prefix = fragment.lstrip()[:9].lower()
    if prefix.startswith(("<!doctype", "<html")):
        return fragment

    return (
        "<!DOCTYPE html>\n"
        "<html>\n"
        "<head>\n"
        '<meta charset="utf-8">\n'
        "<title></title>\n"
        "</head>\n"
        "<body>\n"
        f"{fragment}\n"
        "</body>\n"
        "</html>"
    )
|
Loading…
Reference in New Issue
Block a user