Improve topic sorting: add popular sorting by publications and authors count

This commit is contained in:
2025-06-02 02:56:11 +03:00
parent baca19a4d5
commit 3327976586
113 changed files with 7238 additions and 3739 deletions

View File

@@ -2,7 +2,7 @@ import re
from difflib import ndiff
def get_diff(original, modified):
def get_diff(original: str, modified: str) -> list[str]:
"""
Get the difference between two strings using difflib.
@@ -13,11 +13,10 @@ def get_diff(original, modified):
Returns:
A list of differences.
"""
diff = list(ndiff(original.split(), modified.split()))
return diff
return list(ndiff(original.split(), modified.split()))
def apply_diff(original, diff):
def apply_diff(original: str, diff: list[str]) -> str:
"""
Apply the difference to the original string.

View File

@@ -1,28 +1,118 @@
from decimal import Decimal
from json import JSONEncoder
"""
JSON encoders and utilities
"""
import datetime
import decimal
from typing import Any, Union
import orjson
class CustomJSONEncoder(JSONEncoder):
def default_json_encoder(obj: Any) -> Any:
    """
    Fallback encoder for objects the JSON serializer cannot handle natively.

    Conversions are tried from the most specific to the most generic, so an
    explicit serialization hook or a well-known value type always wins over
    the raw ``__dict__`` dump:

    1. ``obj.dict()`` when the object exposes a callable ``dict`` attribute;
    2. ``obj.__json__()`` when the object defines that hook;
    3. ``isoformat()`` for ``datetime``/``date``/``time`` values;
    4. ``float`` for ``decimal.Decimal`` values;
    5. the instance ``__dict__`` as a last resort.

    Args:
        obj: Object to serialize.

    Returns:
        A representation of ``obj`` produced by the first matching conversion.

    Raises:
        TypeError: If none of the conversions applies.
    """
    if hasattr(obj, "dict") and callable(obj.dict):
        return obj.dict()
    # Explicit hook must be checked before the generic ``__dict__`` fallback:
    # almost every object that defines ``__json__`` also has a ``__dict__``.
    if hasattr(obj, "__json__"):
        return obj.__json__()
    # Value types come before ``__dict__`` as well, because Python-level
    # subclasses of these types carry an (empty) instance ``__dict__``.
    if isinstance(obj, (datetime.datetime, datetime.date, datetime.time)):
        return obj.isoformat()
    if isinstance(obj, decimal.Decimal):
        # NOTE: float() may lose precision for high-precision Decimal values.
        return float(obj)
    if hasattr(obj, "__dict__"):
        return obj.__dict__
    msg = f"Object of type {type(obj)} is not JSON serializable"
    raise TypeError(msg)
def default(self, obj):
if isinstance(obj, Decimal):
return str(obj)
# Проверяем, есть ли у объекта метод dict() (как у моделей SQLAlchemy)
if hasattr(obj, "dict") and callable(obj.dict):
return obj.dict()
def orjson_dumps(obj: Any, **kwargs: Any) -> bytes:
    """
    Serialize an object to JSON bytes with orjson.

    Args:
        obj: Object to serialize.
        **kwargs: Extra options. Only a truthy ``indent`` entry is consulted
            (it enables two-space indentation); anything else is ignored.

    Returns:
        bytes: The JSON document as bytes.
    """
    wants_indent = bool(kwargs.get("indent"))
    flags = orjson.OPT_SERIALIZE_DATACLASS
    if wants_indent:
        flags = flags | orjson.OPT_INDENT_2
    return orjson.dumps(obj, default=default_json_encoder, option=flags)
def orjson_loads(data: Union[str, bytes]) -> Any:
    """
    Deserialize a JSON document with orjson.

    Args:
        data: JSON document as ``str`` or ``bytes``.

    Returns:
        The deserialized object.
    """
    parsed = orjson.loads(data)
    return parsed
class JSONEncoder:
    """Custom JSON codec built on top of the orjson helpers of this module."""

    @staticmethod
    def encode(obj: Any) -> str:
        """Serialize ``obj`` and return the JSON document as a UTF-8 decoded string."""
        payload = orjson_dumps(obj)
        return payload.decode("utf-8")

    @staticmethod
    def decode(data: Union[str, bytes]) -> Any:
        """Parse a JSON document given as ``str`` or ``bytes``."""
        parsed = orjson_loads(data)
        return parsed
# Module-level instance kept for backward compatibility.
# NOTE(review): this is an instance, not a class — confirm that no caller
# still passes it as ``json.dumps(..., cls=CustomJSONEncoder)``.
CustomJSONEncoder = JSONEncoder()
def fast_json_dumps(obj: Any, indent: bool = False) -> str:
    """
    Serialize an object to a JSON string.

    Args:
        obj: Object to serialize.
        indent: When true, the output is pretty-printed with indentation.

    Returns:
        The JSON document as a string.
    """
    raw = orjson_dumps(obj, indent=indent)
    return raw.decode("utf-8")
def fast_json_loads(data: Union[str, bytes]) -> Any:
    """
    Deserialize a JSON document.

    Args:
        data: JSON document as ``str`` or ``bytes``.

    Returns:
        The deserialized object.
    """
    parsed = orjson_loads(data)
    return parsed
# Short aliases exported for convenience
dumps = fast_json_dumps
loads = fast_json_loads

View File

@@ -4,24 +4,31 @@
import trafilatura
from utils.logger import root_logger as logger
def extract_text(html: str) -> str:
    """
    Extract plain text from an HTML string.

    Args:
        html: HTML markup to process.

    Returns:
        str: The extracted text, or an empty string when nothing was
        extracted or the extraction raised an exception.
    """
    options = {
        "include_comments": False,
        "include_tables": True,
        "include_formatting": False,
        "favor_precision": True,
    }
    try:
        extracted = trafilatura.extract(html, **options)
        return extracted or ""
    except Exception as e:
        # Best-effort helper: failures are logged and reported as "no text".
        logger.error(f"Error extracting text: {e}")
        return ""
def wrap_html_fragment(fragment: str) -> str:

View File

@@ -5,48 +5,55 @@ from auth.orm import Author
from services.db import local_session
# Lower-case Cyrillic letters (plus ".") mapped to their Latin replacement.
# A replacement may be several characters long, or empty to drop the letter.
_TRANSLIT_MAP = {
    "а": "a",
    "б": "b",
    "в": "v",
    "г": "g",
    "д": "d",
    "е": "e",
    "ё": "yo",
    "ж": "zh",
    "з": "z",
    "и": "i",
    "й": "y",
    "к": "k",
    "л": "l",
    "м": "m",
    "н": "n",
    "о": "o",
    "п": "p",
    "р": "r",
    "с": "s",
    "т": "t",
    "у": "u",
    "ф": "f",
    "х": "h",
    "ц": "c",
    "ч": "ch",
    "ш": "sh",
    "щ": "sch",
    "ъ": "",
    "ы": "y",
    "ь": "",
    "э": "e",
    "ю": "yu",
    "я": "ya",
    ".": "-",
}

# Built once at import time so every call is a single C-level pass.
_TRANSLIT_TABLE = str.maketrans(_TRANSLIT_MAP)


def replace_translit(src: str) -> str:
    """
    Transliterate lower-case Cyrillic characters in ``src`` to Latin.

    Characters without an entry in the mapping (including upper-case
    Cyrillic letters) are copied unchanged; ``"."`` becomes ``"-"`` and the
    hard/soft signs are dropped.

    Args:
        src: Text to transliterate.

    Returns:
        The transliterated text.
    """
    return src.translate(_TRANSLIT_TABLE)
def generate_unique_slug(src):
def generate_unique_slug(src: str) -> str:
print("[resolvers.auth] generating slug from: " + src)
slug = replace_translit(src.lower())
slug = re.sub("[^0-9a-zA-Z]+", "-", slug)
@@ -63,3 +70,6 @@ def generate_unique_slug(src):
unique_slug = slug
print("[resolvers.auth] " + unique_slug)
return quote_plus(unique_slug.replace("'", "")).replace("+", "-")
# Fallback return если что-то пошло не так
return quote_plus(slug.replace("'", "")).replace("+", "-")

View File

@@ -1,5 +1,6 @@
import logging
from pathlib import Path
from typing import Any
import colorlog
@@ -7,7 +8,7 @@ _lib_path = Path(__file__).parents[1]
_leng_path = len(_lib_path.as_posix())
def filter(record: logging.LogRecord):
def filter(record: logging.LogRecord) -> bool:
# Define `package` attribute with the relative path.
record.package = record.pathname[_leng_path + 1 :].replace(".py", "")
record.emoji = (
@@ -23,7 +24,7 @@ def filter(record: logging.LogRecord):
if record.levelno == logging.CRITICAL
else ""
)
return record
return True
# Define the color scheme
@@ -57,28 +58,32 @@ fmt_config = {
class MultilineColoredFormatter(colorlog.ColoredFormatter):
    """
    Colored formatter that tolerates records lacking the custom fields.

    Before delegating to the parent formatter it makes sure the record has
    the ``emoji`` and ``package`` attributes, which the module's record
    filter normally sets. Multi-line output (multi-line messages,
    tracebacks) is returned exactly once, as the parent produced it.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Forward all arguments to the parent and remember the color maps."""
        super().__init__(*args, **kwargs)
        # NOTE(review): both attributes fall back to {} when the value is not
        # passed by keyword — confirm this does not discard colors the parent
        # initializer has already configured.
        self.log_colors = kwargs.pop("log_colors", {})
        self.secondary_log_colors = kwargs.pop("secondary_log_colors", {})

    def format(self, record: logging.LogRecord) -> str:
        """
        Format ``record``, filling in default ``emoji``/``package`` values.

        Args:
            record: Log record to format.

        Returns:
            The formatted (possibly multi-line) log entry.
        """
        # Add default emoji if not present
        if not hasattr(record, "emoji"):
            record.emoji = "📝"

        # Add default package if not present
        if not hasattr(record, "package"):
            record.package = getattr(record, "name", "unknown")

        # Format exactly once. Splitting the formatted text on newlines and
        # appending the tail to the full text would repeat every continuation
        # line, and a second super().format() call is redundant work.
        return super().format(record)
# Create a MultilineColoredFormatter object for colorized logging
@@ -89,7 +94,7 @@ stream = logging.StreamHandler()
stream.setFormatter(formatter)
def get_colorful_logger(name="main"):
def get_colorful_logger(name: str = "main") -> logging.Logger:
# Create and configure the logger
logger = logging.getLogger(name)
logger.setLevel(logging.DEBUG)