Improve topic sorting: add popular sorting by publications and authors count

2025-06-02 02:56:11 +03:00
parent baca19a4d5
commit 3327976586
113 changed files with 7238 additions and 3739 deletions
--- a/utils/diff.py
+++ b/utils/diff.py
@@ -2,7 +2,7 @@ import re
 from difflib import ndiff


-def get_diff(original, modified):
+def get_diff(original: str, modified: str) -> list[str]:
    """
    Get the difference between two strings using difflib.

@@ -13,11 +13,10 @@ def get_diff(original, modified):
    Returns:
    A list of differences.
    """
-    diff = list(ndiff(original.split(), modified.split()))
-    return diff
+    return list(ndiff(original.split(), modified.split()))


-def apply_diff(original, diff):
+def apply_diff(original: str, diff: list[str]) -> str:
    """
    Apply the difference to the original string.

--- a/utils/encoders.py
+++ b/utils/encoders.py
@@ -1,28 +1,118 @@
-from decimal import Decimal
-from json import JSONEncoder
+"""
+JSON encoders and utilities
+"""
+
+import datetime
+import decimal
+from typing import Any, Union
+
+import orjson


-class CustomJSONEncoder(JSONEncoder):
+def default_json_encoder(obj: Any) -> Any:
    """
-    Расширенный JSON энкодер с поддержкой сериализации объектов SQLAlchemy.
+    Fallback encoder for objects that orjson cannot serialize natively.

-    Примеры:
-    >>> import json
-    >>> from decimal import Decimal
-    >>> from orm.topic import Topic
-    >>> json.dumps(Decimal("10.50"), cls=CustomJSONEncoder)
-    '"10.50"'
-    >>> topic = Topic(id=1, slug="test")
-    >>> json.dumps(topic, cls=CustomJSONEncoder)
-    '{"id": 1, "slug": "test", ...}'
+    Args:
+        obj: Object to convert; value types are matched before the attribute-based fallbacks
+
+    Returns:
+        An ISO-8601 string, a float, or the result of dict() / __json__() / __dict__
+
+    Raises:
+        TypeError: If none of the supported conversions applies to the object
    """
+    if isinstance(obj, (datetime.datetime, datetime.date, datetime.time)):
+        return obj.isoformat()
+    if isinstance(obj, decimal.Decimal):
+        return float(obj)
+    if hasattr(obj, "dict") and callable(obj.dict):
+        return obj.dict()
+    if hasattr(obj, "__json__"):
+        return obj.__json__()
+    if hasattr(obj, "__dict__"):
+        return obj.__dict__
+    msg = f"Object of type {type(obj)} is not JSON serializable"
+    raise TypeError(msg)

-    def default(self, obj):
-        if isinstance(obj, Decimal):
-            return str(obj)

-        # Проверяем, есть ли у объекта метод dict() (как у моделей SQLAlchemy)
-        if hasattr(obj, "dict") and callable(obj.dict):
-            return obj.dict()
+def orjson_dumps(obj: Any, **kwargs: Any) -> bytes:
+    """
+    Serialize an object to JSON bytes with orjson, using default_json_encoder as fallback.

-        return super().default(obj)
+    Args:
+        obj: Object to serialize
+        **kwargs: Only a truthy `indent` is read (adds OPT_INDENT_2); other keys are ignored
+
+    Returns:
+        bytes: The JSON document as bytes
+    """
+    # Start from OPT_SERIALIZE_DATACLASS and add 2-space indentation only on request
+    option_flags = orjson.OPT_SERIALIZE_DATACLASS
+    if kwargs.get("indent"):
+        option_flags |= orjson.OPT_INDENT_2
+
+    return orjson.dumps(obj, default=default_json_encoder, option=option_flags)
+
+
+def orjson_loads(data: Union[str, bytes]) -> Any:
+    """
+    Deserialize JSON with orjson.
+
+    Args:
+        data: JSON document as a string or bytes
+
+    Returns:
+        The deserialized object
+    """
+    return orjson.loads(data)
+
+
+class JSONEncoder:
+    """Custom JSON encoder/decoder backed by orjson"""
+
+    @staticmethod
+    def encode(obj: Any) -> str:
+        """Encode an object to a JSON string (orjson_dumps output decoded as UTF-8)"""
+        return orjson_dumps(obj).decode("utf-8")
+
+    @staticmethod
+    def decode(data: Union[str, bytes]) -> Any:
+        """Decode a JSON string or bytes into a Python object via orjson_loads"""
+        return orjson_loads(data)
+
+
+# Instance kept for backward compatibility. NOTE(review): not a json.JSONEncoder subclass - confirm no caller passes it as `cls=`
+CustomJSONEncoder = JSONEncoder()
+
+
+def fast_json_dumps(obj: Any, indent: bool = False) -> str:
+    """
+    Serialize an object to a JSON string via orjson_dumps.
+
+    Args:
+        obj: Object to serialize
+        indent: Format the output with indentation when true
+
+    Returns:
+        The JSON document decoded as a UTF-8 string
+    """
+    return orjson_dumps(obj, indent=indent).decode("utf-8")
+
+
+def fast_json_loads(data: Union[str, bytes]) -> Any:
+    """
+    Deserialize JSON via orjson_loads.
+
+    Args:
+        data: JSON document as a string or bytes
+
+    Returns:
+        The deserialized object
+    """
+    return orjson_loads(data)
+
+
+# Short aliases exported for convenience
+dumps = fast_json_dumps
+loads = fast_json_loads
--- a/utils/extract_text.py
+++ b/utils/extract_text.py
@@ -4,24 +4,31 @@

 import trafilatura

+from utils.logger import root_logger as logger
+

 def extract_text(html: str) -> str:
    """
-    Извлекает текст из HTML-фрагмента.
+    Extract plain text from HTML using trafilatura.

    Args:
-        html: HTML-фрагмент
+        html: HTML string, passed to trafilatura as-is

    Returns:
-        str: Текст из HTML-фрагмента
+        str: The extracted text, or "" when nothing is extracted or an error is logged
    """
-    return trafilatura.extract(
-        wrap_html_fragment(html),
-        include_comments=False,
-        include_tables=False,
-        include_images=False,
-        include_formatting=False,
-    )
+    try:
+        result = trafilatura.extract(
+            html,
+            include_comments=False,
+            include_tables=True,
+            include_formatting=False,
+            favor_precision=True,
+        )
+        return result or ""
+    except Exception as e:
+        logger.error(f"Error extracting text: {e}")
+        return ""


 def wrap_html_fragment(fragment: str) -> str:
--- a/utils/generate_slug.py
+++ b/utils/generate_slug.py
@@ -5,48 +5,55 @@ from auth.orm import Author
 from services.db import local_session


-def replace_translit(src):
+def replace_translit(src: str) -> str:
    ruchars = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя."
-    enchars = [
-        "a",
-        "b",
-        "v",
-        "g",
-        "d",
-        "e",
-        "yo",
-        "zh",
-        "z",
-        "i",
-        "y",
-        "k",
-        "l",
-        "m",
-        "n",
-        "o",
-        "p",
-        "r",
-        "s",
-        "t",
-        "u",
-        "f",
-        "h",
-        "c",
-        "ch",
-        "sh",
-        "sch",
-        "",
-        "y",
-        "'",
-        "e",
-        "yu",
-        "ya",
-        "-",
-    ]
-    return src.translate(str.maketrans(ruchars, enchars))
+    # NOTE: `ruchars` only lists the characters handled below; lookups go through translit_dict
+
+    # Explicit mapping, because some Cyrillic letters expand to several Latin characters
+    translit_dict = {
+        "а": "a",
+        "б": "b",
+        "в": "v",
+        "г": "g",
+        "д": "d",
+        "е": "e",
+        "ё": "yo",
+        "ж": "zh",
+        "з": "z",
+        "и": "i",
+        "й": "y",
+        "к": "k",
+        "л": "l",
+        "м": "m",
+        "н": "n",
+        "о": "o",
+        "п": "p",
+        "р": "r",
+        "с": "s",
+        "т": "t",
+        "у": "u",
+        "ф": "f",
+        "х": "h",
+        "ц": "c",
+        "ч": "ch",
+        "ш": "sh",
+        "щ": "sch",
+        "ъ": "",
+        "ы": "y",
+        "ь": "",
+        "э": "e",
+        "ю": "yu",
+        "я": "ya",
+        ".": "-",
+    }
+
+    # Characters without a mapping are kept as-is; a single join avoids building
+    # the result through repeated string concatenation
+    pieces = (translit_dict.get(char, char) for char in src)
+    return "".join(pieces)


-def generate_unique_slug(src):
+def generate_unique_slug(src: str) -> str:
    print("[resolvers.auth] generating slug from: " + src)
    slug = replace_translit(src.lower())
    slug = re.sub("[^0-9a-zA-Z]+", "-", slug)
@@ -63,3 +70,6 @@ def generate_unique_slug(src):
            unique_slug = slug
            print("[resolvers.auth] " + unique_slug)
            return quote_plus(unique_slug.replace("'", "")).replace("+", "-")
+
+    # Fallback return если что-то пошло не так
+    return quote_plus(slug.replace("'", "")).replace("+", "-")
--- a/utils/logger.py
+++ b/utils/logger.py
@@ -1,5 +1,6 @@
 import logging
 from pathlib import Path
+from typing import Any

 import colorlog

@@ -7,7 +8,7 @@ _lib_path = Path(__file__).parents[1]
 _leng_path = len(_lib_path.as_posix())


-def filter(record: logging.LogRecord):
+def filter(record: logging.LogRecord) -> bool:
    # Define `package` attribute with the relative path.
    record.package = record.pathname[_leng_path + 1 :].replace(".py", "")
    record.emoji = (
@@ -23,7 +24,7 @@ def filter(record: logging.LogRecord):
        if record.levelno == logging.CRITICAL
        else ""
    )
-    return record
+    return True


 # Define the color scheme
@@ -57,28 +58,32 @@ fmt_config = {


 class MultilineColoredFormatter(colorlog.ColoredFormatter):
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.log_colors = kwargs.pop("log_colors", {})
        self.secondary_log_colors = kwargs.pop("secondary_log_colors", {})

-    def format(self, record):
+    def format(self, record: logging.LogRecord) -> str:
        # Add default emoji if not present
        if not hasattr(record, "emoji"):
-            record = filter(record)
+            record.emoji = "📝"

-        message = record.getMessage()
-        if "\n" in message:
-            lines = message.split("\n")
-            first_line = lines[0]
-            record.message = first_line
-            formatted_first_line = super().format(record)
+        # Add default package if not present
+        if not hasattr(record, "package"):
+            record.package = getattr(record, "name", "unknown")
+
+        # Format the record exactly once; the result already contains every line
+        formatted = super().format(record)
+
+        # Check if the message has multiple lines
+        lines = formatted.split("\n")
+        if len(lines) > 1:
+            # Start from the first line only, so continuation lines are not emitted twice
+            formatted_first_line = lines[0]
            formatted_lines = [formatted_first_line]
-            for line in lines[1:]:
-                formatted_lines.append(line)
+            formatted_lines.extend(lines[1:])
            return "\n".join(formatted_lines)
-        else:
-            return super().format(record)
+        return formatted


 # Create a MultilineColoredFormatter object for colorized logging
@@ -89,7 +94,7 @@ stream = logging.StreamHandler()
 stream.setFormatter(formatter)


-def get_colorful_logger(name="main"):
+def get_colorful_logger(name: str = "main") -> logging.Logger:
    # Create and configure the logger
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)