# core/scripts/preload_models.py

#!/usr/bin/env python3
"""
🚀 Предзагрузка HuggingFace моделей для кеширования в Docker

Этот скрипт загружает модели заранее при сборке Docker образа,
чтобы избежать загрузки во время первого запуска приложения.
"""

import os
import sys
from pathlib import Path


def get_models_cache_dir() -> str:
    """Pick the directory used for the model cache, creating it if needed.

    ``/dump/huggingface`` is preferred when ``/dump`` exists and is writable.
    If that is not the case, or creating the preferred directory fails, the
    function falls back to ``./dump/huggingface`` (relative to the current
    working directory).

    Returns:
        The path of the chosen cache directory as a string.

    Raises:
        OSError: If the fallback directory cannot be created.
    """
    preferred_root = "/dump"
    if Path(preferred_root).exists() and os.access(preferred_root, os.W_OK):
        preferred = "/dump/huggingface"
        try:
            Path(preferred).mkdir(parents=True, exist_ok=True)
        except Exception:  # noqa: S110
            # Best effort: any failure here simply means the fallback is used.
            pass
        else:
            return preferred

    # Fallback: a local ./dump folder.
    fallback = "./dump/huggingface"
    Path(fallback).mkdir(parents=True, exist_ok=True)
    return fallback


# Resolve the cache directory once at import time and publish it through the
# cache-related environment variables.
# NOTE(review): recent transformers releases reportedly deprecate
# TRANSFORMERS_CACHE in favour of HF_HOME — confirm before dropping either.
MODELS_CACHE_DIR = get_models_cache_dir()
os.environ.update(
    TRANSFORMERS_CACHE=MODELS_CACHE_DIR,
    HF_HOME=MODELS_CACHE_DIR,
)


def is_model_cached(model_name: str) -> bool:
    """Report whether a model already appears to be present in the cache.

    The lookup happens under ``MODELS_CACHE_DIR`` in a folder named
    ``models--<org>--<name>``. A bare ``model_name`` is looked up under the
    ``sentence-transformers`` organisation; a name that already contains an
    organisation (``"org/name"``) is used as given.

    The model counts as cached when a ``config.json`` exists in any directory
    below ``snapshots/`` or, failing that, directly in the model folder.
    Only the presence of ``config.json`` is checked, not the other model files.

    Args:
        model_name: Model name, either bare (``"all-MiniLM-L6-v2"``) or
            organisation-qualified (``"org/name"``).

    Returns:
        ``True`` if a cached copy was found; ``False`` otherwise, including
        when the cache cannot be inspected.
    """
    # Bare names keep the original behaviour (sentence-transformers org);
    # qualified ids map "/" to "--" instead of producing a nested, never
    # matching path.
    if "/" in model_name:
        repo_id = model_name
    else:
        repo_id = f"sentence-transformers/{model_name}"
    model_path = Path(MODELS_CACHE_DIR) / ("models--" + repo_id.replace("/", "--"))

    try:
        if not model_path.exists():
            return False

        # Snapshot layout: look for any snapshot directory holding config.json.
        snapshots_path = model_path / "snapshots"
        if snapshots_path.is_dir() and any(
            (snapshot_dir / "config.json").exists()
            for snapshot_dir in snapshots_path.iterdir()
            if snapshot_dir.is_dir()
        ):
            return True

        # Fallback: config.json stored directly in the model folder.
        return (model_path / "config.json").exists()
    except (OSError, ValueError):
        # An unreadable cache is treated as "not cached"; the caller then
        # attempts a fresh download.
        return False


# Script body: runs at import/execution time. For every model in the list it
# skips models already found in the cache, otherwise instantiates the model
# with ``cache_folder=MODELS_CACHE_DIR``, and finally prints a summary.
# Exits with status 1 if the dependency import or the surrounding setup fails.
try:
    from sentence_transformers import SentenceTransformer

    # Make sure the cache directory exists.
    Path(MODELS_CACHE_DIR).mkdir(parents=True, exist_ok=True)
    print(f"📁 Created cache directory: {MODELS_CACHE_DIR}")

    # Models to preload.
    models = [
        "paraphrase-multilingual-MiniLM-L12-v2",  # Primary multilingual model
        "all-MiniLM-L6-v2",  # Fallback model
    ]

    # Names of models that could not be cached, reported in the final summary.
    failed_models = []

    for model_name in models:
        try:
            if is_model_cached(model_name):
                print(f"🔍 Found cached model: {model_name}")
                continue

            print(f"🔽 Downloading model: {model_name}")
            model = SentenceTransformer(model_name, cache_folder=MODELS_CACHE_DIR)
            print(f"✅ Successfully cached: {model_name}")

            # Release the memory held by the loaded model.
            del model

        except Exception as e:
            # A failure for one model must not prevent the others from being
            # attempted, so it is recorded and the loop continues.
            print(f"❌ Failed to download {model_name}: {e}")
            failed_models.append(model_name)

    # Do not claim success when some models were not cached.
    # NOTE(review): the exit status intentionally stays 0 here, as before, so
    # existing builds keep passing — confirm whether a failed preload should
    # fail the build instead.
    if failed_models:
        print(f"⚠️ Model preloading finished with errors, not cached: {', '.join(failed_models)}")
    else:
        print("🚀 Model preloading completed!")

except ImportError as e:
    print(f"❌ Failed to import dependencies: {e}")
    sys.exit(1)
except Exception as e:
    print(f"❌ Unexpected error: {e}")
    sys.exit(1)