102 lines
3.5 KiB
Python
102 lines
3.5 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
🚀 Предзагрузка HuggingFace моделей для кеширования в Docker
|
|||
|
|
|
|||
|
|
Этот скрипт загружает модели заранее при сборке Docker образа,
|
|||
|
|
чтобы избежать загрузки во время первого запуска приложения.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
from pathlib import Path
|
|||
|
|
|
|||
|
|
|
|||
|
|
def get_models_cache_dir() -> str:
    """Pick the directory used as the model cache and make sure it exists.

    ``/dump/huggingface`` is preferred when ``/dump`` exists and is writable
    by the current process. Otherwise ``./dump/huggingface`` (relative to the
    current working directory) is used.

    Returns:
        The chosen cache directory as a string, exactly
        ``"/dump/huggingface"`` or ``"./dump/huggingface"``.

    Raises:
        OSError: If the fallback directory cannot be created.
    """
    dump_path = Path("/dump")
    if dump_path.exists() and os.access(dump_path, os.W_OK):
        cache_dir = "/dump/huggingface"
        try:
            Path(cache_dir).mkdir(parents=True, exist_ok=True)
        except OSError:
            # Best effort: /dump looked writable but the sub-directory could
            # not be created, so fall through to the local fallback. Only
            # filesystem errors are tolerated; anything else is a real bug
            # and must surface.
            pass
        else:
            return cache_dir

    # Fallback: a local ./dump folder next to the working directory.
    cache_dir = "./dump/huggingface"
    Path(cache_dir).mkdir(parents=True, exist_ok=True)
    return cache_dir
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Configure the cache-related environment variables: both of them point at
# the directory returned by get_models_cache_dir().
MODELS_CACHE_DIR = get_models_cache_dir()
os.environ.update(
    TRANSFORMERS_CACHE=MODELS_CACHE_DIR,
    HF_HOME=MODELS_CACHE_DIR,
)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def is_model_cached(model_name: str, cache_dir: "str | None" = None) -> bool:
    """🔍 Check whether a model is already present in the local cache.

    The model is looked up in ``<cache>/models--<org>--<name>``. A bare
    ``model_name`` (no ``/``) is looked up under the
    ``sentence-transformers`` organisation, as before; an ``org/name`` id is
    mapped to its own ``models--org--name`` folder instead of being glued
    onto the ``sentence-transformers`` prefix.

    A model counts as cached when either
    * any sub-directory of ``snapshots/`` contains a ``config.json``, or
    * a ``config.json`` sits directly in the model folder (older layout).

    Args:
        model_name: Model name, either bare (``"all-MiniLM-L6-v2"``) or a
            full ``org/name`` repository id.
        cache_dir: Cache root to inspect. Defaults to the module-level
            ``MODELS_CACHE_DIR``.

    Returns:
        ``True`` if a cached copy was found, ``False`` otherwise. Filesystem
        errors while probing the cache are also reported as ``False``.
    """
    root = Path(MODELS_CACHE_DIR if cache_dir is None else cache_dir)

    if "/" in model_name:
        repo_id = model_name
    else:
        repo_id = f"sentence-transformers/{model_name}"
    model_dir = root / ("models--" + repo_id.replace("/", "--"))

    try:
        if not model_dir.exists():
            return False

        # Snapshot layout: any snapshot directory holding a config.json.
        snapshots_dir = model_dir / "snapshots"
        if snapshots_dir.is_dir() and any(
            (snapshot / "config.json").exists()
            for snapshot in snapshots_dir.iterdir()
            if snapshot.is_dir()
        ):
            return True

        # Fallback: older layout with config.json directly in the model dir.
        return (model_dir / "config.json").exists()
    except OSError:
        # An unreadable cache is treated as "not cached" so the caller simply
        # attempts the download again.
        return False
|
|||
|
|
|
|||
|
|
|
|||
|
|
try:
    from sentence_transformers import SentenceTransformer

    # Make sure the cache directory exists before anything is fetched.
    Path(MODELS_CACHE_DIR).mkdir(parents=True, exist_ok=True)
    print(f"📁 Created cache directory: {MODELS_CACHE_DIR}")

    # Models to preload.
    models = [
        "paraphrase-multilingual-MiniLM-L12-v2",  # primary multilingual model
        "all-MiniLM-L6-v2",  # fallback model
    ]

    for model_name in models:
        # A failure for one model is reported and the loop moves on to the
        # next one; it does not abort the whole script.
        try:
            if not is_model_cached(model_name):
                print(f"🔽 Downloading model: {model_name}")
                # Only the construction side effect matters here (presumably
                # it fetches the model into cache_folder — confirm against
                # the library); the instance is not kept, so its memory is
                # released right away.
                SentenceTransformer(model_name, cache_folder=MODELS_CACHE_DIR)
                print(f"✅ Successfully cached: {model_name}")
            else:
                print(f"🔍 Found cached model: {model_name}")
        except Exception as exc:
            print(f"❌ Failed to download {model_name}: {exc}")

    print("🚀 Model preloading completed!")

except ImportError as exc:
    print(f"❌ Failed to import dependencies: {exc}")
    sys.exit(1)
except Exception as exc:
    print(f"❌ Unexpected error: {exc}")
    sys.exit(1)
|