2025-06-01 23:56:11 +00:00
|
|
|
import asyncio
|
2025-02-09 19:26:50 +00:00
|
|
|
import concurrent.futures
|
2025-06-01 23:56:11 +00:00
|
|
|
from concurrent.futures import Future
|
|
|
|
from typing import Any, Optional
|
|
|
|
|
|
|
|
try:
|
|
|
|
from utils.logger import root_logger as logger
|
|
|
|
except ImportError:
|
|
|
|
import logging
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
class PreTopicService:
    """Topic suggestion and content search backed by lazily loaded embeddings.

    Constructing the service is cheap: no model is loaded until the first
    call to :meth:`suggest_topics` or :meth:`search_content`. That first call
    schedules :meth:`_prepare_embeddings` on a private thread pool and awaits
    it; later calls reuse the same (already completed) future.

    If the embeddings cannot be prepared (``txtai`` missing or any other
    failure), the public methods degrade to returning an empty list.
    """

    def __init__(self) -> None:
        """Create the service without loading any embedding model."""
        # Both stay ``None`` until ``_prepare_embeddings`` succeeds.
        self.topic_embeddings: Optional[Any] = None
        self.search_embeddings: Optional[Any] = None
        self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=2)
        # Set exactly once, by the first ``_ensure_initialization`` call.
        self._initialization_future: Optional[Future[None]] = None

    def _ensure_initialization(self) -> None:
        """Schedule embedding preparation on the executor if not yet scheduled."""
        if self._initialization_future is None:
            self._initialization_future = self._executor.submit(self._prepare_embeddings)

    def _prepare_embeddings(self) -> None:
        """Build the topic and search embeddings.

        ``txtai`` is imported here rather than at module level so the module
        can be imported when the package is not installed. Failures are
        logged and never raised to the caller; whichever attribute was not
        assigned before the failure keeps its previous value.
        """
        try:
            from txtai.embeddings import Embeddings  # type: ignore[import-untyped]

            # Initialize topic embeddings
            self.topic_embeddings = Embeddings(
                {
                    "method": "transformers",
                    "path": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
                }
            )

            # Initialize search embeddings
            self.search_embeddings = Embeddings(
                {
                    "method": "transformers",
                    "path": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
                }
            )
            logger.info("PreTopic embeddings initialized successfully")
        except ImportError:
            logger.warning("txtai.embeddings not available, PreTopicService disabled")
        except Exception as e:
            logger.error(f"Failed to initialize embeddings: {e}")

    async def _wait_until_initialized(self) -> None:
        """Trigger initialization if needed and wait for it to finish."""
        self._ensure_initialization()
        if self._initialization_future is not None:
            await asyncio.wrap_future(self._initialization_future)

    async def suggest_topics(self, text: str) -> list[dict[str, Any]]:
        """Suggest topics based on text content.

        Args:
            text: Text to find the closest topic for.

        Returns:
            A list of ``{"topic": ..., "score": ...}`` dicts built from the
            top search result, or an empty list when the embeddings are
            unavailable, nothing matches, or an error occurs (the error is
            logged, not raised).
        """
        try:
            # Initialization must be triggered *before* checking the
            # attribute: it starts out as ``None`` and only the
            # initialization task ever sets it.
            await self._wait_until_initialized()

            if self.topic_embeddings is not None:
                results = self.topic_embeddings.search(text, 1)
                if results:
                    # NOTE(review): assumes each result is a mapping with
                    # "text" and "score" keys - confirm against the txtai
                    # configuration in use.
                    return [{"topic": result["text"], "score": result["score"]} for result in results]
        except Exception as e:
            logger.error(f"Error suggesting topics: {e}")
        return []

    async def search_content(self, query: str, limit: int = 10) -> list[dict[str, Any]]:
        """Search content using embeddings.

        Args:
            query: Search query text.
            limit: Maximum number of results requested from the index.

        Returns:
            A list of ``{"content": ..., "score": ...}`` dicts, or an empty
            list when the embeddings are unavailable, nothing matches, or an
            error occurs (the error is logged, not raised).
        """
        try:
            # See ``suggest_topics``: initialize first, then check.
            await self._wait_until_initialized()

            if self.search_embeddings is not None:
                results = self.search_embeddings.search(query, limit)
                if results:
                    # NOTE(review): assumes each result is a mapping with
                    # "text" and "score" keys - confirm against the txtai
                    # configuration in use.
                    return [{"content": result["text"], "score": result["score"]} for result in results]
        except Exception as e:
            logger.error(f"Error searching content: {e}")
        return []
|
|
|
|
|
2025-02-09 19:26:50 +00:00
|
|
|
|
2025-06-01 23:56:11 +00:00
|
|
|
# Global instance
|
|
|
|
# Shared module-level instance. Constructing it only sets the embedding
# attributes to None and creates the thread pool; no model is loaded here.
pretopic_service = PreTopicService()
|