From a10db2d38a8e8ca3b5efcef4882fce957be10aac Mon Sep 17 00:00:00 2001
From: Stepan Vladovskiy
Date: Thu, 24 Apr 2025 13:35:36 -0300
Subject: [PATCH] feat(search.py): combined search on shout titles and bodies

---
 resolvers/reader.py |  87 ++++++++++++++++++++++++++++++++---
 services/search.py  | 109 ++++++++++++++++++++------------------------
 2 files changed, 129 insertions(+), 67 deletions(-)

diff --git a/resolvers/reader.py b/resolvers/reader.py
index aeb60e50..a6bae143 100644
--- a/resolvers/reader.py
+++ b/resolvers/reader.py
@@ -10,7 +10,7 @@ from orm.shout import Shout, ShoutAuthor, ShoutTopic
 from orm.topic import Topic
 from services.db import json_array_builder, json_builder, local_session
 from services.schema import query
-from services.search import search_text, get_search_count
+from services.search import search_body_text, search_title_text, search_author_text, get_body_search_count, get_title_search_count, get_author_search_count
 from services.viewed import ViewedStorage
 from utils.logger import root_logger as logger
 
@@ -393,8 +393,51 @@ async def load_shouts_search(_, info, text, options):
     offset = options.get("offset", 0)
 
     if isinstance(text, str) and len(text) > 2:
-        # Get search results with pagination
-        results = await search_text(text, limit, offset)
+        # Search in titles and bodies, then combine the results
+        title_results = await search_title_text(text, limit * 2, 0)
+        body_results = await search_body_text(text, limit * 2, 0)
+
+        # Also get author search results if requested
+        include_authors = options.get("include_authors", False)
+        author_results = []
+        if include_authors:
+            author_results = await search_author_text(text, limit, 0)
+            # Process author results differently if needed
+
+        # Combine results and deduplicate by ID
+        combined_results = {}
+
+        # Process title results first (typically more relevant)
+        for result in title_results:
+            shout_id = result.get("id")
+            if shout_id:
+                combined_results[shout_id] = {
+                    "id": shout_id,
+                    "score": result.get("score", 0) * 1.2  # Slightly boost title matches
+                }
+
+        # Process body results, keeping the higher score if already present
+        for result in body_results:
+            shout_id = result.get("id")
+            if shout_id:
+                if shout_id in combined_results:
+                    # Keep the higher score
+                    combined_results[shout_id]["score"] = max(
+                        combined_results[shout_id]["score"],
+                        result.get("score", 0)
+                    )
+                else:
+                    combined_results[shout_id] = {
+                        "id": shout_id,
+                        "score": result.get("score", 0)
+                    }
+
+        # Convert to a list and sort by score
+        results = list(combined_results.values())
+        results.sort(key=lambda x: x.get("score", 0), reverse=True)
+
+        # Apply pagination
+        results = results[offset:offset+limit]
 
         # If no results, return empty list
         if not results:
@@ -416,7 +459,6 @@
         q = q.filter(Shout.id.in_(hits_ids))
         q = apply_filters(q, options.get("filters", {}))
-        #
         shouts = get_shouts_with_links(info, q, len(hits_ids), 0)
 
         # Add scores from search results
@@ -427,10 +469,29 @@
         # Re-sort by search score to maintain ranking
         shouts.sort(key=lambda x: scores.get(str(x['id']), 0), reverse=True)
 
+        # Add author search results to the response if requested
+        if include_authors and author_results:
+            # Format author results according to the response schema
+            formatted_authors = []
+            for author in author_results:
+                formatted_authors.append({
+                    "id": author.get("id"),
+                    "name": author.get("name", ""),
+                    "score": author.get("score", 0),
+                    "bio": author.get("bio", "")
+                })
+
+            # Return combined results
+            return {
+                "shouts": shouts,
+                "authors": formatted_authors
+            }
+
         return shouts
 
     return []
 
+
 @query.field("get_search_results_count")
 async def get_search_results_count(_, info, text):
     """
     :return: Total count of results
     """
     if isinstance(text, str) and len(text) > 2:
-        count = await get_search_count(text)
-        return {"count": count}
-    return {"count": 0}
+        # Get counts from the title, body and author searches
+        body_count = await get_body_search_count(text)
+        title_count = await get_title_search_count(text)
+        author_count = await get_author_search_count(text)
+
+        # Return combined counts
+        return {
+            "count": body_count + title_count,  # Shouts matching in both fields are counted twice
+            "details": {
+                "body_count": body_count,
+                "title_count": title_count,
+                "author_count": author_count
+            }
+        }
+    return {"count": 0, "details": {"body_count": 0, "title_count": 0, "author_count": 0}}
 
 
 @query.field("load_shouts_unrated")
diff --git a/services/search.py b/services/search.py
index 33fe9712..875edfe9 100644
--- a/services/search.py
+++ b/services/search.py
@@ -615,82 +615,71 @@ class SearchService:
 
     #*******************
-    # Specialized search methods for titles, bodies, and authors
+    # Specialized search methods for shouts and authors
+    #*******************
 
-    async def search_titles(self, text, limit=10, offset=0):
-        """Search only in titles using the specialized endpoint"""
-        if not self.available or not text.strip():
+    async def search(self, text, limit, offset):
+        """Search documents"""
+        if not self.available:
+            return []
+
+        if not isinstance(text, str) or not text.strip():
             return []
 
-        cache_key = f"title:{text}"
+        logger.info(f"Searching for: '{text}' (limit={limit}, offset={offset})")
 
-        # Try cache first if enabled
+        # Check if we can serve from cache
         if SEARCH_CACHE_ENABLED:
-            if await self.cache.has_query(cache_key):
-                return await self.cache.get(cache_key, limit, offset)
+            has_cache = await self.cache.has_query(text)
+            if has_cache:
+                cached_results = await self.cache.get(text, limit, offset)
+                if cached_results is not None:
+                    return cached_results
 
+        # Not in cache or cache disabled, perform new search
         try:
-            logger.info(f"Searching titles for: '{text}' (limit={limit}, offset={offset})")
+            # When caching, prefetch a larger batch once so later offsets hit the cache
+            if SEARCH_CACHE_ENABLED:
+                search_limit = SEARCH_PREFETCH_SIZE
+                search_offset = 0
+            else:
+                search_limit = limit
+                search_offset = offset
+
             response = await self.client.post(
-                "/search-title",
-                json={"text": text, "limit": limit + offset}
+                "/search-combined",
+                json={"text": text, "limit": search_limit, "offset": search_offset}
             )
             response.raise_for_status()
 
             result = response.json()
-            title_results = result.get("results", [])
-            # Apply score filtering if needed
+            formatted_results = result.get("results", [])
+
+            # Keep only results with valid numeric ids
+            valid_results = []
+            for item in formatted_results:
+                doc_id = item.get("id")
+                if doc_id and doc_id.isdigit():
+                    valid_results.append(item)
+
+            if len(valid_results) != len(formatted_results):
+                formatted_results = valid_results
+
             if SEARCH_MIN_SCORE > 0:
-                title_results = [r for r in title_results if r.get("score", 0) >= SEARCH_MIN_SCORE]
-
-            # Store in cache if enabled
+                formatted_results = [r for r in formatted_results if r.get("score", 0) >= SEARCH_MIN_SCORE]
+
             if SEARCH_CACHE_ENABLED:
+                await self.cache.store(text, formatted_results)
+                end_idx = offset + limit
+                page_results = formatted_results[offset:end_idx]
+                return page_results
+
+            return formatted_results
         except Exception as e:
-            logger.error(f"Error searching titles for '{text}': {e}")
-            return []
-
-    async def search_bodies(self, text, limit=10, offset=0):
-        """Search only in document bodies using the specialized endpoint"""
-        if not self.available or not text.strip():
-            return []
-
-        cache_key = f"body:{text}"
-
-        # Try cache first if enabled
-        if SEARCH_CACHE_ENABLED:
-            if await self.cache.has_query(cache_key):
-                return await self.cache.get(cache_key, limit, offset)
-
-        try:
-            logger.info(f"Searching bodies for: '{text}' (limit={limit}, offset={offset})")
-            response = await self.client.post(
-                "/search-body",
-                json={"text": text, "limit": limit + offset}
-            )
-            response.raise_for_status()
-
-            result = response.json()
-            body_results = result.get("results", [])
-
-            # Apply score filtering if needed
-            if SEARCH_MIN_SCORE > 0:
-                body_results = [r for r in body_results if r.get("score", 0) >= SEARCH_MIN_SCORE]
-
-            # Store in cache if enabled
-            if SEARCH_CACHE_ENABLED:
-                await self.cache.store(cache_key, body_results)
-
-            # Apply offset/limit
-            return body_results[offset:offset+limit]
-
-        except Exception as e:
-            logger.error(f"Error searching bodies for '{text}': {e}")
+            logger.error(f"Search error for '{text}': {e}", exc_info=True)
             return []
 
     async def search_authors(self, text, limit=10, offset=0):
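
Reviewer note (not part of the patch): the ranking strategy introduced in
load_shouts_search can be checked in isolation. Below is a minimal sketch of
the same combine-boost-deduplicate-paginate flow; the TITLE_BOOST constant,
the helper name and the sample hits are illustrative assumptions, not code
from this commit.

    TITLE_BOOST = 1.2

    def combine_search_results(title_hits, body_hits, limit, offset):
        combined = {}
        # Title matches enter first, slightly boosted
        for hit in title_hits:
            shout_id = hit.get("id")
            if shout_id:
                combined[shout_id] = {"id": shout_id, "score": hit.get("score", 0) * TITLE_BOOST}
        # Body matches keep the higher of the two scores on collision
        for hit in body_hits:
            shout_id = hit.get("id")
            if shout_id:
                score = hit.get("score", 0)
                if shout_id in combined:
                    combined[shout_id]["score"] = max(combined[shout_id]["score"], score)
                else:
                    combined[shout_id] = {"id": shout_id, "score": score}
        ranked = sorted(combined.values(), key=lambda r: r["score"], reverse=True)
        return ranked[offset:offset + limit]

    # A shout matching both fields ranks by its boosted title score:
    # max(0.8 * 1.2, 0.9) = 0.96
    title_hits = [{"id": "7", "score": 0.8}]
    body_hits = [{"id": "7", "score": 0.9}, {"id": "9", "score": 0.5}]
    assert combine_search_results(title_hits, body_hits, 10, 0)[0]["id"] == "7"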
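
Reviewer note on get_search_results_count: the title and body counts come
from two separate index queries, so their sum can exceed the number of
distinct shouts. With hypothetical numbers:

    # Illustrative numbers only, not taken from this commit
    title_count, body_count = 10, 25     # suppose 4 shouts match in both fields
    combined = title_count + body_count  # 35; the 4 overlapping shouts are counted twice

A deduplicated total would require the same id-level merge that
load_shouts_search performs on the result lists.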
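
Reviewer note on the new SearchService.search: when caching is enabled it
requests SEARCH_PREFETCH_SIZE results at offset 0, stores the whole batch,
and slices the requested page locally, so later pages for the same query are
served from cache. A minimal sketch of that pattern, assuming an illustrative
dict-based cache and fetch coroutine rather than the real SearchCache:

    SEARCH_PREFETCH_SIZE = 200

    async def search_page(fetch, cache, text, limit, offset):
        if text in cache:
            # Later pages for the same query are sliced locally
            return cache[text][offset:offset + limit]
        results = await fetch(text, SEARCH_PREFETCH_SIZE)  # one large upstream call
        cache[text] = results
        return results[offset:offset + limit]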