refactor(search.py): skip title-only shouts (no body content) so they are not re-indexed on every startup
All checks were successful
Deploy on push / deploy (push) Successful in 42s
This commit is contained in:
parent c0406dbbf2
commit 3062a2b7de
@@ -10,6 +10,9 @@ from datetime import datetime, timedelta
 # Set up proper logging
 logger = logging.getLogger("search")
 logger.setLevel(logging.INFO)  # Change to INFO to see more details
+# Disable noise HTTP client logging
+logging.getLogger("httpx").setLevel(logging.WARNING)
+logging.getLogger("httpcore").setLevel(logging.WARNING)

 # Configuration for search service
 SEARCH_ENABLED = bool(os.environ.get("SEARCH_ENABLED", "true").lower() in ["true", "1", "yes"])
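The effect of this first hunk can be sanity-checked in isolation. The snippet below is a hedged sketch, not part of the repository: the logger names come from the diff, the messages are made up, and it only demonstrates that INFO-level records on the httpx/httpcore loggers are dropped once their level is raised to WARNING.

# Hypothetical check (not in the repo): after the change, per-request INFO
# lines from the HTTP client libraries are suppressed while the search
# service's own INFO logging keeps working.
import logging

logging.basicConfig(level=logging.INFO)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)

logging.getLogger("httpx").info("suppressed: below the WARNING threshold")
logging.getLogger("search").info("still visible: search logger stays at INFO")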
@@ -798,28 +801,37 @@ async def initialize_search_index(shouts_data)
     if problem_docs:
         await search_service.bulk_index(problem_docs)

-    db_ids = [str(shout.id) for shout in shouts_data]
-    try:
-        numeric_ids = [int(sid) for sid in db_ids if sid.isdigit()]
-        if numeric_ids:
-            min_id = min(numeric_ids)
-            max_id = max(numeric_ids)
-            id_range = max_id - min_id + 1
-    except Exception as e:
-        pass
+    # Only consider shouts with body content for body verification
+    def has_body_content(shout):
+        for field in ['subtitle', 'lead', 'body']:
+            if getattr(shout, field, None) and isinstance(getattr(shout, field, None), str) and getattr(shout, field).strip():
+                return True
+        media = getattr(shout, 'media', None)
+        if media:
+            if isinstance(media, str):
+                try:
+                    media_json = json.loads(media)
+                    if isinstance(media_json, dict) and (media_json.get('title') or media_json.get('body')):
+                        return True
+                except Exception:
+                    return True
+            elif isinstance(media, dict):
+                if media.get('title') or media.get('body'):
+                    return True
+        return False
+
+    shouts_with_body = [shout for shout in shouts_data if has_body_content(shout)]
+    body_ids = [str(shout.id) for shout in shouts_with_body]

     if abs(indexed_doc_count - len(shouts_data)) > 10:
         doc_ids = [str(shout.id) for shout in shouts_data]

         verification = await search_service.verify_docs(doc_ids)

         if verification.get("status") == "error":
             return

-        missing_ids = verification.get("missing", [])
+        # Only reindex missing docs that actually have body content
+        missing_ids = [mid for mid in verification.get("missing", []) if mid in body_ids]
         if missing_ids:
-            missing_docs = [shout for shout in shouts_data if str(shout.id) in missing_ids]
+            missing_docs = [shout for shout in shouts_with_body if str(shout.id) in missing_ids]
             await search_service.bulk_index(missing_docs)
         else:
             pass
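For illustration, here is a minimal sketch of how the new filter behaves at startup. The SimpleNamespace objects are hypothetical stand-ins for the ORM shout records (field names taken from the diff); only the has_body_content logic mirrors the commit. A shout whose only text is its title is excluded from body_ids, so a verification miss for it no longer triggers another bulk_index call.

# Hypothetical illustration of the filter added in this commit (not part of the repo).
import json
from types import SimpleNamespace

def has_body_content(shout):
    # Mirrors the helper introduced above: any non-empty text field counts as a body.
    for field in ['subtitle', 'lead', 'body']:
        value = getattr(shout, field, None)
        if value and isinstance(value, str) and value.strip():
            return True
    media = getattr(shout, 'media', None)
    if media:
        if isinstance(media, str):
            try:
                media_json = json.loads(media)
                if isinstance(media_json, dict) and (media_json.get('title') or media_json.get('body')):
                    return True
            except Exception:
                return True  # unparseable media is treated as content, as in the diff
        elif isinstance(media, dict):
            if media.get('title') or media.get('body'):
                return True
    return False

# Hypothetical shouts: the first has only a title, the second has a body.
title_only = SimpleNamespace(id=1, title="Draft", subtitle=None, lead=None, body="", media=None)
with_body = SimpleNamespace(id=2, title="Post", subtitle=None, lead=None, body="text", media=None)

shouts_data = [title_only, with_body]
body_ids = [str(s.id) for s in shouts_data if has_body_content(s)]
print(body_ids)  # ['2'] -- only the shout with body content would be re-indexed if reported missing

Under this filter a title-only shout that the index reports as missing stays missing by design, which is what stops the repeated re-indexing on every startup.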