refactor(search.py): skip title-only shouts (no body content) so they are not re-indexed on every startup
All checks were successful
Deploy on push / deploy (push) Successful in 42s
This commit is contained in:
parent c0406dbbf2
commit 3062a2b7de
@@ -10,6 +10,9 @@ from datetime import datetime, timedelta
 # Set up proper logging
 logger = logging.getLogger("search")
 logger.setLevel(logging.INFO)  # Change to INFO to see more details
+# Disable noise HTTP client logging
+logging.getLogger("httpx").setLevel(logging.WARNING)
+logging.getLogger("httpcore").setLevel(logging.WARNING)

 # Configuration for search service
 SEARCH_ENABLED = bool(os.environ.get("SEARCH_ENABLED", "true").lower() in ["true", "1", "yes"])
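The effect of this first hunk can be sanity-checked in isolation. The snippet below is a hedged sketch, not part of the repository: the logger names come from the diff, the messages are made up, and it only demonstrates that INFO-level records on the httpx/httpcore loggers are dropped once their level is raised to WARNING.

# Hypothetical check (not in the repo): after the change, per-request INFO
# lines from the HTTP client libraries are suppressed while the search
# service's own INFO logging keeps working.
import logging

logging.basicConfig(level=logging.INFO)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)

logging.getLogger("httpx").info("suppressed: below the WARNING threshold")
logging.getLogger("search").info("still visible: search logger stays at INFO")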
@@ -798,28 +801,37 @@ async def initialize_search_index(shouts_data)
     if problem_docs:
         await search_service.bulk_index(problem_docs)

-    db_ids = [str(shout.id) for shout in shouts_data]
-    try:
-        numeric_ids = [int(sid) for sid in db_ids if sid.isdigit()]
-        if numeric_ids:
-            min_id = min(numeric_ids)
-            max_id = max(numeric_ids)
-            id_range = max_id - min_id + 1
-    except Exception as e:
-        pass
+    # Only consider shouts with body content for body verification
+    def has_body_content(shout):
+        for field in ['subtitle', 'lead', 'body']:
+            if getattr(shout, field, None) and isinstance(getattr(shout, field, None), str) and getattr(shout, field).strip():
+                return True
+        media = getattr(shout, 'media', None)
+        if media:
+            if isinstance(media, str):
+                try:
+                    media_json = json.loads(media)
+                    if isinstance(media_json, dict) and (media_json.get('title') or media_json.get('body')):
+                        return True
+                except Exception:
+                    return True
+            elif isinstance(media, dict):
+                if media.get('title') or media.get('body'):
+                    return True
+        return False
+
+    shouts_with_body = [shout for shout in shouts_data if has_body_content(shout)]
+    body_ids = [str(shout.id) for shout in shouts_with_body]

     if abs(indexed_doc_count - len(shouts_data)) > 10:
         doc_ids = [str(shout.id) for shout in shouts_data]

         verification = await search_service.verify_docs(doc_ids)

         if verification.get("status") == "error":
             return

-        missing_ids = verification.get("missing", [])
+        # Only reindex missing docs that actually have body content
+        missing_ids = [mid for mid in verification.get("missing", []) if mid in body_ids]
         if missing_ids:
-            missing_docs = [shout for shout in shouts_data if str(shout.id) in missing_ids]
+            missing_docs = [shout for shout in shouts_with_body if str(shout.id) in missing_ids]
             await search_service.bulk_index(missing_docs)
         else:
             pass
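For illustration, here is a minimal sketch of how the new filter behaves at startup. The SimpleNamespace objects are hypothetical stand-ins for the ORM shout records (field names taken from the diff); only the has_body_content logic mirrors the commit. A shout whose only text is its title is excluded from body_ids, so a verification miss for it no longer triggers another bulk_index call.

# Hypothetical illustration of the filter added in this commit (not part of the repo).
import json
from types import SimpleNamespace

def has_body_content(shout):
    # Mirrors the helper introduced above: any non-empty text field counts as a body.
    for field in ['subtitle', 'lead', 'body']:
        value = getattr(shout, field, None)
        if value and isinstance(value, str) and value.strip():
            return True
    media = getattr(shout, 'media', None)
    if media:
        if isinstance(media, str):
            try:
                media_json = json.loads(media)
                if isinstance(media_json, dict) and (media_json.get('title') or media_json.get('body')):
                    return True
            except Exception:
                return True  # unparseable media is treated as content, as in the diff
        elif isinstance(media, dict):
            if media.get('title') or media.get('body'):
                return True
    return False

# Hypothetical shouts: the first has only a title, the second has a body.
title_only = SimpleNamespace(id=1, title="Draft", subtitle=None, lead=None, body="", media=None)
with_body = SimpleNamespace(id=2, title="Post", subtitle=None, lead=None, body="text", media=None)

shouts_data = [title_only, with_body]
body_ids = [str(s.id) for s in shouts_data if has_body_content(s)]
print(body_ids)  # ['2'] -- only the shout with body content would be re-indexed if reported missing

Under this filter a title-only shout that the index reports as missing stays missing by design, which is what stops the repeated re-indexing on every startup.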