NOTE(review): this patch text was re-flowed from a collapsed form using
conventional 4-space indentation; confirm context lines against the target
files. Several pattern literals in cleanup_html are empty or contain bare
line breaks and look truncated; they are kept verbatim below. Restore each as
a one-line literal from the original commit before applying (the hunk line
counts assume one line per literal).

diff --git a/migration/extract.py b/migration/extract.py
index d67275a9..4c8a95b7 100644
--- a/migration/extract.py
+++ b/migration/extract.py
@@ -251,7 +251,7 @@ def extract_md_images(body, prefix):
     return newbody
 
 
-def cleanup(body):
+def cleanup_md(body):
     newbody = (
         body.replace("<", "")
         .replace(">", "")
@@ -274,7 +274,7 @@ def cleanup(body):
 def extract_md(body, shout_dict = None):
     newbody = body
     if newbody:
-        newbody = cleanup(newbody)
+        newbody = cleanup_md(newbody)
         if not newbody:
             raise Exception("cleanup error")
 
@@ -375,8 +375,54 @@ def prepare_html_body(entry):
     return body
 
 
-def extract_html(entry, shout_id = None):
+def cleanup_html(body: str) -> str:
+    """Strip known markup noise from a raw HTML string.
+
+    Each pattern in ``regex_remove`` is deleted and each key of
+    ``regex_replace`` is substituted by its value via ``re.sub``.
+    """
+    new_body = body
+    regex_remove = [
+        r"style=\"width:\s*\d+px;height:\s*\d+px;\"",
+        r"style=\"width:\s*\d+px;\"",
+        r"style=\"color: #000000;\"",
+        r"style=\"float: none;\"",
+        r"style=\"background: white;\"",
+        r"class=\"Apple-interchange-newline\"",
+        r"class=\"MsoNormalCxSpMiddle\"",
+        r"class=\"MsoNormal\"",
+        r"lang=\"EN-US\"",
+        r"id=\"docs-internal-guid-[\w-]+\"",
+        r"

",
+        r"",
+        r"",
+        r"",
+        r"

",
+        r"

",
+        r"

",
+        r"

",
+        r"
",
+    ]
+    regex_replace = {
+        r"

": "

"
+    }
+    for regex in regex_remove:
+        new_body = re.sub(regex, "", new_body)
+    for regex, replace in regex_replace.items():
+        new_body = re.sub(regex, replace, new_body)
+    return new_body
+
+def extract_html(entry, shout_id = None, cleanup=False):
     body_orig = (entry.get("body") or "").replace('\(', '(').replace('\)', ')')
+    if cleanup:
+        # Clean the raw text before BeautifulSoup parses it, so invalid
+        # HTML is caught as well.
+        body_clean = cleanup_html(body_orig)
+        if body_clean != body_orig:
+            print(f"[migration] html cleaned for slug {entry.get('slug', None)}")
+        # Use the cleaned text for footnote extraction and parsing below;
+        # without this assignment the cleanup would only affect the log line.
+        body_orig = body_clean
     if shout_id:
         extract_footnotes(body_orig, shout_id)
     body_html = str(BeautifulSoup(body_orig, features="html.parser"))
diff --git a/migration/tables/content_items.py b/migration/tables/content_items.py
index 09ef4cb0..2e74f96e 100644
--- a/migration/tables/content_items.py
+++ b/migration/tables/content_items.py
@@ -150,7 +150,7 @@ async def migrate(entry, storage):
         "createdAt": date_parse(entry.get("createdAt", OLD_DATE)),
         "updatedAt": date_parse(entry["updatedAt"]) if "updatedAt" in entry else ts,
         "topics": await add_topics_follower(entry, storage, author),
-        "body": extract_html(entry)
+        "body": extract_html(entry, cleanup=True)
     }
 
     # main topic patch