diff --git a/migration/extract.py b/migration/extract.py
index 0aee6ce8..9ea84067 100644
--- a/migration/extract.py
+++ b/migration/extract.py
@@ -388,23 +388,30 @@ def cleanup_html(body: str) -> str:
r"class=\"MsoNormal\"",
r"lang=\"EN-US\"",
r"id=\"docs-internal-guid-[\w-]+\"",
- r"
",
+ r"\s*
",
r"",
- r"",
- r"",
- r"",
- r"",
- r"",
- r"",
- r"",
+ r"\s*",
+ r"\s*",
+ r"\s*
",
+ r"\s*
",
+ r"\s*
",
+ r"\s*
",
+ r"\s*
",
 	]
 	regex_replace = {
- r"
": ""
+ r"
\s*": ""
}
-	for regex in regex_remove:
-		new_body = re.sub(regex, "", new_body)
-	for regex, replace in regex_replace.items():
-		new_body = re.sub(regex, replace, new_body)
+	changed = True
+	while changed:
+		# we need several iterations to clean nested tags this way
+		changed = False
+		new_body_iteration = new_body
+		for regex in regex_remove:
+			new_body = re.sub(regex, "", new_body)
+		for regex, replace in regex_replace.items():
+			new_body = re.sub(regex, replace, new_body)
+		if new_body_iteration != new_body:
+			changed = True
 	return new_body
 
 def extract_html(entry, shout_id = None, cleanup=False):
@@ -418,4 +425,10 @@ def extract_html(entry, shout_id = None, cleanup=False):
 	if shout_id:
 		extract_footnotes(body_orig, shout_id)
 	body_html = str(BeautifulSoup(body_orig, features="html.parser"))
+	if cleanup:
+		# we do that after bs parsing because it can add dummy tags
+		body_clean_html = cleanup_html(body_html)
+		if body_clean_html != body_html:
+			print(f"[migration] html cleaned after bs4 for slug {entry.get('slug', None)}")
+		body_html = body_clean_html
 	return body_html
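
Reviewer note (not part of the patch): a minimal sketch of why `cleanup_html` now loops to a fixpoint. With nested empty tags, one pass over `regex_remove` only strips the innermost tag; it is that removal which turns the outer tag into an empty one, so another pass is needed. The input string here is hypothetical.

```python
import re

# Two of the patched patterns; the \s* is what lets them match
# whitespace-only pairs such as "<p>  </p>".
regex_remove = [r"<p>\s*</p>", r"<i>\s*</i>"]

body = "<p> <i></i> </p>"  # hypothetical nested input
changed = True
while changed:  # same fixpoint loop as in cleanup_html()
    changed = False
    before = body
    for regex in regex_remove:
        body = re.sub(regex, "", body)
    changed = body != before

# Pass 1 strips only <i></i>; pass 2 strips the now-empty "<p>  </p>".
print(repr(body))  # ''
```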
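
Similarly, a sketch of the "dummy tags" the post-bs4 cleanup targets (illustrative input; assumes html.parser's usual tag balancing): BeautifulSoup closes unbalanced markup while parsing, which can leave empty tag pairs that were not in the source, so `cleanup_html` has to run again on the parsed output.

```python
from bs4 import BeautifulSoup

# Unbalanced markup of the kind seen in migrated bodies (illustrative).
body_orig = "<p>text<b></p>"

body_html = str(BeautifulSoup(body_orig, features="html.parser"))
# The parser auto-closes the dangling <b>, e.g. "<p>text<b></b></p>",
# introducing an empty <b></b> that exists only after parsing -- which
# is why the patch runs cleanup_html() a second time on body_html.
print(body_html)
```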