diff --git a/migration/extract.py b/migration/extract.py
index 0aee6ce8..9ea84067 100644
--- a/migration/extract.py
+++ b/migration/extract.py
@@ -388,23 +388,30 @@ def cleanup_html(body: str) -> str:
         r"class=\"MsoNormal\"",
         r"lang=\"EN-US\"",
         r"id=\"docs-internal-guid-[\w-]+\"",
-        r"<p></p>",
+        r"<p>\s*</p>",
         r"<span></span>",
-        r"<i></i>",
-        r"<b></b>",
-        r"<h1></h1>",
-        r"<h2></h2>",
-        r"<h3></h3>",
-        r"<h4></h4>",
-        r"<div></div>",
+        r"<i>\s*</i>",
+        r"<b>\s*</b>",
+        r"<h1>\s*</h1>",
+        r"<h2>\s*</h2>",
+        r"<h3>\s*</h3>",
+        r"<h4>\s*</h4>",
+        r"<div>\s*</div>",
     ]
     regex_replace = {
-        r"<br></p>": "</p>"
+        r"<br>\s*</p>": "</p>"
     }
-    for regex in regex_remove:
-        new_body = re.sub(regex, "", new_body)
-    for regex, replace in regex_replace.items():
-        new_body = re.sub(regex, replace, new_body)
+    changed = True
+    while changed:
+        # we need several iterations to clean nested tags this way
+        changed = False
+        new_body_iteration = new_body
+        for regex in regex_remove:
+            new_body = re.sub(regex, "", new_body)
+        for regex, replace in regex_replace.items():
+            new_body = re.sub(regex, replace, new_body)
+        if new_body_iteration != new_body:
+            changed = True
     return new_body
 
 def extract_html(entry, shout_id = None, cleanup=False):
@@ -418,4 +425,10 @@ def extract_html(entry, shout_id = None, cleanup=False):
     if shout_id:
         extract_footnotes(body_orig, shout_id)
     body_html = str(BeautifulSoup(body_orig, features="html.parser"))
+    if cleanup:
+        # we do that after bs parsing because it can add dummy tags
+        body_clean_html = cleanup_html(body_html)
+        if body_clean_html != body_html:
+            print(f"[migration] html cleaned after bs4 for slug {entry.get('slug', None)}")
+        body_html = body_clean_html
     return body_html