From 36f26aaa1ce4d6176146227e292b7a3cbd884b2f Mon Sep 17 00:00:00 2001 From: tonyrewin Date: Sun, 3 Jul 2022 04:01:09 +0300 Subject: [PATCH] html2md parser tuning --- migration/html2text/__init__.py | 60 ++++++++++++++++----------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/migration/html2text/__init__.py b/migration/html2text/__init__.py index c38b44db..8a0a1f7f 100644 --- a/migration/html2text/__init__.py +++ b/migration/html2text/__init__.py @@ -364,17 +364,19 @@ class HTML2Text(html.parser.HTMLParser): else: self.inheader = False return # prevent redundant emphasis marks on headers + if 'class' in attrs: - self.current_class = attrs.get('class') + self.current_class = attrs.get('class', '') # self.p() if not start: self.current_class = '' - if 'style' in attrs: - if attrs.get('style') == 'text-align: center': - self.current_class = 'center' - if not start: - self.current_class = '' + if tag == 'span': + if 'style' in attrs: + if attrs.get('style') == 'text-align: center': + self.current_class = 'center' + if not start: + self.current_class = '' if start: if self.current_class == 'highlight' and \ self.inheader == False and \ @@ -531,21 +533,25 @@ class HTML2Text(html.parser.HTMLParser): if tag == "a" and not self.ignore_links: if start: - if ( - "href" in attrs - and attrs["href"] is not None - and not (self.skip_internal_links and attrs["href"].startswith("#")) - and not ( - self.ignore_mailto_links and attrs["href"].startswith("mailto:") - ) - ): - self.astack.append(attrs) - self.maybe_automatic_link = attrs["href"] - self.empty_link = True - if self.protect_links: - attrs["href"] = "<" + attrs["href"] + ">" + if 'data-original-title' in attrs: + # WARNING: old discours specific code + if 'import Tooltip' not in self.outtextlist[0]: self.outtextlist.insert(0, 'import Tooltip from "$/components/Article/Tooltip"\n\n') + self.o('///%s///' % attrs['data-original-title']) else: - self.astack.append(None) + if ( + "href" in attrs + and not attrs["href"].startswith('#_ftn') + and attrs["href"] is not None + and not (self.skip_internal_links and attrs["href"].startswith("#")) + and not (self.ignore_mailto_links and attrs["href"].startswith("mailto:")) + ): + self.astack.append(attrs) + self.maybe_automatic_link = attrs["href"] + self.empty_link = True + if self.protect_links: + attrs["href"] = "<" + attrs["href"] + ">" + else: + self.astack.append(None) else: if self.astack: a = self.astack.pop() @@ -573,7 +579,8 @@ class HTML2Text(html.parser.HTMLParser): self.o("][" + str(a_props.count) + "]") if tag == "img" and start and not self.ignore_images: - if "src" in attrs: + # skip cloudinary images + if "src" in attrs and 'cloudinary' not in attrs['src']: assert attrs["src"] is not None if not self.images_to_alt: attrs["href"] = attrs["src"] @@ -1031,12 +1038,5 @@ def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> bodywidth = config.BODY_WIDTH h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth) - return h.handle(html)\ - .replace('<...>', '**...**')\ - .replace('<…>', '***...**')\ - .replace('****', '')\ - .replace('\u00a0',' ')\ - .replace('\u200c', '')\ - .replace('\u200b', '')\ - .replace('\ufeff', '') - # .replace('\u2212', '-') + h = h.handle(html) + return h