upgrade Aaron's code

This commit is contained in:
Untone 2021-10-12 22:37:45 +03:00
parent b5a7c239a7
commit 816a90f656

View File

@ -230,6 +230,9 @@ class HTML2Text(HTMLParser.HTMLParser):
self.abbr_data = None # last inner HTML (for abbr being defined) self.abbr_data = None # last inner HTML (for abbr being defined)
self.abbr_list = {} # stack of abbreviations to write later self.abbr_list = {} # stack of abbreviations to write later
self.baseurl = baseurl self.baseurl = baseurl
self.header_id = None
self.span_hightlight = False
self.span_lead = False
try: del unifiable_n[name2cp('nbsp')] try: del unifiable_n[name2cp('nbsp')]
except KeyError: pass except KeyError: pass
@ -261,7 +264,8 @@ class HTML2Text(HTMLParser.HTMLParser):
else: else:
nbsp = u' ' nbsp = u' '
self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp) self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)
self.outtext = self.outtext.replace('\n** **\n', '')
self.outtext = self.outtext.replace('====', '')
return self.outtext return self.outtext
def handle_charref(self, c): def handle_charref(self, c):
@ -390,26 +394,34 @@ class HTML2Text(HTMLParser.HTMLParser):
parent_style = self.tag_stack[-1][2] parent_style = self.tag_stack[-1][2]
if hn(tag): if hn(tag):
self.p()
if start: if start:
self.p()
self.inheader = True self.inheader = True
self.o(hn(tag)*"#" + ' ') self.o(hn(tag)*"#" + ' ')
self.header_id = attrs.get('id')
else: else:
if attrs.get('id', False): self.o('{#' + attrs.get['id'] + '}') if self.header_id:
self.o(' {#' + self.header_id + '}')
self.header_id = None
self.p()
self.inheader = False self.inheader = False
return # prevent redundant emphasis marks on headers return # prevent redundant emphasis marks on headers
if tag == 'span' and 'class' in attrs: if tag == 'span':
if start and 'class' in attrs:
if attrs['class'] == 'highlight': if attrs['class'] == 'highlight':
if start: self.o('`') # NOTE: same as <code>
self.o('`') self.span_hightlight = True
else:
self.o('`')
elif attrs['class'] == 'lead': elif attrs['class'] == 'lead':
if start: self.o('==') # NOTE: but CriticMarkup uses {== ==}
self.o(self.strong_mark) self.span_lead = True
else: else:
self.o(self.strong_mark) if self.span_hightlight:
self.o('`')
self.span_hightlight = False
elif self.span_lead:
self.o('==')
self.span_lead = False
if tag in ['p', 'div']: if tag in ['p', 'div']:
if self.google_doc: if self.google_doc:
@ -581,20 +593,20 @@ class HTML2Text(HTMLParser.HTMLParser):
if self.abbr_data is not None: if self.abbr_data is not None:
self.abbr_data += data self.abbr_data += data
if not self.quiet: # if not self.quiet:
if self.google_doc: # if self.google_doc:
# prevent white space immediately after 'begin emphasis' marks ('**' and '_') # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
lstripped_data = data.lstrip() lstripped_data = data.lstrip()
if self.drop_white_space and not (self.pre or self.code): if self.drop_white_space and not (self.pre or self.code or self.span_hightlight or self.span_lead):
data = lstripped_data data = lstripped_data
if lstripped_data != '': if puredata: # and not self.pre:
self.drop_white_space = 0
if puredata and not self.pre:
data = re.sub('\s+', ' ', data) data = re.sub('\s+', ' ', data)
if data and data[0] == ' ': if data and data[0] == ' ':
self.space = 1 self.space = 1
data = data[1:] data = data[1:]
if lstripped_data != '':
self.drop_white_space = 0
if not data and not force: return if not data and not force: return
if self.startpre: if self.startpre:
@ -657,7 +669,8 @@ class HTML2Text(HTMLParser.HTMLParser):
if self.abbr_list and force == "end": if self.abbr_list and force == "end":
for abbr, definition in self.abbr_list.items(): for abbr, definition in self.abbr_list.items():
self.out(" *[" + abbr + "]: " + definition + "\n") self.out(" *[" + abbr + "]: " + definition + "\n")
data.replace('\u200b', '')
data.replace('\xa0', ' ')
self.p_p = 0 self.p_p = 0
self.out(data) self.out(data)
self.outcount += 1 self.outcount += 1
@ -839,8 +852,7 @@ def escape_md_section(text, snob=False):
def main(): def main():
baseurl = '' baseurl = ''
p = optparse.OptionParser('%prog [(filename|url) [encoding]]', p = optparse.OptionParser('%prog [(filename|url) [encoding]]', version='%prog ' + __version__)
version='%prog ' + __version__)
p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true", p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis") default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
p.add_option("--ignore-links", dest="ignore_links", action="store_true", p.add_option("--ignore-links", dest="ignore_links", action="store_true",