s > 9 correctly.
if li['name'] == "ul": self.o(self.ul_item_mark + " ")
elif li['name'] == "ol":
li['num'] += 1
self.o(str(li['num'])+". ")
self.start = 1
if tag in ["table", "tr"] and start: self.p()
if tag == 'td': self.pbr()
if tag == "pre":
if start:
self.startpre = 1
self.pre = 1
else:
self.pre = 0
self.p()
def pbr(self):
    """Request a single line break before the next output.

    ``p_p`` is the number of newlines ``o`` emits ahead of the next chunk.
    It is only raised from 0 to 1 here, so an already pending break
    (including a paragraph break of 2) is left untouched.
    """
    if self.p_p != 0:
        return
    self.p_p = 1
def p(self):
    """Request a paragraph break: two newlines before the next output.

    Unconditionally sets ``p_p`` (the pending-newline count consumed by
    ``o``) to 2.
    """
    self.p_p = 2
def soft_br(self):
    """Queue a soft line break.

    Requests one pending newline via ``pbr`` and sets ``br_toggle``, the
    text that ``o`` writes immediately before each pending newline.
    """
    self.pbr()
    # NOTE(review): Markdown only treats two or more trailing spaces as a hard
    # line break; confirm that this single-space marker is intentional.
    self.br_toggle = ' '
def o(self, data, puredata=0, force=0):
    """Emit a chunk of output, flushing pending breaks, spacing and links first.

    data     -- the text to emit.
    puredata -- true when ``data`` is document text: outside ``pre`` every
                whitespace run is collapsed to one space and a leading space
                is converted into the pending-space flag (``self.space``).
    force    -- true to go through the flushing logic even when ``data`` is
                empty.  The value ``'end'`` additionally writes all
                outstanding link references and abbreviation definitions.

    ``data`` is always appended to ``abbr_data`` when that is being
    collected; nothing else happens while ``self.quiet`` is set.
    """
    if self.abbr_data is not None:
        self.abbr_data += data

    if self.quiet:
        return

    if self.google_doc:
        # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
        lstripped_data = data.lstrip()
        if self.drop_white_space and not (self.pre or self.code):
            data = lstripped_data
        if lstripped_data != '':
            self.drop_white_space = 0

    if puredata and not self.pre:
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        data = re.sub(r'\s+', ' ', data)
        if data and data[0] == ' ':
            self.space = 1
            data = data[1:]
    if not data and not force:
        return

    if self.startpre:
        # TODO: emit " :" here, but not when there is already one.
        # Preformatted content has to start on a line of its own.
        if not data.startswith("\n"):
            data = "\n" + data

    bq = ">" * self.blockquote
    if not (force and data and data[0] == ">") and self.blockquote:
        bq += " "

    if self.pre:
        # Markdown only recognises a code block when it is indented by four
        # spaces, so that is the unit used here.
        if not self.list:
            bq += "    "
        # else: list content is already partially indented
        # One extra indent per open list level (was an xrange() loop, which
        # raises NameError on Python 3).
        bq += "    " * len(self.list)
        data = data.replace("\n", "\n" + bq)

    if self.startpre:
        self.startpre = 0
        if self.list:
            data = data.lstrip("\n")  # use existing initial indentation

    if self.start:
        # First output after a list item marker: cancel pending spacing.
        self.space = 0
        self.p_p = 0
        self.start = 0

    if force == 'end':
        # It's the end.
        self.p_p = 0
        self.out("\n")
        self.space = 0

    if self.p_p:
        # Emit the pending newlines, each followed by the quote/indent prefix.
        self.out((self.br_toggle + '\n' + bq) * self.p_p)
        self.space = 0
        self.br_toggle = ''

    if self.space:
        if not self.lastWasNL:
            self.out(' ')
        self.space = 0

    if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
        if force == "end":
            self.out("\n")

        newa = []
        for link in self.a:
            if self.outcount > link['outcount']:
                self.out(" [" + str(link['count']) + "]: " + urlparse.urljoin(self.baseurl, link['href']))
                if has_key(link, 'title'):
                    self.out(" (" + link['title'] + ")")
                self.out("\n")
            else:
                # Not yet followed by any output; keep it for a later flush.
                newa.append(link)

        if self.a != newa:
            self.out("\n")  # Don't need an extra line when nothing was done.

        self.a = newa

    if self.abbr_list and force == "end":
        for abbr, definition in self.abbr_list.items():
            self.out(" *[" + abbr + "]: " + definition + "\n")

    self.p_p = 0
    self.out(data)
    self.outcount += 1
def handle_data(self, data):
    """Handle a run of character data and pass it on to ``o``.

    * Data containing the literal text ``\\/script>`` decrements ``quiet``.
    * While ``self.style`` is set the data is parsed as CSS and merged into
      ``style_def``.
    * When a link is pending in ``maybe_automatic_link``: if the text equals
      the href and the href matches ``absolute_url_matcher`` it is written as
      ``<url>`` and nothing else is emitted; otherwise the opening ``[`` of
      a regular link is written and the pending state is cleared.
    * Outside ``code``/``pre`` the text is Markdown-escaped before output.
    """
    if r'\/script>' in data:
        self.quiet -= 1

    if self.style:
        self.style_def.update(dumb_css_parser(data))

    pending_href = self.maybe_automatic_link
    if pending_href is not None:
        if pending_href == data and self.absolute_url_matcher.match(pending_href):
            self.o("<" + data + ">")
            return
        self.o("[")
        self.maybe_automatic_link = None

    if not (self.code or self.pre):
        data = escape_md_section(data, snob=self.escape_snob)
    self.o(data, 1)
def unknown_decl(self, data):
    """Deliberately ignore ``data``; nothing is emitted and None is returned.

    NOTE(review): presumably overrides the HTML parser's hook for
    unrecognised declarations so that they are dropped silently -- confirm
    against the base class.
    """
def charref(self, name):
    """Translate a numeric character reference into text.

    name -- the reference without its leading ``&#`` and trailing ``;``:
            decimal digits, or ``x``/``X`` followed by hexadecimal digits.

    Returns the ``unifiable_n`` replacement when ``unicode_snob`` is off and
    one exists for the code point, otherwise the character itself.  A
    reference that is not a valid number or not a valid code point (for
    example ``&#abc;`` or ``&#99999999;``) is returned unchanged as
    ``&#name;`` instead of raising, the same way ``entityref`` leaves unknown
    entity names untouched.
    """
    malformed = "&#" + name + ";"
    try:
        # Slice rather than index so that an empty name cannot raise IndexError.
        if name[:1] in ('x', 'X'):
            c = int(name[1:], 16)
        else:
            c = int(name)
    except ValueError:
        return malformed

    if not self.unicode_snob and c in unifiable_n:
        return unifiable_n[c]

    try:
        to_char = unichr  # Python 2
    except NameError:  # Python3
        to_char = chr
    try:
        return to_char(c)
    except (ValueError, OverflowError):
        # Code point outside the range the interpreter can represent.
        return malformed
def entityref(self, c):
    """Translate a named entity into text.

    c -- the entity name, i.e. the text between ``&`` and ``;``.

    Returns the ``unifiable`` replacement when ``unicode_snob`` is off and
    one exists for the name, otherwise the character for the code point that
    ``name2cp`` reports.  A name for which ``name2cp`` raises KeyError is
    returned unchanged as ``&name;``.
    """
    if not self.unicode_snob and c in unifiable:
        return unifiable[c]

    try:
        codepoint = name2cp(c)
    except KeyError:
        return "&" + c + ';'

    try:
        return unichr(codepoint)
    except NameError:  # Python3
        return chr(codepoint)
def replaceEntities(self, s):
    """Substitution callback: turn one matched reference into its text.

    s -- a match object whose group 1 is the reference without ``&`` and
         ``;``.  A leading ``#`` routes the remainder to ``charref``;
         anything else goes to ``entityref``.
    """
    reference = s.group(1)
    if reference[0] != "#":
        return self.entityref(reference)
    return self.charref(reference[1:])
# Matches one entity or character reference: "&", then either a name of up to
# eight word characters or an optionally "#"/"x"-prefixed run of hex digits
# (captured as group 1), then ";".  Used by unescape() below.
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(self, s):
    """Return ``s`` with every reference matched by ``r_unescape`` replaced.

    Each match is handed to ``replaceEntities``, whose return value is
    substituted for it.
    """
    reference_pattern = self.r_unescape
    return reference_pattern.sub(self.replaceEntities, s)
def google_nest_count(self, style):
    """Calculate the nesting count of google doc lists.

    style -- mapping of CSS property names to values for the element.

    Returns 0 when ``style`` has no ``margin-left``; otherwise the margin
    (with its last two characters dropped -- presumably a two-letter unit
    such as ``pt``) divided by ``google_list_indent``.  Floor division keeps
    the result an integer on Python 3 as well, where ``/`` would return a
    float.
    """
    if 'margin-left' not in style:
        return 0
    margin = int(style['margin-left'][:-2])
    return margin // self.google_list_indent
def optwrap(self, text):
    """Wrap all paragraphs in the provided text.

    Returns ``text`` untouched when ``body_width`` is falsy.  Otherwise each
    line of ``text`` is treated as one paragraph: lines accepted by
    ``skipwrap`` are copied through (dropped when they are only whitespace),
    all others are re-wrapped to ``body_width`` columns, and runs of empty
    lines are limited so that at most two newlines follow each other.
    """
    if not self.body_width:
        return text

    assert wrap, "Requires Python 2.3."
    pieces = []
    trailing_newlines = 0
    for line in text.split("\n"):
        if not line:
            # Empty line: only add a newline while fewer than two are pending.
            if trailing_newlines < 2:
                pieces.append("\n")
                trailing_newlines += 1
            continue

        if skipwrap(line):
            if not onlywhite(line):
                pieces.append(line + "\n")
                trailing_newlines = 1
            continue

        pieces.append("\n".join(wrap(line, self.body_width)))
        # NOTE(review): Markdown's hard-break marker is two trailing spaces;
        # confirm that the single-space test and marker below are intended.
        if line.endswith(' '):
            pieces.append(" \n")
            trailing_newlines = 1
        else:
            pieces.append("\n\n")
            trailing_newlines = 2
    return ''.join(pieces)
# Start of an ordered list item: digits, a dot, then one whitespace character.
ordered_list_matcher = re.compile(r'\d+\.\s')
# Start of an unordered list item: "-", "*" or "+", then one whitespace character.
unordered_list_matcher = re.compile(r'[-\*\+]\s')
# Backslash, square brackets and parentheses; escape_md() prefixes each match
# with a backslash.
md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
# Wider set of Markdown-significant characters; escape_md_section() escapes
# these only when called with snob=True.
md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
# A number directly followed by a dot and whitespace at the start of a line,
# i.e. text that would read as an ordered list item.  Group 1 is the leading
# whitespace plus number, group 2 the dot.
md_dot_matcher = re.compile(r"""
^ # start of line
(\s*\d+) # optional whitespace and a number
(\.) # dot
(?=\s) # lookahead assert whitespace
""", re.MULTILINE | re.VERBOSE)
# A "+" followed by whitespace at the start of a line, i.e. text that would
# read as a bullet.  Group 1 is the leading whitespace, group 2 the plus sign.
md_plus_matcher = re.compile(r"""
^
(\s*)
(\+)
(?=\s)
""", flags=re.MULTILINE | re.VERBOSE)
# A "-" at the start of a line that is followed by whitespace or another dash.
# Group 1 is the leading whitespace, group 2 the dash.
md_dash_matcher = re.compile(r"""
^
(\s*)
(-)
(?=\s|\-) # followed by whitespace (bullet list, or spaced out hr)
# or another dash (header or hr)
""", flags=re.MULTILINE | re.VERBOSE)
# Characters that form an escape sequence when preceded by a backslash.
slash_chars = r'\`*_{}[]()#+-.!'
# A backslash that is followed by one of slash_chars; escape_md_section()
# doubles it so that it stays a literal backslash.
md_backslash_matcher = re.compile(r'''
(\\) # match one slash
(?=[%s]) # followed by a char that requires escaping
''' % re.escape(slash_chars),
flags=re.VERBOSE)
def skipwrap(para):
    """Return True when ``para`` must be emitted as-is instead of re-wrapped.

    para -- a single line of already generated Markdown.

    Code blocks, list items and other lines starting with ``-`` or ``*`` are
    left alone; a line starting with exactly two dashes is treated as an
    em-dash and may be wrapped.
    """
    # If the text begins with four spaces or one tab, it's a code block; don't wrap
    # (slices are used so that an empty string cannot raise IndexError).
    if para[0:4] == '    ' or para[0:1] == '\t':
        return True
    # If the text begins with only two "--", possibly preceded by whitespace, that's
    # an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False
    # I'm not sure what this is for; I thought it was to detect lists, but there's
    # a "-inside-" case in one of the tests that also depends upon it.
    if stripped[0:1] in ('-', '*'):
        return True
    # If the text begins with a single -, *, or +, followed by a space, or an integer,
    # followed by a ., followed by a space (in either case optionally preceded by
    # whitespace), it's a list; don't wrap.
    if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped):
        return True
    return False
def wrapwrite(text):
    """Encode ``text`` as UTF-8 and write the bytes to standard output.

    The bytes go to ``sys.stdout.buffer`` when that attribute exists and to
    ``sys.stdout`` itself otherwise.
    """
    encoded = text.encode('utf-8')
    try:  # Python3
        sys.stdout.buffer.write(encoded)
    except AttributeError:
        sys.stdout.write(encoded)
def html2text(html, baseurl=''):
    """Convert ``html`` to text with a fresh ``HTML2Text`` instance.

    html    -- the markup to convert.
    baseurl -- passed to the ``HTML2Text`` constructor.

    Returns whatever ``HTML2Text.handle`` returns for ``html``.
    """
    return HTML2Text(baseurl=baseurl).handle(html)
def unescape(s, unicode_snob=False):
    """Replace the entity and character references in ``s`` with text.

    s            -- the string to process.
    unicode_snob -- stored on the converter before unescaping; when false,
                    references that have a plain replacement use it instead
                    of the Unicode character.
    """
    converter = HTML2Text()
    converter.unicode_snob = unicode_snob
    return converter.unescape(s)
def escape_md(text):
    """Escapes markdown-sensitive characters within other markdown constructs.

    Every character matched by ``md_chars_matcher`` is prefixed with a
    backslash; the escaped string is returned.
    """
    escaped = md_chars_matcher.sub(r"\\\1", text)
    return escaped
def escape_md_section(text, snob=False):
    """Escapes markdown-sensitive characters across whole document sections.

    text -- the text to escape.
    snob -- when true, every character matched by ``md_chars_matcher_all``
            is escaped as well.

    Backslashes that would start an escape sequence are doubled first; then
    the dot, plus or dash that would make a line read as a list item, header
    or rule is prefixed with a backslash.
    """
    escaped = md_backslash_matcher.sub(r"\\\1", text)
    if snob:
        escaped = md_chars_matcher_all.sub(r"\\\1", escaped)
    # Group 1 is the leading part of the line, group 2 the character to escape.
    for line_start_matcher in (md_dot_matcher, md_plus_matcher, md_dash_matcher):
        escaped = line_start_matcher.sub(r"\1\\\2", escaped)
    return escaped
def main():
    """Command-line entry point: convert HTML to Markdown on standard output.

    Usage: ``[(filename|url) [encoding]]``.  The input is read from the URL
    (when the first argument starts with ``http://`` or ``https://``), from
    the named file, or from standard input when no argument is given.  It is
    decoded with the given encoding (default ``utf-8``), converted by an
    ``HTML2Text`` instance configured from the command-line options, and
    written with ``wrapwrite``.
    """
    baseurl = ''

    p = optparse.OptionParser('%prog [(filename|url) [encoding]]', version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
        default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
        default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
        default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
        default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
        default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
        default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
        default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
        default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
        default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob",
        default=False, help="Escape all special characters.  Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    # NOTE(review): encoding is never None at this point, so the two
    # auto-detection branches below are currently unreachable.
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith(('http://', 'https://')):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'en-ascii':
                    encoding = 'utf-8'
        else:
            # Close the file deterministically instead of leaking the handle.
            with open(file_, 'rb') as source:
                data = source.read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        # Read bytes so that the decode below also works on Python 3, where
        # sys.stdin.read() returns already decoded text.
        data = getattr(sys.stdin, 'buffer', sys.stdin).read()

    data = data.decode(encoding)
    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    # google_nest_count() reads google_list_indent, so the option has to be
    # stored under that name to have any effect.
    h.google_list_indent = options.list_indent
    h.list_indent = options.list_indent  # previous attribute name, kept for compatibility
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    wrapwrite(h.handle(data))
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()