some basic fixes

This commit is contained in:
parent 7ebfe9e190
commit f658f27f04

@@ -1,936 +0,0 @@
#!/usr/bin/env python
"""html2text: Turn HTML into equivalent Markdown-structured text."""
__version__ = "3.200.3"
__author__ = "Aaron Swartz (me@aaronsw.com)"
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]

# TODO:
# Support decoded entities with unifiable.

try:
    True
except NameError:
    setattr(__builtins__, 'True', 1)
    setattr(__builtins__, 'False', 0)

def has_key(x, y):
    if hasattr(x, 'has_key'): return x.has_key(y)
    else: return y in x

import html.entities as htmlentitydefs
import urllib.parse as urlparse
import html.parser as HTMLParser
import urllib.request as urllib
import optparse, re, sys, codecs, types

try: from textwrap import wrap
except ImportError: pass

# support the Python 3 API
if sys.version_info[0] == 3:
    unichr = chr
    xrange = range

# Use Unicode characters instead of their ascii pseudo-replacements
UNICODE_SNOB = 0

# Escape all special characters. Output is less readable, but avoids corner case formatting issues.
ESCAPE_SNOB = 0

# Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = 0

# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
BODY_WIDTH = 0

# Don't show internal links (href="#local-anchor") -- corresponding link targets
# won't be visible in the plain text file anyway.
SKIP_INTERNAL_LINKS = False

# Use inline, rather than reference, formatting for images and links
INLINE_LINKS = True

# Number of pixels Google indents nested lists
GOOGLE_LIST_INDENT = 36

IGNORE_ANCHORS = False
IGNORE_IMAGES = False
IGNORE_EMPHASIS = False

### Entity Nonsense ###

def name2cp(k):
    if k == 'apos': return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"):  # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    else:
        k = htmlentitydefs.entitydefs[k]
        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1])  # not in latin-1
        return ord(codecs.latin_1_decode(k)[0])

unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
'lrm':'', 'rlm':''}

unifiable_n = {}

for k in unifiable.keys():
    unifiable_n[name2cp(k)] = unifiable[k]

### End Entity Nonsense ###

def onlywhite(line):
    """Return true if the line does only consist of whitespace characters."""
    for c in line:
        if c != ' ' and c != '\t':
            return False
    return line

def hn(tag):
    if tag[0] == 'h' and len(tag) == 2:
        try:
            n = int(tag[1])
            if n in range(1, 10): return n
        except ValueError: return 0

def dumb_property_dict(style):
    """returns a hash of css attributes"""
    return dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]])

def dumb_css_parser(data):
    """returns a hash of css selectors, each of which contains a hash of css attributes"""
    # remove @import sentences
    data += ';'
    importIndex = data.find('@import')
    while importIndex != -1:
        data = data[0:importIndex] + data[data.find(';', importIndex) + 1:]
        importIndex = data.find('@import')

    # parse the css. reverted from dictionary comprehension in order to support older pythons
    elements = [x.split('{') for x in data.split('}') if '{' in x.strip()]
    try:
        elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements])
    except ValueError:
        elements = {}  # not that important

    return elements

def element_style(attrs, style_def, parent_style):
    """returns a hash of the 'final' style attributes of the element"""
    style = parent_style.copy()
    if 'class' in attrs:
        for css_class in attrs['class'].split():
            css_style = style_def['.' + css_class]
            style.update(css_style)
    if 'style' in attrs:
        immediate_style = dumb_property_dict(attrs['style'])
        style.update(immediate_style)
    return style

def google_list_style(style):
    """finds out whether this is an ordered or unordered list"""
    if 'list-style-type' in style:
        list_style = style['list-style-type']
        if list_style in ['disc', 'circle', 'square', 'none']:
            return 'ul'
    return 'ol'

def google_has_height(style):
    """check if the style of the element has the 'height' attribute explicitly defined"""
    if 'height' in style:
        return True
    return False

def google_text_emphasis(style):
    """return a list of all emphasis modifiers of the element"""
    emphasis = []
    if 'text-decoration' in style:
        emphasis.append(style['text-decoration'])
    if 'font-style' in style:
        emphasis.append(style['font-style'])
    if 'font-weight' in style:
        emphasis.append(style['font-weight'])
    return emphasis

def google_fixed_width_font(style):
    """check if the css of the current element defines a fixed width font"""
    font_family = ''
    if 'font-family' in style:
        font_family = style['font-family']
    if 'Courier New' == font_family or 'Consolas' == font_family:
        return True
    return False

def list_numbering_start(attrs):
    """extract numbering from list element attributes"""
    if 'start' in attrs:
        return int(attrs['start']) - 1
    else:
        return 0
class HTML2Text(HTMLParser.HTMLParser):
    def __init__(self, out=None, baseurl=''):
        HTMLParser.HTMLParser.__init__(self)

        # Config options
        self.unicode_snob = UNICODE_SNOB
        self.escape_snob = ESCAPE_SNOB
        self.links_each_paragraph = LINKS_EACH_PARAGRAPH
        self.body_width = BODY_WIDTH
        self.skip_internal_links = SKIP_INTERNAL_LINKS
        self.inline_links = INLINE_LINKS
        self.google_list_indent = GOOGLE_LIST_INDENT
        self.ignore_links = IGNORE_ANCHORS
        self.ignore_images = IGNORE_IMAGES
        self.ignore_emphasis = IGNORE_EMPHASIS
        self.google_doc = False
        self.ul_item_mark = '*'
        self.emphasis_mark = '_'
        self.strong_mark = '**'

        if out is None:
            self.out = self.outtextf
        else:
            self.out = out

        self.outtextlist = []  # empty list to store output characters before they are "joined"

        try:
            self.outtext = unicode()
        except NameError:  # Python3
            self.outtext = str()

        self.quiet = 0
        self.p_p = 0  # number of newline character to print before next output
        self.outcount = 0
        self.start = 1
        self.space = 0
        self.a = []
        self.astack = []
        self.maybe_automatic_link = None
        self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
        self.acount = 0
        self.list = []
        self.blockquote = 0
        self.pre = 0
        self.startpre = 0
        self.code = False
        self.br_toggle = ''
        self.lastWasNL = 0
        self.lastWasList = False
        self.style = 0
        self.style_def = {}
        self.tag_stack = []
        self.emphasis = 0
        self.drop_white_space = 0
        self.inheader = False
        self.abbr_title = None  # current abbreviation definition
        self.abbr_data = None  # last inner HTML (for abbr being defined)
        self.abbr_list = {}  # stack of abbreviations to write later
        self.baseurl = baseurl
        self.header_id = None
        self.span_hightlight = False
        self.span_lead = False

        try: del unifiable_n[name2cp('nbsp')]
        except KeyError: pass
        unifiable['nbsp'] = '&nbsp_place_holder;'

    def feed(self, data):
        data = data.replace("</' + 'script>", "</ignore>")
        HTMLParser.HTMLParser.feed(self, data)

    def handle(self, data):
        self.feed(data)
        self.feed("")
        return self.optwrap(self.close())

    def outtextf(self, s):
        self.outtextlist.append(s)
        if s: self.lastWasNL = s[-1] == '\n'

    def close(self):
        HTMLParser.HTMLParser.close(self)

        self.pbr()
        self.o('', 0, 'end')

        self.outtext = self.outtext.join(self.outtextlist)
        if self.unicode_snob:
            nbsp = unichr(name2cp('nbsp'))
        else:
            nbsp = u' '
        self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)

        return self.outtext

    def handle_charref(self, c):
        self.o(self.charref(c), 1)

    def handle_entityref(self, c):
        self.o(self.entityref(c), 1)

    def handle_starttag(self, tag, attrs):
        self.handle_tag(tag, attrs, 1)

    def handle_endtag(self, tag):
        self.handle_tag(tag, None, 0)

    def previousIndex(self, attrs):
        """ returns the index of certain set of attributes (of a link) in the
            self.a list

            If the set of attributes is not found, returns None
        """
        if not has_key(attrs, 'href'): return None

        i = -1
        for a in self.a:
            i += 1
            match = 0

            if has_key(a, 'href') and a['href'] == attrs['href']:
                if has_key(a, 'title') or has_key(attrs, 'title'):
                    if (has_key(a, 'title') and has_key(attrs, 'title') and
                            a['title'] == attrs['title']):
                        match = True
                else:
                    match = True

            if match: return i

    def drop_last(self, nLetters):
        if not self.quiet:
            self.outtext = self.outtext[:-nLetters]

    def handle_emphasis(self, start, tag_style, parent_style):
        """handles various text emphases"""
        tag_emphasis = google_text_emphasis(tag_style)
        parent_emphasis = google_text_emphasis(parent_style)

        # handle Google's text emphasis
        strikethrough = 'line-through' in tag_emphasis and self.hide_strikethrough
        bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
        italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
        fixed = google_fixed_width_font(tag_style) and not \
                google_fixed_width_font(parent_style) and not self.pre

        if start:
            # crossed-out text must be handled before other attributes
            # in order not to output qualifiers unnecessarily
            if bold or italic or fixed:
                self.emphasis += 1
            if strikethrough:
                self.quiet += 1
            if italic:
                self.o(self.emphasis_mark)
                self.drop_white_space += 1
            if bold:
                self.o(self.strong_mark)
                self.drop_white_space += 1
            if fixed:
                self.o('`')
                self.drop_white_space += 1
                self.code = True
        else:
            if bold or italic or fixed:
                # there must not be whitespace before closing emphasis mark
                self.emphasis -= 1
                self.space = 0
                self.outtext = self.outtext.rstrip()
            if fixed:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(1)
                    self.drop_white_space -= 1
                else:
                    self.o('`')
                self.code = False
            if bold:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(2)
                    self.drop_white_space -= 1
                else:
                    self.o(self.strong_mark)
            if italic:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(1)
                    self.drop_white_space -= 1
                else:
                    self.o(self.emphasis_mark)
            # space is only allowed after *all* emphasis marks
            if (bold or italic) and not self.emphasis:
                self.o(" ")
            if strikethrough:
                self.quiet -= 1
    def handle_tag(self, tag, attrs, start):
        #attrs = fixattrs(attrs)
        if attrs is None:
            attrs = {}
        else:
            attrs = dict(attrs)

        if self.google_doc:
            # the attrs parameter is empty for a closing tag. in addition, we
            # need the attributes of the parent nodes in order to get a
            # complete style description for the current element. we assume
            # that google docs export well formed html.
            parent_style = {}
            if start:
                if self.tag_stack:
                    parent_style = self.tag_stack[-1][2]
                tag_style = element_style(attrs, self.style_def, parent_style)
                self.tag_stack.append((tag, attrs, tag_style))
            else:
                dummy, attrs, tag_style = self.tag_stack.pop()
                if self.tag_stack:
                    parent_style = self.tag_stack[-1][2]

        if hn(tag):
            if start:
                self.p()
                self.inheader = True
                self.o(hn(tag)*"#" + ' ')
                self.header_id = attrs.get('id')
            else:
                if self.header_id:
                    self.o(' {#' + self.header_id + '}')
                    self.header_id = None
                self.p()
                self.inheader = False
                return  # prevent redundant emphasis marks on headers

        if tag == 'span':
            if start and 'class' in attrs:
                if attrs['class'] == 'highlight':
                    self.o('`')  # NOTE: same as <code>
                    self.span_hightlight = True
                elif attrs['class'] == 'lead':
                    if self.span_lead == False:
                        self.o('==\n')  # NOTE: but CriticMarkup uses {== ==}
                        self.span_lead = True
            else:
                if self.span_hightlight:
                    self.o('`')
                    self.span_hightlight = False
                elif self.span_lead:
                    if self.span_lead == True:
                        self.o('\n==')
                        self.span_lead = False

        if tag in ['p', 'div']:
            if self.google_doc:
                if start and google_has_height(tag_style):
                    self.p()
                else:
                    self.soft_br()
            else:
                self.p()

        if tag == "br" and start: self.o("  \n")

        if tag == "hr" and start:
            self.p()
            self.o("* * *")
            self.p()

        if tag in ["head", "style", 'script']:
            if start: self.quiet += 1
            else: self.quiet -= 1

        if tag == "style":
            if start: self.style += 1
            else: self.style -= 1

        if tag in ["body"]:
            self.quiet = 0  # sites like 9rules.com never close <head>

        if tag == "blockquote":
            if start:
                self.p(); self.o('> ', 0, 1); self.start = 1
                self.blockquote += 1
            else:
                self.blockquote -= 1
                self.p()

        if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: self.o(self.emphasis_mark)
        if tag in ['strong', 'b'] and not self.ignore_emphasis: self.o(self.strong_mark)
        if tag in ['del', 'strike', 's']:
            if start:
                self.o("<"+tag+">")
            else:
                self.o("</"+tag+">")

        if self.google_doc:
            if not self.inheader:
                # handle some font attributes, but leave headers clean
                self.handle_emphasis(start, tag_style, parent_style)

        if tag in ["code", "tt"] and not self.pre: self.o('`')  #TODO: `` `this` ``
        if tag == "abbr":
            if start:
                self.abbr_title = None
                self.abbr_data = ''
                if has_key(attrs, 'title'):
                    self.abbr_title = attrs['title']
            else:
                if self.abbr_title != None:
                    self.abbr_list[self.abbr_data] = self.abbr_title
                    self.abbr_title = None
                self.abbr_data = ''

        if tag == "a" and not self.ignore_links:
            if start:
                if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')):
                    self.astack.append(attrs)
                    self.maybe_automatic_link = attrs['href'][:2000]
                else:
                    self.astack.append(None)
            else:
                if self.astack:
                    a = self.astack.pop()
                    if self.maybe_automatic_link:
                        self.maybe_automatic_link = None
                    elif a:
                        if self.inline_links:
                            self.o("](" + escape_md(a['href']) + ")")
                        else:
                            i = self.previousIndex(a)
                            if i is not None:
                                a = self.a[i]
                            else:
                                self.acount += 1
                                a['count'] = self.acount
                                a['outcount'] = self.outcount
                                self.a.append(a)
                            self.o("][" + str(a['count']) + "]")

        if tag == "img" and start and not self.ignore_images:
            if has_key(attrs, 'src'):
                attrs['href'] = attrs['src']
                alt = attrs.get('alt', '')
                self.o("![" + escape_md(alt) + "]")

                if self.inline_links:
                    self.o("(" + escape_md(attrs['href']) + ")")
                else:
                    i = self.previousIndex(attrs)
                    if i is not None:
                        attrs = self.a[i]
                    else:
                        self.acount += 1
                        attrs['count'] = self.acount
                        attrs['outcount'] = self.outcount
                        self.a.append(attrs)
                    self.o("[" + str(attrs['count']) + "]")

        if tag == 'dl' and start: self.p()
        if tag == 'dt' and not start: self.pbr()
        if tag == 'dd' and start: self.o('    ')
        if tag == 'dd' and not start: self.pbr()

        if tag in ["ol", "ul"]:
            # Google Docs create sub lists as top level lists
            if (not self.list) and (not self.lastWasList):
                self.p()
            if start:
                if self.google_doc:
                    list_style = google_list_style(tag_style)
                else:
                    list_style = tag
                numbering_start = list_numbering_start(attrs)
                self.list.append({'name':list_style, 'num':numbering_start})
            else:
                if self.list: self.list.pop()
            self.lastWasList = True
        else:
            self.lastWasList = False

        if tag == 'li':
            self.pbr()
            if start:
                if self.list: li = self.list[-1]
                else: li = {'name':'ul', 'num':0}
                if self.google_doc:
                    nest_count = self.google_nest_count(tag_style)
                else:
                    nest_count = len(self.list)
                self.o("  " * int(nest_count))  #TODO: line up <ol><li>s > 9 correctly.
                if li['name'] == "ul": self.o(self.ul_item_mark + " ")
                elif li['name'] == "ol":
                    li['num'] += 1
                    self.o(str(li['num'])+". ")
                self.start = 1

        if tag in ["table", "tr"] and start: self.p()
        if tag == 'td': self.pbr()

        if tag == "pre":
            if start:
                self.startpre = 1
                self.pre = 1
            else:
                self.pre = 0
            self.p()
    def pbr(self):
        if self.p_p == 0:
            self.p_p = 1

    def p(self):
        self.p_p = 2

    def soft_br(self):
        self.pbr()
        self.br_toggle = '  '

    def o(self, data, puredata=0, force=0):
        if self.abbr_data is not None:
            self.abbr_data += data

        if not self.quiet:
            if self.google_doc:
                # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
                lstripped_data = data.lstrip()
                if self.drop_white_space and not (self.pre or self.code):
                    data = lstripped_data
                if lstripped_data != '':
                    self.drop_white_space = 0

            if puredata and not self.pre:
                data = re.sub(r'\s+', ' ', data)
                if data and data[0] == ' ':
                    self.space = 1
                    data = data[1:]
            if not data and not force: return

            if self.startpre:
                #self.out(" :")  #TODO: not output when already one there
                if not data.startswith("\n"):  # <pre>stuff...
                    data = "\n" + data

            bq = (">" * self.blockquote)
            if not (force and data and data[0] == ">") and self.blockquote: bq += " "

            if self.pre:
                if not self.list:
                    bq += "    "
                #else: list content is already partially indented
                for i in xrange(len(self.list)):
                    bq += "    "
                data = data.replace("\n", "\n"+bq)

            if self.startpre:
                self.startpre = 0
                if self.list:
                    data = data.lstrip("\n")  # use existing initial indentation

            if self.start:
                self.space = 0
                self.p_p = 0
                self.start = 0

            if force == 'end':
                # It's the end.
                self.p_p = 0
                self.out("\n")
                self.space = 0

            if self.p_p:
                self.out((self.br_toggle+'\n'+bq)*self.p_p)
                self.space = 0
                self.br_toggle = ''

            if self.space:
                if not self.lastWasNL: self.out(' ')
                self.space = 0

            if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
                if force == "end": self.out("\n")

                newa = []
                for link in self.a:
                    if self.outcount > link['outcount']:
                        self.out("   ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
                        if has_key(link, 'title'): self.out(" ("+link['title']+")")
                        self.out("\n")
                    else:
                        newa.append(link)

                if self.a != newa: self.out("\n")  # Don't need an extra line when nothing was done.

                self.a = newa

            if self.abbr_list and force == "end":
                for abbr, definition in self.abbr_list.items():
                    self.out("  *[" + abbr + "]: " + definition + "\n")

            self.p_p = 0
            self.out(data)
            self.outcount += 1
    def handle_data(self, data):
        if r'\/script>' in data: self.quiet -= 1

        if self.style:
            self.style_def.update(dumb_css_parser(data))

        if not self.maybe_automatic_link is None:
            href = self.maybe_automatic_link
            if href == data and self.absolute_url_matcher.match(href):
                self.o("<" + data + ">")
                return
            else:
                self.o("[")
                self.maybe_automatic_link = None

        if not self.code and not self.pre:
            data = escape_md_section(data, snob=self.escape_snob)
        self.o(data, 1)

    def unknown_decl(self, data): pass

    def charref(self, name):
        if name[0] in ['x','X']:
            c = int(name[1:], 16)
        else:
            c = int(name)

        if not self.unicode_snob and c in unifiable_n.keys():
            return unifiable_n[c]
        else:
            try:
                return unichr(c)
            except NameError:  #Python3
                return chr(c)

    def entityref(self, c):
        if not self.unicode_snob and c in unifiable.keys():
            return unifiable[c]
        else:
            try: name2cp(c)
            except KeyError: return "&" + c + ';'
            else:
                try:
                    return unichr(name2cp(c))
                except NameError:  #Python3
                    return chr(name2cp(c))

    def replaceEntities(self, s):
        s = s.group(1)
        if s[0] == "#":
            return self.charref(s[1:])
        else: return self.entityref(s)

    r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
    def unescape(self, s):
        return self.r_unescape.sub(self.replaceEntities, s)

    def google_nest_count(self, style):
        """calculate the nesting count of google doc lists"""
        nest_count = 0
        if 'margin-left' in style:
            nest_count = int(style['margin-left'][:-2]) / self.google_list_indent
        return nest_count

    def optwrap(self, text):
        """Wrap all paragraphs in the provided text."""
        if not self.body_width:
            return text

        assert wrap, "Requires Python 2.3."
        result = ''
        newlines = 0
        for para in text.split("\n"):
            if len(para) > 0:
                if not skipwrap(para):
                    result += "\n".join(wrap(para, self.body_width))
                    if para.endswith('  '):
                        result += "  \n"
                        newlines = 1
                    else:
                        result += "\n\n"
                        newlines = 2
                else:
                    if not onlywhite(para):
                        result += para + "\n"
                        newlines = 1
            else:
                if newlines < 2:
                    result += "\n"
                    newlines += 1
        return result
ordered_list_matcher = re.compile(r'\d+\.\s')
unordered_list_matcher = re.compile(r'[-\*\+]\s')
md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
md_dot_matcher = re.compile(r"""
    ^             # start of line
    (\s*\d+)      # optional whitespace and a number
    (\.)          # dot
    (?=\s)        # lookahead assert whitespace
    """, re.MULTILINE | re.VERBOSE)
md_plus_matcher = re.compile(r"""
    ^
    (\s*)
    (\+)
    (?=\s)
    """, flags=re.MULTILINE | re.VERBOSE)
md_dash_matcher = re.compile(r"""
    ^
    (\s*)
    (-)
    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
                  # or another dash (header or hr)
    """, flags=re.MULTILINE | re.VERBOSE)
slash_chars = r'\`*_{}[]()#+-.!'
md_backslash_matcher = re.compile(r'''
    (\\)          # match one slash
    (?=[%s])      # followed by a char that requires escaping
    ''' % re.escape(slash_chars),
    flags=re.VERBOSE)

def skipwrap(para):
    # If the text begins with four spaces or one tab, it's a code block; don't wrap
    if para[0:4] == '    ' or para[0] == '\t':
        return True
    # If the text begins with only two "--", possibly preceded by whitespace, that's
    # an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False
    # I'm not sure what this is for; I thought it was to detect lists, but there's
    # a <br>-inside-<span> case in one of the tests that also depends upon it.
    if stripped[0:1] == '-' or stripped[0:1] == '*':
        return True
    # If the text begins with a single -, *, or +, followed by a space, or an integer,
    # followed by a ., followed by a space (in either case optionally preceded by
    # whitespace), it's a list; don't wrap.
    if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped):
        return True
    return False
def wrapwrite(text):
    text = text.encode('utf-8')
    try:  #Python3
        sys.stdout.buffer.write(text)
    except AttributeError:
        sys.stdout.write(text)

def html2text(html, baseurl=''):
    h = HTML2Text(baseurl=baseurl)
    return h.handle(html)

def unescape(s, unicode_snob=False):
    h = HTML2Text()
    h.unicode_snob = unicode_snob
    return h.unescape(s)

def escape_md(text):
    """Escapes markdown-sensitive characters within other markdown constructs."""
    return md_chars_matcher.sub(r"\\\1", text)

def escape_md_section(text, snob=False):
    """Escapes markdown-sensitive characters across whole document sections."""
    text = md_backslash_matcher.sub(r"\\\1", text)
    if snob:
        text = md_chars_matcher_all.sub(r"\\\1", text)
    text = md_dot_matcher.sub(r"\1\\\2", text)
    text = md_plus_matcher.sub(r"\1\\\2", text)
    text = md_dash_matcher.sub(r"\1\\\2", text)
    return text
def main():
    baseurl = ''

    p = optparse.OptionParser('%prog [(filename|url) [encoding]]', version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
        default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
        default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
        default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
        default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
        default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
        default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
        default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
        default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
        default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob",
        default=False, help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = sys.stdin.read()

    # stdin is already text on Python 3; only byte input needs decoding
    if isinstance(data, bytes):
        data = data.decode(encoding)
    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash: h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    wrapwrite(h.handle(data))


if __name__ == "__main__":
    main()
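For orientation, a minimal sketch of how the deleted module above was driven; it assumes the file was importable as `html2text`, and the sample markup and option values are illustrative:

    import html2text

    h = html2text.HTML2Text()
    h.body_width = 0    # the module-level BODY_WIDTH default above
    h.inline_links = True
    print(h.handle("<h1>Title</h1><p>Some <b>bold</b> text.</p>"))

    # or the one-shot helper defined at module level
    print(html2text.html2text("<p>Hello, <i>world</i>.</p>"))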
@@ -24,5 +24,5 @@ def json_tables():
     base, d = bson.decode_document(bs, base)
     lc.append(d)
     data[table] = lc
-    open('migration/data/'+table+'.json', 'w').write(json.dumps(lc,cls=DateTimeEncoder))
+    open('dump/discours/'+table+'.json', 'w').write(json.dumps(lc,cls=DateTimeEncoder))

@@ -1,215 +0,0 @@
from html.parser import HTMLParser
import os
import codecs
from typing import Tuple


class Converter(HTMLParser):
    md_file: str
    temp_tag: str
    code_box: bool
    div_count: int
    code_box_div_num: int
    ol_count: int
    related_data: list
    is_link: bool
    link_ref: str
    ignore_data: bool
    class_div_count: int
    ignore_div: bool
    table_start: Tuple[int, int]

    def __init__(self):
        super().__init__()
        self.md_file = ''
        self.code_box = False
        self.div_count = 0
        self.span_count = 0
        self.code_box_div_num = 0
        self.ol_count = 0
        self.temp_tag = ''
        self.related_data = []
        self.is_link = False
        self.link_ref = ''
        self.ignore_data = False
        self.class_div_count = 0
        self.ignore_div = False

    def handle_starttag(self, tag, attrs):
        if self.ignore_data:
            return None
        elif tag == 'sup':
            self.md_file += '<sup>'
        elif tag == 'p':
            self.temp_tag = 'p'
            self.md_file += '\n'
        elif tag == 'i':
            self.temp_tag = 'i'
            self.md_file += '*'
        elif tag == 'wbr':
            self.temp_tag = 'wbr'
            self.md_file += ''
        elif tag == 'span':
            self.temp_tag = 'span'
            self.span_count += 1
            self.md_file += ' '
        elif tag == 'figcaption':
            self.md_file += ''
        elif tag == 'hr':
            self.md_file += '\n*** \n'
        elif tag == 'title':
            self.md_file += '# '
        elif tag == 'h1':
            self.md_file += '# '
        elif tag == 'h2':
            self.md_file += '## '
        elif tag == 'h3':
            self.md_file += '### '
        elif tag == 'b' or tag == 'strong':
            self.md_file += '**'
        elif tag == 'ul':
            self.temp_tag = 'ul'
            self.md_file += ' \n'
        elif tag == 'ol':
            self.ol_count = 0
            self.temp_tag = 'ol'
            self.md_file += ' \n'
        elif tag == 'li':
            if self.temp_tag == 'ul':
                self.md_file += '* '
            elif self.temp_tag == 'ol':
                self.ol_count += 1
                self.md_file += f'{self.ol_count}. '
        elif tag == 'div':
            self.div_count += 1
            attrs_dict = dict(attrs)
            if 'style' in attrs_dict and 'codeblock' in attrs_dict['style']:
                self.code_box_div_num = self.div_count
                self.code_box = True
                self.md_file += '```\n'
            elif 'class' in attrs_dict:
                self.class_div_count = self.div_count
                self.ignore_div = True
        elif tag == 'pre' or tag == 'code':
            self.code_box = True
            self.md_file += '\n```\n'
        elif tag == 'a':
            self.is_link = True
            attrs_dict = dict(attrs)
            self.link_ref = attrs_dict.get('href', '#')
            if not self.link_ref.startswith('http') and not self.link_ref.endswith('html') and '@' not in self.link_ref:
                self.related_data.append(self.link_ref)
        elif tag == 'style':
            self.ignore_data = True
        elif tag == 'symbol':
            self.ignore_data = True
        elif tag == 'svg':
            self.ignore_data = True
        elif tag == 'path':
            self.ignore_data = True
        elif tag == 'img':
            attrs_dict = dict(attrs)
            img_ref = attrs_dict['src']
            alt_name = attrs_dict['alt'] if 'alt' in attrs_dict else 'x'
            if self.is_link:
                self.related_data.append(img_ref)
                self.md_file += f'[![{alt_name}]({img_ref})]({self.link_ref})'
            else:
                self.related_data.append(img_ref)
                self.md_file += f'![{alt_name}]({img_ref})'
        elif tag == 'table':
            self.ignore_data = True
            self.table_start = self.getpos()
        else:
            print('<' + tag + '>')

    def get_rawdata(self, start, stop, offset):
        temp_rawdata = self.rawdata
        for i in range(offset-1):
            next_section = temp_rawdata.find('\n')
            temp_rawdata = temp_rawdata[next_section+1:]
        return temp_rawdata[start:stop]

    def handle_endtag(self, tag):
        if tag == 'b' or tag == 'strong':
            self.md_file += '** '
        elif tag == 'sup':
            self.md_file += '</sup>'
        elif tag == 'iframe':
            self.ignore_data = False
        elif tag == 'wbr':
            self.md_file += ''
        elif tag == 'title':
            self.md_file += '\n'
        elif tag == 'h1':
            self.md_file += '\n'
        elif tag == 'h2':
            self.md_file += '\n'
        elif tag == 'h3':
            self.md_file += '\n'
        elif tag == 'h4':
            self.md_file += '\n'
        elif tag == 'span':
            self.span_count -= 1
            self.md_file += ' '
        elif tag == 'figcaption':
            self.md_file += '\n'
        elif tag == 'i':
            self.md_file += '* '
        elif tag == 'p':
            self.md_file += '\n'
        elif tag == 'div':
            if self.code_box and self.code_box_div_num == self.div_count:
                self.code_box = False
                self.md_file += '```\n'
            elif self.ignore_div and self.class_div_count == self.div_count:
                self.ignore_div = False
            else:
                self.md_file += ' \n'
            self.div_count -= 1
        elif tag == 'pre' or tag == 'code':
            self.code_box = False
            self.md_file += '```\n'
        elif tag == 'a':
            self.is_link = False
        elif tag == 'style':
            self.ignore_data = False
        elif tag == 'symbol':
            self.ignore_data = False
        elif tag == 'svg':
            self.ignore_data = False
        elif tag == 'li':
            self.md_file += ' \n'
        elif tag == 'table':
            offset, lineno_stop = self.getpos()
            lineno_stop = lineno_stop + len(tag) + 3
            _, lineno_start = self.table_start
            raw_data = self.get_rawdata(lineno_start, lineno_stop, offset)
            self.md_file += '\n' + raw_data
            self.ignore_data = False
        else:
            print('</' + tag + '>')

    def handle_startendtag(self, tag, attrs):
        if tag == 'br':
            self.md_file += ' \n'
        elif tag == 'wbr':
            self.md_file += ''
        elif tag == 'hr':
            self.md_file += '\n*** \n'
        elif tag == 'img':
            attr_dict = dict(attrs)
            name = attr_dict.get('data-filename', 'image')
            img_ref = attr_dict['src']
            self.related_data.append(img_ref)
            self.md_file += f'![{name}]({img_ref})'
        else:
            print("<" + tag + " />")

    def handle_data(self, data):
        if self.is_link:
            self.md_file += f'[{data}]({self.link_ref})'
        elif self.ignore_data:
            pass
        else:
            self.md_file += data
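For context, a short sketch of driving the deleted Converter above; `md_file` accumulates the Markdown and `related_data` collects relative hrefs and image sources for the migration (the sample HTML is illustrative):

    conv = Converter()
    conv.feed('<h2>Notes</h2><p>See <a href="/pages/about">this page</a>.</p>')
    print(conv.md_file)       # '## Notes' plus the converted paragraph
    print(conv.related_data)  # ['/pages/about']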
@@ -1,7 +1,7 @@
 import re

 # Use Unicode characters instead of their ascii pseudo-replacements
-UNICODE_SNOB = False
+UNICODE_SNOB = True

 # Marker to use for marking tables for padding post processing
 TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"

@@ -13,7 +13,7 @@ ESCAPE_SNOB = False
 LINKS_EACH_PARAGRAPH = False

 # Wrap long lines at position. 0 for no wrapping.
-BODY_WIDTH = 78
+BODY_WIDTH = 0

 # Don't show internal links (href="#local-anchor") -- corresponding link
 # targets won't be visible in the plain text file anyway.

@@ -24,7 +24,7 @@ INLINE_LINKS = True

 # Protect links from line breaks surrounding them with angle brackets (in
 # addition to their square brackets)
-PROTECT_LINKS = False
+PROTECT_LINKS = True
 WRAP_LINKS = True

 # Wrap list items.

@@ -156,7 +156,7 @@ IGNORE_TABLES = False

 # Use a single line break after a block element rather than two line breaks.
 # NOTE: Requires body width setting to be 0.
-SINGLE_LINE_BREAK = False
+SINGLE_LINE_BREAK = True


 # Use double quotation marks when converting the <q> tag.
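The toggles above are module-wide defaults; per-instance, the html2text package exposes the same switches as lower-case attributes. A sketch, assuming the vendored package behaves like upstream html2text:

    import html2text

    h = html2text.HTML2Text()
    h.unicode_snob = True       # mirrors UNICODE_SNOB = True
    h.body_width = 0            # mirrors BODY_WIDTH = 0
    h.single_line_break = True  # mirrors SINGLE_LINE_BREAK = True
    print(h.handle('<p>one</p><p>two</p>'))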
@@ -82,7 +82,7 @@
     "blizhniy-vostok": "middle-east",
     "blizost": "closeness",
     "blokada": "blockade",
-    "bob-dilan": "bob-dilan",
+    "bob-dilan": "bob-dylan",
     "bog": "god",
     "bol": "pain",
     "bolotnoe-delo": "bolotnaya-case",
@@ -205,7 +205,7 @@
     "erich-von-neff": "erich-von-neff",
     "erotika": "erotics",
     "essay": "essay",
-    "estetika": "aestetic",
+    "estetika": "aestetics",
     "etika": "ethics",
     "etnos": "ethnics",
     "everyday-life": "everyday-life",
@@ -219,7 +219,7 @@
     "faktcheking": "fact-checking",
     "falsifikatsii": "falsifications",
     "family": "family",
-    "fanfiki": "fanfiction",
+    "fanfiki": "fan-fiction",
     "fantastika": "sci-fi",
     "fatalizm": "fatalism",
     "fedor-dostoevskiy": "fedor-dostoevsky",
@@ -234,7 +234,7 @@
     "folklor": "folklore",
     "fotoreportazh": "photoreports",
     "france": "france",
-    "frants-kafka": "Franz-Kafka",
+    "frants-kafka": "franz-kafka",
     "frederik-begbeder": "frederick-begbeder",
     "freedom": "freedom",
     "friendship": "friendship",
@@ -262,7 +262,7 @@
     "graffiti": "graffiti",
     "graphics": "graphics",
     "gravyura": "engraving",
-    "grazhdanskaya-oborona": "grob",
+    "grazhdanskaya-oborona": "grazhdanskaya-oborona",
     "gretsiya": "greece",
     "gulag": "gulag",
     "han-batyy": "khan-batyy",
@@ -332,7 +332,7 @@
     "kinoklub": "cinema-club",
     "kirill-serebrennikov": "kirill-serebrennikov",
     "klassika": "classic",
-    "kollektivnoe-bessoznatelnoe": "kollektivnoe-bessoznatelnoe",
+    "kollektivnoe-bessoznatelnoe": "collective-unconscious",
     "komediya": "comedy",
     "kommunikatsii": "communications",
     "kommunizm": "communism",
@@ -429,18 +429,18 @@
     "muzey": "museum",
     "muzhchiny": "man",
     "myshlenie": "thinking",
-    "nagornyy-karabah": "nagornyy-karabah",
+    "nagornyy-karabah": "nagorno-karabakh",
     "natsionalizm": "nationalism",
     "natsionalnaya-ideya": "national-idea",
     "natsizm": "nazism",
-    "natyurmort": "natyurmort",
+    "natyurmort": "nature-morte",
     "nauchpop": "pop-science",
     "nbp": "nbp",
     "nenavist": "hate",
     "neofitsialnaya-literatura": "unofficial-literature",
     "neoklassika": "neoclassic",
     "neprozrachnye-smysly": "hidden-meanings",
-    "neravenstvo": "non-equality",
+    "neravenstvo": "inequality",
     "new-year": "new-year",
     "neyronauka": "neuro-science",
     "neyroseti": "neural-networks",
@@ -458,7 +458,7 @@
     "ocherk": "etudes",
     "ochevidnyy-nuar": "ochevidnyy-nuar",
     "odinochestvo": "loneliness",
-    "odna-kniga-odna-istoriya": "odna-kniga-odna-istoriya",
+    "odna-kniga-odna-istoriya": "one-book-one-story",
     "okrainy": "outskirts",
     "opinions": "opinions",
     "oppozitsiya": "opposition",
@@ -467,7 +467,7 @@
     "osip-mandelshtam": "osip-mandelshtam",
     "oskar-uayld": "oscar-wilde",
     "osoznanie": "awareness",
-    "otnosheniya": "relationships",
+    "otnosheniya": "relationship",
     "pablo-pikasso": "pablo-picasso",
     "painting": "painting",
     "paintings": "painting",
@@ -613,7 +613,7 @@
     "sotsializm": "socialism",
     "sotsialnaya-filosofiya": "social-philosophy",
     "sotsseti": "social-networks",
-    "sotvorenie-tretego-rima": "sotvorenie-tretego-rima",
+    "sotvorenie-tretego-rima": "third-rome",
     "sovremennost": "modernity",
     "spaces": "spaces",
     "spektakl": "spectacles",
@@ -638,7 +638,7 @@
     "syurrealizm": "surrealism",
     "tales": "tales",
     "tanets": "dance",
-    "tataro-mongolskoe-igo": "tataro-mongolskoe-igo",
+    "tataro-mongolskoe-igo": "mongol-tatar-yoke",
     "tatuirovki": "tattoo",
     "technology": "technology",
     "televidenie": "tv",
@@ -663,8 +663,8 @@
     "trendy": "trends",
     "tretiy-reyh": "third-reich",
     "triller": "thriller",
-    "tsar": "tsar",
-    "tsar-edip": "tsar-edip",
+    "tsar": "central-african-republic",
+    "tsar-edip": "oedipus",
     "tsarevich-dmitriy": "tsarevich-dmitry",
     "tsennosti": "values",
     "tsenzura": "censorship",
@@ -702,11 +702,11 @@
     "videopoeziya": "video-poetry",
     "viktor-astafev": "viktor-astafev",
     "viktor-pelevin": "viktor-pelevin",
-    "vilgelm-rayh": "vilgelm-rayh",
+    "vilgelm-rayh": "wilhelm-reich",
     "vinzavod": "vinzavod",
     "violence": "violence",
     "visual-culture": "visual-culture",
-    "vizualnaya-poeziya": "vizual-poetry",
+    "vizualnaya-poeziya": "visual-poetry",
     "vladimir-lenin": "vladimir-lenin",
     "vladimir-nabokov": "vladimir-nabokov",
     "vladimir-putin": "vladimir-putin",
@@ -716,10 +716,10 @@
     "volontery": "volonteurs",
     "vong-karvay": "wong-karwai",
     "vospominaniya": "memories",
-    "vostok": "vostok",
+    "vostok": "east",
     "vremya": "time",
     "vudi-allen": "woody-allen",
-    "vynuzhdennye-otnosheniya": "forced-relationships",
+    "vynuzhdennye-otnosheniya": "forced-relationship",
     "war": "war",
     "war-in-ukraine-images": "war-in-ukrahine-images",
     "women": "women",
@@ -88,9 +88,6 @@ def migrate(entry):
     old = res['old_id']
     user = User.create(**res.copy())
     res['id'] = user.id
-    if res['slug'] == 'vorovich':
-        print(entry)
-        print(res)
     return res

 def migrate_email_subscription(entry):