diff --git a/migration/aaronsw-html2text.py b/migration/aaronsw-html2text.py deleted file mode 100644 index 12f994da..00000000 --- a/migration/aaronsw-html2text.py +++ /dev/null @@ -1,936 +0,0 @@ -#!/usr/bin/env python -"""html2text: Turn HTML into equivalent Markdown-structured text.""" -__version__ = "3.200.3" -__author__ = "Aaron Swartz (me@aaronsw.com)" -__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." -__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] - -# TODO: -# Support decoded entities with unifiable. - -try: - True -except NameError: - setattr(__builtins__, 'True', 1) - setattr(__builtins__, 'False', 0) - -def has_key(x, y): - if hasattr(x, 'has_key'): return x.has_key(y) - else: return y in x - -import html.entities as htmlentitydefs -import urllib.parse as urlparse -import html.parser as HTMLParser -import urllib.request as urllib -import optparse, re, sys, codecs, types - -try: from textwrap import wrap -except: pass - -#s upport the python3 API -if sys.version_info[0] == 3: - unichr = chr - xrange = range - -# Use Unicode characters instead of their ascii psuedo-replacements -UNICODE_SNOB = 0 - -# Escape all special characters. Output is less readable, but avoids corner case formatting issues. -ESCAPE_SNOB = 0 - -# Put the links after each paragraph instead of at the end. -LINKS_EACH_PARAGRAPH = 0 - -# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) -BODY_WIDTH = 0 - -# Don't show internal links (href="#local-anchor") -- corresponding link targets -# won't be visible in the plain text file anyway. 
-SKIP_INTERNAL_LINKS = False - -# Use inline, rather than reference, formatting for images and links -INLINE_LINKS = True - -# Number of pixels Google indents nested lists -GOOGLE_LIST_INDENT = 36 - -IGNORE_ANCHORS = False -IGNORE_IMAGES = False -IGNORE_EMPHASIS = False - -### Entity Nonsense ### - -def name2cp(k): - if k == 'apos': return ord("'") - if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3 - return htmlentitydefs.name2codepoint[k] - else: - k = htmlentitydefs.entitydefs[k] - if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1 - return ord(codecs.latin_1_decode(k)[0]) - -unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', -'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*', -'ndash':'-', 'oelig':'oe', 'aelig':'ae', -'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', -'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', -'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i', -'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', -'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u', -'lrm':' ', 'rlm':' '} - -unifiable_n = {} - -for k in unifiable.keys(): - unifiable_n[name2cp(k)] = unifiable[k] - -### End Entity Nonsense ### - -def onlywhite(line): - """Return true if the line does only consist of whitespace characters.""" - for c in line: - if c != ' ' and c != ' ': - return c == ' ' - return line - -def hn(tag): - if tag[0] == 'h' and len(tag) == 2: - try: - n = int(tag[1]) - if n in range(1, 10): return n - except ValueError: return 0 - -def dumb_property_dict(style): - """returns a hash of css attributes""" - return dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]]); - -def dumb_css_parser(data): - """returns a hash of css selectors, each of which contains a hash of css attributes""" - # remove @import sentences - data += ';' - importIndex = data.find('@import') - while importIndex 
!= -1: - data = data[0:importIndex] + data[data.find(';', importIndex) + 1:] - importIndex = data.find('@import') - - # parse the css. reverted from dictionary compehension in order to support older pythons - elements = [x.split('{') for x in data.split('}') if '{' in x.strip()] - try: - elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements]) - except ValueError: - elements = {} # not that important - - return elements - -def element_style(attrs, style_def, parent_style): - """returns a hash of the 'final' style attributes of the element""" - style = parent_style.copy() - if 'class' in attrs: - for css_class in attrs['class'].split(): - css_style = style_def['.' + css_class] - style.update(css_style) - if 'style' in attrs: - immediate_style = dumb_property_dict(attrs['style']) - style.update(immediate_style) - return style - -def google_list_style(style): - """finds out whether this is an ordered or unordered list""" - if 'list-style-type' in style: - list_style = style['list-style-type'] - if list_style in ['disc', 'circle', 'square', 'none']: - return 'ul' - return 'ol' - -def google_has_height(style): - """check if the style of the element has the 'height' attribute explicitly defined""" - if 'height' in style: - return True - return False - -def google_text_emphasis(style): - """return a list of all emphasis modifiers of the element""" - emphasis = [] - if 'text-decoration' in style: - emphasis.append(style['text-decoration']) - if 'font-style' in style: - emphasis.append(style['font-style']) - if 'font-weight' in style: - emphasis.append(style['font-weight']) - return emphasis - -def google_fixed_width_font(style): - """check if the css of the current element defines a fixed width font""" - font_family = '' - if 'font-family' in style: - font_family = style['font-family'] - if 'Courier New' == font_family or 'Consolas' == font_family: - return True - return False - -def list_numbering_start(attrs): - """extract numbering from list element 
attributes""" - if 'start' in attrs: - return int(attrs['start']) - 1 - else: - return 0 - -class HTML2Text(HTMLParser.HTMLParser): - def __init__(self, out=None, baseurl=''): - HTMLParser.HTMLParser.__init__(self) - - # Config options - self.unicode_snob = UNICODE_SNOB - self.escape_snob = ESCAPE_SNOB - self.links_each_paragraph = LINKS_EACH_PARAGRAPH - self.body_width = BODY_WIDTH - self.skip_internal_links = SKIP_INTERNAL_LINKS - self.inline_links = INLINE_LINKS - self.google_list_indent = GOOGLE_LIST_INDENT - self.ignore_links = IGNORE_ANCHORS - self.ignore_images = IGNORE_IMAGES - self.ignore_emphasis = IGNORE_EMPHASIS - self.google_doc = False - self.ul_item_mark = '*' - self.emphasis_mark = '_' - self.strong_mark = '**' - - if out is None: - self.out = self.outtextf - else: - self.out = out - - self.outtextlist = [] # empty list to store output characters before they are "joined" - - try: - self.outtext = unicode() - except NameError: # Python3 - self.outtext = str() - - self.quiet = 0 - self.p_p = 0 # number of newline character to print before next output - self.outcount = 0 - self.start = 1 - self.space = 0 - self.a = [] - self.astack = [] - self.maybe_automatic_link = None - self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://') - self.acount = 0 - self.list = [] - self.blockquote = 0 - self.pre = 0 - self.startpre = 0 - self.code = False - self.br_toggle = '' - self.lastWasNL = 0 - self.lastWasList = False - self.style = 0 - self.style_def = {} - self.tag_stack = [] - self.emphasis = 0 - self.drop_white_space = 0 - self.inheader = False - self.abbr_title = None # current abbreviation definition - self.abbr_data = None # last inner HTML (for abbr being defined) - self.abbr_list = {} # stack of abbreviations to write later - self.baseurl = baseurl - self.header_id = None - self.span_hightlight = False - self.span_lead = False - - try: del unifiable_n[name2cp('nbsp')] - except KeyError: pass - unifiable['nbsp'] = ' _place_holder;' - - - def feed(self, 
data): - data = data.replace("", "") - HTMLParser.HTMLParser.feed(self, data) - - def handle(self, data): - self.feed(data) - self.feed("") - return self.optwrap(self.close()) - - def outtextf(self, s): - self.outtextlist.append(s) - if s: self.lastWasNL = s[-1] == '\n' - - def close(self): - HTMLParser.HTMLParser.close(self) - - self.pbr() - self.o('', 0, 'end') - - self.outtext = self.outtext.join(self.outtextlist) - if self.unicode_snob: - nbsp = unichr(name2cp('nbsp')) - else: - nbsp = u' ' - self.outtext = self.outtext.replace(u' _place_holder;', nbsp) - - return self.outtext - - def handle_charref(self, c): - self.o(self.charref(c), 1) - - def handle_entityref(self, c): - self.o(self.entityref(c), 1) - - def handle_starttag(self, tag, attrs): - self.handle_tag(tag, attrs, 1) - - def handle_endtag(self, tag): - self.handle_tag(tag, None, 0) - - def previousIndex(self, attrs): - """ returns the index of certain set of attributes (of a link) in the - self.a list - - If the set of attributes is not found, returns None - """ - if not has_key(attrs, 'href'): return None - - i = -1 - for a in self.a: - i += 1 - match = 0 - - if has_key(a, 'href') and a['href'] == attrs['href']: - if has_key(a, 'title') or has_key(attrs, 'title'): - if (has_key(a, 'title') and has_key(attrs, 'title') and - a['title'] == attrs['title']): - match = True - else: - match = True - - if match: return i - - def drop_last(self, nLetters): - if not self.quiet: - self.outtext = self.outtext[:-nLetters] - - def handle_emphasis(self, start, tag_style, parent_style): - """handles various text emphases""" - tag_emphasis = google_text_emphasis(tag_style) - parent_emphasis = google_text_emphasis(parent_style) - - # handle Google's text emphasis - strikethrough = 'line-through' in tag_emphasis and self.hide_strikethrough - bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis - italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis - fixed = google_fixed_width_font(tag_style) 
and not \ - google_fixed_width_font(parent_style) and not self.pre - - if start: - # crossed-out text must be handled before other attributes - # in order not to output qualifiers unnecessarily - if bold or italic or fixed: - self.emphasis += 1 - if strikethrough: - self.quiet += 1 - if italic: - self.o(self.emphasis_mark) - self.drop_white_space += 1 - if bold: - self.o(self.strong_mark) - self.drop_white_space += 1 - if fixed: - self.o('`') - self.drop_white_space += 1 - self.code = True - else: - if bold or italic or fixed: - # there must not be whitespace before closing emphasis mark - self.emphasis -= 1 - self.space = 0 - self.outtext = self.outtext.rstrip() - if fixed: - if self.drop_white_space: - # empty emphasis, drop it - self.drop_last(1) - self.drop_white_space -= 1 - else: - self.o('`') - self.code = False - if bold: - if self.drop_white_space: - # empty emphasis, drop it - self.drop_last(2) - self.drop_white_space -= 1 - else: - self.o(self.strong_mark) - if italic: - if self.drop_white_space: - # empty emphasis, drop it - self.drop_last(1) - self.drop_white_space -= 1 - else: - self.o(self.emphasis_mark) - # space is only allowed after *all* emphasis marks - if (bold or italic) and not self.emphasis: - self.o(" ") - if strikethrough: - self.quiet -= 1 - - def handle_tag(self, tag, attrs, start): - #attrs = fixattrs(attrs) - if attrs is None: - attrs = {} - else: - attrs = dict(attrs) - - if self.google_doc: - # the attrs parameter is empty for a closing tag. in addition, we - # need the attributes of the parent nodes in order to get a - # complete style description for the current element. we assume - # that google docs export well formed html. 
- parent_style = {} - if start: - if self.tag_stack: - parent_style = self.tag_stack[-1][2] - tag_style = element_style(attrs, self.style_def, parent_style) - self.tag_stack.append((tag, attrs, tag_style)) - else: - dummy, attrs, tag_style = self.tag_stack.pop() - if self.tag_stack: - parent_style = self.tag_stack[-1][2] - - if hn(tag): - if start: - self.p() - self.inheader = True - self.o(hn(tag)*"#" + ' ') - self.header_id = attrs.get('id') - else: - if self.header_id: - self.o(' {#' + self.header_id + '}') - self.header_id = None - self.p() - self.inheader = False - return # prevent redundant emphasis marks on headers - - if tag == 'span': - if start and 'class' in attrs: - if attrs['class'] == 'highlight': - self.o('`') # NOTE: same as - self.span_hightlight = True - elif attrs['class'] == 'lead': - if self.span_lead == False: - self.o('==\n') # NOTE: but CriticMarkup uses {== ==} - self.span_lead = True - else: - if self.span_hightlight: - self.o('`') - self.span_hightlight = False - elif self.span_lead: - if self.span_lead == True: - self.o('\n==') - self.span_lead = False - - if tag in ['p', 'div']: - if self.google_doc: - if start and google_has_height(tag_style): - self.p() - else: - self.soft_br() - else: - self.p() - - if tag == "br" and start: self.o(" \n") - - if tag == "hr" and start: - self.p() - self.o("* * *") - self.p() - - if tag in ["head", "style", 'script']: - if start: self.quiet += 1 - else: self.quiet -= 1 - - if tag == "style": - if start: self.style += 1 - else: self.style -= 1 - - if tag in ["body"]: - self.quiet = 0 # sites like 9rules.com never close - - if tag == "blockquote": - if start: - self.p(); self.o('> ', 0, 1); self.start = 1 - self.blockquote += 1 - else: - self.blockquote -= 1 - self.p() - - if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: self.o(self.emphasis_mark) - if tag in ['strong', 'b'] and not self.ignore_emphasis: self.o(self.strong_mark) - if tag in ['del', 'strike', 's']: - if start: - 
self.o("<"+tag+">") - else: - self.o("") - - if self.google_doc: - if not self.inheader: - # handle some font attributes, but leave headers clean - self.handle_emphasis(start, tag_style, parent_style) - - if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` `` - if tag == "abbr": - if start: - self.abbr_title = None - self.abbr_data = '' - if has_key(attrs, 'title'): - self.abbr_title = attrs['title'] - else: - if self.abbr_title != None: - self.abbr_list[self.abbr_data] = self.abbr_title - self.abbr_title = None - self.abbr_data = '' - - if tag == "a" and not self.ignore_links: - if start: - if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')): - self.astack.append(attrs) - self.maybe_automatic_link = attrs['href'][:2000] - else: - self.astack.append(None) - else: - if self.astack: - a = self.astack.pop() - if self.maybe_automatic_link: - self.maybe_automatic_link = None - elif a: - if self.inline_links: - self.o("](" + escape_md(a['href']) + ")") - else: - i = self.previousIndex(a) - if i is not None: - a = self.a[i] - else: - self.acount += 1 - a['count'] = self.acount - a['outcount'] = self.outcount - self.a.append(a) - self.o("][" + str(a['count']) + "]") - - if tag == "img" and start and not self.ignore_images: - if has_key(attrs, 'src'): - attrs['href'] = attrs['src'] - alt = attrs.get('alt', '') - self.o("![" + escape_md(alt) + "]") - - if self.inline_links: - self.o("(" + escape_md(attrs['href']) + ")") - else: - i = self.previousIndex(attrs) - if i is not None: - attrs = self.a[i] - else: - self.acount += 1 - attrs['count'] = self.acount - attrs['outcount'] = self.outcount - self.a.append(attrs) - self.o("[" + str(attrs['count']) + "]") - - if tag == 'dl' and start: self.p() - if tag == 'dt' and not start: self.pbr() - if tag == 'dd' and start: self.o(' ') - if tag == 'dd' and not start: self.pbr() - - if tag in ["ol", "ul"]: - # Google Docs create sub lists as top level lists - if (not 
self.list) and (not self.lastWasList): - self.p() - if start: - if self.google_doc: - list_style = google_list_style(tag_style) - else: - list_style = tag - numbering_start = list_numbering_start(attrs) - self.list.append({'name':list_style, 'num':numbering_start}) - else: - if self.list: self.list.pop() - self.lastWasList = True - else: - self.lastWasList = False - - if tag == 'li': - self.pbr() - if start: - if self.list: li = self.list[-1] - else: li = {'name':'ul', 'num':0} - if self.google_doc: - nest_count = self.google_nest_count(tag_style) - else: - nest_count = len(self.list) - self.o(" " * int(nest_count)) #TODO: line up
  1. s > 9 correctly. - if li['name'] == "ul": self.o(self.ul_item_mark + " ") - elif li['name'] == "ol": - li['num'] += 1 - self.o(str(li['num'])+". ") - self.start = 1 - - if tag in ["table", "tr"] and start: self.p() - if tag == 'td': self.pbr() - - if tag == "pre": - if start: - self.startpre = 1 - self.pre = 1 - else: - self.pre = 0 - self.p() - - def pbr(self): - if self.p_p == 0: - self.p_p = 1 - - def p(self): - self.p_p = 2 - - def soft_br(self): - self.pbr() - self.br_toggle = ' ' - - def o(self, data, puredata=0, force=0): - if self.abbr_data is not None: - self.abbr_data += data - - if not self.quiet: - if self.google_doc: - # prevent white space immediately after 'begin emphasis' marks ('**' and '_') - lstripped_data = data.lstrip() - if self.drop_white_space and not (self.pre or self.code): - data = lstripped_data - if lstripped_data != '': - self.drop_white_space = 0 - - if puredata and not self.pre: - data = re.sub('\s+', ' ', data) - if data and data[0] == ' ': - self.space = 1 - data = data[1:] - if not data and not force: return - - if self.startpre: - #self.out(" :") #TODO: not output when already one there - if not data.startswith("\n"): #
    stuff...
    -                    data = "\n" + data
    -
    -            bq = (">" * self.blockquote)
    -            if not (force and data and data[0] == ">") and self.blockquote: bq += " "
    -
    -            if self.pre:
    -                if not self.list:
    -                    bq += "    "
    -                #else: list content is already partially indented
    -                for i in xrange(len(self.list)):
    -                    bq += "    "
    -                data = data.replace("\n", "\n"+bq)
    -
    -            if self.startpre:
    -                self.startpre = 0
    -                if self.list:
    -                    data = data.lstrip("\n") # use existing initial indentation
    -
    -            if self.start:
    -                self.space = 0
    -                self.p_p = 0
    -                self.start = 0
    -
    -            if force == 'end':
    -                # It's the end.
    -                self.p_p = 0
    -                self.out("\n")
    -                self.space = 0
    -
    -            if self.p_p:
    -                self.out((self.br_toggle+'\n'+bq)*self.p_p)
    -                self.space = 0
    -                self.br_toggle = ''
    -
    -            if self.space:
    -                if not self.lastWasNL: self.out(' ')
    -                self.space = 0
    -
    -            if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
    -                if force == "end": self.out("\n")
    -
    -                newa = []
    -                for link in self.a:
    -                    if self.outcount > link['outcount']:
    -                        self.out("   ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
    -                        if has_key(link, 'title'): self.out(" ("+link['title']+")")
    -                        self.out("\n")
    -                    else:
    -                        newa.append(link)
    -
    -                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
    -
    -                self.a = newa
    -
    -            if self.abbr_list and force == "end":
    -                for abbr, definition in self.abbr_list.items():
    -                    self.out("  *[" + abbr + "]: " + definition + "\n")
    -
    -            self.p_p = 0
    -            self.out(data)
    -            self.outcount += 1
    -
    -    def handle_data(self, data):
    -        if r'\/script>' in data: self.quiet -= 1
    -
    -        if self.style:
    -            self.style_def.update(dumb_css_parser(data))
    -
    -        if not self.maybe_automatic_link is None:
    -            href = self.maybe_automatic_link
    -            if href == data and self.absolute_url_matcher.match(href):
    -                self.o("<" + data + ">")
    -                return
    -            else:
    -                self.o("[")
    -                self.maybe_automatic_link = None
    -
    -        if not self.code and not self.pre:
    -            data = escape_md_section(data, snob=self.escape_snob)
    -        self.o(data, 1)
    -
    -    def unknown_decl(self, data): pass
    -
    -    def charref(self, name):
    -        if name[0] in ['x','X']:
    -            c = int(name[1:], 16)
    -        else:
    -            c = int(name)
    -
    -        if not self.unicode_snob and c in unifiable_n.keys():
    -            return unifiable_n[c]
    -        else:
    -            try:
    -                return unichr(c)
    -            except NameError: #Python3
    -                return chr(c)
    -
    -    def entityref(self, c):
    -        if not self.unicode_snob and c in unifiable.keys():
    -            return unifiable[c]
    -        else:
    -            try: name2cp(c)
    -            except KeyError: return "&" + c + ';'
    -            else:
    -                try:
    -                    return unichr(name2cp(c))
    -                except NameError: #Python3
    -                    return chr(name2cp(c))
    -
    -    def replaceEntities(self, s):
    -        s = s.group(1)
    -        if s[0] == "#":
    -            return self.charref(s[1:])
    -        else: return self.entityref(s)
    -
    -    r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
    -    def unescape(self, s):
    -        return self.r_unescape.sub(self.replaceEntities, s)
    -
    -    def google_nest_count(self, style):
    -        """calculate the nesting count of google doc lists"""
    -        nest_count = 0
    -        if 'margin-left' in style:
    -            nest_count = int(style['margin-left'][:-2]) / self.google_list_indent
    -        return nest_count
    -
    -
    -    def optwrap(self, text):
    -        """Wrap all paragraphs in the provided text."""
    -        if not self.body_width:
    -            return text
    -
    -        assert wrap, "Requires Python 2.3."
    -        result = ''
    -        newlines = 0
    -        for para in text.split("\n"):
    -            if len(para) > 0:
    -                if not skipwrap(para):
    -                    result += "\n".join(wrap(para, self.body_width))
    -                    if para.endswith('  '):
    -                        result += "  \n"
    -                        newlines = 1
    -                    else:
    -                        result += "\n\n"
    -                        newlines = 2
    -                else:
    -                    if not onlywhite(para):
    -                        result += para + "\n"
    -                        newlines = 1
    -            else:
    -                if newlines < 2:
    -                    result += "\n"
    -                    newlines += 1
    -        return result
    -
    -ordered_list_matcher = re.compile(r'\d+\.\s')
    -unordered_list_matcher = re.compile(r'[-\*\+]\s')
    -md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
    -md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
    -md_dot_matcher = re.compile(r"""
    -    ^             # start of line
    -    (\s*\d+)      # optional whitespace and a number
    -    (\.)          # dot
    -    (?=\s)        # lookahead assert whitespace
    -    """, re.MULTILINE | re.VERBOSE)
    -md_plus_matcher = re.compile(r"""
    -    ^
    -    (\s*)
    -    (\+)
    -    (?=\s)
    -    """, flags=re.MULTILINE | re.VERBOSE)
    -md_dash_matcher = re.compile(r"""
    -    ^
    -    (\s*)
    -    (-)
    -    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
    -                  # or another dash (header or hr)
    -    """, flags=re.MULTILINE | re.VERBOSE)
    -slash_chars = r'\`*_{}[]()#+-.!'
    -md_backslash_matcher = re.compile(r'''
    -    (\\)          # match one slash
    -    (?=[%s])      # followed by a char that requires escaping
    -    ''' % re.escape(slash_chars),
    -    flags=re.VERBOSE)
    -
    -def skipwrap(para):
    -    # If the text begins with four spaces or one tab, it's a code block; don't wrap
    -    if para[0:4] == '    ' or para[0] == '\t':
    -        return True
    -    # If the text begins with only two "--", possibly preceded by whitespace, that's
    -    # an emdash; so wrap.
    -    stripped = para.lstrip()
    -    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
    -        return False
    -    # I'm not sure what this is for; I thought it was to detect lists, but there's
    -    # a 
    -inside- case in one of the tests that also depends upon it. - if stripped[0:1] == '-' or stripped[0:1] == '*': - return True - # If the text begins with a single -, *, or +, followed by a space, or an integer, - # followed by a ., followed by a space (in either case optionally preceeded by - # whitespace), it's a list; don't wrap. - if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped): - return True - return False - -def wrapwrite(text): - text = text.encode('utf-8') - try: #Python3 - sys.stdout.buffer.write(text) - except AttributeError: - sys.stdout.write(text) - -def html2text(html, baseurl=''): - h = HTML2Text(baseurl=baseurl) - return h.handle(html) - -def unescape(s, unicode_snob=False): - h = HTML2Text() - h.unicode_snob = unicode_snob - return h.unescape(s) - -def escape_md(text): - """Escapes markdown-sensitive characters within other markdown constructs.""" - return md_chars_matcher.sub(r"\\\1", text) - -def escape_md_section(text, snob=False): - """Escapes markdown-sensitive characters across whole document sections.""" - text = md_backslash_matcher.sub(r"\\\1", text) - if snob: - text = md_chars_matcher_all.sub(r"\\\1", text) - text = md_dot_matcher.sub(r"\1\\\2", text) - text = md_plus_matcher.sub(r"\1\\\2", text) - text = md_dash_matcher.sub(r"\1\\\2", text) - return text - - -def main(): - baseurl = '' - - p = optparse.OptionParser('%prog [(filename|url) [encoding]]', version='%prog ' + __version__) - p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true", - default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis") - p.add_option("--ignore-links", dest="ignore_links", action="store_true", - default=IGNORE_ANCHORS, help="don't include any formatting for links") - p.add_option("--ignore-images", dest="ignore_images", action="store_true", - default=IGNORE_IMAGES, help="don't include any formatting for images") - p.add_option("-g", "--google-doc", action="store_true", 
dest="google_doc", - default=False, help="convert an html-exported Google Document") - p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash", - default=False, help="use a dash rather than a star for unordered list items") - p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk", - default=False, help="use an asterisk rather than an underscore for emphasized text") - p.add_option("-b", "--body-width", dest="body_width", action="store", type="int", - default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap") - p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int", - default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists") - p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough", - default=False, help="hide strike-through text. only relevant when -g is specified as well") - p.add_option("--escape-all", action="store_true", dest="escape_snob", - default=False, help="Escape all special characters. 
Output is less readable, but avoids corner case formatting issues.") - (options, args) = p.parse_args() - - # process input - encoding = "utf-8" - if len(args) > 0: - file_ = args[0] - if len(args) == 2: - encoding = args[1] - if len(args) > 2: - p.error('Too many arguments') - - if file_.startswith('http://') or file_.startswith('https://'): - baseurl = file_ - j = urllib.urlopen(baseurl) - data = j.read() - if encoding is None: - try: - from feedparser import _getCharacterEncoding as enc - except ImportError: - enc = lambda x, y: ('utf-8', 1) - encoding = enc(j.headers, data)[0] - if encoding == 'en-ascii': - encoding = 'utf-8' - else: - data = open(file_, 'rb').read() - if encoding is None: - try: - from chardet import detect - except ImportError: - detect = lambda x: {'encoding': 'utf-8'} - encoding = detect(data)['encoding'] - else: - data = sys.stdin.read() - - data = data.decode(encoding) - h = HTML2Text(baseurl=baseurl) - # handle options - if options.ul_style_dash: h.ul_item_mark = '-' - if options.em_style_asterisk: - h.emphasis_mark = '*' - h.strong_mark = '__' - - h.body_width = options.body_width - h.list_indent = options.list_indent - h.ignore_emphasis = options.ignore_emphasis - h.ignore_links = options.ignore_links - h.ignore_images = options.ignore_images - h.google_doc = options.google_doc - h.hide_strikethrough = options.hide_strikethrough - h.escape_snob = options.escape_snob - - wrapwrite(h.handle(data)) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/migration/bson2json.py b/migration/bson2json.py index bbe909f7..9559665c 100644 --- a/migration/bson2json.py +++ b/migration/bson2json.py @@ -24,5 +24,5 @@ def json_tables(): base, d = bson.decode_document(bs, base) lc.append(d) data[table] = lc - open('migration/data/'+table+'.json', 'w').write(json.dumps(lc,cls=DateTimeEncoder)) + open('dump/discours/'+table+'.json', 'w').write(json.dumps(lc,cls=DateTimeEncoder)) diff --git a/migration/html2md.py 
b/migration/html2md.py deleted file mode 100644 index d2f23e1c..00000000 --- a/migration/html2md.py +++ /dev/null @@ -1,215 +0,0 @@ -from html.parser import HTMLParser -import os -import codecs -from typing import Tuple - - -class Converter(HTMLParser): - md_file: str - temp_tag: str - code_box: bool - div_count: int - code_box_div_num: int - ol_count: int - related_data: list - is_link: bool - link_ref: str - ignore_data: bool - class_div_count: int - ignore_div: bool - table_start: Tuple[int, int] - - def __init__(self): - super().__init__() - self.md_file = '' - self.code_box = False - self.div_count = 0 - self.span_count = 0 - self.code_box_div_num = 0 - self.ol_count = 0 - self.temp_tag = '' - self.related_data = [] - self.is_link = False - self.link_ref = '' - self.ignore_data = False - self.class_div_count = 0 - self.ignore_div = False - - def handle_starttag(self, tag, attrs): - if self.ignore_data: - return None - elif tag == 'sup': - self.md_file += '' - elif tag == 'p': - self.temp_tag = 'p' - self.md_file += '\n' - elif tag == 'i': - self.temp_tag = 'i' - self.md_file += '*' - elif tag == 'wbr': - self.temp_tag = 'wbr' - self.md_file += '' - elif tag == 'span': - self.temp_tag = 'span' - self.span_count += 1 - self.md_file += ' ' - elif tag == 'figcaption': - self.md_file += '' - elif tag == 'hr': - self.md_file += '\n*** \n' - elif tag == 'title': - self.md_file += '# ' - elif tag == 'h1': - self.md_file += '# ' - elif tag == 'h2': - self.md_file += '## ' - elif tag == 'h3': - self.md_file += '### ' - elif tag == 'b' or tag == 'strong': - self.md_file += '**' - elif tag == 'ul': - self.temp_tag = 'ul' - self.md_file += ' \n' - elif tag == 'ol': - self.ol_count = 0 - self.temp_tag = 'ol' - self.md_file += ' \n' - elif tag == 'li': - if self.temp_tag == 'ul': - self.md_file += '* ' - elif self.temp_tag == 'ol': - self.ol_count += 1 - self.md_file += f'{self.ol_count}. 
' - elif tag == 'div': - self.div_count += 1 - attrs_dict = dict(attrs) - if 'style' in attrs_dict and 'codeblock' in attrs_dict['style']: - self.code_box_div_num = self.div_count - self.code_box = True - self.md_file += '```\n' - elif 'class' in attrs_dict: - self.class_div_count = self.div_count - self.ignore_div = True - elif tag == 'pre' or tag == 'code': - self.code_box = True - self.md_file += '\n```\n' - elif tag == 'a': - self.is_link = True - attrs_dict = dict(attrs) - self.link_ref = attrs_dict.get('href', '#') - if not self.link_ref.startswith('http') and not self.link_ref.endswith('html') and not '@' in self.link_ref: - self.related_data.append(self.link_ref) - elif tag == 'style': - self.ignore_data = True - elif tag == 'symbol': - self.ignore_data = True - elif tag == 'svg': - self.ignore_data = True - elif tag == 'path': - self.ignore_data = True - elif tag == 'img': - attrs_dict = dict(attrs) - img_ref = attrs_dict['src'] - alt_name = attrs_dict['alt'] if 'alt' in attrs_dict else 'x' - if self.is_link: - self.related_data.append(img_ref) - self.md_file += f'[![{alt_name}]({img_ref})]({self.link_ref})' - else: - self.related_data.append(img_ref) - self.md_file += f'![{alt_name}]({img_ref})' - elif tag == 'table': - self.ignore_data = True - self.table_start = self.getpos() - else: - print('<' + tag + '>') - - def get_rawdata(self, start, stop, offset): - temp_rawdata = self.rawdata - for i in range(offset-1): - next_section = temp_rawdata.find('\n') - temp_rawdata = temp_rawdata[next_section+1:] - return temp_rawdata[start:stop] - - def handle_endtag(self, tag): - if tag == 'b' or tag == 'strong': - self.md_file += '** ' - elif tag == 'sup': - self.md_file += '' - elif tag == 'iframe': - self.ignore_data = False - elif tag == 'wbr': - self.md_file += '' - elif tag == 'title': - self.md_file += '\n' - elif tag == 'h1': - self.md_file += '\n' - elif tag == 'h2': - self.md_file += '\n' - elif tag == 'h3': - self.md_file += '\n' - elif tag == 'h4': - 
self.md_file += '\n' - elif tag == 'span': - self.span_count -= 1 - self.md_file += ' ' - elif tag == 'figcaption': - self.md_file += '\n' - elif tag == 'i': - self.md_file += '* ' - elif tag == 'p': - self.md_file += '\n' - elif tag == 'div': - if self.code_box and self.code_box_div_num == self.div_count: - self.code_box = False - self.md_file += '```\n' - elif self.ignore_div and self.class_div_count == self.div_count: - self.ignore_div = False - else: - self.md_file += ' \n' - self.div_count -= 1 - elif tag == 'pre' or tag == 'code': - self.code_box = False - self.md_file += '```\n' - elif tag == 'a': - self.is_link = False - elif tag == 'style': - self.ignore_data = False - elif tag == 'symbol': - self.ignore_data = False - elif tag == 'svg': - self.ignore_data = False - elif tag == 'li': - self.md_file += ' \n' - elif tag == 'table': - offset, lineno_stop = self.getpos() - lineno_stop = lineno_stop + len(tag) + 3 - _, lineno_start = self.table_start - raw_data = self.get_rawdata(lineno_start, lineno_stop, offset) - self.md_file += '\n' + raw_data - self.ignore_data = False - else: - print('') - - def handle_startendtag(self, tag, attrs): - if tag == 'br': - self.md_file += ' \n' - elif tag == 'wbr': - self.md_file += '' - elif tag == 'hr': - self.md_file += '\n*** \n' - elif tag == 'img': - attr_dict = dict(attrs) - name = attr_dict.get('data-filename', 'image') - img_ref = attr_dict['src'] - self.related_data.append(img_ref) - self.md_file += f'![{name}]({img_ref})' - else: - print("<" + tag + " />") - - def handle_data(self, data): - if self.is_link: - self.md_file += f'[{data}]({self.link_ref})' - elif self.ignore_data: - pass - else: - self.md_file += data diff --git a/migration/html2text/config.py b/migration/html2text/config.py index 9c10445a..9962b125 100644 --- a/migration/html2text/config.py +++ b/migration/html2text/config.py @@ -1,7 +1,7 @@ import re # Use Unicode characters instead of their ascii pseudo-replacements -UNICODE_SNOB = False 
+UNICODE_SNOB = True # Marker to use for marking tables for padding post processing TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding" @@ -13,7 +13,7 @@ ESCAPE_SNOB = False LINKS_EACH_PARAGRAPH = False # Wrap long lines at position. 0 for no wrapping. -BODY_WIDTH = 78 +BODY_WIDTH = 0 # Don't show internal links (href="#local-anchor") -- corresponding link # targets won't be visible in the plain text file anyway. @@ -24,7 +24,7 @@ INLINE_LINKS = True # Protect links from line breaks surrounding them with angle brackets (in # addition to their square brackets) -PROTECT_LINKS = False +PROTECT_LINKS = True WRAP_LINKS = True # Wrap list items. @@ -156,7 +156,7 @@ IGNORE_TABLES = False # Use a single line break after a block element rather than two line breaks. # NOTE: Requires body width setting to be 0. -SINGLE_LINE_BREAK = False +SINGLE_LINE_BREAK = True # Use double quotation marks when converting the tag. diff --git a/migration/tables/replacements.json b/migration/tables/replacements.json index 234715d3..544ef95e 100644 --- a/migration/tables/replacements.json +++ b/migration/tables/replacements.json @@ -82,7 +82,7 @@ "blizhniy-vostok": "middle-east", "blizost": "closeness", "blokada": "blockade", - "bob-dilan": "bob-dilan", + "bob-dilan": "bob-dylan", "bog": "god", "bol": "pain", "bolotnoe-delo": "bolotnaya-case", @@ -205,7 +205,7 @@ "erich-von-neff": "erich-von-neff", "erotika": "erotics", "essay": "essay", - "estetika": "aestetic", + "estetika": "aesthetics", "etika": "ethics", "etnos": "ethnics", "everyday-life": "everyday-life", @@ -219,7 +219,7 @@ "faktcheking": "fact-checking", "falsifikatsii": "falsifications", "family": "family", - "fanfiki": "fanfiction", + "fanfiki": "fan-fiction", "fantastika": "sci-fi", "fatalizm": "fatalism", "fedor-dostoevskiy": "fedor-dostoevsky", @@ -234,7 +234,7 @@ "folklor": "folklore", "fotoreportazh": "photoreports", "france": "france", - "frants-kafka": "Franz-Kafka", + "frants-kafka": "franz-kafka", "frederik-begbeder": 
"frederick-begbeder", "freedom": "freedom", "friendship": "friendship", @@ -262,7 +262,7 @@ "graffiti": "graffiti", "graphics": "graphics", "gravyura": "engraving", - "grazhdanskaya-oborona": "grob", + "grazhdanskaya-oborona": "grazhdanskaya-oborona", "gretsiya": "greece", "gulag": "gulag", "han-batyy": "khan-batyy", @@ -332,7 +332,7 @@ "kinoklub": "cinema-club", "kirill-serebrennikov": "kirill-serebrennikov", "klassika": "classic", - "kollektivnoe-bessoznatelnoe": "kollektivnoe-bessoznatelnoe", + "kollektivnoe-bessoznatelnoe": "collective-unconscious", "komediya": "comedy", "kommunikatsii": "communications", "kommunizm": "communism", @@ -429,18 +429,18 @@ "muzey": "museum", "muzhchiny": "man", "myshlenie": "thinking", - "nagornyy-karabah": "nagornyy-karabah", + "nagornyy-karabah": "nagorno-karabakh", "natsionalizm": "nationalism", "natsionalnaya-ideya": "national-idea", "natsizm": "nazism", - "natyurmort": "natyurmort", + "natyurmort": "nature-morte", "nauchpop": "pop-science", "nbp": "nbp", "nenavist": "hate", "neofitsialnaya-literatura": "unofficial-literature", "neoklassika": "neoclassic", "neprozrachnye-smysly": "hidden-meanings", - "neravenstvo": "non-equality", + "neravenstvo": "inequality", "new-year": "new-year", "neyronauka": "neuro-science", "neyroseti": "neural-networks", @@ -458,7 +458,7 @@ "ocherk": "etudes", "ochevidnyy-nuar": "ochevidnyy-nuar", "odinochestvo": "loneliness", - "odna-kniga-odna-istoriya": "odna-kniga-odna-istoriya", + "odna-kniga-odna-istoriya": "one-book-one-story", "okrainy": "outskirts", "opinions": "opinions", "oppozitsiya": "opposition", @@ -467,7 +467,7 @@ "osip-mandelshtam": "osip-mandelshtam", "oskar-uayld": "oscar-wilde", "osoznanie": "awareness", - "otnosheniya": "relationships", + "otnosheniya": "relationship", "pablo-pikasso": "pablo-picasso", "painting": "painting", "paintings": "painting", @@ -613,7 +613,7 @@ "sotsializm": "socialism", "sotsialnaya-filosofiya": "social-philosophy", "sotsseti": "social-networks", - 
"sotvorenie-tretego-rima": "sotvorenie-tretego-rima", + "sotvorenie-tretego-rima": "third-rome", "sovremennost": "modernity", "spaces": "spaces", "spektakl": "spectacles", @@ -638,7 +638,7 @@ "syurrealizm": "surrealism", "tales": "tales", "tanets": "dance", - "tataro-mongolskoe-igo": "tataro-mongolskoe-igo", + "tataro-mongolskoe-igo": "mongol-tatar-yoke", "tatuirovki": "tattoo", "technology": "technology", "televidenie": "tv", @@ -663,8 +663,8 @@ "trendy": "trends", "tretiy-reyh": "third-reich", "triller": "thriller", - "tsar": "tsar", - "tsar-edip": "tsar-edip", + "tsar": "central-african-republic", + "tsar-edip": "oedipus", "tsarevich-dmitriy": "tsarevich-dmitry", "tsennosti": "values", "tsenzura": "censorship", @@ -702,11 +702,11 @@ "videopoeziya": "video-poetry", "viktor-astafev": "viktor-astafev", "viktor-pelevin": "viktor-pelevin", - "vilgelm-rayh": "vilgelm-rayh", + "vilgelm-rayh": "wilhelm-reich", "vinzavod": "vinzavod", "violence": "violence", "visual-culture": "visual-culture", - "vizualnaya-poeziya": "vizual-poetry", + "vizualnaya-poeziya": "visual-poetry", "vladimir-lenin": "vladimir-lenin", "vladimir-nabokov": "vladimir-nabokov", "vladimir-putin": "vladimir-putin", @@ -716,10 +716,10 @@ "volontery": "volonteurs", "vong-karvay": "wong-karwai", "vospominaniya": "memories", - "vostok": "vostok", + "vostok": "east", "vremya": "time", "vudi-allen": "woody-allen", - "vynuzhdennye-otnosheniya": "forced-relationships", + "vynuzhdennye-otnosheniya": "forced-relationship", "war": "war", "war-in-ukraine-images": "war-in-ukrahine-images", "women": "women", diff --git a/migration/tables/users.py b/migration/tables/users.py index f40f942e..0d51c291 100644 --- a/migration/tables/users.py +++ b/migration/tables/users.py @@ -88,9 +88,6 @@ def migrate(entry): old = res['old_id'] user = User.create(**res.copy()) res['id'] = user.id - if res['slug'] == 'vorovich': - print(entry) - print(res) return res def migrate_email_subscription(entry):