diff --git a/.gitignore b/.gitignore
index cf8d70c7..6655941d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -134,4 +134,5 @@
 discours.key
 discours.crt
 Pipfile.lock
-migration/data
\ No newline at end of file
+migration/data
+migration/content
\ No newline at end of file
diff --git a/Pipfile b/Pipfile
index a04611a3..cb2e780c 100644
--- a/Pipfile
+++ b/Pipfile
@@ -18,6 +18,7 @@
 psycopg2-binary = "*"
 Authlib = "*"
 bson = "*"
 python-frontmatter = "*"
+bs4 = "*"
 
 [dev-packages]
diff --git a/migrate.py b/migrate.py
index 67581f9d..c9135010 100644
--- a/migrate.py
+++ b/migrate.py
@@ -4,7 +4,7 @@ from migration.tables.content_items import migrate as migrateShout
 from migration.tables.content_item_categories import migrate as migrateTopic
 from migration.utils import DateTimeEncoder
 
-def users():
+def users(limit):
     print('migrating users...')
     data = json.loads(open('migration/data/users.json').read())
     newdata = {}
@@ -14,6 +14,8 @@ def users():
         oid = entry['_id']
         newdata[oid] = migrateUser(entry)
         counter += 1
+        if counter > limit:
+            break
     #except Exception:
     #    print(str(counter) + '/' + str(len(data)) + ' users entries were migrated')
     #    print('try to remove database first')
@@ -21,7 +23,7 @@ def users():
     print(str(counter) + ' users entries were migrated')
 
-def topics():
+def topics(limit):
     print('migrating topics...')
     data = json.loads(open('migration/data/content_item_categories.json').read())
     newdata = {}
@@ -31,47 +33,57 @@ def topics():
         oid = entry['_id']
         newdata[oid] = migrateTopic(entry)
         counter += 1
+        if counter > limit:
+            break
     except Exception:
         print(str(counter) + '/' + str(len(data)) + ' topics were migrated')
         print('try to remove database first')
     open('migration/data/topics.dict.json','w').write( json.dumps(newdata, cls=DateTimeEncoder) )
     print(str(counter) + ' topics were migrated')
 
-def shouts():
-    print('migrating shouts...')
+def shouts(limit):
+    print('loading shouts...')
     counter = 0
+    discoursAuthor = 0
     data = json.loads(open('migration/data/content_items.json').read())
     newdata = {}
-
+    print(str(len(data)) + ' entries were loaded. now migrating...')
     for entry in data:
         oid = entry['_id']
         newdata[oid] = migrateShout(entry)
         counter += 1
-        print(str(counter) + ': ' + newdata['slug'])
-        if counter > 9:
+        author = newdata[oid]['authors'][0]['slug']
+        if author == 'discours':
+            discoursAuthor += 1
+        line = str(counter) + ': ' + newdata[oid]['slug'] + " @" + author
+        print(line)
+        open('./shouts.id.log','a').write(line + '\n')
+        if counter > limit:
             break
     open('migration/data/shouts.dict.json','w').write( json.dumps(newdata, cls=DateTimeEncoder) )
     print(str(counter) + ' shouts were migrated')
+    print(str(discoursAuthor) + ' of them by unknown users')
 
 if __name__ == '__main__':
     import sys
     if len(sys.argv) > 1:
+        limit = int(sys.argv[2])
         if sys.argv[1] == "users":
-            users()
+            users(limit)
         elif sys.argv[1] == "topics":
-            topics()
+            topics(limit)
         elif sys.argv[1] == "shouts":
-            shouts()
+            shouts(limit)
         elif sys.argv[1] == "comments":
-            # comments()
+            comments(limit)
             pass
         elif sys.argv[1] == "all":
-            topics()
-            users()
-            shouts()
+            topics(limit)
+            users(limit)
+            shouts(limit)
         elif sys.argv[1] == "bson":
             from migration import bson2json
             bson2json.json_tables()
         else:
-            print('usage: python migrate.py <bson|all|users|topics|shouts>')
\ No newline at end of file
+            print('usage: python migrate.py <bson|all|users|topics|shouts|comments> <limit>')
\ No newline at end of file
diff --git a/migration/README.md b/migration/README.md
index 0752749c..594b6ab4 100644
--- a/migration/README.md
+++ b/migration/README.md
@@ -24,6 +24,7 @@ bson2json.json_tables() # creates all the needed data json from bson mongodump
 
 2. migrate users
 
 ```sh
+pipenv install
 pipenv run python migrate.py users
 ```
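Note that `migrate.py` now reads `sys.argv[2]` unconditionally, so every subcommand expects an explicit limit as its second argument. A usage sketch:

```sh
pipenv run python migrate.py users 100    # stop after roughly 100 user entries
pipenv run python migrate.py shouts 10    # smoke-test content migration on a handful of shouts
pipenv run python migrate.py all 500
```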
diff --git a/migration/bson2json.py b/migration/bson2json.py
index c2ee8f62..27f1b327 100644
--- a/migration/bson2json.py
+++ b/migration/bson2json.py
@@ -5,18 +5,16 @@ import importlib
 from utils import DateTimeEncoder
 
-data = {
-    "content_items": [],
-    "content_item_categories": [],
-    "tags": [],
-    "email_subscriptions": [],
-    "users": [],
-    "comments": []
-}
-
 def json_tables():
     print('creating json files at data/')
-
+    data = {
+        "content_items": [],
+        "content_item_categories": [],
+        "tags": [],
+        "email_subscriptions": [],
+        "users": [],
+        "comments": []
+    }
     for table in data.keys():
         lc = []
         with open('data/'+table+'.bson', 'rb') as f:
@@ -27,4 +25,5 @@ def json_tables():
             lc.append(d)
         data[table] = lc
         open('data/'+table+'.json', 'w').write(json.dumps(lc,cls=DateTimeEncoder))
+    return data
diff --git a/migration/html2md.py b/migration/html2md.py
index 8846d39b..d2f23e1c 100644
--- a/migration/html2md.py
+++ b/migration/html2md.py
@@ -24,6 +24,7 @@ class Converter(HTMLParser):
         self.md_file = ''
         self.code_box = False
         self.div_count = 0
+        self.span_count = 0
         self.code_box_div_num = 0
         self.ol_count = 0
         self.temp_tag = ''
@@ -37,8 +38,23 @@ class Converter(HTMLParser):
     def handle_starttag(self, tag, attrs):
         if self.ignore_data:
             return None
-        elif tag == 'br':
-            self.md_file += '  \n'
+        elif tag == 'sup':
+            self.md_file += ''
+        elif tag == 'p':
+            self.temp_tag = 'p'
+            self.md_file += '\n'
+        elif tag == 'i':
+            self.temp_tag = 'i'
+            self.md_file += '*'
+        elif tag == 'wbr':
+            self.temp_tag = 'wbr'
+            self.md_file += ''
+        elif tag == 'span':
+            self.temp_tag = 'span'
+            self.span_count += 1
+            self.md_file += ' '
+        elif tag == 'figcaption':
+            self.md_file += ''
         elif tag == 'hr':
             self.md_file += '\n*** \n'
         elif tag == 'title':
@@ -74,7 +90,7 @@ class Converter(HTMLParser):
             elif 'class' in attrs_dict:
                 self.class_div_count = self.div_count
                 self.ignore_div = True
-        elif tag == 'en-codeblock':
+        elif tag == 'pre' or tag == 'code':
             self.code_box = True
             self.md_file += '\n```\n'
         elif tag == 'a':
@@ -94,7 +110,7 @@ class Converter(HTMLParser):
         elif tag == 'img':
             attrs_dict = dict(attrs)
             img_ref = attrs_dict['src']
-            alt_name = attrs_dict['alt'] if 'alt' in attrs_dict else 'Placeholder'
+            alt_name = attrs_dict['alt'] if 'alt' in attrs_dict else 'x'
             if self.is_link:
                 self.related_data.append(img_ref)
                 self.md_file += f'[![{alt_name}]({img_ref})]({self.link_ref})'
@@ -104,6 +120,8 @@ class Converter(HTMLParser):
         elif tag == 'table':
             self.ignore_data = True
             self.table_start = self.getpos()
+        else:
+            print('<' + tag + '>')
 
     def get_rawdata(self, start, stop, offset):
         temp_rawdata = self.rawdata
@@ -114,7 +132,32 @@ class Converter(HTMLParser):
 
     def handle_endtag(self, tag):
         if tag == 'b' or tag == 'strong':
-            self.md_file += '** \n'
+            self.md_file += '** '
+        elif tag == 'sup':
+            self.md_file += ''
+        elif tag == 'iframe':
+            self.ignore_data = False
+        elif tag == 'wbr':
+            self.md_file += ''
+        elif tag == 'title':
+            self.md_file += '\n'
+        elif tag == 'h1':
+            self.md_file += '\n'
+        elif tag == 'h2':
+            self.md_file += '\n'
+        elif tag == 'h3':
+            self.md_file += '\n'
+        elif tag == 'h4':
+            self.md_file += '\n'
+        elif tag == 'span':
+            self.span_count -= 1
+            self.md_file += ' '
+        elif tag == 'figcaption':
+            self.md_file += '\n'
+        elif tag == 'i':
+            self.md_file += '* '
+        elif tag == 'p':
+            self.md_file += '\n'
         elif tag == 'div':
             if self.code_box and self.code_box_div_num == self.div_count:
                 self.code_box = False
@@ -124,7 +167,7 @@ class Converter(HTMLParser):
             else:
                 self.md_file += '  \n'
             self.div_count -= 1
-        elif tag == 'en-codeblock':
+        elif tag == 'pre' or tag == 'code':
             self.code_box = False
             self.md_file += '```\n'
         elif tag == 'a':
@@ -144,18 +187,24 @@ class Converter(HTMLParser):
             raw_data = self.get_rawdata(lineno_start, lineno_stop, offset)
             self.md_file += '\n' + raw_data
             self.ignore_data = False
+        else:
+            print('</' + tag + '>')
 
     def handle_startendtag(self, tag, attrs):
         if tag == 'br':
             self.md_file += '  \n'
+        elif tag == 'wbr':
+            self.md_file += ''
         elif tag == 'hr':
             self.md_file += '\n*** \n'
         elif tag == 'img':
             attr_dict = dict(attrs)
-            name = attr_dict['data-filename']
+            name = attr_dict.get('data-filename', 'image')
             img_ref = attr_dict['src']
             self.related_data.append(img_ref)
             self.md_file += f'![{name}]({img_ref})'
+        else:
+            print("<" + tag + " />")
 
     def handle_data(self, data):
         if self.is_link:
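The hand-rolled `Converter` above is effectively superseded by the vendored `html2text` module added below; the migration tables now call its module-level helper instead of `markdown.feed()`. A minimal sketch of the new conversion path (sample markup is illustrative):

```python
from migration.html2text import html2text

html = '<h1>Title</h1><p>Hello <i>world</i>, see <a href="https://discours.io">discours.io</a>.</p>'
md = html2text(html)  # Markdown-structured text, hard-wrapped at 78 columns by default
print(md)
```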
diff --git a/migration/html2text.py b/migration/html2text.py
new file mode 100644
index 00000000..4bdb8c77
--- /dev/null
+++ b/migration/html2text.py
@@ -0,0 +1,914 @@
+#!/usr/bin/env python
+"""html2text: Turn HTML into equivalent Markdown-structured text."""
+__version__ = "3.200.3"
+__author__ = "Aaron Swartz (me@aaronsw.com)"
+__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
+__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
+
+# TODO:
+#   Support decoded entities with unifiable.
+
+try:
+    True
+except NameError:
+    setattr(__builtins__, 'True', 1)
+    setattr(__builtins__, 'False', 0)
+
+def has_key(x, y):
+    if hasattr(x, 'has_key'): return x.has_key(y)
+    else: return y in x
+
+try:
+    import htmlentitydefs
+    import urlparse
+    import HTMLParser
+except ImportError: #Python3
+    import html.entities as htmlentitydefs
+    import urllib.parse as urlparse
+    import html.parser as HTMLParser
+try: #Python3
+    import urllib.request as urllib
+except:
+    import urllib
+import optparse, re, sys, codecs, types
+
+try: from textwrap import wrap
+except: pass
+
+# Use Unicode characters instead of their ascii pseudo-replacements
+UNICODE_SNOB = 0
+
+# Escape all special characters.  Output is less readable, but avoids corner case formatting issues.
+ESCAPE_SNOB = 0
+
+# Put the links after each paragraph instead of at the end.
+LINKS_EACH_PARAGRAPH = 0
+
+# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
+BODY_WIDTH = 78
+
+# Don't show internal links (href="#local-anchor") -- corresponding link targets
+# won't be visible in the plain text file anyway.
+SKIP_INTERNAL_LINKS = True
+
+# Use inline, rather than reference, formatting for images and links
+INLINE_LINKS = True
+
+# Number of pixels Google indents nested lists
+GOOGLE_LIST_INDENT = 36
+
+IGNORE_ANCHORS = False
+IGNORE_IMAGES = False
+IGNORE_EMPHASIS = False
+
+### Entity Nonsense ###
+
+def name2cp(k):
+    if k == 'apos': return ord("'")
+    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
+        return htmlentitydefs.name2codepoint[k]
+    else:
+        k = htmlentitydefs.entitydefs[k]
+        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
+        return ord(codecs.latin_1_decode(k)[0])
+
+unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
+'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
+'ndash':'-', 'oelig':'oe', 'aelig':'ae',
+'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
+'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
+'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
+'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
+'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
+'lrm':'', 'rlm':''}
+
+unifiable_n = {}
+
+for k in unifiable.keys():
+    unifiable_n[name2cp(k)] = unifiable[k]
+
+### End Entity Nonsense ###
+
+def onlywhite(line):
+    """Return true if the line does only consist of whitespace characters."""
+    for c in line:
+        if c != ' ' and c != '  ':
+            return c == ' '
+    return line
+
+def hn(tag):
+    if tag[0] == 'h' and len(tag) == 2:
+        try:
+            n = int(tag[1])
+            if n in range(1, 10): return n
+        except ValueError: return 0
+
+def dumb_property_dict(style):
+    """returns a hash of css attributes"""
+    return dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]]);
+
+def dumb_css_parser(data):
+    """returns a hash of css selectors, each of which contains a hash of css attributes"""
+    # remove @import sentences
+    data += ';'
+    importIndex = data.find('@import')
+    while importIndex != -1:
+        data = data[0:importIndex] + data[data.find(';', importIndex) + 1:]
+        importIndex = data.find('@import')
+
+    # parse the css. reverted from dictionary comprehension in order to support older pythons
+    elements = [x.split('{') for x in data.split('}') if '{' in x.strip()]
+    try:
+        elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements])
+    except ValueError:
+        elements = {} # not that important
+
+    return elements
+
+def element_style(attrs, style_def, parent_style):
+    """returns a hash of the 'final' style attributes of the element"""
+    style = parent_style.copy()
+    if 'class' in attrs:
+        for css_class in attrs['class'].split():
+            css_style = style_def['.' + css_class]
+            style.update(css_style)
+    if 'style' in attrs:
+        immediate_style = dumb_property_dict(attrs['style'])
+        style.update(immediate_style)
+    return style
+
+def google_list_style(style):
+    """finds out whether this is an ordered or unordered list"""
+    if 'list-style-type' in style:
+        list_style = style['list-style-type']
+        if list_style in ['disc', 'circle', 'square', 'none']:
+            return 'ul'
+    return 'ol'
+
+def google_has_height(style):
+    """check if the style of the element has the 'height' attribute explicitly defined"""
+    if 'height' in style:
+        return True
+    return False
+
+def google_text_emphasis(style):
+    """return a list of all emphasis modifiers of the element"""
+    emphasis = []
+    if 'text-decoration' in style:
+        emphasis.append(style['text-decoration'])
+    if 'font-style' in style:
+        emphasis.append(style['font-style'])
+    if 'font-weight' in style:
+        emphasis.append(style['font-weight'])
+    return emphasis
+
+def google_fixed_width_font(style):
+    """check if the css of the current element defines a fixed width font"""
+    font_family = ''
+    if 'font-family' in style:
+        font_family = style['font-family']
+    if 'Courier New' == font_family or 'Consolas' == font_family:
+        return True
+    return False
+
+def list_numbering_start(attrs):
+    """extract numbering from list element attributes"""
+    if 'start' in attrs:
+        return int(attrs['start']) - 1
+    else:
+        return 0
+
+class HTML2Text(HTMLParser.HTMLParser):
+    def __init__(self, out=None, baseurl=''):
+        HTMLParser.HTMLParser.__init__(self)
+
+        # Config options
+        self.unicode_snob = UNICODE_SNOB
+        self.escape_snob = ESCAPE_SNOB
+        self.links_each_paragraph = LINKS_EACH_PARAGRAPH
+        self.body_width = BODY_WIDTH
+        self.skip_internal_links = SKIP_INTERNAL_LINKS
+        self.inline_links = INLINE_LINKS
+        self.google_list_indent = GOOGLE_LIST_INDENT
+        self.ignore_links = IGNORE_ANCHORS
+        self.ignore_images = IGNORE_IMAGES
+        self.ignore_emphasis = IGNORE_EMPHASIS
+        self.google_doc = False
+        self.ul_item_mark = '*'
+        self.emphasis_mark = '_'
+        self.strong_mark = '**'
+
+        if out is None:
+            self.out = self.outtextf
+        else:
+            self.out = out
+
+        self.outtextlist = [] # empty list to store output characters before they are "joined"
+
+        try:
+            self.outtext = unicode()
+        except NameError: # Python3
+            self.outtext = str()
+
+        self.quiet = 0
+        self.p_p = 0 # number of newline character to print before next output
+        self.outcount = 0
+        self.start = 1
+        self.space = 0
+        self.a = []
+        self.astack = []
+        self.maybe_automatic_link = None
+        self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
+        self.acount = 0
+        self.list = []
+        self.blockquote = 0
+        self.pre = 0
+        self.startpre = 0
+        self.code = False
+        self.br_toggle = ''
+        self.lastWasNL = 0
+        self.lastWasList = False
+        self.style = 0
+        self.style_def = {}
+        self.tag_stack = []
+        self.emphasis = 0
+        self.drop_white_space = 0
+        self.inheader = False
+        self.abbr_title = None # current abbreviation definition
+        self.abbr_data = None # last inner HTML (for abbr being defined)
+        self.abbr_list = {} # stack of abbreviations to write later
+        self.baseurl = baseurl
+
+        try: del unifiable_n[name2cp('nbsp')]
+        except KeyError: pass
+        unifiable['nbsp'] = '&nbsp_place_holder;'
+
+
+    def feed(self, data):
+        data = data.replace("</' + 'script>", "</ignore>")
+        HTMLParser.HTMLParser.feed(self, data)
+
+    def handle(self, data):
+        self.feed(data)
+        self.feed("")
+        return self.optwrap(self.close())
+
+    def outtextf(self, s):
+        self.outtextlist.append(s)
+        if s: self.lastWasNL = s[-1] == '\n'
+
+    def close(self):
+        HTMLParser.HTMLParser.close(self)
+
+        self.pbr()
+        self.o('', 0, 'end')
+
+        self.outtext = self.outtext.join(self.outtextlist)
+        if self.unicode_snob:
+            nbsp = unichr(name2cp('nbsp'))
+        else:
+            nbsp = u' '
+        self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)
+
+        return self.outtext
+
+    def handle_charref(self, c):
+        self.o(self.charref(c), 1)
+
+    def handle_entityref(self, c):
+        self.o(self.entityref(c), 1)
+
+    def handle_starttag(self, tag, attrs):
+        self.handle_tag(tag, attrs, 1)
+
+    def handle_endtag(self, tag):
+        self.handle_tag(tag, None, 0)
+
+    def previousIndex(self, attrs):
+        """ returns the index of certain set of attributes (of a link) in the
+            self.a list
+
+            If the set of attributes is not found, returns None
+        """
+        if not has_key(attrs, 'href'): return None
+
+        i = -1
+        for a in self.a:
+            i += 1
+            match = 0
+
+            if has_key(a, 'href') and a['href'] == attrs['href']:
+                if has_key(a, 'title') or has_key(attrs, 'title'):
+                    if (has_key(a, 'title') and has_key(attrs, 'title') and
+                        a['title'] == attrs['title']):
+                        match = True
+                else:
+                    match = True
+
+            if match: return i
+
+    def drop_last(self, nLetters):
+        if not self.quiet:
+            self.outtext = self.outtext[:-nLetters]
+
+    def handle_emphasis(self, start, tag_style, parent_style):
+        """handles various text emphases"""
+        tag_emphasis = google_text_emphasis(tag_style)
+        parent_emphasis = google_text_emphasis(parent_style)
+
+        # handle Google's text emphasis
+        strikethrough = 'line-through' in tag_emphasis and self.hide_strikethrough
+        bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
+        italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
+        fixed = google_fixed_width_font(tag_style) and not \
+                google_fixed_width_font(parent_style) and not self.pre
+
+        if start:
+            # crossed-out text must be handled before other attributes
+            # in order not to output qualifiers unnecessarily
+            if bold or italic or fixed:
+                self.emphasis += 1
+            if strikethrough:
+                self.quiet += 1
+            if italic:
+                self.o(self.emphasis_mark)
+                self.drop_white_space += 1
+            if bold:
+                self.o(self.strong_mark)
+                self.drop_white_space += 1
+            if fixed:
+                self.o('`')
+                self.drop_white_space += 1
+                self.code = True
+        else:
+            if bold or italic or fixed:
+                # there must not be whitespace before closing emphasis mark
+                self.emphasis -= 1
+                self.space = 0
+                self.outtext = self.outtext.rstrip()
+            if fixed:
+                if self.drop_white_space:
+                    # empty emphasis, drop it
+                    self.drop_last(1)
+                    self.drop_white_space -= 1
+                else:
+                    self.o('`')
+                self.code = False
+            if bold:
+                if self.drop_white_space:
+                    # empty emphasis, drop it
+                    self.drop_last(2)
+                    self.drop_white_space -= 1
+                else:
+                    self.o(self.strong_mark)
+            if italic:
+                if self.drop_white_space:
+                    # empty emphasis, drop it
+                    self.drop_last(1)
+                    self.drop_white_space -= 1
+                else:
+                    self.o(self.emphasis_mark)
+            # space is only allowed after *all* emphasis marks
+            if (bold or italic) and not self.emphasis:
+                self.o(" ")
+            if strikethrough:
+                self.quiet -= 1
+
+    def handle_tag(self, tag, attrs, start):
+        #attrs = fixattrs(attrs)
+        if attrs is None:
+            attrs = {}
+        else:
+            attrs = dict(attrs)
+
+        if self.google_doc:
+            # the attrs parameter is empty for a closing tag. in addition, we
+            # need the attributes of the parent nodes in order to get a
+            # complete style description for the current element. we assume
+            # that google docs export well formed html.
+            parent_style = {}
+            if start:
+                if self.tag_stack:
+                    parent_style = self.tag_stack[-1][2]
+                tag_style = element_style(attrs, self.style_def, parent_style)
+                self.tag_stack.append((tag, attrs, tag_style))
+            else:
+                dummy, attrs, tag_style = self.tag_stack.pop()
+                if self.tag_stack:
+                    parent_style = self.tag_stack[-1][2]
+
+        if hn(tag):
+            self.p()
+            if start:
+                self.inheader = True
+                self.o(hn(tag)*"#" + ' ')
+            else:
+                self.inheader = False
+                return # prevent redundant emphasis marks on headers
+
+        if tag in ['p', 'div']:
+            if self.google_doc:
+                if start and google_has_height(tag_style):
+                    self.p()
+                else:
+                    self.soft_br()
+            else:
+                self.p()
+
+        if tag == "br" and start: self.o("  \n")
+
+        if tag == "hr" and start:
+            self.p()
+            self.o("* * *")
+            self.p()
+
+        if tag in ["head", "style", 'script']:
+            if start: self.quiet += 1
+            else: self.quiet -= 1
+
+        if tag == "style":
+            if start: self.style += 1
+            else: self.style -= 1
+
+        if tag in ["body"]:
+            self.quiet = 0 # sites like 9rules.com never close <head>
+
+        if tag == "blockquote":
+            if start:
+                self.p(); self.o('> ', 0, 1); self.start = 1
+                self.blockquote += 1
+            else:
+                self.blockquote -= 1
+                self.p()
+
+        if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: self.o(self.emphasis_mark)
+        if tag in ['strong', 'b'] and not self.ignore_emphasis: self.o(self.strong_mark)
+        if tag in ['del', 'strike', 's']:
+            if start:
+                self.o("<"+tag+">")
+            else:
+                self.o("</"+tag+">")
+
+        if self.google_doc:
+            if not self.inheader:
+                # handle some font attributes, but leave headers clean
+                self.handle_emphasis(start, tag_style, parent_style)
+
+        if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` ``
+        if tag == "abbr":
+            if start:
+                self.abbr_title = None
+                self.abbr_data = ''
+                if has_key(attrs, 'title'):
+                    self.abbr_title = attrs['title']
+            else:
+                if self.abbr_title != None:
+                    self.abbr_list[self.abbr_data] = self.abbr_title
+                    self.abbr_title = None
+                self.abbr_data = ''
+
+        if tag == "a" and not self.ignore_links:
+            if start:
+                if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')):
+                    self.astack.append(attrs)
+                    self.maybe_automatic_link = attrs['href']
+                else:
+                    self.astack.append(None)
+            else:
+                if self.astack:
+                    a = self.astack.pop()
+                    if self.maybe_automatic_link:
+                        self.maybe_automatic_link = None
+                    elif a:
+                        if self.inline_links:
+                            self.o("](" + escape_md(a['href']) + ")")
+                        else:
+                            i = self.previousIndex(a)
+                            if i is not None:
+                                a = self.a[i]
+                            else:
+                                self.acount += 1
+                                a['count'] = self.acount
+                                a['outcount'] = self.outcount
+                                self.a.append(a)
+                            self.o("][" + str(a['count']) + "]")
+
+        if tag == "img" and start and not self.ignore_images:
+            if has_key(attrs, 'src'):
+                attrs['href'] = attrs['src']
+                alt = attrs.get('alt', '')
+                self.o("![" + escape_md(alt) + "]")
+
+                if self.inline_links:
+                    self.o("(" + escape_md(attrs['href']) + ")")
+                else:
+                    i = self.previousIndex(attrs)
+                    if i is not None:
+                        attrs = self.a[i]
+                    else:
+                        self.acount += 1
+                        attrs['count'] = self.acount
+                        attrs['outcount'] = self.outcount
+                        self.a.append(attrs)
+                    self.o("[" + str(attrs['count']) + "]")
+
+        if tag == 'dl' and start: self.p()
+        if tag == 'dt' and not start: self.pbr()
+        if tag == 'dd' and start: self.o('    ')
+        if tag == 'dd' and not start: self.pbr()
+
+        if tag in ["ol", "ul"]:
+            # Google Docs create sub lists as top level lists
+            if (not self.list) and (not self.lastWasList):
+                self.p()
+            if start:
+                if self.google_doc:
+                    list_style = google_list_style(tag_style)
+                else:
+                    list_style = tag
+                numbering_start = list_numbering_start(attrs)
+                self.list.append({'name':list_style, 'num':numbering_start})
+            else:
+                if self.list: self.list.pop()
+            self.lastWasList = True
+        else:
+            self.lastWasList = False
+
+        if tag == 'li':
+            self.pbr()
+            if start:
+                if self.list: li = self.list[-1]
+                else: li = {'name':'ul', 'num':0}
+                if self.google_doc:
+                    nest_count = self.google_nest_count(tag_style)
+                else:
+                    nest_count = len(self.list)
+                self.o("  " * nest_count) #TODO: line up <ol><li>s > 9 correctly.
+                if li['name'] == "ul": self.o(self.ul_item_mark + " ")
+                elif li['name'] == "ol":
+                    li['num'] += 1
+                    self.o(str(li['num'])+". ")
+                self.start = 1
+
+        if tag in ["table", "tr"] and start: self.p()
+        if tag == 'td': self.pbr()
+
+        if tag == "pre":
+            if start:
+                self.startpre = 1
+                self.pre = 1
+            else:
+                self.pre = 0
+                self.p()
+
+    def pbr(self):
+        if self.p_p == 0:
+            self.p_p = 1
+
+    def p(self):
+        self.p_p = 2
+
+    def soft_br(self):
+        self.pbr()
+        self.br_toggle = '  '
+
+    def o(self, data, puredata=0, force=0):
+        if self.abbr_data is not None:
+            self.abbr_data += data
+
+        if not self.quiet:
+            if self.google_doc:
+                # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
+                lstripped_data = data.lstrip()
+                if self.drop_white_space and not (self.pre or self.code):
+                    data = lstripped_data
+                if lstripped_data != '':
+                    self.drop_white_space = 0
+
+            if puredata and not self.pre:
+                data = re.sub('\s+', ' ', data)
+                if data and data[0] == ' ':
+                    self.space = 1
+                    data = data[1:]
+            if not data and not force: return
+
+            if self.startpre:
+                #self.out(" :") #TODO: not output when already one there
+                if not data.startswith("\n"): # <pre>stuff...
+                    data = "\n" + data
+
+            bq = (">" * self.blockquote)
+            if not (force and data and data[0] == ">") and self.blockquote: bq += " "
+
+            if self.pre:
+                if not self.list:
+                    bq += "    "
+                #else: list content is already partially indented
+                for i in range(len(self.list)):
+                    bq += "    "
+                data = data.replace("\n", "\n"+bq)
+
+            if self.startpre:
+                self.startpre = 0
+                if self.list:
+                    data = data.lstrip("\n") # use existing initial indentation
+
+            if self.start:
+                self.space = 0
+                self.p_p = 0
+                self.start = 0
+
+            if force == 'end':
+                # It's the end.
+                self.p_p = 0
+                self.out("\n")
+                self.space = 0
+
+            if self.p_p:
+                self.out((self.br_toggle+'\n'+bq)*self.p_p)
+                self.space = 0
+                self.br_toggle = ''
+
+            if self.space:
+                if not self.lastWasNL: self.out(' ')
+                self.space = 0
+
+            if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
+                if force == "end": self.out("\n")
+
+                newa = []
+                for link in self.a:
+                    if self.outcount > link['outcount']:
+                        self.out("   ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
+                        if has_key(link, 'title'): self.out(" ("+link['title']+")")
+                        self.out("\n")
+                    else:
+                        newa.append(link)
+
+                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
+
+                self.a = newa
+
+            if self.abbr_list and force == "end":
+                for abbr, definition in self.abbr_list.items():
+                    self.out("  *[" + abbr + "]: " + definition + "\n")
+
+            self.p_p = 0
+            self.out(data)
+            self.outcount += 1
+
+    def handle_data(self, data):
+        if r'\/script>' in data: self.quiet -= 1
+
+        if self.style:
+            self.style_def.update(dumb_css_parser(data))
+
+        if not self.maybe_automatic_link is None:
+            href = self.maybe_automatic_link
+            if href == data and self.absolute_url_matcher.match(href):
+                self.o("<" + data + ">")
+                return
+            else:
+                self.o("[")
+                self.maybe_automatic_link = None
+
+        if not self.code and not self.pre:
+            data = escape_md_section(data, snob=self.escape_snob)
+        self.o(data, 1)
+
+    def unknown_decl(self, data): pass
+
+    def charref(self, name):
+        if name[0] in ['x','X']:
+            c = int(name[1:], 16)
+        else:
+            c = int(name)
+
+        if not self.unicode_snob and c in unifiable_n.keys():
+            return unifiable_n[c]
+        else:
+            try:
+                return unichr(c)
+            except NameError: #Python3
+                return chr(c)
+
+    def entityref(self, c):
+        if not self.unicode_snob and c in unifiable.keys():
+            return unifiable[c]
+        else:
+            try: name2cp(c)
+            except KeyError: return "&" + c + ';'
+            else:
+                try:
+                    return unichr(name2cp(c))
+                except NameError: #Python3
+                    return chr(name2cp(c))
+
+    def replaceEntities(self, s):
+        s = s.group(1)
+        if s[0] == "#":
+            return self.charref(s[1:])
+        else: return self.entityref(s)
+
+    r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
+    def unescape(self, s):
+        return self.r_unescape.sub(self.replaceEntities, s)
+
+    def google_nest_count(self, style):
+        """calculate the nesting count of google doc lists"""
+        nest_count = 0
+        if 'margin-left' in style:
+            nest_count = int(style['margin-left'][:-2]) / self.google_list_indent
+        return nest_count
+
+
+    def optwrap(self, text):
+        """Wrap all paragraphs in the provided text."""
+        if not self.body_width:
+            return text
+
+        assert wrap, "Requires Python 2.3."
+        result = ''
+        newlines = 0
+        for para in text.split("\n"):
+            if len(para) > 0:
+                if not skipwrap(para):
+                    result += "\n".join(wrap(para, self.body_width))
+                    if para.endswith('  '):
+                        result += "  \n"
+                        newlines = 1
+                    else:
+                        result += "\n\n"
+                        newlines = 2
+                else:
+                    if not onlywhite(para):
+                        result += para + "\n"
+                        newlines = 1
+            else:
+                if newlines < 2:
+                    result += "\n"
+                    newlines += 1
+        return result
+
+ordered_list_matcher = re.compile(r'\d+\.\s')
+unordered_list_matcher = re.compile(r'[-\*\+]\s')
+md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
+md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
+md_dot_matcher = re.compile(r"""
+    ^             # start of line
+    (\s*\d+)      # optional whitespace and a number
+    (\.)          # dot
+    (?=\s)        # lookahead assert whitespace
+    """, re.MULTILINE | re.VERBOSE)
+md_plus_matcher = re.compile(r"""
+    ^
+    (\s*)
+    (\+)
+    (?=\s)
+    """, flags=re.MULTILINE | re.VERBOSE)
+md_dash_matcher = re.compile(r"""
+    ^
+    (\s*)
+    (-)
+    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
+                  # or another dash (header or hr)
+    """, flags=re.MULTILINE | re.VERBOSE)
+slash_chars = r'\`*_{}[]()#+-.!'
+md_backslash_matcher = re.compile(r'''
+    (\\)          # match one slash
+    (?=[%s])      # followed by a char that requires escaping
+    ''' % re.escape(slash_chars),
+    flags=re.VERBOSE)
+
+def skipwrap(para):
+    # If the text begins with four spaces or one tab, it's a code block; don't wrap
+    if para[0:4] == '    ' or para[0] == '\t':
+        return True
+    # If the text begins with only two "--", possibly preceded by whitespace, that's
+    # an emdash; so wrap.
+    stripped = para.lstrip()
+    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
+        return False
+    # I'm not sure what this is for; I thought it was to detect lists, but there's
+    # a <br> -inside- case in one of the tests that also depends upon it.
+    if stripped[0:1] == '-' or stripped[0:1] == '*':
+        return True
+    # If the text begins with a single -, *, or +, followed by a space, or an integer,
+    # followed by a ., followed by a space (in either case optionally preceded by
+    # whitespace), it's a list; don't wrap.
+    if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped):
+        return True
+    return False
+
+def wrapwrite(text):
+    text = text.encode('utf-8')
+    try: #Python3
+        sys.stdout.buffer.write(text)
+    except AttributeError:
+        sys.stdout.write(text)
+
+def html2text(html, baseurl=''):
+    h = HTML2Text(baseurl=baseurl)
+    return h.handle(html)
+
+def unescape(s, unicode_snob=False):
+    h = HTML2Text()
+    h.unicode_snob = unicode_snob
+    return h.unescape(s)
+
+def escape_md(text):
+    """Escapes markdown-sensitive characters within other markdown constructs."""
+    return md_chars_matcher.sub(r"\\\1", text)
+
+def escape_md_section(text, snob=False):
+    """Escapes markdown-sensitive characters across whole document sections."""
+    text = md_backslash_matcher.sub(r"\\\1", text)
+    if snob:
+        text = md_chars_matcher_all.sub(r"\\\1", text)
+    text = md_dot_matcher.sub(r"\1\\\2", text)
+    text = md_plus_matcher.sub(r"\1\\\2", text)
+    text = md_dash_matcher.sub(r"\1\\\2", text)
+    return text
+
+
+def main():
+    baseurl = ''
+
+    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
+                              version='%prog ' + __version__)
+    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
+        default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
+    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
+        default=IGNORE_ANCHORS, help="don't include any formatting for links")
+    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
+        default=IGNORE_IMAGES, help="don't include any formatting for images")
+    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
+        default=False, help="convert an html-exported Google Document")
+    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
+        default=False, help="use a dash rather than a star for unordered list items")
+    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
+        default=False, help="use an asterisk rather than an underscore for emphasized text")
+    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
+        default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
+    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
+        default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
+    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
+        default=False, help="hide strike-through text. only relevant when -g is specified as well")
+    p.add_option("--escape-all", action="store_true", dest="escape_snob",
+        default=False, help="Escape all special characters.  Output is less readable, but avoids corner case formatting issues.")
+    (options, args) = p.parse_args()
+
+    # process input
+    encoding = "utf-8"
+    if len(args) > 0:
+        file_ = args[0]
+        if len(args) == 2:
+            encoding = args[1]
+        if len(args) > 2:
+            p.error('Too many arguments')
+
+        if file_.startswith('http://') or file_.startswith('https://'):
+            baseurl = file_
+            j = urllib.urlopen(baseurl)
+            data = j.read()
+            if encoding is None:
+                try:
+                    from feedparser import _getCharacterEncoding as enc
+                except ImportError:
+                    enc = lambda x, y: ('utf-8', 1)
+                encoding = enc(j.headers, data)[0]
+                if encoding == 'us-ascii':
+                    encoding = 'utf-8'
+        else:
+            data = open(file_, 'rb').read()
+            if encoding is None:
+                try:
+                    from chardet import detect
+                except ImportError:
+                    detect = lambda x: {'encoding': 'utf-8'}
+                encoding = detect(data)['encoding']
+    else:
+        data = sys.stdin.read()
+
+    data = data.decode(encoding)
+    h = HTML2Text(baseurl=baseurl)
+    # handle options
+    if options.ul_style_dash: h.ul_item_mark = '-'
+    if options.em_style_asterisk:
+        h.emphasis_mark = '*'
+        h.strong_mark = '__'
+
+    h.body_width = options.body_width
+    h.list_indent = options.list_indent
+    h.ignore_emphasis = options.ignore_emphasis
+    h.ignore_links = options.ignore_links
+    h.ignore_images = options.ignore_images
+    h.google_doc = options.google_doc
+    h.hide_strikethrough = options.hide_strikethrough
+    h.escape_snob = options.escape_snob
+
+    wrapwrite(h.handle(data))
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
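Besides the module-level `html2text()` helper, the `HTML2Text` class can be configured directly when the defaults (78-column wrapping, inline links, images kept) are not wanted, mirroring what `main()` does above; a small sketch:

```python
from migration.html2text import HTML2Text

h = HTML2Text()
h.body_width = 0        # disable hard wrapping of output lines
h.ignore_images = True  # drop image markup entirely
md = h.handle('<p>Some <b>bold</b> text and <img src="pic.png" alt="a picture"/></p>')
print(md)  # prints 'Some **bold** text and'; the image is omitted
```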
diff --git a/migration/tables/comments.py b/migration/tables/comments.py
index 1eb92621..4d1b6044 100644
--- a/migration/tables/comments.py
+++ b/migration/tables/comments.py
@@ -1,10 +1,11 @@
-from html2md import Converter
+# from html2md import Converter
+from migration.html2text import html2text
 import datetime
 
-markdown = Converter()
+# markdown = Converter()
 
 def migrate(entry):
-    ```
+    '''
     # is comment
     type Shout {
         org: String!
@@ -25,12 +26,12 @@ def migrate(entry):
         visibleForRoles: [String] # role ids are strings
         visibleForUsers: [Int]
     }
-    ```
+    '''
     # TODO: implement comments migration
     return {
         'org': 'discours.io',
         'slug': entry['slug'],
         'createdAt': entry['createdAt'],
-        'body': markdown.feed(entry['body']),
+        'body': html2text(entry['body']),
         'replyTo': entry['']
     }
\ No newline at end of file
diff --git a/migration/tables/content_items.py b/migration/tables/content_items.py
index 8c581f49..fee88d56 100644
--- a/migration/tables/content_items.py
+++ b/migration/tables/content_items.py
@@ -1,14 +1,16 @@
-from migration.html2md import Converter
+# from migration.html2md import Converter
 from dateutil.parser import parse
 from os.path import abspath
 import frontmatter
 import json
 from orm import Shout
+from bs4 import BeautifulSoup
+from migration.html2text import html2text
 
 users_dict = json.loads(open(abspath('migration/data/users.dict.json')).read())
-users_dict['0'] = {'id': 99999 }
+users_dict['0'] = {'id': 9999999, 'slug': 'discours', 'viewname': 'Дискурс' }
 
-markdown = Converter()
+# markdown = Converter()
 
 type2layout = {
     'Article': 'article',
@@ -18,7 +20,7 @@ type2layout = {
     'Image': 'image'
 }
 
-def migrate(entry):
+def migrate(entry, data=users_dict):
     '''
     type Shout {
         org_id: Int!
@@ -43,27 +45,37 @@ def migrate(entry):
         views: Int
     }
     '''
+    try:
+        author = data[entry['createdBy']]
+    except KeyError:
+        author = data['0']
+
+    # print(author)
     r = {
         'org_id': 0,
         'layout': type2layout[entry['type']],
         'title': entry['title'],
-        'authors': [ users_dict[entry['createdBy']]['id'], ],
+        'authors': [ { 'slug': author['slug'], 'name': author['viewname'], 'pic': author.get('userpic', '') }, ],
         'topics': [],
         'published': entry['published'],
         'views': entry['views'],
         'rating': entry['rating'],
         'ratings': []
     }
-    r['slug'] = entry.get('slug')
+    r['slug'] = entry.get('slug', '')
+    body_orig = entry.get('body', '')
     if not r['slug'] and entry.get('friendlySlugs') is not None:
         r['slug'] = entry['friendlySlugs']['slug'][0]['slug']
         if(r['slug'] is None):
             r['slug'] = entry['friendlySlugs'][0]['slug']
+    if not r['slug']:
+        print('NO SLUG ERROR')
+        # print(entry)
+        raise Exception
     if entry.get('image') is not None:
         r['cover'] = entry['image']['url']
-    elif entry.get('thumborId') is not None:
-        r['cover'] = 'https://discours.io/' + entry['thumborId']
-
+    if entry.get('thumborId') is not None:
+        r['cover'] = 'https://assets.discours.io/unsafe/1600x/' + entry['thumborId']
     if entry.get('publishedAt') is not None:
         r['publishedAt'] = entry['publishedAt']
     if entry.get('createdAt') is not None:
@@ -71,20 +83,60 @@ def migrate(entry):
     if entry.get('updatedAt') is not None:
         r['updatedAt'] = entry['updatedAt']
     if entry.get('type') == 'Literature':
-        r['body'] = markdown.feed(entry['media'][0]['literatureBody'])
+        media = entry.get('media', '')
+        # print(media[0]['literatureBody'])
+        if type(media) == list:
+            body_orig = media[0].get('literatureBody', '')
+            if body_orig == '':
+                print('EMPTY BODY!')
+            else:
+                # body_html = str(BeautifulSoup(body_orig, features="html.parser"))
+                # markdown.feed(body_html)
+                body = html2text(body_orig).replace('****', '**')
+                r['body'] = body
+                # r['body2'] = markdown.md_file
+        else:
+            print(r['slug'] + ': literature has no media')
     elif entry.get('type') == 'Video':
-        r['body'] = ''
+        m = entry['media'][0]
+        yt = m.get('youtubeId', '')
+        vm = m.get('vimeoId', '')
+        videoUrl = 'https://www.youtube.com/watch?v=' + yt if yt else '#'
+        if videoUrl == '#':
+            videoUrl = 'https://vimeo.com/' + vm if vm else '#'
+        if videoUrl == '#':
+            print(m)
+            # raise Exception
+        r['body'] = '' + html2text(m.get('body', '')) # FIXME
     elif entry.get('type') == 'Music':
-        r['body'] = ''
-    elif entry.get('body') is not None:
-        r['body'] = markdown.feed(entry['body'])
-    else:
-        r['body'] = '## ' + r['title']
-    body = r['body']
-    del r['body']
-    metadata = frontmatter.dumps(r)
-    open('migration/content/' + r['slug'] + '.md', 'w').write(metadata + '\n' + body)
-    r['body'] = body
-    shout = Shout.create(**r.copy())
-    r['id'] = shout['id']
+        r['body'] = '' # FIXME
+
+    if r.get('body') is None:
+        body_orig = entry.get('body', '')
+        # body_html = BeautifulSoup(body_orig, features="html.parser")
+        r['body'] = html2text(body_orig).replace('****', '**')
+        # markdown.feed(body_html)
+        # r['body2'] = markdown.md_file
+    if not r['body']:
+        r['body'] = entry.get('body')
+    metadata = {}
+    metadata['title'] = r.get('title')
+    metadata['authors'] = r.get('authors')
+    if r.get('cover', False):
+        metadata['cover'] = r.get('cover')
+    body = r.get('body')
+    post = frontmatter.Post(body, **metadata)
+    dumped = frontmatter.dumps(post)
+    # raise Exception
+
+    open('migration/content/' + entry['type'].lower() + '/' + r['slug'] + '.md', 'w').write(dumped)
+    # open('migration/content/' + entry['type'].lower() + '/' + r['slug'] + '.my.md', 'w').write(r['body2'])
+    #if body_orig:
+    #    open('migration/content/' + entry['type'].lower() + '/' + r['slug'] + '.html', 'w').write(body_orig)
+    #markdown.related_data = []
+    #markdown.md_file = ''
+    #markdown.reset()
+    r['body'] = dumped
+    # shout = Shout.create(**r.copy())
+    # r['id'] = shout['id']
     return r
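`content_items.py` now writes every shout out as Markdown with YAML frontmatter instead of creating ORM objects directly. The dump step in isolation, with illustrative field values:

```python
import frontmatter

metadata = {
    'title': 'Example shout',
    'authors': [{'slug': 'discours', 'name': 'Дискурс', 'pic': ''}],
}
post = frontmatter.Post('Body text produced by html2text.', **metadata)
print(frontmatter.dumps(post))
# ---
# authors:
# - name: Дискурс
#   pic: ''
#   slug: discours
# title: Example shout
# ---
# Body text produced by html2text.
```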
diff --git a/migration/tables/users.py b/migration/tables/users.py
index 2eac49e7..4dfa94b9 100644
--- a/migration/tables/users.py
+++ b/migration/tables/users.py
@@ -1,10 +1,17 @@
 from orm import User, Role
 import frontmatter
 from dateutil.parser import parse
-from migration.html2md import Converter
-markdown = Converter()
+from migration.html2text import html2text
+# from migration.html2md import Converter
+# markdown = Converter()
 
 counter = 0
 
+def add(data):
+    data.emailConfirmed = False
+    user = User.create(**data)
+    return user
+
 def migrate(entry):
     '''
@@ -45,10 +52,10 @@ def migrate(entry):
     res['notifications'] = []
     res['links'] = []
     res['muted'] = False
-    res['bio'] = markdown.feed(entry.get('bio', ''))
+    res['bio'] = html2text(entry.get('bio', ''))
     if entry['profile']:
         res['slug'] = entry['profile'].get('path')
-        res['userpic'] = entry['profile'].get('image', {'url': ''}).get('url', '')
+        res['userpic'] = entry['profile'].get('image', {'thumborId': ''}).get('thumborId', '') # adding 'https://assets.discours.io/unsafe/1600x' in web ui
         fn = entry['profile'].get('firstName', '')
         ln = entry['profile'].get('lastName', '')
         viewname = res['slug'] if res['slug'] else 'anonymous'
diff --git a/requirements.txt b/requirements.txt
index bd0ab4b2..ef38b755 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,5 @@ authlib
 httpx
 psycopg2-binary
 bson
-python-frontmatter
\ No newline at end of file
+python-frontmatter
+bs4
\ No newline at end of file
diff --git a/schema.graphql b/schema.graphql
index a1578226..2acdb4fb 100644
--- a/schema.graphql
+++ b/schema.graphql
@@ -145,23 +145,23 @@ type UserNotification {
 }
 
 type User {
+  id: Int!
   username: String! # email
   createdAt: DateTime!
+  slug: String!
+  viewname: String # to display
   email: String
   password: String
   oauth: String # provider:token
-  viewname: String # to display
   userpic: String
   links: [String]
   emailConfirmed: Boolean # should contain all emails too
   # TODO: pagination here
-  id: Int!
   muted: Boolean
   rating: Int
   roles: [Role]
   updatedAt: DateTime
   wasOnlineAt: DateTime
   ratings: [Rating]
-  slug: String
   bio: String
   notifications: [Int]
 }
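Both `users.py` and `content_items.py` now store bare `thumborId` values and leave URL assembly to the client, per the inline comments. A hypothetical helper showing the expected assembly (the function name and the `1600x` size segment are assumptions, copied from the prefix used in `content_items.py`):

```python
ASSETS_BASE = 'https://assets.discours.io/unsafe/1600x/'

def userpic_url(user: dict) -> str:
    # after this migration user['userpic'] holds only a thumborId, not a full URL
    return ASSETS_BASE + user['userpic'] if user.get('userpic') else ''
```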