migration wip, minor fixes scheme

Untone 2021-08-23 11:44:46 +03:00
parent 8cef32c7a4
commit 9343f784b4
12 changed files with 1106 additions and 69 deletions

.gitignore
View File

@@ -135,3 +135,4 @@ discours.crt
 Pipfile.lock
 migration/data
+migration/content

View File

@@ -18,6 +18,7 @@ psycopg2-binary = "*"
 Authlib = "*"
 bson = "*"
 python-frontmatter = "*"
+bs4 = "*"
 
 [dev-packages]

View File

@@ -4,7 +4,7 @@ from migration.tables.content_items import migrate as migrateShout
 from migration.tables.content_item_categories import migrate as migrateTopic
 from migration.utils import DateTimeEncoder
 
-def users():
+def users(limit):
     print('migrating users...')
     data = json.loads(open('migration/data/users.json').read())
     newdata = {}
@@ -14,6 +14,8 @@ def users():
         oid = entry['_id']
         newdata[oid] = migrateUser(entry)
         counter += 1
+        if counter > limit:
+            break
     #except Exception:
     #    print(str(counter) + '/' + str(len(data)) + ' users entries were migrated')
     #    print('try to remove database first')
@@ -21,7 +23,7 @@ def users():
     print(str(counter) + ' users entries were migrated')
 
-def topics():
+def topics(limit):
     print('migrating topics...')
     data = json.loads(open('migration/data/content_item_categories.json').read())
     newdata = {}
@@ -31,47 +33,57 @@ def topics():
             oid = entry['_id']
             newdata[oid] = migrateTopic(entry)
             counter += 1
+            if counter > limit:
+                break
     except Exception:
         print(str(counter) + '/' + str(len(data)) + ' topics were migrated')
         print('try to remove database first')
     open('migration/data/topics.dict.json','w').write( json.dumps(newdata, cls=DateTimeEncoder) )
     print(str(counter) + ' topics were migrated')
 
-def shouts():
-    print('migrating shouts...')
+def shouts(limit):
+    print('loading shouts...')
     counter = 0
+    discoursAuthor = 0
     data = json.loads(open('migration/data/content_items.json').read())
     newdata = {}
+    print(str(len(data)) + ' entries was loaded. now migrating...')
     for entry in data:
         oid = entry['_id']
         newdata[oid] = migrateShout(entry)
         counter += 1
-        print(str(counter) + ': ' + newdata['slug'])
-        if counter > 9:
+        author = newdata[oid]['authors'][0]['slug']
+        if author == 'discours':
+            discoursAuthor += 1
+        line = str(counter) + ': ' + newdata[oid]['slug'] + " @" + author
+        print(line)
+        open('./shouts.id.log','a').write(line + '\n')
+        if counter > limit:
             break
     open('migration/data/shouts.dict.json','w').write( json.dumps(newdata, cls=DateTimeEncoder) )
     print(str(counter) + ' shouts were migrated')
+    print(str(discoursAuthor) + ' from them by uknown users')
 
 if __name__ == '__main__':
     import sys
     if len(sys.argv) > 1:
+        limit = int(sys.argv[2])
         if sys.argv[1] == "users":
-            users()
+            users(limit)
         elif sys.argv[1] == "topics":
-            topics()
+            topics(limit)
         elif sys.argv[1] == "shouts":
-            shouts()
+            shouts(limit)
         elif sys.argv[1] == "comments":
-            # comments()
+            comments(limit)
             pass
         elif sys.argv[1] == "all":
-            topics()
-            users()
-            shouts()
+            topics(limit)
+            users(limit)
+            shouts(limit)
        elif sys.argv[1] == "bson":
             import migration.bson2json
             bson2json.json_tables()
         else:
-            print('usage: python migrate.py <all|topics|users|shouts|comments>')
+            print('usage: python migrate.py <all|topics|users|shouts|comments> <stop_index>')
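After this change every subcommand reads sys.argv[2] unconditionally, so the stop index is effectively required on each run. A usage sketch (the cap values are arbitrary examples; the loops break once the counter exceeds the limit):

```sh
pipenv run python migrate.py users 100   # migrate roughly the first 100 user entries
pipenv run python migrate.py all 10      # smoke-test every table with a small cap
```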

View File

@@ -24,6 +24,7 @@ bson2json.json_tables() # creates all the needed data json from bson mongodump
 2. migrate users
 ```sh
+pipenv install
 pipenv run python migrate.py users
 ```

View File

@@ -5,6 +5,8 @@ import importlib
 from utils import DateTimeEncoder
 
+def json_tables():
+    print('creating json files at data/')
     data = {
         "content_items": [],
         "content_item_categories": [],
@@ -13,10 +15,6 @@ data = {
         "users": [],
         "comments": []
     }
-
-def json_tables():
-    print('creating json files at data/')
-
     for table in data.keys():
         lc = []
         with open('data/'+table+'.bson', 'rb') as f:
@@ -27,4 +25,5 @@ def json_tables():
             lc.append(d)
         data[table] = lc
         open('data/'+table+'.json', 'w').write(json.dumps(lc,cls=DateTimeEncoder))
+    return data

View File

@@ -24,6 +24,7 @@ class Converter(HTMLParser):
         self.md_file = ''
         self.code_box = False
         self.div_count = 0
+        self.span_count = 0
         self.code_box_div_num = 0
         self.ol_count = 0
         self.temp_tag = ''
@@ -37,8 +38,23 @@ class Converter(HTMLParser):
     def handle_starttag(self, tag, attrs):
         if self.ignore_data:
             return None
-        elif tag == 'br':
+        elif tag == 'sup':
+            self.md_file += '<sup>'
+        elif tag == 'p':
+            self.temp_tag = 'p'
             self.md_file += '\n'
+        elif tag == 'i':
+            self.temp_tag = 'i'
+            self.md_file += '*'
+        elif tag == 'wbr':
+            self.temp_tag = 'wbr'
+            self.md_file += ''
+        elif tag == 'span':
+            self.temp_tag = 'span'
+            self.span_count += 1
+            self.md_file += ' '
+        elif tag == 'figcaption':
+            self.md_file += ''
         elif tag == 'hr':
             self.md_file += '\n*** \n'
         elif tag == 'title':
@@ -74,7 +90,7 @@ class Converter(HTMLParser):
             elif 'class' in attrs_dict:
                 self.class_div_count = self.div_count
                 self.ignore_div = True
-        elif tag == 'en-codeblock':
+        elif tag == 'pre' or tag == 'code':
             self.code_box = True
             self.md_file += '\n```\n'
         elif tag == 'a':
@@ -94,7 +110,7 @@ class Converter(HTMLParser):
         elif tag == 'img':
             attrs_dict = dict(attrs)
             img_ref = attrs_dict['src']
-            alt_name = attrs_dict['alt'] if 'alt' in attrs_dict else 'Placeholder'
+            alt_name = attrs_dict['alt'] if 'alt' in attrs_dict else 'x'
             if self.is_link:
                 self.related_data.append(img_ref)
                 self.md_file += f'[![{alt_name}]({img_ref})]({self.link_ref})'
@@ -104,6 +120,8 @@ class Converter(HTMLParser):
         elif tag == 'table':
             self.ignore_data = True
             self.table_start = self.getpos()
+        else:
+            print('<' + tag + '>')
 
     def get_rawdata(self, start, stop, offset):
         temp_rawdata = self.rawdata
@@ -114,7 +132,32 @@ class Converter(HTMLParser):
 
     def handle_endtag(self, tag):
         if tag == 'b' or tag == 'strong':
-            self.md_file += '** \n'
+            self.md_file += '** '
+        elif tag == 'sup':
+            self.md_file += '</sup>'
+        elif tag == 'iframe':
+            self.ignore_data = False
+        elif tag == 'wbr':
+            self.md_file += ''
+        elif tag == 'title':
+            self.md_file += '\n'
+        elif tag == 'h1':
+            self.md_file += '\n'
+        elif tag == 'h2':
+            self.md_file += '\n'
+        elif tag == 'h3':
+            self.md_file += '\n'
+        elif tag == 'h4':
+            self.md_file += '\n'
+        elif tag == 'span':
+            self.span_count -= 1
+            self.md_file += ' '
+        elif tag == 'figcaption':
+            self.md_file += '\n'
+        elif tag == 'i':
+            self.md_file += '* '
+        elif tag == 'p':
+            self.md_file += '\n'
         elif tag == 'div':
             if self.code_box and self.code_box_div_num == self.div_count:
                 self.code_box = False
@@ -124,7 +167,7 @@ class Converter(HTMLParser):
             else:
                 self.md_file += ' \n'
             self.div_count -= 1
-        elif tag == 'en-codeblock':
+        elif tag == 'pre' or tag == 'code':
             self.code_box = False
             self.md_file += '```\n'
         elif tag == 'a':
@@ -144,18 +187,24 @@ class Converter(HTMLParser):
             raw_data = self.get_rawdata(lineno_start, lineno_stop, offset)
             self.md_file += '\n' + raw_data
             self.ignore_data = False
+        else:
+            print('</' + tag + '>')
 
     def handle_startendtag(self, tag, attrs):
         if tag == 'br':
             self.md_file += ' \n'
+        elif tag == 'wbr':
+            self.md_file += ''
         elif tag == 'hr':
             self.md_file += '\n*** \n'
         elif tag == 'img':
             attr_dict = dict(attrs)
-            name = attr_dict['data-filename']
+            name = attr_dict.get('data-filename', 'image')
             img_ref = attr_dict['src']
             self.related_data.append(img_ref)
             self.md_file += f'![{name}]({img_ref})'
+        else:
+            print("<" + tag + " />")
 
     def handle_data(self, data):
         if self.is_link:
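A minimal usage sketch for this Converter, inferred from how the commit's other files drive it (feed() parses HTML and appends markdown to md_file, which is cleared between documents); the sample HTML is hypothetical:

```python
from migration.html2md import Converter

conv = Converter()
conv.feed('<p>Hello, <i>world</i></p>')  # handle_* callbacks accumulate markdown
print(conv.md_file)                      # the accumulated markdown text
conv.md_file = ''                        # reset state between documents,
conv.reset()                             # as the commented-out migration code does
```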

migration/html2text.py (new file)
View File

@@ -0,0 +1,914 @@
#!/usr/bin/env python
"""html2text: Turn HTML into equivalent Markdown-structured text."""
__version__ = "3.200.3"
__author__ = "Aaron Swartz (me@aaronsw.com)"
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
# TODO:
# Support decoded entities with unifiable.
try:
True
except NameError:
setattr(__builtins__, 'True', 1)
setattr(__builtins__, 'False', 0)
def has_key(x, y):
if hasattr(x, 'has_key'): return x.has_key(y)
else: return y in x
try:
import htmlentitydefs
import urlparse
import HTMLParser
except ImportError: #Python3
import html.entities as htmlentitydefs
import urllib.parse as urlparse
import html.parser as HTMLParser
try: #Python3
import urllib.request as urllib
except:
import urllib
import optparse, re, sys, codecs, types
try: from textwrap import wrap
except: pass
# Use Unicode characters instead of their ascii pseudo-replacements
UNICODE_SNOB = 0
# Escape all special characters. Output is less readable, but avoids corner case formatting issues.
ESCAPE_SNOB = 0
# Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = 0
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
BODY_WIDTH = 78
# Don't show internal links (href="#local-anchor") -- corresponding link targets
# won't be visible in the plain text file anyway.
SKIP_INTERNAL_LINKS = True
# Use inline, rather than reference, formatting for images and links
INLINE_LINKS = True
# Number of pixels Google indents nested lists
GOOGLE_LIST_INDENT = 36
IGNORE_ANCHORS = False
IGNORE_IMAGES = False
IGNORE_EMPHASIS = False
### Entity Nonsense ###
def name2cp(k):
if k == 'apos': return ord("'")
if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
return htmlentitydefs.name2codepoint[k]
else:
k = htmlentitydefs.entitydefs[k]
if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
return ord(codecs.latin_1_decode(k)[0])
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
'lrm':'', 'rlm':''}
unifiable_n = {}
for k in unifiable.keys():
unifiable_n[name2cp(k)] = unifiable[k]
### End Entity Nonsense ###
def onlywhite(line):
"""Return true if the line does only consist of whitespace characters."""
for c in line:
if c != ' ' and c != '\t':
return c == ' '
return line
def hn(tag):
if tag[0] == 'h' and len(tag) == 2:
try:
n = int(tag[1])
if n in range(1, 10): return n
except ValueError: return 0
def dumb_property_dict(style):
"""returns a hash of css attributes"""
return dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]]);
def dumb_css_parser(data):
"""returns a hash of css selectors, each of which contains a hash of css attributes"""
# remove @import sentences
data += ';'
importIndex = data.find('@import')
while importIndex != -1:
data = data[0:importIndex] + data[data.find(';', importIndex) + 1:]
importIndex = data.find('@import')
# parse the css. reverted from dictionary comprehension in order to support older pythons
elements = [x.split('{') for x in data.split('}') if '{' in x.strip()]
try:
elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements])
except ValueError:
elements = {} # not that important
return elements
def element_style(attrs, style_def, parent_style):
"""returns a hash of the 'final' style attributes of the element"""
style = parent_style.copy()
if 'class' in attrs:
for css_class in attrs['class'].split():
css_style = style_def['.' + css_class]
style.update(css_style)
if 'style' in attrs:
immediate_style = dumb_property_dict(attrs['style'])
style.update(immediate_style)
return style
def google_list_style(style):
"""finds out whether this is an ordered or unordered list"""
if 'list-style-type' in style:
list_style = style['list-style-type']
if list_style in ['disc', 'circle', 'square', 'none']:
return 'ul'
return 'ol'
def google_has_height(style):
"""check if the style of the element has the 'height' attribute explicitly defined"""
if 'height' in style:
return True
return False
def google_text_emphasis(style):
"""return a list of all emphasis modifiers of the element"""
emphasis = []
if 'text-decoration' in style:
emphasis.append(style['text-decoration'])
if 'font-style' in style:
emphasis.append(style['font-style'])
if 'font-weight' in style:
emphasis.append(style['font-weight'])
return emphasis
def google_fixed_width_font(style):
"""check if the css of the current element defines a fixed width font"""
font_family = ''
if 'font-family' in style:
font_family = style['font-family']
if 'Courier New' == font_family or 'Consolas' == font_family:
return True
return False
def list_numbering_start(attrs):
"""extract numbering from list element attributes"""
if 'start' in attrs:
return int(attrs['start']) - 1
else:
return 0
class HTML2Text(HTMLParser.HTMLParser):
def __init__(self, out=None, baseurl=''):
HTMLParser.HTMLParser.__init__(self)
# Config options
self.unicode_snob = UNICODE_SNOB
self.escape_snob = ESCAPE_SNOB
self.links_each_paragraph = LINKS_EACH_PARAGRAPH
self.body_width = BODY_WIDTH
self.skip_internal_links = SKIP_INTERNAL_LINKS
self.inline_links = INLINE_LINKS
self.google_list_indent = GOOGLE_LIST_INDENT
self.ignore_links = IGNORE_ANCHORS
self.ignore_images = IGNORE_IMAGES
self.ignore_emphasis = IGNORE_EMPHASIS
self.google_doc = False
self.ul_item_mark = '*'
self.emphasis_mark = '_'
self.strong_mark = '**'
if out is None:
self.out = self.outtextf
else:
self.out = out
self.outtextlist = [] # empty list to store output characters before they are "joined"
try:
self.outtext = unicode()
except NameError: # Python3
self.outtext = str()
self.quiet = 0
self.p_p = 0 # number of newline character to print before next output
self.outcount = 0
self.start = 1
self.space = 0
self.a = []
self.astack = []
self.maybe_automatic_link = None
self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
self.acount = 0
self.list = []
self.blockquote = 0
self.pre = 0
self.startpre = 0
self.code = False
self.br_toggle = ''
self.lastWasNL = 0
self.lastWasList = False
self.style = 0
self.style_def = {}
self.tag_stack = []
self.emphasis = 0
self.drop_white_space = 0
self.inheader = False
self.abbr_title = None # current abbreviation definition
self.abbr_data = None # last inner HTML (for abbr being defined)
self.abbr_list = {} # stack of abbreviations to write later
self.baseurl = baseurl
try: del unifiable_n[name2cp('nbsp')]
except KeyError: pass
unifiable['nbsp'] = '&nbsp_place_holder;'
def feed(self, data):
data = data.replace("</' + 'script>", "</ignore>")
HTMLParser.HTMLParser.feed(self, data)
def handle(self, data):
self.feed(data)
self.feed("")
return self.optwrap(self.close())
def outtextf(self, s):
self.outtextlist.append(s)
if s: self.lastWasNL = s[-1] == '\n'
def close(self):
HTMLParser.HTMLParser.close(self)
self.pbr()
self.o('', 0, 'end')
self.outtext = self.outtext.join(self.outtextlist)
if self.unicode_snob:
nbsp = unichr(name2cp('nbsp'))
else:
nbsp = u' '
self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)
return self.outtext
def handle_charref(self, c):
self.o(self.charref(c), 1)
def handle_entityref(self, c):
self.o(self.entityref(c), 1)
def handle_starttag(self, tag, attrs):
self.handle_tag(tag, attrs, 1)
def handle_endtag(self, tag):
self.handle_tag(tag, None, 0)
def previousIndex(self, attrs):
""" returns the index of certain set of attributes (of a link) in the
self.a list
If the set of attributes is not found, returns None
"""
if not has_key(attrs, 'href'): return None
i = -1
for a in self.a:
i += 1
match = 0
if has_key(a, 'href') and a['href'] == attrs['href']:
if has_key(a, 'title') or has_key(attrs, 'title'):
if (has_key(a, 'title') and has_key(attrs, 'title') and
a['title'] == attrs['title']):
match = True
else:
match = True
if match: return i
def drop_last(self, nLetters):
if not self.quiet:
self.outtext = self.outtext[:-nLetters]
def handle_emphasis(self, start, tag_style, parent_style):
"""handles various text emphases"""
tag_emphasis = google_text_emphasis(tag_style)
parent_emphasis = google_text_emphasis(parent_style)
# handle Google's text emphasis
strikethrough = 'line-through' in tag_emphasis and self.hide_strikethrough
bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
fixed = google_fixed_width_font(tag_style) and not \
google_fixed_width_font(parent_style) and not self.pre
if start:
# crossed-out text must be handled before other attributes
# in order not to output qualifiers unnecessarily
if bold or italic or fixed:
self.emphasis += 1
if strikethrough:
self.quiet += 1
if italic:
self.o(self.emphasis_mark)
self.drop_white_space += 1
if bold:
self.o(self.strong_mark)
self.drop_white_space += 1
if fixed:
self.o('`')
self.drop_white_space += 1
self.code = True
else:
if bold or italic or fixed:
# there must not be whitespace before closing emphasis mark
self.emphasis -= 1
self.space = 0
self.outtext = self.outtext.rstrip()
if fixed:
if self.drop_white_space:
# empty emphasis, drop it
self.drop_last(1)
self.drop_white_space -= 1
else:
self.o('`')
self.code = False
if bold:
if self.drop_white_space:
# empty emphasis, drop it
self.drop_last(2)
self.drop_white_space -= 1
else:
self.o(self.strong_mark)
if italic:
if self.drop_white_space:
# empty emphasis, drop it
self.drop_last(1)
self.drop_white_space -= 1
else:
self.o(self.emphasis_mark)
# space is only allowed after *all* emphasis marks
if (bold or italic) and not self.emphasis:
self.o(" ")
if strikethrough:
self.quiet -= 1
def handle_tag(self, tag, attrs, start):
#attrs = fixattrs(attrs)
if attrs is None:
attrs = {}
else:
attrs = dict(attrs)
if self.google_doc:
# the attrs parameter is empty for a closing tag. in addition, we
# need the attributes of the parent nodes in order to get a
# complete style description for the current element. we assume
# that google docs export well formed html.
parent_style = {}
if start:
if self.tag_stack:
parent_style = self.tag_stack[-1][2]
tag_style = element_style(attrs, self.style_def, parent_style)
self.tag_stack.append((tag, attrs, tag_style))
else:
dummy, attrs, tag_style = self.tag_stack.pop()
if self.tag_stack:
parent_style = self.tag_stack[-1][2]
if hn(tag):
self.p()
if start:
self.inheader = True
self.o(hn(tag)*"#" + ' ')
else:
self.inheader = False
return # prevent redundant emphasis marks on headers
if tag in ['p', 'div']:
if self.google_doc:
if start and google_has_height(tag_style):
self.p()
else:
self.soft_br()
else:
self.p()
if tag == "br" and start: self.o(" \n")
if tag == "hr" and start:
self.p()
self.o("* * *")
self.p()
if tag in ["head", "style", 'script']:
if start: self.quiet += 1
else: self.quiet -= 1
if tag == "style":
if start: self.style += 1
else: self.style -= 1
if tag in ["body"]:
self.quiet = 0 # sites like 9rules.com never close <head>
if tag == "blockquote":
if start:
self.p(); self.o('> ', 0, 1); self.start = 1
self.blockquote += 1
else:
self.blockquote -= 1
self.p()
if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: self.o(self.emphasis_mark)
if tag in ['strong', 'b'] and not self.ignore_emphasis: self.o(self.strong_mark)
if tag in ['del', 'strike', 's']:
if start:
self.o("<"+tag+">")
else:
self.o("</"+tag+">")
if self.google_doc:
if not self.inheader:
# handle some font attributes, but leave headers clean
self.handle_emphasis(start, tag_style, parent_style)
if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` ``
if tag == "abbr":
if start:
self.abbr_title = None
self.abbr_data = ''
if has_key(attrs, 'title'):
self.abbr_title = attrs['title']
else:
if self.abbr_title != None:
self.abbr_list[self.abbr_data] = self.abbr_title
self.abbr_title = None
self.abbr_data = ''
if tag == "a" and not self.ignore_links:
if start:
if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')):
self.astack.append(attrs)
self.maybe_automatic_link = attrs['href']
else:
self.astack.append(None)
else:
if self.astack:
a = self.astack.pop()
if self.maybe_automatic_link:
self.maybe_automatic_link = None
elif a:
if self.inline_links:
self.o("](" + escape_md(a['href']) + ")")
else:
i = self.previousIndex(a)
if i is not None:
a = self.a[i]
else:
self.acount += 1
a['count'] = self.acount
a['outcount'] = self.outcount
self.a.append(a)
self.o("][" + str(a['count']) + "]")
if tag == "img" and start and not self.ignore_images:
if has_key(attrs, 'src'):
attrs['href'] = attrs['src']
alt = attrs.get('alt', '')
self.o("![" + escape_md(alt) + "]")
if self.inline_links:
self.o("(" + escape_md(attrs['href']) + ")")
else:
i = self.previousIndex(attrs)
if i is not None:
attrs = self.a[i]
else:
self.acount += 1
attrs['count'] = self.acount
attrs['outcount'] = self.outcount
self.a.append(attrs)
self.o("[" + str(attrs['count']) + "]")
if tag == 'dl' and start: self.p()
if tag == 'dt' and not start: self.pbr()
if tag == 'dd' and start: self.o(' ')
if tag == 'dd' and not start: self.pbr()
if tag in ["ol", "ul"]:
# Google Docs create sub lists as top level lists
if (not self.list) and (not self.lastWasList):
self.p()
if start:
if self.google_doc:
list_style = google_list_style(tag_style)
else:
list_style = tag
numbering_start = list_numbering_start(attrs)
self.list.append({'name':list_style, 'num':numbering_start})
else:
if self.list: self.list.pop()
self.lastWasList = True
else:
self.lastWasList = False
if tag == 'li':
self.pbr()
if start:
if self.list: li = self.list[-1]
else: li = {'name':'ul', 'num':0}
if self.google_doc:
nest_count = self.google_nest_count(tag_style)
else:
nest_count = len(self.list)
self.o(" " * nest_count) #TODO: line up <ol><li>s > 9 correctly.
if li['name'] == "ul": self.o(self.ul_item_mark + " ")
elif li['name'] == "ol":
li['num'] += 1
self.o(str(li['num'])+". ")
self.start = 1
if tag in ["table", "tr"] and start: self.p()
if tag == 'td': self.pbr()
if tag == "pre":
if start:
self.startpre = 1
self.pre = 1
else:
self.pre = 0
self.p()
def pbr(self):
if self.p_p == 0:
self.p_p = 1
def p(self):
self.p_p = 2
def soft_br(self):
self.pbr()
self.br_toggle = ' '
def o(self, data, puredata=0, force=0):
if self.abbr_data is not None:
self.abbr_data += data
if not self.quiet:
if self.google_doc:
# prevent white space immediately after 'begin emphasis' marks ('**' and '_')
lstripped_data = data.lstrip()
if self.drop_white_space and not (self.pre or self.code):
data = lstripped_data
if lstripped_data != '':
self.drop_white_space = 0
if puredata and not self.pre:
data = re.sub('\s+', ' ', data)
if data and data[0] == ' ':
self.space = 1
data = data[1:]
if not data and not force: return
if self.startpre:
#self.out(" :") #TODO: not output when already one there
if not data.startswith("\n"): # <pre>stuff...
data = "\n" + data
bq = (">" * self.blockquote)
if not (force and data and data[0] == ">") and self.blockquote: bq += " "
if self.pre:
if not self.list:
bq += "    "
#else: list content is already partially indented
for i in range(len(self.list)):
bq += "    "
data = data.replace("\n", "\n"+bq)
if self.startpre:
self.startpre = 0
if self.list:
data = data.lstrip("\n") # use existing initial indentation
if self.start:
self.space = 0
self.p_p = 0
self.start = 0
if force == 'end':
# It's the end.
self.p_p = 0
self.out("\n")
self.space = 0
if self.p_p:
self.out((self.br_toggle+'\n'+bq)*self.p_p)
self.space = 0
self.br_toggle = ''
if self.space:
if not self.lastWasNL: self.out(' ')
self.space = 0
if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
if force == "end": self.out("\n")
newa = []
for link in self.a:
if self.outcount > link['outcount']:
self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
if has_key(link, 'title'): self.out(" ("+link['title']+")")
self.out("\n")
else:
newa.append(link)
if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
self.a = newa
if self.abbr_list and force == "end":
for abbr, definition in self.abbr_list.items():
self.out(" *[" + abbr + "]: " + definition + "\n")
self.p_p = 0
self.out(data)
self.outcount += 1
def handle_data(self, data):
if r'\/script>' in data: self.quiet -= 1
if self.style:
self.style_def.update(dumb_css_parser(data))
if not self.maybe_automatic_link is None:
href = self.maybe_automatic_link
if href == data and self.absolute_url_matcher.match(href):
self.o("<" + data + ">")
return
else:
self.o("[")
self.maybe_automatic_link = None
if not self.code and not self.pre:
data = escape_md_section(data, snob=self.escape_snob)
self.o(data, 1)
def unknown_decl(self, data): pass
def charref(self, name):
if name[0] in ['x','X']:
c = int(name[1:], 16)
else:
c = int(name)
if not self.unicode_snob and c in unifiable_n.keys():
return unifiable_n[c]
else:
try:
return unichr(c)
except NameError: #Python3
return chr(c)
def entityref(self, c):
if not self.unicode_snob and c in unifiable.keys():
return unifiable[c]
else:
try: name2cp(c)
except KeyError: return "&" + c + ';'
else:
try:
return unichr(name2cp(c))
except NameError: #Python3
return chr(name2cp(c))
def replaceEntities(self, s):
s = s.group(1)
if s[0] == "#":
return self.charref(s[1:])
else: return self.entityref(s)
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(self, s):
return self.r_unescape.sub(self.replaceEntities, s)
def google_nest_count(self, style):
"""calculate the nesting count of google doc lists"""
nest_count = 0
if 'margin-left' in style:
nest_count = int(style['margin-left'][:-2]) / self.google_list_indent
return nest_count
def optwrap(self, text):
"""Wrap all paragraphs in the provided text."""
if not self.body_width:
return text
assert wrap, "Requires Python 2.3."
result = ''
newlines = 0
for para in text.split("\n"):
if len(para) > 0:
if not skipwrap(para):
result += "\n".join(wrap(para, self.body_width))
if para.endswith(' '):
result += " \n"
newlines = 1
else:
result += "\n\n"
newlines = 2
else:
if not onlywhite(para):
result += para + "\n"
newlines = 1
else:
if newlines < 2:
result += "\n"
newlines += 1
return result
ordered_list_matcher = re.compile(r'\d+\.\s')
unordered_list_matcher = re.compile(r'[-\*\+]\s')
md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
md_dot_matcher = re.compile(r"""
^ # start of line
(\s*\d+) # optional whitespace and a number
(\.) # dot
(?=\s) # lookahead assert whitespace
""", re.MULTILINE | re.VERBOSE)
md_plus_matcher = re.compile(r"""
^
(\s*)
(\+)
(?=\s)
""", flags=re.MULTILINE | re.VERBOSE)
md_dash_matcher = re.compile(r"""
^
(\s*)
(-)
(?=\s|\-) # followed by whitespace (bullet list, or spaced out hr)
# or another dash (header or hr)
""", flags=re.MULTILINE | re.VERBOSE)
slash_chars = r'\`*_{}[]()#+-.!'
md_backslash_matcher = re.compile(r'''
(\\) # match one slash
(?=[%s]) # followed by a char that requires escaping
''' % re.escape(slash_chars),
flags=re.VERBOSE)
def skipwrap(para):
# If the text begins with four spaces or one tab, it's a code block; don't wrap
if para[0:4] == '    ' or para[0] == '\t':
return True
# If the text begins with only two "--", possibly preceded by whitespace, that's
# an emdash; so wrap.
stripped = para.lstrip()
if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
return False
# I'm not sure what this is for; I thought it was to detect lists, but there's
# a <br>-inside-<span> case in one of the tests that also depends upon it.
if stripped[0:1] == '-' or stripped[0:1] == '*':
return True
# If the text begins with a single -, *, or +, followed by a space, or an integer,
# followed by a ., followed by a space (in either case optionally preceded by
# whitespace), it's a list; don't wrap.
if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped):
return True
return False
def wrapwrite(text):
text = text.encode('utf-8')
try: #Python3
sys.stdout.buffer.write(text)
except AttributeError:
sys.stdout.write(text)
def html2text(html, baseurl=''):
h = HTML2Text(baseurl=baseurl)
return h.handle(html)
def unescape(s, unicode_snob=False):
h = HTML2Text()
h.unicode_snob = unicode_snob
return h.unescape(s)
def escape_md(text):
"""Escapes markdown-sensitive characters within other markdown constructs."""
return md_chars_matcher.sub(r"\\\1", text)
def escape_md_section(text, snob=False):
"""Escapes markdown-sensitive characters across whole document sections."""
text = md_backslash_matcher.sub(r"\\\1", text)
if snob:
text = md_chars_matcher_all.sub(r"\\\1", text)
text = md_dot_matcher.sub(r"\1\\\2", text)
text = md_plus_matcher.sub(r"\1\\\2", text)
text = md_dash_matcher.sub(r"\1\\\2", text)
return text
def main():
baseurl = ''
p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
version='%prog ' + __version__)
p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
p.add_option("--ignore-links", dest="ignore_links", action="store_true",
default=IGNORE_ANCHORS, help="don't include any formatting for links")
p.add_option("--ignore-images", dest="ignore_images", action="store_true",
default=IGNORE_IMAGES, help="don't include any formatting for images")
p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
default=False, help="convert an html-exported Google Document")
p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
default=False, help="use a dash rather than a star for unordered list items")
p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
default=False, help="use an asterisk rather than an underscore for emphasized text")
p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
default=False, help="hide strike-through text. only relevant when -g is specified as well")
p.add_option("--escape-all", action="store_true", dest="escape_snob",
default=False, help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
(options, args) = p.parse_args()
# process input
encoding = "utf-8"
if len(args) > 0:
file_ = args[0]
if len(args) == 2:
encoding = args[1]
if len(args) > 2:
p.error('Too many arguments')
if file_.startswith('http://') or file_.startswith('https://'):
baseurl = file_
j = urllib.urlopen(baseurl)
data = j.read()
if encoding is None:
try:
from feedparser import _getCharacterEncoding as enc
except ImportError:
enc = lambda x, y: ('utf-8', 1)
encoding = enc(j.headers, data)[0]
if encoding == 'us-ascii':
encoding = 'utf-8'
else:
data = open(file_, 'rb').read()
if encoding is None:
try:
from chardet import detect
except ImportError:
detect = lambda x: {'encoding': 'utf-8'}
encoding = detect(data)['encoding']
else:
data = sys.stdin.read()
data = data.decode(encoding)
h = HTML2Text(baseurl=baseurl)
# handle options
if options.ul_style_dash: h.ul_item_mark = '-'
if options.em_style_asterisk:
h.emphasis_mark = '*'
h.strong_mark = '__'
h.body_width = options.body_width
h.list_indent = options.list_indent
h.ignore_emphasis = options.ignore_emphasis
h.ignore_links = options.ignore_links
h.ignore_images = options.ignore_images
h.google_doc = options.google_doc
h.hide_strikethrough = options.hide_strikethrough
h.escape_snob = options.escape_snob
wrapwrite(h.handle(data))
if __name__ == "__main__":
main()
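Only the html2text() convenience wrapper near the bottom is used by the migration code; a minimal sketch (the input HTML is made up):

```python
from migration.html2text import html2text

md = html2text('<h1>Title</h1><p>Some <b>bold</b> text.</p>')
print(md)  # roughly: '# Title' followed by 'Some **bold** text.'
```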

View File

@@ -1,10 +1,10 @@
-from html2md import Converter
+# from html2md import Converter
 import datetime
 
-markdown = Converter()
+# markdown = Converter()
 
 def migrate(entry):
-    ```
+    '''
     # is comment
     type Shout {
         org: String!
@@ -25,12 +25,12 @@ def migrate(entry):
         visibleForRoles: [String] # role ids are strings
         visibleForUsers: [Int]
     }
-    ```
+    '''
     # TODO: implement comments migration
     return {
         'org': 'discours.io',
         'slug': entry['slug'],
         'createdAt': entry['createdAt'],
-        'body': markdown.feed(entry['body']),
+        'body': html2text(entry['body']),
         'replyTo': entry['']
     }

View File

@@ -1,14 +1,16 @@
-from migration.html2md import Converter
+# from migration.html2md import Converter
 from dateutil.parser import parse
 from os.path import abspath
 import frontmatter
 import json
 from orm import Shout
+from bs4 import BeautifulSoup
+from migration.html2text import html2text
 
 users_dict = json.loads(open(abspath('migration/data/users.dict.json')).read())
-users_dict['0'] = {'id': 99999 }
+users_dict['0'] = {'id': 9999999, 'slug': 'discours', 'viewname': 'Дискурс' }
 
-markdown = Converter()
+# markdown = Converter()
 
 type2layout = {
     'Article': 'article',
@@ -18,7 +20,7 @@ type2layout = {
     'Image': 'image'
 }
 
-def migrate(entry):
+def migrate(entry, data=users_dict):
     '''
     type Shout {
         org_id: Int!
@@ -43,27 +45,37 @@ def migrate(entry):
         views: Int
     }
     '''
+    try:
+        author = data[entry['createdBy']]
+    except KeyError:
+        author = data['0']
+    # print(author)
     r = {
         'org_id': 0,
         'layout': type2layout[entry['type']],
         'title': entry['title'],
-        'authors': [ users_dict[entry['createdBy']]['id'], ],
+        'authors': [ { 'slug': author['slug'], 'name': author['viewname'], 'pic': author.get('userpic', '') }, ],
         'topics': [],
         'published': entry['published'],
         'views': entry['views'],
         'rating': entry['rating'],
         'ratings': []
     }
-    r['slug'] = entry.get('slug')
+    r['slug'] = entry.get('slug', '')
+    body_orig = entry.get('body', '')
     if not r['slug'] and entry.get('friendlySlugs') is not None:
         r['slug'] = entry['friendlySlugs']['slug'][0]['slug']
         if(r['slug'] is None):
             r['slug'] = entry['friendlySlugs'][0]['slug']
+    if not r['slug']:
+        print('NO SLUG ERROR')
+        # print(entry)
+        raise Exception
     if entry.get('image') is not None:
         r['cover'] = entry['image']['url']
-    elif entry.get('thumborId') is not None:
-        r['cover'] = 'https://discours.io/' + entry['thumborId']
+    if entry.get('thumborId') is not None:
+        r['cover'] = 'https://assets.discours.io/unsafe/1600x/' + entry['thumborId']
     if entry.get('publishedAt') is not None:
         r['publishedAt'] = entry['publishedAt']
     if entry.get('createdAt') is not None:
@@ -71,20 +83,60 @@ def migrate(entry):
     if entry.get('updatedAt') is not None:
         r['updatedAt'] = entry['updatedAt']
     if entry.get('type') == 'Literature':
-        r['body'] = markdown.feed(entry['media'][0]['literatureBody'])
+        media = entry.get('media', '')
+        # print(media[0]['literatureBody'])
+        if type(media) == list:
+            body_orig = media[0].get('literatureBody', '')
+            if body_orig == '':
+                print('EMPTY BODY!')
+            else:
+                # body_html = str(BeautifulSoup(body_orig, features="html.parser"))
+                body = html2text(body_orig).replace('****', '**') #markdown.feed(body_html)
+                r['body'] = body
+                # r['body2'] = markdown.md_file
+        else:
+            print(r['slug'] + ': literature has no media')
     elif entry.get('type') == 'Video':
-        r['body'] = '<ShoutVideo src=\"' + entry['media'][0]['youtubeId'] + '\" />'
+        m = entry['media'][0]
+        yt = m.get('youtubeId', '')
+        vm = m.get('vimeoId', '')
+        videoUrl = 'https://www.youtube.com/watch?v=' + yt if yt else '#'
+        if videoUrl == '#':
+            videoUrl = 'https://vimeo.com/' + vm if vm else '#'
+        if videoUrl == '#':
+            print(m)
+            # raise Exception
+        r['body'] = '<ShoutVideo src=\"' + videoUrl + '\" />' + html2text(m.get('body', '')) # FIXME
     elif entry.get('type') == 'Music':
-        r['body'] = '<ShoutMusic media={\"' + json.dumps(entry['media']) +'\"} />'
-    elif entry.get('body') is not None:
-        r['body'] = markdown.feed(entry['body'])
-    else:
-        r['body'] = '## ' + r['title']
-    body = r['body']
-    del r['body']
-    metadata = frontmatter.dumps(r)
-    open('migration/content/' + r['slug'] + '.md', 'w').write(metadata + '\n' + body)
-    r['body'] = body
-    shout = Shout.create(**r.copy())
-    r['id'] = shout['id']
+        r['body'] = '<ShoutMusic media={\"' + json.dumps(entry['media']) +'\"} />' # FIXME
+    if r.get('body') is None:
+        body_orig = entry.get('body', '')
+        # body_html = BeautifulSoup(body_orig, features="html.parser")
+        r['body'] = html2text(body_orig).replace('****', '**')
+        # markdown.feed(body_html)
+        # r['body2'] = markdown.md_file
+    if not r['body']:
+        r['body'] = entry.get('body')
+    metadata = {}
+    metadata['title'] = r.get('title')
+    metadata['authors'] = r.get('authors')
+    if r.get('cover', False):
+        metadata['cover'] = r.get('cover')
+    body = r.get('body')
+    post = frontmatter.Post(body, **metadata)
+    dumped = frontmatter.dumps(post)
+    # raise Exception
+    open('migration/content/' + entry['type'].lower() + '/' + r['slug'] + '.md', 'w').write(dumped)
+    # open('migration/content/' + entry['type'].lower() + '/' + r['slug'] + '.my.md', 'w').write(r['body2'])
+    #if body_orig:
+    #    open('migration/content/' + entry['type'].lower() + '/' + r['slug'] + '.html', 'w').write(body_orig)
+    #markdown.related_data = []
+    #markdown.md_file = ''
+    #markdown.reset()
+    r['body'] = dumped
+    # shout = Shout.create(**r.copy())
+    # r['id'] = shout['id']
     return r
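The rewritten migrate() now assembles each markdown file with python-frontmatter's Post and dumps instead of calling frontmatter.dumps(r) on the whole record. A sketch of what that pairing emits (all values are made up):

```python
import frontmatter

# keyword arguments become the YAML front matter, the first argument the body
post = frontmatter.Post('Body text.', title='Example', authors=[{'slug': 'discours'}])
print(frontmatter.dumps(post))
# ---
# authors:
# - slug: discours
# title: Example
# ---
#
# Body text.
```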

View File

@@ -1,10 +1,17 @@
 from orm import User, Role
 import frontmatter
 from dateutil.parser import parse
-from migration.html2md import Converter
+from migration.html2text import html2text
+# from migration.html2md import Converter
 
-markdown = Converter()
+# markdown = Converter()
 counter = 0
 
+def add(data):
+    data.emailConfirmed = False
+    user = User.create(**data)
+    return user
+
 def migrate(entry):
     '''
@@ -45,10 +52,10 @@ def migrate(entry):
     res['notifications'] = []
     res['links'] = []
     res['muted'] = False
-    res['bio'] = markdown.feed(entry.get('bio', ''))
+    res['bio'] = html2text(entry.get('bio', ''))
     if entry['profile']:
         res['slug'] = entry['profile'].get('path')
-        res['userpic'] = entry['profile'].get('image', {'url': ''}).get('url', '')
+        res['userpic'] = entry['profile'].get('image', {'thumborId': ''}).get('thumborId', '') # adding 'https://assets.discours.io/unsafe/1600x' in web ui
         fn = entry['profile'].get('firstName', '')
         ln = entry['profile'].get('lastName', '')
         viewname = res['slug'] if res['slug'] else 'anonymous'

View File

@@ -12,3 +12,4 @@ httpx
 psycopg2-binary
 bson
 python-frontmatter
+bs4

View File

@@ -145,23 +145,23 @@ type UserNotification {
 }
 
 type User {
+    id: Int!
     username: String! # email
     createdAt: DateTime!
+    slug: String!
+    viewname: String # to display
     email: String
     password: String
     oauth: String # provider:token
-    viewname: String # to display
     userpic: String
     links: [String]
     emailConfirmed: Boolean # should contain all emails too # TODO: pagination here
-    id: Int!
     muted: Boolean
     rating: Int
     roles: [Role]
     updatedAt: DateTime
     wasOnlineAt: DateTime
     ratings: [Rating]
-    slug: String
     bio: String
     notifications: [Int]
 }