From 65532ea1a3b6082e0db965bb07852737f77fcbb2 Mon Sep 17 00:00:00 2001 From: tonyrewin Date: Thu, 11 Aug 2022 12:14:12 +0300 Subject: [PATCH] migration-is-back --- migrate.py | 303 ++++++++ migration/__init__.py | 1 + migration/bson2json.py | 28 + migration/export.py | 105 +++ migration/extract.py | 324 +++++++++ migration/html2text/__init__.py | 1041 ++++++++++++++++++++++++++++ migration/html2text/__main__.py | 3 + migration/html2text/cli.py | 322 +++++++++ migration/html2text/config.py | 164 +++++ migration/html2text/elements.py | 18 + migration/html2text/py.typed | 0 migration/html2text/typing.py | 3 + migration/html2text/utils.py | 290 ++++++++ migration/tables/__init__.py | 1 + migration/tables/comments.py | 108 +++ migration/tables/content_items.py | 226 ++++++ migration/tables/replacements.json | 768 ++++++++++++++++++++ migration/tables/topics.py | 28 + migration/tables/users.py | 106 +++ migration/utils.py | 9 + 20 files changed, 3848 insertions(+) create mode 100644 migrate.py create mode 100644 migration/__init__.py create mode 100644 migration/bson2json.py create mode 100644 migration/export.py create mode 100644 migration/extract.py create mode 100644 migration/html2text/__init__.py create mode 100644 migration/html2text/__main__.py create mode 100644 migration/html2text/cli.py create mode 100644 migration/html2text/config.py create mode 100644 migration/html2text/elements.py create mode 100644 migration/html2text/py.typed create mode 100644 migration/html2text/typing.py create mode 100644 migration/html2text/utils.py create mode 100644 migration/tables/__init__.py create mode 100644 migration/tables/comments.py create mode 100644 migration/tables/content_items.py create mode 100644 migration/tables/replacements.json create mode 100644 migration/tables/topics.py create mode 100644 migration/tables/users.py create mode 100644 migration/utils.py diff --git a/migrate.py b/migrate.py new file mode 100644 index 00000000..c5312a3a --- /dev/null +++ b/migrate.py @@ -0,0 +1,303 @@ +''' cmd managed migration ''' +from datetime import datetime +import json +import subprocess +import sys +import os + +# from migration.export import export_email_subscriptions +from migration.export import export_mdx, export_slug +from migration.tables.users import migrate as migrateUser +from migration.tables.users import migrate_2stage as migrateUser_2stage +from migration.tables.content_items import get_shout_slug, migrate as migrateShout +from migration.tables.topics import migrate as migrateTopic +from migration.tables.comments import migrate as migrateComment +from migration.tables.comments import migrate_2stage as migrateComment_2stage +from orm.reaction import Reaction +from settings import DB_URL + +TODAY = datetime.strftime(datetime.now(), '%Y%m%d') + +OLD_DATE = '2016-03-05 22:22:00.350000' + + +def users_handle(storage): + ''' migrating users first ''' + counter = 0 + id_map = {} + print('[migration] migrating %d users' % (len(storage['users']['data']))) + for entry in storage['users']['data']: + oid = entry['_id'] + user = migrateUser(entry) + storage['users']['by_oid'][oid] = user # full + del user['password'] + del user['notifications'] + del user['emailConfirmed'] + del user['username'] + del user['email'] + storage['users']['by_slug'][user['slug']] = user # public + id_map[user['oid']] = user['slug'] + counter += 1 + ce = 0 + for entry in storage['users']['data']: + ce += migrateUser_2stage(entry, id_map) + return storage + + +def topics_handle(storage): + ''' topics from categories and 
tags ''' + counter = 0 + for t in (storage['topics']['tags'] + storage['topics']['cats']): + if t['slug'] in storage['replacements']: + t['slug'] = storage['replacements'][t['slug']] + topic = migrateTopic(t) + storage['topics']['by_oid'][t['_id']] = topic + storage['topics']['by_slug'][t['slug']] = topic + counter += 1 + else: + print('[migration] topic ' + t['slug'] + ' ignored') + for oldslug, newslug in storage['replacements'].items(): + if oldslug != newslug and oldslug in storage['topics']['by_slug']: + oid = storage['topics']['by_slug'][oldslug]['_id'] + del storage['topics']['by_slug'][oldslug] + storage['topics']['by_oid'][oid] = storage['topics']['by_slug'][newslug] + print('[migration] ' + str(counter) + ' topics migrated') + print('[migration] ' + str(len(storage['topics'] + ['by_oid'].values())) + ' topics by oid') + print('[migration] ' + str(len(storage['topics'] + ['by_slug'].values())) + ' topics by slug') + # raise Exception + return storage + + +def shouts_handle(storage, args): + ''' migrating content items one by one ''' + counter = 0 + discours_author = 0 + pub_counter = 0 + for entry in storage['shouts']['data']: + # slug + slug = get_shout_slug(entry) + + # single slug mode + if '-' in args and slug not in args: continue + + # migrate + shout = migrateShout(entry, storage) + storage['shouts']['by_oid'][entry['_id']] = shout + storage['shouts']['by_slug'][shout['slug']] = shout + # shouts.topics + if not shout['topics']: print('[migration] no topics!') + + # wuth author + author = shout['authors'][0].slug + if author == 'discours': discours_author += 1 + # print('[migration] ' + shout['slug'] + ' with author ' + author) + + if entry.get('published'): + if 'mdx' in args: export_mdx(shout) + pub_counter += 1 + + # print main counter + counter += 1 + line = str(counter+1) + ': ' + shout['slug'] + " @" + author + print(line) + + print('[migration] ' + str(counter) + ' content items were migrated') + print('[migration] ' + str(pub_counter) + ' have been published') + print('[migration] ' + str(discours_author) + ' authored by @discours') + return storage + + +def comments_handle(storage): + id_map = {} + ignored_counter = 0 + missed_shouts = {} + for oldcomment in storage['reactions']['data']: + if not oldcomment.get('deleted'): + reaction = migrateComment(oldcomment, storage) + if type(reaction) == str: + missed_shouts[reaction] = oldcomment + elif type(reaction) == Reaction: + reaction = reaction.dict() + id = reaction['id'] + oid = reaction['oid'] + id_map[oid] = id + else: + ignored_counter += 1 + + for reaction in storage['reactions']['data']: migrateComment_2stage( + reaction, id_map) + print('[migration] ' + str(len(id_map)) + ' comments migrated') + print('[migration] ' + str(ignored_counter) + ' comments ignored') + print('[migration] ' + str(len(missed_shouts.keys())) + + ' commented shouts missed') + missed_counter = 0 + for missed in missed_shouts.values(): + missed_counter += len(missed) + print('[migration] ' + str(missed_counter) + ' comments dropped') + return storage + + +def bson_handle(): + # decode bson # preparing data + from migration import bson2json + bson2json.json_tables() + + +def export_one(slug, storage): + topics_handle(storage) + users_handle(storage) + shouts_handle(storage) + export_slug(slug, storage) + + +def all_handle(storage, args): + print('[migration] handle everything') + users_handle(storage) + topics_handle(storage) + shouts_handle(storage, args) + comments_handle(storage) + # export_email_subscriptions() + print('[migration] 
done!') + + +def data_load(): + storage = { + 'content_items': { + 'by_oid': {}, + 'by_slug': {}, + }, + 'shouts': { + 'by_oid': {}, + 'by_slug': {}, + 'data': [] + }, + 'reactions': { + 'by_oid': {}, + 'by_slug': {}, + 'by_content': {}, + 'data': [] + }, + 'topics': { + 'by_oid': {}, + 'by_slug': {}, + 'cats': [], + 'tags': [], + }, + 'users': { + 'by_oid': {}, + 'by_slug': {}, + 'data': [] + }, + 'replacements': json.loads(open('migration/tables/replacements.json').read()) + } + users_data = [] + tags_data = [] + cats_data = [] + comments_data = [] + content_data = [] + try: + users_data = json.loads(open('migration/data/users.json').read()) + print('[migration] ' + str(len(users_data)) + ' users ') + tags_data = json.loads(open('migration/data/tags.json').read()) + storage['topics']['tags'] = tags_data + print('[migration] ' + str(len(tags_data)) + ' tags ') + cats_data = json.loads( + open('migration/data/content_item_categories.json').read()) + storage['topics']['cats'] = cats_data + print('[migration] ' + str(len(cats_data)) + ' cats ') + comments_data = json.loads(open('migration/data/comments.json').read()) + storage['reactions']['data'] = comments_data + print('[migration] ' + str(len(comments_data)) + ' comments ') + content_data = json.loads(open('migration/data/content_items.json').read()) + storage['shouts']['data'] = content_data + print('[migration] ' + str(len(content_data)) + ' content items ') + # fill out storage + for x in users_data: + storage['users']['by_oid'][x['_id']] = x + # storage['users']['by_slug'][x['slug']] = x + # no user.slug yet + print('[migration] ' + str(len(storage['users'] + ['by_oid'].keys())) + ' users by oid') + for x in tags_data: + storage['topics']['by_oid'][x['_id']] = x + storage['topics']['by_slug'][x['slug']] = x + for x in cats_data: + storage['topics']['by_oid'][x['_id']] = x + storage['topics']['by_slug'][x['slug']] = x + print('[migration] ' + str(len(storage['topics'] + ['by_slug'].keys())) + ' topics by slug') + for item in content_data: + slug = get_shout_slug(item) + storage['content_items']['by_slug'][slug] = item + storage['content_items']['by_oid'][item['_id']] = item + print('[migration] ' + str(len(content_data)) + ' content items') + for x in comments_data: + storage['reactions']['by_oid'][x['_id']] = x + cid = x['contentItem'] + storage['reactions']['by_content'][cid] = x + ci = storage['content_items']['by_oid'].get(cid, {}) + if 'slug' in ci: storage['reactions']['by_slug'][ci['slug']] = x + print('[migration] ' + str(len(storage['reactions'] + ['by_content'].keys())) + ' with comments') + except Exception as e: raise e + storage['users']['data'] = users_data + storage['topics']['tags'] = tags_data + storage['topics']['cats'] = cats_data + storage['shouts']['data'] = content_data + storage['reactions']['data'] = comments_data + return storage + + +def mongo_download(url): + if not url: raise Exception('\n\nYou should set MONGODB_URL enviroment variable\n') + print('[migration] mongodump ' + url) + subprocess.call([ + 'mongodump', + '--uri', url + '/?authSource=admin', + '--forceTableScan', + ], stderr = subprocess.STDOUT) + + +def create_pgdump(): + pgurl = DB_URL + if not pgurl: raise Exception('\n\nYou should set DATABASE_URL enviroment variable\n') + subprocess.call( + [ 'pg_dump', pgurl, '-f', TODAY + '-pgdump.sql'], + stderr = subprocess.STDOUT + ) + subprocess.call([ + 'scp', + TODAY + '-pgdump.sql', + 'root@build.discours.io:/root/.' 
+ ]) + + +def handle_auto(): + print('[migration] no command given, auto mode') + mongo_download(os.getenv('MONGODB_URL')) + bson_handle() + all_handle(data_load(), sys.argv) + create_pgdump() + +def migrate(): + if len(sys.argv) > 1: + cmd=sys.argv[1] + if type(cmd) == str: print('[migration] command: ' + cmd) + if cmd == 'mongodb': + mongo_download(sys.argv[2]) + elif cmd == 'bson': + bson_handle() + else: + storage=data_load() + if cmd == '-': export_one(sys.argv[2], storage) + else: all_handle(storage, sys.argv) + elif len(sys.argv) == 1: + handle_auto() + else: + print('[migration] usage: python migrate.py ') + print('[migration] commands: mongodb, bson, all, all mdx, - ') + +if __name__ == '__main__': + migrate() diff --git a/migration/__init__.py b/migration/__init__.py new file mode 100644 index 00000000..e2750039 --- /dev/null +++ b/migration/__init__.py @@ -0,0 +1 @@ +__all__ = ["tables", "bson2json", "html2md"] \ No newline at end of file diff --git a/migration/bson2json.py b/migration/bson2json.py new file mode 100644 index 00000000..ba2802db --- /dev/null +++ b/migration/bson2json.py @@ -0,0 +1,28 @@ +import os +import bson +import json + +from migration.utils import DateTimeEncoder + +def json_tables(): + print('[migration] unpack dump/discours/*.bson to migration/data/*.json') + data = { + "content_items": [], + "content_item_categories": [], + "tags": [], + "email_subscriptions": [], + "users": [], + "comments": [] + } + for table in data.keys(): + lc = [] + with open('dump/discours/'+table+'.bson', 'rb') as f: + bs = f.read() + f.close() + base = 0 + while base < len(bs): + base, d = bson.decode_document(bs, base) + lc.append(d) + data[table] = lc + open(os.getcwd() + '/migration/data/'+table+'.json', 'w').write(json.dumps(lc,cls=DateTimeEncoder)) + diff --git a/migration/export.py b/migration/export.py new file mode 100644 index 00000000..d4463aa8 --- /dev/null +++ b/migration/export.py @@ -0,0 +1,105 @@ + +from datetime import datetime +import json +import os +import frontmatter +from migration.extract import extract_html, prepare_html_body +from migration.utils import DateTimeEncoder + +OLD_DATE = '2016-03-05 22:22:00.350000' +EXPORT_DEST = '../discoursio-web/data/' +parentDir = '/'.join(os.getcwd().split('/')[:-1]) +contentDir = parentDir + '/discoursio-web/content/' +ts = datetime.now() + +def get_metadata(r): + authors = [] + for a in r['authors']: + authors.append({ # a short version for public listings + 'slug': a.slug or 'discours', + 'name': a.name or 'Дискурс', + 'userpic': a.userpic or 'https://discours.io/static/img/discours.png' + }) + metadata = {} + metadata['title'] = r.get('title', '').replace('{', '(').replace('}', ')') + metadata['authors'] = authors + metadata['createdAt'] = r.get('createdAt', ts) + metadata['layout'] = r['layout'] + metadata['topics'] = [topic for topic in r['topics']] + metadata['topics'].sort() + if r.get('cover', False): metadata['cover'] = r.get('cover') + return metadata + +def export_mdx(r): + # print('[export] mdx %s' % r['slug']) + content = '' + metadata = get_metadata(r) + content = frontmatter.dumps(frontmatter.Post(r['body'], **metadata)) + ext = 'mdx' + filepath = contentDir + r['slug'] + bc = bytes(content,'utf-8').decode('utf-8','ignore') + open(filepath + '.' 
+ ext, 'w').write(bc) + +def export_body(shout, storage): + entry = storage['content_items']['by_oid'][shout['oid']] + if entry: + shout['body'] = prepare_html_body(entry) # prepare_md_body(entry) + export_mdx(shout) + print('[export] html for %s' % shout['slug']) + body = extract_html(entry) + open(contentDir + shout['slug'] + '.html', 'w').write(body) + else: + raise Exception('no content_items entry found') + +def export_slug(slug, storage): + shout = storage['shouts']['by_slug'][slug] + shout = storage['shouts']['by_slug'].get(slug) + assert shout, '[export] no shout found by slug: %s ' % slug + author = shout['authors'][0] + assert author, '[export] no author error' + export_body(shout, storage) + +def export_email_subscriptions(): + email_subscriptions_data = json.loads(open('migration/data/email_subscriptions.json').read()) + for data in email_subscriptions_data: + # migrate_email_subscription(data) + pass + print('[migration] ' + str(len(email_subscriptions_data)) + ' email subscriptions exported') + +def export_shouts(storage): + # update what was just migrated or load json again + if len(storage['users']['by_slugs'].keys()) == 0: + storage['users']['by_slugs'] = json.loads(open(EXPORT_DEST + 'authors.json').read()) + print('[migration] ' + str(len(storage['users']['by_slugs'].keys())) + ' exported authors ') + if len(storage['shouts']['by_slugs'].keys()) == 0: + storage['shouts']['by_slugs'] = json.loads(open(EXPORT_DEST + 'articles.json').read()) + print('[migration] ' + str(len(storage['shouts']['by_slugs'].keys())) + ' exported articles ') + for slug in storage['shouts']['by_slugs'].keys(): export_slug(slug, storage) + +def export_json(export_articles = {}, export_authors = {}, export_topics = {}, export_comments = {}): + open(EXPORT_DEST + 'authors.json', 'w').write(json.dumps(export_authors, + cls=DateTimeEncoder, + indent=4, + sort_keys=True, + ensure_ascii=False)) + print('[migration] ' + str(len(export_authors.items())) + ' authors exported') + open(EXPORT_DEST + 'topics.json', 'w').write(json.dumps(export_topics, + cls=DateTimeEncoder, + indent=4, + sort_keys=True, + ensure_ascii=False)) + print('[migration] ' + str(len(export_topics.keys())) + ' topics exported') + + open(EXPORT_DEST + 'articles.json', 'w').write(json.dumps(export_articles, + cls=DateTimeEncoder, + indent=4, + sort_keys=True, + ensure_ascii=False)) + print('[migration] ' + str(len(export_articles.items())) + ' articles exported') + open(EXPORT_DEST + 'comments.json', 'w').write(json.dumps(export_comments, + cls=DateTimeEncoder, + indent=4, + sort_keys=True, + ensure_ascii=False)) + print('[migration] ' + str(len(export_comments.items())) + ' exported articles with comments') + diff --git a/migration/extract.py b/migration/extract.py new file mode 100644 index 00000000..c8220609 --- /dev/null +++ b/migration/extract.py @@ -0,0 +1,324 @@ +import os +import re +import base64 +from migration.html2text import html2text + +TOOLTIP_REGEX = r'(\/\/\/(.+)\/\/\/)' +contentDir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'discoursio-web', 'content') +s3 = 'https://discours-io.s3.amazonaws.com/' +cdn = 'https://assets.discours.io' + +def replace_tooltips(body): + # FIXME: if you prefer regexp + newbody = body + matches = list(re.finditer(TOOLTIP_REGEX, body, re.IGNORECASE | re.MULTILINE))[1:] + for match in matches: + newbody = body.replace(match.group(1), '') # FIXME: doesn't work + if len(matches) > 0: + print('[extract] found %d tooltips' % len(matches)) + return newbody + + +def 
place_tooltips(body): + parts = body.split('&&&') + l = len(parts) + newparts = list(parts) + placed = False + if l & 1: + if l > 1: + i = 1 + print('[extract] found %d tooltips' % (l-1)) + for part in parts[1:]: + if i & 1: + placed = True + if 'a class="footnote-url" href=' in part: + print('[extract] footnote: ' + part) + fn = 'a class="footnote-url" href="' + link = part.split(fn,1)[1].split('"', 1)[0] + extracted_part = part.split(fn,1)[0] + ' ' + part.split('/', 1)[-1] + newparts[i] = '' + extracted_part + '' + else: + newparts[i] = '%s' % part + # print('[extract] ' + newparts[i]) + else: + # print('[extract] ' + part[:10] + '..') + newparts[i] = part + i += 1 + return (''.join(newparts), placed) + +IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,((?:[A-Za-z\d+\/]{4})*(?:[A-Za-z\d+\/]{3}=|[A-Za-z\d+\/]{2}==)))\)" + +parentDir = '/'.join(os.getcwd().split('/')[:-1]) +public = parentDir + '/discoursio-web/public' +cache = {} + + +def reextract_images(body, oid): + # FIXME: if you prefer regexp + matches = list(re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE))[1:] + i = 0 + for match in matches: + print('[extract] image ' + match.group(1)) + ext = match.group(3) + name = oid + str(i) + link = public + '/upload/image-' + name + '.' + ext + img = match.group(4) + title = match.group(1) # FIXME: this is not the title + if img not in cache: + content = base64.b64decode(img + '==') + print(str(len(img)) + ' image bytes been written') + open('../' + link, 'wb').write(content) + cache[img] = name + i += 1 + else: + print('[extract] image cached ' + cache[img]) + body.replace(str(match), '![' + title + '](' + cdn + link + ')') # FIXME: this does not work + return body + +IMAGES = { + 'data:image/png': 'png', + 'data:image/jpg': 'jpg', + 'data:image/jpeg': 'jpg', +} + +b64 = ';base64,' + +def extract_imageparts(bodyparts, prefix): + # recursive loop + newparts = list(bodyparts) + for current in bodyparts: + i = bodyparts.index(current) + for mime in IMAGES.keys(): + if mime == current[-len(mime):] and (i + 1 < len(bodyparts)): + print('[extract] ' + mime) + next = bodyparts[i+1] + ext = IMAGES[mime] + b64end = next.index(')') + b64encoded = next[:b64end] + name = prefix + '-' + str(len(cache)) + link = '/upload/image-' + name + '.' + ext + print('[extract] name: ' + name) + print('[extract] link: ' + link) + print('[extract] %d bytes' % len(b64encoded)) + if b64encoded not in cache: + try: + content = base64.b64decode(b64encoded + '==') + open(public + link, 'wb').write(content) + print('[extract] ' +str(len(content)) + ' image bytes been written') + cache[b64encoded] = name + except: + raise Exception + # raise Exception('[extract] error decoding image %r' %b64encoded) + else: + print('[extract] cached link ' + cache[b64encoded]) + name = cache[b64encoded] + link = cdn + '/upload/image-' + name + '.' + ext + newparts[i] = current[:-len(mime)] + current[-len(mime):] + link + next[-b64end:] + newparts[i+1] = next[:-b64end] + break + return extract_imageparts(newparts[i] + newparts[i+1] + b64.join(bodyparts[i+2:]), prefix) \ + if len(bodyparts) > (i + 1) else ''.join(newparts) + +def extract_dataimages(parts, prefix): + newparts = list(parts) + for part in parts: + i = parts.index(part) + if part.endswith(']('): + [ext, rest] = parts[i+1].split(b64) + name = prefix + '-' + str(len(cache)) + if ext == '/jpeg': ext = 'jpg' + else: ext = ext.replace('/', '') + link = '/upload/image-' + name + '.' 
+ ext + print('[extract] filename: ' + link) + b64end = rest.find(')') + if b64end !=-1: + b64encoded = rest[:b64end] + print('[extract] %d text bytes' % len(b64encoded)) + # write if not cached + if b64encoded not in cache: + try: + content = base64.b64decode(b64encoded + '==') + open(public + link, 'wb').write(content) + print('[extract] ' +str(len(content)) + ' image bytes') + cache[b64encoded] = name + except: + raise Exception + # raise Exception('[extract] error decoding image %r' %b64encoded) + else: + print('[extract] 0 image bytes, cached for ' + cache[b64encoded]) + name = cache[b64encoded] + + # update link with CDN + link = cdn + '/upload/image-' + name + '.' + ext + + # patch newparts + newparts[i+1] = link + rest[b64end:] + else: + raise Exception('cannot find the end of base64 encoded string') + else: + print('[extract] dataimage skipping part ' + str(i)) + continue + return ''.join(newparts) + +di = 'data:image' + +def extract_md_images(body, oid): + newbody = '' + body = body\ + .replace('\n! []('+di, '\n ![]('+di)\ + .replace('\n[]('+di, '\n![]('+di)\ + .replace(' []('+di, ' ![]('+di) + parts = body.split(di) + i = 0 + if len(parts) > 1: newbody = extract_dataimages(parts, oid) + else: newbody = body + return newbody + + +def cleanup(body): + newbody = body\ + .replace('<', '').replace('>', '')\ + .replace('{', '(').replace('}', ')')\ + .replace('…', '...')\ + .replace(' __ ', ' ')\ + .replace('_ _', ' ')\ + .replace('****', '')\ + .replace('\u00a0', ' ')\ + .replace('\u02c6', '^')\ + .replace('\u00a0',' ')\ + .replace('\ufeff', '')\ + .replace('\u200b', '')\ + .replace('\u200c', '')\ + # .replace('\u2212', '-') + return newbody + +def extract_md(body, oid): + newbody = body + if newbody: + newbody = extract_md_images(newbody, oid) + if not newbody: raise Exception('extract_images error') + newbody = cleanup(newbody) + if not newbody: raise Exception('cleanup error') + newbody, placed = place_tooltips(newbody) + if not newbody: raise Exception('place_tooltips error') + if placed: + newbody = 'import Tooltip from \'$/components/Article/Tooltip\'\n\n' + newbody + return newbody + +def prepare_md_body(entry): + # body modifications + body = '' + kind = entry.get('type') + addon = '' + if kind == 'Video': + addon = '' + for m in entry.get('media', []): + if 'youtubeId' in m: addon += '\n' + elif 'vimeoId' in m: addon += '\n' + else: + print('[extract] media is not supported') + print(m) + body = 'import VideoPlayer from \'$/components/Article/VideoPlayer\'\n\n' + addon + + elif kind == 'Music': + addon = '' + for m in entry.get('media', []): + artist = m.get('performer') + trackname = '' + if artist: trackname += artist + ' - ' + if 'title' in m: trackname += m.get('title','') + addon += '\n' + body = 'import MusicPlayer from \'$/components/Article/MusicPlayer\'\n\n' + addon + + body_orig = extract_html(entry) + if body_orig: body += extract_md(html2text(body_orig), entry['_id']) + if not body: print('[extract] empty MDX body') + return body + +def prepare_html_body(entry): + # body modifications + body = '' + kind = entry.get('type') + addon = '' + if kind == 'Video': + addon = '' + for m in entry.get('media', []): + if 'youtubeId' in m: + addon += '\n' + elif 'vimeoId' in m: + addon += '' + else: + print('[extract] media is not supported') + print(m) + body += addon + + elif kind == 'Music': + addon = '' + for m in entry.get('media', []): + artist = m.get('performer') + trackname = '' + if artist: trackname += artist + ' - ' + if 'title' in m: trackname += m.get('title','') 
+ addon += '<figure><figcaption>'
+ addon += trackname
+ addon += '</figcaption></figure>
' + body += addon + + body = extract_html(entry) + # if body_orig: body += extract_md(html2text(body_orig), entry['_id']) + if not body: print('[extract] empty HTML body') + return body + +def extract_html(entry): + body_orig = entry.get('body') or '' + media = entry.get('media', []) + kind = entry.get('type') or '' + print('[extract] kind: ' + kind) + mbodies = set([]) + if media: + # print('[extract] media is found') + for m in media: + mbody = m.get('body', '') + addon = '' + if kind == 'Literature': + mbody = m.get('literatureBody') or m.get('body', '') + elif kind == 'Image': + cover = '' + if 'thumborId' in entry: cover = cdn + '/unsafe/1600x/' + entry['thumborId'] + if not cover: + if 'image' in entry: cover = entry['image'].get('url', '') + if 'cloudinary' in cover: cover = '' + # else: print('[extract] cover: ' + cover) + title = m.get('title','').replace('\n', ' ').replace(' ', ' ') + u = m.get('thumborId') or cover or '' + if title: addon += '
<figure><figcaption>' + title + '</figcaption></figure>
\n' + if not u.startswith('http'): u = s3 + u + if not u: print('[extract] no image url for ' + str(m)) + if 'cloudinary' in u: u = 'img/lost.svg' + if u != cover or (u == cover and media.index(m) == 0): + addon += '\"'+\n' + if addon: + body_orig += addon + # print('[extract] item addon: ' + addon) + # if addon: print('[extract] addon: %s' % addon) + if mbody and mbody not in mbodies: + mbodies.add(mbody) + body_orig += mbody + if len(list(mbodies)) != len(media): + print('[extract] %d/%d media item bodies appended' % (len(list(mbodies)),len(media))) + # print('[extract] media items body: \n' + body_orig) + if not body_orig: + for up in entry.get('bodyHistory', []) or []: + body_orig = up.get('text', '') or '' + if body_orig: + print('[extract] got html body from history') + break + if not body_orig: print('[extract] empty HTML body') + # body_html = str(BeautifulSoup(body_orig, features="html.parser")) + return body_orig \ No newline at end of file diff --git a/migration/html2text/__init__.py b/migration/html2text/__init__.py new file mode 100644 index 00000000..26810d42 --- /dev/null +++ b/migration/html2text/__init__.py @@ -0,0 +1,1041 @@ +"""html2text: Turn HTML into equivalent Markdown-structured text.""" + +import html.entities +import html.parser +import re +import string +import urllib.parse as urlparse +from textwrap import wrap +from typing import Dict, List, Optional, Tuple, Union + +from . import config +from .elements import AnchorElement, ListElement +from .typing import OutCallback +from .utils import ( + dumb_css_parser, + element_style, + escape_md, + escape_md_section, + google_fixed_width_font, + google_has_height, + google_list_style, + google_text_emphasis, + hn, + list_numbering_start, + pad_tables_in_text, + skipwrap, + unifiable_n, +) + +__version__ = (2020, 1, 16) + + +# TODO: +# Support decoded entities with UNIFIABLE. + + +class HTML2Text(html.parser.HTMLParser): + def __init__( + self, + out: Optional[OutCallback] = None, + baseurl: str = "", + bodywidth: int = config.BODY_WIDTH, + ) -> None: + """ + Input parameters: + out: possible custom replacement for self.outtextf (which + appends lines of text). 
+ baseurl: base URL of the document we process + """ + super().__init__(convert_charrefs=False) + + # Config options + self.split_next_td = False + self.td_count = 0 + self.table_start = False + self.unicode_snob = config.UNICODE_SNOB # covered in cli + self.escape_snob = config.ESCAPE_SNOB # covered in cli + self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH + self.body_width = bodywidth # covered in cli + self.skip_internal_links = config.SKIP_INTERNAL_LINKS # covered in cli + self.inline_links = config.INLINE_LINKS # covered in cli + self.protect_links = config.PROTECT_LINKS # covered in cli + self.google_list_indent = config.GOOGLE_LIST_INDENT # covered in cli + self.ignore_links = config.IGNORE_ANCHORS # covered in cli + self.ignore_mailto_links = config.IGNORE_MAILTO_LINKS # covered in cli + self.ignore_images = config.IGNORE_IMAGES # covered in cli + self.images_as_html = config.IMAGES_AS_HTML # covered in cli + self.images_to_alt = config.IMAGES_TO_ALT # covered in cli + self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli + self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli + self.bypass_tables = config.BYPASS_TABLES # covered in cli + self.ignore_tables = config.IGNORE_TABLES # covered in cli + self.google_doc = False # covered in cli + self.ul_item_mark = "*" # covered in cli + self.emphasis_mark = "_" # covered in cli + self.strong_mark = "**" + self.single_line_break = config.SINGLE_LINE_BREAK # covered in cli + self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli + self.hide_strikethrough = False # covered in cli + self.mark_code = config.MARK_CODE + self.wrap_list_items = config.WRAP_LIST_ITEMS # covered in cli + self.wrap_links = config.WRAP_LINKS # covered in cli + self.wrap_tables = config.WRAP_TABLES + self.pad_tables = config.PAD_TABLES # covered in cli + self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli + self.tag_callback = None + self.open_quote = config.OPEN_QUOTE # covered in cli + self.close_quote = config.CLOSE_QUOTE # covered in cli + self.header_id = None + self.span_highlight = False + self.span_lead = False + + if out is None: + self.out = self.outtextf + else: + self.out = out + + # empty list to store output characters before they are "joined" + self.outtextlist = [] # type: List[str] + + self.quiet = 0 + self.p_p = 0 # number of newline character to print before next output + self.outcount = 0 + self.start = True + self.space = False + self.a = [] # type: List[AnchorElement] + self.astack = [] # type: List[Optional[Dict[str, Optional[str]]]] + self.maybe_automatic_link = None # type: Optional[str] + self.empty_link = False + self.absolute_url_matcher = re.compile(r"^[a-zA-Z+]+://") + self.acount = 0 + self.list = [] # type: List[ListElement] + self.blockquote = 0 + self.pre = False + self.startpre = False + self.code = False + self.quote = False + self.br_toggle = "" + self.lastWasNL = False + self.lastWasList = False + self.style = 0 + self.style_def = {} # type: Dict[str, Dict[str, str]] + self.tag_stack = ( + [] + ) # type: List[Tuple[str, Dict[str, Optional[str]], Dict[str, str]]] + self.emphasis = 0 + self.drop_white_space = 0 + self.inheader = False + # Current abbreviation definition + self.abbr_title = None # type: Optional[str] + # Last inner HTML (for abbr being defined) + self.abbr_data = None # type: Optional[str] + # Stack of abbreviations to write later + self.abbr_list = {} # type: Dict[str, str] + self.baseurl = baseurl + self.stressed = False + self.preceding_stressed = False 
+ self.preceding_data = "" + self.current_tag = "" + self.current_class = "" + + config.UNIFIABLE["nbsp"] = " _place_holder;" + + def feed(self, data: str) -> None: + data = data.replace("", "") + super().feed(data) + + def handle(self, data: str) -> str: + self.feed(data) + self.feed("") + markdown = self.optwrap(self.finish()) + if self.pad_tables: + return pad_tables_in_text(markdown) + else: + return markdown + + def outtextf(self, s: str) -> None: + self.outtextlist.append(s) + if s: + self.lastWasNL = s[-1] == "\n" + + def finish(self) -> str: + self.close() + + self.pbr() + self.o("", force="end") + + outtext = "".join(self.outtextlist) + + if self.unicode_snob: + nbsp = html.entities.html5["nbsp;"] + else: + nbsp = " " + outtext = outtext.replace(" _place_holder;", nbsp) + + # Clear self.outtextlist to avoid memory leak of its content to + # the next handling. + self.outtextlist = [] + + return outtext + + def handle_charref(self, c: str) -> None: + self.handle_data(self.charref(c), True) + + def handle_entityref(self, c: str) -> None: + ref = self.entityref(c) + + # ref may be an empty string (e.g. for ‎/‏ markers that should + # not contribute to the final output). + # self.handle_data cannot handle a zero-length string right after a + # stressed tag or mid-text within a stressed tag (text get split and + # self.stressed/self.preceding_stressed gets switched after the first + # part of that text). + if ref: + self.handle_data(ref, True) + + def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None: + self.handle_tag(tag, dict(attrs), start=True) + + def handle_endtag(self, tag: str) -> None: + self.handle_tag(tag, {}, start=False) + + def previousIndex(self, attrs: Dict[str, Optional[str]]) -> Optional[int]: + """ + :type attrs: dict + + :returns: The index of certain set of attributes (of a link) in the + self.a list. 
If the set of attributes is not found, returns None + :rtype: int + """ + if "href" not in attrs: + return None + + match = False + for i, a in enumerate(self.a): + if "href" in a.attrs and a.attrs["href"] == attrs["href"]: + if "title" in a.attrs or "title" in attrs: + if ( + "title" in a.attrs + and "title" in attrs + and a.attrs["title"] == attrs["title"] + ): + match = True + else: + match = True + + if match: + return i + return None + + def handle_emphasis( + self, start: bool, tag_style: Dict[str, str], parent_style: Dict[str, str] + ) -> None: + """ + Handles various text emphases + """ + tag_emphasis = google_text_emphasis(tag_style) + parent_emphasis = google_text_emphasis(parent_style) + + # handle Google's text emphasis + strikethrough = "line-through" in tag_emphasis and self.hide_strikethrough + + # google and others may mark a font's weight as `bold` or `700` + bold = False + for bold_marker in config.BOLD_TEXT_STYLE_VALUES: + bold = bold_marker in tag_emphasis and bold_marker not in parent_emphasis + if bold: + break + + italic = "italic" in tag_emphasis and "italic" not in parent_emphasis + fixed = ( + google_fixed_width_font(tag_style) + and not google_fixed_width_font(parent_style) + and not self.pre + ) + + if start: + # crossed-out text must be handled before other attributes + # in order not to output qualifiers unnecessarily + if bold or italic or fixed: + self.emphasis += 1 + if strikethrough: + self.quiet += 1 + if italic: + self.o(self.emphasis_mark) + self.drop_white_space += 1 + if bold: + self.o(self.strong_mark) + self.drop_white_space += 1 + if fixed: + self.o("`") + self.drop_white_space += 1 + self.code = True + else: + if bold or italic or fixed: + # there must not be whitespace before closing emphasis mark + self.emphasis -= 1 + self.space = False + if fixed: + if self.drop_white_space: + # empty emphasis, drop it + self.drop_white_space -= 1 + else: + self.o("`") + self.code = False + if bold: + if self.drop_white_space: + # empty emphasis, drop it + self.drop_white_space -= 1 + else: + self.o(self.strong_mark) + if italic: + if self.drop_white_space: + # empty emphasis, drop it + self.drop_white_space -= 1 + else: + self.o(self.emphasis_mark) + # space is only allowed after *all* emphasis marks + if (bold or italic) and not self.emphasis: + self.o(" ") + if strikethrough: + self.quiet -= 1 + + def handle_tag( + self, tag: str, attrs: Dict[str, Optional[str]], start: bool + ) -> None: + self.current_tag = tag + + if self.tag_callback is not None: + if self.tag_callback(self, tag, attrs, start) is True: + return + + # first thing inside the anchor tag is another tag + # that produces some output + if ( + start + and self.maybe_automatic_link is not None + and tag not in ["p", "div", "style", "dl", "dt"] + and (tag != "img" or self.ignore_images) + ): + self.o("[") + self.maybe_automatic_link = None + self.empty_link = False + + if self.google_doc: + # the attrs parameter is empty for a closing tag. in addition, we + # need the attributes of the parent nodes in order to get a + # complete style description for the current element. we assume + # that google docs export well formed html. 
+ parent_style = {} # type: Dict[str, str] + if start: + if self.tag_stack: + parent_style = self.tag_stack[-1][2] + tag_style = element_style(attrs, self.style_def, parent_style) + self.tag_stack.append((tag, attrs, tag_style)) + else: + dummy, attrs, tag_style = ( + self.tag_stack.pop() if self.tag_stack else (None, {}, {}) + ) + if self.tag_stack: + parent_style = self.tag_stack[-1][2] + + if hn(tag): + # check if nh is inside of an 'a' tag + # (incorrect but found in the wild) + if self.astack: + if start: + self.inheader = True + # are inside link name, so only add '#' if it can appear before '[' + if self.outtextlist and self.outtextlist[-1] == "[": + self.outtextlist.pop() + self.space = False + self.o(hn(tag) * "#" + " ") + self.o("[") + self.header_id = attrs.get('id') + else: + self.p() + if start: + self.inheader = True + self.o(hn(tag) * "#" + " ") + if self.header_id: + self.o(' {#' + self.header_id + '}') + self.header_id = None + else: + self.inheader = False + return # prevent redundant emphasis marks on headers + + if 'class' in attrs: + self.current_class = attrs.get('class', '') + # self.p() + if not start: + self.current_class = '' + + if tag == 'span': + if 'style' in attrs: + if attrs.get('style') == 'text-align: center': + self.current_class = 'center' + if not start: + self.current_class = '' + if start: + if self.current_class == 'highlight' and \ + self.inheader == False and \ + self.span_lead == False and \ + self.astack == False: + self.o('`') # NOTE: same as + self.span_highlight = True + elif self.current_class == 'lead' and \ + self.inheader == False and \ + self.span_highlight == False: + #self.o("==") # NOTE: CriticMarkup {== + self.span_lead = True + else: + if self.span_highlight: + self.o('`') + self.span_highlight = False + elif self.span_lead: + #self.o('==') + self.span_lead = False + + if tag in ["p", "div"]: + if self.google_doc: + if start and google_has_height(tag_style): + self.p() + else: + self.soft_br() + elif self.astack or self.inheader: + pass + else: + self.p() + + if tag == "br" and start: + if self.blockquote > 0: + self.o(" \n> ") + else: + self.o(" \n") + + if tag == "hr" and start: + self.p() + self.o("* * *") + self.p() + + if tag in ["head", "style", "script"]: + if start: + self.quiet += 1 + else: + self.quiet -= 1 + + if tag == "style": + if start: + self.style += 1 + else: + self.style -= 1 + + if tag in ["body"]: + self.quiet = 0 # sites like 9rules.com never close + + if tag == "blockquote": + if start: + self.p() + self.o("> ", force=True) + self.start = True + self.blockquote += 1 + else: + self.blockquote -= 1 + self.p() + + if tag in ["em", "i", "u"] and not self.ignore_emphasis: + # Separate with a space if we immediately follow an alphanumeric + # character, since otherwise Markdown won't render the emphasis + # marks, and we'll be left with eg 'foo_bar_' visible. + # (Don't add a space otherwise, though, since there isn't one in the + # original HTML.) + if ( + start + and self.preceding_data + and self.preceding_data[-1] not in string.whitespace + and self.preceding_data[-1] not in string.punctuation + ): + emphasis = " " + self.emphasis_mark + self.preceding_data += " " + else: + emphasis = self.emphasis_mark + + self.o(emphasis) + if start: + self.stressed = True + + if tag in ["strong", "b"] and not self.ignore_emphasis: + # Separate with space if we immediately follow an * character, since + # without it, Markdown won't render the resulting *** correctly. 
+ # (Don't add a space otherwise, though, since there isn't one in the + # original HTML.) + if not self.inheader and not self.astack \ + and not self.span_lead and not self.span_highlight: + if ( + start + and self.preceding_data + and self.preceding_data[-1] == self.strong_mark[0] + ): + strong = " " + self.strong_mark + self.preceding_data += " " + else: + strong = self.strong_mark + + self.o(strong) + if start: + self.stressed = True + + if tag in ["del", "strike", "s"]: + if start and self.preceding_data and self.preceding_data[-1] == "~": + strike = " ~~" + self.preceding_data += " " + else: + strike = "~~" + + self.o(strike) + if start: + self.stressed = True + + if self.google_doc: + if not self.inheader: + # handle some font attributes, but leave headers clean + self.handle_emphasis(start, tag_style, parent_style) + + if tag in ["kbd", "code", "tt"] and not self.pre: + self.o("`") # TODO: `` `this` `` + self.code = not self.code + + if tag == "abbr": + if start: + self.abbr_title = None + self.abbr_data = "" + if "title" in attrs: + self.abbr_title = attrs["title"] + else: + if self.abbr_title is not None: + assert self.abbr_data is not None + self.abbr_list[self.abbr_data] = self.abbr_title + self.abbr_title = None + self.abbr_data = None + + if tag == "q": + if not self.quote: + self.o(self.open_quote) + else: + self.o(self.close_quote) + self.quote = not self.quote + + def link_url(self: HTML2Text, link: str, title: str = "") -> None: + url = urlparse.urljoin(self.baseurl, link) + title = ' "{}"'.format(title) if title.strip() else "" + self.o("]({url}{title})".format(url=escape_md(url), title=title)) + + if tag == "a" and not self.ignore_links: + if start: + if 'data-original-title' in attrs: + # WARNING: old discours specific code + self.o('&&&%s&&&' % attrs['data-original-title']) + else: + if ( + "href" in attrs + and not attrs["href"].startswith('#_ftn') + and attrs["href"] is not None + and not (self.skip_internal_links and attrs["href"].startswith("#")) + and not (self.ignore_mailto_links and attrs["href"].startswith("mailto:")) + ): + self.astack.append(attrs) + self.maybe_automatic_link = attrs["href"] + self.empty_link = True + if self.protect_links: + attrs["href"] = "<" + attrs["href"] + ">" + else: + self.astack.append(None) + else: + if self.astack: + a = self.astack.pop() + if self.maybe_automatic_link and not self.empty_link: + self.maybe_automatic_link = None + elif a: + assert a["href"] is not None + if self.empty_link: + self.o("[") + self.empty_link = False + self.maybe_automatic_link = None + if self.inline_links: + self.p_p = 0 + title = a.get("title") or "" + title = escape_md(title) + link_url(self, a["href"], title) + else: + i = self.previousIndex(a) + if i is not None: + a_props = self.a[i] + else: + self.acount += 1 + a_props = AnchorElement(a, self.acount, self.outcount) + self.a.append(a_props) + self.o("][" + str(a_props.count) + "]") + + if tag == "img" and start and not self.ignore_images: + # skip cloudinary images + if "src" in attrs and 'cloudinary' not in attrs['src']: + assert attrs["src"] is not None + if not self.images_to_alt: + attrs["href"] = attrs["src"] + alt = attrs.get("alt") or self.default_image_alt + + # If we have images_with_size, write raw html including width, + # height, and alt attributes + if self.images_as_html or ( + self.images_with_size and ("width" in attrs or "height" in attrs) + ): + self.o("") + return + + # If we have a link to create, output the start + if self.maybe_automatic_link is not None: + href = 
self.maybe_automatic_link + if ( + self.images_to_alt + and escape_md(alt) == href + and self.absolute_url_matcher.match(href) + ): + self.o("<" + escape_md(alt) + ">") + self.empty_link = False + return + else: + self.o("[") + self.maybe_automatic_link = None + self.empty_link = False + + # If we have images_to_alt, we discard the image itself, + # considering only the alt text. + if self.images_to_alt: + self.o(escape_md(alt)) + else: + self.o("![" + escape_md(alt) + "]") + if self.inline_links: + href = attrs.get("href") or "" + self.o( + "(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")" + ) + else: + i = self.previousIndex(attrs) + if i is not None: + a_props = self.a[i] + else: + self.acount += 1 + a_props = AnchorElement(attrs, self.acount, self.outcount) + self.a.append(a_props) + self.o("[" + str(a_props.count) + "]") + + if tag == "dl" and start: + self.p() + if tag == "dt" and not start: + self.pbr() + if tag == "dd" and start: + self.o(" ") + if tag == "dd" and not start: + self.pbr() + + if tag in ["ol", "ul"]: + # Google Docs create sub lists as top level lists + if not self.list and not self.lastWasList: + self.p() + if start: + if self.google_doc: + list_style = google_list_style(tag_style) + else: + list_style = tag + numbering_start = list_numbering_start(attrs) + self.list.append(ListElement(list_style, numbering_start)) + else: + if self.list: + self.list.pop() + if not self.google_doc and not self.list: + self.o("\n") + self.lastWasList = True + else: + self.lastWasList = False + + if tag == "li": + self.pbr() + if start: + if self.list: + li = self.list[-1] + else: + li = ListElement("ul", 0) + if self.google_doc: + self.o(" " * self.google_nest_count(tag_style)) + else: + # Indent two spaces per list, except use three spaces for an + # unordered list inside an ordered list. + # https://spec.commonmark.org/0.28/#motivation + # TODO: line up
  1. s > 9 correctly. + parent_list = None + for list in self.list: + self.o( + " " if parent_list == "ol" and list.name == "ul" else " " + ) + parent_list = list.name + + if li.name == "ul": + self.o(self.ul_item_mark + " ") + elif li.name == "ol": + li.num += 1 + self.o(str(li.num) + ". ") + self.start = True + + if tag in ["table", "tr", "td", "th"]: + if self.ignore_tables: + if tag == "tr": + if start: + pass + else: + self.soft_br() + else: + pass + + elif self.bypass_tables: + if start: + self.soft_br() + if tag in ["td", "th"]: + if start: + self.o("<{}>\n\n".format(tag)) + else: + self.o("\n".format(tag)) + else: + if start: + self.o("<{}>".format(tag)) + else: + self.o("".format(tag)) + + else: + if tag == "table": + if start: + self.table_start = True + if self.pad_tables: + self.o("<" + config.TABLE_MARKER_FOR_PAD + ">") + self.o(" \n") + else: + if self.pad_tables: + # add break in case the table is empty or its 1 row table + self.soft_br() + self.o("") + self.o(" \n") + if tag in ["td", "th"] and start: + if self.split_next_td: + self.o("| ") + self.split_next_td = True + + if tag == "tr" and start: + self.td_count = 0 + if tag == "tr" and not start: + self.split_next_td = False + self.soft_br() + if tag == "tr" and not start and self.table_start: + # Underline table header + self.o("|".join(["---"] * self.td_count)) + self.soft_br() + self.table_start = False + if tag in ["td", "th"] and start: + self.td_count += 1 + + if tag == "pre": + if start: + self.startpre = True + self.pre = True + else: + self.pre = False + if self.mark_code: + self.out("\n[/code]") + self.p() + + # TODO: Add docstring for these one letter functions + def pbr(self) -> None: + "Pretty print has a line break" + if self.p_p == 0: + self.p_p = 1 + + def p(self) -> None: + "Set pretty print to 1 or 2 lines" + self.p_p = 1 if self.single_line_break else 2 + + def soft_br(self) -> None: + "Soft breaks" + self.pbr() + self.br_toggle = " " + + def o( + self, data: str, puredata: bool = False, force: Union[bool, str] = False + ) -> None: + """ + Deal with indentation and whitespace + """ + if self.abbr_data is not None: + self.abbr_data += data + + if not self.quiet: + if self.google_doc: + # prevent white space immediately after 'begin emphasis' + # marks ('**' and '_') + lstripped_data = data.lstrip() + if self.drop_white_space and not (self.pre or self.code): + data = lstripped_data + if lstripped_data != "": + self.drop_white_space = 0 + + if puredata and not self.pre: + # This is a very dangerous call ... it could mess up + # all handling of   when not handled properly + # (see entityref) + data = re.sub(r"\s+", r" ", data) + if data and data[0] == " ": + self.space = True + data = data[1:] + if not data and not force: + return + + if self.startpre: + # self.out(" :") #TODO: not output when already one there + if not data.startswith("\n") and not data.startswith("\r\n"): + #
<pre>stuff...
    +					data = "\n" + data
    +				if self.mark_code:
    +					self.out("\n[code]")
    +					self.p_p = 0
    +
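+			# one ">" per open blockquote level; a trailing space is added unless the data already starts with ">"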
    +			bq = ">" * self.blockquote
    +			if not (force and data and data[0] == ">") and self.blockquote:
    +				bq += " "
    +
    +			if self.pre:
    +				if not self.list:
    +					bq += "    "
    +				# else: list content is already partially indented
    +				bq += "    " * len(self.list)
    +				data = data.replace("\n", "\n" + bq)
    +
    +			if self.startpre:
    +				self.startpre = False
    +				if self.list:
    +					# use existing initial indentation
    +					data = data.lstrip("\n")
    +
    +			if self.start:
    +				self.space = False
    +				self.p_p = 0
    +				self.start = False
    +
    +			if force == "end":
    +				# It's the end.
    +				self.p_p = 0
    +				self.out("\n")
    +				self.space = False
    +
    +			if self.p_p:
    +				self.out((self.br_toggle + "\n" + bq) * self.p_p)
    +				self.space = False
    +				self.br_toggle = ""
    +
    +			if self.space:
    +				if not self.lastWasNL:
    +					self.out(" ")
    +				self.space = False
    +
    +			if self.a and (
    +				(self.p_p == 2 and self.links_each_paragraph) or force == "end"
    +			):
    +				if force == "end":
    +					self.out("\n")
    +
    +				newa = []
    +				for link in self.a:
    +					if self.outcount > link.outcount:
    +						self.out(
    +							"   ["
    +							+ str(link.count)
    +							+ "]: "
    +							+ urlparse.urljoin(self.baseurl, link.attrs["href"])
    +						)
    +						if "title" in link.attrs:
    +							assert link.attrs["title"] is not None
    +							self.out(" (" + link.attrs["title"] + ")")
    +						self.out("\n")
    +					else:
    +						newa.append(link)
    +
    +				# Don't need an extra line when nothing was done.
    +				if self.a != newa:
    +					self.out("\n")
    +
    +				self.a = newa
    +
    +			if self.abbr_list and force == "end":
    +				for abbr, definition in self.abbr_list.items():
    +					self.out("  *[" + abbr + "]: " + definition + "\n")
    +
    +			self.p_p = 0
    +			self.out(data)
    +			self.outcount += 1
    +
    +	def handle_data(self, data: str, entity_char: bool = False) -> None:
    +		if not data:
    +			# Data may be empty for some HTML entities. For example,
    +			# LEFT-TO-RIGHT MARK.
    +			return
    +
    +		if self.stressed:
    +			data = data.strip()
    +			self.stressed = False
    +			self.preceding_stressed = True
    +		elif self.preceding_stressed:
    +			if (
    +				re.match(r"[^][(){}\s.!?]", data[0])
    +				and not hn(self.current_tag)
    +				and self.current_tag not in ["a", "code", "pre"]
    +			):
    +				# should match a letter or common punctuation
    +				data = " " + data
    +			self.preceding_stressed = False
    +
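+		# while inside a <style> block, collect parsed CSS declarations for google-doc emphasis handling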
    +		if self.style:
    +			self.style_def.update(dumb_css_parser(data))
    +
    +		if self.maybe_automatic_link is not None:
    +			href = self.maybe_automatic_link
    +			if (
    +				href == data
    +				and self.absolute_url_matcher.match(href)
    +				and self.use_automatic_links
    +			):
    +				self.o("<" + data + ">")
    +				self.empty_link = False
    +				return
    +			else:
    +				self.o("[")
    +				self.maybe_automatic_link = None
    +				self.empty_link = False
    +
    +		if not self.code and not self.pre and not entity_char:
    +			data = escape_md_section(data, snob=self.escape_snob)
    +		self.preceding_data = data
    +		self.o(data, puredata=True)
    +
    +	def charref(self, name: str) -> str:
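+		# e.g. charref("x41") and charref("65") both return "A"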
    +		if name[0] in ["x", "X"]:
    +			c = int(name[1:], 16)
    +		else:
    +			c = int(name)
    +
    +		if not self.unicode_snob and c in unifiable_n:
    +			return unifiable_n[c]
    +		else:
    +			try:
    +				return chr(c)
    +			except ValueError:  # invalid unicode
    +				return ""
    +
    +	def entityref(self, c: str) -> str:
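+		# e.g. entityref("amp") returns "&"; unknown names pass through as "&name;"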
    +		if not self.unicode_snob and c in config.UNIFIABLE:
    +			return config.UNIFIABLE[c]
    +		try:
    +			ch = html.entities.html5[c + ";"]
    +		except KeyError:
    +			return "&" + c + ";"
    +		return config.UNIFIABLE[c] if c == "nbsp" else ch
    +
    +	def google_nest_count(self, style: Dict[str, str]) -> int:
    +		"""
    +		Calculate the nesting count of google doc lists
    +
    +		:type style: dict
    +
    +		:rtype: int
    +		"""
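+		# e.g. {"margin-left": "72px"} with the default google_list_indent of 36 gives 2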
    +		nest_count = 0
    +		if "margin-left" in style:
    +			nest_count = int(style["margin-left"][:-2]) // self.google_list_indent
    +
    +		return nest_count
    +
    +	def optwrap(self, text: str) -> str:
    +		"""
    +		Wrap all paragraphs in the provided text.
    +
    +		:type text: str
    +
    +		:rtype: str
    +		"""
    +		if not self.body_width:
    +			return text
    +
    +		result = ""
    +		newlines = 0
    +		# I cannot think of a better solution for now.
    +		# To avoid the non-wrap behaviour for entire paras
    +		# because of the presence of a link in it
    +		if not self.wrap_links:
    +			self.inline_links = False
    +		for para in text.split("\n"):
    +			if len(para) > 0:
    +				if not skipwrap(
    +					para, self.wrap_links, self.wrap_list_items, self.wrap_tables
    +				):
    +					indent = ""
    +					if para.startswith("  " + self.ul_item_mark):
    +						# list item continuation: add a double indent to the
    +						# new lines
    +						indent = "    "
    +					elif para.startswith("> "):
    +						# blockquote continuation: add the greater than symbol
    +						# to the new lines
    +						indent = "> "
    +					wrapped = wrap(
    +						para,
    +						self.body_width,
    +						break_long_words=False,
    +						subsequent_indent=indent,
    +					)
    +					result += "\n".join(wrapped)
    +					if para.endswith("  "):
    +						result += "  \n"
    +						newlines = 1
    +					elif indent:
    +						result += "\n"
    +						newlines = 1
    +					else:
    +						result += "\n\n"
    +						newlines = 2
    +				else:
    +					# Warning for the tempted!!!
    +					# Be aware that obvious replacement of this with
    +					# line.isspace()
    +					# DOES NOT work! Explanations are welcome.
    +					if not config.RE_SPACE.match(para):
    +						result += para + "\n"
    +						newlines = 1
    +			else:
    +				if newlines < 2:
    +					result += "\n"
    +					newlines += 1
    +		return result
    +
    +
    +def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = config.BODY_WIDTH) -> str:
+	h = html.strip()
+	if h:
+		parser = HTML2Text(baseurl=baseurl, bodywidth=bodywidth)
+		h = parser.handle(h)
    +		# print('[html2text] %d bytes' % len(html))
    +	return h
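+
+# minimal usage sketch:
+#   html2text('<p>Hello, <b>world</b>!</p>')  # -> 'Hello, **world**!\n\n'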
    diff --git a/migration/html2text/__main__.py b/migration/html2text/__main__.py
    new file mode 100644
    index 00000000..4e28416e
    --- /dev/null
    +++ b/migration/html2text/__main__.py
    @@ -0,0 +1,3 @@
    +from .cli import main
    +
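+# lets the vendored converter run as `python -m migration.html2text`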
    +main()
    diff --git a/migration/html2text/cli.py b/migration/html2text/cli.py
    new file mode 100644
    index 00000000..d0c62c97
    --- /dev/null
    +++ b/migration/html2text/cli.py
    @@ -0,0 +1,322 @@
    +import argparse
    +import sys
    +
    +from . import HTML2Text, __version__, config
    +
    +
    +def main() -> None:
    +    baseurl = ""
    +
    +    class bcolors:
    +        HEADER = "\033[95m"
    +        OKBLUE = "\033[94m"
    +        OKGREEN = "\033[92m"
    +        WARNING = "\033[93m"
    +        FAIL = "\033[91m"
    +        ENDC = "\033[0m"
    +        BOLD = "\033[1m"
    +        UNDERLINE = "\033[4m"
    +
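+    # the option flags below mirror the upstream html2text CLI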
    +    p = argparse.ArgumentParser()
    +    p.add_argument(
    +        "--default-image-alt",
    +        dest="default_image_alt",
    +        default=config.DEFAULT_IMAGE_ALT,
    +        help="The default alt string for images with missing ones",
    +    )
    +    p.add_argument(
    +        "--pad-tables",
    +        dest="pad_tables",
    +        action="store_true",
    +        default=config.PAD_TABLES,
    +        help="pad the cells to equal column width in tables",
    +    )
    +    p.add_argument(
    +        "--no-wrap-links",
    +        dest="wrap_links",
    +        action="store_false",
    +        default=config.WRAP_LINKS,
    +        help="don't wrap links during conversion",
    +    )
    +    p.add_argument(
    +        "--wrap-list-items",
    +        dest="wrap_list_items",
    +        action="store_true",
    +        default=config.WRAP_LIST_ITEMS,
    +        help="wrap list items during conversion",
    +    )
    +    p.add_argument(
    +        "--wrap-tables",
    +        dest="wrap_tables",
    +        action="store_true",
    +        default=config.WRAP_TABLES,
    +        help="wrap tables",
    +    )
    +    p.add_argument(
    +        "--ignore-emphasis",
    +        dest="ignore_emphasis",
    +        action="store_true",
    +        default=config.IGNORE_EMPHASIS,
    +        help="don't include any formatting for emphasis",
    +    )
    +    p.add_argument(
    +        "--reference-links",
    +        dest="inline_links",
    +        action="store_false",
    +        default=config.INLINE_LINKS,
    +        help="use reference style links instead of inline links",
    +    )
    +    p.add_argument(
    +        "--ignore-links",
    +        dest="ignore_links",
    +        action="store_true",
    +        default=config.IGNORE_ANCHORS,
    +        help="don't include any formatting for links",
    +    )
    +    p.add_argument(
    +        "--ignore-mailto-links",
    +        action="store_true",
    +        dest="ignore_mailto_links",
    +        default=config.IGNORE_MAILTO_LINKS,
    +        help="don't include mailto: links",
    +    )
    +    p.add_argument(
    +        "--protect-links",
    +        dest="protect_links",
    +        action="store_true",
    +        default=config.PROTECT_LINKS,
    +        help="protect links from line breaks surrounding them with angle brackets",
    +    )
    +    p.add_argument(
    +        "--ignore-images",
    +        dest="ignore_images",
    +        action="store_true",
    +        default=config.IGNORE_IMAGES,
    +        help="don't include any formatting for images",
    +    )
    +    p.add_argument(
    +        "--images-as-html",
    +        dest="images_as_html",
    +        action="store_true",
    +        default=config.IMAGES_AS_HTML,
    +        help=(
    +            "Always write image tags as raw html; preserves `height`, `width` and "
    +            "`alt` if possible."
    +        ),
    +    )
    +    p.add_argument(
    +        "--images-to-alt",
    +        dest="images_to_alt",
    +        action="store_true",
    +        default=config.IMAGES_TO_ALT,
    +        help="Discard image data, only keep alt text",
    +    )
    +    p.add_argument(
    +        "--images-with-size",
    +        dest="images_with_size",
    +        action="store_true",
    +        default=config.IMAGES_WITH_SIZE,
    +        help=(
    +            "Write image tags with height and width attrs as raw html to retain "
    +            "dimensions"
    +        ),
    +    )
    +    p.add_argument(
    +        "-g",
    +        "--google-doc",
    +        action="store_true",
    +        dest="google_doc",
    +        default=False,
    +        help="convert an html-exported Google Document",
    +    )
    +    p.add_argument(
    +        "-d",
    +        "--dash-unordered-list",
    +        action="store_true",
    +        dest="ul_style_dash",
    +        default=False,
    +        help="use a dash rather than a star for unordered list items",
    +    )
    +    p.add_argument(
    +        "-e",
    +        "--asterisk-emphasis",
    +        action="store_true",
    +        dest="em_style_asterisk",
    +        default=False,
    +        help="use an asterisk rather than an underscore for emphasized text",
    +    )
    +    p.add_argument(
    +        "-b",
    +        "--body-width",
    +        dest="body_width",
    +        type=int,
    +        default=config.BODY_WIDTH,
    +        help="number of characters per output line, 0 for no wrap",
    +    )
    +    p.add_argument(
    +        "-i",
    +        "--google-list-indent",
    +        dest="list_indent",
    +        type=int,
    +        default=config.GOOGLE_LIST_INDENT,
    +        help="number of pixels Google indents nested lists",
    +    )
    +    p.add_argument(
    +        "-s",
    +        "--hide-strikethrough",
    +        action="store_true",
    +        dest="hide_strikethrough",
    +        default=False,
    +        help="hide strike-through text. only relevant when -g is " "specified as well",
    +    )
    +    p.add_argument(
    +        "--escape-all",
    +        action="store_true",
    +        dest="escape_snob",
    +        default=False,
    +        help=(
    +            "Escape all special characters.  Output is less readable, but avoids "
    +            "corner case formatting issues."
    +        ),
    +    )
    +    p.add_argument(
    +        "--bypass-tables",
    +        action="store_true",
    +        dest="bypass_tables",
    +        default=config.BYPASS_TABLES,
    +        help="Format tables in HTML rather than Markdown syntax.",
    +    )
    +    p.add_argument(
    +        "--ignore-tables",
    +        action="store_true",
    +        dest="ignore_tables",
    +        default=config.IGNORE_TABLES,
    +        help="Ignore table-related tags (table, th, td, tr) " "while keeping rows.",
    +    )
    +    p.add_argument(
    +        "--single-line-break",
    +        action="store_true",
    +        dest="single_line_break",
    +        default=config.SINGLE_LINE_BREAK,
    +        help=(
    +            "Use a single line break after a block element rather than two line "
    +            "breaks. NOTE: Requires --body-width=0"
    +        ),
    +    )
    +    p.add_argument(
    +        "--unicode-snob",
    +        action="store_true",
    +        dest="unicode_snob",
    +        default=config.UNICODE_SNOB,
    +        help="Use unicode throughout document",
    +    )
    +    p.add_argument(
    +        "--no-automatic-links",
    +        action="store_false",
    +        dest="use_automatic_links",
    +        default=config.USE_AUTOMATIC_LINKS,
    +        help="Do not use automatic links wherever applicable",
    +    )
    +    p.add_argument(
    +        "--no-skip-internal-links",
    +        action="store_false",
    +        dest="skip_internal_links",
    +        default=config.SKIP_INTERNAL_LINKS,
    +        help="Do not skip internal links",
    +    )
    +    p.add_argument(
    +        "--links-after-para",
    +        action="store_true",
    +        dest="links_each_paragraph",
    +        default=config.LINKS_EACH_PARAGRAPH,
    +        help="Put links after each paragraph instead of document",
    +    )
    +    p.add_argument(
    +        "--mark-code",
    +        action="store_true",
    +        dest="mark_code",
    +        default=config.MARK_CODE,
    +        help="Mark program code blocks with [code]...[/code]",
    +    )
    +    p.add_argument(
    +        "--decode-errors",
    +        dest="decode_errors",
    +        default=config.DECODE_ERRORS,
    +        help=(
    +            "What to do in case of decode errors.'ignore', 'strict' and 'replace' are "
    +            "acceptable values"
    +        ),
    +    )
    +    p.add_argument(
    +        "--open-quote",
    +        dest="open_quote",
    +        default=config.OPEN_QUOTE,
    +        help="The character used to open quotes",
    +    )
    +    p.add_argument(
    +        "--close-quote",
    +        dest="close_quote",
    +        default=config.CLOSE_QUOTE,
    +        help="The character used to close quotes",
    +    )
    +    p.add_argument(
    +        "--version", action="version", version=".".join(map(str, __version__))
    +    )
    +    p.add_argument("filename", nargs="?")
    +    p.add_argument("encoding", nargs="?", default="utf-8")
    +    args = p.parse_args()
    +
    +    if args.filename and args.filename != "-":
    +        with open(args.filename, "rb") as fp:
    +            data = fp.read()
    +    else:
    +        data = sys.stdin.buffer.read()
    +
    +    try:
    +        html = data.decode(args.encoding, args.decode_errors)
    +    except UnicodeDecodeError as err:
    +        warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
    +        warning += " Use the " + bcolors.OKGREEN
    +        warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
    +        print(warning)
    +        raise err
    +
    +    h = HTML2Text(baseurl=baseurl)
    +    # handle options
    +    if args.ul_style_dash:
    +        h.ul_item_mark = "-"
    +    if args.em_style_asterisk:
    +        h.emphasis_mark = "*"
    +        h.strong_mark = "__"
    +
    +    h.body_width = args.body_width
    +    h.google_list_indent = args.list_indent
    +    h.ignore_emphasis = args.ignore_emphasis
    +    h.ignore_links = args.ignore_links
    +    h.ignore_mailto_links = args.ignore_mailto_links
    +    h.protect_links = args.protect_links
    +    h.ignore_images = args.ignore_images
    +    h.images_as_html = args.images_as_html
    +    h.images_to_alt = args.images_to_alt
    +    h.images_with_size = args.images_with_size
    +    h.google_doc = args.google_doc
    +    h.hide_strikethrough = args.hide_strikethrough
    +    h.escape_snob = args.escape_snob
    +    h.bypass_tables = args.bypass_tables
    +    h.ignore_tables = args.ignore_tables
    +    h.single_line_break = args.single_line_break
    +    h.inline_links = args.inline_links
    +    h.unicode_snob = args.unicode_snob
    +    h.use_automatic_links = args.use_automatic_links
    +    h.skip_internal_links = args.skip_internal_links
    +    h.links_each_paragraph = args.links_each_paragraph
    +    h.mark_code = args.mark_code
    +    h.wrap_links = args.wrap_links
    +    h.wrap_list_items = args.wrap_list_items
    +    h.wrap_tables = args.wrap_tables
    +    h.pad_tables = args.pad_tables
    +    h.default_image_alt = args.default_image_alt
    +    h.open_quote = args.open_quote
    +    h.close_quote = args.close_quote
    +
    +    sys.stdout.write(h.handle(html))
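+
+# Typical invocation, assuming the package is executed as a module
+# (see __main__.py below):
+#   python -m migration.html2text --body-width=0 page.html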
    diff --git a/migration/html2text/config.py b/migration/html2text/config.py
    new file mode 100644
    index 00000000..0f4d29bc
    --- /dev/null
    +++ b/migration/html2text/config.py
    @@ -0,0 +1,164 @@
    +import re
    +
    +# Use Unicode characters instead of their ascii pseudo-replacements
    +UNICODE_SNOB = True
    +
    +# Marker to use for marking tables for padding post processing
    +TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"
    +# Escape all special characters.  Output is less readable, but avoids
    +# corner case formatting issues.
    +ESCAPE_SNOB = True
    +
    +# Put the links after each paragraph instead of at the end.
    +LINKS_EACH_PARAGRAPH = False
    +
    +# Wrap long lines at position. 0 for no wrapping.
    +BODY_WIDTH = 0
    +
    +# Don't show internal links (href="#local-anchor") -- corresponding link
    +# targets won't be visible in the plain text file anyway.
    +SKIP_INTERNAL_LINKS = False
    +
    +# Use inline, rather than reference, formatting for images and links
    +INLINE_LINKS = True
    +
    +# Protect links from line breaks surrounding them with angle brackets (in
    +# addition to their square brackets)
    +PROTECT_LINKS = True
    +WRAP_LINKS = True
    +
    +# Wrap list items.
    +WRAP_LIST_ITEMS = False
    +
    +# Wrap tables
    +WRAP_TABLES = False
    +
    +# Number of pixels Google indents nested lists
    +GOOGLE_LIST_INDENT = 36
    +
    +# Values Google and others may use to indicate bold text
    +BOLD_TEXT_STYLE_VALUES = ("bold", "700", "800", "900")
    +
    +IGNORE_ANCHORS = False
    +IGNORE_MAILTO_LINKS = False
    +IGNORE_IMAGES = False
    +IMAGES_AS_HTML = False
    +IMAGES_TO_ALT = False
    +IMAGES_WITH_SIZE = False
    +IGNORE_EMPHASIS = False
    +MARK_CODE = True
    +DECODE_ERRORS = "strict"
    +DEFAULT_IMAGE_ALT = ""
    +PAD_TABLES = True
    +
+# Convert links with same href and text to <href> format
    +# if they are absolute links
    +USE_AUTOMATIC_LINKS = True
    +
    +# For checking space-only lines on line 771
    +RE_SPACE = re.compile(r"\s\+")
    +
    +RE_ORDERED_LIST_MATCHER = re.compile(r"\d+\.\s")
    +RE_UNORDERED_LIST_MATCHER = re.compile(r"[-\*\+]\s")
    +RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])")
    +RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")
    +
    +# to find links in the text
    +RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")
    +
    +# to find table separators
    +RE_TABLE = re.compile(r" \| ")
    +
    +RE_MD_DOT_MATCHER = re.compile(
    +    r"""
    +    ^             # start of line
    +    (\s*\d+)      # optional whitespace and a number
    +    (\.)          # dot
    +    (?=\s)        # lookahead assert whitespace
    +    """,
    +    re.MULTILINE | re.VERBOSE,
    +)
    +RE_MD_PLUS_MATCHER = re.compile(
    +    r"""
    +    ^
    +    (\s*)
    +    (\+)
    +    (?=\s)
    +    """,
    +    flags=re.MULTILINE | re.VERBOSE,
    +)
    +RE_MD_DASH_MATCHER = re.compile(
    +    r"""
    +    ^
    +    (\s*)
    +    (-)
    +    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
    +                  # or another dash (header or hr)
    +    """,
    +    flags=re.MULTILINE | re.VERBOSE,
    +)
    +RE_SLASH_CHARS = r"\`*_{}[]()#+-.!"
    +RE_MD_BACKSLASH_MATCHER = re.compile(
    +    r"""
    +    (\\)          # match one slash
    +    (?=[%s])      # followed by a char that requires escaping
    +    """
    +    % re.escape(RE_SLASH_CHARS),
    +    flags=re.VERBOSE,
    +)
    +
    +UNIFIABLE = {
    +    "rsquo": "'",
    +    "lsquo": "'",
    +    "rdquo": '"',
    +    "ldquo": '"',
    +    "copy": "(C)",
    +    "mdash": "--",
    +    "nbsp": " ",
    +    "rarr": "->",
    +    "larr": "<-",
    +    "middot": "*",
    +    "ndash": "-",
    +    "oelig": "oe",
    +    "aelig": "ae",
    +    "agrave": "a",
    +    "aacute": "a",
    +    "acirc": "a",
    +    "atilde": "a",
    +    "auml": "a",
    +    "aring": "a",
    +    "egrave": "e",
    +    "eacute": "e",
    +    "ecirc": "e",
    +    "euml": "e",
    +    "igrave": "i",
    +    "iacute": "i",
    +    "icirc": "i",
    +    "iuml": "i",
    +    "ograve": "o",
    +    "oacute": "o",
    +    "ocirc": "o",
    +    "otilde": "o",
    +    "ouml": "o",
    +    "ugrave": "u",
    +    "uacute": "u",
    +    "ucirc": "u",
    +    "uuml": "u",
    +    "lrm": "",
    +    "rlm": "",
    +}
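+# note: these ascii fallbacks are only consulted when unicode_snob is
+# turned off; with UNICODE_SNOB = True (set above) entities stay unicode.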
    +
    +# Format tables in HTML rather than Markdown syntax
    +BYPASS_TABLES = False
    +# Ignore table-related tags (table, th, td, tr) while keeping rows
    +IGNORE_TABLES = False
    +
    +
    +# Use a single line break after a block element rather than two line breaks.
    +# NOTE: Requires body width setting to be 0.
    +SINGLE_LINE_BREAK = False
    +
    +
+# Use double quotation marks when converting the <q> tag.
    +OPEN_QUOTE = '"'
    +CLOSE_QUOTE = '"'
    diff --git a/migration/html2text/elements.py b/migration/html2text/elements.py
    new file mode 100644
    index 00000000..2533ec08
    --- /dev/null
    +++ b/migration/html2text/elements.py
    @@ -0,0 +1,18 @@
    +from typing import Dict, Optional
    +
    +
    +class AnchorElement:
    +    __slots__ = ["attrs", "count", "outcount"]
    +
    +    def __init__(self, attrs: Dict[str, Optional[str]], count: int, outcount: int):
    +        self.attrs = attrs
    +        self.count = count
    +        self.outcount = outcount
    +
    +
    +class ListElement:
    +    __slots__ = ["name", "num"]
    +
    +    def __init__(self, name: str, num: int):
    +        self.name = name
    +        self.num = num
    diff --git a/migration/html2text/py.typed b/migration/html2text/py.typed
    new file mode 100644
    index 00000000..e69de29b
    diff --git a/migration/html2text/typing.py b/migration/html2text/typing.py
    new file mode 100644
    index 00000000..6e17fed2
    --- /dev/null
    +++ b/migration/html2text/typing.py
    @@ -0,0 +1,3 @@
    +class OutCallback:
    +    def __call__(self, s: str) -> None:
    +        ...
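+
+# any callable accepting a str chunk and returning None satisfies this
+# interface; HTML2Text uses it as the type of its output callback.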
    diff --git a/migration/html2text/utils.py b/migration/html2text/utils.py
    new file mode 100644
    index 00000000..366748b6
    --- /dev/null
    +++ b/migration/html2text/utils.py
    @@ -0,0 +1,290 @@
    +import html.entities
    +from typing import Dict, List, Optional
    +
    +from . import config
    +
    +unifiable_n = {
    +    html.entities.name2codepoint[k]: v
    +    for k, v in config.UNIFIABLE.items()
    +    if k != "nbsp"
    +}
    +
    +
    +def hn(tag: str) -> int:
    +    if tag[0] == "h" and len(tag) == 2:
    +        n = tag[1]
    +        if "0" < n <= "9":
    +            return int(n)
    +    return 0
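+
+# e.g. hn("h2") == 2, while hn("hr") and hn("html") return 0: only the
+# two-character tags "h1".."h9" count as headings.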
    +
    +
    +def dumb_property_dict(style: str) -> Dict[str, str]:
    +    """
    +    :returns: A hash of css attributes
    +    """
    +    return {
    +        x.strip().lower(): y.strip().lower()
    +        for x, y in [z.split(":", 1) for z in style.split(";") if ":" in z]
    +    }
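+
+# e.g. dumb_property_dict("Color: RED; font-weight: 700") returns
+# {"color": "red", "font-weight": "700"} -- keys and values are lowercased.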
    +
    +
    +def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
    +    """
    +    :type data: str
    +
    +    :returns: A hash of css selectors, each of which contains a hash of
    +    css attributes.
    +    :rtype: dict
    +    """
    +    # remove @import sentences
    +    data += ";"
    +    importIndex = data.find("@import")
    +    while importIndex != -1:
    +        data = data[0:importIndex] + data[data.find(";", importIndex) + 1 :]
    +        importIndex = data.find("@import")
    +
+    # parse the css into a dict of selectors
    +    pairs = [x.split("{") for x in data.split("}") if "{" in x.strip()]
    +    try:
    +        elements = {a.strip(): dumb_property_dict(b) for a, b in pairs}
    +    except ValueError:
    +        elements = {}  # not that important
    +
    +    return elements
    +
    +
    +def element_style(
    +    attrs: Dict[str, Optional[str]],
    +    style_def: Dict[str, Dict[str, str]],
    +    parent_style: Dict[str, str],
    +) -> Dict[str, str]:
    +    """
    +    :type attrs: dict
    +    :type style_def: dict
+    :type parent_style: dict
    +
    +    :returns: A hash of the 'final' style attributes of the element
    +    :rtype: dict
    +    """
    +    style = parent_style.copy()
    +    if "class" in attrs:
    +        assert attrs["class"] is not None
    +        for css_class in attrs["class"].split():
    +            css_style = style_def.get("." + css_class, {})
    +            style.update(css_style)
    +    if "style" in attrs:
    +        assert attrs["style"] is not None
    +        immediate_style = dumb_property_dict(attrs["style"])
    +        style.update(immediate_style)
    +
    +    return style
    +
    +
    +def google_list_style(style: Dict[str, str]) -> str:
    +    """
    +    Finds out whether this is an ordered or unordered list
    +
    +    :type style: dict
    +
    +    :rtype: str
    +    """
    +    if "list-style-type" in style:
    +        list_style = style["list-style-type"]
    +        if list_style in ["disc", "circle", "square", "none"]:
    +            return "ul"
    +
    +    return "ol"
    +
    +
    +def google_has_height(style: Dict[str, str]) -> bool:
    +    """
    +    Check if the style of the element has the 'height' attribute
    +    explicitly defined
    +
    +    :type style: dict
    +
    +    :rtype: bool
    +    """
    +    return "height" in style
    +
    +
    +def google_text_emphasis(style: Dict[str, str]) -> List[str]:
    +    """
    +    :type style: dict
    +
    +    :returns: A list of all emphasis modifiers of the element
    +    :rtype: list
    +    """
    +    emphasis = []
    +    if "text-decoration" in style:
    +        emphasis.append(style["text-decoration"])
    +    if "font-style" in style:
    +        emphasis.append(style["font-style"])
    +    if "font-weight" in style:
    +        emphasis.append(style["font-weight"])
    +
    +    return emphasis
    +
    +
    +def google_fixed_width_font(style: Dict[str, str]) -> bool:
    +    """
    +    Check if the css of the current element defines a fixed width font
    +
    +    :type style: dict
    +
    +    :rtype: bool
    +    """
    +    font_family = ""
    +    if "font-family" in style:
    +        font_family = style["font-family"]
    +    return "courier new" == font_family or "consolas" == font_family
    +
    +
    +def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
    +    """
    +    Extract numbering from list element attributes
    +
    +    :type attrs: dict
    +
+    :rtype: int
    +    """
    +    if "start" in attrs:
    +        assert attrs["start"] is not None
    +        try:
    +            return int(attrs["start"]) - 1
    +        except ValueError:
    +            pass
    +
    +    return 0
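+
+# e.g. attrs {"start": "5"} yields 4 (a zero-based offset); missing or
+# non-numeric start values fall back to 0.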
    +
    +
    +def skipwrap(
    +    para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
    +) -> bool:
    +    # If it appears to contain a link
    +    # don't wrap
    +    if not wrap_links and config.RE_LINK.search(para):
    +        return True
    +    # If the text begins with four spaces or one tab, it's a code block;
    +    # don't wrap
+    if para[0:4] == "    " or para[0:1] == "\t":
    +        return True
    +
    +    # If the text begins with only two "--", possibly preceded by
    +    # whitespace, that's an emdash; so wrap.
    +    stripped = para.lstrip()
    +    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
    +        return False
    +
    +    # I'm not sure what this is for; I thought it was to detect lists,
+    # but there's a <br>-inside-<span> case in one of the tests that
+    # also depends upon it.
+    if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**":
+        return not wrap_list_items
+
+    # If text contains a pipe character it is likely a table
+    if not wrap_tables and config.RE_TABLE.search(para):
+        return True
+
+    # If the text begins with a single -, *, or +, followed by a space,
+    # or an integer, followed by a ., followed by a space (in either
+    # case optionally preceded by whitespace), it's a list; don't wrap.
+    return bool(
+        config.RE_ORDERED_LIST_MATCHER.match(stripped)
+        or config.RE_UNORDERED_LIST_MATCHER.match(stripped)
+    )
+
+
+def escape_md(text: str) -> str:
+    """
+    Escapes markdown-sensitive characters within other markdown
+    constructs.
+    """
+    return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)
+
+
+def escape_md_section(text: str, snob: bool = False) -> str:
+    """
+    Escapes markdown-sensitive characters across whole document sections.
+    """
+    text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)
+
+    if snob:
+        text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text)
+
+    text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text)
+    text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text)
+    text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)
+
+    return text
+
+
+def reformat_table(lines: List[str], right_margin: int) -> List[str]:
+    """
+    Given the lines of a table, pads the cells and returns the new lines.
+    """
+    # find the maximum width of the columns
+    max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")]
+    max_cols = len(max_width)
+    for line in lines:
+        cols = [x.rstrip() for x in line.split("|")]
+        num_cols = len(cols)
+
+        # don't drop any data if colspan attributes result in unequal lengths
+        if num_cols < max_cols:
+            cols += [""] * (max_cols - num_cols)
+        elif max_cols < num_cols:
+            max_width += [len(x) + right_margin for x in cols[-(num_cols - max_cols) :]]
+            max_cols = num_cols
+
+        max_width = [
+            max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)
+        ]
+
+    # reformat
+    new_lines = []
+    for line in lines:
+        cols = [x.rstrip() for x in line.split("|")]
+        if set(line.strip()) == set("-|"):
+            filler = "-"
+            new_cols = [
+                x.rstrip() + (filler * (M - len(x.rstrip())))
+                for x, M in zip(cols, max_width)
+            ]
+            new_lines.append("|-" + "|".join(new_cols) + "|")
+        else:
+            filler = " "
+            new_cols = [
+                x.rstrip() + (filler * (M - len(x.rstrip())))
+                for x, M in zip(cols, max_width)
+            ]
+            new_lines.append("| " + "|".join(new_cols) + "|")
+    return new_lines
+
+
+def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
+    """
+    Provide padding for tables in the text
+    """
+    lines = text.split("\n")
+    table_buffer = []  # type: List[str]
+    table_started = False
+    new_lines = []
+    for line in lines:
+        # Toggle table started
+        if config.TABLE_MARKER_FOR_PAD in line:
+            table_started = not table_started
+            if not table_started:
+                table = reformat_table(table_buffer, right_margin)
+                new_lines.extend(table)
+                table_buffer = []
+                new_lines.append("")
+            continue
+        # Process lines
+        if table_started:
+            table_buffer.append(line)
+        else:
+            new_lines.append(line)
+    return "\n".join(new_lines)
diff --git a/migration/tables/__init__.py b/migration/tables/__init__.py
new file mode 100644
index 00000000..6cc37870
--- /dev/null
+++ b/migration/tables/__init__.py
@@ -0,0 +1 @@
+__all__ = ["users", "tags", "content_items", "comments"]
\ No newline at end of file
diff --git a/migration/tables/comments.py b/migration/tables/comments.py
new file mode 100644
index 00000000..d1147d7a
--- /dev/null
+++ b/migration/tables/comments.py
@@ -0,0 +1,108 @@
+from datetime import datetime
+from dateutil.parser import parse as date_parse
+from orm import Reaction, User
+from orm import reaction
+from orm.base import local_session
+from migration.html2text import html2text
+from orm.reaction import ReactionKind
+from orm.shout import Shout
+
+ts = datetime.now()
+
+def migrate(entry, storage):
+    '''
+    {
+        "_id": "hdtwS8fSyFLxXCgSC",
+        "body": "...",
+        "contentItem": "mnK8KsJHPRi8DrybQ",
+        "createdBy": "bMFPuyNg6qAD2mhXe",
+        "thread": "01/",
+        "createdAt": "2016-04-19 04:33:53+00:00",
+        "ratings": [
+            { "createdBy": "AqmRukvRiExNpAe8C", "value": 1 },
+            { "createdBy": "YdE76Wth3yqymKEu5", "value": 1 }
+        ],
+        "rating": 2,
+        "updatedAt": "2020-05-27 19:22:57.091000+00:00",
+        "updatedBy": "0"
+    }
+
+    ->
+
+    type Reaction {
+        id: Int!
+        shout: Shout!
+        createdAt: DateTime!
+        createdBy: User!
+        updatedAt: DateTime
+        deletedAt: DateTime
+        deletedBy: User
+        range: String # full / 0:2340
+        kind: ReactionKind!
+        body: String
+        replyTo: Reaction
+        stat: Stat
+        old_id: String
+        old_thread: String
+    }
+    '''
+    reaction_dict = {}
+    # FIXME: comment_dict['createdAt'] = ts if not entry.get('createdAt') else date_parse(entry.get('createdAt'))
+    # print('[migration] comment original date %r' % entry.get('createdAt'))
+    # print('[migration] comment date %r ' % comment_dict['createdAt'])
+    reaction_dict['body'] = html2text(entry.get('body', ''))
+    reaction_dict['oid'] = entry['_id']
+    if entry.get('createdAt'):
+        reaction_dict['createdAt'] = date_parse(entry.get('createdAt'))
+    shout_oid = entry.get('contentItem')
+    if shout_oid not in storage['shouts']['by_oid']:
+        if len(storage['shouts']['by_oid']) > 0:
+            return shout_oid
+        else:
+            print('[migration] no shouts migrated yet')
+            raise Exception
+    else:
+        with local_session() as session:
+            author = session.query(User).filter(User.oid == entry['createdBy']).first()
+            shout_dict = storage['shouts']['by_oid'][shout_oid]
+            if shout_dict:
+                reaction_dict['shout'] = shout_dict['slug']
+                reaction_dict['createdBy'] = author.slug if author else 'discours'
+                reaction_dict['kind'] = ReactionKind.COMMENT
+
+                # creating reaction from old comment
+                reaction = Reaction.create(**reaction_dict)
+
+                reaction_dict['id'] = reaction.id
+                for comment_rating_old in entry.get('ratings', []):
+                    rater = session.query(User).filter(User.oid == comment_rating_old['createdBy']).first()
+                    reactedBy = rater if rater else session.query(User).filter(User.slug == 'noname').first()
+                    re_reaction_dict = {
+                        'shout': reaction_dict['shout'],
+                        'replyTo': reaction.id,
+                        'kind': ReactionKind.LIKE if comment_rating_old['value'] > 0 else ReactionKind.DISLIKE,
+                        'createdBy': reactedBy.slug if reactedBy else 'discours'
+                    }
+                    cts = comment_rating_old.get('createdAt')
+                    if cts:
+                        re_reaction_dict['createdAt'] = date_parse(cts)
+                    try:
+                        # creating reaction from old rating
+                        Reaction.create(**re_reaction_dict)
+                    except Exception as e:
+                        print('[migration] comment rating error: %r' % re_reaction_dict)
+                        raise e
+            else:
+                print('[migration] error: cannot find shout for comment %r' % reaction_dict)
+    return reaction
+
+def migrate_2stage(rr, old_new_id):
+    reply_oid = rr.get('replyTo')
+    if not reply_oid:
+        return
+    new_id = old_new_id.get(rr.get('oid'))
+    if not new_id:
+        return
+    with local_session() as session:
+        comment = session.query(Reaction).filter(Reaction.id == new_id).first()
+        comment.replyTo = old_new_id.get(reply_oid)
+        comment.save()
+        session.commit()
+    if not rr['body']:
+        raise Exception(rr)
diff --git a/migration/tables/content_items.py b/migration/tables/content_items.py
new file mode 100644
index 00000000..c5f85840
--- /dev/null
+++ b/migration/tables/content_items.py
@@ -0,0 +1,226 @@
+from dateutil.parser import parse as date_parse
+import sqlalchemy
+from orm.shout import Shout, ShoutTopic, User
+from storages.viewed import ViewedByDay
+from transliterate import translit
+from datetime import datetime
+from orm.base import local_session
+from migration.extract import prepare_html_body
+from orm.community import Community
+from orm.reaction import Reaction, ReactionKind
+
+OLD_DATE = '2016-03-05 22:22:00.350000'
+ts = datetime.now()
+type2layout = {
+    'Article': 'article',
+    'Literature': 'prose',
+    'Music': 'music',
+    'Video': 'video',
+    'Image': 'image'
+}
+
+def get_shout_slug(entry):
+    slug = entry.get('slug', '')
+    if not slug:
+        for friend in entry.get('friendlySlugs', []):
+            slug = friend.get('slug', '')
+            if slug:
+                break
+    return slug
+
+def migrate(entry, storage):
+    # init, set title and layout
+    r = {
+        'layout': type2layout[entry['type']],
+        'title': entry['title'],
+        'community': Community.default_community.id,
+        'authors': [],
+        'topics': set([]),
+        # 'rating': 0,
+        # 'ratings': [],
+        'createdAt': []
+    }
+    topics_by_oid = storage['topics']['by_oid']
+    users_by_oid = storage['users']['by_oid']
+
+    # author
+    oid = entry.get('createdBy', entry.get('_id', entry.get('oid')))
+    userdata = users_by_oid.get(oid)
+    if not userdata:
+        app = entry.get('application')
+        if app:
+            userslug = translit(app['name'], 'ru', reversed=True)\
+                .replace(' ', '-')\
+                .replace('\'', '')\
+                .replace('.', '-').lower()
+            userdata = {
+                'username': app['email'],
+                'email': app['email'],
+                'name': app['name'],
+                'bio': app.get('bio', ''),
+                'emailConfirmed': False,
+                'slug': userslug,
+                'createdAt': ts,
+                'wasOnlineAt': ts
+            }
+        else:
+            userdata = User.default_user.dict()
+    assert userdata, 'no user found for %s from %d users' % (oid, len(users_by_oid.keys()))
+    r['authors'] = [userdata, ]
+
+    # slug
+    slug = get_shout_slug(entry)
+    if slug:
+        r['slug'] = slug
+    else:
+        raise Exception
+
+    # cover
+    c = ''
+    if entry.get('thumborId'):
+        c = 'https://assets.discours.io/unsafe/1600x/' + entry['thumborId']
+    else:
+        c = entry.get('image', {}).get('url')
+    if not c or 'cloudinary' in c:
+        c = ''
+    r['cover'] = c
+
+    # timestamps
+    r['createdAt'] = date_parse(entry.get('createdAt', OLD_DATE))
+    r['updatedAt'] = date_parse(entry['updatedAt']) if 'updatedAt' in entry else ts
+    if entry.get('published'):
+        r['publishedAt'] = date_parse(entry.get('publishedAt', OLD_DATE))
+        if r['publishedAt'] == date_parse(OLD_DATE):
+            r['publishedAt'] = ts
+    if 'deletedAt' in entry:
+        r['deletedAt'] = date_parse(entry['deletedAt'])
+
+    # topics
+    category = entry['category']
+    mainTopic = topics_by_oid.get(category)
+    if mainTopic:
+        r['mainTopic'] = storage['replacements'].get(mainTopic["slug"], mainTopic["slug"])
+    topic_oids = [category, ]
+    topic_oids.extend(entry.get('tags', []))
+    for oid in topic_oids:
+        if oid in storage['topics']['by_oid']:
+            r['topics'].add(storage['topics']['by_oid'][oid]['slug'])
+        else:
+            print('[migration] unknown old topic id: ' + oid)
+    r['topics'] = list(r['topics'])
+
+    entry['topics'] = r['topics']
+    entry['cover'] = r['cover']
+    entry['authors'] = r['authors']
+
+    # body
+    r['body'] = prepare_html_body(entry)
+
+    # save shout to db
+    s = object()
+    shout_dict = r.copy()
+    user = None
+    del shout_dict['topics']  # FIXME: AttributeError: 'str' object has no attribute '_sa_instance_state'
+    # del shout_dict['rating']  # FIXME: TypeError: 'rating' is an invalid keyword argument for Shout
+    # del shout_dict['ratings']
+    email = userdata.get('email')
+    slug = userdata.get('slug')
+    with local_session() as session:
+        # c = session.query(Community).all().pop()
+        if email:
+            user = session.query(User).filter(User.email == email).first()
+        if not user and slug:
+            user = session.query(User).filter(User.slug == slug).first()
+        if not user and userdata:
+            try:
+                user = User.create(**userdata)
+            except sqlalchemy.exc.IntegrityError:
+                print('[migration] user error: %r' % userdata)
+            else:
+                userdata['id'] = user.id
+                userdata['createdAt'] = user.createdAt
+                storage['users']['by_slug'][userdata['slug']] = userdata
+                storage['users']['by_oid'][entry['_id']] = userdata
+    assert user, 'could not get a user'
+    shout_dict['authors'] = [user, ]
+
+    try:
+        s = Shout.create(**shout_dict)
+    except sqlalchemy.exc.IntegrityError as e:
+        with local_session() as session:
+            s = session.query(Shout).filter(Shout.slug == shout_dict['slug']).first()
+            bump = False
+            if s:
+                for key in shout_dict:
+                    if key in s.__dict__:
+                        if s.__dict__[key] != shout_dict[key]:
+                            print('[migration] shout already exists, but differs in %s' % key)
+                            bump = True
+                    else:
+                        print('[migration] shout already exists, but lacks %s' % key)
+                        bump = True
+                if bump:
+                    s.update(shout_dict)
+            else:
+                print('[migration] something went wrong with shout: \n%r' % shout_dict)
+                raise e
+            session.commit()
+    except Exception:
+        print(s)
+        raise
+
+    # shout topics aftermath
+    shout_dict['topics'] = []
+    for tpc in r['topics']:
+        oldslug = tpc
+        newslug = storage['replacements'].get(oldslug, oldslug)
+        if newslug:
+            with local_session() as session:
+                shout_topic_old = session.query(ShoutTopic)\
+                    .filter(ShoutTopic.shout == shout_dict['slug'])\
+                    .filter(ShoutTopic.topic == oldslug).first()
+                if shout_topic_old:
+                    shout_topic_old.update({'slug': newslug})
+                else:
+                    shout_topic_new = session.query(ShoutTopic)\
+                        .filter(ShoutTopic.shout == shout_dict['slug'])\
+                        .filter(ShoutTopic.topic == newslug).first()
+                    if not shout_topic_new:
+                        try:
+                            ShoutTopic.create(**{'shout': shout_dict['slug'], 'topic': newslug})
+                        except Exception:
+                            print('[migration] shout topic error: ' + newslug)
+                session.commit()
+            if newslug not in shout_dict['topics']:
+                shout_dict['topics'].append(newslug)
+        else:
+            print('[migration] ignored topic slug: \n%r' % tpc)
+            # raise Exception
+
+    # content_item ratings to reactions
+    try:
+        for content_rating in entry.get('ratings', []):
+            with local_session() as session:
+                rater = session.query(User).filter(User.oid == content_rating['createdBy']).first()
+                reactedBy = rater if rater else session.query(User).filter(User.slug == 'noname').first()
+                if rater:
+                    reaction_dict = {
+                        'kind': ReactionKind.LIKE if content_rating['value'] > 0 else ReactionKind.DISLIKE,
+                        'createdBy': reactedBy.slug,
+                        'shout': shout_dict['slug']
+                    }
+                    cts = content_rating.get('createdAt')
+                    if cts:
+                        reaction_dict['createdAt'] = date_parse(cts)
+                    reaction = session.query(Reaction).\
+                        filter(Reaction.shout == reaction_dict['shout']).\
+                        filter(Reaction.createdBy == reaction_dict['createdBy']).\
+                        filter(Reaction.kind == reaction_dict['kind']).first()
+                    if reaction:
+                        reaction_dict['kind'] = ReactionKind.AGREE if content_rating['value'] > 0 else ReactionKind.DISAGREE
+                        reaction.update(reaction_dict)
+                    else:
+                        Reaction.create(**reaction_dict)
+                    # shout_dict['ratings'].append(reaction_dict)
+    except Exception:
+        print('[migration] content_item.ratings error: \n%r' % content_rating)
+        raise
+
+    # shout views
+    ViewedByDay.create(shout=shout_dict['slug'], value=entry.get('views', 1))
+    # del shout_dict['ratings']
+    shout_dict['oid'] = entry.get('_id')
+    storage['shouts']['by_oid'][entry['_id']] = shout_dict
+    storage['shouts']['by_slug'][slug] = shout_dict
+    return shout_dict
diff --git a/migration/tables/replacements.json b/migration/tables/replacements.json
new file mode 100644
index 00000000..e53a0886
--- /dev/null
+++ b/migration/tables/replacements.json
@@ -0,0 +1,768 @@
+{
+  "1990-e": "90s",
+  "2000-e": "2000s",
+  "90-e": "90s",
+  "207": "207",
+  "kartochki-rubinshteyna": "rubinstein-cards",
+  "Georgia": "georgia",
+  "Japan": "japan",
+  "Sweden": "sweden",
+  "abstraktsiya": "abstract",
+  "absurdism": "absurdism",
+  "acclimatization": "acclimatisation",
+  "activism": "activism",
+  "adolf-gitler": "adolf-hitler",
+  "afrika": "africa",
+  "agata-kristi": "agatha-christie",
+  "agressiya": "agression",
+  "agressivnoe-povedenie": "agression",
+  "aktsii": "actions",
+  "aktsionizm": "actionism",
+  "alber-kamyu": "albert-kamus",
+  "albomy": "albums",
+  "aleksandr-griboedov": "aleksander-griboedov",
+  "aleksandr-pushkin": "aleksander-pushkin",
+  "aleksandr-solzhenitsyn": "aleksander-solzhenitsyn",
+  "aleksandr-vvedenskiy": "aleksander-vvedensky",
+  "aleksey-navalnyy": "alexey-navalny",
+  "alfavit": "alphabet",
+  "alkogol": "alcohol",
+  "alternativa": "alternative",
+  "alternative": "alternative",
+  "alternativnaya-istoriya": "alternative-history",
+  "amerika": "america",
+  "anarhizm": "anarchism",
+  "anatoliy-mariengof": "anatoly-mariengof",
+  "ancient-russia": "ancient-russia",
+  "andegraund": "underground",
+  "andrey-platonov": "andrey-platonov",
+  "andrey-rodionov": "andrey-rodionov",
+  "andrey-tarkovskiy": "andrey-tarkovsky",
+  "angliyskie-istorii": "english-stories",
+  "angliyskiy-yazyk": "english-langugae",
+  "animation": "animation",
+  "animatsiya": "animation",
+  "anime": "anime",
+  "anri-volohonskiy": "anri-volohonsky",
+  "antifashizm": "anti-faschism",
+  "antiquity": "antiquity",
+  "antiutopiya": "dystopia",
+  "antropology": "antropology",
+  "antropotsen": "antropocenus",
+  "architecture": "architecture",
+  "arheologiya": "archeology",
+  "arhetipy": "archetypes",
+  "arhiv": "archive",
+  "aristokraty": "aristocracy",
+  "aristotel": "aristotle",
+  "arktika": "arctic",
+  "armiya": "army",
+  "art": "art",
+  "art-is": "art-is",
+  "artists": "artists",
+  "ateizm": "atheism",
+  "audiopoeziya": "audio-poetry",
+  "audio-poetry": "audio-poetry",
+  "audiospektakl": "audio-spectacles",
+  "auktsyon": "auktsyon",
+  "avangard": "avantgarde",
+  "avtofikshn": "autofiction",
+  "avtorskaya-pesnya": "bardsongs",
+  "azbuka-immigratsii": "immigration-basics",
+  "aziatskiy-kinematograf": "asian-cinema",
+  "b-movie": "b-movie",
+  "bannye-chteniya": "sauna-reading",
+  "bardsongs": "bardsongs",
+  "bdsm": "bdsm",
+  "belarus": "belarus",
+  "belgiya": "belgium",
+  "bertold-breht": "berttold-brecht",
+  "bezumie": "madness",
+  "biography": "biography",
+  "biologiya": "biology",
+  "bipolyarnoe-rasstroystvo": "bipolar-disorder",
+  "bitniki": "beatnics",
+  "biznes": "business",
+  "blizhniy-vostok": "middle-east",
+  "blizost": "closeness",
+  "blokada": "blockade",
+  "bob-dilan": "bob-dylan",
+  "bog": "god",
+  "bol": "pain",
+  "bolotnoe-delo": "bolotnaya-case",
+  "books": "books",
+  "boris-eltsin": "boris-eltsin",
+  "boris-godunov": "boris-godunov",
+  "boris-grebenschikov": "boris-grebenschikov",
+  "boris-nemtsov": "boris-nemtsov",
+  "boris-pasternak": "boris-pasternak",
+  "brak": "marriage",
+  "bret-iston-ellis": "bret-iston-ellis",
+  "buddizm": "buddhism",
+  "bullying": "bullying",
+  "bunt": "riot",
+  "burning-man": "burning-man",
+  "bytie": "being",
+  "byurokratiya": "bureaucracy",
+  "capitalism": "capitalism",
+  "censored-in-russia": "censored-in-russia",
+  "ch-rno-beloe": "black-and-white",
+  "ch-rnyy-yumor": "black-humour",
+  "chapters": "chapters",
+  "charity": "charity",
+  "chayldfri": "childfree",
+  "chechenskaya-voyna": "chechen-war",
+  "chechnya": "chechnya",
+  "chelovek": "male",
+  "chernobyl": "chernobyl",
+  "chernyy-yumor": "black-humour",
+  "children": "children",
+  "china": "china",
+  "chinovniki": "bureaucracy",
+  "chukotka": "chukotka",
+  "chuma": "plague",
+  "church": "church",
+  "cinema": "cinema",
+  "city": "city",
+  "civil-position": "civil-position",
+  "clips": "clips",
+  "collage": "collage",
+  "comics": "comics",
+  "conspiracy-theory": "conspiracy-theory",
+  "contemporary-art": "contemporary-art",
+  "contemporary-poetry": "poetry",
+  "contemporary-prose": "prose",
+  "coronavirus": "coronavirus",
+  "corruption": "corruption",
+  "creative-writing-school": "creative-writing-school",
+  "crime": "crime",
+  "criticism": "criticism",
+  "critiques": "reviews",
+  "culture": "culture",
+  "dadaizm": "dadaism",
+  "daniel-defo": "daniel-defoe",
+  "daniil-harms": "daniil-kharms",
+  "dante-aligeri": "dante-alighieri",
+  "darkveyv": "darkwave",
+  "death": "death",
+  "debaty": "debats",
+  "delo-seti": "seti-case",
+  "democracy": "democracy",
+  "demografiya": "demographics",
+  "demonstrations": "demonstrations",
+  "depression": "depression",
+  "derevnya": "village",
+  "design": "design",
+  "detskie-doma": "orphanages",
+  "detstvo": "childhood",
+  "digital": "digital",
+  "digital-art": "digital-art",
+  "directing": "directing",
+  "diskurs": "discours",
+  "diskurs-1": "discourse",
+  "dissidenty": "dissidents",
+  "diy": "diy",
+  "dmitriy-donskoy": "dmitriy-donskoy",
+  "dmitriy-prigov": "dmitriy-prigov",
+  "dnevniki": "dairies",
+  "documentary": "documentary",
+  "dokumenty": "doсuments",
+  "domashnee-nasilie": "home-terror",
+  "donald-tramp": "donald-trump",
+  "donbass": "donbass",
+  "donorstvo": "donation",
+  "drama": "drama",
+  "dramaturgy": "dramaturgy",
+  "drawing": "drawing",
+  "drevo-zhizni": "tree-of-life",
+  "drugs": "drugs",
+  "dzhaz": "jazz",
+  "dzhek-keruak": "jack-keruak",
+  "dzhim-morrison": "jim-morrison",
+  "dzhordzh-romero": "george-romero",
+  "dzhordzho-agamben": "giorgio-agamben",
+  "ecology": "ecology",
+  "economics": "economics",
+  "eda": "food",
+  "editing": "editing",
+  "editorial-statements": "editorial-statements",
+  "eduard-limonov": "eduard-limonov",
+  "education": "education",
+  "egor-letov": "egor-letov",
+  "eksperiment": "experiments",
+  "eksperimentalnaya-muzyka": "experimental-music",
+  "ekspressionizm": "expressionism",
+  "ekstremizm": "extremism",
+  "ekzistentsializm-1": "existentialism",
+  "elections": "elections",
+  "electronic": "electronics",
+  "electronics": "electronics",
+  "elena-glinskaya": "elena-glinskaya",
+  "elena-guro": "elena-guro",
+  "elizaveta-mnatsakanova": "elizaveta-mnatsakanova",
+  "embient": "ambient",
+  "emigration": "emigration",
+  "emil-dyurkgeym": "emile-durkheim",
+  "emotsii": "emotions",
+  "empiric": "empiric",
+  "epidemiya": "pandemic",
+  "erich-von-neff": "erich-von-neff",
+  "erotika": "erotics",
+  "essay": "essay",
+  "estetika": "aestetics",
+  "etika": "ethics",
+  "etnos": "ethnics",
+  "everyday-life": "everyday-life",
+  "evgeniy-onegin": "eugene-onegin",
+  "evolyutsiya": "evolution",
+  "exhibitions": "exhibitions",
+  "experience": "experiences",
+  "experimental": "experimental",
+  "experimental-music": "experimental-music",
+  "explanation": "explanation",
+  "faktcheking": "fact-checking",
+  "falsifikatsii": "falsifications",
+  "family": "family",
+  "fanfiki": "fan-fiction",
+  "fantastika": "sci-fi",
+  "fatalizm": "fatalism",
+  "fedor-dostoevskiy": "fedor-dostoevsky",
+  "fedor-ioannovich": "fedor-ioannovich",
+  "feleton": "feuilleton",
+  "feminism": "feminism",
+  "fenomenologiya": "phenomenology",
+  "fentezi": "fantasy",
+  "festival": "festival",
+  "festival-territoriya": "festival-territory",
+  "folk": "folk",
+  "folklor": "folklore",
+  "fotoreportazh": "photoreports",
+  "france": "france",
+  "frants-kafka": "franz-kafka",
+  "frederik-begbeder": "frederick-begbeder",
+  "freedom": "freedom",
+  "friendship": "friendship",
+  "fsb": "fsb",
+  "futbol": "footbool",
+  "future": "future",
+  "futuristy": "futurists",
+  "futurizm": "futurism",
+  "galereya": "gallery",
+  "gdr": "gdr",
+  "gender": "gender",
+  "gendernyy-diskurs": "gender",
+  "gennadiy-aygi": "gennadiy-aygi",
+  "gerhard-rihter": "gerhard-rihter",
+  "germaniya": "germany",
+  "germenevtika": "hermeneutics",
+  "geroi": "heroes",
+  "girls": "girls",
+  "gkchp": "gkchp",
+  "glitch": "glitch",
+  "globalizatsiya": "globalisation",
+  "gollivud": "hollywood",
+  "gonzo": "gonzo",
+  "gore-ot-uma": "woe-from-wit",
+  "graffiti": "graffiti",
+  "graphics": "graphics",
+  "gravyura": "engraving",
+  "grazhdanskaya-oborona": "grazhdanskaya-oborona",
+  "gretsiya": "greece",
+  "gulag": "gulag",
+  "han-batyy": "khan-batyy",
+  "health": "health",
+  "himiya": "chemistry",
+  "hip-hop": "hip-hop",
+  "history": "history",
+  "history-of-russia": "history-of-russia",
+  "holokost": "holocaust",
+  "horeografiya": "choreography",
+  "horror": "horror",
+  "hospis": "hospice",
+  "hristianstvo": "christianity",
+  "humans": "humans",
+  "humour": "humour",
+  "ideologiya": "ideology",
+  "idm": "idm",
+  "igil": "isis",
+  "igor-pomerantsev": "igor-pomerantsev",
+  "igra-prestolov": "game-of-throne",
+  "igry": "games",
+  "iisus-hristos": "jesus-christ",
+  "illness": "illness",
+  "illustration-history": "illustration-history",
+  "illustrations": "illustrations",
+  "imazhinizm": "imagism",
+  "immanuil-kant": "immanuel-kant",
+  "impressionizm": "impressionism",
+  "improvizatsiya": "improvisation",
+  "indi": "indie",
+  "individualizm": "individualism",
+  "infografika": "infographics",
+  "informatsiya": "information",
+  "ingmar-bergman": "ingmar-bergman",
+  "inklyuziya": "inclusion",
+  "installyatsiya": "installation",
+  "internet": "internet",
+  "interview": "interview",
+  "invalidnost": "disability",
+  "investigations": "investigations",
+  "iosif-brodskiy": "joseph-brodsky",
+  "iosif-stalin": "joseph-stalin",
+  "iskusstvennyy-intellekt": "artificial-intelligence",
+  "islam": "islam",
+  "istoriya-moskvy": "moscow-history",
+  "istoriya-teatra": "theatre-history",
+  "italiya": "italy",
+  "italyanskiy-yazyk": "italian-language",
+  "iudaika": "judaica",
+  "ivan-groznyy": "ivan-grozny",
+  "ivan-iii-gorbatyy": "ivan-iii-gorbaty",
+  "ivan-kalita": "ivan-kalita",
+  "ivan-krylov": "ivan-krylov",
+  "izobreteniya": "inventions",
+  "izrail-1": "israel",
+  "jazz": "jazz",
+  "john-lennon": "john-lennon",
+  "journalism": "journalism",
+  "justice": "justice",
+  "k-pop": "k-pop",
+  "kalligrafiya": "calligraphy",
+  "karikatura": "caricatures",
+  "katrin-nenasheva": "katrin-nenasheva",
+  "kavkaz": "caucasus",
+  "kazan": "kazan",
+  "kiberbezopasnost": "cybersecurity",
+  "kinoklub": "cinema-club",
+  "kirill-serebrennikov": "kirill-serebrennikov",
+  "klassika": "classic",
+  "kollektivnoe-bessoznatelnoe": "сollective-unconscious",
+  "komediya": "comedy",
+  "kommunikatsii": "communications",
+  "kommunizm": "communism",
+  "kommuny": "communes",
+  "kompyuternye-igry": "computer-games",
+  "konservatizm": "conservatism",
+  "kontrkultura": "counter-culture",
+  "kontseptualizm": "conceptualism",
+  "korotkometrazhka": "cinema-shorts",
+  "kosmos": "cosmos",
+  "kraudfanding": "crowdfunding",
+  "krizis": "crisis",
+  "krov": "blood",
+  "krym": "crimea",
+  "kulturologiya": "culturology",
+  "kulty": "cults",
+  "kurdistan": "kurdistan",
+  "kurt-kobeyn": "kurt-cobain",
+  "kurt-vonnegut": "kurt-vonnegut",
+  "kvir": "queer",
+  "laboratoriya": "lab",
+  "language": "languages",
+  "lars-fon-trier": "lars-fon-trier",
+  "laws": "laws",
+  "lectures": "lectures",
+  "leto": "summer",
+  "lev-tolstoy": "leo-tolstoy",
+  "lgbt": "lgbt",
+  "liberalizm": "liberalism",
+  "libertarianstvo": "libertarianism",
+  "life": "life",
+  "likbez": "likbez",
+  "lingvistika": "linguistics",
+  "lirika": "lirics",
+  "literary-studies": "literary-studies",
+  "literature": "literature",
+  "lo-fi": "lo-fi",
+  "love": "love",
+  "luzha-goluboy-krovi": "luzha-goluboy-krovi",
+  "lyudvig-vitgenshteyn": "ludwig-wittgenstein",
+  "lzhedmitriy": "false-dmitry",
+  "lzhenauka": "pseudoscience",
+  "maks-veber": "max-weber",
+  "manifests": "manifests",
+  "manipulyatsii-soznaniem": "mind-manipulation",
+  "marina-abramovich": "marina-abramovich",
+  "marketing": "marketing",
+  "marksizm": "marxism",
+  "marsel-dyushan": "marchel-duchamp",
+  "martin-haydegger": "martin-hidegger",
+  "matematika": "maths",
+  "vladimir-mayakovskiy": "vladimir-mayakovsky",
+  "mayakovskiy": "vladimir-mayakovsky",
+  "ekzistentsiya": "existence",
+  "media": "media",
+  "medicine": "medicine",
+  "memuary": "memoirs",
+  "menedzhment": "management",
+  "merab-mamardashvili": "merab-mamardashvili",
+  "mest": "revenge",
+  "metamodernizm": "metamodern",
+  "metavselennaya": "metaverse",
+  "metro": "metro",
+  "mifologiya": "mythology",
+  "mify": "myth",
+  "mihael-haneke": "michael-haneke",
+  "mihail-baryshnikov": "mihail-baryshnikov",
+  "mihail-bulgakov": "mihail-bulgakov",
+  "mikrotonalnaya-muzyka": "mikrotone-muzyka",
+  "minimalizm": "minimalism",
+  "minkult-privet": "minkult-privet",
+  "mir": "world",
+  "mirovozzrenie": "mindsets",
+  "mishel-fuko": "michel-foucault",
+  "mistika": "mystics",
+  "mitropolit-makariy": "mitropolit-makariy",
+  "mlm": "mlm",
+  "moda": "fashion",
+  "modernizm": "modernism",
+  "mokyumentari": "mockumentary",
+  "moloko-plus": "moloko-plus",
+  "money": "money",
+  "monologs": "monologues",
+  "monstratsiya": "monstration",
+  "moralnaya-otvetstvennost": "moral-responsibility",
+  "more": "sea",
+  "moscow": "moscow",
+  "moshennichestvo": "frauds",
+  "moskovskiy-romanticheskiy-kontseptualizm": "moscow-romantic-conceptualism",
+  "moskovskoe-delo": "moscow-case",
+  "movies": "movies",
+  "mozg": "brain",
+  "multiplikatsiya": "animation",
+  "music": "music",
+  "muzei": "museum",
+  "muzey": "museum",
+  "muzhchiny": "man",
+  "myshlenie": "thinking",
+  "nagornyy-karabah": "nagorno-karabakh",
+  "natsionalizm": "nationalism",
+  "natsionalnaya-ideya": "national-idea",
+  "natsizm": "nazism",
+  "natyurmort": "nature-morte",
+  "nauchpop": "pop-science",
+  "nbp": "nbp",
+  "nenavist": "hate",
+  "neofitsialnaya-literatura": "unofficial-literature",
+  "neoklassika": "neoclassic",
+  "neprozrachnye-smysly": "hidden-meanings",
+  "neravenstvo": "inequality",
+  "new-year": "new-year",
+  "neyronauka": "neuro-science",
+  "neyroseti": "neural-networks",
+  "niu-vshe": "hse",
+  "nizhniy-novgorod": "nizhny-novgorod",
+  "nko": "nonprofits",
+  "nlo": "ufo",
+  "nobelevskaya-premiya": "nobel-prize",
+  "noize-mc": "noize-mc",
+  "nonkonformizm": "nonconformism",
+  "novaya-drama": "new-drama",
+  "novosti": "news",
+  "noyz": "noise",
+  "oberiu": "oberiu",
+  "ocherk": "etudes",
+  "ochevidnyy-nuar": "ochevidnyy-nuar",
+  "odinochestvo": "loneliness",
+  "odna-kniga-odna-istoriya": "one-book-one-story",
+  "okrainy": "outskirts",
+  "opinions": "opinions",
+  "oppozitsiya": "opposition",
+  "orhan-pamuk": "orhan-pamuk",
+  "ornitologiya": "ornitology",
+  "osip-mandelshtam": "osip-mandelshtam",
+  "oskar-uayld": "oscar-wilde",
+  "osoznanie": "awareness",
+  "otnosheniya": "relationship",
+  "pablo-pikasso": "pablo-picasso",
+  "painting": "painting",
+  "paintings": "painting",
+  "pamyat": "memory",
+  "pandemiya": "pandemic",
+  "parizh": "paris",
+  "patriotizm": "patriotism",
+  "paul-tselan": "paul-tselan",
+  "per-burd": "pierre-bourdieu",
+  "performance": "performance",
+  "peyzazh": "landscape",
+  "philology": "philology",
+  "philosophy": "philosophy",
+  "photo": "photography",
+  "photography": "photography",
+  "photoprojects": "photoprojects",
+  "plakaty": "posters",
+  "plastilin": "plasticine",
+  "plays": "plays",
+  "podrostki": "teenagers",
+  "poema": "poem",
+  "poems": "poems",
+  "poeticheskaya-proza": "poetic-prose",
+  "poetry": "poetry",
+  "poetry-of-squares": "poetry-of-squares",
+  "poetry-slam": "poetry-slam",
+  "police": "police",
+  "politics": "politics",
+  "polsha": "poland",
+  "pop-art": "pop-art",
+  "pop-culture": "pop-culture",
+  "pornografiya": "pornography",
+  "portret": "portrait",
+  "poslovitsy": "proverbs",
+  "post-pank": "post-punk",
+  "post-rok": "post-rock",
+  "postmodernism": "postmodernism",
+  "povest": "novells",
+  "povsednevnost": "everyday-life",
+  "power": "power",
+  "pravo": "right",
+  "pravoslavie": "orthodox",
+  "pravozaschitniki": "human-rights-activism",
+  "prazdnik": "holidays",
+  "predatelstvo": "betrayal",
+  "predprinimatelstvo": "entrepreneurship",
+  "premera": "premier",
+  "premiya-oskar": "oscar-prize",
+  "pribaltika-1": "baltic",
+  "priroda": "nature",
+  "prison": "prison",
+  "pritcha": "parable",
+  "privatnost": "privacy",
+  "progress": "progress",
+  "projects": "projects",
+  "prokrastinatsiya": "procrastination",
+  "propaganda": "propaganda",
+  "proschenie": "forgiveness",
+  "prose": "prose",
+  "proshloe": "past",
+  "prostitutsiya": "prostitution",
+  "prosveschenie": "enlightenment",
+  "protests": "protests",
+  "psalmy": "psalms",
+  "psihoanaliz": "psychoanalysis",
+  "psihodeliki": "psychodelics",
+  "pskov": "pskov",
+  "psychiatry": "psychiatry",
+  "psychology": "psychology",
+  "punk": "punk",
+  "r-b": "rnb",
+  "realizm": "realism",
+  "redaktura": "editorial",
+  "refleksiya": "reflection",
+  "reggi": "reggae",
+  "religion": "religion",
+  "rene-zhirar": "rene-girard",
+  "renesanss": "renessance",
+  "renovatsiya": "renovation",
+  "rep": "rap",
+  "reportage": "reportage",
+  "repressions": "repressions",
+  "research": "research",
+  "retroveyv": "retrowave",
+  "review": "review",
+  "revolution": "revolution",
+  "rezo-gabriadze": "rezo-gabriadze",
+  "risunki": "painting",
+  "roboty": "robots",
+  "rock": "rock",
+  "roditeli": "parents",
+  "romantizm": "romantism",
+  "romany": "novell",
+  "ronald-reygan": "ronald-reygan",
+  "roskomnadzor": "roskomnadzor",
+  "rossiyskoe-kino": "russian-cinema",
+  "rozhava": "rojava",
+  "rpts": "rpts",
+  "rus-na-grani-sryva": "rus-na-grani-sryva",
+  "russia": "russia",
+  "russian-language": "russian-language",
+  "russian-literature": "russian-literature",
+  "russkiy-mir": "russkiy-mir",
+  "salvador-dali": "salvador-dali",
+  "samoidentifikatsiya": "self-identity",
+  "samoopredelenie": "self-definition",
+  "sankt-peterburg": "saint-petersburg",
+  "sasha-skochilenko": "sasha-skochilenko",
+  "satira": "satiric",
+  "saund-art": "sound-art",
+  "schaste": "hapiness",
+  "school": "school",
+  "science": "science",
+  "sculpture": "sculpture",
+  "second-world-war": "second-world-war",
+  "sekond-hend": "second-hand",
+  "seksprosvet": "sex-education",
+  "sekty": "sects",
+  "semiotics": "semiotics",
+  "serbiya": "serbia",
+  "serialy": "series",
+  "sever": "north",
+  "severnaya-koreya": "north-korea",
+  "sex": "sex",
+  "shotlandiya": "scotland",
+  "shugeyz": "shoegaze",
+  "siloviki": "siloviki",
+  "simeon-bekbulatovich": "simeon-bekbulatovich",
+  "simvolizm": "simbolism",
+  "siriya": "siria",
+  "skulptura": "sculpture",
+  "slavoy-zhizhek": "slavoj-zizek",
+  "smysl": "meaning",
+  "sny": "dreams",
+  "sobytiya": "events",
+  "social": "society",
+  "society": "society",
+  "sociology": "sociology",
+  "sofya-paleolog": "sofya-paleolog",
+  "sofya-vitovtovna": "sofya-vitovtovna",
+  "soobschestva": "communities",
+  "soprotivlenie": "resistence",
+  "sotsializm": "socialism",
+  "sotsialnaya-filosofiya": "social-philosophy",
+  "sotsseti": "social-networks",
+  "sotvorenie-tretego-rima": "third-rome",
+  "sovremennost": "modernity",
+  "spaces": "spaces",
+  "spektakl": "spectacles",
+  "spetseffekty": "special-fx",
+  "spetsoperatsiya": "special-operation",
+  "spetssluzhby": "special-services",
+  "sport": "sport",
+  "srednevekove": "middle-age",
+  "state": "state",
+  "statistika": "statistics",
+  "stendap": "stand-up",
+  "stoitsizm": "stoicism",
+  "stories": "stories",
+  "stoyanie-na-ugre": "stoyanie-na-ugre",
+  "strah": "fear",
+  "street-art": "street-art",
+  "stsenarii": "scenarios",
+  "summary": "summary",
+  "supergeroi": "superheroes",
+  "svetlana-aleksievich": "svetlana-aleksievich",
+  "svobodu-ivanu-golunovu": "free-ivan-golunov",
+  "syurrealizm": "surrealism",
+  "tales": "tales",
+  "tanets": "dance",
+  "tataro-mongolskoe-igo": "mongol-tatar-yoke",
+  "tatuirovki": "tattoo",
+  "technology": "technology",
+  "televidenie": "tv",
+  "telo": "body",
+  "telo-kak-iskusstvo": "body-as-art",
+  "terrorizm": "terrorism",
+  "tests": "tests",
+  "text": "texts",
+  "the-beatles": "the-beatles",
+  "theater": "theater",
+  "theory": "theory",
+  "tokio": "tokio",
+  "torture": "torture",
+  "totalitarizm": "totalitarism",
+  "traditions": "traditions",
+  "tragicomedy": "tragicomedy",
+  "transgendernost": "transgender",
+  "translation": "translation",
+  "transport": "transport",
+  "travel": "travel",
+  "travma": "trauma",
+  "trendy": "trends",
+  "tretiy-reyh": "third-reich",
+  "triller": "thriller",
+  "tsar": "central-african-republic",
+  "tsar-edip": "oedipus",
+  "tsarevich-dmitriy": "tsarevich-dmitry",
+  "tsennosti": "values",
+  "tsenzura": "censorship",
+  "tseremonii": "ceremonies",
+  "turizm": "tourism",
+  "tvorchestvo": "creativity",
+  "ugnetennyy-zhilischnyy-klass": "oppressed-housing-class",
+  "uilyam-shekspir": "william-shakespeare",
+  "ukraine": "ukraine",
+  "university": "university",
+  "urban-studies": "urban-studies",
+  "uroki-literatury": "literature-lessons",
+  "usa": "usa",
+  "ussr": "ussr",
+  "utopiya": "utopia",
+  "valter-benyamin": "valter-benyamin",
+  "varlam-shalamov": "varlam-shalamov",
+  "vasiliy-ii-temnyy": "basil-ii-temnyy",
+  "vasiliy-iii": "basil-iii",
+  "vdnh": "vdnh",
+  "vechnost": "ethernety",
+  "velikobritaniya": "great-britain",
+  "velimir-hlebnikov": "velimir-hlebnikov",
+  "velkom-tu-greyt-britn": "welcome-to-great-britain",
+  "venedikt-erofeev": "venedikt-erofeev",
+  "venetsiya": "veneece",
+  "vengriya": "hungary",
+  "verlibry": "free-verse",
+  "veschi": "things",
+  "vessels": "vessels",
+  "veterany": "veterans",
+  "video": "video",
+  "videoart": "videoart",
+  "videoklip": "clips",
"videopoeziya": "video-poetry", + "viktor-astafev": "viktor-astafev", + "viktor-pelevin": "viktor-pelevin", + "vilgelm-rayh": "wilhelm-reich", + "vinzavod": "vinzavod", + "violence": "violence", + "visual-culture": "visual-culture", + "vizualnaya-poeziya": "visual-poetry", + "vladimir-lenin": "vladimir-lenin", + "vladimir-nabokov": "vladimir-nabokov", + "vladimir-putin": "vladimir-putin", + "vladimir-sorokin": "vladimir-sorokin", + "vladimir-voynovich": "vladimir-voynovich", + "volga": "volga", + "volontery": "volonteurs", + "vong-karvay": "wong-karwai", + "vospominaniya": "memories", + "vostok": "east", + "vremya": "time", + "vudi-allen": "woody-allen", + "vynuzhdennye-otnosheniya": "forced-relationship", + "war": "war", + "war-in-ukraine-images": "war-in-ukrahine-images", + "women": "women", + "work": "work", + "writers": "writers", + "xx-century": "xx-century", + "yakob-yordans": "yakob-yordans", + "yan-vermeer": "yan-vermeer", + "yanka-dyagileva": "yanka-dyagileva", + "yaponskaya-literatura": "japan-literature", + "youth": "youth", + "yozef-rot": "yozef-rot", + "yurgen-habermas": "jorgen-habermas", + "za-liniey-mannergeyma": "behind-mannerheim-line", + "zahar-prilepin": "zahar-prilepin", + "zakonodatelstvo": "laws", + "zakony-mira": "world-laws", + "zametki": "notes", + "zhelanie": "wish", + "konets-vesny": "end-of-spring", + "zhivotnye": "animals", + "zhoze-saramago": "jose-saramago", + "zigmund-freyd": "sigmund-freud", + "zolotaya-orda": "golden-horde", + "zombi": "zombie", + "zombi-simpsony": "zombie-simpsons", + "rouling": "rowling", + "diskurs-analiz": "discourse-analytics", + "menty": "police", + "ptitsy": "birds", + "salo": "lard", + "rasizm": "racism", + "griby": "mushrooms", + "politzaklyuchennye": "political-prisoners", + "molodezh": "youth", + "blocked-in-russia": "blocked-in-russia", + "kavarga": "kavarga", + "galereya-anna-nova": "gallery-anna-nova", + "derrida": "derrida" +} \ No newline at end of file diff --git a/migration/tables/topics.py b/migration/tables/topics.py new file mode 100644 index 00000000..57084ecb --- /dev/null +++ b/migration/tables/topics.py @@ -0,0 +1,28 @@ +from migration.extract import extract_md, html2text +from orm.base import local_session +from orm import Topic, Community + +def migrate(entry): + body_orig = entry.get('description', '').replace(' ', ' ') + topic_dict = { + 'slug': entry['slug'], + 'oid': entry['_id'], + 'title': entry['title'].replace(' ', ' '), #.lower(), + 'children': [], + 'community' : Community.default_community.slug + } + topic_dict['body'] = extract_md(html2text(body_orig), entry['_id']) + with local_session() as session: + slug = topic_dict['slug'] + topic = session.query(Topic).filter(Topic.slug == slug).first() + if not topic: + topic = Topic.create(**topic_dict) + if len(topic.title) > len(topic_dict['title']): + topic.update({ 'title': topic_dict['title'] }) + if len(topic.body) < len(topic_dict['body']): + topic.update({ 'body': topic_dict['body'] }) + session.commit() + # print(topic.__dict__) + rt = topic.__dict__.copy() + del rt['_sa_instance_state'] + return rt diff --git a/migration/tables/users.py b/migration/tables/users.py new file mode 100644 index 00000000..40b0eaf4 --- /dev/null +++ b/migration/tables/users.py @@ -0,0 +1,106 @@ +import sqlalchemy +from migration.html2text import html2text +from orm import User, UserRating +from dateutil.parser import parse +from orm.base import local_session + +def migrate(entry): + if 'subscribedTo' in entry: del entry['subscribedTo'] + email = 
diff --git a/migration/tables/users.py b/migration/tables/users.py
new file mode 100644
index 00000000..40b0eaf4
--- /dev/null
+++ b/migration/tables/users.py
@@ -0,0 +1,106 @@
+import sqlalchemy
+from dateutil.parser import parse
+from migration.html2text import html2text
+from orm import User, UserRating
+from orm.base import local_session
+
+
+def migrate(entry):
+    if 'subscribedTo' in entry: del entry['subscribedTo']
+    email = entry['emails'][0]['address']
+    user_dict = {
+        'oid': entry['_id'],
+        'roles': [],  # entry['roles'] # roles by community
+        'ratings': [],  # entry['ratings']
+        'username': email,
+        'email': email,
+        'password': entry['services']['password'].get('bcrypt', ''),
+        'createdAt': parse(entry['createdAt']),
+        'emailConfirmed': bool(entry['emails'][0]['verified']),
+        'muted': False,  # amnesty
+        'bio': entry.get('profile', {}).get('bio', ''),
+        'notifications': [],
+        'links': [],
+        'name': 'anonymous'
+    }
+    if 'updatedAt' in entry: user_dict['updatedAt'] = parse(entry['updatedAt'])
+    if 'wasOnlineAt' in entry: user_dict['wasOnlineAt'] = parse(entry['wasOnlineAt'])
+    if entry.get('profile'):
+        # slug
+        user_dict['slug'] = entry['profile'].get('path')
+        user_dict['bio'] = html2text(entry.get('profile').get('bio') or '')
+
+        # userpic
+        try:
+            user_dict['userpic'] = 'https://assets.discours.io/unsafe/100x/' + entry['profile']['thumborId']
+        except KeyError:
+            try:
+                user_dict['userpic'] = entry['profile']['image']['url']
+            except KeyError:
+                user_dict['userpic'] = ''
+
+        # name
+        fn = entry['profile'].get('firstName', '')
+        ln = entry['profile'].get('lastName', '')
+        name = user_dict['slug'] if user_dict['slug'] else 'noname'
+        name = fn if fn else name
+        name = (name + ' ' + ln) if ln else name
+        name = entry['profile']['path'].lower().replace(' ', '-') if len(name) < 2 else name
+        user_dict['name'] = name
+
+        # links
+        fb = entry['profile'].get('facebook', False)
+        if fb: user_dict['links'].append(fb)
+        vk = entry['profile'].get('vkontakte', False)
+        if vk: user_dict['links'].append(vk)
+        tr = entry['profile'].get('twitter', False)
+        if tr: user_dict['links'].append(tr)
+        ws = entry['profile'].get('website', False)
+        if ws: user_dict['links'].append(ws)
+
+    # fall back to a slug derived from the first link, then the email local part
+    if not user_dict['slug'] and len(user_dict['links']) > 0:
+        user_dict['slug'] = user_dict['links'][0].split('/')[-1]
+    user_dict['slug'] = user_dict.get('slug') or user_dict['email'].split('@')[0]
+
+    oid = user_dict['oid']
+    try:
+        user = User.create(**user_dict.copy())
+    except sqlalchemy.exc.IntegrityError:
+        # slug already taken: reuse the existing user and remember the old oid
+        print('[migration] cannot create user ' + user_dict['slug'])
+        with local_session() as session:
+            old_user = session.query(User).filter(User.slug == user_dict['slug']).first()
+            if not old_user:
+                print('[migration] ERROR: cannot find user ' + user_dict['slug'])
+                raise Exception
+            old_user.oid = oid
+            user = old_user
+    user_dict['id'] = user.id
+    return user_dict
+
+
+def migrate_2stage(entry, id_map):
+    ce = 0
+    for rating_entry in entry.get('ratings', []):
+        rater_oid = rating_entry['createdBy']
+        rater_slug = id_map.get(rater_oid)
+        if not rater_slug:
+            ce += 1
+            # print(rating_entry)
+            continue
+        author_slug = id_map.get(entry['_id'])
+        user_rating_dict = {
+            'value': rating_entry['value'],
+            'rater': rater_slug,
+            'user': author_slug
+        }
+        with local_session() as session:
+            try:
+                user_rating = UserRating.create(**user_rating_dict)
+            except sqlalchemy.exc.IntegrityError:
+                # this rater already rated this user: concatenate the values instead
+                old_rating = session.query(UserRating).filter(
+                    UserRating.rater == rater_slug, UserRating.user == author_slug).first()
+                print('[migration] cannot create ' + author_slug + '`s rate from ' + rater_slug)
+                print('[migration] concat rating value %d+%d=%d' %
+                      (old_rating.value, rating_entry['value'], old_rating.value + rating_entry['value']))
+                old_rating.update({'value': old_rating.value + rating_entry['value']})
+                session.commit()
+            except Exception as e:
+                print(e)
+    return ce
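`migrate` and `migrate_2stage` are meant to run as two passes: the first creates users and yields oid-to-slug pairs, and only with that map can the second resolve each rating's `createdBy` oid to a rater slug. A rough driver sketch of that calling contract (the `old_users` list is hypothetical, and nothing here runs without the project's orm package):

id_map = {}
for entry in old_users:                        # old_users: hypothetical list of Mongo user documents
    user = migrate(entry)                      # first pass: create User rows
    id_map[user['oid']] = user['slug']

missed = 0
for entry in old_users:
    missed += migrate_2stage(entry, id_map)    # second pass: attach ratings via the map
print('[migration] %d ratings missed their rater' % missed)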
diff --git a/migration/utils.py b/migration/utils.py
new file mode 100644
index 00000000..9a19c556
--- /dev/null
+++ b/migration/utils.py
@@ -0,0 +1,9 @@
+from datetime import datetime
+from json import JSONEncoder
+
+
+class DateTimeEncoder(JSONEncoder):
+    def default(self, z):
+        # serialize datetimes via str(); defer everything else to JSONEncoder
+        if isinstance(z, datetime):
+            return str(z)
+        return super().default(z)
\ No newline at end of file
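A quick usage sketch for the encoder above, standalone apart from this module (the example payload is illustrative):

import json
from datetime import datetime

from migration.utils import DateTimeEncoder

payload = {'slug': 'example', 'createdAt': datetime(2022, 8, 11, 12, 14, 12)}
print(json.dumps(payload, cls=DateTimeEncoder))
# prints: {"slug": "example", "createdAt": "2022-08-11 12:14:12"}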