From d451f9caff6fbe25dccff9af89046aadd7af0e10 Mon Sep 17 00:00:00 2001
From: tonyrewin
Date: Fri, 8 Jul 2022 10:12:32 +0300
Subject: [PATCH] fixed-migration
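
Return storage from each migration handler so stages can be chained,
cache migrated shouts by oid and slug, split HTML extraction out of
prepare_body() into an extract_html() helper used by the exporter,
and deduplicate topic slugs while migrating content items.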
---
 migrate.py                        |   9 +-
 migration/export.py               |  17 ++-
 migration/extract.py              | 217 +++++++++++++++++-------
 migration/tables/content_items.py |   8 +-
 4 files changed, 148 insertions(+), 103 deletions(-)

diff --git a/migrate.py b/migrate.py
index bf65a435..135d87c4 100644
--- a/migrate.py
+++ b/migrate.py
@@ -30,6 +30,7 @@ def users_handle(storage):
     ce = 0
     for entry in storage['users']['data']:
         ce += migrateUser_2stage(entry, id_map)
+    return storage
 
 
 def topics_handle(storage):
@@ -53,6 +54,7 @@ def topics_handle(storage):
     print( '[migration] ' + str(len(storage['topics']['by_oid'].values())) + ' topics by oid' )
     print( '[migration] ' + str(len(storage['topics']['by_slug'].values())) + ' topics by slug' )
     # raise Exception
+    return storage
 
 def shouts_handle(storage):
     ''' migrating content items one by one '''
@@ -69,6 +71,8 @@ def shouts_handle(storage):
 
         # migrate
         shout = migrateShout(entry, storage)
+        storage['shouts']['by_oid'][entry['_id']] = shout
+        storage['shouts']['by_slug'][shout['slug']] = shout
 
         # shouts.topics
         if not shout['topics']: print('[migration] no topics!')
@@ -89,6 +93,7 @@ def shouts_handle(storage):
     print('[migration] ' + str(counter) + ' content items were migrated')
     print('[migration] ' + str(pub_counter) + ' have been published')
     print('[migration] ' + str(discours_author) + ' authored by @discours')
+    return storage
 
 def comments_handle(storage):
     id_map = {}
@@ -102,9 +107,11 @@ def comments_handle(storage):
             id = comment.get('id')
            oid = comment.get('oid')
             id_map[oid] = id
+
     for comment in storage['comments']['data']:
         migrateComment_2stage(comment, id_map)
     print('[migration] ' + str(len(id_map)) + ' comments migrated')
     print('[migration] ' + str(ignored_counter) + ' comments ignored')
+    return storage
 
 def bson_handle():
@@ -125,7 +132,7 @@ def all_handle(storage):
     shouts_handle(storage)
     comments_handle(storage)
     export_email_subscriptions()
-    print('[migration] everything done!')
+    print('[migration] done!')
 
 
 def data_load():

diff --git a/migration/export.py b/migration/export.py
index a3a79ed3..a05d9b5f 100644
--- a/migration/export.py
+++ b/migration/export.py
@@ -3,7 +3,7 @@ from datetime import datetime
 import json
 import os
 import frontmatter
-from migration.extract import prepare_body
+from migration.extract import extract_html, prepare_body
 from migration.tables.users import migrate_email_subscription
 from migration.utils import DateTimeEncoder
 
@@ -42,16 +42,21 @@ def export_mdx(r):
         open(filepath + '.' + ext, 'w').write(bc)
 
 def export_body(shout, storage):
-    shout['body'] = prepare_body(storage['content_items']['by_oid'][shout['oid']])
-    export_mdx(shout)
-    print('[export] trying to save html %s' % shout['slug'])
-    open(contentDir + shout['slug'] + '.html', 'w').write(storage['content_items']['by_oid'][shout['oid']]['body'])
+    entry = storage['content_items']['by_oid'][shout['oid']]
+    if entry:
+        shout['body'] = prepare_body(entry)
+        export_mdx(shout)
+        print('[export] html for %s' % shout['slug'])
+        body = extract_html(entry)
+        open(contentDir + shout['slug'] + '.html', 'w').write(body)
+    else:
+        raise Exception('no content_items entry found')
 
 def export_slug(slug, storage):
-    shout = storage['shouts']['by_slug'][slug]
+    shout = storage['shouts']['by_slug'].get(slug)
     assert shout, '[export] no shout found by slug: %s ' % slug
-    author = storage['users']['by_slug'].get(shout['authors'][0]['slug'])
+    author = shout['authors'][0]
     assert author, '[export] no author error'
     export_body(shout, storage)

diff --git a/migration/extract.py b/migration/extract.py
index b82bec7a..17f0be50 100644
--- a/migration/extract.py
+++ b/migration/extract.py
@@ -31,8 +31,7 @@ def place_tooltips(body):
     print('[extract] found %d tooltips' % (l-1))
     for part in parts[1:]:
         if i & 1:
-            # print([ len(p) for p in parts ])
-            # print('[extract] tooltip: ' + part)
+            placed = True
             if 'a class="footnote-url" href=' in part:
                 print('[extract] footnote: ' + part)
                 fn = 'a class="footnote-url" href="'
@@ -41,11 +40,11 @@ def place_tooltips(body):
                 newparts[i] = '<Tooltip>' + extracted_part + '</Tooltip>'
             else:
                 newparts[i] = '<Tooltip>%s</Tooltip>' % part
+                # print('[extract] ' + newparts[i])
         else:
-            # print('[extract] pass: ' + part[:10] + '..')
+            # print('[extract] ' + part[:10] + '..')
             newparts[i] = part
         i += 1
-        placed = True
     return (''.join(newparts), placed)
 
 IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,((?:[A-Za-z\d+\/]{4})*(?:[A-Za-z\d+\/]{3}=|[A-Za-z\d+\/]{2}==)))\)"
@@ -81,10 +80,11 @@ IMAGES = {
     'data:image/jpeg': 'jpg',
 }
 
-sep = ';base64,'
+b64 = ';base64,'
 
 def extract_imageparts(bodyparts, prefix):
     # recursive loop
+    newparts = list(bodyparts)
     for current in bodyparts:
         i = bodyparts.index(current)
         for mime in IMAGES.keys():
@@ -109,23 +109,67 @@ def extract_imageparts(bodyparts, prefix):
                         raise Exception
                         # raise Exception('[extract] error decoding image %r' %b64encoded)
                 else:
-                    print('[extract] cached: ' + cache[b64encoded])
+                    print('[extract] cached link ' + cache[b64encoded])
                     name = cache[b64encoded]
                     link = cdn + '/upload/image-' + name + '.' + ext
-                bodyparts[i] = current[:-len(mime)] + current[-len(mime):] + link + next[-b64end:]
-                bodyparts[i+1] = next[:-b64end]
+                newparts[i] = current[:-len(mime)] + current[-len(mime):] + link + next[-b64end:]
+                newparts[i+1] = next[:-b64end]
                 break
-    return extract_imageparts(sep.join(bodyparts[i+1:]), prefix) \
-        if len(bodyparts) > (i + 1) else ''.join(bodyparts)
+    return extract_imageparts(newparts[i] + newparts[i+1] + b64.join(bodyparts[i+2:]), prefix) \
+        if len(bodyparts) > (i + 1) else ''.join(newparts)
+
+def extract_dataimages(parts, prefix):
+    newparts = list(parts)
+    for part in parts:
+        i = parts.index(part)
+        if part.endswith(']('):
+            [ext, rest] = parts[i+1].split(b64)
+            name = prefix + '-' + str(len(cache))
+            if ext == '/jpeg': ext = 'jpg'
+            else: ext = ext.replace('/', '')
+            link = '/upload/image-' + name + '.' + ext
+            print('[extract] filename: ' + link)
+            b64end = rest.find(')')
+            if b64end !=-1:
+                b64encoded = rest[:b64end]
+                print('[extract] %d text bytes' % len(b64encoded))
+                # write if not cached
+                if b64encoded not in cache:
+                    try:
+                        content = base64.b64decode(b64encoded + '==')
+                        open(public + link, 'wb').write(content)
+                        print('[extract] ' +str(len(content)) + ' image bytes')
+                        cache[b64encoded] = name
+                    except:
+                        raise Exception
+                        # raise Exception('[extract] error decoding image %r' %b64encoded)
+                else:
+                    print('[extract] 0 image bytes, cached for ' + cache[b64encoded])
+                    name = cache[b64encoded]
+
+                # update link with CDN
+                link = cdn + '/upload/image-' + name + '.' + ext
+
+                # patch newparts
+                newparts[i+1] = link + rest[b64end:]
+            else:
+                raise Exception('cannot find the end of base64 encoded string')
+        else:
+            print('[extract] dataimage skipping part ' + str(i))
+            continue
+    return ''.join(newparts)
+
+di = 'data:image'
 
 def extract_images(body, oid):
     newbody = ''
     body = body\
-        .replace(' [](data:image', '![](data:image')\
-        .replace('\n[](data:image', '![](data:image')
-    parts = body.split(sep)
+        .replace('\n! []('+di, '\n ![]('+di)\
+        .replace('\n[]('+di, '\n![]('+di)\
+        .replace(' []('+di, ' ![]('+di)
+    parts = body.split(di)
     i = 0
-    if len(parts) > 1: newbody = extract_imageparts(parts, oid)
+    if len(parts) > 1: newbody = extract_dataimages(parts, oid)
     else: newbody = body
     return newbody
 
@@ -148,8 +192,9 @@ def cleanup(body):
     return newbody
 
 def extract(body, oid):
-    if body:
-        newbody = extract_images(body, oid)
+    newbody = body
+    if newbody:
+        newbody = extract_images(newbody, oid)
         if not newbody: raise Exception('extract_images error')
         newbody = cleanup(newbody)
         if not newbody: raise Exception('cleanup error')
@@ -157,96 +202,82 @@ def extract(body, oid):
         if not newbody: raise Exception('place_tooltips error')
         if placed:
             newbody = 'import Tooltip from \'$/components/Article/Tooltip\'\n\n' + newbody
-        return newbody
-    return body
+    return newbody
 
 def prepare_body(entry):
-    # print('[migration] preparing body %s' % entry.get('slug',''))
     # body modifications
     body = ''
-    body_orig = entry.get('body', '')
-    if not body_orig: body_orig = ''
-
-    if entry.get('type') == 'Literature':
-        print('[extract] literature')
+    kind = entry.get('type')
+    addon = ''
+    if kind == 'Video':
+        addon = ''
         for m in entry.get('media', []):
-            t = m.get('title', '')
-            if t: body_orig += '<h5>' + t + '</h5>\n'
-            body_orig += (m.get('body') or '').replace((m.get('literatureBody') or ''), '') + m.get('literatureBody', '') + '\n'
-
-    elif entry.get('type') == 'Video':
-        print('[extract] embedding video')
-        providers = set([])
-        video_url = ''
-        require = False
-        for m in entry.get('media', []):
-            yt = m.get('youtubeId', '')
-            vm = m.get('vimeoId', '')
-            if yt:
-                require = True
-                providers.add('YouTube')
-                video_url = 'https://www.youtube.com/watch?v=' + yt
-                body += '<YouTube youtubeId=\'' + yt + '\' />\n'
-            if vm:
-                require = True
-                providers.add('Vimeo')
-                video_url = 'https://vimeo.com/' + vm
-                body += '<Vimeo vimeoId=\'' + vm + '\' />\n'
-            body += extract(html2text(m.get('body', '')), entry['_id'])
-            if video_url == '#': print(entry.get('media', 'UNKNOWN MEDIA PROVIDER!'))
-        if require: body = 'import { ' + ','.join(list(providers)) + ' } from \'solid-social\'\n\n' + body + '\n'
-        # already body_orig = entry.get('body', '')
-
-    elif entry.get('type') == 'Music':
-        print('[extract] music album')
+            if 'youtubeId' in m: addon += '<Social.YouTube youtubeId=\'' + m['youtubeId'] + '\' />\n'
+            elif 'vimeoId' in m: addon += '<Social.Vimeo vimeoId=\'' + m['vimeoId'] + '\' />\n'
+            else:
+                print('[extract] media is not supported')
+                print(m)
+        body = 'import * as Social from \'solid-social\'\n\n' + addon
+
+    elif kind == 'Music':
+        addon = ''
         for m in entry.get('media', []):
             artist = m.get('performer')
             trackname = ''
             if artist: trackname += artist + ' - '
             if 'title' in m: trackname += m.get('title','')
-            body += '<MusicPlayer title=\"' + trackname + '\" />\n'
-            body += extract(html2text(m.get('body', '')), entry['_id'])
-        body = 'import MusicPlayer from \'$/components/Article/MusicPlayer\'\n\n' + body + '\n'
-        # already body_orig = entry.get('body', '')
+            addon += '<MusicPlayer title=\"' + trackname + '\" />\n'
+        body = 'import MusicPlayer from \'$/components/Article/MusicPlayer\'\n\n' + addon
 
-    elif entry.get('type') == 'Image':
-        print('[extract] image gallery')
-        cover = ''
-        if 'thumborId' in entry: cover = cdn + '/unsafe/1600x/' + entry['thumborId']
-        if not cover:
-            if 'image' in entry: cover = entry['image'].get('url', '')
-            if 'cloudinary' in cover: cover = ''
-        else:
-            print('[migration] cover: ' + cover)
-        images = {}
-        for m in entry.get('media', []):
-            b = ''
-            title = m.get('title','').replace('\n', ' ').replace('&nbsp;', ' ')
-            u = m.get('image', {}).get('url', '') or m.get('thumborId') or cover
-            u = str(u)
-            b += '<h4>' + title + '</h4>\n' + body_orig
-            if not u.startswith('http'): u = s3 + u
-            if not u: print('[extract] no image for ' + str(m))
-            if 'cloudinary' in u: u = 'img/lost.svg'
-            if u not in images.keys():
-                # print('[extract] image: ' + u)
-                images[u] = title
-                b += '<img src=\"' + u + '\" alt=\"' + title + '\" />\n'
-            b += m.get('body', '') + '\n'
-            body += extract(html2text(b), entry['_id'])
+    body_orig = extract_html(entry)
+    if body_orig: body += extract(html2text(body_orig), entry['_id'])
+    if not body: print('[extract] empty MDX body')
+    return body
+
+def extract_html(entry):
+    body_orig = entry.get('body') or ''
+    media = entry.get('media', [])
+    kind = entry.get('type') or ''
+    print('[extract] kind: ' + kind)
+    mbodies = set([])
+    if media:
+        # print('[extract] media is found')
+        for m in media:
+            mbody = m.get('body', '')
+            addon = ''
+            if kind == 'Literature':
+                mbody = m.get('literatureBody') or m.get('body', '')
+            elif kind == 'Image':
+                cover = ''
+                if 'thumborId' in entry: cover = cdn + '/unsafe/1600x/' + entry['thumborId']
+                if not cover:
+                    if 'image' in entry: cover = entry['image'].get('url', '')
+                    if 'cloudinary' in cover: cover = ''
+                # else: print('[extract] cover: ' + cover)
+                title = m.get('title','').replace('\n', ' ').replace('&nbsp;', ' ')
+                u = m.get('thumborId') or cover or ''
+                addon = '<h4>' + title + '</h4>\n'
+                if not u.startswith('http'): u = s3 + u
+                if not u: print('[extract] no image url for ' + str(m))
+                if 'cloudinary' in u: u = 'img/lost.svg'
+                if u != cover or (u == cover and media.index(m) == 0):
+                    addon += '<img src=\"' + u + '\" alt=\"' + title + '\" />\n'
+            if addon:
+                body_orig += addon
+                # print('[extract] item addon: ' + addon)
+            # if addon: print('[extract] addon: %s' % addon)
+            if mbody and mbody not in mbodies:
+                mbodies.add(mbody)
+                body_orig += mbody
+        if len(list(mbodies)) != len(media):
+            print('[extract] %d/%d media item bodies appended' % (len(list(mbodies)),len(media)))
+            # print('[extract] media items body: \n' + body_orig)
 
-    elif not body_orig:
+    if not body_orig:
         for up in entry.get('bodyHistory', []) or []:
             body_orig = up.get('text', '') or ''
-            if body_orig:
-                print('[extract] body from history!')
+            if body_orig:
+                print('[extract] got html body from history')
                 break
-    if not body and not body_orig: print('[extract] error: EMPTY BODY')
-
+    if not body_orig: print('[extract] empty HTML body')
     # body_html = str(BeautifulSoup(body_orig, features="html.parser"))
-    # print('[extract] adding original body')
-    if body_orig: body += extract(html2text(body_orig), entry['_id'])
-    if entry['slug'] in sys.argv:
-        open(contentDir + '/' + entry['slug'] + '.html', 'w')\
-            .write(entry.get('body',''))
-    return body
+    return body_orig
\ No newline at end of file
diff --git a/migration/tables/content_items.py b/migration/tables/content_items.py
index d21d61b4..7db195d6 100644
--- a/migration/tables/content_items.py
+++ b/migration/tables/content_items.py
@@ -40,7 +40,7 @@ def migrate(entry, storage):
         'title': entry['title'],
         'community': 0,
         'authors': [],
-        'topics': [],
+        'topics': set([]),
         'rating': 0,
         'ratings': [],
         'createdAt': []
@@ -112,9 +112,10 @@ def migrate(entry, storage):
     topic_oids.extend(entry.get('tags', []))
     for oid in topic_oids:
         if oid in storage['topics']['by_oid']:
-            r['topics'].append(storage['topics']['by_oid'][oid]['slug'])
+            r['topics'].add(storage['topics']['by_oid'][oid]['slug'])
         else:
             print('[migration] unknown old topic id: ' + oid)
+    r['topics'] = list(r['topics'])
 
     entry['topics'] = r['topics']
     entry['cover'] = r['cover']
@@ -191,7 +192,8 @@ def migrate(entry, storage):
                     .filter(ShoutTopic.topic == newslug).first()
                 if not shout_topic_new:
                     ShoutTopic.create(**{ 'shout': s.slug, 'topic': newslug })
                 session.commit()
-                shout_dict['topics'].append(newslug)
+                if newslug not in shout_dict['topics']:
+                    shout_dict['topics'].append(newslug)
             else:
                 print('[migration] ignored topic slug: \n%r' % tpc['slug'])
                 # raise Exception