From 5b679f99e0b9ffb20265c0d5df626bb95ec21f03 Mon Sep 17 00:00:00 2001
From: tonyrewin
Date: Sun, 3 Jul 2022 03:58:41 +0300
Subject: [PATCH] migration: new extract logix

---
 migrate.py           | 171 ++++++++++++++++++-------------------
 migration/extract.py | 154 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 226 insertions(+), 99 deletions(-)
 create mode 100644 migration/extract.py

diff --git a/migrate.py b/migrate.py
index 9da3ca9c..618eb097 100644
--- a/migrate.py
+++ b/migrate.py
@@ -1,9 +1,7 @@
 ''' cmd managed migration '''
 import json
-import pprint
-import base64
-import re
 import frontmatter
+from migration.extract import extract
 from migration.tables.users import migrate as migrateUser
 from migration.tables.users import migrate_2stage as migrateUser_2stage
 from migration.tables.users import migrate_email_subscription
@@ -19,35 +17,14 @@
 from dateutil.parser import parse as date_parse
 from orm.base import local_session
 from orm import User
-print = pprint.pprint
-IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,((.|\s)*?))\)"
 OLD_DATE = '2016-03-05 22:22:00.350000'
-
-def extract_images(article):
-    ''' extract b64 encoded images from markdown in article body '''
-    body = article['body']
-    images = []
-    matches = re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE)
-    for i, match in enumerate(matches, start=1):
-        ext = match.group(3)
-        link = 'discoursio-web/public/upload/image-' + \
-            article['old_id'] + str(i) + '.' + ext
-        img = match.group(4)
-        if img not in images:
-            open('../' + link, 'wb').write(base64.b64decode(img))
-            images.append(img)
-        body = body.replace(match.group(2), link)
-        print(link)
-    article['body'] = body
-    return article
-
 
 def users(users_by_oid, users_by_slug, users_data):
     ''' migrating users first '''
     # limiting
     limit = len(users_data)
     if len(sys.argv) > 2: limit = int(sys.argv[2])
-    print('migrating %d users...' % limit)
+    print('[migration] %d users...' % limit)
     counter = 0
     id_map = {}
     for entry in users_data:
@@ -63,16 +40,18 @@ def users(users_by_oid, users_by_slug, users_data):
         users_by_slug[user['slug']] = user # public
         id_map[user['old_id']] = user['slug']
         counter += 1
-    print(' - * - stage 2 users migration - * -')
+    # print(' - * - stage 2 users migration - * -')
+    ce = 0
     for entry in users_data:
-        migrateUser_2stage(entry, id_map)
-    try:
-        open('migration/data/users.old_id.json', 'w').write(json.dumps(users_by_oid, cls=DateTimeEncoder)) # NOTE: by old_id
-        open('migration/data/users.slug.json', 'w').write(json.dumps(users_by_slug, cls=DateTimeEncoder)) # NOTE: by slug
-        print(str(len(users_by_slug.items())) + ' users migrated')
-    except Exception:
-        print('json dump error')
-        # print(users_by_oid)
+        ce += migrateUser_2stage(entry, id_map)
+    # print(str(len(users_by_slug.items())) + ' users migrated')
+    print('[migration] %d user ratings errors' % ce)
+    #try:
+    #    open('migration/data/users.old_id.json', 'w').write(json.dumps(users_by_oid, cls=DateTimeEncoder)) # NOTE: by old_id
+    #    open('migration/data/users.slug.json', 'w').write(json.dumps(users_by_slug, cls=DateTimeEncoder)) # NOTE: by slug
+    #except Exception:
+    #    print('json dump error')
+    #    # print(users_by_oid)
 
 
 def topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data):
@@ -80,7 +59,7 @@ def topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data):
     # limiting
     limit = len(cats_data) + len(tags_data)
     if len(sys.argv) > 2: limit = int(sys.argv[2])
-    print('migrating %d topics...' % limit)
+    print('[migration] %d topics...' % limit)
     counter = 0
     retopics = json.loads(open('migration/tables/replacements.json').read())
     topicslugs_by_oid = {}
@@ -106,8 +85,8 @@ def topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data):
     for oid, oslug in topicslugs_by_oid.items():
         if topics_by_slug.get(oslug):
             topics_by_oid[oid] = topics_by_slug.get(retopics.get(oslug, oslug))
-    print( str(len(topics_by_oid.values())) + ' topics by oid' )
-    print( str(len(topics_by_slug.values())) + ' topics by slug' )
+    print( '[migration] ' + str(len(topics_by_oid.values())) + ' topics by oid' )
+    print( '[migration] ' + str(len(topics_by_slug.values())) + ' topics by slug' )
     #replacements = {} # json.loads(open('migration/tables/replacements.json').read())
     #for t in topics_by_title.values():
     #    slug = replacements.get(t['slug'].strip()) or t['slug'].strip()
@@ -121,32 +100,24 @@ def topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data):
     #    sort_keys=True,
     #    ensure_ascii=False))
 
-def shouts(content_data, shouts_by_slug, shouts_by_oid, oldtopics_by_oid):
+def shouts(content_data, shouts_by_slug, shouts_by_oid):
     ''' migrating content items one by one '''
     # limiting
     limit = len(content_data)
     if len(sys.argv) > 2: limit = int(sys.argv[2])
-    print('migrating %d content items...' % limit)
+    print('[migration] %d content items...' % limit)
     counter = 0
     discours_author = 0
     errored = []
-
+    pub_counter = 0
     # limiting
     try: limit = int(sys.argv[2]) if len(sys.argv) > 2 else len(content_data)
    except ValueError: limit = len(content_data)
-    te = {}
     for entry in content_data[:limit]:
+        if 'slug' in sys.argv and entry['slug'] not in sys.argv: continue
         try:
             shout, terrors = migrateShout(entry, users_by_oid, topics_by_oid)
-            for oid in terrors:
-                if not te.get(oid):
-                    if oldtopics_by_oid.get(oid):
-                        te[oldtopics_by_oid[oid]['slug']] = []
-                    else:
-                        # print('lost old topic id: ' + oid)
-                        pass
-                else:
-                    te[oid].append(shout['slug'])
+            if entry.get('published'): pub_counter += 1
             author = shout['authors'][0]
             shout['authors'] = [ author.id, ]
             newtopics = []
@@ -156,7 +127,6 @@ def shouts(content_data, shouts_by_slug, shouts_by_oid, oldtopics_by_oid):
                 if nt not in newtopics:
                     newtopics.append(nt)
             shout['topics'] = newtopics
-            shout = extract_images(shout)
             shouts_by_slug[shout['slug']] = shout
             shouts_by_oid[entry['_id']] = shout
             line = str(counter+1) + ': ' + shout['slug'] + " @" + str(author.slug)
@@ -165,33 +135,34 @@ def shouts(content_data, shouts_by_slug, shouts_by_oid, oldtopics_by_oid):
             print(line)
             # open('./shouts.id.log', 'a').write(line + '\n')
         except Exception as e:
-            print(entry['_id'])
+            # print(entry['_id'])
             errored.append(entry)
             raise e
 
-    print(te)
-    open('migration/data/shouts.old_id.json','w').write(json.dumps(shouts_by_oid, cls=DateTimeEncoder))
-    open('migration/data/shouts.slug.json','w').write(json.dumps(shouts_by_slug, cls=DateTimeEncoder))
-    print(str(counter) + '/' + str(len(content_data)) + ' content items were migrated')
-    print(str(discours_author) + ' authored by @discours')
+    # print(te)
+    # open('migration/data/shouts.old_id.json','w').write(json.dumps(shouts_by_oid, cls=DateTimeEncoder))
+    # open('migration/data/shouts.slug.json','w').write(json.dumps(shouts_by_slug, cls=DateTimeEncoder))
+    print('[migration] ' + str(counter) + ' content items were migrated')
+    print('[migration] ' + str(pub_counter) + ' have been published')
+    print('[migration] ' + str(discours_author) + ' authored by @discours')
 
 
 def export_shouts(shouts_by_slug, export_articles, export_authors, content_dict):
     # update what was just migrated or load json again
     if len(export_authors.keys()) == 0:
         export_authors = json.loads(open('../src/data/authors.json').read())
-        print(str(len(export_authors.items())) + ' exported authors loaded')
+        print('[migration] ' + str(len(export_authors.items())) + ' exported authors loaded')
     if len(export_articles.keys()) == 0:
         export_articles = json.loads(open('../src/data/articles.json').read())
-        print(str(len(export_articles.items())) + ' exported articles loaded')
+        print('[migration] ' + str(len(export_articles.items())) + ' exported articles loaded')
     # limiting
     limit = 33
     if len(sys.argv) > 2: limit = int(sys.argv[2])
-    print('exporting %d articles to json...' % limit)
+    print('[migration] ' + 'exporting %d articles to json...' % limit)
     # filter
     export_list = [i for i in shouts_by_slug.items() if i[1]['layout'] == 'article']
     export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)
-    print(str(len(export_list)) + ' filtered')
+    print('[migration] ' + str(len(export_list)) + ' filtered')
     export_list = export_list[:limit or len(export_list)]
 
     for (slug, article) in export_list:
@@ -199,20 +170,20 @@ def export_shouts(shouts_by_slug, export_articles, export_authors, content_dict)
         export_slug(slug, export_articles, export_authors, content_dict)
 
 def export_body(article, content_dict):
-    article = extract_images(article)
+    article['body'] = extract(article['body'], article['oid'])
     metadata = get_metadata(article)
     content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
-    open('../discoursio-web/content/' + slug + '.mdx', 'w').write(content)
-    # open('../discoursio-web/content/'+slug+'.html', 'w').write(content_dict[article['old_id']]['body'])
+    open('../discoursio-web/content/' + article['slug'] + '.mdx', 'w').write(content)
+    open('../discoursio-web/content/'+ article['slug'] + '.html', 'w').write(content_dict[article['old_id']]['body'])
 
 def export_slug(slug, export_articles, export_authors, content_dict):
-    print('exporting %s ' % slug)
+    print('[migration] ' + 'exporting %s ' % slug)
     if export_authors == {}:
         export_authors = json.loads(open('../src/data/authors.json').read())
-        print(str(len(export_authors.items())) + ' exported authors loaded')
+        print('[migration] ' + str(len(export_authors.items())) + ' exported authors loaded')
     if export_articles == {}:
         export_articles = json.loads(open('../src/data/articles.json').read())
-        print(str(len(export_articles.items())) + ' exported articles loaded')
+        print('[migration] ' + str(len(export_articles.items())) + ' exported articles loaded')
 
     shout = shouts_by_slug.get(slug, False)
     assert shout, 'no data error'
@@ -233,12 +204,14 @@ def comments(comments_data):
         id_map[old_id] = id
     for comment in comments_data:
         migrateComment_2stage(comment, id_map)
-    print(str(len(id_map)) + ' comments exported')
+    print('[migration] ' + str(len(id_map)) + ' comments exported')
 
-def export_email_subscriptions(email_subscriptions_data):
+def export_email_subscriptions():
+    email_subscriptions_data = json.loads(open('migration/data/email_subscriptions.json').read())
+    print('[migration] ' + str(len(email_subscriptions_data)) + ' email subscriptions loaded')
     for data in email_subscriptions_data:
         migrate_email_subscription(data)
-    print(str(len(email_subscriptions_data)) + ' email subscriptions exported')
+    print('[migration] ' + str(len(email_subscriptions_data)) + ' email subscriptions exported')
 
 
 def export_finish(export_articles = {}, export_authors = {}, export_topics = {}, export_comments = {}):
@@ -247,26 +220,26 @@ def export_finish(export_articles = {}, export_authors = {}, export_topics = {},
         indent=4,
         sort_keys=True,
         ensure_ascii=False))
-    print(str(len(export_authors.items())) + ' authors exported')
+    print('[migration] ' + str(len(export_authors.items())) + ' authors exported')
     open('../src/data/topics.json', 'w').write(json.dumps(export_topics,
         cls=DateTimeEncoder,
         indent=4,
         sort_keys=True,
         ensure_ascii=False))
-    print(str(len(export_topics.keys())) + ' topics exported')
+    print('[migration] ' + str(len(export_topics.keys())) + ' topics exported')
     open('../src/data/articles.json', 'w').write(json.dumps(export_articles,
         cls=DateTimeEncoder,
         indent=4,
         sort_keys=True,
         ensure_ascii=False))
-    print(str(len(export_articles.items())) + ' articles exported')
+    print('[migration] ' + str(len(export_articles.items())) + ' articles exported')
     open('../src/data/comments.json', 'w').write(json.dumps(export_comments,
         cls=DateTimeEncoder,
         indent=4,
         sort_keys=True,
         ensure_ascii=False))
-    print(str(len(export_comments.items())) + ' exported articles with comments')
+    print('[migration] ' + str(len(export_comments.items())) + ' exported articles with comments')
 
 
 if __name__ == '__main__':
@@ -280,10 +253,10 @@ if __name__ == '__main__':
             bson2json.json_tables()
         else:
             # preparing data
-
+            # users
             users_data = json.loads(open('migration/data/users.json').read())
-            print(str(len(users_data)) + ' users loaded')
+            print('[migration] ' + str(len(users_data)) + ' users loaded')
             users_by_oid = {}
             users_by_slug = {}
             user_id_map = {}
@@ -294,10 +267,10 @@ if __name__ == '__main__':
                 users_by_oid[user.old_id] = vars(user)
             # tags
             tags_data = json.loads(open('migration/data/tags.json').read())
-            print(str(len(tags_data)) + ' tags loaded')
+            print('[migration] ' + str(len(tags_data)) + ' tags loaded')
             # cats
             cats_data = json.loads(open('migration/data/content_item_categories.json').read())
-            print(str(len(cats_data)) + ' cats loaded')
+            print('[migration] ' + str(len(cats_data)) + ' cats loaded')
             topics_data = tags_data
             tags_data.extend(cats_data)
             oldtopics_by_oid = { x['_id']: x for x in topics_data }
@@ -308,12 +281,12 @@ if __name__ == '__main__':
             # content
             content_data = json.loads(open('migration/data/content_items.json').read())
             content_dict = { x['_id']: x for x in content_data }
-            print(str(len(content_data)) + ' content items loaded')
+            print('[migration] ' + str(len(content_data)) + ' content items loaded')
             shouts_by_slug = {}
             shouts_by_oid = {}
 
             comments_data = json.loads(open('migration/data/comments.json').read())
-            print(str(len(comments_data)) + ' comments loaded')
+            print('[migration] ' + str(len(comments_data)) + ' comments loaded')
             comments_by_post = {}
             # sort comments by old posts ids
             for old_comment in comments_data:
@@ -321,10 +294,7 @@ if __name__ == '__main__':
                 comments_by_post[cid] = comments_by_post.get(cid, [])
                 if not old_comment.get('deletedAt', True):
                     comments_by_post[cid].append(old_comment)
-            print(str(len(comments_by_post.keys())) + ' articles with comments')
-
-            email_subscriptions_data = json.loads(open('migration/data/email_subscriptions.json').read())
-            print(str(len(email_subscriptions_data)) + ' email subscriptions loaded')
+            print('[migration] ' + str(len(comments_by_post.keys())) + ' articles with comments')
 
             export_articles = {} # slug: shout
             export_authors = {} # slug: user
@@ -338,29 +308,32 @@ if __name__ == '__main__':
             elif cmd == "topics":
                 topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data)
             elif cmd == "shouts":
-                shouts(content_data, shouts_by_slug, shouts_by_oid, oldtopics_by_oid) # NOTE: listens limit
+                shouts(content_data, shouts_by_slug, shouts_by_oid) # NOTE: listens limit
             elif cmd == "comments":
                 comments(comments_data)
             elif cmd == "export_shouts":
                 export_shouts(shouts_by_slug, export_articles, export_authors, content_dict)
             elif cmd == "email_subscriptions":
-                export_email_subscriptions(email_subscriptions_data)
+                export_email_subscriptions()
+            elif cmd == 'slug':
+                export_slug(sys.argv[2], export_articles, export_authors, content_dict)
             elif cmd == "all":
                 users(users_by_oid, users_by_slug, users_data)
                 topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data)
-                shouts(content_data, shouts_by_slug, shouts_by_oid, oldtopics_by_oid)
+                shouts(content_data, shouts_by_slug, shouts_by_oid)
                 comments(comments_data)
-                export_email_subscriptions(email_subscriptions_data)
-            elif cmd == 'slug':
-                export_slug(sys.argv[2], export_articles, export_authors, content_dict)
+                export_email_subscriptions()
+            else:
+                print('[migration] --- debug users, topics, shouts')
+                users(users_by_oid, users_by_slug, users_data)
+                topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data)
+                shouts(content_data, shouts_by_slug, shouts_by_oid)
             #export_finish(export_articles, export_authors, export_topics, export_comments)
     else:
-        print('''
-        usage: python migrate.py bson
-        \n.. \ttopics
-        \n.. \tusers
-        \n.. \tshouts
-        \n.. \texport_shouts
-        \n.. \tslug
-        \n.. \tall
-        ''')
+        print('usage: python migrate.py bson')
+        print('.. \ttopics ')
+        print('.. \tusers ')
+        print('.. \tshouts ')
+        print('.. \texport_shouts ')
+        print('.. \tslug ')
+        print('.. \tall')
diff --git a/migration/extract.py b/migration/extract.py
new file mode 100644
index 00000000..965d4acc
--- /dev/null
+++ b/migration/extract.py
@@ -0,0 +1,154 @@
+import re
+import base64
+
+TOOLTIP_REGEX = r'(\/\/\/(.+)\/\/\/)'
+
+
+def replace_tooltips(body):
+    newbody = body
+    matches = list(re.finditer(TOOLTIP_REGEX, body, re.IGNORECASE | re.MULTILINE))[1:]
+    for match in matches:
+        newbody = body.replace(match.group(1), '') # FIXME: doesn't work
+    if len(matches) > 0:
+        print('[extract] found %d tooltips' % len(matches))
+    return newbody
+
+
+def place_tooltips(body):
+    parts = body.split('///')
+    l = len(parts)
+    newparts = list(parts)
+    if l & 1:
+        if l > 1:
+            i = 1
+            print('[extract] found %d tooltips' % (l-1))
+            for part in parts[1:]:
+                if i & 1:
+                    # print('[extract] tooltip: ' + part)
+                    if 'a class="footnote-url" href=' in part:
+                        fn = 'a class="footnote-url" href="'
+                        link = part.split(fn,1)[1].split('"', 1)[0]
+                        extracted_part = part.split(fn,1)[0] + ' ' + part.split('/', 1)[-1]
+                        newparts[i] = ''
+                    else:
+                        newparts[i] = '' % part
+                        # print('[extract] tooltip: ' + newparts[i])
+                else:
+                    # print('[extract] pass: ' + part[:10] + '..')
+                    newparts[i] = part
+                i += 1
+
+    return ''.join(newparts)
+
+IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,((?:[A-Za-z\d+\/]{4})*(?:[A-Za-z\d+\/]{3}=|[A-Za-z\d+\/]{2}==)))\)"
+public = '../discoursio-web/public'
+cdn = 'https://assets.discours.io'
+cache = {}
+
+
+def reextract_images(body, oid):
+    matches = list(re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE))[1:]
+    i = 0
+    for match in matches:
+        print('[extract] image ' + match.group(1))
+        ext = match.group(3)
+        name = oid + str(i)
+        link = public + '/upload/image-' + name + '.' + ext
+        img = match.group(4)
+        title = match.group(1) # FIXME: this is not the title
+        if img not in cache:
+            content = base64.b64decode(img + '==')
+            print(str(len(img)) + ' image bytes been written')
+            open('../' + link, 'wb').write(content)
+            cache[img] = name
+            i += 1
+        else:
+            print('[extract] image cached ' + cache[img])
+        body.replace(str(match), '![' + title + '](' + cdn + link + ')') # FIXME: this does not work
+    return body
+
+IMAGES = {
+    'data:image/png': 'png',
+    'data:image/jpg': 'jpg',
+    'data:image/jpeg': 'jpg',
+}
+
+sep = ';base64,'
+
+
+def extract_images(body, oid):
+    newbody = ''
+    body = body.replace(' [](data:image', '![](data:image').replace('\n[](data:image', '![](data:image')
+    oldparts = body.split(sep)
+    newparts = list(oldparts)
+    print()
+    if len(oldparts) > 1:
+        print('[extract] images for %s' % oid)
+        print('[extract] %d candidates' % (len(oldparts)-1))
+    i = 0
+    for current in oldparts:
+        next = ''
+        try: next = oldparts[i+1]
+        except: newbody += current
+        start = oldparts.index(current) == 0
+        end = not next
+        if end:
+            continue
+        else: # start or between
+            # print('[extract_images] have next')
+            for mime in IMAGES.keys():
+                if mime in current[-15:]:
+                    # print('[extract_images] found proper mime type')
+                    print('[extract] ' + current[-15:])
+                    if ')' in next:
+                        b64encoded = next.split(')')[0]
+                        print('[extract] '+str(i+1)+': %d bytes' % len(b64encoded))
+                        # print(meta)
+                        ext = IMAGES[mime]
+                        print('[extract] type: ' + mime)
+                        name = oid + '-' + str(i)
+                        print('[extract] name: ' + name)
+                        link = '/upload/image-' + name + '.' + ext
+                        print('[extract] link: ' + link)
+                        if b64encoded:
+                            if b64encoded not in cache:
+                                content = base64.b64decode(b64encoded + '==')
+                                open(public + link, 'wb').write(content)
+                                cache[b64encoded] = name
+                            else:
+                                print('[extract] cached: ' + cache[b64encoded])
+                                name = cache[b64encoded]
+                                link = cdn + '/upload/image-' + name + '.' + ext
+                            newparts[i] = current.split('![](' + mime)[0] + '![](' + link + ')'
+                            newparts[i+1] = next.replace(b64encoded + ')', '')
+                        else:
+                            print('[extract] not b64encoded')
+                            print(current[-15:])
+        i += 1
+    newbody = ''.join(newparts)
+    return newbody
+
+
+def cleanup(body):
+    newbody = body\
+        .replace('<', '').replace('>', '')\
+        .replace('{', '(').replace('}', ')')\
+        .replace('…', '...')\
+        .replace(' __ ', ' ')\
+        .replace('_ _', ' ')\
+        .replace('****', '')\
+        .replace('\u00a0', ' ')\
+        .replace('\u02c6', '^')\
+        .replace('\u00a0',' ')\
+        .replace('\ufeff', '')\
+        .replace('\u200b', '')\
+        .replace('\u200c', '')\
+        # .replace('\u2212', '-')
+    return newbody
+
+
+def extract(body, oid):
+    newbody = extract_images(body, oid)
+    newbody = cleanup(newbody)
+    newbody = place_tooltips(newbody)
+    return newbody
\ No newline at end of file