From c3e0c5720ab6199d691bd5ff5fc645e7e26ab530 Mon Sep 17 00:00:00 2001 From: Untone Date: Fri, 8 Oct 2021 07:42:59 +0300 Subject: [PATCH] upgrade migration --- migrate.py | 186 ++++++++++++++++++++---------- migration/tables/content_items.py | 70 ++++++----- migration/tables/tags.py | 28 ++++- migration/tables/users.py | 150 ++++++++++++------------ 4 files changed, 260 insertions(+), 174 deletions(-) diff --git a/migrate.py b/migrate.py index a8ce0e75..5ed76323 100644 --- a/migrate.py +++ b/migrate.py @@ -1,18 +1,45 @@ +''' cmd managed migration ''' import json +import base64 +import re from migration.tables.users import migrate as migrateUser from migration.tables.content_items import migrate as migrateShout -from migration.tables.content_item_categories import migrate as migrateTopic +from migration.tables.content_item_categories import migrate as migrateCategory +from migration.tables.tags import migrate as migrateTag from migration.utils import DateTimeEncoder from orm import Community -def users(limit): + +IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,(.*?))\)" +OLD_DATE = '2016-03-05 22:22:00.350000' + + +def extract_images(article): + ''' extract b64 encoded images from markdown in article body ''' + body = article['body'] + images = [] + matches = re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE) + for i, match in enumerate(matches, start=1): + ext = match.group(3) + link = '/static/upload/image-' + \ + article['old_id'] + str(i) + '.' + ext + img = match.group(4) + if img not in images: + open('..' + link, 'wb').write(base64.b64decode(img)) + images.append(img) + body = body.replace(match.group(2), link) + print(link) + article['body'] = body + return article + + +def users(): + ''' migrating users first ''' print('migrating users...') - data = json.loads(open('migration/data/users.json').read()) newdata = {} - exportData = {} + data = json.loads(open('migration/data/users.json').read()) counter = 0 - # limit = 100 - #try: + export_data = {} for entry in data: oid = entry['_id'] user = migrateUser(entry) @@ -23,96 +50,127 @@ def users(limit): del user['emailConfirmed'] del user['username'] del user['email'] - exportData[user['slug']] = user + export_data[user['slug']] = user counter += 1 - if counter > limit: - break - #except Exception: - # print(str(counter) + '/' + str(len(data)) + ' users entries were migrated') - # print('try to remove database first') - open('migration/data/users.dict.json','w').write( json.dumps(newdata, cls=DateTimeEncoder) ) - open('../src/data/authors.json','w').write( json.dumps(exportData, cls=DateTimeEncoder) ) - print(str(counter) + ' users entries were migrated') + export_list = sorted(export_data.items(), + key=lambda item: item[1]['rating'])[-10:] + open('migration/data/users.dict.json', + 'w').write(json.dumps(newdata, cls=DateTimeEncoder)) # NOTE: by old_id + open('../src/data/authors.json', 'w').write(json.dumps(dict(export_list), + cls=DateTimeEncoder, + indent=4, + sort_keys=True, + ensure_ascii=False)) + print(str(len(newdata.items())) + ' user accounts were migrated') + print(str(len(export_list)) + ' authors were exported') def topics(): + ''' topics from categories and tags ''' print('migrating topics...') - data = json.loads(open('migration/data/content_item_categories.json').read()) + cat_data = json.loads( + open('migration/data/content_item_categories.json').read()) + tag_data = json.loads(open('migration/data/tags.json').read()) newdata = {} - exportData = {} counter = 0 try: - for entry in data: - oid = entry['_id'] - newdata[oid] = migrateTopic(entry) - exportData[entry['slug']] = newdata[oid] + for cat in cat_data: + topic = migrateCategory(cat) + newdata[topic['slug']] = topic counter += 1 except Exception: - print(str(counter) + '/' + str(len(data)) + ' topics were migrated') - print('try to remove database first') - open('migration/data/topics.dict.json','w').write( json.dumps(newdata, cls=DateTimeEncoder) ) - open('../src/data/topics.json','w').write( json.dumps(exportData, cls=DateTimeEncoder) ) - print(str(counter) + ' topics were migrated') + print('cats exception, try to remove database first') + try: + for tag in tag_data: + topic = migrateTag(tag) + newdata[topic['slug']] = topic + counter += 1 + except Exception: + print('tags exception, try to remove database first') + raise Exception + export_list = sorted(newdata.items(), key=lambda item: str( + item[1]['createdAt']))[-10:] + open('migration/data/topics.dict.json', + 'w').write(json.dumps(newdata, cls=DateTimeEncoder)) + open('../src/data/topics.json', 'w').write(json.dumps(dict(export_list), + cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False)) + print(str(counter) + ' from ' + str(len(cat_data)) + + ' tags and ' + str(len(tag_data)) + ' cats were migrated') + print(str(len(export_list)) + ' topics were exported') -def shouts(limit): + +def shouts(): + ''' migrating content items one by one ''' print('loading shouts...') counter = 0 - discoursAuthor = 0 - data = json.loads(open('migration/data/content_items.json').read()) + discours_author = 0 + content_data = json.loads(open('migration/data/content_items.json').read()) newdata = {} - print(str(len(data)) + ' entries loaded. now migrating...') + print(str(len(content_data)) + ' entries loaded. now migrating...') errored = [] - exportData = {} - for entry in data: + for entry in content_data: try: - oid = entry['_id'] - shout = migrateShout(entry) - newdata[oid] = shout - author = newdata[oid]['authors'][0]['slug'] - line = str(counter) + ': ' + newdata[oid]['slug'] + " @" + str(author) - if shout['layout'] == 'article': - counter += 1 - exportData[shout['slug']] = shout - print(line) - # counter += 1 + (shout, content) = migrateShout(entry) + newdata[shout['slug']] = shout + author = newdata[shout['slug']]['authors'][0]['slug'] + line = str(counter+1) + ': ' + shout['slug'] + " @" + str(author) + print(line) + counter += 1 if author == 'discours.io': - discoursAuthor += 1 - open('./shouts.id.log','a').write(line + '\n') - if counter > limit: - break + discours_author += 1 + open('./shouts.id.log', 'a').write(line + '\n') except Exception: print(entry['_id']) errored.append(entry) - raise Exception + raise Exception(" error") + try: + limit = int(sys.argv[2]) if len(sys.argv) > 2 else len(content_data) + except ValueError: + limit = len(content_data) + export_list = sorted(newdata.items( + ), key=lambda item: item[1]['createdAt'] if item[1]['layout'] == 'article' else OLD_DATE)[:limit] + export_clean = {} + for slug, a in dict(export_list).items(): + export_clean[slug] = extract_images(a) + open('../content/discours.io/'+slug+'.md', 'w').write(content) + open('migration/data/shouts.dict.json', + 'w').write(json.dumps(newdata, cls=DateTimeEncoder)) + open('../src/data/articles.json', 'w').write(json.dumps(dict(export_clean), + cls=DateTimeEncoder, + indent=4, + sort_keys=True, + ensure_ascii=False)) + print(str(counter) + '/' + str(len(content_data)) + + ' content items were migrated') + print(str(len(export_list)) + ' shouts were exported') + print(str(discours_author) + ' from them by @discours.io') - open('migration/data/shouts.dict.json','w').write( json.dumps(newdata, cls=DateTimeEncoder) ) - open('../src/data/articles.json','w').write( json.dumps(exportData, cls=DateTimeEncoder) ) - print(str(counter) + ' shouts were migrated') - print(str(discoursAuthor) + ' from them by @discours.io') - print(str(len(errored)) + ' shouts without authors') if __name__ == '__main__': import sys if len(sys.argv) > 1: if sys.argv[1] == "users": - users(668) + users() elif sys.argv[1] == "topics": topics() elif sys.argv[1] == "shouts": - Community.create(**{ - 'slug': 'discours.io', - 'name': 'Дискурс', - 'pic': 'https://discours.io/images/logo-min.svg', - 'createdBy': '0', - 'createdAt': ts + try: + Community.create(**{ + 'slug': 'discours.io', + 'name': 'Дискурс', + 'pic': 'https://discours.io/images/logo-min.svg', + 'createdBy': '0', + 'createdAt': OLD_DATE }) - shouts(3626) + except Exception: + pass + shouts() elif sys.argv[1] == "all": + users() topics() - users(668) - shouts(3626) + shouts() elif sys.argv[1] == "bson": - import migration.bson2json + from migration import bson2json bson2json.json_tables() else: - print('usage: python migrate.py ') \ No newline at end of file + print('usage: python migrate.py ') diff --git a/migration/tables/content_items.py b/migration/tables/content_items.py index d878b80d..7a1d16e4 100644 --- a/migration/tables/content_items.py +++ b/migration/tables/content_items.py @@ -16,8 +16,9 @@ users_dict['0'] = { 'id': 9999999, 'slug': 'discours.io', 'name': 'Дискурс', - 'userpic': 'https://discours.io/images/logo-mini.svg' - } + 'userpic': 'https://discours.io/images/logo-mini.svg', + 'createdAt': '2016-03-05 22:22:00.350000' +} ts = datetime.now() @@ -29,8 +30,9 @@ type2layout = { 'Image': 'image' } -def migrate(entry, limit=3626, start=0): - ''' + +def migrate(entry): + ''' type Shout { slug: String! author: Int! @@ -41,7 +43,7 @@ def migrate(entry, limit=3626, start=0): deletedBy: Int rating: Int ratigns: [Rating] - published: Bool! + published: Bool! publishedAt: DateTime # if there is no published field - it is not published replyTo: String # another shout tags: [String] # actual values @@ -53,17 +55,19 @@ def migrate(entry, limit=3626, start=0): views: Int } ''' + content = '' r = { - 'layout': type2layout[entry['type']], - 'title': entry['title'], - 'community': 0, - 'authors': [], - 'topics': [], - 'published': entry.get('published', False), - 'views': entry.get('views', 0), - 'rating': entry.get('rating', 0), - 'ratings': [] - } + 'layout': type2layout[entry['type']], + 'title': entry['title'], + 'community': 0, + 'authors': [], + 'topics': [], + 'published': entry.get('published', False), + 'views': entry.get('views', 0), + 'rating': entry.get('rating', 0), + 'ratings': [], + 'createdAt': '2016-03-05 22:22:00.350000' + } r['slug'] = entry.get('slug', '') body_orig = entry.get('body', '') if not r['slug'] and entry.get('friendlySlugs') is not None: @@ -88,7 +92,8 @@ def migrate(entry, limit=3626, start=0): if body_orig == '': print('EMPTY BODY!') else: - body_html = str(BeautifulSoup(body_orig, features="html.parser")) + body_html = str(BeautifulSoup( + body_orig, features="html.parser")) r['body'] = html2text(body_html).replace('****', '**') r['old_id'] = entry.get('_id') else: @@ -103,20 +108,20 @@ def migrate(entry, limit=3626, start=0): if videoUrl == '#': print(entry.get('media', 'NO MEDIA!')) # raise Exception - r['body'] = '' + html2text(m.get('body', '')) # FIXME + r['body'] = '' + html2text(m.get('body', '')) # FIXME elif entry.get('type') == 'Music': - r['body'] = '' # FIXME - + r['body'] = '' # FIXME if r.get('body') is None: body_orig = entry.get('body', '') body_html = str(BeautifulSoup(body_orig, features="html.parser")) r['body'] = html2text(body_html).replace('****', '**') r['old_id'] = entry.get('_id') - body = r.get('body') user = None try: - userdata = users_dict[entry['createdBy']] + userdata = users_dict.get(entry['createdBy'], users_dict['0']) slug = userdata['slug'] name = userdata['name'] userpic = userdata['userpic'] @@ -137,10 +142,11 @@ def migrate(entry, limit=3626, start=0): user = User.create(**authordata) except IntegrityError: with local_session() as session: - user = session.query(User).filter(User.email == authordata['email']).first() + user = session.query(User).filter( + User.email == authordata['email']).first() if user is None: - user = session.query(User).filter(User.slug == authordata['slug']).first() - + user = session.query(User).filter( + User.slug == authordata['slug']).first() slug = user['slug'] name = user['name'] userpic = user.userpic @@ -167,15 +173,15 @@ def migrate(entry, limit=3626, start=0): post = frontmatter.Post(body, **metadata) dumped = frontmatter.dumps(post) - if entry['published']: - #if r.get('old_id', None): + if entry['published']: + # if r.get('old_id', None): # ext = 'html' # content = str(body).replace('

', '').replace('

', '') - #else: + # else: ext = 'md' content = dumped - open('migration/content/' + metadata['layout'] + '/' + r['slug'] + '.' + ext, 'w').write(content) - + open('migration/content/' + + metadata['layout'] + '/' + r['slug'] + '.' + ext, 'w').write(content) try: shout_dict = r.copy() @@ -190,8 +196,8 @@ def migrate(entry, limit=3626, start=0): else: shout_dict['publishedAt'] = ts del shout_dict['published'] - del shout_dict['views'] # FIXME - del shout_dict['rating'] # FIXME + del shout_dict['views'] # FIXME + del shout_dict['rating'] # FIXME del shout_dict['ratings'] try: s = Shout.create(**shout_dict) @@ -203,4 +209,4 @@ def migrate(entry, limit=3626, start=0): print(r) # print(s) raise Exception - return r + return (r, content) diff --git a/migration/tables/tags.py b/migration/tables/tags.py index 6e6d80f0..620f7bd1 100644 --- a/migration/tables/tags.py +++ b/migration/tables/tags.py @@ -1,20 +1,36 @@ +import json + +from os.path import abspath +from datetime import datetime + +users_dict = json.loads(open(abspath('migration/data/users.dict.json')).read()) +users_dict['0'] = { + 'id': 9999999, + 'slug': 'discours.io', + 'name': 'Дискурс', + 'userpic': 'https://discours.io/images/logo-mini.svg', + 'createdAt': '2016-03-05 22:22:00.350000' + } + +ts = datetime.now() + def migrate(entry): - ``` + ''' type Topic { slug: String! # ID createdBy: Int! # User createdAt: DateTime! - value: String + title: String parents: [String] # NOTE: topic can have parent topics children: [String] # and children } - ``` - creator = get_new_user_id(entry['createdBy']) + ''' + creator = users_dict.get(entry['createdBy'], users_dict['0']) return { 'slug': entry['slug'], - 'createdBy': creator_id, # NOTE: uses an old user id + 'createdBy': creator['id'], # NOTE: uses an old user id 'createdAt': entry['createdAt'], - 'title': entry['value'].lower(), + 'title': entry['title'].lower(), 'parents': [], 'children': [] } \ No newline at end of file diff --git a/migration/tables/users.py b/migration/tables/users.py index 5443f783..22917a92 100644 --- a/migration/tables/users.py +++ b/migration/tables/users.py @@ -7,76 +7,82 @@ from migration.html2text import html2text counter = 0 def migrate(entry, limit=668): - ''' - - type User { - username: String! # email - createdAt: DateTime! - email: String - password: String - oauth: String # provider:token - name: String # to display - userpic: String - links: [String] - emailConfirmed: Boolean # should contain all emails too - id: Int! - muted: Boolean - rating: Int - roles: [Role] - updatedAt: DateTime - wasOnlineAt: DateTime - ratings: [Rating] - slug: String - bio: String - notifications: [Int] - } + ''' - ''' - res = {} - res['old_id'] = entry['_id'] - res['password'] = entry['services']['password'].get('bcrypt', '') - res['username'] = entry['emails'][0]['address'] - res['email'] = res['username'] - res['wasOnlineAt'] = parse(entry.get('loggedInAt', entry['createdAt'])) - res['emailConfirmed'] = entry['emails'][0]['verified'] - res['createdAt'] = parse(entry['createdAt']) - res['rating'] = entry['rating'] # number - res['roles'] = [] # entry['roles'] # roles by community - res['ratings'] = [] # entry['ratings'] - res['notifications'] = [] - res['links'] = [] - res['muted'] = False - res['bio'] = html2text(entry.get('bio', '')) - if entry['profile']: - res['slug'] = entry['profile'].get('path') - res['userpic'] = entry['profile'].get('image', {'thumborId': ''}).get('thumborId', '') # adding 'https://assets.discours.io/unsafe/1600x' in web ui - fn = entry['profile'].get('firstName', '') - ln = entry['profile'].get('lastName', '') - name = res['slug'] if res['slug'] else 'anonymous' - name = fn if fn else name - name = (name + ' ' + ln) if ln else name - name = entry['profile']['path'] if len(name) < 2 else name - res['name'] = name - fb = entry['profile'].get('facebook', False) - if fb: - res['links'].append(fb) - vk = entry['profile'].get('vkontakte', False) - if vk: - res['links'].append(vk) - tr = entry['profile'].get('twitter', False) - if tr: - res['links'].append(tr) - ws = entry['profile'].get('website', False) - if ws: - res['links'].append(ws) - if not res['slug']: - res['slug'] = res['links'][0].split('/')[-1] - if not res['slug']: - res['slug'] = res['email'].split('@')[0] - else: - old = res['old_id'] - del res['old_id'] - user = User.create(**res.copy()) - res['id'] = user.id - res['old_id'] = old - return res \ No newline at end of file + type User { + username: String! # email + createdAt: DateTime! + email: String + password: String + oauth: String # provider:token + name: String # to display + userpic: String + links: [String] + emailConfirmed: Boolean # should contain all emails too + id: Int! + muted: Boolean + rating: Int + roles: [Role] + updatedAt: DateTime + wasOnlineAt: DateTime + ratings: [Rating] + slug: String + bio: String + notifications: [Int] + } + + ''' + res = {} + res['old_id'] = entry['_id'] + res['password'] = entry['services']['password'].get('bcrypt', '') + res['username'] = entry['emails'][0]['address'] + res['email'] = res['username'] + res['wasOnlineAt'] = parse(entry.get('loggedInAt', entry['createdAt'])) + res['emailConfirmed'] = entry['emails'][0]['verified'] + res['createdAt'] = parse(entry['createdAt']) + res['rating'] = entry['rating'] # number + res['roles'] = [] # entry['roles'] # roles by community + res['ratings'] = [] # entry['ratings'] + res['notifications'] = [] + res['links'] = [] + res['muted'] = False + res['bio'] = html2text(entry.get('bio', '')) + if entry['profile']: + res['slug'] = entry['profile'].get('path') + try: + res['userpic'] = 'https://assets.discours.io/unsafe/100x/' + entry['profile']['thumborId'] + except KeyError: + try: + res['userpic'] = entry['profile']['image']['url'] + except KeyError: + res['userpic'] = '' + fn = entry['profile'].get('firstName', '') + ln = entry['profile'].get('lastName', '') + name = res['slug'] if res['slug'] else 'anonymous' + name = fn if fn else name + name = (name + ' ' + ln) if ln else name + name = entry['profile']['path'] if len(name) < 2 else name + res['name'] = name + fb = entry['profile'].get('facebook', False) + if fb: + res['links'].append(fb) + vk = entry['profile'].get('vkontakte', False) + if vk: + res['links'].append(vk) + tr = entry['profile'].get('twitter', False) + if tr: + res['links'].append(tr) + ws = entry['profile'].get('website', False) + if ws: + res['links'].append(ws) + if not res['slug']: + res['slug'] = res['links'][0].split('/')[-1] + if not res['slug']: + res['slug'] = res['email'].split('@')[0] + else: + old = res['old_id'] + del res['old_id'] + user = User.create(**res.copy()) + res['id'] = user.id + res['old_id'] = old + return res