''' command-line managed migration '''
import json
import base64
import re
import sys

import frontmatter
from dateutil.parser import parse as date_parse

from migration.tables.users import migrate as migrateUser
from migration.tables.content_items import get_metadata, migrate as migrateShout
from migration.tables.content_item_categories import migrate as migrateCategory
from migration.tables.tags import migrate as migrateTag
from migration.tables.comments import migrate as migrateComment
from migration.utils import DateTimeEncoder
from orm import Community
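# Matches inline base64 images in markdown bodies, e.g.
#   ![alt text](data:image/png;base64,iVBORw0KGgo...)
# group(1) = alt text, group(2) = the whole data URI,
# group(3) = file extension, group(4) = base64 payload.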
IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,(.*?))\)"
OLD_DATE = '2016-03-05 22:22:00.350000'  # fallback date for entries without createdAt
def extract_images(article):
''' extract b64 encoded images from markdown in article body '''
body = article['body']
images = []
matches = re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE)
for i, match in enumerate(matches, start=1):
ext = match.group(3)
link = '/static/upload/image-' + \
article['old_id'] + str(i) + '.' + ext
img = match.group(4)
if img not in images:
            with open('..' + link, 'wb') as f:
                f.write(base64.b64decode(img))
images.append(img)
body = body.replace(match.group(2), link)
print(link)
article['body'] = body
return article
def users():
''' migrating users first '''
print('migrating users...')
newdata = {}
data = json.loads(open('migration/data/users.json').read())
counter = 0
export_data = {}
for entry in data:
oid = entry['_id']
user = migrateUser(entry)
newdata[oid] = user
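        # strip private fields before the public export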
del user['password']
del user['notifications']
# del user['oauth']
del user['emailConfirmed']
del user['username']
del user['email']
export_data[user['slug']] = user
counter += 1
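    # keep only the top 10 users by rating for the authors export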
export_list = sorted(export_data.items(), key=lambda item: item[1]['rating'])[-10:]
    with open('migration/data/users.dict.json', 'w') as f:
        f.write(json.dumps(newdata, cls=DateTimeEncoder))  # NOTE: keyed by old_id
    with open('../src/data/authors.json', 'w') as f:
        f.write(json.dumps(dict(export_list),
                           cls=DateTimeEncoder,
                           indent=4,
                           sort_keys=True,
                           ensure_ascii=False))
print(str(len(newdata.items())) + ' user accounts were migrated')
print(str(len(export_list)) + ' authors were exported')
def topics():
''' topics from categories and tags '''
print('migrating topics...')
cats_data = json.loads(open('migration/data/content_item_categories.json').read())
cat_topics = {}
slug_topics = {}
counter = 0
try:
for cat in cats_data:
topic = migrateCategory(cat)
cat_topics[topic['cat_id']] = topic
slug_topics[topic['slug']] = topic
counter += 1
    except Exception as e:
        print('categories migration failed; try dropping the database first')
        raise e
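    # NOTE: the tags migration below is disabled; tag_data is never loaded yet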
'''
try:
for tag in tag_data:
topic = migrateTag(tag)
newdata[topic['slug']] = topic
counter += 1
except Exception:
print('tags exception, try to remove database first')
raise Exception
'''
export_list = sorted(slug_topics.items(), key=lambda item: str(
item[1]['createdAt']))
    with open('migration/data/topics.dict.json', 'w') as f:
        f.write(json.dumps(cat_topics,
                           cls=DateTimeEncoder,
                           indent=4,
                           sort_keys=True,
                           ensure_ascii=False))
    with open('../src/data/topics.json', 'w') as f:
        f.write(json.dumps(dict(export_list),
                           cls=DateTimeEncoder,
                           indent=4,
                           sort_keys=True,
                           ensure_ascii=False))
#' tags and ' + str(len(tag_data)) +
    print(str(counter) + ' / ' + str(len(cats_data)) + ' categories migrated')
print(str(len(export_list)) + ' topics were exported')
def shouts():
''' migrating content items one by one '''
print('loading shouts...')
counter = 0
discours_author = 0
content_data = json.loads(open('migration/data/content_items.json').read())
content_dict = { x['_id']:x for x in content_data }
newdata = {}
print(str(len(content_data)) + ' entries loaded. now migrating...')
errored = []
for entry in content_data:
try:
shout = migrateShout(entry)
newdata[shout['slug']] = shout
            author = shout['authors'][0]['slug']
line = str(counter+1) + ': ' + shout['slug'] + " @" + str(author)
print(line)
counter += 1
if author == 'discours':
discours_author += 1
            with open('./shouts.id.log', 'a') as f:
                f.write(line + '\n')
except Exception as e:
print(entry['_id'])
errored.append(entry)
raise e
    with open('migration/data/shouts.dict.json', 'w') as f:
        f.write(json.dumps(newdata, cls=DateTimeEncoder))
print(str(counter) + '/' + str(len(content_data)) +
' content items were migrated')
    print(str(discours_author) + ' of them are by @discours')
def comments():
''' migrating comments on content items one by one '''
    content_data = json.loads(open('migration/data/content_items.json').read())  # old content items
    content_dict = { x['_id']: x for x in content_data }  # by old id
shouts_dict = json.loads(open('migration/data/shouts.dict.json', 'r').read()) # all shouts by slug
print(str(len(shouts_dict.keys())) + ' migrated shouts loaded')
shouts_old = { x['old_id']: x for slug, x in shouts_dict.items() } # shouts by old_id
print(str(len(content_data)) + ' content items loaded')
    comments_data = json.loads(open('migration/data/comments.json').read())
print(str(len(comments_data)) + ' comments loaded')
comments_by_post = {}
    # group comments by their old content item id
    for old_comment in comments_data:
        cid = old_comment['contentItem']
        comments_by_post.setdefault(cid, []).append(old_comment)
# migrate comments
comments_by_shoutslug = {}
for content_item in content_data:
old_id = content_item['_id']
if content_item.get('commentedAt', False):
comments = [ migrateComment(c) for c in comments_by_post.get(old_id, []) ]
            if len(comments) > 0:
shout = shouts_old.get(old_id, { 'slug': 'abandoned-comments' })
comments_by_shoutslug[shout['slug']] = comments
export_articles = json.loads(open('../src/data/articles.json').read())
    print(str(len(export_articles.items())) + ' exported articles loaded')
export_comments = {}
c = 0
for slug, article in export_articles.items():
comments = comments_by_shoutslug.get(slug, [])
if len(comments) > 0:
export_comments[slug] = comments
c += len(comments)
    print(str(len(export_comments.items())) + ' exported articles have comments')
    with open('../src/data/comments.json', 'w') as f:
        f.write(json.dumps(export_comments,
                           cls=DateTimeEncoder,
                           indent=4,
                           sort_keys=True,
                           ensure_ascii=False))
print(str(c) + ' comments were exported')
def export_shouts(limit):
print('reading json...')
content_data = json.loads(open('migration/data/content_items.json').read())
content_dict = { x['_id']:x for x in content_data }
print(str(len(content_data)) + ' content items loaded')
newdata = json.loads(open('migration/data/shouts.dict.json', 'r').read())
print(str(len(newdata.keys())) + ' migrated shouts loaded')
users_old = json.loads(open('migration/data/users.dict.json').read())
    print(str(len(users_old.keys())) + ' migrated users loaded')
export_authors = json.loads(open('../src/data/authors.json').read())
print(str(len(export_authors.items())) + ' exported authors loaded')
users_slug = { u['slug']: u for old_id, u in users_old.items()}
print(str(len(users_slug.items())) + ' users loaded')
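    # export only published content with the article layout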
export_list = [i for i in newdata.items() if i[1]['layout'] == 'article' and i[1]['published']]
export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)
print(str(len(export_list)) + ' filtered')
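    # a limit of None (or 0) exports everything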
export_list = export_list[:limit or len(export_list)]
export_clean = {}
for (slug, article) in export_list:
if article['layout'] == 'article':
for author in article['authors']:
export_authors[author['slug']] = users_slug[author['slug']]
export_clean[article['slug']] = extract_images(article)
            metadata = get_metadata(article)
            content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
            with open('../content/discours.io/' + slug + '.md', 'w') as f:
                f.write(content)
            # print(slug)
            with open('../content/discours.io/' + slug + '.html', 'w') as f:
                f.write(content_dict[article['old_id']]['body'])
    with open('../src/data/articles.json', 'w') as f:
        f.write(json.dumps(dict(export_clean),
                           cls=DateTimeEncoder,
                           indent=4,
                           sort_keys=True,
                           ensure_ascii=False))
print(str(len(export_clean.items())) + ' articles exported')
    with open('../src/data/authors.json', 'w') as f:
        f.write(json.dumps(export_authors,
                           cls=DateTimeEncoder,
                           indent=4,
                           sort_keys=True,
                           ensure_ascii=False))
comments()
print(str(len(export_authors.items())) + ' total authors exported')
def export_slug(slug):
shouts_dict = json.loads(open('migration/data/shouts.dict.json').read())
print(str(len(shouts_dict.items())) + ' migrated shouts loaded')
users_old = json.loads(open('migration/data/users.dict.json').read()) # NOTE: this exact file is by old_id
print(str(len(users_old.items())) + ' migrated users loaded')
    users_dict = { u['slug']: u for old_id, u in users_old.items() }  # by slug
exported_authors = json.loads(open('../src/data/authors.json').read())
print(str(len(exported_authors.items())) + ' exported authors loaded')
exported_articles = json.loads(open('../src/data/articles.json').read())
print(str(len(exported_articles.items())) + ' exported articles loaded')
shout = shouts_dict.get(slug, False)
if shout:
author = users_dict.get(shout['authors'][0]['slug'], None)
exported_authors.update({shout['authors'][0]['slug']: author})
exported_articles.update({shout['slug']: shout})
print(shout)
        with open('../src/data/articles.json', 'w') as f:
            f.write(json.dumps(exported_articles,
                               cls=DateTimeEncoder,
                               indent=4,
                               sort_keys=True,
                               ensure_ascii=False))
        with open('../src/data/authors.json', 'w') as f:
            f.write(json.dumps(exported_authors,
                               cls=DateTimeEncoder,
                               indent=4,
                               sort_keys=True,
                               ensure_ascii=False))
    else:
        print('error: no migrated shout found for slug: ' + slug)
        # print(str(len(shouts_dict)) + ' shouts were migrated')
comments()
print('finished.')
if __name__ == '__main__':
if len(sys.argv) > 1:
if sys.argv[1] == "users":
users()
elif sys.argv[1] == "topics":
topics()
elif sys.argv[1] == "shouts":
try:
Community.create(**{
'slug': 'discours.io',
'name': 'Дискурс',
'pic': 'https://discours.io/images/logo-min.svg',
'createdBy': '0',
'createdAt': date_parse(OLD_DATE)
})
except Exception:
pass
shouts()
elif sys.argv[1] == "comments":
comments()
elif sys.argv[1] == "export_shouts":
limit = int(sys.argv[2]) if len(sys.argv) > 2 else None
export_shouts(limit)
elif sys.argv[1] == "all":
users()
topics()
shouts()
comments()
elif sys.argv[1] == "bson":
from migration import bson2json
bson2json.json_tables()
elif sys.argv[1] == 'slug':
export_slug(sys.argv[2])
    else:
        print('usage: python migrate.py <command>\n'
              '\tbson\n'
              '\tusers\n'
              '\ttopics\n'
              '\tshouts\n'
              '\tcomments\n'
              '\texport_shouts <limit>\n'
              '\tslug <slug>\n'
              '\tall')