''' cmd managed migration '''
import sys
import json
import base64
import re
import frontmatter
from migration.tables.users import migrate as migrateUser
from migration.tables.content_items import get_metadata, migrate as migrateShout
from migration.tables.content_item_categories import migrate as migrateCategory
from migration.tables.tags import migrate as migrateTag
from migration.tables.comments import migrate as migrateComment
from migration.utils import DateTimeEncoder
from orm import Community
from dateutil.parser import parse as date_parse
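# Matches inline base64-encoded images in markdown bodies:
# group(1) is the alt text, group(2) the whole data URI,
# group(3) the image extension, group(4) the base64 payload.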
IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,(.*?))\)"
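# Fallback timestamp: used below as a createdAt substitute when sorting
# exported articles, and as the createdAt of the default community.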
OLD_DATE = '2016-03-05 22:22:00.350000'
if __name__ == '__main__':
    users_data = json.loads(open('migration/data/users.json').read())
    # users_dict = { x['_id']: x for x in users_data } # by id
    print(str(len(users_data)) + ' users loaded')
    users_by_oid = {}
    users_by_slug = {}
    tags_data = json.loads(open('migration/data/tags.json').read())
    print(str(len(tags_data)) + ' tags loaded')
    cats_data = json.loads(open('migration/data/content_item_categories.json').read())
    print(str(len(cats_data)) + ' cats loaded')
    topics_by_cat = {}
    topics_by_tag = {}
    topics_by_slug = {}
    content_data = json.loads(open('migration/data/content_items.json').read())
    content_dict = { x['_id']: x for x in content_data }
    print(str(len(content_data)) + ' content items loaded')
    shouts_by_slug = {}
    shouts_by_oid = {}
    comments_data = json.loads(open('migration/data/comments.json').read())
    print(str(len(comments_data)) + ' comments loaded')
    comments_by_post = {}
    # group comments by the old id of the content item they belong to
    for old_comment in comments_data:
        cid = old_comment['contentItem']
        comments_by_post[cid] = comments_by_post.get(cid, [])
        # keep only comments that were never marked as deleted
        if not old_comment.get('deletedAt', False):
            comments_by_post[cid].append(old_comment)
    print(str(len(comments_by_post.keys())) + ' articles with comments')
    export_articles = {} # slug: shout
    export_authors = {} # slug: user
    export_comments = {} # shout-slug: comment[] (list)
    export_topics = {} # slug: topic
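    # The commands dispatched at the bottom of the file fill these dicts
    # step by step (users -> topics -> shouts -> comments); export_finish()
    # then writes them out as json for the frontend.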
    def extract_images(article):
        ''' extract b64 encoded images from markdown in article body '''
        body = article['body']
        images = []
        matches = re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE)
        for i, match in enumerate(matches, start=1):
            ext = match.group(3)
            link = '/static/upload/image-' + article['old_id'] + str(i) + '.' + ext
            img = match.group(4)
            if img not in images:
                open('..' + link, 'wb').write(base64.b64decode(img))
                images.append(img)
            body = body.replace(match.group(2), link)
            print(link)
        article['body'] = body
        return article
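    # e.g. '![pic](data:image/png;base64,iVBOR...)' in the body becomes
    # '![pic](/static/upload/image-<old_id>1.png)', and the decoded bytes
    # are written to ../static/upload/image-<old_id>1.png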
    def users(users_by_oid, users_by_slug, users_data):
        ''' migrating users first '''
        global export_authors
        # limiting
        limit = len(users_data)
        if len(sys.argv) > 2:
            limit = int(sys.argv[2])
        print('migrating %d users...' % limit)
        counter = 0
        for entry in users_data[:limit]:
            oid = entry['_id']
            user = migrateUser(entry)
            users_by_oid[oid] = user # full
            # strip private fields from the public copy
            del user['password']
            del user['notifications']
            # del user['oauth']
            del user['emailConfirmed']
            del user['username']
            del user['email']
            users_by_slug[user['slug']] = user # public
            counter += 1
        # export the top 10 users by rating
        export_authors = dict(sorted(users_by_slug.items(), key=lambda item: item[1]['rating'])[-10:])
        open('migration/data/users.old_id.json', 'w').write(json.dumps(users_by_oid, cls=DateTimeEncoder)) # NOTE: by old_id
        open('migration/data/users.slug.json', 'w').write(json.dumps(users_by_slug, cls=DateTimeEncoder)) # NOTE: by slug
        print(str(len(users_by_slug)) + ' users migrated')
    def topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data):
        ''' topics from categories and tags '''
        # limiting
        limit = len(cats_data) + len(tags_data)
        if len(sys.argv) > 2:
            limit = int(sys.argv[2])
        print('migrating %d topics...' % limit)
        counter = 0
        for cat in cats_data:
            topic = migrateCategory(cat)
            topics_by_cat[topic['cat_id']] = topic
            topics_by_slug[topic['slug']] = topic
            counter += 1
        for tag in tags_data:
            topic = migrateTag(tag)
            topics_by_tag[topic['tag_id']] = topic
            # categories take precedence over tags on slug collision
            if not topics_by_slug.get(topic['slug']):
                topics_by_slug[topic['slug']] = topic
            counter += 1
        # mutate the passed dict so the caller sees the result
        export_topics.update(dict(sorted(topics_by_slug.items(), key=lambda item: str(item[1]['createdAt']))))
        open('migration/data/topics.slug.json', 'w').write(json.dumps(topics_by_slug,
            cls=DateTimeEncoder,
            indent=4,
            sort_keys=True,
            ensure_ascii=False))
        open('migration/data/topics.cat_id.json', 'w').write(json.dumps(topics_by_cat,
            cls=DateTimeEncoder,
            indent=4,
            sort_keys=True,
            ensure_ascii=False))
    def shouts(content_data, shouts_by_slug, shouts_by_oid):
        ''' migrating content items one by one '''
        # limiting
        try:
            limit = int(sys.argv[2]) if len(sys.argv) > 2 else len(content_data)
        except ValueError:
            limit = len(content_data)
        print('migrating %d content items...' % limit)
        counter = 0
        discours_author = 0
        errored = []
        for entry in content_data[:limit]:
            try:
                shout = migrateShout(entry, users_by_oid, topics_by_cat)
                author = shout['authors'][0]
                shout['authors'] = [ author.id, ]
                shouts_by_slug[shout['slug']] = shout
                shouts_by_oid[entry['_id']] = shout
                line = str(counter + 1) + ': ' + shout['slug'] + " @" + str(author.slug)
                counter += 1
                if author.slug == 'discours':
                    discours_author += 1
                print(line)
                # open('./shouts.id.log', 'a').write(line + '\n')
            except Exception:
                print(entry['_id'])
                errored.append(entry)
                raise
        open('migration/data/shouts.old_id.json', 'w').write(json.dumps(shouts_by_oid, cls=DateTimeEncoder))
        open('migration/data/shouts.slug.json', 'w').write(json.dumps(shouts_by_slug, cls=DateTimeEncoder))
        print(str(counter) + '/' + str(len(content_data)) + ' content items were migrated')
        print(str(discours_author) + ' authored by @discours')
    def export_shouts(shouts_by_slug, export_articles, export_authors, content_dict):
        # update what was just migrated or load the previous export
        if len(export_authors.keys()) == 0:
            export_authors = json.loads(open('../src/data/authors.json').read())
            print(str(len(export_authors.items())) + ' exported authors loaded')
        if len(export_articles.keys()) == 0:
            export_articles = json.loads(open('../src/data/articles.json').read())
            print(str(len(export_articles.items())) + ' exported articles loaded')
        # limiting
        limit = 33
        if len(sys.argv) > 2:
            limit = int(sys.argv[2])
        print('exporting %d articles to json...' % limit)
        # filter for articles and sort by creation date, newest first
        export_list = [i for i in shouts_by_slug.items() if i[1]['layout'] == 'article']
        export_list = sorted(export_list, key=lambda item: str(item[1]['createdAt'] or OLD_DATE), reverse=True)
        print(str(len(export_list)) + ' filtered')
        export_list = export_list[:limit]
        for (slug, article) in export_list:
            if article['layout'] == 'article':
                export_slug(slug, export_articles, export_authors, content_dict)
    def export_body(article, content_dict):
        article = extract_images(article)
        metadata = get_metadata(article)
        content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
        open('../content/discours.io/' + article['slug'] + '.md', 'w').write(content)
        open('../content/discours.io/' + article['slug'] + '.html', 'w').write(content_dict[article['old_id']]['body'])
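    # frontmatter.dumps() renders the markdown file with a YAML header, e.g.:
    # ---
    # title: ...      (the exact keys come from get_metadata())
    # ---
    # <markdown body, with image links rewritten by extract_images()>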
    def export_slug(slug, export_articles, export_authors, content_dict):
        print('exporting %s ' % slug)
        if export_authors == {}:
            export_authors = json.loads(open('../src/data/authors.json').read())
            print(str(len(export_authors.items())) + ' exported authors loaded')
        if export_articles == {}:
            export_articles = json.loads(open('../src/data/articles.json').read())
            print(str(len(export_articles.items())) + ' exported articles loaded')
        shout = shouts_by_slug.get(slug, False)
        assert shout, 'no data error'
        author = users_by_slug.get(shout['authors'][0]['slug'], None)
        export_authors.update({shout['authors'][0]['slug']: author})
        export_articles.update({shout['slug']: shout})
        export_body(shout, content_dict)
        comments([slug, ], export_comments, export_articles, shouts_by_slug, content_dict)
    def comments(sluglist, export_comments, export_articles, shouts_by_slug, content_dict):
        ''' migrating comments on content items one by one '''
        if len(sluglist) == 0:
            export_articles = json.loads(open('../src/data/articles.json').read())
            print(str(len(export_articles.items())) + ' articles were exported before')
            sluglist = list(export_articles.keys())
        if len(sluglist) > 0:
            print('exporting comments for: ')
            print(' '.join(sluglist))
            for slug in sluglist:
                shout = shouts_by_slug[slug]
                old_id = shout['old_id']
                content_item = content_dict.get(old_id, {})
                if content_item.get('commentedAt', False):
                    comments_list = [ migrateComment(c) for c in comments_by_post.get(old_id, []) ]
                    if len(comments_list) > 0:
                        export_comments[slug] = comments_list
                        sys.stdout.write('.')
        else:
            print('exporting comments for top 10 commented articles...')
            comments_by_shoutslug = {}
            for content_item in content_data:
                old_id = content_item['_id']
                if content_item.get('commentedAt', False):
                    comments_list = [ migrateComment(c) for c in comments_by_post.get(old_id, []) ]
                    if len(comments_list) > 0:
                        shout = shouts_by_oid.get(old_id, { 'slug': 'abandoned-comments' })
                        comments_by_shoutslug[shout['slug']] = comments_list
            top = dict(sorted(comments_by_shoutslug.items(), reverse=True, key=lambda c: len(c[1]))[:10])
            export_comments.update(top)
        print(str(len(export_comments.keys())) + ' articles with comments exported\n')
    def export_finish(export_articles = {}, export_authors = {}, export_topics = {}, export_comments = {}):
        open('../src/data/authors.json', 'w').write(json.dumps(export_authors,
            cls=DateTimeEncoder,
            indent=4,
            sort_keys=True,
            ensure_ascii=False))
        print(str(len(export_authors.items())) + ' authors exported')
        open('../src/data/topics.json', 'w').write(json.dumps(export_topics,
            cls=DateTimeEncoder,
            indent=4,
            sort_keys=True,
            ensure_ascii=False))
        print(str(len(export_topics.keys())) + ' topics exported')
        open('../src/data/articles.json', 'w').write(json.dumps(export_articles,
            cls=DateTimeEncoder,
            indent=4,
            sort_keys=True,
            ensure_ascii=False))
        print(str(len(export_articles.items())) + ' articles exported')
        open('../src/data/comments.json', 'w').write(json.dumps(export_comments,
            cls=DateTimeEncoder,
            indent=4,
            sort_keys=True,
            ensure_ascii=False))
        print(str(len(export_comments.items())) + ' exported articles with comments')
    if len(sys.argv) > 1:
        cmd = sys.argv[1]
        if cmd == "users":
            users(users_by_oid, users_by_slug, users_data)
        elif cmd == "topics":
            topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data)
        elif cmd == "shouts":
            # the default community must exist before content items are migrated
            try:
                Community.create(**{
                    'slug': 'discours.io',
                    'name': 'Дискурс',
                    'pic': 'https://discours.io/images/logo-min.svg',
                    'createdBy': '0',
                    'createdAt': date_parse(OLD_DATE)
                })
            except Exception:
                pass
            shouts(content_data, shouts_by_slug, shouts_by_oid) # NOTE: listens limit
        elif cmd == "export_shouts":
            export_shouts(shouts_by_slug, export_articles, export_authors, content_dict)
        elif cmd == "all":
            users(users_by_oid, users_by_slug, users_data)
            topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data)
            shouts(content_data, shouts_by_slug, shouts_by_oid)
            cl = int(sys.argv[2]) if len(sys.argv) > 2 else 10
            # take the cl most commented articles
            topOids = [ c[0] for c in sorted(comments_by_post.items(), reverse=True, key=lambda c: len(c[1]))[:cl] ]
            topSlugs = [ shouts_by_oid[oid]['slug'] for oid in topOids ]
            comments(topSlugs, export_comments, export_articles, shouts_by_slug, content_dict)
        elif cmd == "bson":
            from migration import bson2json
            bson2json.json_tables()
        elif cmd == 'slug':
            export_slug(sys.argv[2], export_articles, export_authors, content_dict)
        export_finish(export_articles, export_authors, export_topics, export_comments)
    else:
        print('''usage: python migrate.py <command>

commands:
    bson
    users <limit>
    topics <limit>
    shouts <limit>
    export_shouts <limit>
    slug <slug>
    all
''')
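    # typical run (a sketch, assuming the bson dumps are in place):
    #   python migrate.py bson        # convert dumps to migration/data/*.json
    #   python migrate.py all 100     # migrate everything, top 100 commented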