migration fix, new html2text, export wip

2021-10-15 13:00:26 +03:00
parent 7ec763391b
commit 14fdfe71e5
21 changed files with 3358 additions and 564 deletions


@@ -17,263 +17,275 @@ IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,(.*?))\)"
OLD_DATE = '2016-03-05 22:22:00.350000'
def extract_images(article):
''' extract b64 encoded images from markdown in article body '''
body = article['body']
images = []
matches = re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE)
for i, match in enumerate(matches, start=1):
ext = match.group(3)
link = '/static/upload/image-' + \
article['old_id'] + str(i) + '.' + ext
img = match.group(4)
if img not in images:
open('..' + link, 'wb').write(base64.b64decode(img))
images.append(img)
body = body.replace(match.group(2), link)
print(link)
article['body'] = body
return article
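For orientation, the capture groups of IMG_REGEX are: group 1 the alt text, group 2 the whole data URI (the part that gets swapped for the upload link), group 3 the extension, group 4 the base64 payload. A minimal self-contained check with made-up sample data:

import re

IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,(.*?))\)"
sample = "![logo](data:image/png;base64,iVBORw0KGgo=)"
m = re.search(IMG_REGEX, sample)
assert m.group(1) == 'logo'          # alt text
assert m.group(3) == 'png'           # file extension for the upload link
assert m.group(4) == 'iVBORw0KGgo='  # base64 payload to decode and write out
# group(2) is the full data URI that body.replace() swaps for the link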
if __name__ == '__main__':
import sys
users_data = json.loads(open('migration/data/users.json').read())
users_dict = { x['_id']: x for x in users_data } # by id
print(str(len(users_data)) + ' users loaded')
users_by_oid = {}
users_by_slug = {}
def users():
''' migrating users first '''
print('migrating users...')
newdata = {}
data = json.loads(open('migration/data/users.json').read())
counter = 0
export_data = {}
for entry in data:
oid = entry['_id']
user = migrateUser(entry)
newdata[oid] = user
del user['password']
del user['notifications']
# del user['oauth']
del user['emailConfirmed']
del user['username']
del user['email']
export_data[user['slug']] = user
counter += 1
export_list = sorted(export_data.items(), key=lambda item: item[1]['rating'])[-10:]
open('migration/data/users.dict.json', 'w').write(json.dumps(newdata, cls=DateTimeEncoder)) # NOTE: by old_id
open('../src/data/authors.json', 'w').write(json.dumps(dict(export_list),
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(len(newdata.items())) + ' user accounts were migrated')
print(str(len(export_list)) + ' authors were exported')
tags_data = json.loads(open('migration/data/tags.json').read())
print(str(len(tags_data)) + ' tags loaded')
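Every json.dumps call below passes cls=DateTimeEncoder, which comes from the migration package and is not shown in this hunk; presumably it is a thin json.JSONEncoder subclass along these lines (a sketch, not the project's actual code):

import json
from datetime import datetime

class DateTimeEncoder(json.JSONEncoder):
    ''' render datetime values as ISO 8601 strings instead of raising TypeError '''
    def default(self, obj):
        if isinstance(obj, datetime):
            return obj.isoformat()
        return super().default(obj)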
def topics():
''' topics from categories and tags '''
print('migrating topics...')
cats_data = json.loads(open('migration/data/content_item_categories.json').read())
cat_topics = {}
slug_topics = {}
counter = 0
try:
for cat in cats_data:
topic = migrateCategory(cat)
cat_topics[topic['cat_id']] = topic
slug_topics[topic['slug']] = topic
counter += 1
except Exception as e:
print('cats exception, try to remove database first')
raise e
'''
try:
for tag in tag_data:
topic = migrateTag(tag)
newdata[topic['slug']] = topic
counter += 1
except Exception:
print('tags exception, try to remove database first')
raise Exception
'''
export_list = sorted(slug_topics.items(), key=lambda item: str(
item[1]['createdAt']))
open('migration/data/topics.dict.json','w').write(json.dumps(cat_topics,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
open('../src/data/topics.json', 'w').write(json.dumps(dict(export_list),
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
#' tags and ' + str(len(tag_data)) +
print(str(counter) + ' / ' + str(len(cats_data)) + ' migrated')
print(str(len(export_list)) + ' topics were exported')
print(str(len(cats_data)) + ' cats loaded')
topics_by_cat = {}
topics_by_tag = {}
topics_by_slug = {}
def shouts():
''' migrating content items one by one '''
print('loading shouts...')
counter = 0
discours_author = 0
content_data = json.loads(open('migration/data/content_items.json').read())
content_dict = { x['_id']:x for x in content_data }
newdata = {}
print(str(len(content_data)) + ' entries loaded. now migrating...')
errored = []
for entry in content_data:
try:
shout = migrateShout(entry)
newdata[shout['slug']] = shout
author = newdata[shout['slug']]['authors'][0]['slug']
line = str(counter+1) + ': ' + shout['slug'] + " @" + str(author)
print(line)
counter += 1
if author == 'discours':
discours_author += 1
open('./shouts.id.log', 'a').write(line + '\n')
except Exception as e:
print(entry['_id'])
errored.append(entry)
raise e
try:
limit = int(sys.argv[2]) if len(sys.argv) > 2 else len(content_data)
except ValueError:
limit = len(content_data)
open('migration/data/shouts.dict.json',
'w').write(json.dumps(newdata, cls=DateTimeEncoder))
print(str(counter) + '/' + str(len(content_data)) +
' content items were migrated')
print(str(discours_author) + ' of them by @discours')
def comments():
''' migrating comments on content items one by one '''
content_data = json.loads(open('migration/data/content_items.json').read()) # old content
content_dict = { x['_id']: x for x in content_data } # by old_id
shouts_dict = json.loads(open('migration/data/shouts.dict.json', 'r').read()) # all shouts by slug
print(str(len(shouts_dict.keys())) + ' migrated shouts loaded')
shouts_old = { x['old_id']: x for slug, x in shouts_dict.items() } # shouts by old_id
content_dict = { x['_id']: x for x in content_data }
print(str(len(content_data)) + ' content items loaded')
comments_data = json.loads(open('migration/data/comments.json').read()) # by slug
shouts_by_slug = {}
shouts_by_oid = {}
comments_data = json.loads(open('migration/data/comments.json').read())
print(str(len(comments_data)) + ' comments loaded')
comments_by_post = {}
# sort comments by old posts ids
for old_comment in comments_data:
cid = old_comment['contentItem']
comments_by_post[cid] = comments_by_post.get(cid, [])
comments_by_post[cid].append(old_comment)
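The loop above builds an old-post-id to comment-list index with the dict.get idiom; collections.defaultdict expresses the same grouping without the explicit get, shown here on illustrative comment dicts:

from collections import defaultdict

comments_data = [  # illustrative records; the real ones come from comments.json
    {'contentItem': 'oid-1', 'body': 'first'},
    {'contentItem': 'oid-1', 'body': 'second'},
    {'contentItem': 'oid-2', 'body': 'third'},
]
comments_by_post = defaultdict(list)
for old_comment in comments_data:
    comments_by_post[old_comment['contentItem']].append(old_comment)
assert len(comments_by_post['oid-1']) == 2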
# migrate comments
comments_by_shoutslug = {}
for content_item in content_data:
old_id = content_item['_id']
if content_item.get('commentedAt', False):
comments = [ migrateComment(c) for c in comments_by_post.get(old_id, []) ]
if len(comments) > 0:
shout = shouts_old.get(old_id, { 'slug': 'abandoned-comments' })
comments_by_shoutslug[shout['slug']] = comments
export_articles = json.loads(open('../src/data/articles.json').read())
print(str(len(export_articles.items())) + ' articles were exported')
export_comments = {}
c = 0
for slug, article in export_articles.items():
comments = comments_by_shoutslug.get(slug, [])
if len(comments) > 0:
export_comments[slug] = comments
c += len(comments)
print(str(len(export_comments.items())) + ' articles have comments')
open('../src/data/comments.json', 'w').write(json.dumps(dict(export_comments),
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(c) + ' comments were exported')
print(str(len(comments_by_post.keys())) + ' articles with comments')
export_articles = {} # slug: shout
export_authors = {} # slug: user
export_comments = {} # shout-slug: comment[] (list)
export_topics = {} # slug: topic
def extract_images(article):
''' extract b64 encoded images from markdown in article body '''
body = article['body']
images = []
matches = re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE)
for i, match in enumerate(matches, start=1):
ext = match.group(3)
link = '/static/upload/image-' + \
article['old_id'] + str(i) + '.' + ext
img = match.group(4)
if img not in images:
open('..' + link, 'wb').write(base64.b64decode(img))
images.append(img)
body = body.replace(match.group(2), link)
print(link)
article['body'] = body
return article
def users():
''' migrating users first '''
# limiting
limit = len(users_data)
if len(sys.argv) > 2: limit = int(sys.argv[2])
print('migrating %d users...' % limit)
counter = 0
for entry in users_data[:limit]: # apply the limit announced above
oid = entry['_id']
user = migrateUser(entry)
users_by_oid[oid] = user # full
del user['password']
del user['notifications']
# del user['oauth']
del user['emailConfirmed']
del user['username']
del user['email']
users_by_slug[user['slug']] = user # public
counter += 1
export_authors = dict(sorted(users_by_slug.items(), key=lambda item: item[1]['rating'])[-10:])
open('migration/data/users.old_id.json', 'w').write(json.dumps(users_by_oid, cls=DateTimeEncoder)) # NOTE: by old_id
open('migration/data/users.slug.json', 'w').write(json.dumps(users_by_slug, cls=DateTimeEncoder)) # NOTE: by slug
print(str(len(users_by_slug.items())) + ' users migrated')
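The export_authors line keeps only the ten highest-rated users: sorted() is ascending, so slicing off the last ten items yields the top ratings. A toy run of the idiom:

users_by_slug = {  # made-up ratings, just to show the slice
    'anna': {'rating': 5}, 'boris': {'rating': 1}, 'vera': {'rating': 9},
}
top = dict(sorted(users_by_slug.items(), key=lambda item: item[1]['rating'])[-10:])
# with fewer than ten users the slice returns them all, lowest-rated first
assert list(top) == ['boris', 'anna', 'vera']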
def topics():
''' topics from categories and tags '''
# limiting
limit = len(cats_data) + len(tags_data)
if len(sys.argv) > 2: limit = int(sys.argv[2])
print('migrating %d topics...' % limit)
counter = 0
for cat in cats_data:
try: topic = migrateCategory(cat)
except Exception as e: raise e
topics_by_cat[topic['cat_id']] = topic
topics_by_slug[topic['slug']] = topic
counter += 1
for tag in tags_data:
topic = migrateTag(tag)
topics_by_tag[topic['tag_id']] = topic
if not topics_by_slug.get(topic['slug']): topics_by_slug[topic['slug']] = topic
counter += 1
export_topics = dict(sorted(topics_by_slug.items(), key=lambda item: str(item[1]['createdAt']))) # NOTE: sorting does not work :)
open('migration/data/topics.slug.json','w').write(json.dumps(topics_by_slug,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
open('migration/data/topics.cat_id.json','w').write(json.dumps(topics_by_cat,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
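About the "sorting does not work" NOTE on export_topics: dict preserves insertion order as a language guarantee since Python 3.7, so dict(sorted(...)) does keep the sorted order; the more likely culprit is str(createdAt), which falls back to string comparison and misbehaves when createdAt is missing or mixed-type. A quick check, assuming datetime values:

from datetime import datetime

topics = {
    'late':  {'createdAt': datetime(2020, 1, 1)},
    'early': {'createdAt': datetime(2016, 3, 5)},
}
ordered = dict(sorted(topics.items(), key=lambda item: item[1]['createdAt']))
assert list(ordered) == ['early', 'late']  # insertion order survives in 3.7+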
def shouts():
''' migrating content items one by one '''
# limiting
limit = len(content_data)
if len(sys.argv) > 2: limit = int(sys.argv[2])
print('migrating %d content items...' % limit)
counter = 0
discours_author = 0
errored = []
# limiting
try: limit = int(sys.argv[2]) if len(sys.argv) > 2 else len(content_data)
except ValueError: limit = len(content_data)
for entry in content_data[:limit]:
try:
shout = migrateShout(entry, users_by_oid, topics_by_cat)
author = shout['authors'][0]
shout['authors'] = [ author.id, ]
shouts_by_slug[shout['slug']] = shout
shouts_by_oid[entry['_id']] = shout
line = str(counter+1) + ': ' + shout['slug'] + " @" + str(author.slug)
counter += 1
if author.slug == 'discours': discours_author += 1
print(line)
# open('./shouts.id.log', 'a').write(line + '\n')
except Exception as e:
print(entry['_id'])
errored.append(entry)
raise e
open('migration/data/shouts.old_id.json','w').write(json.dumps(shouts_by_oid, cls=DateTimeEncoder))
open('migration/data/shouts.slug.json','w').write(json.dumps(shouts_by_slug, cls=DateTimeEncoder))
print(str(counter) + '/' + str(len(content_data)) + ' content items were migrated')
print(str(discours_author) + ' authored by @discours')
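The argv-limit parsing appears twice inside shouts(); a small helper would keep it in one place (parse_limit is hypothetical, not part of this commit):

import sys

def parse_limit(default):
    ''' optional numeric limit from argv[2]; fall back to default on bad input '''
    try:
        return int(sys.argv[2]) if len(sys.argv) > 2 else default
    except (ValueError, IndexError):
        return default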
def export_shouts(shouts_by_slug, export_articles, export_authors):
# update what was just migrated or load json again
if len(export_authors.keys()) == 0:
export_authors = json.loads(open('../src/data/authors.json').read())
print(str(len(export_authors.items())) + ' exported authors loaded')
if len(export_articles.keys()) == 0:
export_articles = json.loads(open('../src/data/articles.json').read())
print(str(len(export_articles.items())) + ' exported articles loaded')
# limiting
limit = 33
if len(sys.argv) > 2: limit = int(sys.argv[2])
print('exporting %d articles to json...' % limit)
# filter
export_list = [i for i in shouts_by_slug.items() if i[1]['layout'] == 'article']
export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)
print(str(len(export_list)) + ' filtered')
export_list = export_list[:limit or len(export_list)]
for (slug, article) in export_list:
if article['layout'] == 'article':
export_slug(slug, export_articles, export_authors)
def export_body(article):
article = extract_images(article)
metadata = get_metadata(article)
content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
slug = article['slug'] # slug is not a parameter of export_body; read it from the article
open('../content/discours.io/'+slug+'.md', 'w').write(content)
open('../content/discours.io/'+slug+'.html', 'w').write(content_dict[article['old_id']]['body'])
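export_body relies on the python-frontmatter package: Post bundles body text with metadata, and dumps renders it as YAML front matter above the body. A standalone example with illustrative values:

import frontmatter

post = frontmatter.Post('Body text here', title='Sample', slug='sample-slug')
print(frontmatter.dumps(post))
# ---
# slug: sample-slug
# title: Sample
# ---
# Body text here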
def export_shouts(limit):
print('reading json...')
content_data = json.loads(open('migration/data/content_items.json').read())
content_dict = { x['_id']:x for x in content_data }
print(str(len(content_data)) + ' content items loaded')
newdata = json.loads(open('migration/data/shouts.dict.json', 'r').read())
print(str(len(newdata.keys())) + ' migrated shouts loaded')
users_old = json.loads(open('migration/data/users.dict.json').read())
print(str(len(users_old.keys())) + ' migrated users loaded')
export_authors = json.loads(open('../src/data/authors.json').read())
print(str(len(export_authors.items())) + ' exported authors loaded')
users_slug = { u['slug']: u for old_id, u in users_old.items()}
print(str(len(users_slug.items())) + ' users loaded')
export_list = [i for i in newdata.items() if i[1]['layout'] == 'article' and i[1]['published']]
export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)
print(str(len(export_list)) + ' filtered')
export_list = export_list[:limit or len(export_list)]
export_clean = {}
for (slug, article) in export_list:
if article['layout'] == 'article':
for author in article['authors']:
export_authors[author['slug']] = users_slug[author['slug']]
export_clean[article['slug']] = extract_images(article)
metadata = get_metadata(article)
content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
open('../content/discours.io/'+slug+'.md', 'w').write(content)
# print(slug)
open('../content/discours.io/'+slug+'.html', 'w').write(content_dict[article['old_id']]['body'])
open('../src/data/articles.json', 'w').write(json.dumps(dict(export_clean),
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(len(export_clean.items())) + ' articles exported')
open('../src/data/authors.json', 'w').write(json.dumps(export_authors,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
comments()
print(str(len(export_authors.items())) + ' total authors exported')
def export_slug(slug):
shouts_dict = json.loads(open('migration/data/shouts.dict.json').read())
print(str(len(shouts_dict.items())) + ' migrated shouts loaded')
users_old = json.loads(open('migration/data/users.dict.json').read()) # NOTE: this exact file is by old_id
print(str(len(users_old.items())) + ' migrated users loaded')
users_dict = { u['slug']: u for old_id, u in users_old.items() } # map slug to the user, not to the (old_id, user) pair
exported_authors = json.loads(open('../src/data/authors.json').read())
print(str(len(exported_authors.items())) + ' exported authors loaded')
exported_articles = json.loads(open('../src/data/articles.json').read())
print(str(len(exported_articles.items())) + ' exported articles loaded')
shout = shouts_dict.get(slug, False)
if shout:
author = users_dict.get(shout['authors'][0]['slug'], None)
def export_slug(slug, export_articles, export_authors):
if export_authors == {}:
export_authors = json.loads(open('../src/data/authors.json').read())
print(str(len(export_authors.items())) + ' exported authors loaded')
if export_articles == {}:
export_articles = json.loads(open('../src/data/articles.json').read())
print(str(len(export_articles.items())) + ' exported articles loaded')
shout = shouts_by_slug.get(slug, False)
assert shout, 'no data error'
author = users_by_slug.get(shout['authors'][0]['slug'], None)
export_authors.update({shout['authors'][0]['slug']: author})
export_articles.update({shout['slug']: shout})
print(shout)
open('../src/data/articles.json', 'w').write(json.dumps(exported_articles,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
open('../src/data/authors.json', 'w').write(json.dumps(exported_authors,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
else:
print('error: no shout found for this slug!')
# print(str(len(shouts_dict)) + ' shouts were migrated')
print(slug)
comments()
print('finished.')
export_body(shout)
comments([slug, ])
def comments(sluglist = []):
''' migrating comments on content items one by one '''
if len(sluglist) == 0:
export_articles = json.loads(open('../src/data/articles.json').read())
print(str(len(export_articles.items())) + ' articles were exported before')
if len(sluglist) == 0: sluglist = list(export_articles.keys())
if len(sluglist) > 0:
print('exporting comments for exact articles...')
for slug in sluglist:
shout = shouts_by_slug[slug]
old_id = shout['old_id']
content_item = content_dict.get(old_id, {})
if content_item.get('commentedAt', False):
comments = [ migrateComment(c) for c in comments_by_post.get(old_id, []) ]
if len(comments) > 0:
export_comments[slug] = comments
sys.stdout.write('.')
else:
print('exporting comments for top 10 commented articles...')
comments_by_shoutslug = {}
for content_item in content_data:
old_id = content_item['_id']
if content_item.get('commentedAt', False):
comments = [ migrateComment(c) for c in comments_by_post.get(old_id, []) ]
if len(comments) > 0:
shout = shouts_by_oid.get(old_id, { 'slug': 'abandoned-comments' })
comments_by_shoutslug[shout['slug']] = comments
top = dict(sorted(comments_by_shoutslug.items(), reverse=True, key=lambda c: len(c[1]))[:10])
export_comments.update(top)
print(str(len(export_comments.keys())) + ' articles with comments exported\n')
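The fallback branch ranks shout slugs by comment count and keeps the ten busiest; the same sorted/slice idiom on toy data:

comments_by_shoutslug = {  # illustrative; the real lists come from migrateComment
    'quiet-post': [{'body': 'one'}],
    'busy-post':  [{'body': 'a'}, {'body': 'b'}, {'body': 'c'}],
}
top = dict(sorted(comments_by_shoutslug.items(), reverse=True, key=lambda c: len(c[1]))[:10])
assert list(top) == ['busy-post', 'quiet-post']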
def export_finish(export_articles = {}, export_authors = {}, export_topics = {}, export_comments = {}):
open('../src/data/authors.json', 'w').write(json.dumps(export_authors,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(len(export_authors.items())) + ' authors exported')
open('../src/data/topics.json', 'w').write(json.dumps(export_topics,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(len(export_topics.keys())) + ' topics exported')
open('../src/data/articles.json', 'w').write(json.dumps(export_articles,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(len(export_articles.items())) + ' articles exported')
open('../src/data/comments.json', 'w').write(json.dumps(export_comments,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(len(export_comments.items())) + ' exported articles with comments')
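All of these writes pass ensure_ascii=False so that Cyrillic titles and bodies land in the JSON files as readable UTF-8 rather than \uXXXX escapes:

import json

print(json.dumps({'title': 'Дискурс'}))
# {"title": "\u0414\u0438\u0441\u043a\u0443\u0440\u0441"}
print(json.dumps({'title': 'Дискурс'}, ensure_ascii=False))
# {"title": "Дискурс"}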
if __name__ == '__main__':
import sys
if len(sys.argv) > 1:
- if sys.argv[1] == "users":
- users()
- elif sys.argv[1] == "topics":
- topics()
- elif sys.argv[1] == "shouts":
+ cmd = sys.argv[1]
+ if cmd == "users":
+ users(users_by_oid, users_by_slug, users_data, users_dict)
+ elif cmd == "topics":
+ topics(topics_by_cat, topics_by_tag, topics_by_slug)
+ elif cmd == "shouts":
try:
Community.create(**{
'slug': 'discours.io',
@@ -284,21 +296,30 @@ if __name__ == '__main__':
})
except Exception:
pass
- shouts()
- elif sys.argv[1] == "comments":
+ shouts(shouts_by_slug, shouts_by_oid) # NOTE: listens limit
+ elif cmd == "comments":
comments()
- elif sys.argv[1] == "export_shouts":
- limit = int(sys.argv[2]) if len(sys.argv) > 2 else None
- export_shouts(limit)
- elif sys.argv[1] == "all":
+ elif cmd == "export_shouts":
+ export_shouts(shouts_by_slug, export_articles, export_authors)
+ elif cmd == "all":
users()
topics()
shouts()
comments()
- elif sys.argv[1] == "bson":
+ elif cmd == "bson":
from migration import bson2json
bson2json.json_tables()
- elif sys.argv[1] == 'slug':
- export_slug(sys.argv[2])
+ elif cmd == 'slug':
+ export_slug(sys.argv[2], export_articles, export_authors)
+ export_finish(export_articles, export_authors, export_topics, export_comments)
else:
- print('usage: python migrate.py bson\n.. \ttopics <limit>\n.. \tusers <limit>\n.. \tshouts <limit>\n.. \tcomments\n.. \texport_shouts <limit>\n.. \tslug <slug>\n.. \tall>')
+ print('''
+ usage: python migrate.py bson
+ \n.. \ttopics <limit>
+ \n.. \tusers <limit>
+ \n.. \tshouts <limit>
+ \n.. \tcomments
+ \n.. \texport_shouts <limit>
+ \n.. \tslug <slug>
+ \n.. \tall
+ ''')
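Typical invocations, following the usage text above (limits are optional; command names are exactly the branches in the dispatch, and my-article is a placeholder slug):

python migrate.py bson               # convert the mongo BSON dumps to json tables
python migrate.py users 100          # migrate users, honoring the <limit> argument
python migrate.py shouts 50          # migrate up to 50 content items
python migrate.py slug my-article    # export a single article by slug
python migrate.py all                # run the full migration pipeline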