core/migrate.py
2021-11-23 17:17:19 +03:00

368 lines
14 KiB
Python

''' cmd managed migration '''
import json
import pprint
import base64
import re
import frontmatter
from migration.tables.users import migrate as migrateUser
from migration.tables.content_items import get_metadata, migrate as migrateShout
from migration.tables.content_item_categories import migrate as migrateCategory
from migration.tables.tags import migrate as migrateTag
from migration.tables.comments import migrate as migrateComment
from migration.utils import DateTimeEncoder
from orm import Community, Topic
from dateutil.parser import parse as date_parse
from orm.base import local_session
from orm import User
print = pprint.pprint
IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,(.*?))\)"
OLD_DATE = '2016-03-05 22:22:00.350000'
def extract_images(article):
''' extract b64 encoded images from markdown in article body '''
body = article['body']
images = []
matches = re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE)
for i, match in enumerate(matches, start=1):
ext = match.group(3)
link = '/static/upload/image-' + \
article['old_id'] + str(i) + '.' + ext
img = match.group(4)
if img not in images:
open('..' + link, 'wb').write(base64.b64decode(img))
images.append(img)
body = body.replace(match.group(2), link)
print(link)
article['body'] = body
return article
def users(users_by_oid, users_by_slug, users_data):
''' migrating users first '''
# limiting
limit = len(users_data)
if len(sys.argv) > 2: limit = int(sys.argv[2])
print('migrating %d users...' % limit)
counter = 0
for entry in users_data:
oid = entry['_id']
user = migrateUser(entry)
users_by_oid[oid] = user # full
del user['password']
del user['notifications']
# del user['oauth']
del user['emailConfirmed']
del user['username']
del user['email']
users_by_slug[user['slug']] = user # public
counter += 1
export_authors = dict(sorted(users_by_slug.items(), key=lambda item: item[1]['rating'])[-10:])
try:
open('migration/data/users.old_id.json', 'w').write(json.dumps(users_by_oid, cls=DateTimeEncoder)) # NOTE: by old_id
open('migration/data/users.slug.json', 'w').write(json.dumps(users_by_slug, cls=DateTimeEncoder)) # NOTE: by slug
print(str(len(users_by_slug.items())) + ' users migrated')
except Exception:
print('json dump error')
print(users_by_oid)
def topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data):
''' topics from categories and tags '''
# limiting
limit = len(cats_data) + len(tags_data)
if len(sys.argv) > 2: limit = int(sys.argv[2])
print('migrating %d topics...' % limit)
counter = 0
for cat in cats_data:
old_id = cat["createdBy"]
# cat["createdBy"] = user_id_map[old_id]
try: topic = migrateCategory(cat)
except Exception as e: raise e
topics_by_cat[topic['cat_id']] = topic
topics_by_slug[topic['slug']] = topic
counter += 1
for tag in tags_data:
old_id = tag["createdBy"]
tag["createdBy"] = user_id_map.get(old_id, 0)
topic = migrateTag(tag)
topics_by_tag[topic['tag_id']] = topic
if not topics_by_slug.get(topic['slug']): topics_by_slug[topic['slug']] = topic
counter += 1
export_topics = dict(topics_by_slug.items()) # sorted(topics_by_slug.items(), key=lambda item: str(item[1]['createdAt']))) # NOTE: sorting does not work :)
open('migration/data/topics.slug.json','w').write(json.dumps(topics_by_slug,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
open('migration/data/topics.cat_id.json','w').write(json.dumps(topics_by_cat,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
def shouts(content_data, shouts_by_slug, shouts_by_oid):
''' migrating content items one by one '''
# limiting
limit = len(content_data)
if len(sys.argv) > 2: limit = int(sys.argv[2])
print('migrating %d content items...' % limit)
counter = 0
discours_author = 0
errored = []
# limiting
try: limit = int(sys.argv[2]) if len(sys.argv) > 2 else len(content_data)
except ValueError: limit = len(content_data)
with local_session() as session:
community = session.query(Community).filter(Community.id == 0).first()
if not community:
Community.create(**{
'id' : 0,
'slug': 'discours.io',
'name': 'Дискурс',
'pic': 'https://discours.io/images/logo-min.svg',
'createdBy': '0',
'createdAt': date_parse(OLD_DATE)
})
if not topics_by_cat:
with local_session() as session:
topics = session.query(Topic).all()
print("loaded %s topics" % len(topics))
for topic in topics:
topics_by_cat[topic.cat_id] = topic
for entry in content_data[:limit]:
try:
shout = migrateShout(entry, users_by_oid, topics_by_cat)
author = shout['authors'][0]
shout['authors'] = [ author.id, ]
shouts_by_slug[shout['slug']] = shout
shouts_by_oid[entry['_id']] = shout
line = str(counter+1) + ': ' + shout['slug'] + " @" + str(author.slug)
counter += 1
if author.slug == 'discours': discours_author += 1
print(line)
# open('./shouts.id.log', 'a').write(line + '\n')
except Exception as e:
print(entry['_id'])
errored.append(entry)
raise e
open('migration/data/shouts.old_id.json','w').write(json.dumps(shouts_by_oid, cls=DateTimeEncoder))
open('migration/data/shouts.slug.json','w').write(json.dumps(shouts_by_slug, cls=DateTimeEncoder))
print(str(counter) + '/' + str(len(content_data)) + ' content items were migrated')
print(str(discours_author) + ' authored by @discours')
def export_shouts(shouts_by_slug, export_articles, export_authors, content_dict):
# update what was just migrated or load json again
if len(export_authors.keys()) == 0:
export_authors = json.loads(open('../src/data/authors.json').read())
print(str(len(export_authors.items())) + ' exported authors loaded')
if len(export_articles.keys()) == 0:
export_articles = json.loads(open('../src/data/articles.json').read())
print(str(len(export_articles.items())) + ' exported articles loaded')
# limiting
limit = 33
if len(sys.argv) > 2: limit = int(sys.argv[2])
print('exporting %d articles to json...' % limit)
# filter
export_list = [i for i in shouts_by_slug.items() if i[1]['layout'] == 'article']
export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)
print(str(len(export_list)) + ' filtered')
export_list = export_list[:limit or len(export_list)]
for (slug, article) in export_list:
if article['layout'] == 'article':
export_slug(slug, export_articles, export_authors, content_dict)
def export_body(article, content_dict):
article = extract_images(article)
metadata = get_metadata(article)
content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
open('../content/discours.io/'+slug+'.md', 'w').write(content)
open('../content/discours.io/'+slug+'.html', 'w').write(content_dict[article['old_id']]['body'])
def export_slug(slug, export_articles, export_authors, content_dict):
print('exporting %s ' % slug)
if export_authors == {}:
export_authors = json.loads(open('../src/data/authors.json').read())
print(str(len(export_authors.items())) + ' exported authors loaded')
if export_articles == {}:
export_articles = json.loads(open('../src/data/articles.json').read())
print(str(len(export_articles.items())) + ' exported articles loaded')
shout = shouts_by_slug.get(slug, False)
assert shout, 'no data error'
author = users_by_slug.get(shout['authors'][0]['slug'], None)
export_authors.update({shout['authors'][0]['slug']: author})
export_articles.update({shout['slug']: shout})
export_body(shout, content_dict)
comments([slug, ])
def comments(sluglist, export_comments, export_articles, shouts_by_slug, content_dict):
''' migrating comments on content items one '''
if len(sluglist) == 0:
export_articles = json.loads(open('../src/data/articles.json').read())
print(str(len(export_articles.items())) + ' articles were exported before')
if len(sluglist) == 0: sluglist = list(export_articles.keys())
if len(sluglist) > 0:
print('exporting comments for: ')
print(' '.join(sluglist))
for slug in sluglist:
shout = shouts_by_slug[slug]
old_id = shout['old_id']
content_item = content_dict.get(old_id, {})
if content_item.get('commentedAt', False):
comments = [ migrateComment(c) for c in comments_by_post.get(old_id, []) ]
if len(comments) > 0:
export_comments[slug] = comments
sys.stdout.write('.')
else:
print('exporting comments for top 10 commented articles...')
comments_by_shoutslug = {}
for content_item in content_data:
old_id = content_item['_id']
if content_item.get('commentedAt', False):
comments = [ migrateComment(c) for c in comments_by_post.get(old_id, []) ]
if len(comments) > 0:
shout = shouts_by_oid.get(old_id, { 'slug': 'abandoned-comments' })
comments_by_shoutslug[shout['slug']] = comments
top = dict(sorted(comments_by_shoutslug.items(), reverse=True, key=lambda c: len(c[1]))[:10])
export_comments.update(top)
print(str(len(export_comments.keys())) + ' articls with comments exported\n')
def export_finish(export_articles = {}, export_authors = {}, export_topics = {}, export_comments = {}):
open('../src/data/authors.json', 'w').write(json.dumps(export_authors,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(len(export_authors.items())) + ' authors exported')
open('../src/data/topics.json', 'w').write(json.dumps(export_topics,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(len(export_topics.keys())) + ' topics exported')
open('../src/data/articles.json', 'w').write(json.dumps(export_articles,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(len(export_articles.items())) + ' articles exported')
open('../src/data/comments.json', 'w').write(json.dumps(export_comments,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(len(export_comments.items())) + ' exported articles with comments')
if __name__ == '__main__':
import sys
if len(sys.argv) > 1:
cmd = sys.argv[1]
if cmd == "bson":
# decode bson
from migration import bson2json
bson2json.json_tables()
else:
# preparing data
users_data = json.loads(open('migration/data/users.json').read())
# users_dict = { x['_id']: x for x in users_data } # by id
print(str(len(users_data)) + ' users loaded')
users_by_oid = {}
users_by_slug = {}
with local_session() as session:
default_user = session.query(User).filter(User.id == 0).first()
if not default_user:
default_user = User.create(id = 0, email = "discours@discours.io", username = "discours", slug = "default", old_id = 0)
user_id_map = {}
with local_session() as session:
users_list = session.query(User).all()
for user in users_list:
user_id_map[user.old_id] = user.id
users_by_oid[user.old_id] = vars(user)
tags_data = json.loads(open('migration/data/tags.json').read())
print(str(len(tags_data)) + ' tags loaded')
cats_data = json.loads(open('migration/data/content_item_categories.json').read())
print(str(len(cats_data)) + ' cats loaded')
topics_by_cat = {}
topics_by_tag = {}
topics_by_slug = {}
content_data = json.loads(open('migration/data/content_items.json').read())
content_dict = { x['_id']: x for x in content_data }
print(str(len(content_data)) + ' content items loaded')
shouts_by_slug = {}
shouts_by_oid = {}
comments_data = json.loads(open('migration/data/comments.json').read())
print(str(len(comments_data)) + ' comments loaded')
comments_by_post = {}
# sort comments by old posts ids
for old_comment in comments_data:
cid = old_comment['contentItem']
comments_by_post[cid] = comments_by_post.get(cid, [])
if not old_comment.get('deletedAt', True):
comments_by_post[cid].append(old_comment)
print(str(len(comments_by_post.keys())) + ' articles with comments')
export_articles = {} # slug: shout
export_authors = {} # slug: user
export_comments = {} # shout-slug: comment[] (list)
export_topics = {} # slug: topic
##################### COMMANDS ##########################3
if cmd == "users":
users(users_by_oid, users_by_slug, users_data)
elif cmd == "topics":
topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data)
elif cmd == "shouts":
shouts(content_data, shouts_by_slug, shouts_by_oid) # NOTE: listens limit
elif cmd == "comments":
for comment in comments_data:
migrateComment(comment)
elif cmd == "export_shouts":
export_shouts(shouts_by_slug, export_articles, export_authors, content_dict)
elif cmd == "all":
users(users_by_oid, users_by_slug, users_data)
topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data)
shouts(content_data, shouts_by_slug, shouts_by_oid)
for comment in comments_data:
migrateComment(comment)
elif cmd == 'slug':
export_slug(sys.argv[2], export_articles, export_authors, content_dict)
#export_finish(export_articles, export_authors, export_topics, export_comments)
else:
print('''
usage: python migrate.py bson
\n.. \ttopics <limit>
\n.. \tusers <limit>
\n.. \tshouts <limit>
\n.. \texport_shouts <limit>
\n.. \tslug <slug>
\n.. \tall
''')