''' cmd managed migration '''
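# commands (mirrors the usage message at the bottom of this file):
#   python migrate.py bson                  decode bson dumps to json
#   python migrate.py users <limit>         migrate users
#   python migrate.py topics <limit>        migrate categories and tags as topics
#   python migrate.py shouts <limit>        migrate content items
#   python migrate.py comments              migrate comments
#   python migrate.py export_shouts <limit> export articles as json and markdown
#   python migrate.py email_subscriptions   migrate email subscriptions
#   python migrate.py slug <slug>           export a single article by slug
#   python migrate.py all                   run the whole pipeline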
import sys
import json
import pprint
import base64
import re

import frontmatter

from migration.tables.users import migrate as migrateUser
from migration.tables.users import migrate_2stage as migrateUser_2stage
from migration.tables.users import migrate_email_subscription
from migration.tables.content_items import get_metadata, migrate as migrateShout
from migration.tables.content_item_categories import migrate as migrateCategory
from migration.tables.tags import migrate as migrateTag
from migration.tables.comments import migrate as migrateComment
from migration.tables.comments import migrate_2stage as migrateComment_2stage
from migration.utils import DateTimeEncoder
from orm import Community, Topic, User
from orm.base import local_session
from dateutil.parser import parse as date_parse

# pretty-print everything this script logs
print = pprint.pprint

IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,(.*?))\)"
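# IMG_REGEX capture groups: 1 = alt text, 2 = the whole data URI,
# 3 = image extension (png/jpeg/jpg), 4 = the base64 payload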
OLD_DATE = '2016-03-05 22:22:00.350000'  # fallback createdAt used when sorting exported articles


def extract_images(article):
    ''' extract b64 encoded images from markdown in article body '''
    body = article['body']
    images = []
    matches = re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE)
    for i, match in enumerate(matches, start=1):
        ext = match.group(3)
        link = '/static/upload/image-' + \
            article['old_id'] + str(i) + '.' + ext
        img = match.group(4)
        if img not in images:
            # decode and write the image next to the site root
            open('..' + link, 'wb').write(base64.b64decode(img))
            images.append(img)
        # replace the inline data URI with a link to the extracted file
        body = body.replace(match.group(2), link)
        print(link)
    article['body'] = body
    return article


def users(users_by_oid, users_by_slug, users_data):
    ''' migrating users first '''
    # limiting
    limit = len(users_data)
    if len(sys.argv) > 2:
        limit = int(sys.argv[2])
    print('migrating %d users...' % limit)
    counter = 0
    id_map = {}
    for entry in users_data[:limit]:
        oid = entry['_id']
        user = migrateUser(entry)
        users_by_oid[oid] = user  # full
        # strip private fields from the public profile
        del user['password']
        del user['notifications']
        # del user['oauth']
        del user['emailConfirmed']
        del user['username']
        del user['email']
        users_by_slug[user['slug']] = user  # public
        id_map[user['old_id']] = user['slug']
        counter += 1
    print(' - * - stage 2 users migration - * -')
    for entry in users_data[:limit]:
        migrateUser_2stage(entry, id_map)
    try:
        # NOTE: by old_id
        open('migration/data/users.old_id.json', 'w').write(json.dumps(users_by_oid, cls=DateTimeEncoder))
        # NOTE: by slug
        open('migration/data/users.slug.json', 'w').write(json.dumps(users_by_slug, cls=DateTimeEncoder))
        print(str(len(users_by_slug)) + ' users migrated')
    except Exception:
        print('json dump error')
        # print(users_by_oid)


def topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data):
    ''' topics from categories and tags '''
    # limiting
    limit = len(cats_data) + len(tags_data)
    if len(sys.argv) > 2:
        limit = int(sys.argv[2])
    print('migrating %d topics...' % limit)
    counter = 0
    for tag in tags_data:
        if counter >= limit:
            break
        old_id = tag["createdBy"]
        # user_id_map is filled from the db in __main__
        tag["createdBy"] = user_id_map.get(old_id, 0)
        topic = migrateTag(tag)
        # topics_by_title[topic['title']] = topic
        topics_by_oid[topic['tag_id']] = topic
        if not topics_by_slug.get(topic['slug']):
            topics_by_slug[topic['slug']] = topic
        counter += 1
    for cat in cats_data:
        if counter >= limit:
            break
        old_id = cat["createdBy"]
        # cat["createdBy"] = user_id_map[old_id]
        topic = migrateCategory(cat)
        topics_by_oid[topic['cat_id']] = topic
        topics_by_slug[topic['slug']] = topic
        # topics_by_title[topic['title']] = topic
        counter += 1
    # export_topics = dict(topics_by_title.items())


def shouts(content_data, shouts_by_slug, shouts_by_oid):
    ''' migrating content items one by one '''
    # limiting
    try:
        limit = int(sys.argv[2]) if len(sys.argv) > 2 else len(content_data)
    except ValueError:
        limit = len(content_data)
    print('migrating %d content items...' % limit)
    counter = 0
    discours_author = 0
    errored = []
    for entry in content_data[:limit]:
        try:
            # users_by_oid and topics_by_oid are filled in __main__
            shout = migrateShout(entry, users_by_oid, topics_by_oid)
            author = shout['authors'][0]
            shout['authors'] = [author.id, ]
            shouts_by_slug[shout['slug']] = shout
            shouts_by_oid[entry['_id']] = shout
            line = str(counter + 1) + ': ' + shout['slug'] + " @" + str(author.slug)
            counter += 1
            if author.slug == 'discours':
                discours_author += 1
            print(line)
            # open('./shouts.id.log', 'a').write(line + '\n')
        except Exception as e:
            print(entry['_id'])
            errored.append(entry)
            raise e
    open('migration/data/shouts.old_id.json', 'w').write(json.dumps(shouts_by_oid, cls=DateTimeEncoder))
    open('migration/data/shouts.slug.json', 'w').write(json.dumps(shouts_by_slug, cls=DateTimeEncoder))
    print(str(counter) + '/' + str(len(content_data)) + ' content items were migrated')
    print(str(discours_author) + ' authored by @discours')


def export_shouts(shouts_by_slug, export_articles, export_authors, content_dict):
    # update what was just migrated or load json again
    if len(export_authors.keys()) == 0:
        export_authors = json.loads(open('../src/data/authors.json').read())
        print(str(len(export_authors)) + ' exported authors loaded')
    if len(export_articles.keys()) == 0:
        export_articles = json.loads(open('../src/data/articles.json').read())
        print(str(len(export_articles)) + ' exported articles loaded')
    # limiting
    limit = 33
    if len(sys.argv) > 2:
        limit = int(sys.argv[2])
    print('exporting %d articles to json...' % limit)
    # filter out non-articles, newest first
    export_list = [i for i in shouts_by_slug.items() if i[1]['layout'] == 'article']
    export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)
    print(str(len(export_list)) + ' filtered')
    export_list = export_list[:limit or len(export_list)]
    for (slug, article) in export_list:
        if article['layout'] == 'article':
            export_slug(slug, export_articles, export_authors, content_dict)


def export_body(article, content_dict):
    article = extract_images(article)
    metadata = get_metadata(article)
    content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
    slug = article['slug']
    open('../content/discours.io/' + slug + '.md', 'w').write(content)
    open('../content/discours.io/' + slug + '.html', 'w').write(content_dict[article['old_id']]['body'])


def export_slug(slug, export_articles, export_authors, content_dict):
    print('exporting %s ' % slug)
    if export_authors == {}:
        export_authors = json.loads(open('../src/data/authors.json').read())
        print(str(len(export_authors)) + ' exported authors loaded')
    if export_articles == {}:
        export_articles = json.loads(open('../src/data/articles.json').read())
        print(str(len(export_articles)) + ' exported articles loaded')
    # shouts_by_slug and users_by_slug are filled in __main__
    shout = shouts_by_slug.get(slug, False)
    assert shout, 'no data error'
    author = users_by_slug.get(shout['authors'][0]['slug'], None)
    export_authors.update({shout['authors'][0]['slug']: author})
    export_articles.update({shout['slug']: shout})
    export_body(shout, content_dict)
    # comments() expects raw comment dicts, not slugs; assumes comments_by_post
    # (built in __main__) keys match this shout's old content-item id
    comments(comments_by_post.get(shout['old_id'], []))


def comments(comments_data):
    id_map = {}
    for comment in comments_data:
        comment = migrateComment(comment, shouts_by_oid)
        if not comment:
            continue
        new_id = comment.get('id')
        old_id = comment.get('old_id')
        id_map[old_id] = new_id
    # second pass: wire up references now that all new ids are known
    for comment in comments_data:
        migrateComment_2stage(comment, id_map)
    print(str(len(id_map)) + ' comments exported')


def export_email_subscriptions(email_subscriptions_data):
    for data in email_subscriptions_data:
        migrate_email_subscription(data)
    print(str(len(email_subscriptions_data)) + ' email subscriptions exported')


def export_finish(export_articles={}, export_authors={}, export_topics={}, export_comments={}):
    open('../src/data/authors.json', 'w').write(json.dumps(
        export_authors, cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False))
    print(str(len(export_authors)) + ' authors exported')
    open('../src/data/topics.json', 'w').write(json.dumps(
        export_topics, cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False))
    print(str(len(export_topics)) + ' topics exported')
    open('../src/data/articles.json', 'w').write(json.dumps(
        export_articles, cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False))
    print(str(len(export_articles)) + ' articles exported')
    open('../src/data/comments.json', 'w').write(json.dumps(
        export_comments, cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False))
    print(str(len(export_comments)) + ' articles with comments exported')
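
# NOTE: export_finish is not wired to any command; its only call site is
# commented out under the 'slug' command below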


if __name__ == '__main__':
    if len(sys.argv) > 1:
        cmd = sys.argv[1]
        if cmd == "bson":
            # decode bson
            from migration import bson2json
            bson2json.json_tables()
        else:
            # preparing data
            users_data = json.loads(open('migration/data/users.json').read())
            # users_dict = { x['_id']: x for x in users_data } # by id
            print(str(len(users_data)) + ' users loaded')
            users_by_oid = {}
            users_by_slug = {}
            user_id_map = {}
            with local_session() as session:
                users_list = session.query(User).all()
                for user in users_list:
                    user_id_map[user.old_id] = user.id
                    users_by_oid[user.old_id] = vars(user)
            tags_data = json.loads(open('migration/data/tags.json').read())
            print(str(len(tags_data)) + ' tags loaded')
            cats_data = json.loads(open('migration/data/content_item_categories.json').read())
            print(str(len(cats_data)) + ' cats loaded')
            topics_by_oid = {}
            topics_by_slug = {}
            content_data = json.loads(open('migration/data/content_items.json').read())
            content_dict = {x['_id']: x for x in content_data}
            print(str(len(content_data)) + ' content items loaded')
            shouts_by_slug = {}
            shouts_by_oid = {}
            comments_data = json.loads(open('migration/data/comments.json').read())
            print(str(len(comments_data)) + ' comments loaded')
            comments_by_post = {}
            # sort comments by old post ids
            for old_comment in comments_data:
                cid = old_comment['contentItem']
                comments_by_post[cid] = comments_by_post.get(cid, [])
                # keep a comment only when its deletedAt field is present and empty
                if not old_comment.get('deletedAt', True):
                    comments_by_post[cid].append(old_comment)
            print(str(len(comments_by_post.keys())) + ' articles with comments')
            email_subscriptions_data = json.loads(open('migration/data/email_subscriptions.json').read())
            print(str(len(email_subscriptions_data)) + ' email subscriptions loaded')
            export_articles = {}  # slug: shout
            export_authors = {}  # slug: user
            export_comments = {}  # shout-slug: comment[] (list)
            export_topics = {}  # slug: topic
            ##################### COMMANDS #####################
            if cmd == "users":
                users(users_by_oid, users_by_slug, users_data)
            elif cmd == "topics":
                topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data)
            elif cmd == "shouts":
                shouts(content_data, shouts_by_slug, shouts_by_oid)  # NOTE: respects the <limit> argument
            elif cmd == "comments":
                comments(comments_data)
            elif cmd == "export_shouts":
                export_shouts(shouts_by_slug, export_articles, export_authors, content_dict)
            elif cmd == "email_subscriptions":
                export_email_subscriptions(email_subscriptions_data)
            elif cmd == "all":
                users(users_by_oid, users_by_slug, users_data)
                topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data)
                shouts(content_data, shouts_by_slug, shouts_by_oid)
                comments(comments_data)
                export_email_subscriptions(email_subscriptions_data)
            elif cmd == 'slug':
                export_slug(sys.argv[2], export_articles, export_authors, content_dict)
                # export_finish(export_articles, export_authors, export_topics, export_comments)
            else:
                print('''usage: python migrate.py <command>

    bson                   decode bson dumps to json
    users <limit>          migrate users
    topics <limit>         migrate categories and tags as topics
    shouts <limit>         migrate content items
    comments               migrate comments
    export_shouts <limit>  export articles as json and markdown
    email_subscriptions    migrate email subscriptions
    slug <slug>            export a single article by slug
    all                    run the whole migration
''')