''' command-line managed migration '''
import json
import base64
import re
import sys

import frontmatter
from dateutil.parser import parse as date_parse

from migration.tables.users import migrate as migrateUser
from migration.tables.content_items import get_metadata, migrate as migrateShout
from migration.tables.content_item_categories import migrate as migrateCategory
from migration.tables.tags import migrate as migrateTag
from migration.tables.comments import migrate as migrateComment
from migration.utils import DateTimeEncoder
from orm import Community
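# Matches inline base64 images in markdown bodies, e.g.
#   ![alt text](data:image/png;base64,iVBORw0KGgo...)
# group(1) = alt text, group(2) = the whole data URI,
# group(3) = file extension, group(4) = base64 payload.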
IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,(.*?))\)"
OLD_DATE = '2016-03-05 22:22:00.350000'  # fallback date for entries without createdAt
def extract_images(article):
''' extract b64 encoded images from markdown in article body '''
body = article['body']
images = []
matches = re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE)
for i, match in enumerate(matches, start=1):
ext = match.group(3)
link = '/static/upload/image-' + \
article['old_id'] + str(i) + '.' + ext
img = match.group(4)
if img not in images:
            with open('..' + link, 'wb') as f:
                f.write(base64.b64decode(img))
images.append(img)
body = body.replace(match.group(2), link)
print(link)
article['body'] = body
return article
def users():
''' migrating users first '''
print('migrating users...')
newdata = {}
data = json.loads(open('migration/data/users.json').read())
counter = 0
export_data = {}
for entry in data:
oid = entry['_id']
user = migrateUser(entry)
newdata[oid] = user
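        # strip private fields before the public export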
del user['password']
del user['notifications']
# del user['oauth']
del user['emailConfirmed']
del user['username']
del user['email']
export_data[user['slug']] = user
counter += 1
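    # keep only the top 10 users by rating for the authors export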
export_list = sorted(export_data.items(), key=lambda item: item[1]['rating'])[-10:]
    with open('migration/data/users.dict.json', 'w') as f:
        f.write(json.dumps(newdata, cls=DateTimeEncoder))  # NOTE: keyed by old_id
    with open('../src/data/authors.json', 'w') as f:
        f.write(json.dumps(dict(export_list),
                           cls=DateTimeEncoder,
                           indent=4,
                           sort_keys=True,
                           ensure_ascii=False))
print(str(len(newdata.items())) + ' user accounts were migrated')
print(str(len(export_list)) + ' authors were exported')
def topics():
''' topics from categories and tags '''
print('migrating topics...')
cats_data = json.loads(open('migration/data/content_item_categories.json').read())
cat_topics = {}
slug_topics = {}
counter = 0
try:
for cat in cats_data:
topic = migrateCategory(cat)
cat_topics[topic['cat_id']] = topic
slug_topics[topic['slug']] = topic
counter += 1
    except Exception as e:
        print('categories migration failed; try dropping the database first')
        raise e
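    # NOTE: the tags migration below is disabled; tag_data is never loaded yet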
'''
try:
for tag in tag_data:
topic = migrateTag(tag)
newdata[topic['slug']] = topic
counter += 1
except Exception:
print('tags exception, try to remove database first')
raise Exception
'''
export_list = sorted(slug_topics.items(), key=lambda item: str(
item[1]['createdAt']))
    with open('migration/data/topics.dict.json', 'w') as f:
        f.write(json.dumps(cat_topics,
                           cls=DateTimeEncoder,
                           indent=4,
                           sort_keys=True,
                           ensure_ascii=False))
    with open('../src/data/topics.json', 'w') as f:
        f.write(json.dumps(dict(export_list),
                           cls=DateTimeEncoder,
                           indent=4,
                           sort_keys=True,
                           ensure_ascii=False))
#' tags and ' + str(len(tag_data)) +
    print(str(counter) + ' / ' + str(len(cats_data)) + ' categories migrated')
print(str(len(export_list)) + ' topics were exported')
def shouts():
''' migrating content items one by one '''
print('loading shouts...')
counter = 0
discours_author = 0
content_data = json.loads(open('migration/data/content_items.json').read())
content_dict = { x['_id']:x for x in content_data }
newdata = {}
print(str(len(content_data)) + ' entries loaded. now migrating...')
errored = []
for entry in content_data:
try:
shout = migrateShout(entry)
newdata[shout['slug']] = shout
            author = shout['authors'][0]['slug']
line = str(counter+1) + ': ' + shout['slug'] + " @" + str(author)
print(line)
counter += 1
if author == 'discours':
discours_author += 1
            with open('./shouts.id.log', 'a') as f:
                f.write(line + '\n')
except Exception as e:
print(entry['_id'])
errored.append(entry)
raise e
    with open('migration/data/shouts.dict.json', 'w') as f:
        f.write(json.dumps(newdata, cls=DateTimeEncoder))
print(str(counter) + '/' + str(len(content_data)) +
' content items were migrated')
    print(str(discours_author) + ' of them are by @discours')
def comments():
''' migrating comments on content items one by one '''
    content_data = json.loads(open('migration/data/content_items.json').read())  # old content items
    content_dict = { x['_id']: x for x in content_data }  # by old id
shouts_dict = json.loads(open('migration/data/shouts.dict.json', 'r').read()) # all shouts by slug
print(str(len(shouts_dict.keys())) + ' migrated shouts loaded')
shouts_old = { x['old_id']: x for slug, x in shouts_dict.items() } # shouts by old_id
print(str(len(content_data)) + ' content items loaded')
    comments_data = json.loads(open('migration/data/comments.json').read())
print(str(len(comments_data)) + ' comments loaded')
comments_by_post = {}
    # group comments by their old content item id
    for old_comment in comments_data:
        cid = old_comment['contentItem']
        comments_by_post.setdefault(cid, []).append(old_comment)
# migrate comments
comments_by_shoutslug = {}
for content_item in content_data:
old_id = content_item['_id']
if content_item.get('commentedAt', False):
comments = [ migrateComment(c) for c in comments_by_post.get(old_id, []) ]
            if len(comments) > 0:
shout = shouts_old.get(old_id, { 'slug': 'abandoned-comments' })
comments_by_shoutslug[shout['slug']] = comments
export_articles = json.loads(open('../src/data/articles.json').read())
    print(str(len(export_articles.items())) + ' exported articles loaded')
export_comments = {}
c = 0
for slug, article in export_articles.items():
comments = comments_by_shoutslug.get(slug, [])
if len(comments) > 0:
export_comments[slug] = comments
c += len(comments)
    print(str(len(export_comments.items())) + ' exported articles have comments')
    with open('../src/data/comments.json', 'w') as f:
        f.write(json.dumps(export_comments,
                           cls=DateTimeEncoder,
                           indent=4,
                           sort_keys=True,
                           ensure_ascii=False))
print(str(c) + ' comments were exported')
def export_shouts(limit):
print('reading json...')
content_data = json.loads(open('migration/data/content_items.json').read())
content_dict = { x['_id']:x for x in content_data }
print(str(len(content_data)) + ' content items loaded')
newdata = json.loads(open('migration/data/shouts.dict.json', 'r').read())
print(str(len(newdata.keys())) + ' migrated shouts loaded')
users_old = json.loads(open('migration/data/users.dict.json').read())
    print(str(len(users_old.keys())) + ' migrated users loaded')
export_authors = json.loads(open('../src/data/authors.json').read())
print(str(len(export_authors.items())) + ' exported authors loaded')
users_slug = { u['slug']: u for old_id, u in users_old.items()}
print(str(len(users_slug.items())) + ' users loaded')
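    # export only published content with the article layout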
export_list = [i for i in newdata.items() if i[1]['layout'] == 'article' and i[1]['published']]
export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)
print(str(len(export_list)) + ' filtered')
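    # a limit of None (or 0) exports everything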
export_list = export_list[:limit or len(export_list)]
export_clean = {}
for (slug, article) in export_list:
if article['layout'] == 'article':
for author in article['authors']:
export_authors[author['slug']] = users_slug[author['slug']]
export_clean[article['slug']] = extract_images(article)
            metadata = get_metadata(article)
            content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
            with open('../content/discours.io/' + slug + '.md', 'w') as f:
                f.write(content)
            # print(slug)
            with open('../content/discours.io/' + slug + '.html', 'w') as f:
                f.write(content_dict[article['old_id']]['body'])
    with open('../src/data/articles.json', 'w') as f:
        f.write(json.dumps(dict(export_clean),
                           cls=DateTimeEncoder,
                           indent=4,
                           sort_keys=True,
                           ensure_ascii=False))
print(str(len(export_clean.items())) + ' articles exported')
    with open('../src/data/authors.json', 'w') as f:
        f.write(json.dumps(export_authors,
                           cls=DateTimeEncoder,
                           indent=4,
                           sort_keys=True,
                           ensure_ascii=False))
comments()
print(str(len(export_authors.items())) + ' total authors exported')
def export_slug(slug):
shouts_dict = json.loads(open('migration/data/shouts.dict.json').read())
print(str(len(shouts_dict.items())) + ' migrated shouts loaded')
users_old = json.loads(open('migration/data/users.dict.json').read()) # NOTE: this exact file is by old_id
print(str(len(users_old.items())) + ' migrated users loaded')
    users_dict = { u['slug']: u for old_id, u in users_old.items() }  # by slug
exported_authors = json.loads(open('../src/data/authors.json').read())
print(str(len(exported_authors.items())) + ' exported authors loaded')
exported_articles = json.loads(open('../src/data/articles.json').read())
print(str(len(exported_articles.items())) + ' exported articles loaded')
shout = shouts_dict.get(slug, False)
if shout:
author = users_dict.get(shout['authors'][0]['slug'], None)
exported_authors.update({shout['authors'][0]['slug']: author})
exported_articles.update({shout['slug']: shout})
print(shout)
        with open('../src/data/articles.json', 'w') as f:
            f.write(json.dumps(exported_articles,
                               cls=DateTimeEncoder,
                               indent=4,
                               sort_keys=True,
                               ensure_ascii=False))
        with open('../src/data/authors.json', 'w') as f:
            f.write(json.dumps(exported_authors,
                               cls=DateTimeEncoder,
                               indent=4,
                               sort_keys=True,
                               ensure_ascii=False))
    else:
        print('error: no migrated shout found for slug: ' + slug)
        # print(str(len(shouts_dict)) + ' shouts were migrated')
comments()
print('finished.')
if __name__ == '__main__':
if len(sys.argv) > 1:
if sys.argv[1] == "users":
users()
elif sys.argv[1] == "topics":
topics()
elif sys.argv[1] == "shouts":
try:
Community.create(**{
'slug': 'discours.io',
'name': 'Дискурс',
'pic': 'https://discours.io/images/logo-min.svg',
'createdBy': '0',
'createdAt': date_parse(OLD_DATE)
})
except Exception:
pass
shouts()
elif sys.argv[1] == "comments":
comments()
elif sys.argv[1] == "export_shouts":
limit = int(sys.argv[2]) if len(sys.argv) > 2 else None
export_shouts(limit)
elif sys.argv[1] == "all":
users()
topics()
shouts()
comments()
elif sys.argv[1] == "bson":
from migration import bson2json
bson2json.json_tables()
elif sys.argv[1] == 'slug':
export_slug(sys.argv[2])
    else:
        print('usage: python migrate.py <command>\n'
              '\tbson\n'
              '\tusers\n'
              '\ttopics\n'
              '\tshouts\n'
              '\tcomments\n'
              '\texport_shouts <limit>\n'
              '\tslug <slug>\n'
              '\tall')