core/migrate.py

''' cmd managed migration '''
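# pipeline (inferred from the commands in __main__ below):
# 1. `python migrate.py bson` converts mongodb dumps to migration/data/*.json
# 2. users/topics/shouts migrate those dumps and write *.dict.json lookups
# 3. export_shouts/export_slug/comments emit ../src/data/*.json and
#    ../content/discours.io/*.{md,html} for the frontend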
import sys
import json
import base64
import re
import frontmatter
from migration.tables.users import migrate as migrateUser
from migration.tables.content_items import get_metadata, migrate as migrateShout
from migration.tables.content_item_categories import migrate as migrateCategory
from migration.tables.tags import migrate as migrateTag
from migration.utils import DateTimeEncoder
from orm import Community

IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,(.*?))\)"
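# a hypothetical IMG_REGEX match: '![pic](data:image/png;base64,iVBORw0...)'
# gives group(1)='pic', group(2)=the whole data uri, group(3)='png' and
# group(4)=the base64 payload; extract_images() below replaces group(2)
# with a '/static/upload/image-<old_id><n>.png' link and writes the decoded
# bytes to that path under the parent directory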
OLD_DATE = '2016-03-05 22:22:00.350000'


def extract_images(article):
    ''' extract b64 encoded images from markdown in article body '''
    body = article['body']
    images = []
    matches = re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE)
    for i, match in enumerate(matches, start=1):
        ext = match.group(3)
        link = '/static/upload/image-' + \
            article['old_id'] + str(i) + '.' + ext
        img = match.group(4)
        if img not in images:
            # write each unique image to disk once
            open('..' + link, 'wb').write(base64.b64decode(img))
            images.append(img)
        # swap the inline data uri for the uploaded file link
        body = body.replace(match.group(2), link)
        print(link)
    article['body'] = body
    return article


def users():
    ''' migrating users first '''
    print('migrating users...')
    newdata = {}
    data = json.loads(open('migration/data/users.json').read())
    counter = 0
    export_data = {}
    for entry in data:
        oid = entry['_id']
        user = migrateUser(entry)
        newdata[oid] = user
        # strip private fields from the exported user
        del user['password']
        del user['notifications']
        # del user['oauth']
        del user['emailConfirmed']
        del user['username']
        del user['email']
        export_data[user['slug']] = user
        counter += 1
    # export the 10 top-rated users
    export_list = sorted(export_data.items(),
                         key=lambda item: item[1]['rating'])[-10:]
    open('migration/data/users.dict.json', 'w').write(json.dumps(newdata, cls=DateTimeEncoder))  # NOTE: by old_id
    open('../src/data/authors.json', 'w').write(json.dumps(dict(export_list),
                                                           cls=DateTimeEncoder,
                                                           indent=4,
                                                           sort_keys=True,
                                                           ensure_ascii=False))
    print(str(len(newdata.items())) + ' user accounts were migrated')
    print(str(len(export_list)) + ' authors were exported')


def topics():
    ''' topics from categories and tags '''
    print('migrating topics...')
    cat_data = json.loads(
        open('migration/data/content_item_categories.json').read())
    # tag_data = json.loads(open('migration/data/tags.json').read())
    new_data = {}
    old_data = {}
    counter = 0
    try:
        for cat in cat_data:
            topic = migrateCategory(cat)
            old_data[topic['old_id']] = topic
            new_data[topic['slug']] = topic
            counter += 1
    except Exception:
        print('cats exception, try to remove database first')
    '''
    try:
        for tag in tag_data:
            topic = migrateTag(tag)
            new_data[topic['slug']] = topic
            counter += 1
    except Exception:
        print('tags exception, try to remove database first')
        raise Exception
    '''
    export_list = sorted(new_data.items(), key=lambda item: str(
        item[1]['createdAt']))
    open('migration/data/topics.dict.json',
         'w').write(json.dumps(old_data, cls=DateTimeEncoder))  # NOTE: by old_id
    open('../src/data/topics.json', 'w').write(json.dumps(dict(export_list),
                                                          cls=DateTimeEncoder,
                                                          indent=4,
                                                          sort_keys=True,
                                                          ensure_ascii=False))
    print(str(counter) + ' from ' + str(len(cat_data)) + ' cats were migrated')
    # ' tags and ' + str(len(tag_data)) +
    print(str(len(export_list)) + ' topics were exported')


def shouts():
    ''' migrating content items one by one '''
    print('loading shouts...')
    counter = 0
    discours_author = 0
    content_data = json.loads(open('migration/data/content_items.json').read())
    content_dict = {x['_id']: x for x in content_data}
    newdata = {}
    print(str(len(content_data)) + ' entries loaded. now migrating...')
    errored = []
    for entry in content_data:
        try:
            shout = migrateShout(entry)
            newdata[shout['slug']] = shout
            author = newdata[shout['slug']]['authors'][0]['slug']
            line = str(counter + 1) + ': ' + shout['slug'] + " @" + str(author)
            print(line)
            counter += 1
            if author == 'discours':
                discours_author += 1
            open('./shouts.id.log', 'a').write(line + '\n')
        except Exception as e:
            print(entry['_id'])
            errored.append(entry)
            raise e
    open('migration/data/shouts.dict.json',
         'w').write(json.dumps(newdata, cls=DateTimeEncoder))
    print(str(counter) + '/' + str(len(content_data)) +
          ' content items were migrated')
    print(str(discours_author) + ' from them by @discours')


def comments():
    ''' migrating comments on content items one by one '''
    comments_data = json.loads(open('migration/data/comments.json').read())
    print(str(len(comments_data)) + ' comments loaded')
    # group comments by the content item they belong to
    comments_by_post = {}
    for comment in comments_data:
        p = comment['contentItem']
        comments_by_post[p] = comments_by_post.get(p, [])
        comments_by_post[p].append(comment)
    export_articles = json.loads(open('../src/data/articles.json').read())
    print(str(len(export_articles.items())) + ' articles were exported')
    export_comments = {}
    c = 0
    for slug, article in export_articles.items():
        # NOTE: this lookup assumes comment['contentItem'] holds the article
        # slug used as the key in articles.json
        comments = comments_by_post.get(slug, [])
        if len(comments) > 0:
            export_comments[slug] = comments
            c += len(comments)
    print(str(len(export_comments.items())) + ' exported articles have comments')
    open('../src/data/comments.json', 'w').write(json.dumps(dict(export_comments),
                                                            cls=DateTimeEncoder,
                                                            indent=4,
                                                            sort_keys=True,
                                                            ensure_ascii=False))
    print(str(c) + ' comments were exported')


def export_shouts(limit):
    ''' export migrated articles as markdown with frontmatter '''
    print('reading json...')
    newdata = json.loads(open('migration/data/shouts.dict.json', 'r').read())
    print(str(len(newdata.keys())) + ' shouts loaded')
    content_data = json.loads(open('migration/data/content_items.json').read())
    content_dict = {x['_id']: x for x in content_data}
    users_old = json.loads(open('migration/data/users.dict.json').read())
    export_authors = json.loads(open('../src/data/authors.json').read())
    print(str(len(export_authors.items())) + ' pre-exported authors loaded')
    users_slug = {u['slug']: u for old_id, u in users_old.items()}
    print(str(len(users_slug.items())) + ' users loaded')
    # take only published articles, newest first
    export_list = [i for i in newdata.items()
                   if i[1]['layout'] == 'article' and i[1]['published']]
    export_list = sorted(export_list,
                         key=lambda item: item[1]['createdAt'] or OLD_DATE,
                         reverse=True)
    print(str(len(export_list)) + ' filtered')
    export_list = export_list[:limit or len(export_list)]
    export_clean = {}
    for (slug, article) in export_list:
        if article['layout'] == 'article':
            for author in article['authors']:
                export_authors[author['slug']] = users_slug[author['slug']]
            export_clean[article['slug']] = extract_images(article)
            # python-frontmatter serializes metadata as a yaml header:
            # '---\ntitle: ...\n---\n<body>' (fields come from get_metadata())
            metadata = get_metadata(article)
            content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
            open('../content/discours.io/' + slug + '.md', 'w').write(content)
            # keep the original html body next to the generated markdown
            open('../content/discours.io/' + slug + '.html',
                 'w').write(content_dict[article['old_id']]['body'])
    open('../src/data/articles.json', 'w').write(json.dumps(dict(export_clean),
                                                            cls=DateTimeEncoder,
                                                            indent=4,
                                                            sort_keys=True,
                                                            ensure_ascii=False))
    print(str(len(export_clean.items())) + ' articles exported')
    open('../src/data/authors.json', 'w').write(json.dumps(export_authors,
                                                           cls=DateTimeEncoder,
                                                           indent=4,
                                                           sort_keys=True,
                                                           ensure_ascii=False))
    comments()
    print(str(len(export_authors.items())) + ' total authors exported')


def export_slug(slug):
    ''' export a single shout by slug '''
    shouts_dict = json.loads(open('migration/data/shouts.dict.json').read())
    print(str(len(shouts_dict.items())) + ' shouts loaded')
    users_old = json.loads(open('migration/data/users.dict.json').read())
    print(str(len(users_old.items())) + ' users loaded')
    users_dict = {u['slug']: u for old_id, u in users_old.items()}
    exported_authors = json.loads(open('../src/data/authors.json').read())
    print(str(len(exported_authors.items())) + ' authors were exported before')
    exported_articles = json.loads(open('../src/data/articles.json').read())
    print(str(len(exported_articles.items())) + ' articles were exported before')
    shout = shouts_dict.get(slug, False)
    if shout:
        author = users_dict.get(shout['authors'][0]['slug'], None)
        exported_authors.update({shout['authors'][0]['slug']: author})
        exported_articles.update({shout['slug']: shout})
        print(shout)
        open('../src/data/articles.json', 'w').write(json.dumps(exported_articles,
                                                                cls=DateTimeEncoder,
                                                                indent=4,
                                                                sort_keys=True,
                                                                ensure_ascii=False))
        open('../src/data/authors.json', 'w').write(json.dumps(exported_authors,
                                                               cls=DateTimeEncoder,
                                                               indent=4,
                                                               sort_keys=True,
                                                               ensure_ascii=False))
    else:
        print('slug not found among migrated shouts!')
        print(str(len(shouts_dict)) + ' shouts were migrated')
        print(slug)
    comments()
    print('finished.')


if __name__ == '__main__':
    if len(sys.argv) > 1:
        if sys.argv[1] == "users":
            users()
        elif sys.argv[1] == "topics":
            topics()
        elif sys.argv[1] == "comments":
            comments()
        elif sys.argv[1] == "shouts":
            # make sure the default community exists before migrating content
            try:
                Community.create(**{
                    'slug': 'discours.io',
                    'name': 'Дискурс',
                    'pic': 'https://discours.io/images/logo-min.svg',
                    'createdBy': '0',
                    'createdAt': OLD_DATE
                })
            except Exception:
                pass
            shouts()
        elif sys.argv[1] == "export_shouts":
            limit = int(sys.argv[2]) if len(sys.argv) > 2 else None
            export_shouts(limit)
        elif sys.argv[1] == "all":
            users()
            topics()
            shouts()
        elif sys.argv[1] == "bson":
            from migration import bson2json
            bson2json.json_tables()
        elif sys.argv[1] == 'slug':
            export_slug(sys.argv[2])
        else:
            print('usage: python migrate.py <bson|users|topics|shouts|comments|export_shouts [num]|slug [str]|all>')
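
# example invocations (the slug and limit values here are hypothetical):
#   python migrate.py bson                     # dump mongodb collections to json
#   python migrate.py all                      # migrate users, topics and shouts
#   python migrate.py export_shouts 100        # export the 100 newest published articles
#   python migrate.py slug discours-manifest   # re-export one article by slug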