migration fix, new html2text, export wip

2021-10-15 13:00:26 +03:00
parent 7ec763391b
commit 14fdfe71e5
21 changed files with 3358 additions and 564 deletions


@@ -17,263 +17,275 @@ IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,(.*?))\)"
OLD_DATE = '2016-03-05 22:22:00.350000'
def extract_images(article):
''' extract b64 encoded images from markdown in article body '''
body = article['body']
images = []
matches = re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE)
for i, match in enumerate(matches, start=1):
ext = match.group(3)
link = '/static/upload/image-' + \
article['old_id'] + str(i) + '.' + ext
img = match.group(4)
if img not in images:
open('..' + link, 'wb').write(base64.b64decode(img))
images.append(img)
body = body.replace(match.group(2), link)
print(link)
article['body'] = body
return article
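For orientation, the capture groups of IMG_REGEX are: group 1 the alt text, group 2 the whole data URI (the part that gets swapped for the upload link), group 3 the extension, group 4 the base64 payload. A minimal self-contained check with made-up sample data:

import re

IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,(.*?))\)"
sample = "![logo](data:image/png;base64,iVBORw0KGgo=)"
m = re.search(IMG_REGEX, sample)
assert m.group(1) == 'logo'          # alt text
assert m.group(3) == 'png'           # file extension for the upload link
assert m.group(4) == 'iVBORw0KGgo='  # base64 payload to decode and write out
# group(2) is the full data URI that body.replace() swaps for the link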
if __name__ == '__main__':
import sys
users_data = json.loads(open('migration/data/users.json').read())
users_dict = { x['_id']: x for x in users_data } # by id
print(str(len(users_data)) + ' users loaded')
users_by_oid = {}
users_by_slug = {}
def users():
''' migrating users first '''
print('migrating users...')
newdata = {}
data = json.loads(open('migration/data/users.json').read())
counter = 0
export_data = {}
for entry in data:
oid = entry['_id']
user = migrateUser(entry)
newdata[oid] = user
del user['password']
del user['notifications']
# del user['oauth']
del user['emailConfirmed']
del user['username']
del user['email']
export_data[user['slug']] = user
counter += 1
export_list = sorted(export_data.items(), key=lambda item: item[1]['rating'])[-10:]
open('migration/data/users.dict.json', 'w').write(json.dumps(newdata, cls=DateTimeEncoder)) # NOTE: by old_id
open('../src/data/authors.json', 'w').write(json.dumps(dict(export_list),
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(len(newdata.items())) + ' user accounts were migrated')
print(str(len(export_list)) + ' authors were exported')
tags_data = json.loads(open('migration/data/tags.json').read())
print(str(len(tags_data)) + ' tags loaded')
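Every json.dumps call below passes cls=DateTimeEncoder, which comes from the migration package and is not shown in this hunk; presumably it is a thin json.JSONEncoder subclass along these lines (a sketch, not the project's actual code):

import json
from datetime import datetime

class DateTimeEncoder(json.JSONEncoder):
    ''' render datetime values as ISO 8601 strings instead of raising TypeError '''
    def default(self, obj):
        if isinstance(obj, datetime):
            return obj.isoformat()
        return super().default(obj)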
def topics():
''' topics from categories and tags '''
print('migrating topics...')
cats_data = json.loads(open('migration/data/content_item_categories.json').read())
cat_topics = {}
slug_topics = {}
counter = 0
try:
for cat in cats_data:
topic = migrateCategory(cat)
cat_topics[topic['cat_id']] = topic
slug_topics[topic['slug']] = topic
counter += 1
except Exception as e:
print('cats exception, try to remove database first')
raise e
'''
try:
for tag in tag_data:
topic = migrateTag(tag)
newdata[topic['slug']] = topic
counter += 1
except Exception:
print('tags exception, try to remove database first')
raise Exception
'''
export_list = sorted(slug_topics.items(), key=lambda item: str(
item[1]['createdAt']))
open('migration/data/topics.dict.json','w').write(json.dumps(cat_topics,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
open('../src/data/topics.json', 'w').write(json.dumps(dict(export_list),
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
#' tags and ' + str(len(tag_data)) +
print(str(counter) + ' / ' + str(len(cats_data)) + ' migrated')
print(str(len(export_list)) + ' topics were exported')
print(str(len(cats_data)) + ' cats loaded')
topics_by_cat = {}
topics_by_tag = {}
topics_by_slug = {}
def shouts():
''' migrating content items one by one '''
print('loading shouts...')
counter = 0
discours_author = 0
content_data = json.loads(open('migration/data/content_items.json').read())
content_dict = { x['_id']:x for x in content_data }
newdata = {}
print(str(len(content_data)) + ' entries loaded. now migrating...')
errored = []
for entry in content_data:
try:
shout = migrateShout(entry)
newdata[shout['slug']] = shout
author = newdata[shout['slug']]['authors'][0]['slug']
line = str(counter+1) + ': ' + shout['slug'] + " @" + str(author)
print(line)
counter += 1
if author == 'discours':
discours_author += 1
open('./shouts.id.log', 'a').write(line + '\n')
except Exception as e:
print(entry['_id'])
errored.append(entry)
raise e
try:
limit = int(sys.argv[2]) if len(sys.argv) > 2 else len(content_data)
except ValueError:
limit = len(content_data)
open('migration/data/shouts.dict.json',
'w').write(json.dumps(newdata, cls=DateTimeEncoder))
print(str(counter) + '/' + str(len(content_data)) +
' content items were migrated')
print(str(discours_author) + ' of them by @discours')
def comments():
''' migrating comments on content items one by one '''
content_data = json.loads(open('migration/data/content_items.json').read()) # old content
content_dict = { x['_id']: x for x in content_data } # by old_id
shouts_dict = json.loads(open('migration/data/shouts.dict.json', 'r').read()) # all shouts by slug
print(str(len(shouts_dict.keys())) + ' migrated shouts loaded')
shouts_old = { x['old_id']: x for slug, x in shouts_dict.items() } # shouts by old_id
content_dict = { x['_id']: x for x in content_data }
print(str(len(content_data)) + ' content items loaded')
comments_data = json.loads(open('migration/data/comments.json').read()) # by slug
shouts_by_slug = {}
shouts_by_oid = {}
comments_data = json.loads(open('migration/data/comments.json').read())
print(str(len(comments_data)) + ' comments loaded')
comments_by_post = {}
# sort comments by old posts ids
for old_comment in comments_data:
cid = old_comment['contentItem']
comments_by_post[cid] = comments_by_post.get(cid, [])
comments_by_post[cid].append(old_comment)
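The loop above builds an old-post-id to comment-list index with the dict.get idiom; collections.defaultdict expresses the same grouping without the explicit get, shown here on illustrative comment dicts:

from collections import defaultdict

comments_data = [  # illustrative records; the real ones come from comments.json
    {'contentItem': 'oid-1', 'body': 'first'},
    {'contentItem': 'oid-1', 'body': 'second'},
    {'contentItem': 'oid-2', 'body': 'third'},
]
comments_by_post = defaultdict(list)
for old_comment in comments_data:
    comments_by_post[old_comment['contentItem']].append(old_comment)
assert len(comments_by_post['oid-1']) == 2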
# migrate comments
comments_by_shoutslug = {}
for content_item in content_data:
old_id = content_item['_id']
if content_item.get('commentedAt', False):
comments = [ migrateComment(c) for c in comments_by_post.get(old_id, []) ]
if len(comments) > 0:
shout = shouts_old.get(old_id, { 'slug': 'abandoned-comments' })
comments_by_shoutslug[shout['slug']] = comments
export_articles = json.loads(open('../src/data/articles.json').read())
print(str(len(export_articles.items())) + ' articles were exported')
export_comments = {}
c = 0
for slug, article in export_articles.items():
comments = comments_by_shoutslug.get(slug, [])
if len(comments) > 0:
export_comments[slug] = comments
c += len(comments)
print(str(len(export_comments.items())) + ' articles have comments')
open('../src/data/comments.json', 'w').write(json.dumps(dict(export_comments),
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(c) + ' comments were exported')
print(str(len(comments_by_post.keys())) + ' articles with comments')
export_articles = {} # slug: shout
export_authors = {} # slug: user
export_comments = {} # shout-slug: comment[] (list)
export_topics = {} # slug: topic
def extract_images(article):
''' extract b64 encoded images from markdown in article body '''
body = article['body']
images = []
matches = re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE)
for i, match in enumerate(matches, start=1):
ext = match.group(3)
link = '/static/upload/image-' + \
article['old_id'] + str(i) + '.' + ext
img = match.group(4)
if img not in images:
open('..' + link, 'wb').write(base64.b64decode(img))
images.append(img)
body = body.replace(match.group(2), link)
print(link)
article['body'] = body
return article
def users():
''' migrating users first '''
# limiting
limit = len(users_data)
if len(sys.argv) > 2: limit = int(sys.argv[2])
print('migrating %d users...' % limit)
counter = 0
for entry in users_data[:limit]: # apply the limit announced above
oid = entry['_id']
user = migrateUser(entry)
users_by_oid[oid] = user # full
del user['password']
del user['notifications']
# del user['oauth']
del user['emailConfirmed']
del user['username']
del user['email']
users_by_slug[user['slug']] = user # public
counter += 1
export_authors = dict(sorted(users_by_slug.items(), key=lambda item: item[1]['rating'])[-10:])
open('migration/data/users.old_id.json', 'w').write(json.dumps(users_by_oid, cls=DateTimeEncoder)) # NOTE: by old_id
open('migration/data/users.slug.json', 'w').write(json.dumps(users_by_slug, cls=DateTimeEncoder)) # NOTE: by slug
print(str(len(users_by_slug.items())) + ' users migrated')
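The export_authors line keeps only the ten highest-rated users: sorted() is ascending, so slicing off the last ten items yields the top ratings. A toy run of the idiom:

users_by_slug = {  # made-up ratings, just to show the slice
    'anna': {'rating': 5}, 'boris': {'rating': 1}, 'vera': {'rating': 9},
}
top = dict(sorted(users_by_slug.items(), key=lambda item: item[1]['rating'])[-10:])
# with fewer than ten users the slice returns them all, lowest-rated first
assert list(top) == ['boris', 'anna', 'vera']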
def topics():
''' topics from categories and tags '''
# limiting
limit = len(cats_data) + len(tags_data)
if len(sys.argv) > 2: limit = int(sys.argv[2])
print('migrating %d topics...' % limit)
counter = 0
for cat in cats_data:
try: topic = migrateCategory(cat)
except Exception as e: raise e
topics_by_cat[topic['cat_id']] = topic
topics_by_slug[topic['slug']] = topic
counter += 1
for tag in tags_data:
topic = migrateTag(tag)
topics_by_tag[topic['tag_id']] = topic
if not topics_by_slug.get(topic['slug']): topics_by_slug[topic['slug']] = topic
counter += 1
export_topics = dict(sorted(topics_by_slug.items(), key=lambda item: str(item[1]['createdAt']))) # NOTE: sorting does not work :)
open('migration/data/topics.slug.json','w').write(json.dumps(topics_by_slug,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
open('migration/data/topics.cat_id.json','w').write(json.dumps(topics_by_cat,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
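About the "sorting does not work" NOTE on export_topics: dict preserves insertion order as a language guarantee since Python 3.7, so dict(sorted(...)) does keep the sorted order; the more likely culprit is str(createdAt), which falls back to string comparison and misbehaves when createdAt is missing or mixed-type. A quick check, assuming datetime values:

from datetime import datetime

topics = {
    'late':  {'createdAt': datetime(2020, 1, 1)},
    'early': {'createdAt': datetime(2016, 3, 5)},
}
ordered = dict(sorted(topics.items(), key=lambda item: item[1]['createdAt']))
assert list(ordered) == ['early', 'late']  # insertion order survives in 3.7+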
def shouts():
''' migrating content items one by one '''
# limiting
limit = len(content_data)
if len(sys.argv) > 2: limit = int(sys.argv[2])
print('migrating %d content items...' % limit)
counter = 0
discours_author = 0
errored = []
# limiting
try: limit = int(sys.argv[2]) if len(sys.argv) > 2 else len(content_data)
except ValueError: limit = len(content_data)
for entry in content_data[:limit]:
try:
shout = migrateShout(entry, users_by_oid, topics_by_cat)
author = shout['authors'][0]
shout['authors'] = [ author.id, ]
shouts_by_slug[shout['slug']] = shout
shouts_by_oid[entry['_id']] = shout
line = str(counter+1) + ': ' + shout['slug'] + " @" + str(author.slug)
counter += 1
if author.slug == 'discours': discours_author += 1
print(line)
# open('./shouts.id.log', 'a').write(line + '\n')
except Exception as e:
print(entry['_id'])
errored.append(entry)
raise e
open('migration/data/shouts.old_id.json','w').write(json.dumps(shouts_by_oid, cls=DateTimeEncoder))
open('migration/data/shouts.slug.json','w').write(json.dumps(shouts_by_slug, cls=DateTimeEncoder))
print(str(counter) + '/' + str(len(content_data)) + ' content items were migrated')
print(str(discours_author) + ' authored by @discours')
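The argv-limit parsing appears twice inside shouts(); a small helper would keep it in one place (parse_limit is hypothetical, not part of this commit):

import sys

def parse_limit(default):
    ''' optional numeric limit from argv[2]; fall back to default on bad input '''
    try:
        return int(sys.argv[2]) if len(sys.argv) > 2 else default
    except (ValueError, IndexError):
        return default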
def export_shouts(shouts_by_slug, export_articles, export_authors):
# update what was just migrated or load json again
if len(export_authors.keys()) == 0:
export_authors = json.loads(open('../src/data/authors.json').read())
print(str(len(export_authors.items())) + ' exported authors loaded')
if len(export_articles.keys()) == 0:
export_articles = json.loads(open('../src/data/articles.json').read())
print(str(len(export_articles.items())) + ' exported articles loaded')
# limiting
limit = 33
if len(sys.argv) > 2: limit = int(sys.argv[2])
print('exporting %d articles to json...' % limit)
# filter
export_list = [i for i in shouts_by_slug.items() if i[1]['layout'] == 'article']
export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)
print(str(len(export_list)) + ' filtered')
export_list = export_list[:limit or len(export_list)]
for (slug, article) in export_list:
if article['layout'] == 'article':
export_slug(slug, export_articles, export_authors)
def export_body(article):
article = extract_images(article)
metadata = get_metadata(article)
content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
slug = article['slug'] # slug is not a parameter of export_body; read it from the article
open('../content/discours.io/'+slug+'.md', 'w').write(content)
open('../content/discours.io/'+slug+'.html', 'w').write(content_dict[article['old_id']]['body'])
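export_body relies on the python-frontmatter package: Post bundles body text with metadata, and dumps renders it as YAML front matter above the body. A standalone example with illustrative values:

import frontmatter

post = frontmatter.Post('Body text here', title='Sample', slug='sample-slug')
print(frontmatter.dumps(post))
# ---
# slug: sample-slug
# title: Sample
# ---
# Body text here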
def export_shouts(limit):
print('reading json...')
content_data = json.loads(open('migration/data/content_items.json').read())
content_dict = { x['_id']:x for x in content_data }
print(str(len(content_data)) + ' content items loaded')
newdata = json.loads(open('migration/data/shouts.dict.json', 'r').read())
print(str(len(newdata.keys())) + ' migrated shouts loaded')
users_old = json.loads(open('migration/data/users.dict.json').read())
print(str(len(users_old.keys())) + ' migrated users loaded')
export_authors = json.loads(open('../src/data/authors.json').read())
print(str(len(export_authors.items())) + ' exported authors loaded')
users_slug = { u['slug']: u for old_id, u in users_old.items()}
print(str(len(users_slug.items())) + ' users loaded')
export_list = [i for i in newdata.items() if i[1]['layout'] == 'article' and i[1]['published']]
export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)
print(str(len(export_list)) + ' filtered')
export_list = export_list[:limit or len(export_list)]
export_clean = {}
for (slug, article) in export_list:
if article['layout'] == 'article':
for author in article['authors']:
export_authors[author['slug']] = users_slug[author['slug']]
export_clean[article['slug']] = extract_images(article)
metadata = get_metadata(article)
content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
open('../content/discours.io/'+slug+'.md', 'w').write(content)
# print(slug)
open('../content/discours.io/'+slug+'.html', 'w').write(content_dict[article['old_id']]['body'])
open('../src/data/articles.json', 'w').write(json.dumps(dict(export_clean),
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(len(export_clean.items())) + ' articles exported')
open('../src/data/authors.json', 'w').write(json.dumps(export_authors,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
comments()
print(str(len(export_authors.items())) + ' total authors exported')
def export_slug(slug):
shouts_dict = json.loads(open('migration/data/shouts.dict.json').read())
print(str(len(shouts_dict.items())) + ' migrated shouts loaded')
users_old = json.loads(open('migration/data/users.dict.json').read()) # NOTE: this exact file is by old_id
print(str(len(users_old.items())) + ' migrated users loaded')
users_dict = { u['slug']: u for old_id, u in users_old.items() } # map slug to the user, not to the (old_id, user) pair
exported_authors = json.loads(open('../src/data/authors.json').read())
print(str(len(exported_authors.items())) + ' exported authors loaded')
exported_articles = json.loads(open('../src/data/articles.json').read())
print(str(len(exported_articles.items())) + ' exported articles loaded')
shout = shouts_dict.get(slug, False)
if shout:
author = users_dict.get(shout['authors'][0]['slug'], None)
def export_slug(slug, export_articles, export_authors):
if export_authors == {}:
export_authors = json.loads(open('../src/data/authors.json').read())
print(str(len(export_authors.items())) + ' exported authors loaded')
if export_articles == {}:
export_articles = json.loads(open('../src/data/articles.json').read())
print(str(len(export_articles.items())) + ' exported articles loaded')
shout = shouts_by_slug.get(slug, False)
assert shout, 'no data error'
author = users_by_slug.get(shout['authors'][0]['slug'], None)
export_authors.update({shout['authors'][0]['slug']: author})
export_articles.update({shout['slug']: shout})
print(shout)
open('../src/data/articles.json', 'w').write(json.dumps(exported_articles,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
open('../src/data/authors.json', 'w').write(json.dumps(exported_authors,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
else:
print('error: no shout found for this slug!')
# print(str(len(shouts_dict)) + ' shouts were migrated')
print(slug)
comments()
print('finished.')
export_body(shout)
comments([slug, ])
def comments(sluglist = []):
''' migrating comments on content items one by one '''
if len(sluglist) == 0:
export_articles = json.loads(open('../src/data/articles.json').read())
print(str(len(export_articles.items())) + ' articles were exported before')
if len(sluglist) == 0: sluglist = list(export_articles.keys())
if len(sluglist) > 0:
print('exporting comments for exact articles...')
for slug in sluglist:
shout = shouts_by_slug[slug]
old_id = shout['old_id']
content_item = content_dict.get(old_id, {})
if content_item.get('commentedAt', False):
comments = [ migrateComment(c) for c in comments_by_post.get(old_id, []) ]
if len(comments) > 0:
export_comments[slug] = comments
sys.stdout.write('.')
else:
print('exporting comments for top 10 commented articles...')
comments_by_shoutslug = {}
for content_item in content_data:
old_id = content_item['_id']
if content_item.get('commentedAt', False):
comments = [ migrateComment(c) for c in comments_by_post.get(old_id, []) ]
if len(comments) > 0:
shout = shouts_by_oid.get(old_id, { 'slug': 'abandoned-comments' })
comments_by_shoutslug[shout['slug']] = comments
top = dict(sorted(comments_by_shoutslug.items(), reverse=True, key=lambda c: len(c[1]))[:10])
export_comments.update(top)
print(str(len(export_comments.keys())) + ' articles with comments exported\n')
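The fallback branch ranks shout slugs by comment count and keeps the ten busiest; the same sorted/slice idiom on toy data:

comments_by_shoutslug = {  # illustrative; the real lists come from migrateComment
    'quiet-post': [{'body': 'one'}],
    'busy-post':  [{'body': 'a'}, {'body': 'b'}, {'body': 'c'}],
}
top = dict(sorted(comments_by_shoutslug.items(), reverse=True, key=lambda c: len(c[1]))[:10])
assert list(top) == ['busy-post', 'quiet-post']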
def export_finish(export_articles = {}, export_authors = {}, export_topics = {}, export_comments = {}):
open('../src/data/authors.json', 'w').write(json.dumps(export_authors,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(len(export_authors.items())) + ' authors exported')
open('../src/data/topics.json', 'w').write(json.dumps(export_topics,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(len(export_topics.keys())) + ' topics exported')
open('../src/data/articles.json', 'w').write(json.dumps(export_articles,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(len(export_articles.items())) + ' articles exported')
open('../src/data/comments.json', 'w').write(json.dumps(export_comments,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(len(export_comments.items())) + ' exported articles with comments')
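All of these writes pass ensure_ascii=False so that Cyrillic titles and bodies land in the JSON files as readable UTF-8 rather than \uXXXX escapes:

import json

print(json.dumps({'title': 'Дискурс'}))
# {"title": "\u0414\u0438\u0441\u043a\u0443\u0440\u0441"}
print(json.dumps({'title': 'Дискурс'}, ensure_ascii=False))
# {"title": "Дискурс"}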
if __name__ == '__main__':
import sys
if len(sys.argv) > 1:
- if sys.argv[1] == "users":
- users()
- elif sys.argv[1] == "topics":
- topics()
- elif sys.argv[1] == "shouts":
+ cmd = sys.argv[1]
+ if cmd == "users":
+ users(users_by_oid, users_by_slug, users_data, users_dict)
+ elif cmd == "topics":
+ topics(topics_by_cat, topics_by_tag, topics_by_slug)
+ elif cmd == "shouts":
try:
Community.create(**{
'slug': 'discours.io',
@@ -284,21 +296,30 @@ if __name__ == '__main__':
})
except Exception:
pass
- shouts()
- elif sys.argv[1] == "comments":
+ shouts(shouts_by_slug, shouts_by_oid) # NOTE: listens limit
+ elif cmd == "comments":
comments()
- elif sys.argv[1] == "export_shouts":
- limit = int(sys.argv[2]) if len(sys.argv) > 2 else None
- export_shouts(limit)
- elif sys.argv[1] == "all":
+ elif cmd == "export_shouts":
+ export_shouts(shouts_by_slug, export_articles, export_authors)
+ elif cmd == "all":
users()
topics()
shouts()
comments()
- elif sys.argv[1] == "bson":
+ elif cmd == "bson":
from migration import bson2json
bson2json.json_tables()
- elif sys.argv[1] == 'slug':
- export_slug(sys.argv[2])
+ elif cmd == 'slug':
+ export_slug(sys.argv[2], export_articles, export_authors)
+ export_finish(export_articles, export_authors, export_topics, export_comments)
else:
- print('usage: python migrate.py bson\n.. \ttopics <limit>\n.. \tusers <limit>\n.. \tshouts <limit>\n.. \tcomments\n.. \texport_shouts <limit>\n.. \tslug <slug>\n.. \tall>')
+ print('''
+ usage: python migrate.py bson
+ \n.. \ttopics <limit>
+ \n.. \tusers <limit>
+ \n.. \tshouts <limit>
+ \n.. \tcomments
+ \n.. \texport_shouts <limit>
+ \n.. \tslug <slug>
+ \n.. \tall
+ ''')
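Typical invocations, following the usage text above (limits are optional; command names are exactly the branches in the dispatch, and my-article is a placeholder slug):

python migrate.py bson               # convert the mongo BSON dumps to json tables
python migrate.py users 100          # migrate users, honoring the <limit> argument
python migrate.py shouts 50          # migrate up to 50 content items
python migrate.py slug my-article    # export a single article by slug
python migrate.py all                # run the full migration pipeline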