migration: new extract logic
parent 8491b12e45
commit 5b679f99e0

migrate.py (171 lines changed)
@@ -1,9 +1,7 @@
 ''' cmd managed migration '''
 import json
-import pprint
-import base64
-import re
 import frontmatter
+from migration.extract import extract
 from migration.tables.users import migrate as migrateUser
 from migration.tables.users import migrate_2stage as migrateUser_2stage
 from migration.tables.users import migrate_email_subscription
@@ -19,35 +17,14 @@ from dateutil.parser import parse as date_parse
 from orm.base import local_session
 from orm import User

-print = pprint.pprint
-IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,((.|\s)*?))\)"
 OLD_DATE = '2016-03-05 22:22:00.350000'


-def extract_images(article):
-    ''' extract b64 encoded images from markdown in article body '''
-    body = article['body']
-    images = []
-    matches = re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE)
-    for i, match in enumerate(matches, start=1):
-        ext = match.group(3)
-        link = 'discoursio-web/public/upload/image-' + \
-            article['old_id'] + str(i) + '.' + ext
-        img = match.group(4)
-        if img not in images:
-            open('../' + link, 'wb').write(base64.b64decode(img))
-            images.append(img)
-        body = body.replace(match.group(2), link)
-        print(link)
-    article['body'] = body
-    return article
-
-
 def users(users_by_oid, users_by_slug, users_data):
     ''' migrating users first '''
     # limiting
     limit = len(users_data)
     if len(sys.argv) > 2: limit = int(sys.argv[2])
-    print('migrating %d users...' % limit)
+    print('[migration] %d users...' % limit)
     counter = 0
     id_map = {}
@@ -63,16 +40,18 @@ def users(users_by_oid, users_by_slug, users_data):
         users_by_slug[user['slug']] = user # public
         id_map[user['old_id']] = user['slug']
         counter += 1
-    print(' - * - stage 2 users migration - * -')
+    # print(' - * - stage 2 users migration - * -')
+    ce = 0
     for entry in users_data:
-        migrateUser_2stage(entry, id_map)
-    try:
-        open('migration/data/users.old_id.json', 'w').write(json.dumps(users_by_oid, cls=DateTimeEncoder)) # NOTE: by old_id
-        open('migration/data/users.slug.json', 'w').write(json.dumps(users_by_slug, cls=DateTimeEncoder)) # NOTE: by slug
-        print(str(len(users_by_slug.items())) + ' users migrated')
-    except Exception:
-        print('json dump error')
-        # print(users_by_oid)
+        ce += migrateUser_2stage(entry, id_map)
+    # print(str(len(users_by_slug.items())) + ' users migrated')
+    print('[migration] %d user ratings errors' % ce)
+    #try:
+    #    open('migration/data/users.old_id.json', 'w').write(json.dumps(users_by_oid, cls=DateTimeEncoder)) # NOTE: by old_id
+    #    open('migration/data/users.slug.json', 'w').write(json.dumps(users_by_slug, cls=DateTimeEncoder)) # NOTE: by slug
+    #except Exception:
+    #    print('json dump error')
+    #    # print(users_by_oid)


 def topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data):
@@ -80,7 +59,7 @@ def topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data):
     # limiting
     limit = len(cats_data) + len(tags_data)
     if len(sys.argv) > 2: limit = int(sys.argv[2])
-    print('migrating %d topics...' % limit)
+    print('[migration] %d topics...' % limit)
     counter = 0
     retopics = json.loads(open('migration/tables/replacements.json').read())
     topicslugs_by_oid = {}
@@ -106,8 +85,8 @@ def topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data):
     for oid, oslug in topicslugs_by_oid.items():
         if topics_by_slug.get(oslug):
             topics_by_oid[oid] = topics_by_slug.get(retopics.get(oslug, oslug))
-    print( str(len(topics_by_oid.values())) + ' topics by oid' )
-    print( str(len(topics_by_slug.values())) + ' topics by slug' )
+    print( '[migration] ' + str(len(topics_by_oid.values())) + ' topics by oid' )
+    print( '[migration] ' + str(len(topics_by_slug.values())) + ' topics by slug' )
     #replacements = {} # json.loads(open('migration/tables/replacements.json').read())
     #for t in topics_by_title.values():
     #    slug = replacements.get(t['slug'].strip()) or t['slug'].strip()
@@ -121,32 +100,24 @@ def topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data):
     #    sort_keys=True,
     #    ensure_ascii=False))

-def shouts(content_data, shouts_by_slug, shouts_by_oid, oldtopics_by_oid):
+def shouts(content_data, shouts_by_slug, shouts_by_oid):
     ''' migrating content items one by one '''
     # limiting
     limit = len(content_data)
     if len(sys.argv) > 2: limit = int(sys.argv[2])
-    print('migrating %d content items...' % limit)
+    print('[migration] %d content items...' % limit)
     counter = 0
     discours_author = 0
     errored = []
-
+    pub_counter = 0
     # limiting
     try: limit = int(sys.argv[2]) if len(sys.argv) > 2 else len(content_data)
     except ValueError: limit = len(content_data)
-    te = {}
     for entry in content_data[:limit]:
+        if 'slug' in sys.argv and entry['slug'] not in sys.argv: continue
         try:
             shout, terrors = migrateShout(entry, users_by_oid, topics_by_oid)
-            for oid in terrors:
-                if not te.get(oid):
-                    if oldtopics_by_oid.get(oid):
-                        te[oldtopics_by_oid[oid]['slug']] = []
-                    else:
-                        # print('lost old topic id: ' + oid)
-                        pass
-                else:
-                    te[oid].append(shout['slug'])
+            if entry.get('published'): pub_counter += 1
             author = shout['authors'][0]
             shout['authors'] = [ author.id, ]
             newtopics = []
@@ -156,7 +127,6 @@ def shouts(content_data, shouts_by_slug, shouts_by_oid, oldtopics_by_oid):
                 if nt not in newtopics:
                     newtopics.append(nt)
             shout['topics'] = newtopics
-            shout = extract_images(shout)
             shouts_by_slug[shout['slug']] = shout
             shouts_by_oid[entry['_id']] = shout
             line = str(counter+1) + ': ' + shout['slug'] + " @" + str(author.slug)
@@ -165,33 +135,34 @@ def shouts(content_data, shouts_by_slug, shouts_by_oid, oldtopics_by_oid):
             print(line)
             # open('./shouts.id.log', 'a').write(line + '\n')
         except Exception as e:
-            print(entry['_id'])
+            # print(entry['_id'])
             errored.append(entry)
             raise e
-    print(te)
-    open('migration/data/shouts.old_id.json','w').write(json.dumps(shouts_by_oid, cls=DateTimeEncoder))
-    open('migration/data/shouts.slug.json','w').write(json.dumps(shouts_by_slug, cls=DateTimeEncoder))
-    print(str(counter) + '/' + str(len(content_data)) + ' content items were migrated')
-    print(str(discours_author) + ' authored by @discours')
+    # print(te)
+    # open('migration/data/shouts.old_id.json','w').write(json.dumps(shouts_by_oid, cls=DateTimeEncoder))
+    # open('migration/data/shouts.slug.json','w').write(json.dumps(shouts_by_slug, cls=DateTimeEncoder))
+    print('[migration] ' + str(counter) + ' content items were migrated')
+    print('[migration] ' + str(pub_counter) + ' have been published')
+    print('[migration] ' + str(discours_author) + ' authored by @discours')

 def export_shouts(shouts_by_slug, export_articles, export_authors, content_dict):
     # update what was just migrated or load json again
     if len(export_authors.keys()) == 0:
         export_authors = json.loads(open('../src/data/authors.json').read())
-        print(str(len(export_authors.items())) + ' exported authors loaded')
+        print('[migration] ' + str(len(export_authors.items())) + ' exported authors loaded')
     if len(export_articles.keys()) == 0:
         export_articles = json.loads(open('../src/data/articles.json').read())
-        print(str(len(export_articles.items())) + ' exported articles loaded')
+        print('[migration] ' + str(len(export_articles.items())) + ' exported articles loaded')

     # limiting
     limit = 33
     if len(sys.argv) > 2: limit = int(sys.argv[2])
-    print('exporting %d articles to json...' % limit)
+    print('[migration] ' + 'exporting %d articles to json...' % limit)

     # filter
     export_list = [i for i in shouts_by_slug.items() if i[1]['layout'] == 'article']
     export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)
-    print(str(len(export_list)) + ' filtered')
+    print('[migration] ' + str(len(export_list)) + ' filtered')
     export_list = export_list[:limit or len(export_list)]

     for (slug, article) in export_list:
@@ -199,20 +170,20 @@ def export_shouts(shouts_by_slug, export_articles, export_authors, content_dict)
         export_slug(slug, export_articles, export_authors, content_dict)

 def export_body(article, content_dict):
-    article = extract_images(article)
+    article['body'] = extract(article['body'], article['oid'])
     metadata = get_metadata(article)
     content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
-    open('../discoursio-web/content/' + slug + '.mdx', 'w').write(content)
-    # open('../discoursio-web/content/'+slug+'.html', 'w').write(content_dict[article['old_id']]['body'])
+    open('../discoursio-web/content/' + article['slug'] + '.mdx', 'w').write(content)
+    open('../discoursio-web/content/'+ article['slug'] + '.html', 'w').write(content_dict[article['old_id']]['body'])

 def export_slug(slug, export_articles, export_authors, content_dict):
-    print('exporting %s ' % slug)
+    print('[migration] ' + 'exporting %s ' % slug)
     if export_authors == {}:
         export_authors = json.loads(open('../src/data/authors.json').read())
-        print(str(len(export_authors.items())) + ' exported authors loaded')
+        print('[migration] ' + str(len(export_authors.items())) + ' exported authors loaded')
     if export_articles == {}:
         export_articles = json.loads(open('../src/data/articles.json').read())
-        print(str(len(export_articles.items())) + ' exported articles loaded')
+        print('[migration] ' + str(len(export_articles.items())) + ' exported articles loaded')

     shout = shouts_by_slug.get(slug, False)
     assert shout, 'no data error'
@@ -233,12 +204,14 @@ def comments(comments_data):
         id_map[old_id] = id
     for comment in comments_data:
         migrateComment_2stage(comment, id_map)
-    print(str(len(id_map)) + ' comments exported')
+    print('[migration] ' + str(len(id_map)) + ' comments exported')

-def export_email_subscriptions(email_subscriptions_data):
+def export_email_subscriptions():
+    email_subscriptions_data = json.loads(open('migration/data/email_subscriptions.json').read())
+    print('[migration] ' + str(len(email_subscriptions_data)) + ' email subscriptions loaded')
     for data in email_subscriptions_data:
         migrate_email_subscription(data)
-    print(str(len(email_subscriptions_data)) + ' email subscriptions exported')
+    print('[migration] ' + str(len(email_subscriptions_data)) + ' email subscriptions exported')


 def export_finish(export_articles = {}, export_authors = {}, export_topics = {}, export_comments = {}):
@@ -247,26 +220,26 @@ def export_finish(export_articles = {}, export_authors = {}, export_topics = {},
         indent=4,
         sort_keys=True,
         ensure_ascii=False))
-    print(str(len(export_authors.items())) + ' authors exported')
+    print('[migration] ' + str(len(export_authors.items())) + ' authors exported')
     open('../src/data/topics.json', 'w').write(json.dumps(export_topics,
         cls=DateTimeEncoder,
         indent=4,
         sort_keys=True,
         ensure_ascii=False))
-    print(str(len(export_topics.keys())) + ' topics exported')
+    print('[migration] ' + str(len(export_topics.keys())) + ' topics exported')

     open('../src/data/articles.json', 'w').write(json.dumps(export_articles,
         cls=DateTimeEncoder,
         indent=4,
         sort_keys=True,
         ensure_ascii=False))
-    print(str(len(export_articles.items())) + ' articles exported')
+    print('[migration] ' + str(len(export_articles.items())) + ' articles exported')
     open('../src/data/comments.json', 'w').write(json.dumps(export_comments,
         cls=DateTimeEncoder,
         indent=4,
         sort_keys=True,
         ensure_ascii=False))
-    print(str(len(export_comments.items())) + ' exported articles with comments')
+    print('[migration] ' + str(len(export_comments.items())) + ' exported articles with comments')


 if __name__ == '__main__':
@@ -280,10 +253,10 @@ if __name__ == '__main__':
         bson2json.json_tables()
     else:
         # preparing data

         # users
         users_data = json.loads(open('migration/data/users.json').read())
-        print(str(len(users_data)) + ' users loaded')
+        print('[migration] ' + str(len(users_data)) + ' users loaded')
         users_by_oid = {}
         users_by_slug = {}
         user_id_map = {}
@@ -294,10 +267,10 @@ if __name__ == '__main__':
             users_by_oid[user.old_id] = vars(user)
         # tags
         tags_data = json.loads(open('migration/data/tags.json').read())
-        print(str(len(tags_data)) + ' tags loaded')
+        print('[migration] ' + str(len(tags_data)) + ' tags loaded')
         # cats
         cats_data = json.loads(open('migration/data/content_item_categories.json').read())
-        print(str(len(cats_data)) + ' cats loaded')
+        print('[migration] ' + str(len(cats_data)) + ' cats loaded')
         topics_data = tags_data
         tags_data.extend(cats_data)
         oldtopics_by_oid = { x['_id']: x for x in topics_data }
@@ -308,12 +281,12 @@ if __name__ == '__main__':
         # content
         content_data = json.loads(open('migration/data/content_items.json').read())
         content_dict = { x['_id']: x for x in content_data }
-        print(str(len(content_data)) + ' content items loaded')
+        print('[migration] ' + str(len(content_data)) + ' content items loaded')
         shouts_by_slug = {}
         shouts_by_oid = {}

         comments_data = json.loads(open('migration/data/comments.json').read())
-        print(str(len(comments_data)) + ' comments loaded')
+        print('[migration] ' + str(len(comments_data)) + ' comments loaded')
         comments_by_post = {}
         # sort comments by old posts ids
         for old_comment in comments_data:
@@ -321,10 +294,7 @@ if __name__ == '__main__':
             comments_by_post[cid] = comments_by_post.get(cid, [])
             if not old_comment.get('deletedAt', True):
                 comments_by_post[cid].append(old_comment)
-        print(str(len(comments_by_post.keys())) + ' articles with comments')
+        print('[migration] ' + str(len(comments_by_post.keys())) + ' articles with comments')

-        email_subscriptions_data = json.loads(open('migration/data/email_subscriptions.json').read())
-        print(str(len(email_subscriptions_data)) + ' email subscriptions loaded')
-
         export_articles = {} # slug: shout
         export_authors = {} # slug: user
@@ -338,29 +308,32 @@ if __name__ == '__main__':
         elif cmd == "topics":
             topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data)
         elif cmd == "shouts":
-            shouts(content_data, shouts_by_slug, shouts_by_oid, oldtopics_by_oid) # NOTE: listens limit
+            shouts(content_data, shouts_by_slug, shouts_by_oid) # NOTE: listens limit
         elif cmd == "comments":
             comments(comments_data)
         elif cmd == "export_shouts":
             export_shouts(shouts_by_slug, export_articles, export_authors, content_dict)
         elif cmd == "email_subscriptions":
-            export_email_subscriptions(email_subscriptions_data)
+            export_email_subscriptions()
+        elif cmd == 'slug':
+            export_slug(sys.argv[2], export_articles, export_authors, content_dict)
         elif cmd == "all":
             users(users_by_oid, users_by_slug, users_data)
             topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data)
-            shouts(content_data, shouts_by_slug, shouts_by_oid, oldtopics_by_oid)
+            shouts(content_data, shouts_by_slug, shouts_by_oid)
             comments(comments_data)
-            export_email_subscriptions(email_subscriptions_data)
-        elif cmd == 'slug':
-            export_slug(sys.argv[2], export_articles, export_authors, content_dict)
+            export_email_subscriptions()
+        else:
+            print('[migration] --- debug users, topics, shouts')
+            users(users_by_oid, users_by_slug, users_data)
+            topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data)
+            shouts(content_data, shouts_by_slug, shouts_by_oid)
         #export_finish(export_articles, export_authors, export_topics, export_comments)
     else:
-        print('''
-usage: python migrate.py bson
-\n.. \ttopics <limit>
-\n.. \tusers <limit>
-\n.. \tshouts <limit>
-\n.. \texport_shouts <limit>
-\n.. \tslug <slug>
-\n.. \tall
-''')
+        print('usage: python migrate.py bson')
+        print('.. \ttopics <limit>')
+        print('.. \tusers <limit>')
+        print('.. \tshouts <limit>')
+        print('.. \texport_shouts <limit>')
+        print('.. \tslug <slug>')
+        print('.. \tall')
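For orientation, a hedged sketch of the key call-site change in export_body. The article values below are invented for illustration; the commit itself only changes the lines shown in the hunks above.

    # Illustrative only, not part of the commit.
    # The old in-place helper is gone:
    #     article = extract_images(article)
    # The body text plus the old MongoDB id ("oid") now go through the new module.
    from migration.extract import extract  # import added in the first hunk

    article = {'slug': 'sample', 'oid': '55f5e8a9d7a1a7d35ad0a000', 'body': '...markdown...'}  # assumed sample data
    article['body'] = extract(article['body'], article['oid'])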
migration/extract.py (new file, 154 lines)
@@ -0,0 +1,154 @@
+import re
+import base64
+
+TOOLTIP_REGEX = r'(\/\/\/(.+)\/\/\/)'
+
+
+def replace_tooltips(body):
+    newbody = body
+    matches = list(re.finditer(TOOLTIP_REGEX, body, re.IGNORECASE | re.MULTILINE))[1:]
+    for match in matches:
+        newbody = body.replace(match.group(1), '<Tooltip text="' + match.group(2) + '" />') # FIXME: doesn't work
+    if len(matches) > 0:
+        print('[extract] found %d tooltips' % len(matches))
+    return newbody
+
+
+def place_tooltips(body):
+    parts = body.split('///')
+    l = len(parts)
+    newparts = list(parts)
+    if l & 1:
+        if l > 1:
+            i = 1
+            print('[extract] found %d tooltips' % (l-1))
+            for part in parts[1:]:
+                if i & 1:
+                    # print('[extract] tooltip: ' + part)
+                    if 'a class="footnote-url" href=' in part:
+                        fn = 'a class="footnote-url" href="'
+                        link = part.split(fn,1)[1].split('"', 1)[0]
+                        extracted_part = part.split(fn,1)[0] + ' ' + part.split('/', 1)[-1]
+                        newparts[i] = '<Tooltip text="' + extracted_part + '" link="' + link + '" />'
+                    else:
+                        newparts[i] = '<Tooltip text="%s" />' % part
+                    # print('[extract] tooltip: ' + newparts[i])
+                else:
+                    # print('[extract] pass: ' + part[:10] + '..')
+                    newparts[i] = part
+                i += 1
+
+    return ''.join(newparts)
+
+IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,((?:[A-Za-z\d+\/]{4})*(?:[A-Za-z\d+\/]{3}=|[A-Za-z\d+\/]{2}==)))\)"
+public = '../discoursio-web/public'
+cdn = 'https://assets.discours.io'
+cache = {}
+
+
+def reextract_images(body, oid):
+    matches = list(re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE))[1:]
+    i = 0
+    for match in matches:
+        print('[extract] image ' + match.group(1))
+        ext = match.group(3)
+        name = oid + str(i)
+        link = public + '/upload/image-' + name + '.' + ext
+        img = match.group(4)
+        title = match.group(1) # FIXME: this is not the title
+        if img not in cache:
+            content = base64.b64decode(img + '==')
+            print(str(len(img)) + ' image bytes been written')
+            open('../' + link, 'wb').write(content)
+            cache[img] = name
+            i += 1
+        else:
+            print('[extract] image cached ' + cache[img])
+        body.replace(str(match), '') # FIXME: this does not work
+    return body
+
+IMAGES = {
+    'data:image/png': 'png',
+    'data:image/jpg': 'jpg',
+    'data:image/jpeg': 'jpg',
+}
+
+sep = ';base64,'
+
+
+def extract_images(body, oid):
+    newbody = ''
+    body = body.replace(' [](data:image', '![](data:image').replace('\n[](data:image', '![](data:image')
+    oldparts = body.split(sep)
+    newparts = list(oldparts)
+    print()
+    if len(oldparts) > 1:
+        print('[extract] images for %s' % oid)
+        print('[extract] %d candidates' % (len(oldparts)-1))
+        i = 0
+        for current in oldparts:
+            next = ''
+            try: next = oldparts[i+1]
+            except: newbody += current
+            start = oldparts.index(current) == 0
+            end = not next
+            if end:
+                continue
+            else: # start or between
+                # print('[extract_images] have next')
+                for mime in IMAGES.keys():
+                    if mime in current[-15:]:
+                        # print('[extract_images] found proper mime type')
+                        print('[extract] ' + current[-15:])
+                        if ')' in next:
+                            b64encoded = next.split(')')[0]
+                            print('[extract] '+str(i+1)+': %d bytes' % len(b64encoded))
+                            # print(meta)
+                            ext = IMAGES[mime]
+                            print('[extract] type: ' + mime)
+                            name = oid + '-' + str(i)
+                            print('[extract] name: ' + name)
+                            link = '/upload/image-' + name + '.' + ext
+                            print('[extract] link: ' + link)
+                            if b64encoded:
+                                if b64encoded not in cache:
+                                    content = base64.b64decode(b64encoded + '==')
+                                    open(public + link, 'wb').write(content)
+                                    cache[b64encoded] = name
+                                else:
+                                    print('[extract] cached: ' + cache[b64encoded])
+                                    name = cache[b64encoded]
+                                    link = cdn + '/upload/image-' + name + '.' + ext
+                                newparts[i] = current.split('![](')[0] + '![](' + link + ')'
+                                newparts[i+1] = next.replace(b64encoded + ')', '')
+                            else:
+                                print('[extract] not b64encoded')
+                                print(current[-15:])
+            i += 1
+    newbody = ''.join(newparts)
+    return newbody
+
+
+def cleanup(body):
+    newbody = body\
+        .replace('<', '').replace('>', '')\
+        .replace('{', '(').replace('}', ')')\
+        .replace('…', '...')\
+        .replace(' __ ', ' ')\
+        .replace('_ _', ' ')\
+        .replace('****', '')\
+        .replace('\u00a0', ' ')\
+        .replace('\u02c6', '^')\
+        .replace('\u00a0',' ')\
+        .replace('\ufeff', '')\
+        .replace('\u200b', '')\
+        .replace('\u200c', '')\
+        # .replace('\u2212', '-')
+    return newbody
+
+
+def extract(body, oid):
+    newbody = extract_images(body, oid)
+    newbody = cleanup(newbody)
+    newbody = place_tooltips(newbody)
+    return newbody
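A small usage sketch of the new module, assuming it runs from the backend repository root with ../discoursio-web/public/upload/ present. The sample body, the oid, and the expected result are illustrative, not from the commit.

    # Illustrative only. extract() applies three passes in order:
    #   1. extract_images()  decodes base64-embedded images, writes them under
    #                        ../discoursio-web/public/upload/ and rewrites the markdown links
    #   2. cleanup()         normalizes punctuation and strips invisible unicode characters
    #   3. place_tooltips()  turns ///...///-delimited spans into <Tooltip /> tags
    from migration.extract import extract

    sample = 'Intro ///a footnote/// and ![](data:image/png;base64,iVBORw0KGgoAAA)'
    print(extract(sample, 'demo-oid'))
    # expected shape of the returned text (assumed sample values):
    # Intro <Tooltip text="a footnote" /> and ![](/upload/image-demo-oid-0.png)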