migration: new extract logic

tonyrewin 2022-07-03 03:58:41 +03:00
parent 8491b12e45
commit 5b679f99e0
2 changed files with 226 additions and 99 deletions


@@ -1,9 +1,7 @@
 ''' cmd managed migration '''
 import json
-import pprint
-import base64
-import re
 import frontmatter
+from migration.extract import extract
 from migration.tables.users import migrate as migrateUser
 from migration.tables.users import migrate_2stage as migrateUser_2stage
 from migration.tables.users import migrate_email_subscription
@@ -19,35 +17,14 @@ from dateutil.parser import parse as date_parse
 from orm.base import local_session
 from orm import User
-print = pprint.pprint
-IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,((.|\s)*?))\)"
 OLD_DATE = '2016-03-05 22:22:00.350000'
-def extract_images(article):
-    ''' extract b64 encoded images from markdown in article body '''
-    body = article['body']
-    images = []
-    matches = re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE)
-    for i, match in enumerate(matches, start=1):
-        ext = match.group(3)
-        link = 'discoursio-web/public/upload/image-' + \
-            article['old_id'] + str(i) + '.' + ext
-        img = match.group(4)
-        if img not in images:
-            open('../' + link, 'wb').write(base64.b64decode(img))
-            images.append(img)
-        body = body.replace(match.group(2), link)
-        print(link)
-    article['body'] = body
-    return article
 def users(users_by_oid, users_by_slug, users_data):
     ''' migrating users first '''
     # limiting
     limit = len(users_data)
     if len(sys.argv) > 2: limit = int(sys.argv[2])
-    print('migrating %d users...' % limit)
+    print('[migration] %d users...' % limit)
     counter = 0
     id_map = {}
     for entry in users_data:
@@ -63,16 +40,18 @@ def users(users_by_oid, users_by_slug, users_data):
         users_by_slug[user['slug']] = user # public
         id_map[user['old_id']] = user['slug']
         counter += 1
-    print(' - * - stage 2 users migration - * -')
+    # print(' - * - stage 2 users migration - * -')
+    ce = 0
     for entry in users_data:
-        migrateUser_2stage(entry, id_map)
-    try:
-        open('migration/data/users.old_id.json', 'w').write(json.dumps(users_by_oid, cls=DateTimeEncoder)) # NOTE: by old_id
-        open('migration/data/users.slug.json', 'w').write(json.dumps(users_by_slug, cls=DateTimeEncoder)) # NOTE: by slug
-        print(str(len(users_by_slug.items())) + ' users migrated')
-    except Exception:
-        print('json dump error')
-        # print(users_by_oid)
+        ce += migrateUser_2stage(entry, id_map)
+    # print(str(len(users_by_slug.items())) + ' users migrated')
+    print('[migration] %d user ratings errors' % ce)
+    #try:
+    #    open('migration/data/users.old_id.json', 'w').write(json.dumps(users_by_oid, cls=DateTimeEncoder)) # NOTE: by old_id
+    #    open('migration/data/users.slug.json', 'w').write(json.dumps(users_by_slug, cls=DateTimeEncoder)) # NOTE: by slug
+    #except Exception:
+    #    print('json dump error')
+    #    # print(users_by_oid)
 def topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data):
@@ -80,7 +59,7 @@ def topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data):
     # limiting
     limit = len(cats_data) + len(tags_data)
     if len(sys.argv) > 2: limit = int(sys.argv[2])
-    print('migrating %d topics...' % limit)
+    print('[migration] %d topics...' % limit)
     counter = 0
     retopics = json.loads(open('migration/tables/replacements.json').read())
     topicslugs_by_oid = {}
@@ -106,8 +85,8 @@ def topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data):
     for oid, oslug in topicslugs_by_oid.items():
         if topics_by_slug.get(oslug):
             topics_by_oid[oid] = topics_by_slug.get(retopics.get(oslug, oslug))
-    print( str(len(topics_by_oid.values())) + ' topics by oid' )
-    print( str(len(topics_by_slug.values())) + ' topics by slug' )
+    print( '[migration] ' + str(len(topics_by_oid.values())) + ' topics by oid' )
+    print( '[migration] ' + str(len(topics_by_slug.values())) + ' topics by slug' )
     #replacements = {} # json.loads(open('migration/tables/replacements.json').read())
     #for t in topics_by_title.values():
     #    slug = replacements.get(t['slug'].strip()) or t['slug'].strip()
@@ -121,32 +100,24 @@ def topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data):
     #    sort_keys=True,
     #    ensure_ascii=False))
-def shouts(content_data, shouts_by_slug, shouts_by_oid, oldtopics_by_oid):
+def shouts(content_data, shouts_by_slug, shouts_by_oid):
     ''' migrating content items one by one '''
     # limiting
     limit = len(content_data)
     if len(sys.argv) > 2: limit = int(sys.argv[2])
-    print('migrating %d content items...' % limit)
+    print('[migration] %d content items...' % limit)
     counter = 0
     discours_author = 0
     errored = []
+    pub_counter = 0
     # limiting
     try: limit = int(sys.argv[2]) if len(sys.argv) > 2 else len(content_data)
     except ValueError: limit = len(content_data)
-    te = {}
     for entry in content_data[:limit]:
-        if 'slug' in sys.argv and entry['slug'] not in sys.argv: continue
         try:
             shout, terrors = migrateShout(entry, users_by_oid, topics_by_oid)
-            for oid in terrors:
-                if not te.get(oid):
-                    if oldtopics_by_oid.get(oid):
-                        te[oldtopics_by_oid[oid]['slug']] = []
-                    else:
-                        # print('lost old topic id: ' + oid)
-                        pass
-                else:
-                    te[oid].append(shout['slug'])
+            if entry.get('published'): pub_counter += 1
             author = shout['authors'][0]
             shout['authors'] = [ author.id, ]
             newtopics = []
@@ -156,7 +127,6 @@ def shouts(content_data, shouts_by_slug, shouts_by_oid, oldtopics_by_oid):
                 if nt not in newtopics:
                     newtopics.append(nt)
             shout['topics'] = newtopics
-            shout = extract_images(shout)
             shouts_by_slug[shout['slug']] = shout
             shouts_by_oid[entry['_id']] = shout
             line = str(counter+1) + ': ' + shout['slug'] + " @" + str(author.slug)
@@ -165,33 +135,34 @@ def shouts(content_data, shouts_by_slug, shouts_by_oid, oldtopics_by_oid):
             print(line)
             # open('./shouts.id.log', 'a').write(line + '\n')
         except Exception as e:
-            print(entry['_id'])
+            # print(entry['_id'])
             errored.append(entry)
             raise e
-    print(te)
-    open('migration/data/shouts.old_id.json','w').write(json.dumps(shouts_by_oid, cls=DateTimeEncoder))
-    open('migration/data/shouts.slug.json','w').write(json.dumps(shouts_by_slug, cls=DateTimeEncoder))
-    print(str(counter) + '/' + str(len(content_data)) + ' content items were migrated')
-    print(str(discours_author) + ' authored by @discours')
+    # print(te)
+    # open('migration/data/shouts.old_id.json','w').write(json.dumps(shouts_by_oid, cls=DateTimeEncoder))
+    # open('migration/data/shouts.slug.json','w').write(json.dumps(shouts_by_slug, cls=DateTimeEncoder))
+    print('[migration] ' + str(counter) + ' content items were migrated')
+    print('[migration] ' + str(pub_counter) + ' have been published')
+    print('[migration] ' + str(discours_author) + ' authored by @discours')
 def export_shouts(shouts_by_slug, export_articles, export_authors, content_dict):
     # update what was just migrated or load json again
     if len(export_authors.keys()) == 0:
         export_authors = json.loads(open('../src/data/authors.json').read())
-        print(str(len(export_authors.items())) + ' exported authors loaded')
+        print('[migration] ' + str(len(export_authors.items())) + ' exported authors loaded')
     if len(export_articles.keys()) == 0:
         export_articles = json.loads(open('../src/data/articles.json').read())
-        print(str(len(export_articles.items())) + ' exported articles loaded')
+        print('[migration] ' + str(len(export_articles.items())) + ' exported articles loaded')
     # limiting
     limit = 33
     if len(sys.argv) > 2: limit = int(sys.argv[2])
-    print('exporting %d articles to json...' % limit)
+    print('[migration] ' + 'exporting %d articles to json...' % limit)
     # filter
     export_list = [i for i in shouts_by_slug.items() if i[1]['layout'] == 'article']
     export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)
-    print(str(len(export_list)) + ' filtered')
+    print('[migration] ' + str(len(export_list)) + ' filtered')
     export_list = export_list[:limit or len(export_list)]
     for (slug, article) in export_list:
@@ -199,20 +170,20 @@ def export_shouts(shouts_by_slug, export_articles, export_authors, content_dict)
         export_slug(slug, export_articles, export_authors, content_dict)
 def export_body(article, content_dict):
-    article = extract_images(article)
+    article['body'] = extract(article['body'], article['oid'])
     metadata = get_metadata(article)
     content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
-    open('../discoursio-web/content/' + slug + '.mdx', 'w').write(content)
-    # open('../discoursio-web/content/'+slug+'.html', 'w').write(content_dict[article['old_id']]['body'])
+    open('../discoursio-web/content/' + article['slug'] + '.mdx', 'w').write(content)
+    open('../discoursio-web/content/'+ article['slug'] + '.html', 'w').write(content_dict[article['old_id']]['body'])
 def export_slug(slug, export_articles, export_authors, content_dict):
-    print('exporting %s ' % slug)
+    print('[migration] ' + 'exporting %s ' % slug)
     if export_authors == {}:
         export_authors = json.loads(open('../src/data/authors.json').read())
-        print(str(len(export_authors.items())) + ' exported authors loaded')
+        print('[migration] ' + str(len(export_authors.items())) + ' exported authors loaded')
     if export_articles == {}:
         export_articles = json.loads(open('../src/data/articles.json').read())
-        print(str(len(export_articles.items())) + ' exported articles loaded')
+        print('[migration] ' + str(len(export_articles.items())) + ' exported articles loaded')
     shout = shouts_by_slug.get(slug, False)
     assert shout, 'no data error'
@@ -233,12 +204,14 @@ def comments(comments_data):
         id_map[old_id] = id
     for comment in comments_data:
         migrateComment_2stage(comment, id_map)
-    print(str(len(id_map)) + ' comments exported')
-def export_email_subscriptions(email_subscriptions_data):
+    print('[migration] ' + str(len(id_map)) + ' comments exported')
+def export_email_subscriptions():
+    email_subscriptions_data = json.loads(open('migration/data/email_subscriptions.json').read())
+    print('[migration] ' + str(len(email_subscriptions_data)) + ' email subscriptions loaded')
     for data in email_subscriptions_data:
         migrate_email_subscription(data)
-    print(str(len(email_subscriptions_data)) + ' email subscriptions exported')
+    print('[migration] ' + str(len(email_subscriptions_data)) + ' email subscriptions exported')
 def export_finish(export_articles = {}, export_authors = {}, export_topics = {}, export_comments = {}):
@@ -247,26 +220,26 @@ def export_finish(export_articles = {}, export_authors = {}, export_topics = {}, export_comments = {}):
         indent=4,
         sort_keys=True,
         ensure_ascii=False))
-    print(str(len(export_authors.items())) + ' authors exported')
+    print('[migration] ' + str(len(export_authors.items())) + ' authors exported')
     open('../src/data/topics.json', 'w').write(json.dumps(export_topics,
         cls=DateTimeEncoder,
         indent=4,
         sort_keys=True,
         ensure_ascii=False))
-    print(str(len(export_topics.keys())) + ' topics exported')
+    print('[migration] ' + str(len(export_topics.keys())) + ' topics exported')
     open('../src/data/articles.json', 'w').write(json.dumps(export_articles,
         cls=DateTimeEncoder,
         indent=4,
         sort_keys=True,
         ensure_ascii=False))
-    print(str(len(export_articles.items())) + ' articles exported')
+    print('[migration] ' + str(len(export_articles.items())) + ' articles exported')
     open('../src/data/comments.json', 'w').write(json.dumps(export_comments,
         cls=DateTimeEncoder,
         indent=4,
         sort_keys=True,
         ensure_ascii=False))
-    print(str(len(export_comments.items())) + ' exported articles with comments')
+    print('[migration] ' + str(len(export_comments.items())) + ' exported articles with comments')
 if __name__ == '__main__':
@@ -280,10 +253,10 @@ if __name__ == '__main__':
             bson2json.json_tables()
         else:
             # preparing data
             # users
             users_data = json.loads(open('migration/data/users.json').read())
-            print(str(len(users_data)) + ' users loaded')
+            print('[migration] ' + str(len(users_data)) + ' users loaded')
             users_by_oid = {}
             users_by_slug = {}
             user_id_map = {}
@@ -294,10 +267,10 @@ if __name__ == '__main__':
                 users_by_oid[user.old_id] = vars(user)
             # tags
             tags_data = json.loads(open('migration/data/tags.json').read())
-            print(str(len(tags_data)) + ' tags loaded')
+            print('[migration] ' + str(len(tags_data)) + ' tags loaded')
             # cats
             cats_data = json.loads(open('migration/data/content_item_categories.json').read())
-            print(str(len(cats_data)) + ' cats loaded')
+            print('[migration] ' + str(len(cats_data)) + ' cats loaded')
             topics_data = tags_data
             tags_data.extend(cats_data)
             oldtopics_by_oid = { x['_id']: x for x in topics_data }
@@ -308,12 +281,12 @@ if __name__ == '__main__':
             # content
             content_data = json.loads(open('migration/data/content_items.json').read())
             content_dict = { x['_id']: x for x in content_data }
-            print(str(len(content_data)) + ' content items loaded')
+            print('[migration] ' + str(len(content_data)) + ' content items loaded')
             shouts_by_slug = {}
             shouts_by_oid = {}
             comments_data = json.loads(open('migration/data/comments.json').read())
-            print(str(len(comments_data)) + ' comments loaded')
+            print('[migration] ' + str(len(comments_data)) + ' comments loaded')
             comments_by_post = {}
             # sort comments by old posts ids
             for old_comment in comments_data:
@@ -321,10 +294,7 @@ if __name__ == '__main__':
                 comments_by_post[cid] = comments_by_post.get(cid, [])
                 if not old_comment.get('deletedAt', True):
                     comments_by_post[cid].append(old_comment)
-            print(str(len(comments_by_post.keys())) + ' articles with comments')
-            email_subscriptions_data = json.loads(open('migration/data/email_subscriptions.json').read())
-            print(str(len(email_subscriptions_data)) + ' email subscriptions loaded')
+            print('[migration] ' + str(len(comments_by_post.keys())) + ' articles with comments')
             export_articles = {} # slug: shout
             export_authors = {} # slug: user
@@ -338,29 +308,32 @@ if __name__ == '__main__':
             elif cmd == "topics":
                 topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data)
             elif cmd == "shouts":
-                shouts(content_data, shouts_by_slug, shouts_by_oid, oldtopics_by_oid) # NOTE: listens limit
+                shouts(content_data, shouts_by_slug, shouts_by_oid) # NOTE: listens limit
             elif cmd == "comments":
                 comments(comments_data)
             elif cmd == "export_shouts":
                 export_shouts(shouts_by_slug, export_articles, export_authors, content_dict)
             elif cmd == "email_subscriptions":
-                export_email_subscriptions(email_subscriptions_data)
+                export_email_subscriptions()
+            elif cmd == 'slug':
+                export_slug(sys.argv[2], export_articles, export_authors, content_dict)
             elif cmd == "all":
                 users(users_by_oid, users_by_slug, users_data)
                 topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data)
-                shouts(content_data, shouts_by_slug, shouts_by_oid, oldtopics_by_oid)
+                shouts(content_data, shouts_by_slug, shouts_by_oid)
                 comments(comments_data)
-                export_email_subscriptions(email_subscriptions_data)
-            elif cmd == 'slug':
-                export_slug(sys.argv[2], export_articles, export_authors, content_dict)
+                export_email_subscriptions()
+            else:
+                print('[migration] --- debug users, topics, shouts')
+                users(users_by_oid, users_by_slug, users_data)
+                topics(export_topics, topics_by_slug, topics_by_oid, cats_data, tags_data)
+                shouts(content_data, shouts_by_slug, shouts_by_oid)
             #export_finish(export_articles, export_authors, export_topics, export_comments)
     else:
-        print('''
-usage: python migrate.py bson
-\n.. \ttopics <limit>
-\n.. \tusers <limit>
-\n.. \tshouts <limit>
-\n.. \texport_shouts <limit>
-\n.. \tslug <slug>
-\n.. \tall
-''')
+        print('usage: python migrate.py bson')
+        print('.. \ttopics <limit>')
+        print('.. \tusers <limit>')
+        print('.. \tshouts <limit>')
+        print('.. \texport_shouts <limit>')
+        print('.. \tslug <slug>')
+        print('.. \tall')

migration/extract.py Normal file

@@ -0,0 +1,154 @@
import re
import base64
TOOLTIP_REGEX = r'(\/\/\/(.+)\/\/\/)'
def replace_tooltips(body):
    newbody = body
    matches = list(re.finditer(TOOLTIP_REGEX, body, re.IGNORECASE | re.MULTILINE))[1:]
    for match in matches:
        newbody = body.replace(match.group(1), '<Tooltip text="' + match.group(2) + '" />') # FIXME: doesn't work
    if len(matches) > 0:
        print('[extract] found %d tooltips' % len(matches))
    return newbody

def place_tooltips(body):
    parts = body.split('///')
    l = len(parts)
    newparts = list(parts)
    if l & 1:
        if l > 1:
            i = 1
            print('[extract] found %d tooltips' % (l-1))
            for part in parts[1:]:
                if i & 1:
                    # print('[extract] tooltip: ' + part)
                    if 'a class="footnote-url" href=' in part:
                        fn = 'a class="footnote-url" href="'
                        link = part.split(fn,1)[1].split('"', 1)[0]
                        extracted_part = part.split(fn,1)[0] + ' ' + part.split('/', 1)[-1]
                        newparts[i] = '<Tooltip text="' + extracted_part + '" link="' + link + '" />'
                    else:
                        newparts[i] = '<Tooltip text="%s" />' % part
                    # print('[extract] tooltip: ' + newparts[i])
                else:
                    # print('[extract] pass: ' + part[:10] + '..')
                    newparts[i] = part
                i += 1
    return ''.join(newparts)
IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,((?:[A-Za-z\d+\/]{4})*(?:[A-Za-z\d+\/]{3}=|[A-Za-z\d+\/]{2}==)))\)"
public = '../discoursio-web/public'
cdn = 'https://assets.discours.io'
cache = {}
def reextract_images(body, oid):
    matches = list(re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE))[1:]
    i = 0
    for match in matches:
        print('[extract] image ' + match.group(1))
        ext = match.group(3)
        name = oid + str(i)
        link = public + '/upload/image-' + name + '.' + ext
        img = match.group(4)
        title = match.group(1) # FIXME: this is not the title
        if img not in cache:
            content = base64.b64decode(img + '==')
            print(str(len(img)) + ' image bytes been written')
            open('../' + link, 'wb').write(content)
            cache[img] = name
            i += 1
        else:
            print('[extract] image cached ' + cache[img])
        body.replace(str(match), '![' + title + '](' + cdn + link + ')') # FIXME: this does not work
    return body
IMAGES = {
    'data:image/png': 'png',
    'data:image/jpg': 'jpg',
    'data:image/jpeg': 'jpg',
}
sep = ';base64,'
def extract_images(body, oid):
    newbody = ''
    body = body.replace(' [](data:image', '![](data:image').replace('\n[](data:image', '![](data:image')
    oldparts = body.split(sep)
    newparts = list(oldparts)
    print()
    if len(oldparts) > 1:
        print('[extract] images for %s' % oid)
        print('[extract] %d candidates' % (len(oldparts)-1))
    i = 0
    for current in oldparts:
        next = ''
        try: next = oldparts[i+1]
        except: newbody += current
        start = oldparts.index(current) == 0
        end = not next
        if end:
            continue
        else: # start or between
            # print('[extract_images] have next')
            for mime in IMAGES.keys():
                if mime in current[-15:]:
                    # print('[extract_images] found proper mime type')
                    print('[extract] ' + current[-15:])
                    if ')' in next:
                        b64encoded = next.split(')')[0]
                        print('[extract] '+str(i+1)+': %d bytes' % len(b64encoded))
                        # print(meta)
                        ext = IMAGES[mime]
                        print('[extract] type: ' + mime)
                        name = oid + '-' + str(i)
                        print('[extract] name: ' + name)
                        link = '/upload/image-' + name + '.' + ext
                        print('[extract] link: ' + link)
                        if b64encoded:
                            if b64encoded not in cache:
                                content = base64.b64decode(b64encoded + '==')
                                open(public + link, 'wb').write(content)
                                cache[b64encoded] = name
                            else:
                                print('[extract] cached: ' + cache[b64encoded])
                                name = cache[b64encoded]
                                link = cdn + '/upload/image-' + name + '.' + ext
                            newparts[i] = current.split('![](' + mime)[0] + '![](' + link + ')'
                            newparts[i+1] = next.replace(b64encoded + ')', '')
                        else:
                            print('[extract] not b64encoded')
                            print(current[-15:])
        i += 1
    newbody = ''.join(newparts)
    return newbody
def cleanup(body):
    newbody = body\
        .replace('<', '').replace('>', '')\
        .replace('{', '(').replace('}', ')')\
        .replace('…', '...')\
        .replace(' __ ', ' ')\
        .replace('_ _', ' ')\
        .replace('****', '')\
        .replace('\u00a0', ' ')\
        .replace('\u02c6', '^')\
        .replace('\u00a0',' ')\
        .replace('\ufeff', '')\
        .replace('\u200b', '')\
        .replace('\u200c', '')
        # .replace('\u2212', '-')
    return newbody
def extract(body, oid):
    newbody = extract_images(body, oid)
    newbody = cleanup(newbody)
    newbody = place_tooltips(newbody)
    return newbody
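
Below is a minimal usage sketch of the new module, for reference only and not part of the commit; it assumes the project's environment, and the oid string and sample body are made up. A body that actually contains ';base64,'-encoded images would additionally be written out under ../discoursio-web/public/upload/ and rewritten to https://assets.discours.io links, per extract_images() above.

    from migration.extract import extract

    # hypothetical inputs: oid stands in for the Mongo _id passed via article['oid']
    oid = '55f4f51c03eeeb14d1a9fa34'
    body = 'A footnote ///shown as a tooltip/// plus {curly} braces.'
    print(extract(body, oid))
    # expected: 'A footnote <Tooltip text="shown as a tooltip" /> plus (curly) braces.'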