migration-orm-fixes

2022-07-07 16:55:13 +03:00
parent bd4221e9af
commit 56dcd7ecbc
23 changed files with 706 additions and 799 deletions

@@ -1,16 +1,16 @@
import json
import os
import re
import base64
import sys
from migration.html2text import html2text
TOOLTIP_REGEX = r'(\/\/\/(.+)\/\/\/)'
contentDir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'discoursio-web', 'content')
s3 = 'https://discours-io.s3.amazonaws.com/'
cdn = 'https://assets.discours.io'
retopics = json.loads(open('migration/tables/replacements.json', 'r').read())
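retopics comes from migration/tables/replacements.json which, judging by its use at the bottom of prepare_body, is a flat old-slug to new-slug mapping; a minimal sketch with made-up slugs:

    import json
    # hypothetical shape of migration/tables/replacements.json
    retopics = json.loads('{"old-topic": "new-topic"}')
    for oldtopicslug, newtopicslug in retopics.items():
        print(oldtopicslug, '->', newtopicslug)  # old-topic -> new-topic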
def replace_tooltips(body):
# FIXME: if you prefer regexp
newbody = body
matches = list(re.finditer(TOOLTIP_REGEX, body, re.IGNORECASE | re.MULTILINE))[1:]
for match in matches:
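A rough illustration of what TOOLTIP_REGEX captures, on a made-up sample (group 1 is the whole ///…/// span, group 2 the inner text):

    import re
    TOOLTIP_REGEX = r'(\/\/\/(.+)\/\/\/)'
    m = re.search(TOOLTIP_REGEX, 'term ///a tooltip body/// rest')
    print(m.group(1))  # ///a tooltip body///
    print(m.group(2))  # a tooltip body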
@@ -21,37 +21,40 @@ def replace_tooltips(body):
def place_tooltips(body):
-parts = body.split('///')
+parts = body.split('&&&')
l = len(parts)
newparts = list(parts)
+placed = False
if l & 1:
+if l > 1:
i = 1
print('[extract] found %d tooltips' % (l-1))
for part in parts[1:]:
if i & 1:
# print([ len(p) for p in parts ])
# print('[extract] tooltip: ' + part)
if 'a class="footnote-url" href=' in part:
print('[extract] footnote: ' + part)
fn = 'a class="footnote-url" href="'
link = part.split(fn,1)[1].split('"', 1)[0]
extracted_part = part.split(fn,1)[0] + ' ' + part.split('/', 1)[-1]
newparts[i] = '<Tooltip' + (' link="' + link + '" ' if link else '') + '>' + extracted_part + '</Tooltip>'
else:
newparts[i] = '<Tooltip>%s</Tooltip>' % part
# print('[extract] tooltip: ' + newparts[i])
else:
# print('[extract] pass: ' + part[:10] + '..')
newparts[i] = part
i += 1
-return ''.join(newparts)
+placed = True
+return (''.join(newparts), placed)
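place_tooltips relies on split-by-delimiter parity: after splitting on '&&&', odd-indexed parts are tooltip bodies, even-indexed parts are plain text, and a balanced body always yields an odd number of parts (hence the l & 1 check). A minimal sketch with made-up text:

    parts = 'before &&&tip one&&& between &&&tip two&&& after'.split('&&&')
    # ['before ', 'tip one', ' between ', 'tip two', ' after']
    assert len(parts) & 1      # odd length means the &&& pairs are balanced
    tooltips = parts[1::2]     # ['tip one', 'tip two'] get wrapped in <Tooltip>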
IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,((?:[A-Za-z\d+\/]{4})*(?:[A-Za-z\d+\/]{3}=|[A-Za-z\d+\/]{2}==)))\)"
public = '../discoursio-web/public'
cache = {}
def reextract_images(body, oid):
# FIXME: if you prefer regexp
matches = list(re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE))[1:]
i = 0
for match in matches:
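Roughly how IMG_REGEX decomposes an inline base64 image; the data URI below is a made-up minimal example (note the regex requires the '=' padding at the end of the payload):

    import re
    IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,((?:[A-Za-z\d+\/]{4})*(?:[A-Za-z\d+\/]{3}=|[A-Za-z\d+\/]{2}==)))\)"
    m = re.search(IMG_REGEX, '![alt text](data:image/png;base64,aGVsbG8=)')
    print(m.group(1))  # alt text   (caption)
    print(m.group(3))  # png        (image type)
    print(m.group(4))  # aGVsbG8=   (base64 payload, decodes to b'hello')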
@@ -80,54 +83,50 @@ IMAGES = {
sep = ';base64,'
+def extract_imageparts(bodyparts, prefix):
+# recursive loop
+for current in bodyparts:
+i = bodyparts.index(current)
+for mime in IMAGES.keys():
+if mime == current[-len(mime):] and (i + 1 < len(bodyparts)):
+print('[extract] ' + mime)
+next = bodyparts[i+1]
+ext = IMAGES[mime]
+b64end = next.index(')')
+b64encoded = next[:b64end]
+name = prefix + '-' + str(len(cache))
+link = '/upload/image-' + name + '.' + ext
+print('[extract] name: ' + name)
+print('[extract] link: ' + link)
+print('[extract] %d bytes' % len(b64encoded))
+if b64encoded not in cache:
+try:
+content = base64.b64decode(b64encoded + '==')
+open(public + link, 'wb').write(content)
+print('[extract] ' +str(len(content)) + ' image bytes been written')
+cache[b64encoded] = name
+except:
+raise Exception
+# raise Exception('[extract] error decoding image %r' %b64encoded)
+else:
+print('[extract] cached: ' + cache[b64encoded])
+name = cache[b64encoded]
+link = cdn + '/upload/image-' + name + '.' + ext
+bodyparts[i] = current[:-len(mime)] + current[-len(mime):] + link + next[-b64end:]
+bodyparts[i+1] = next[:-b64end]
+break
+return extract_imageparts(sep.join(bodyparts[i+1:]), prefix) \
+if len(bodyparts) > (i + 1) else ''.join(bodyparts)
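The decode call above uses a common trick: base64.b64decode tolerates surplus '=' once all complete groups are consumed, so unconditionally appending '==' guards against payloads whose padding was lost in the markdown; a quick check:

    import base64
    # 'aGVsbG8' is b'hello' with its trailing '=' padding missing
    assert base64.b64decode('aGVsbG8' + '==') == b'hello'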
def extract_images(body, oid):
-newbody = ''
-body = body.replace(' [](data:image', '![](data:image').replace('\n[](data:image', '![](data:image')
-oldparts = body.split(sep)
-newparts = list(oldparts)
-# print()
-if len(oldparts) > 1:
-print('[extract] images for %s' % oid)
-print('[extract] %d candidates' % (len(oldparts)-1))
-i = 0
-for current in oldparts:
-next = ''
-try: next = oldparts[i+1]
-except: newbody += current
-start = oldparts.index(current) == 0
-end = not next
-if end:
-continue
-else: # start or between
-for mime in IMAGES.keys():
-if mime in current[-15:]:
-print('[extract] ' + current[-15:])
-if ')' in next:
-b64encoded = next.split(')')[0]
-print('[extract] '+str(i+1)+': %d bytes' % len(b64encoded))
-ext = IMAGES[mime]
-print('[extract] type: ' + mime)
-name = oid + '-' + str(i)
-print('[extract] name: ' + name)
-link = '/upload/image-' + name + '.' + ext
-print('[extract] link: ' + link)
-if b64encoded:
-if b64encoded not in cache:
-content = base64.b64decode(b64encoded + '==')
-open(public + link, 'wb').write(content)
-cache[b64encoded] = name
-else:
-print('[extract] cached: ' + cache[b64encoded])
-name = cache[b64encoded]
-link = cdn + '/upload/image-' + name + '.' + ext
-newparts[i] = current.split('![](' + mime)[0] + '![](' + link + ')'
-newparts[i+1] = next.replace(b64encoded + ')', '')
-else:
-print('[extract] ERROR: no b64encoded')
-# print(current[-15:])
-i += 1
-newbody = ''.join(newparts)
+body = body\
+.replace(' [](data:image', '![](data:image')\
+.replace('\n[](data:image', '![](data:image')
+parts = body.split(sep)
+i = 0
+if len(parts) > 1: newbody = extract_imageparts(parts, oid)
+else: newbody = body
return newbody
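extract_images is now just a driver: it normalizes the space- or newline-mangled image markers back to '![](data:image', splits on ';base64,' and hands the parts to the recursive extract_imageparts. Each split boundary leaves the mime type at the tail of one part and the payload at the head of the next; a made-up sample:

    sep = ';base64,'
    body = 'text ![](data:image/png;base64,aGVsbG8=) more'
    parts = body.split(sep)
    # parts[0] -> 'text ![](data:image/png'   (mime at the tail)
    # parts[1] -> 'aGVsbG8=) more'            (payload up to the first ')')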
@@ -149,25 +148,34 @@ def cleanup(body):
return newbody
def extract(body, oid):
-newbody = extract_images(body, oid)
-newbody = cleanup(newbody)
-newbody = place_tooltips(newbody)
-return newbody
+if body:
+newbody = extract_images(body, oid)
+if not newbody: raise Exception('extract_images error')
+newbody = cleanup(newbody)
+if not newbody: raise Exception('cleanup error')
+newbody, placed = place_tooltips(newbody)
+if not newbody: raise Exception('place_tooltips error')
+if placed:
+newbody = 'import Tooltip from \'$/components/Article/Tooltip\'\n\n' + newbody
+return newbody
+return body
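The reworked extract threads the placed flag out of place_tooltips so the Tooltip import is prepended exactly once per document; roughly, for a body that contains a tooltip (cleanup details omitted):

    newbody, placed = place_tooltips('see &&&a note&&& here')
    # newbody -> 'see <Tooltip>a note</Tooltip> here', placed -> True
    if placed:
        newbody = "import Tooltip from '$/components/Article/Tooltip'\n\n" + newbody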
def prepare_body(entry):
# print('[migration] preparing body %s' % entry.get('slug',''))
# body modifications
body = ''
body_orig = entry.get('body', '')
if not body_orig: body_orig = ''
if entry.get('type') == 'Literature':
print('[extract] literature')
for m in entry.get('media', []):
t = m.get('title', '')
if t: body_orig += '<h5>' + t + '</h5>\n'
-body_orig += (m.get('body', '') or '')
-body_orig += '\n' + m.get('literatureBody', '') + '\n'
+body_orig += (m.get('body') or '').replace((m.get('literatureBody') or ''), '') + m.get('literatureBody', '') + '\n'
elif entry.get('type') == 'Video':
print('[extract] embedding video')
providers = set([])
video_url = ''
require = False
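In the Literature branch above, the media body is appended with literatureBody stripped out of it first, so entries whose body already embeds literatureBody are no longer duplicated; schematically, with made-up strings:

    body = 'intro FULLTEXT'
    literature_body = 'FULLTEXT'
    merged = body.replace(literature_body, '') + literature_body + '\n'
    # 'intro FULLTEXT\n' rather than the doubled text the old concatenation produced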
@@ -187,8 +195,10 @@ def prepare_body(entry):
body += extract(html2text(m.get('body', '')), entry['_id'])
if video_url == '#': print(entry.get('media', 'UNKNOWN MEDIA PROVIDER!'))
if require: body = 'import { ' + ','.join(list(providers)) + ' } from \'solid-social\'\n\n' + body + '\n'
# already body_orig = entry.get('body', '')
elif entry.get('type') == 'Music':
print('[extract] music album')
for m in entry.get('media', []):
artist = m.get('performer')
trackname = ''
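The Video branch gathers the solid-social component names it actually used and prepends a single import line for all of them; the Music branch below does the same with a fixed MusicPlayer import. A sketch with assumed provider names:

    providers = {'YouTube', 'Vimeo'}   # hypothetical names gathered while embedding media
    body = '<YouTube id="..." />\n'
    body = 'import { ' + ','.join(list(providers)) + " } from 'solid-social'\n\n" + body + '\n'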
@@ -197,42 +207,46 @@ def prepare_body(entry):
body += '<MusicPlayer src=\"' + m.get('fileUrl','') + '\" title=\"' + trackname + '\" />\n'
body += extract(html2text(m.get('body', '')), entry['_id'])
body = 'import MusicPlayer from \'$/components/Article/MusicPlayer\'\n\n' + body + '\n'
# already body_orig = entry.get('body', '')
elif entry.get('type') == 'Image':
print('[extract] image gallery')
cover = ''
if 'thumborId' in entry: cover = cdn + '/unsafe/1600x/' + entry['thumborId']
-if not cover and 'image' in entry:
-cover = entry['image'].get('url', '')
+if not cover:
+if 'image' in entry: cover = entry['image'].get('url', '')
+if 'cloudinary' in cover: cover = ''
+else:
+print('[migration] cover: ' + cover)
images = {}
for m in entry.get('media', []):
-t = m.get('title', '')
-if t: body += '#### ' + t + '\n'
-u = m.get('image', {}).get('url', '')
-if 'cloudinary' in u:
-u = m.get('thumborId')
-if not u: u = cover
+b = ''
+title = m.get('title','').replace('\n', ' ').replace('&nbsp;', ' ')
+u = m.get('image', {}).get('url', '') or m.get('thumborId') or cover
+u = str(u)
+b += '<h4>' + title + '</h4>\n' + body_orig
+if not u.startswith('http'): u = s3 + u
+if not u: print('[extract] no image for ' + str(m))
+if 'cloudinary' in u: u = 'img/lost.svg'
-if u not in images.keys():
-if u.startswith('production'): u = s3 + u
-body += '![' + m.get('title','').replace('\n', ' ') + '](' + u + ')\n' # TODO: gallery here
-images[u] = u
-body += extract(html2text(m.get('body', '')), entry['_id']) + '\n'
+# print('[extract] image: ' + u)
+images[u] = title
+b += '<img src=\"' + u + '\" alt=\"'+ title +'\" />\n'
+b += m.get('body', '') + '\n'
+body += extract(html2text(b), entry['_id'])
-if not body_orig:
-print('[prepare] using body history...')
-# print(entry.get('bodyHistory', ''))
-try:
-for up in entry.get('bodyHistory', []):
-body_orig = up.get('text', '') or ''
-if body_orig: break
-except: pass
+elif not body_orig:
+for up in entry.get('bodyHistory', []) or []:
+body_orig = up.get('text', '') or ''
+if body_orig:
+print('[extract] body from history!')
+break
if not body and not body_orig: print('[extract] error: EMPTY BODY')
# body_html = str(BeautifulSoup(body_orig, features="html.parser"))
-body += extract(html2text(body_orig), entry['_id'])
# replace some topics
for oldtopicslug, newtopicslug in retopics.items():
body.replace(oldtopicslug, newtopicslug)
# print('[extract] adding original body')
+if body_orig: body += extract(html2text(body_orig), entry['_id'])
if entry['slug'] in sys.argv:
open(contentDir + '/' + entry['slug'] + '.html', 'w')\
.write(entry.get('body',''))
return body
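One caveat in the retopics loop near the end: str.replace returns a new string rather than mutating in place, so the bare body.replace(oldtopicslug, newtopicslug) call discards its result. For the slug rewrite to take effect it would have to be reassigned:

    for oldtopicslug, newtopicslug in retopics.items():
        body = body.replace(oldtopicslug, newtopicslug)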