migration topics fixed, markdown fixed

2022-07-01 09:39:19 +03:00
parent 0f6e505706
commit 90babaec95
10 changed files with 1151 additions and 113 deletions
--- a/migration/html2text/init.py
+++ b/migration/html2text/init.py
@@ -385,15 +385,15 @@ class HTML2Text(html.parser.HTMLParser):
 					elif self.current_class == 'lead' and \
 						self.inheader == False and \
 						self.span_highlight == False:
-							self.o("==") # NOTE: but CriticMarkup uses {== ==}
+							#self.o("==") # NOTE: CriticMarkup uses {== ==}
 							self.span_lead = True
 			else:
 				if self.span_highlight:
 					self.o('`')
 					self.span_highlight = False
 				elif self.span_lead:
-						self.o('==')
-						self.span_lead = False
+					#self.o('==')
+					self.span_lead = False

 		if tag in ["p", "div"]:
 			if self.google_doc:
@@ -401,7 +401,7 @@ class HTML2Text(html.parser.HTMLParser):
 					self.p()
 				else:
 					self.soft_br()
-			elif self.astack:
+			elif self.astack or self.inheader:
 				pass
 			else:
 				self.p()
@@ -468,20 +468,21 @@ class HTML2Text(html.parser.HTMLParser):
 			# without it, Markdown won't render the resulting *** correctly.
 			# (Don't add a space otherwise, though, since there isn't one in the
 			# original HTML.)
-			if (
-				start
-				and self.preceding_data
-				and self.preceding_data[-1] == self.strong_mark[0]
-			):
-				strong = " " + self.strong_mark
-				self.preceding_data += " "
-			else:
-				strong = self.strong_mark
+			if not self.inheader and not self.astack \
+				and not self.span_lead and not self.span_highlight:
+				if (
+					start
+					and self.preceding_data
+					and self.preceding_data[-1] == self.strong_mark[0]
+				):
+					strong = " " + self.strong_mark
+					self.preceding_data += " "
+				else:
+					strong = self.strong_mark

-			if not self.span_lead and not self.span_highlight:
 				self.o(strong)
-			if start:
-				self.stressed = True
+				if start:
+					self.stressed = True

 		if tag in ["del", "strike", "s"]:
 			if start and self.preceding_data and self.preceding_data[-1] == "~":
@@ -1030,4 +1031,12 @@ def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) ->
 		bodywidth = config.BODY_WIDTH
 	h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth)

-	return h.handle(html)
+	return h.handle(html)\
+		.replace('<...>', '**...**')\
+		.replace('<…>', '***...**')\
+		.replace('****',  '')\
+		.replace('\u00a0',' ')\
+		.replace('\u200c', '')\
+		.replace('\u200b', '')\
+		.replace('\ufeff', '')
+		# .replace('\u2212', '-')
--- a/migration/html2text/config.py
+++ b/migration/html2text/config.py
@@ -156,7 +156,7 @@ IGNORE_TABLES = False

 # Use a single line break after a block element rather than two line breaks.
 # NOTE: Requires body width setting to be 0.
-SINGLE_LINE_BREAK = True
+SINGLE_LINE_BREAK = False


 # Use double quotation marks when converting the <q> tag.
--- a/migration/tables/content_item_categories.py
+++ b/migration/tables/content_item_categories.py
@@ -1,8 +1,11 @@
 from orm.base import local_session
 from orm import Topic, Community
 from dateutil.parser import parse as date_parse
+import json
+from migration.html2text import html2text
+import sqlalchemy

-def migrate(entry):
+def migrate(entry, topics_by_oid):
 	'''
 	type Topic {
 		slug: String! # ID
@@ -14,22 +17,40 @@ def migrate(entry):
 	'''
 	topic_dict = {
 		'slug': entry['slug'],
+		'oid': entry['_id'],
 		# 'createdBy': entry['createdBy'],
 		# 'createdAt': date_parse(entry['createdAt']),
 		'title': entry['title'].replace('&nbsp;', ' '), #.lower(),
 		'children': [],
 		'community' : Community.default_community.slug,
-		'body' : entry.get('description')
+		'body' : html2text(entry.get('description', '').replace('&nbsp;', ' '))
 	}
-	try:
-		with local_session() as session:
-			topic = session.query(Topic).filter(Topic.slug == topic_dict['slug']).first()
-			if not topic: 
-				topic = session.query(Topic).filter(Topic.title == topic_dict['title']).first()
+	retopics = json.loads(open('migration/tables/replacements.json').read())
+	with local_session() as session:
+		slug = topics_by_oid.get(topic_dict['oid'], topic_dict)['slug']
+		if slug:
+			try:
+				topic = session.query(Topic).filter(Topic.slug == slug).first()
 				if not topic:
+					del topic_dict['oid']
 					topic = Topic.create(**topic_dict)
-	except Exception as e:
-		print(e)
-		raise e
-	topic_dict['cat_id'] = entry['_id']
+					print('created')
+				else:
+					if len(topic.title) > len(topic_dict['title']) or \
+						len(topic.body) < len(topic_dict['body']):
+							print('updating topic')
+							topic.update({
+								'slug': slug,
+								'title':  topic_dict['title'] if len(topic.title) > len(topic_dict['title']) else topic.title,
+								'body':  topic_dict['body'] if len(topic.body) < len(topic_dict['body']) else topic.body,
+								#'views': topic.views + topic_dict['views']
+								#'authors': topic.views + topic_dict['views']
+								#'followers': topic.views + topic_dict['views']
+							})
+							print(slug + ': ' + topic.title)
+			except Exception as e:
+				print('not found old topic: ' + slug)
+		else:
+			raise Exception
+	topic_dict['oid'] = entry['_id']
 	return topic_dict
--- a/migration/tables/content_items.py
+++ b/migration/tables/content_items.py
@@ -13,6 +13,7 @@ from sqlalchemy.exc import IntegrityError
 from orm.base import local_session
 from orm.community import Community
 import os
+import string

 DISCOURS_USER = {
 	'id': 9999999,
@@ -32,7 +33,6 @@ type2layout = {
 	'Image': 'image'
 }

-
 def get_metadata(r):
 	metadata = {}
 	metadata['title'] = r.get('title')
@@ -45,6 +45,9 @@ def get_metadata(r):
 		metadata['cover'] = r.get('cover')
 	return metadata

+
+retopics = json.loads(open('migration/tables/replacements.json').read())
+
 def migrate(entry, users_by_oid, topics_by_oid):
 	'''
 	type Shout {
@@ -96,11 +99,14 @@ def migrate(entry, users_by_oid, topics_by_oid):
 	if mainTopic:
 		r['mainTopic'] = mainTopic["slug"]
 	topic_oids = [category, ]
-	taglist = entry.get("tags", [])
-	topic_oids.extend(taglist)
+	topic_errors = []
+	topic_oids.extend(entry.get('tags', []))
 	for oid in topic_oids:
 		if oid in topics_by_oid:
 			r['topics'].append(topics_by_oid[oid])
+		else:
+			# print('ERROR: unknown old topic id: ' + oid)
+			topic_errors.append(oid)
 	if entry.get('image') is not None:
 		r['cover'] = entry['image']['url']
 	if entry.get('thumborId') is not None:
@@ -115,9 +121,9 @@ def migrate(entry, users_by_oid, topics_by_oid):
 			if body_orig == '':
 				print('EMPTY BODY!')
 			else:
-				body_html = str(BeautifulSoup(
-					body_orig, features="html.parser"))
-				r['body'] = html2text(body_html)
+				# body_html = str(BeautifulSoup(
+				#	body_orig, features="html.parser"))
+				r['body'] = html2text(body_orig)
 		else:
 			print(r['slug'] + ': literature has no media')
 	elif entry.get('type') == 'Video':
@@ -126,12 +132,12 @@ def migrate(entry, users_by_oid, topics_by_oid):
 		vm = m.get('vimeoId', '')
 		video_url = 'https://www.youtube.com/watch?v=' + yt if yt else '#'
 		therestof = html2text(m.get('body', entry.get('body', '')))
-		r['body'] = 'import { YouTube } from \"solid-social\"\n' + \
-			'<YouTube youtubeId=\"'''  + yt + '\" />\n\n' + therestof
+		r['body'] = 'import { YouTube } from \'solid-social\'\n\n' + \
+			'<YouTube youtubeId=\'' + yt + '\' />\n\n' + therestof
 		if video_url == '#':
 			video_url = 'https://vimeo.com/' + vm if vm else '#'
-			r['body'] = 'import { Vimeo } from \"solid-social\"\n' + \
-				'<Vimeo vimeoId=\"'''  + vm + '\" />\n\n' + therestof
+			r['body'] = 'import { Vimeo } from \'solid-social\'\n\n' + \
+				'<Vimeo vimeoId=\''  + vm + '\' />\n\n' + therestof
 		if video_url == '#':
 			print(entry.get('media', 'UNKNOWN MEDIA PROVIDER!'))
 			# raise Exception
@@ -147,21 +153,22 @@ def migrate(entry, users_by_oid, topics_by_oid):
 					print(m)
 					continue
 				else:
-					r['body'] = 'import MusicPlayer from \"src/components/MusicPlayer\"\n\n'
-					r['body'] += '<MusicPlayer src=\"' + fileUrl + '\" title=\"' + m.get('title','') + '\" />\n'
+					r['body'] = 'import MusicPlayer from \'../src/components/MusicPlayer\'\n\n'
+					r['body'] += '<MusicPlayer src=\'' + fileUrl + '\' title=\'' + m.get('title','') + '\' />\n'
 				r['body'] += html2text(entry.get('body', ''))
 	elif entry.get('type') == 'Image':
-		m = r.get('media')
 		r['body'] = ''
 		if 'cover' in r: r['body'] = '<img src=\"' + r.get('cover', '') + '\" />'
-		r['body'] += entry.get('body', '')
+		mbody = r.get('media', [{'body': ''},])[0].get('body', '')
+		r['body'] += mbody + entry.get('body', '')
 		if r['body'] == '': print(entry)
 	if r.get('body') is None:
-		body_orig = entry.get('body', '')
-		body_html = str(BeautifulSoup(body_orig, features="html.parser"))
-		r['body'] = html2text(body_html)
+		body_orig = entry.get('body', entry.get('bodyHistory', [{ 'text': '' }, ])[0].get('text', ''))
+		# body_html = str(BeautifulSoup(body_orig, features="html.parser"))
+		r['body'] = html2text(body_orig)
 	body = r.get('body', '')
-	
+	for oldtopicslug, newtopicslug in retopics.items():
+		body.replace(oldtopicslug, newtopicslug)
 	# get author data
 	userdata = {}
 	try: userdata = users_by_oid[entry['createdBy']]
@@ -200,9 +207,10 @@ def migrate(entry, users_by_oid, topics_by_oid):
 		content = frontmatter.dumps(frontmatter.Post(body, **metadata))
 		ext = 'mdx'
 		parentDir = '/'.join(os.getcwd().split('/')[:-1])
-		filepath =  parentDir + '/discoursio-web/content/' + r['layout'] + '/' + r['slug'] + '.' + ext
+		filepath =  parentDir + '/discoursio-web/content/' + r['slug'] + '.' + ext
 		# print(filepath)
-		open(filepath, 'w').write(content)
+		bc = bytes(content,'utf-8').decode('utf-8','ignore')
+		open(filepath, 'w').write(bc)
 	try:
 		shout_dict['createdAt'] = date_parse(r.get('createdAt')) if entry.get('createdAt') else ts
 		shout_dict['publishedAt'] = date_parse(entry.get('publishedAt')) if entry.get('published') else None
@@ -256,7 +264,9 @@ def migrate(entry, users_by_oid, topics_by_oid):
 			for topic in r['topics']:
 				try:
 					ShoutTopic.create(**{ 'shout': s.slug, 'topic': topic['slug'] })
-					shout_dict['topics'].append(topic['slug'])
+					tpc = topics_by_oid[topic['oid']]
+					slug = retopics.get(tpc['slug'], tpc['slug'])
+					shout_dict['topics'].append(slug)
 				except sqlalchemy.exc.IntegrityError:
 					pass

@@ -269,7 +279,6 @@ def migrate(entry, users_by_oid, topics_by_oid):
 		except Exception as e: 
 			raise e
 	except Exception as e:
-		if not shout_dict['body']: r['body'] = 'body moved'
 		raise e
 	shout_dict['old_id'] = entry.get('_id')
-	return shout_dict # for json
+	return shout_dict, topic_errors
--- a/migration/tables/replacements.json
+++ b/migration/tables/replacements.json
@@ -2,6 +2,8 @@
    "1990-e": "90s",
    "2000-e": "2000s",
    "90-e": "90s",
+    "207": "207",
+    "kartochki-rubinshteyna": "rubinstein-cards",
    "Georgia": "georgia",
    "Japan": "japan",
    "Sweden": "sweden",
@@ -13,6 +15,7 @@
    "afrika": "africa",
    "agata-kristi": "agatha-christie",
    "agressiya": "agression",
+    "agressivnoe-povedenie": "agression",
    "aktsii": "actions",
    "aktsionizm": "actionism",
    "alber-kamyu": "albert-kamus",
@@ -59,6 +62,7 @@
    "artists": "artists",
    "ateizm": "atheism",
    "audiopoeziya": "audio-poetry",
+    "audio-poetry": "audio-poetry",
    "audiospektakl": "audio-spectacles",
    "auktsyon": "auktsyon",
    "avangard": "avantgarde",
@@ -385,6 +389,8 @@
    "martin-haydegger": "martin-hidegger",
    "matematika": "maths",
    "vladimir-mayakovskiy": "vladimir-mayakovsky",
+    "mayakovskiy": "vladimir-mayakovsky",
+    "ekzistentsiya": "existence",
    "media": "media",
    "medicine": "medicine",
    "memuary": "memoirs",
@@ -738,6 +744,8 @@
    "zakonodatelstvo": "laws",
    "zakony-mira": "world-laws",
    "zametki": "notes",
+    "zhelanie": "wish",
+    "konets-vesny": "end-of-spring",
    "zhivotnye": "animals",
    "zhoze-saramago": "jose-saramago",
    "zigmund-freyd": "sigmund-freud",
--- a/migration/tables/tags.py
+++ b/migration/tables/tags.py
@@ -4,7 +4,7 @@ from orm.base import local_session
 from orm import Topic, Community
 from dateutil.parser import parse as date_parse

-def migrate(entry):
+def migrate(entry, topics_by_oid):
 	'''
 	type Topic {
 		slug: String! # ID
@@ -21,23 +21,30 @@ def migrate(entry):
 		ts = datetime.fromtimestamp(entry['createdAt']/1000)
 	topic_dict = {
 		'slug': entry['slug'],
+		'oid': entry['_id'],
 		# 'createdBy': entry['createdBy'],
 		# 'createdAt': ts,
 		'title': entry['title'].replace('&nbsp;', ' '), # .lower(),
 		'children': [],
 		'community' : Community.default_community.slug,
-		'body' : entry.get('description')
+		'body' : entry.get('description','').replace('&nbsp;', ' ')
 	}
 	try:
+		retopics = json.loads(open('migration/tables/replacements.json').read())
 		with local_session() as session:
-			topic = session.query(Topic).filter(Topic.slug == topic_dict['slug']).first()
-			if not topic: 
-				topic = session.query(Topic).filter(Topic.title == topic_dict['title']).first()
-				if not topic:
+			slug = topics_by_oid.get(topic_dict['oid'], topic_dict)['slug']
+			if slug:
+				topic = session.query(Topic).filter(Topic.slug == slug).first()
+				if not topic: 
+					del topic_dict['oid']
 					topic = Topic.create(**topic_dict)
+				else:
+					print(slug + ': ' + topic.title)
+			else:
+				print('not found topic: ' + slug)
+				raise Exception
 	except Exception as e:
 		print(e)
 		raise e
-	
-	topic_dict['tag_id'] = entry['_id']
+	topic_dict['oid'] = entry['_id']
 	return topic_dict