migration topics fixed, markdown fixed

This commit is contained in:
2022-07-01 09:39:19 +03:00
parent 0f6e505706
commit 90babaec95
10 changed files with 1151 additions and 113 deletions

View File

@@ -385,15 +385,15 @@ class HTML2Text(html.parser.HTMLParser):
elif self.current_class == 'lead' and \
self.inheader == False and \
self.span_highlight == False:
self.o("==") # NOTE: but CriticMarkup uses {== ==}
#self.o("==") # NOTE: CriticMarkup {==
self.span_lead = True
else:
if self.span_highlight:
self.o('`')
self.span_highlight = False
elif self.span_lead:
self.o('==')
self.span_lead = False
#self.o('==')
self.span_lead = False
if tag in ["p", "div"]:
if self.google_doc:
@@ -401,7 +401,7 @@ class HTML2Text(html.parser.HTMLParser):
self.p()
else:
self.soft_br()
elif self.astack:
elif self.astack or self.inheader:
pass
else:
self.p()
@@ -468,20 +468,21 @@ class HTML2Text(html.parser.HTMLParser):
# without it, Markdown won't render the resulting *** correctly.
# (Don't add a space otherwise, though, since there isn't one in the
# original HTML.)
if (
start
and self.preceding_data
and self.preceding_data[-1] == self.strong_mark[0]
):
strong = " " + self.strong_mark
self.preceding_data += " "
else:
strong = self.strong_mark
if not self.inheader and not self.astack \
and not self.span_lead and not self.span_highlight:
if (
start
and self.preceding_data
and self.preceding_data[-1] == self.strong_mark[0]
):
strong = " " + self.strong_mark
self.preceding_data += " "
else:
strong = self.strong_mark
if not self.span_lead and not self.span_highlight:
self.o(strong)
if start:
self.stressed = True
if start:
self.stressed = True
if tag in ["del", "strike", "s"]:
if start and self.preceding_data and self.preceding_data[-1] == "~":
@@ -1030,4 +1031,12 @@ def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) ->
bodywidth = config.BODY_WIDTH
h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth)
return h.handle(html)
# Post-process the converted text: rewrite the "<...>" / "<…>" elision markers
# as emphasised dots, remove every "****" run, replace non-breaking spaces
# (U+00A0) with plain spaces and strip the zero-width characters U+200C,
# U+200B and U+FEFF.
# NOTE(review): '***...**' has unbalanced asterisks compared with the
# '**...**' used by the replacement just before it — confirm this is intended.
return h.handle(html)\
.replace('<...>', '**...**')\
.replace('<…>', '***...**')\
.replace('****', '')\
.replace('\u00a0',' ')\
.replace('\u200c', '')\
.replace('\u200b', '')\
.replace('\ufeff', '')
# .replace('\u2212', '-')

View File

@@ -156,7 +156,7 @@ IGNORE_TABLES = False
# Use a single line break after a block element rather than two line breaks.
# NOTE: Requires body width setting to be 0.
SINGLE_LINE_BREAK = True
SINGLE_LINE_BREAK = False
# Use double quotation marks when converting the <q> tag.

View File

@@ -1,8 +1,11 @@
from orm.base import local_session
from orm import Topic, Community
from dateutil.parser import parse as date_parse
import json
from migration.html2text import html2text
import sqlalchemy
def migrate(entry):
def migrate(entry, topics_by_oid):
'''
type Topic {
slug: String! # ID
@@ -14,22 +17,40 @@ def migrate(entry):
'''
topic_dict = {
'slug': entry['slug'],
'oid': entry['_id'],
# 'createdBy': entry['createdBy'],
# 'createdAt': date_parse(entry['createdAt']),
'title': entry['title'].replace('&nbsp;', ' '), #.lower(),
'children': [],
'community' : Community.default_community.slug,
'body' : entry.get('description')
'body' : html2text(entry.get('description', '').replace('&nbsp;', ' '))
}
try:
with local_session() as session:
topic = session.query(Topic).filter(Topic.slug == topic_dict['slug']).first()
if not topic:
topic = session.query(Topic).filter(Topic.title == topic_dict['title']).first()
retopics = json.loads(open('migration/tables/replacements.json').read())
with local_session() as session:
slug = topics_by_oid.get(topic_dict['oid'], topic_dict)['slug']
if slug:
try:
topic = session.query(Topic).filter(Topic.slug == slug).first()
if not topic:
del topic_dict['oid']
topic = Topic.create(**topic_dict)
except Exception as e:
print(e)
raise e
topic_dict['cat_id'] = entry['_id']
print('created')
else:
if len(topic.title) > len(topic_dict['title']) or \
len(topic.body) < len(topic_dict['body']):
print('updating topic')
topic.update({
'slug': slug,
'title': topic_dict['title'] if len(topic.title) > len(topic_dict['title']) else topic.title,
'body': topic_dict['body'] if len(topic.body) < len(topic_dict['body']) else topic.body,
#'views': topic.views + topic_dict['views']
#'authors': topic.views + topic_dict['views']
#'followers': topic.views + topic_dict['views']
})
print(slug + ': ' + topic.title)
except Exception as e:
print('not found old topic: ' + slug)
else:
raise Exception
topic_dict['oid'] = entry['_id']
return topic_dict

View File

@@ -13,6 +13,7 @@ from sqlalchemy.exc import IntegrityError
from orm.base import local_session
from orm.community import Community
import os
import string
DISCOURS_USER = {
'id': 9999999,
@@ -32,7 +33,6 @@ type2layout = {
'Image': 'image'
}
def get_metadata(r):
metadata = {}
metadata['title'] = r.get('title')
@@ -45,6 +45,9 @@ def get_metadata(r):
metadata['cover'] = r.get('cover')
return metadata
# Old-slug -> new-slug replacement table, loaded once at import time.
# A context manager closes the file handle deterministically, and the explicit
# encoding keeps the load independent of the platform's default locale.
with open('migration/tables/replacements.json', encoding='utf-8') as replacements_file:
    retopics = json.load(replacements_file)
def migrate(entry, users_by_oid, topics_by_oid):
'''
type Shout {
@@ -96,11 +99,14 @@ def migrate(entry, users_by_oid, topics_by_oid):
if mainTopic:
r['mainTopic'] = mainTopic["slug"]
topic_oids = [category, ]
taglist = entry.get("tags", [])
topic_oids.extend(taglist)
topic_errors = []
topic_oids.extend(entry.get('tags', []))
for oid in topic_oids:
if oid in topics_by_oid:
r['topics'].append(topics_by_oid[oid])
else:
# print('ERROR: unknown old topic id: ' + oid)
topic_errors.append(oid)
if entry.get('image') is not None:
r['cover'] = entry['image']['url']
if entry.get('thumborId') is not None:
@@ -115,9 +121,9 @@ def migrate(entry, users_by_oid, topics_by_oid):
if body_orig == '':
print('EMPTY BODY!')
else:
body_html = str(BeautifulSoup(
body_orig, features="html.parser"))
r['body'] = html2text(body_html)
# body_html = str(BeautifulSoup(
# body_orig, features="html.parser"))
r['body'] = html2text(body_orig)
else:
print(r['slug'] + ': literature has no media')
elif entry.get('type') == 'Video':
@@ -126,12 +132,12 @@ def migrate(entry, users_by_oid, topics_by_oid):
vm = m.get('vimeoId', '')
video_url = 'https://www.youtube.com/watch?v=' + yt if yt else '#'
therestof = html2text(m.get('body', entry.get('body', '')))
r['body'] = 'import { YouTube } from \"solid-social\"\n' + \
'<YouTube youtubeId=\"''' + yt + '\" />\n\n' + therestof
r['body'] = 'import { YouTube } from \'solid-social\'\n\n' + \
'<YouTube youtubeId=\'' + yt + '\' />\n\n' + therestof
if video_url == '#':
video_url = 'https://vimeo.com/' + vm if vm else '#'
r['body'] = 'import { Vimeo } from \"solid-social\"\n' + \
'<Vimeo vimeoId=\"''' + vm + '\" />\n\n' + therestof
r['body'] = 'import { Vimeo } from \'solid-social\'\n\n' + \
'<Vimeo vimeoId=\'' + vm + '\' />\n\n' + therestof
if video_url == '#':
print(entry.get('media', 'UNKNOWN MEDIA PROVIDER!'))
# raise Exception
@@ -147,21 +153,22 @@ def migrate(entry, users_by_oid, topics_by_oid):
print(m)
continue
else:
r['body'] = 'import MusicPlayer from \"src/components/MusicPlayer\"\n\n'
r['body'] += '<MusicPlayer src=\"' + fileUrl + '\" title=\"' + m.get('title','') + '\" />\n'
r['body'] = 'import MusicPlayer from \'../src/components/MusicPlayer\'\n\n'
r['body'] += '<MusicPlayer src=\'' + fileUrl + '\' title=\'' + m.get('title','') + '\' />\n'
r['body'] += html2text(entry.get('body', ''))
elif entry.get('type') == 'Image':
m = r.get('media')
r['body'] = ''
if 'cover' in r: r['body'] = '<img src=\"' + r.get('cover', '') + '\" />'
r['body'] += entry.get('body', '')
mbody = r.get('media', [{'body': ''},])[0].get('body', '')
r['body'] += mbody + entry.get('body', '')
if r['body'] == '': print(entry)
if r.get('body') is None:
body_orig = entry.get('body', '')
body_html = str(BeautifulSoup(body_orig, features="html.parser"))
r['body'] = html2text(body_html)
body_orig = entry.get('body', entry.get('bodyHistory', [{ 'text': '' }, ])[0].get('text', ''))
# body_html = str(BeautifulSoup(body_orig, features="html.parser"))
r['body'] = html2text(body_orig)
# Rewrite legacy topic slugs that occur in the body text to their new names.
# str.replace returns a new string (str is immutable), so the result has to be
# re-bound; discarding it leaves the body unchanged.
# NOTE(review): this is a plain substring replacement, so a short slug also
# matches inside longer words — confirm that is acceptable for article bodies.
body = r.get('body', '')
for oldtopicslug, newtopicslug in retopics.items():
    body = body.replace(oldtopicslug, newtopicslug)
# get author data
userdata = {}
try: userdata = users_by_oid[entry['createdBy']]
@@ -200,9 +207,10 @@ def migrate(entry, users_by_oid, topics_by_oid):
content = frontmatter.dumps(frontmatter.Post(body, **metadata))
ext = 'mdx'
parentDir = '/'.join(os.getcwd().split('/')[:-1])
filepath = parentDir + '/discoursio-web/content/' + r['layout'] + '/' + r['slug'] + '.' + ext
filepath = parentDir + '/discoursio-web/content/' + r['slug'] + '.' + ext
# print(filepath)
open(filepath, 'w').write(content)
# Drop anything that cannot be encoded as UTF-8 (e.g. lone surrogates) before
# writing. The 'ignore' handler has to be on the encode step: decoding bytes
# that were just produced by a strict encode can never fail, so putting it on
# the decode step has no effect.
bc = content.encode('utf-8', 'ignore').decode('utf-8')
# Write explicitly as UTF-8 and close the handle deterministically.
with open(filepath, 'w', encoding='utf-8') as mdx_file:
    mdx_file.write(bc)
try:
shout_dict['createdAt'] = date_parse(r.get('createdAt')) if entry.get('createdAt') else ts
shout_dict['publishedAt'] = date_parse(entry.get('publishedAt')) if entry.get('published') else None
@@ -256,7 +264,9 @@ def migrate(entry, users_by_oid, topics_by_oid):
for topic in r['topics']:
try:
ShoutTopic.create(**{ 'shout': s.slug, 'topic': topic['slug'] })
shout_dict['topics'].append(topic['slug'])
tpc = topics_by_oid[topic['oid']]
slug = retopics.get(tpc['slug'], tpc['slug'])
shout_dict['topics'].append(slug)
except sqlalchemy.exc.IntegrityError:
pass
@@ -269,7 +279,6 @@ def migrate(entry, users_by_oid, topics_by_oid):
except Exception as e:
raise e
except Exception as e:
if not shout_dict['body']: r['body'] = 'body moved'
raise e
shout_dict['old_id'] = entry.get('_id')
return shout_dict # for json
return shout_dict, topic_errors

View File

@@ -2,6 +2,8 @@
"1990-e": "90s",
"2000-e": "2000s",
"90-e": "90s",
"207": "207",
"kartochki-rubinshteyna": "rubinstein-cards",
"Georgia": "georgia",
"Japan": "japan",
"Sweden": "sweden",
@@ -13,6 +15,7 @@
"afrika": "africa",
"agata-kristi": "agatha-christie",
"agressiya": "agression",
"agressivnoe-povedenie": "agression",
"aktsii": "actions",
"aktsionizm": "actionism",
"alber-kamyu": "albert-kamus",
@@ -59,6 +62,7 @@
"artists": "artists",
"ateizm": "atheism",
"audiopoeziya": "audio-poetry",
"audio-poetry": "audio-poetry",
"audiospektakl": "audio-spectacles",
"auktsyon": "auktsyon",
"avangard": "avantgarde",
@@ -385,6 +389,8 @@
"martin-haydegger": "martin-hidegger",
"matematika": "maths",
"vladimir-mayakovskiy": "vladimir-mayakovsky",
"mayakovskiy": "vladimir-mayakovsky",
"ekzistentsiya": "existence",
"media": "media",
"medicine": "medicine",
"memuary": "memoirs",
@@ -738,6 +744,8 @@
"zakonodatelstvo": "laws",
"zakony-mira": "world-laws",
"zametki": "notes",
"zhelanie": "wish",
"konets-vesny": "end-of-spring",
"zhivotnye": "animals",
"zhoze-saramago": "jose-saramago",
"zigmund-freyd": "sigmund-freud",

View File

@@ -4,7 +4,7 @@ from orm.base import local_session
from orm import Topic, Community
from dateutil.parser import parse as date_parse
def migrate(entry):
def migrate(entry, topics_by_oid):
'''
type Topic {
slug: String! # ID
@@ -21,23 +21,30 @@ def migrate(entry):
ts = datetime.fromtimestamp(entry['createdAt']/1000)
topic_dict = {
'slug': entry['slug'],
'oid': entry['_id'],
# 'createdBy': entry['createdBy'],
# 'createdAt': ts,
'title': entry['title'].replace('&nbsp;', ' '), # .lower(),
'children': [],
'community' : Community.default_community.slug,
'body' : entry.get('description')
'body' : entry.get('description','').replace('&nbsp;', ' ')
}
try:
retopics = json.loads(open('migration/tables/replacements.json').read())
with local_session() as session:
topic = session.query(Topic).filter(Topic.slug == topic_dict['slug']).first()
if not topic:
topic = session.query(Topic).filter(Topic.title == topic_dict['title']).first()
if not topic:
slug = topics_by_oid.get(topic_dict['oid'], topic_dict)['slug']
if slug:
topic = session.query(Topic).filter(Topic.slug == slug).first()
if not topic:
del topic_dict['oid']
topic = Topic.create(**topic_dict)
else:
print(slug + ': ' + topic.title)
else:
print('not found topic: ' + slug)
raise Exception
except Exception as e:
print(e)
raise e
topic_dict['tag_id'] = entry['_id']
topic_dict['oid'] = entry['_id']
return topic_dict