fixing-wip

This commit is contained in:
2023-01-17 22:56:48 +03:00
parent b966ce6c24
commit 82c6236a7f
8 changed files with 106 additions and 99 deletions

View File

@@ -27,6 +27,39 @@ def replace_tooltips(body):
return newbody
def extract_footnotes(body, shout_dict):
    """Pull ``&&&``-delimited footnote links out of *body* and store them as reactions.

    *body* is split on the ``&&&`` marker.  When the number of chunks is odd
    and greater than one, every odd-indexed chunk is treated as a footnote
    candidate.  A candidate containing an ``<a class="footnote-url" href="...">``
    anchor is saved as a ``ReactionKind.FOOTNOTE`` reaction on the shout and
    replaced in the output with an empty ``<a href='#'></a>`` placeholder.
    All other chunks are kept unchanged.

    Args:
        body: text that may contain ``&&&``-delimited footnotes.
        shout_dict: mapping describing the shout; only ``shout_dict['id']``
            is read.

    Returns:
        A ``(new_body, placed)`` tuple.  ``new_body`` is the chunks joined
        back together without the ``&&&`` markers.  ``placed`` is True as soon
        as at least one odd-indexed chunk was seen, whether or not it held a
        footnote anchor.
    """
    sep = "&&&"
    fn = 'a class="footnote-url" href="'
    parts = body.split(sep)
    total = len(parts)
    newparts = list(parts)
    placed = False

    # An even chunk count means the markers are unbalanced; a single chunk
    # means there are no markers at all.  Either way nothing is extracted.
    if not (total & 1 and total > 1):
        return ("".join(newparts), placed)

    print("[extract] found %d footnotes in body" % (total - 1))
    # Offset of the current chunk inside ``body``; positions are derived from
    # it so repeated links or texts never resolve to an earlier occurrence.
    offset = len(parts[0]) + len(sep)
    for i, part in enumerate(parts[1:], start=1):
        part_offset = offset
        offset += len(part) + len(sep)
        if not i & 1:
            continue
        placed = True
        if fn not in part:
            continue

        print("[extract] footnote: " + part)
        href_at = part.index(fn)
        tail = part[href_at + len(fn):]
        extracted_link = tail.split('"', 1)[0]
        _, tag_end, after_tag = tail.partition('>')
        if not tag_end:
            # Opening tag is never closed: not a usable anchor, keep the chunk.
            continue
        extracted_body, closing, _ = after_tag.partition('</a>')
        print("[extract] footnote link: " + extracted_link)

        # Range covers the whole anchor element: from its "<" up to and
        # including the closing "</a>" (when one is present).
        text_at = len(part) - len(after_tag)
        range_start = part_offset + href_at - len('<')
        range_end = part_offset + text_at + len(extracted_body) + len(closing)

        with local_session() as session:
            # Keyword arguments plus an explicit commit, as done for the
            # other Reaction.create call in this migration.
            Reaction.create(**{
                "shout": shout_dict['id'],
                "kind": ReactionKind.FOOTNOTE,
                "body": extracted_body,
                "range": str(range_start) + ':' + str(range_end),
            })
            session.commit()
        newparts[i] = "<a href='#'></a>"

    return ("".join(newparts), placed)
def place_tooltips(body):
parts = body.split("&&&")
lll = len(parts)
@@ -203,7 +236,7 @@ def extract_dataimages(parts, prefix):
di = "data:image"
def extract_md_images(body, oid):
def extract_md_images(body, prefix):
newbody = ""
body = (
body.replace("\n! [](" + di, "\n ![](" + di)
@@ -212,7 +245,7 @@ def extract_md_images(body, oid):
)
parts = body.split(di)
if len(parts) > 1:
newbody = extract_dataimages(parts, oid)
newbody = extract_dataimages(parts, prefix)
else:
newbody = body
return newbody
@@ -238,24 +271,24 @@ def cleanup(body):
return newbody
def extract_md(body, shout_dict=None):
    """Run the markdown post-processing pipeline over *body*.

    The body is first passed through ``cleanup``.  When *shout_dict* is
    given, inline images are extracted with ``extract_md_images`` (keyed by
    ``shout_dict['id']``, or a fresh UUID when that id is falsy) and footnotes
    are extracted with ``extract_footnotes``.  Each stage receives the output
    of the previous one.

    Args:
        body: markdown text; a falsy value is returned unchanged.
        shout_dict: optional mapping for the owning shout; ``['id']`` is read
            here and the mapping is forwarded to ``extract_footnotes``.

    Returns:
        The processed body.  It is prefixed with the Tooltip import line when
        ``extract_footnotes`` reports that something was placed.

    Raises:
        Exception: when any stage returns an empty result.
    """
    newbody = body
    if not newbody:
        return newbody

    newbody = cleanup(newbody)
    if not newbody:
        raise Exception("cleanup error")

    # Stays False when there is no shout, so the check below never touches
    # an unbound name.
    placed = False
    if shout_dict:
        uid = shout_dict['id'] or uuid.uuid4()
        newbody = extract_md_images(newbody, uid)
        if not newbody:
            raise Exception("extract_images error")
        # Feed the already processed text forward; passing the raw ``body``
        # here would throw away the cleanup and image-extraction results.
        newbody, placed = extract_footnotes(newbody, shout_dict)
        if not newbody:
            raise Exception("extract_footnotes error")

    if placed:
        newbody = "import Tooltip from '$/components/Article/Tooltip'\n\n" + newbody
    return newbody
@@ -342,7 +375,9 @@ def prepare_html_body(entry):
return body
def extract_html(entry, shout_id=None):
    """Return the entry body as HTML normalised by BeautifulSoup.

    Backslash-escaped parentheses (``\\(`` and ``\\)``) in the body are
    unescaped first.  When *shout_id* is truthy the body is also passed to
    ``extract_footnotes``.

    Args:
        entry: mapping with an optional ``"body"`` key; a missing or falsy
            body is treated as an empty string.
        shout_id: optional shout reference.  Either a bare id or a mapping
            that already carries an ``"id"`` key is accepted.

    Returns:
        ``str`` of the body parsed with the ``html.parser`` backend.
    """
    # '\\(' is the same two-character string the original spelled as '\(',
    # written without relying on an invalid escape sequence.
    body_orig = (entry.get("body") or "").replace('\\(', '(').replace('\\)', ')')
    if shout_id:
        # extract_footnotes looks the shout id up under the 'id' key, so a
        # bare id is wrapped into a minimal mapping before the call.
        shout_ref = shout_id if isinstance(shout_id, dict) else {"id": shout_id}
        # NOTE(review): the rewritten body returned by extract_footnotes is
        # discarded here, so only its side effects apply - confirm intended.
        extract_footnotes(body_orig, shout_ref)
    body_html = str(BeautifulSoup(body_orig, features="html.parser"))
    return body_html

View File

@@ -1,31 +1,42 @@
from base.orm import local_session
from migration.extract import extract_md
from migration.html2text import html2text
from orm.remark import Remark
from orm.reaction import Reaction, ReactionKind
def migrate(entry, storage):
    """Convert a legacy remark *entry* into a ``ReactionKind.REMARK`` reaction.

    The target shout is looked up in ``storage['shouts']['by_oid']`` by
    ``entry['contentItem']``.  When the entry has a ``textBefore`` value, a
    ``"start:end"`` range is added: ``start`` is the position of
    ``textBefore`` in the shout body and ``end`` is the position just past
    ``textAfter``.

    Args:
        entry: legacy remark record; ``contentItem`` and ``body`` are
            required, ``textBefore`` and ``textAfter`` are optional.
        storage: migration storage holding the ``shouts`` / ``by_oid`` index.

    Returns:
        The created reaction with ``_sa_instance_state`` removed, or ``None``
        when no shout is registered for the entry.
    """
    post_oid = entry['contentItem']
    print(post_oid)
    shout_dict = storage['shouts']['by_oid'].get(post_oid)
    if not shout_dict:
        return None

    shout_body = shout_dict['body']
    print(shout_body)
    remark = {
        "shout": shout_dict['id'],
        "body": extract_md(html2text(entry['body']), shout_dict),
        "kind": ReactionKind.REMARK,
    }

    text_before = entry.get('textBefore')
    if text_before:
        text_after = entry.get('textAfter') or ''
        # find() instead of index(): a quote that no longer matches the shout
        # body should cost the range, not abort the whole migration run.
        start = shout_body.find(text_before)
        after_at = shout_body.find(text_after)
        if start == -1 or after_at == -1:
            print("[migration] remark range skipped, quoted text not found: %s" % post_oid)
        else:
            remark['range'] = str(start) + ':' + str(after_at + len(text_after))

    with local_session() as session:
        rmrk = Reaction.create(**remark)
        session.commit()
        del rmrk["_sa_instance_state"]
        return rmrk

View File

@@ -10,7 +10,7 @@ def migrate(entry):
"slug": entry["slug"],
"oid": entry["_id"],
"title": entry["title"].replace("&nbsp;", " "),
"body": extract_md(html2text(body_orig), entry["_id"])
"body": extract_md(html2text(body_orig))
}
with local_session() as session: