Fix media normalization

This commit is contained in:
tonyrewin 2022-11-26 18:19:45 +03:00
parent 9ca9859563
commit 9a4cd6ba06
5 changed files with 15 additions and 64 deletions

View File

@ -314,9 +314,6 @@ async def handle_auto():
async def main():
if len(sys.argv) > 1:
cmd = sys.argv[1]
if type(cmd) == str:
print("[migration] command: " + cmd)
init_tables()
await handle_auto()
else:

View File

@ -4,7 +4,7 @@ from datetime import datetime, timezone
import frontmatter
from .extract import extract_html, prepare_html_body, extract_media
from .extract import extract_html, extract_media
from .utils import DateTimeEncoder
OLD_DATE = "2016-03-05 22:22:00.350000"
@ -50,11 +50,12 @@ def export_mdx(r):
def export_body(shout, storage):
entry = storage["content_items"]["by_oid"][shout["oid"]]
if entry:
shout["body"] = prepare_html_body(entry) # prepare_md_body(entry)
shout["media"] = extract_media(entry)
body = extract_html(entry)
media = extract_media(entry)
shout["body"] = body # prepare_html_body(entry) # prepare_md_body(entry)
shout["media"] = media
export_mdx(shout)
print("[export] html for %s" % shout["slug"])
body = extract_html(entry)
open(contentDir + shout["slug"] + ".html", "w").write(body)
else:
raise Exception("no content_items entry found")

View File

@ -3,6 +3,9 @@ import os
import re
import uuid
from bs4 import BeautifulSoup
TOOLTIP_REGEX = r"(\/\/\/(.+)\/\/\/)"
contentDir = os.path.join(
os.path.dirname(os.path.realpath(__file__)), "..", "..", "discoursio-web", "content"
@ -343,59 +346,7 @@ def prepare_html_body(entry):
def extract_html(entry):
body_orig = (entry.get("body") or "").replace('\(', '(').replace('\)', ')')
media = entry.get("media", [])
kind = entry.get("type") or ""
print("[extract] kind: " + kind)
mbodies = set([])
if media:
# print('[extract] media is found')
for m in media:
mbody = m.get("body", "")
addon = ""
if kind == "Literature":
mbody = m.get("literatureBody") or m.get("body", "")
elif kind == "Image":
cover = ""
if "thumborId" in entry:
cover = cdn + "/unsafe/1600x/" + entry["thumborId"]
if not cover:
if "image" in entry:
cover = entry["image"].get("url", "")
if "cloudinary" in cover:
cover = ""
# else: print('[extract] cover: ' + cover)
title = m.get("title", "").replace("\n", " ").replace(" ", " ")
u = m.get("thumborId") or cover or ""
if title:
addon += "<h4>" + title + "</h4>\n"
if not u.startswith("http"):
u = s3 + u
if not u:
print("[extract] no image url for " + str(m))
if "cloudinary" in u:
u = "img/lost.svg"
if u != cover or (u == cover and media.index(m) == 0):
addon += '<img src="' + u + '" alt="' + title + '" />\n'
if addon:
body_orig += addon
# print('[extract] item addon: ' + addon)
# if addon: print('[extract] addon: %s' % addon)
if mbody and mbody not in mbodies:
mbodies.add(mbody)
body_orig += mbody
if len(list(mbodies)) != len(media):
print(
"[extract] %d/%d media item bodies appended"
% (len(list(mbodies)), len(media))
)
# print('[extract] media items body: \n' + body_orig)
if not body_orig:
for up in entry.get("bodyHistory", []) or []:
body_orig = up.get("text", "") or ""
if body_orig:
print("[extract] got html body from history")
break
if not body_orig:
print("[extract] empty HTML body")
# body_html = str(BeautifulSoup(body_orig, features="html.parser"))
return body_orig
body_html = str(BeautifulSoup(body_orig, features="html.parser"))
return body_html

View File

@ -4,7 +4,7 @@ from dateutil.parser import parse as date_parse
from sqlalchemy.exc import IntegrityError
from transliterate import translit
from base.orm import local_session
from migration.extract import prepare_html_body
from migration.extract import extract_html, extract_media
from orm.reaction import Reaction, ReactionKind
from orm.shout import Shout, ShoutTopic, ShoutReactionsFollower
from orm.user import User
@ -195,7 +195,8 @@ async def migrate(entry, storage):
entry["cover"] = r["cover"]
# body
r["body"], media = prepare_html_body(entry)
r["body"] = extract_html(entry)
media = extract_media(entry)
if media:
r["media"] = json.dumps(media, ensure_ascii=True)
# save shout to db

View File

@ -1,5 +1,6 @@
from base.orm import local_session
from migration.extract import extract_md, html2text
from migration.extract import extract_md
from migration.html2text import html2text
from orm import Topic