normalize media fixed

This commit is contained in:
tonyrewin 2022-11-26 18:19:45 +03:00
parent 9ca9859563
commit 9a4cd6ba06
5 changed files with 15 additions and 64 deletions

View File

@ -314,9 +314,6 @@ async def handle_auto():
async def main(): async def main():
if len(sys.argv) > 1: if len(sys.argv) > 1:
cmd = sys.argv[1]
if type(cmd) == str:
print("[migration] command: " + cmd)
init_tables() init_tables()
await handle_auto() await handle_auto()
else: else:

View File

@ -4,7 +4,7 @@ from datetime import datetime, timezone
import frontmatter import frontmatter
from .extract import extract_html, prepare_html_body, extract_media from .extract import extract_html, extract_media
from .utils import DateTimeEncoder from .utils import DateTimeEncoder
OLD_DATE = "2016-03-05 22:22:00.350000" OLD_DATE = "2016-03-05 22:22:00.350000"
@ -50,11 +50,12 @@ def export_mdx(r):
def export_body(shout, storage): def export_body(shout, storage):
entry = storage["content_items"]["by_oid"][shout["oid"]] entry = storage["content_items"]["by_oid"][shout["oid"]]
if entry: if entry:
shout["body"] = prepare_html_body(entry) # prepare_md_body(entry) body = extract_html(entry)
shout["media"] = extract_media(entry) media = extract_media(entry)
shout["body"] = body # prepare_html_body(entry) # prepare_md_body(entry)
shout["media"] = media
export_mdx(shout) export_mdx(shout)
print("[export] html for %s" % shout["slug"]) print("[export] html for %s" % shout["slug"])
body = extract_html(entry)
open(contentDir + shout["slug"] + ".html", "w").write(body) open(contentDir + shout["slug"] + ".html", "w").write(body)
else: else:
raise Exception("no content_items entry found") raise Exception("no content_items entry found")

View File

@ -3,6 +3,9 @@ import os
import re import re
import uuid import uuid
from bs4 import BeautifulSoup
TOOLTIP_REGEX = r"(\/\/\/(.+)\/\/\/)" TOOLTIP_REGEX = r"(\/\/\/(.+)\/\/\/)"
contentDir = os.path.join( contentDir = os.path.join(
os.path.dirname(os.path.realpath(__file__)), "..", "..", "discoursio-web", "content" os.path.dirname(os.path.realpath(__file__)), "..", "..", "discoursio-web", "content"
@ -343,59 +346,7 @@ def prepare_html_body(entry):
def extract_html(entry): def extract_html(entry):
body_orig = (entry.get("body") or "").replace('\(', '(').replace('\)', ')') body_orig = (entry.get("body") or "").replace('\(', '(').replace('\)', ')')
media = entry.get("media", [])
kind = entry.get("type") or ""
print("[extract] kind: " + kind)
mbodies = set([])
if media:
# print('[extract] media is found')
for m in media:
mbody = m.get("body", "")
addon = ""
if kind == "Literature":
mbody = m.get("literatureBody") or m.get("body", "")
elif kind == "Image":
cover = ""
if "thumborId" in entry:
cover = cdn + "/unsafe/1600x/" + entry["thumborId"]
if not cover:
if "image" in entry:
cover = entry["image"].get("url", "")
if "cloudinary" in cover:
cover = ""
# else: print('[extract] cover: ' + cover)
title = m.get("title", "").replace("\n", " ").replace(" ", " ")
u = m.get("thumborId") or cover or ""
if title:
addon += "<h4>" + title + "</h4>\n"
if not u.startswith("http"):
u = s3 + u
if not u:
print("[extract] no image url for " + str(m))
if "cloudinary" in u:
u = "img/lost.svg"
if u != cover or (u == cover and media.index(m) == 0):
addon += '<img src="' + u + '" alt="' + title + '" />\n'
if addon:
body_orig += addon
# print('[extract] item addon: ' + addon)
# if addon: print('[extract] addon: %s' % addon)
if mbody and mbody not in mbodies:
mbodies.add(mbody)
body_orig += mbody
if len(list(mbodies)) != len(media):
print(
"[extract] %d/%d media item bodies appended"
% (len(list(mbodies)), len(media))
)
# print('[extract] media items body: \n' + body_orig)
if not body_orig:
for up in entry.get("bodyHistory", []) or []:
body_orig = up.get("text", "") or ""
if body_orig:
print("[extract] got html body from history")
break
if not body_orig: if not body_orig:
print("[extract] empty HTML body") print("[extract] empty HTML body")
# body_html = str(BeautifulSoup(body_orig, features="html.parser")) body_html = str(BeautifulSoup(body_orig, features="html.parser"))
return body_orig return body_html

View File

@ -4,7 +4,7 @@ from dateutil.parser import parse as date_parse
from sqlalchemy.exc import IntegrityError from sqlalchemy.exc import IntegrityError
from transliterate import translit from transliterate import translit
from base.orm import local_session from base.orm import local_session
from migration.extract import prepare_html_body from migration.extract import extract_html, extract_media
from orm.reaction import Reaction, ReactionKind from orm.reaction import Reaction, ReactionKind
from orm.shout import Shout, ShoutTopic, ShoutReactionsFollower from orm.shout import Shout, ShoutTopic, ShoutReactionsFollower
from orm.user import User from orm.user import User
@ -195,7 +195,8 @@ async def migrate(entry, storage):
entry["cover"] = r["cover"] entry["cover"] = r["cover"]
# body # body
r["body"], media = prepare_html_body(entry) r["body"] = extract_html(entry)
media = extract_media(entry)
if media: if media:
r["media"] = json.dumps(media, ensure_ascii=True) r["media"] = json.dumps(media, ensure_ascii=True)
# save shout to db # save shout to db

View File

@ -1,5 +1,6 @@
from base.orm import local_session from base.orm import local_session
from migration.extract import extract_md, html2text from migration.extract import extract_md
from migration.html2text import html2text
from orm import Topic from orm import Topic