Fix media normalization

2022-11-26 18:19:45 +03:00
parent 9ca9859563
commit 9a4cd6ba06
5 changed files with 15 additions and 64 deletions
--- a/migration/init.py
+++ b/migration/init.py
@@ -314,9 +314,6 @@ async def handle_auto():
 async def main():
    if len(sys.argv) > 1:
        cmd = sys.argv[1]
        if type(cmd) == str:
            print("[migration] command: " + cmd)
        init_tables()
        await handle_auto()
    else:
--- a/migration/export.py
+++ b/migration/export.py
@@ -4,7 +4,7 @@ from datetime import datetime, timezone
 import frontmatter
-from .extract import extract_html, prepare_html_body, extract_media
+from .extract import extract_html, extract_media
 from .utils import DateTimeEncoder
 OLD_DATE = "2016-03-05 22:22:00.350000"
@@ -50,11 +50,12 @@ def export_mdx(r):
 def export_body(shout, storage):
    entry = storage["content_items"]["by_oid"][shout["oid"]]
    if entry:
-        shout["body"] = prepare_html_body(entry)  # prepare_md_body(entry)
+        body = extract_html(entry)
-        shout["media"] = extract_media(entry)
+        media = extract_media(entry)
        shout["body"] = body  # prepare_html_body(entry)  # prepare_md_body(entry)
        shout["media"] = media
        export_mdx(shout)
        print("[export] html for %s" % shout["slug"])
        body = extract_html(entry)
        open(contentDir + shout["slug"] + ".html", "w").write(body)
    else:
        raise Exception("no content_items entry found")
--- a/migration/extract.py
+++ b/migration/extract.py
@@ -3,6 +3,9 @@ import os
 import re
 import uuid
 from bs4 import BeautifulSoup
 TOOLTIP_REGEX = r"(\/\/\/(.+)\/\/\/)"
 contentDir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "..", "..", "discoursio-web", "content"
@@ -343,59 +346,7 @@ def prepare_html_body(entry):
 def extract_html(entry):
    body_orig = (entry.get("body") or "").replace('\(', '(').replace('\)', ')')
    media = entry.get("media", [])
    kind = entry.get("type") or ""
    print("[extract] kind: " + kind)
    mbodies = set([])
    if media:
        # print('[extract] media is found')
        for m in media:
            mbody = m.get("body", "")
            addon = ""
            if kind == "Literature":
                mbody = m.get("literatureBody") or m.get("body", "")
            elif kind == "Image":
                cover = ""
                if "thumborId" in entry:
                    cover = cdn + "/unsafe/1600x/" + entry["thumborId"]
                if not cover:
                    if "image" in entry:
                        cover = entry["image"].get("url", "")
                    if "cloudinary" in cover:
                        cover = ""
                # else: print('[extract] cover: ' + cover)
                title = m.get("title", "").replace("\n", " ").replace("&nbsp;", " ")
                u = m.get("thumborId") or cover or ""
                if title:
                    addon += "<h4>" + title + "</h4>\n"
                if not u.startswith("http"):
                    u = s3 + u
                if not u:
                    print("[extract] no image url for " + str(m))
                if "cloudinary" in u:
                    u = "img/lost.svg"
                if u != cover or (u == cover and media.index(m) == 0):
                    addon += '<img src="' + u + '" alt="' + title + '" />\n'
            if addon:
                body_orig += addon
                # print('[extract] item addon: ' + addon)
            # if addon: print('[extract] addon: %s' % addon)
            if mbody and mbody not in mbodies:
                mbodies.add(mbody)
                body_orig += mbody
        if len(list(mbodies)) != len(media):
            print(
                "[extract] %d/%d media item bodies appended"
                % (len(list(mbodies)), len(media))
            )
        # print('[extract] media items body: \n' + body_orig)
    if not body_orig:
        for up in entry.get("bodyHistory", []) or []:
            body_orig = up.get("text", "") or ""
            if body_orig:
                print("[extract] got html body from history")
                break
    if not body_orig:
        print("[extract] empty HTML body")
-    # body_html = str(BeautifulSoup(body_orig, features="html.parser"))
+    body_html = str(BeautifulSoup(body_orig, features="html.parser"))
-    return body_orig
+    return body_html
--- a/migration/tables/content_items.py
+++ b/migration/tables/content_items.py
@@ -4,7 +4,7 @@ from dateutil.parser import parse as date_parse
 from sqlalchemy.exc import IntegrityError
 from transliterate import translit
 from base.orm import local_session
-from migration.extract import prepare_html_body
+from migration.extract import extract_html, extract_media
 from orm.reaction import Reaction, ReactionKind
 from orm.shout import Shout, ShoutTopic, ShoutReactionsFollower
 from orm.user import User
@@ -195,7 +195,8 @@ async def migrate(entry, storage):
    entry["cover"] = r["cover"]
    # body
-    r["body"], media = prepare_html_body(entry)
+    r["body"] = extract_html(entry)
    media = extract_media(entry)
    if media:
        r["media"] = json.dumps(media, ensure_ascii=True)
    # save shout to db
--- a/migration/tables/topics.py
+++ b/migration/tables/topics.py
@@ -1,5 +1,6 @@
 from base.orm import local_session
-from migration.extract import extract_md, html2text
+from migration.extract import extract_md
 from migration.html2text import html2text
 from orm import Topic