Fix media normalization

This commit is contained in:
tonyrewin 2022-11-26 18:19:45 +03:00
parent 9ca9859563
commit 9a4cd6ba06
5 changed files with 15 additions and 64 deletions

View File

@ -314,9 +314,6 @@ async def handle_auto():
async def main():
if len(sys.argv) > 1:
cmd = sys.argv[1]
if type(cmd) == str:
print("[migration] command: " + cmd)
init_tables()
await handle_auto()
else:

View File

@ -4,7 +4,7 @@ from datetime import datetime, timezone
import frontmatter
from .extract import extract_html, prepare_html_body, extract_media
from .extract import extract_html, extract_media
from .utils import DateTimeEncoder
OLD_DATE = "2016-03-05 22:22:00.350000"
@ -50,11 +50,12 @@ def export_mdx(r):
def export_body(shout, storage):
entry = storage["content_items"]["by_oid"][shout["oid"]]
if entry:
shout["body"] = prepare_html_body(entry) # prepare_md_body(entry)
shout["media"] = extract_media(entry)
body = extract_html(entry)
media = extract_media(entry)
shout["body"] = body # prepare_html_body(entry) # prepare_md_body(entry)
shout["media"] = media
export_mdx(shout)
print("[export] html for %s" % shout["slug"])
body = extract_html(entry)
open(contentDir + shout["slug"] + ".html", "w").write(body)
else:
raise Exception("no content_items entry found")

View File

@ -3,6 +3,9 @@ import os
import re
import uuid
from bs4 import BeautifulSoup
TOOLTIP_REGEX = r"(\/\/\/(.+)\/\/\/)"
contentDir = os.path.join(
os.path.dirname(os.path.realpath(__file__)), "..", "..", "discoursio-web", "content"
@ -343,59 +346,7 @@ def prepare_html_body(entry):
def extract_html(entry):
body_orig = (entry.get("body") or "").replace('\(', '(').replace('\)', ')')
media = entry.get("media", [])
kind = entry.get("type") or ""
print("[extract] kind: " + kind)
mbodies = set([])
if media:
# print('[extract] media is found')
for m in media:
mbody = m.get("body", "")
addon = ""
if kind == "Literature":
mbody = m.get("literatureBody") or m.get("body", "")
elif kind == "Image":
cover = ""
if "thumborId" in entry:
cover = cdn + "/unsafe/1600x/" + entry["thumborId"]
if not cover:
if "image" in entry:
cover = entry["image"].get("url", "")
if "cloudinary" in cover:
cover = ""
# else: print('[extract] cover: ' + cover)
title = m.get("title", "").replace("\n", " ").replace(" ", " ")
u = m.get("thumborId") or cover or ""
if title:
addon += "<h4>" + title + "</h4>\n"
if not u.startswith("http"):
u = s3 + u
if not u:
print("[extract] no image url for " + str(m))
if "cloudinary" in u:
u = "img/lost.svg"
if u != cover or (u == cover and media.index(m) == 0):
addon += '<img src="' + u + '" alt="' + title + '" />\n'
if addon:
body_orig += addon
# print('[extract] item addon: ' + addon)
# if addon: print('[extract] addon: %s' % addon)
if mbody and mbody not in mbodies:
mbodies.add(mbody)
body_orig += mbody
if len(list(mbodies)) != len(media):
print(
"[extract] %d/%d media item bodies appended"
% (len(list(mbodies)), len(media))
)
# print('[extract] media items body: \n' + body_orig)
if not body_orig:
for up in entry.get("bodyHistory", []) or []:
body_orig = up.get("text", "") or ""
if body_orig:
print("[extract] got html body from history")
break
if not body_orig:
print("[extract] empty HTML body")
# body_html = str(BeautifulSoup(body_orig, features="html.parser"))
return body_orig
body_html = str(BeautifulSoup(body_orig, features="html.parser"))
return body_html

View File

@ -4,7 +4,7 @@ from dateutil.parser import parse as date_parse
from sqlalchemy.exc import IntegrityError
from transliterate import translit
from base.orm import local_session
from migration.extract import prepare_html_body
from migration.extract import extract_html, extract_media
from orm.reaction import Reaction, ReactionKind
from orm.shout import Shout, ShoutTopic, ShoutReactionsFollower
from orm.user import User
@ -195,7 +195,8 @@ async def migrate(entry, storage):
entry["cover"] = r["cover"]
# body
r["body"], media = prepare_html_body(entry)
r["body"] = extract_html(entry)
media = extract_media(entry)
if media:
r["media"] = json.dumps(media, ensure_ascii=True)
# save shout to db

View File

@ -1,5 +1,6 @@
from base.orm import local_session
from migration.extract import extract_md, html2text
from migration.extract import extract_md
from migration.html2text import html2text
from orm import Topic