Merge remote-tracking branch 'origin/main' into storages-to-qeuries

2022-11-28 13:54:22 +01:00
parent cfc1beeec0 96e4150e07
commit 70744966fa
30 changed files with 605 additions and 430 deletions
--- a/migration/init.py
+++ b/migration/init.py
@@ -314,9 +314,6 @@ async def handle_auto():

 async def main():
    if len(sys.argv) > 1:
-        cmd = sys.argv[1]
-        if type(cmd) == str:
-            print("[migration] command: " + cmd)
        init_tables()
        await handle_auto()
    else:
--- a/migration/export.py
+++ b/migration/export.py
@@ -4,7 +4,7 @@ from datetime import datetime, timezone

 import frontmatter

-from .extract import extract_html, prepare_html_body
+from .extract import extract_html, extract_media
 from .utils import DateTimeEncoder

 OLD_DATE = "2016-03-05 22:22:00.350000"
@@ -50,11 +50,12 @@ def export_mdx(r):
 def export_body(shout, storage):
    entry = storage["content_items"]["by_oid"][shout["oid"]]
    if entry:
-        shout["body"], media = prepare_html_body(entry)  # prepare_md_body(entry)
+        body = extract_html(entry)
+        media = extract_media(entry)
+        shout["body"] = body  # prepare_html_body(entry)  # prepare_md_body(entry)
        shout["media"] = media
        export_mdx(shout)
        print("[export] html for %s" % shout["slug"])
-        body, _media = extract_html(entry)
        open(contentDir + shout["slug"] + ".html", "w").write(body)
    else:
        raise Exception("no content_items entry found")
--- a/migration/extract.py
+++ b/migration/extract.py
@@ -3,7 +3,8 @@ import os
 import re
 import uuid

-from .html2text import html2text
+from bs4 import BeautifulSoup
+

 TOOLTIP_REGEX = r"(\/\/\/(.+)\/\/\/)"
 contentDir = os.path.join(
@@ -258,47 +259,44 @@ def extract_md(body, oid=""):
    return newbody


-def prepare_md_body(entry):
-    # body modifications
-    body = ""
+def extract_media(entry):
+    """Return entry["media"] normalized to a list of {url, pic, title, body} dicts."""
+    # Raises Exception("shout no layout") when the entry has no "type".
    kind = entry.get("type")
-    addon = ""
-    if kind == "Video":
-        addon = ""
-        for m in entry.get("media", []):
-            if "youtubeId" in m:
-                addon += "<VideoPlayer youtubeId='" + m["youtubeId"] + "' />\n"
+    if not kind:
+        print(entry)
+        raise Exception("shout no layout")
+    media = []
+    for m in entry.get("media") or []:
+        # title: flatten newlines/&nbsp;, prefix with the performer/artist if any
+        title = (m.get("title") or "").replace("\n", " ").replace("&nbsp;", " ")
+        artist = m.get("performer") or m.get("artist")
+        if artist:
+            title = artist + " - " + title
+
+        # explicit url first, then pic (thumbor CDN rendition, "" without thumborId)
+        url = m.get("fileUrl") or m.get("url", "")
+        pic = ""
+        if m.get("thumborId"):
+            pic = cdn + "/unsafe/1600x/" + m["thumborId"]
+
+        # url fallback: the picture itself for images, else a video page link
+        if not url:
+            if kind == "Image":
+                url = pic
+            elif "youtubeId" in m:
+                url = "https://youtube.com/watch?v=" + m["youtubeId"]
            elif "vimeoId" in m:
-                addon += "<VideoPlayer vimeoId='" + m["vimeoId"] + "' />\n"
-            else:
-                print("[extract] media is not supported")
-                print(m)
-        body = "import VideoPlayer from '$/components/Article/VideoPlayer'\n\n" + addon
-
-    elif kind == "Music":
-        addon = ""
-        for m in entry.get("media", []):
-            artist = m.get("performer")
-            trackname = ""
-            if artist:
-                trackname += artist + " - "
-            if "title" in m:
-                trackname += m.get("title", "")
-            addon += (
-                '<AudioPlayer src="'
-                + m.get("fileUrl", "")
-                + '" title="'
-                + trackname
-                + '" />\n'
-            )
-        body = "import AudioPlayer from '$/components/Article/AudioPlayer'\n\n" + addon
-
-    body_orig, media = extract_html(entry)
-    if body_orig:
-        body += extract_md(html2text(body_orig), entry["_id"])
-    if not body:
-        print("[extract] empty MDX body")
-    return body, media
+                url = "https://vimeo.com/" + m["vimeoId"]
+        # body
+        body = m.get("body") or m.get("literatureBody") or ""
+        media.append({
+            "url": url,
+            "pic": pic,
+            "title": title,
+            "body": body
+        })
+    return media


 def prepare_html_body(entry):
@@ -308,7 +306,7 @@ def prepare_html_body(entry):
    addon = ""
    if kind == "Video":
        addon = ""
-        for m in entry.get("media", []):
+        for m in entry.get("media") or []:
            if "youtubeId" in m:
                addon += '<iframe width="420" height="345" src="http://www.youtube.com/embed/'
                addon += m["youtubeId"]
@@ -325,7 +323,7 @@ def prepare_html_body(entry):

    elif kind == "Music":
        addon = ""
-        for m in entry.get("media", []):
+        for m in entry.get("media") or []:
            artist = m.get("performer")
            trackname = ""
            if artist:
@@ -339,68 +337,12 @@ def prepare_html_body(entry):
            addon += '"></audio></figure>'
        body += addon

-    body, media = extract_html(entry)
+    body = extract_html(entry)
    # if body_orig: body += extract_md(html2text(body_orig), entry['_id'])
-    if not body:
-        print("[extract] empty HTML body")
-    return body, media
+    return body


 def extract_html(entry):
    body_orig = (entry.get("body") or "").replace('\(', '(').replace('\)', ')')
-    media = entry.get("media", [])
-    kind = entry.get("type") or ""
-    print("[extract] kind: " + kind)
-    mbodies = set([])
-    if media:
-        # print('[extract] media is found')
-        for m in media:
-            mbody = m.get("body", "")
-            addon = ""
-            if kind == "Literature":
-                mbody = m.get("literatureBody") or m.get("body", "")
-            elif kind == "Image":
-                cover = ""
-                if "thumborId" in entry:
-                    cover = cdn + "/unsafe/1600x/" + entry["thumborId"]
-                if not cover:
-                    if "image" in entry:
-                        cover = entry["image"].get("url", "")
-                    if "cloudinary" in cover:
-                        cover = ""
-                # else: print('[extract] cover: ' + cover)
-                title = m.get("title", "").replace("\n", " ").replace("&nbsp;", " ")
-                u = m.get("thumborId") or cover or ""
-                if title:
-                    addon += "<h4>" + title + "</h4>\n"
-                if not u.startswith("http"):
-                    u = s3 + u
-                if not u:
-                    print("[extract] no image url for " + str(m))
-                if "cloudinary" in u:
-                    u = "img/lost.svg"
-                if u != cover or (u == cover and media.index(m) == 0):
-                    addon += '<img src="' + u + '" alt="' + title + '" />\n'
-            if addon:
-                body_orig += addon
-                # print('[extract] item addon: ' + addon)
-            # if addon: print('[extract] addon: %s' % addon)
-            if mbody and mbody not in mbodies:
-                mbodies.add(mbody)
-                body_orig += mbody
-        if len(list(mbodies)) != len(media):
-            print(
-                "[extract] %d/%d media item bodies appended"
-                % (len(list(mbodies)), len(media))
-            )
-        # print('[extract] media items body: \n' + body_orig)
-    if not body_orig:
-        for up in entry.get("bodyHistory", []) or []:
-            body_orig = up.get("text", "") or ""
-            if body_orig:
-                print("[extract] got html body from history")
-                break
-    if not body_orig:
-        print("[extract] empty HTML body")
-    # body_html = str(BeautifulSoup(body_orig, features="html.parser"))
-    return body_orig, media
+    body_html = str(BeautifulSoup(body_orig, features="html.parser"))
+    return body_html
--- a/migration/tables/content_items.py
+++ b/migration/tables/content_items.py
@@ -4,7 +4,7 @@ from dateutil.parser import parse as date_parse
 from sqlalchemy.exc import IntegrityError
 from transliterate import translit
 from base.orm import local_session
-from migration.extract import prepare_html_body
+from migration.extract import extract_html, extract_media
 from orm.reaction import Reaction, ReactionKind
 from orm.shout import Shout, ShoutTopic, ShoutReactionsFollower
 from orm.user import User
@@ -103,11 +103,11 @@ async def migrate(entry, storage):
        "authors": [],
        "topics": set([])
    }
-    topics_by_oid = storage["topics"]["by_oid"]
-    users_by_oid = storage["users"]["by_oid"]
+
    # author
-    oid = entry.get("createdBy", entry.get("_id", entry.get("oid")))
-    userdata = users_by_oid.get(oid)
+    users_by_oid = storage["users"]["by_oid"]
+    user_oid = entry.get("createdBy", "")
+    userdata = users_by_oid.get(user_oid)
    user = None
    if not userdata:
        app = entry.get("application")
@@ -139,6 +139,8 @@ async def migrate(entry, storage):
    # timestamps
    r["createdAt"] = date_parse(entry.get("createdAt", OLD_DATE))
    r["updatedAt"] = date_parse(entry["updatedAt"]) if "updatedAt" in entry else ts
+
+    # visibility
    if entry.get("published"):
        r["publishedAt"] = date_parse(entry.get("publishedAt", OLD_DATE))
        r["visibility"] = "public"
@@ -150,25 +152,67 @@ async def migrate(entry, storage):
            session.commit()
    else:
        r["visibility"] = "authors"
+
    if "deletedAt" in entry:
        r["deletedAt"] = date_parse(entry["deletedAt"])

    # topics
-    category = entry.get("category")
-    for oid in [category, ] + entry.get("tags", []):
-        t = storage["topics"]["by_oid"].get(oid)
-        if t:
-            tslug = storage["topics"]["by_oid"][oid]["slug"]
-            r["topics"].add(tslug)
-    r["topics"] = list(r["topics"])
-    # main topic
-    mt = topics_by_oid.get(category)
-    if mt and mt.get("slug"):
-        r["mainTopic"] = storage["replacements"].get(mt["slug"]) or r["topics"][0]
+    r['topics'] = await add_topics_follower(entry, storage, userslug)
+    r['mainTopic'] = r['topics'][0]

+    entry["topics"] = r["topics"]
+    entry["cover"] = r["cover"]
+
+    # body
+    r["body"] = extract_html(entry)
+    media = extract_media(entry)
+    if media:
+        r["media"] = json.dumps(media, ensure_ascii=True)
+
+    shout_dict = r.copy()
+
+    # user
+    user = await get_user(userslug, userdata, storage, user_oid)
+    shout_dict["authors"] = [user, ]
+    del shout_dict["topics"]
+    try:
+        # save shout to db
+        await create_shout(shout_dict, userslug)
+    except IntegrityError as e:
+        print(e)
+        await resolve_create_shout(shout_dict, userslug)
+    except Exception as e:
+        raise Exception(e)
+
+    # shout topics aftermath
+    shout_dict["topics"] = await topics_aftermath(r, storage)
+
+    # content_item ratings to reactions
+    await content_ratings_to_reactions(entry, shout_dict["slug"])
+
+    # shout views
+    await ViewedStorage.increment(shout_dict["slug"], amount=entry.get("views", 1))
+    # del shout_dict['ratings']
+
+    shout_dict["oid"] = entry.get("_id", "")
+    storage["shouts"]["by_oid"][entry["_id"]] = shout_dict
+    storage["shouts"]["by_slug"][slug] = shout_dict
+    return shout_dict
+
+
+async def add_topics_follower(entry, storage, userslug):
+    topics = set([])
+    category = entry.get("category")
+    topics_by_oid = storage["topics"]["by_oid"]
+    oids = [category, ] + entry.get("tags", [])
+    for toid in oids:
+        tslug = topics_by_oid.get(toid, {}).get("slug")
+        if tslug:
+            topics.add(tslug)
+    ttt = list(topics)
    # add author as TopicFollower
    with local_session() as session:
-        for tpc in r['topics']:
+        for tpc in topics:
            try:
                tf = session.query(
                    TopicFollower
@@ -184,24 +228,19 @@ async def migrate(entry, storage):
                        auto=True
                    )
                    session.add(tf)
+                    session.commit()
            except IntegrityError:
                print('[migration.shout] hidden by topic ' + tpc)
-                r["visibility"] = "authors"
-                r["publishedAt"] = None
-                r["topics"].remove(tpc)
+    # main topic
+    maintopic = storage["replacements"].get(topics_by_oid.get(category, {}).get("slug"))
+    if maintopic in ttt:
+        ttt.remove(maintopic)
+    ttt.insert(0, maintopic)
+    return ttt

-    entry["topics"] = r["topics"]
-    entry["cover"] = r["cover"]

-    # body
-    r["body"], media = prepare_html_body(entry)
-    if media:
-        r["media"] = json.dumps(media, ensure_ascii=True)
-    # save shout to db
-    s = object()
-    shout_dict = r.copy()
+async def get_user(userslug, userdata, storage, oid):
    user = None
-    del shout_dict["topics"]
    with local_session() as session:
        if not user and userslug:
            user = session.query(User).filter(User.slug == userslug).first()
@@ -216,60 +255,56 @@ async def migrate(entry, storage):
            userdata["id"] = user.id
            userdata["createdAt"] = user.createdAt
            storage["users"]["by_slug"][userdata["slug"]] = userdata
-            storage["users"]["by_oid"][entry["_id"]] = userdata
-
+            storage["users"]["by_oid"][oid] = userdata
    if not user:
        raise Exception("could not get a user")
-    shout_dict["authors"] = [user, ]
-    try:
-        await create_shout(shout_dict, userslug)
-    except IntegrityError as e:
-        with local_session() as session:
-            s = session.query(Shout).filter(Shout.slug == shout_dict["slug"]).first()
-            bump = False
-            if s:
-                if s.authors[0] != userslug:
-                    # create new with different slug
-                    shout_dict["slug"] += '-' + shout_dict["layout"]
-                    try:
-                        await create_shout(shout_dict, userslug)
-                    except IntegrityError as e:
-                        print(e)
-                        bump = True
-                else:
-                    # update old
-                    for key in shout_dict:
-                        if key in s.__dict__:
-                            if s.__dict__[key] != shout_dict[key]:
-                                print(
-                                    "[migration] shout already exists, but differs in %s"
-                                    % key
-                                )
-                                bump = True
-                        else:
-                            print("[migration] shout already exists, but lacks %s" % key)
-                            bump = True
-                    if bump:
-                        s.update(shout_dict)
-            else:
-                print("[migration] something went wrong with shout: \n%r" % shout_dict)
-                raise e
-            session.commit()
-    except Exception as e:
-        print(e)
-        print(s)
-        raise Exception
+    return user

-    # shout topics aftermath
-    shout_dict["topics"] = []
-    for tpc in r["topics"]:
+
+async def resolve_create_shout(shout_dict, userslug):
+    """Resolve a slug clash: retry as "<slug>-<layout>" or update the stored shout.
+
+    Mutates shout_dict["slug"] on retry; raises Exception if no shout has the slug."""
+    with local_session() as session:
+        s = session.query(Shout).filter(Shout.slug == shout_dict["slug"]).first()
+        bump = False
+        if s:
+            if s.authors[0] != userslug:
+                # create new with different slug
+                shout_dict["slug"] += '-' + shout_dict["layout"]
+                try:
+                    await create_shout(shout_dict, userslug)
+                except IntegrityError as e:
+                    print(e)
+                    bump = True
+            else:
+                # update old
+                for key in shout_dict:
+                    if key in s.__dict__:
+                        if s.__dict__[key] != shout_dict[key]:
+                            print("[migration] shout already exists, but differs in %s" % key)
+                            bump = True
+                    else:
+                        print("[migration] shout already exists, but lacks %s" % key)
+                        bump = True
+                if bump:
+                    s.update(shout_dict)
+        else:
+            print("[migration] something went wrong with shout: \n%r" % shout_dict)
+            raise Exception("[migration] no shout found for slug %s" % shout_dict["slug"])
+        session.commit()
+
+
+async def topics_aftermath(entry, storage):
+    r = []
+    for tpc in filter(lambda x: bool(x), entry["topics"]):
        oldslug = tpc
        newslug = storage["replacements"].get(oldslug, oldslug)
        if newslug:
            with local_session() as session:
                shout_topic_old = (
                    session.query(ShoutTopic)
-                    .filter(ShoutTopic.shout == shout_dict["slug"])
+                    .filter(ShoutTopic.shout == entry["slug"])
                    .filter(ShoutTopic.topic == oldslug)
                    .first()
                )
@@ -278,25 +313,27 @@ async def migrate(entry, storage):
                else:
                    shout_topic_new = (
                        session.query(ShoutTopic)
-                        .filter(ShoutTopic.shout == shout_dict["slug"])
+                        .filter(ShoutTopic.shout == entry["slug"])
                        .filter(ShoutTopic.topic == newslug)
                        .first()
                    )
                    if not shout_topic_new:
                        try:
                            ShoutTopic.create(
-                                **{"shout": shout_dict["slug"], "topic": newslug}
+                                **{"shout": entry["slug"], "topic": newslug}
                            )
                        except Exception:
                            print("[migration] shout topic error: " + newslug)
                session.commit()
-            if newslug not in shout_dict["topics"]:
-                shout_dict["topics"].append(newslug)
+            if newslug not in r:
+                r.append(newslug)
        else:
            print("[migration] ignored topic slug: \n%r" % tpc["slug"])
            # raise Exception
+    return r

-    # content_item ratings to reactions
+
+async def content_ratings_to_reactions(entry, slug):
    try:
        with local_session() as session:
            for content_rating in entry.get("ratings", []):
@@ -316,7 +353,7 @@ async def migrate(entry, storage):
                        if content_rating["value"] > 0
                        else ReactionKind.DISLIKE,
                        "createdBy": reactedBy.slug,
-                        "shout": shout_dict["slug"],
+                        "shout": slug,
                    }
                    cts = content_rating.get("createdAt")
                    if cts:
@@ -340,11 +377,3 @@ async def migrate(entry, storage):
            session.commit()
    except Exception:
        raise Exception("[migration] content_item.ratings error: \n%r" % content_rating)
-
-    # shout views
-    await ViewedStorage.increment(shout_dict["slug"], amount=entry.get("views", 1))
-    # del shout_dict['ratings']
-    shout_dict["oid"] = entry.get("_id")
-    storage["shouts"]["by_oid"][entry["_id"]] = shout_dict
-    storage["shouts"]["by_slug"][slug] = shout_dict
-    return shout_dict
--- a/migration/tables/replacements.json
+++ b/migration/tables/replacements.json
@@ -547,6 +547,7 @@
    "poetry-slam": "poetry-slam",
    "pokoy": "peace",
    "police": "police",
+    "politicheskoe-fentezi": "political-fantasy",
    "politics": "politics",
    "politzaklyuchennye": "political-prisoners",
    "polsha": "poland",
--- a/migration/tables/topics.py
+++ b/migration/tables/topics.py
@@ -1,5 +1,6 @@
 from base.orm import local_session
-from migration.extract import extract_md, html2text
+from migration.extract import extract_md
+from migration.html2text import html2text
 from orm import Topic


--- a/migration/tables/users.py
+++ b/migration/tables/users.py
@@ -17,7 +17,7 @@ def migrate(entry):
        "username": email,
        "email": email,
        "createdAt": parse(entry["createdAt"]),
-        "emailConfirmed": bool(entry["emails"][0]["verified"]),
+        "emailConfirmed": ("@discours.io" in email) or bool(entry["emails"][0]["verified"]),
        "muted": False,  # amnesty
        "bio": entry["profile"].get("bio", ""),
        "notifications": [],