Merge remote-tracking branch 'origin/main' into storages-to-qeuries

This commit is contained in:
Igor Lobanov
2022-11-28 13:54:22 +01:00
30 changed files with 605 additions and 430 deletions

View File

@@ -314,9 +314,6 @@ async def handle_auto():
async def main():
if len(sys.argv) > 1:
cmd = sys.argv[1]
if type(cmd) == str:
print("[migration] command: " + cmd)
init_tables()
await handle_auto()
else:

View File

@@ -4,7 +4,7 @@ from datetime import datetime, timezone
import frontmatter
from .extract import extract_html, prepare_html_body
from .extract import extract_html, extract_media
from .utils import DateTimeEncoder
OLD_DATE = "2016-03-05 22:22:00.350000"
@@ -50,11 +50,12 @@ def export_mdx(r):
def export_body(shout, storage):
entry = storage["content_items"]["by_oid"][shout["oid"]]
if entry:
shout["body"], media = prepare_html_body(entry) # prepare_md_body(entry)
body = extract_html(entry)
media = extract_media(entry)
shout["body"] = body # prepare_html_body(entry) # prepare_md_body(entry)
shout["media"] = media
export_mdx(shout)
print("[export] html for %s" % shout["slug"])
body, _media = extract_html(entry)
open(contentDir + shout["slug"] + ".html", "w").write(body)
else:
raise Exception("no content_items entry found")

View File

@@ -3,7 +3,8 @@ import os
import re
import uuid
from .html2text import html2text
from bs4 import BeautifulSoup
TOOLTIP_REGEX = r"(\/\/\/(.+)\/\/\/)"
contentDir = os.path.join(
@@ -258,47 +259,44 @@ def extract_md(body, oid=""):
return newbody
def prepare_md_body(entry):
# body modifications
body = ""
def extract_media(entry):
''' normalized media extraction method '''
# media [ { title pic url body } ]}
kind = entry.get("type")
addon = ""
if kind == "Video":
addon = ""
for m in entry.get("media", []):
if "youtubeId" in m:
addon += "<VideoPlayer youtubeId='" + m["youtubeId"] + "' />\n"
if not kind:
print(entry)
raise Exception("shout no layout")
media = []
for m in entry.get("media") or []:
# title
title = m.get("title", "").replace("\n", " ").replace("&nbsp;", " ")
artist = m.get("performer") or m.get("artist")
if artist:
title = artist + " - " + title
# pic
url = m.get("fileUrl") or m.get("url", "")
pic = ""
if m.get("thumborId"):
pic = cdn + "/unsafe/1600x/" + m["thumborId"]
# url
if not url:
if kind == "Image":
url = pic
elif "youtubeId" in m:
url = "https://youtube.com/?watch=" + m["youtubeId"]
elif "vimeoId" in m:
addon += "<VideoPlayer vimeoId='" + m["vimeoId"] + "' />\n"
else:
print("[extract] media is not supported")
print(m)
body = "import VideoPlayer from '$/components/Article/VideoPlayer'\n\n" + addon
elif kind == "Music":
addon = ""
for m in entry.get("media", []):
artist = m.get("performer")
trackname = ""
if artist:
trackname += artist + " - "
if "title" in m:
trackname += m.get("title", "")
addon += (
'<AudioPlayer src="'
+ m.get("fileUrl", "")
+ '" title="'
+ trackname
+ '" />\n'
)
body = "import AudioPlayer from '$/components/Article/AudioPlayer'\n\n" + addon
body_orig, media = extract_html(entry)
if body_orig:
body += extract_md(html2text(body_orig), entry["_id"])
if not body:
print("[extract] empty MDX body")
return body, media
url = "https://vimeo.com/" + m["vimeoId"]
# body
body = m.get("body") or m.get("literatureBody") or ""
media.append({
"url": url,
"pic": pic,
"title": title,
"body": body
})
return media
def prepare_html_body(entry):
@@ -308,7 +306,7 @@ def prepare_html_body(entry):
addon = ""
if kind == "Video":
addon = ""
for m in entry.get("media", []):
for m in entry.get("media") or []:
if "youtubeId" in m:
addon += '<iframe width="420" height="345" src="http://www.youtube.com/embed/'
addon += m["youtubeId"]
@@ -325,7 +323,7 @@ def prepare_html_body(entry):
elif kind == "Music":
addon = ""
for m in entry.get("media", []):
for m in entry.get("media") or []:
artist = m.get("performer")
trackname = ""
if artist:
@@ -339,68 +337,12 @@ def prepare_html_body(entry):
addon += '"></audio></figure>'
body += addon
body, media = extract_html(entry)
body = extract_html(entry)
# if body_orig: body += extract_md(html2text(body_orig), entry['_id'])
if not body:
print("[extract] empty HTML body")
return body, media
return body
def extract_html(entry):
body_orig = (entry.get("body") or "").replace('\(', '(').replace('\)', ')')
media = entry.get("media", [])
kind = entry.get("type") or ""
print("[extract] kind: " + kind)
mbodies = set([])
if media:
# print('[extract] media is found')
for m in media:
mbody = m.get("body", "")
addon = ""
if kind == "Literature":
mbody = m.get("literatureBody") or m.get("body", "")
elif kind == "Image":
cover = ""
if "thumborId" in entry:
cover = cdn + "/unsafe/1600x/" + entry["thumborId"]
if not cover:
if "image" in entry:
cover = entry["image"].get("url", "")
if "cloudinary" in cover:
cover = ""
# else: print('[extract] cover: ' + cover)
title = m.get("title", "").replace("\n", " ").replace("&nbsp;", " ")
u = m.get("thumborId") or cover or ""
if title:
addon += "<h4>" + title + "</h4>\n"
if not u.startswith("http"):
u = s3 + u
if not u:
print("[extract] no image url for " + str(m))
if "cloudinary" in u:
u = "img/lost.svg"
if u != cover or (u == cover and media.index(m) == 0):
addon += '<img src="' + u + '" alt="' + title + '" />\n'
if addon:
body_orig += addon
# print('[extract] item addon: ' + addon)
# if addon: print('[extract] addon: %s' % addon)
if mbody and mbody not in mbodies:
mbodies.add(mbody)
body_orig += mbody
if len(list(mbodies)) != len(media):
print(
"[extract] %d/%d media item bodies appended"
% (len(list(mbodies)), len(media))
)
# print('[extract] media items body: \n' + body_orig)
if not body_orig:
for up in entry.get("bodyHistory", []) or []:
body_orig = up.get("text", "") or ""
if body_orig:
print("[extract] got html body from history")
break
if not body_orig:
print("[extract] empty HTML body")
# body_html = str(BeautifulSoup(body_orig, features="html.parser"))
return body_orig, media
body_html = str(BeautifulSoup(body_orig, features="html.parser"))
return body_html

View File

@@ -4,7 +4,7 @@ from dateutil.parser import parse as date_parse
from sqlalchemy.exc import IntegrityError
from transliterate import translit
from base.orm import local_session
from migration.extract import prepare_html_body
from migration.extract import extract_html, extract_media
from orm.reaction import Reaction, ReactionKind
from orm.shout import Shout, ShoutTopic, ShoutReactionsFollower
from orm.user import User
@@ -103,11 +103,11 @@ async def migrate(entry, storage):
"authors": [],
"topics": set([])
}
topics_by_oid = storage["topics"]["by_oid"]
users_by_oid = storage["users"]["by_oid"]
# author
oid = entry.get("createdBy", entry.get("_id", entry.get("oid")))
userdata = users_by_oid.get(oid)
users_by_oid = storage["users"]["by_oid"]
user_oid = entry.get("createdBy", "")
userdata = users_by_oid.get(user_oid)
user = None
if not userdata:
app = entry.get("application")
@@ -139,6 +139,8 @@ async def migrate(entry, storage):
# timestamps
r["createdAt"] = date_parse(entry.get("createdAt", OLD_DATE))
r["updatedAt"] = date_parse(entry["updatedAt"]) if "updatedAt" in entry else ts
# visibility
if entry.get("published"):
r["publishedAt"] = date_parse(entry.get("publishedAt", OLD_DATE))
r["visibility"] = "public"
@@ -150,25 +152,67 @@ async def migrate(entry, storage):
session.commit()
else:
r["visibility"] = "authors"
if "deletedAt" in entry:
r["deletedAt"] = date_parse(entry["deletedAt"])
# topics
category = entry.get("category")
for oid in [category, ] + entry.get("tags", []):
t = storage["topics"]["by_oid"].get(oid)
if t:
tslug = storage["topics"]["by_oid"][oid]["slug"]
r["topics"].add(tslug)
r["topics"] = list(r["topics"])
# main topic
mt = topics_by_oid.get(category)
if mt and mt.get("slug"):
r["mainTopic"] = storage["replacements"].get(mt["slug"]) or r["topics"][0]
r['topics'] = await add_topics_follower(entry, storage, userslug)
r['mainTopic'] = r['topics'][0]
entry["topics"] = r["topics"]
entry["cover"] = r["cover"]
# body
r["body"] = extract_html(entry)
media = extract_media(entry)
if media:
r["media"] = json.dumps(media, ensure_ascii=True)
shout_dict = r.copy()
# user
user = await get_user(userslug, userdata, storage, user_oid)
shout_dict["authors"] = [user, ]
del shout_dict["topics"]
try:
# save shout to db
await create_shout(shout_dict, userslug)
except IntegrityError as e:
print(e)
await resolve_create_shout(shout_dict, userslug)
except Exception as e:
raise Exception(e)
# shout topics aftermath
shout_dict["topics"] = await topics_aftermath(r, storage)
# content_item ratings to reactions
await content_ratings_to_reactions(entry, shout_dict["slug"])
# shout views
await ViewedStorage.increment(shout_dict["slug"], amount=entry.get("views", 1))
# del shout_dict['ratings']
shout_dict["oid"] = entry.get("_id", "")
storage["shouts"]["by_oid"][entry["_id"]] = shout_dict
storage["shouts"]["by_slug"][slug] = shout_dict
return shout_dict
async def add_topics_follower(entry, storage, userslug):
topics = set([])
category = entry.get("category")
topics_by_oid = storage["topics"]["by_oid"]
oids = [category, ] + entry.get("tags", [])
for toid in oids:
tslug = topics_by_oid.get(toid, {}).get("slug")
if tslug:
topics.add(tslug)
ttt = list(topics)
# add author as TopicFollower
with local_session() as session:
for tpc in r['topics']:
for tpc in topics:
try:
tf = session.query(
TopicFollower
@@ -184,24 +228,19 @@ async def migrate(entry, storage):
auto=True
)
session.add(tf)
session.commit()
except IntegrityError:
print('[migration.shout] hidden by topic ' + tpc)
r["visibility"] = "authors"
r["publishedAt"] = None
r["topics"].remove(tpc)
# main topic
maintopic = storage["replacements"].get(topics_by_oid.get(category, {}).get("slug"))
if maintopic in ttt:
ttt.remove(maintopic)
ttt.insert(0, maintopic)
return ttt
entry["topics"] = r["topics"]
entry["cover"] = r["cover"]
# body
r["body"], media = prepare_html_body(entry)
if media:
r["media"] = json.dumps(media, ensure_ascii=True)
# save shout to db
s = object()
shout_dict = r.copy()
async def get_user(userslug, userdata, storage, oid):
user = None
del shout_dict["topics"]
with local_session() as session:
if not user and userslug:
user = session.query(User).filter(User.slug == userslug).first()
@@ -216,60 +255,56 @@ async def migrate(entry, storage):
userdata["id"] = user.id
userdata["createdAt"] = user.createdAt
storage["users"]["by_slug"][userdata["slug"]] = userdata
storage["users"]["by_oid"][entry["_id"]] = userdata
storage["users"]["by_oid"][oid] = userdata
if not user:
raise Exception("could not get a user")
shout_dict["authors"] = [user, ]
try:
await create_shout(shout_dict, userslug)
except IntegrityError as e:
with local_session() as session:
s = session.query(Shout).filter(Shout.slug == shout_dict["slug"]).first()
bump = False
if s:
if s.authors[0] != userslug:
# create new with different slug
shout_dict["slug"] += '-' + shout_dict["layout"]
try:
await create_shout(shout_dict, userslug)
except IntegrityError as e:
print(e)
bump = True
else:
# update old
for key in shout_dict:
if key in s.__dict__:
if s.__dict__[key] != shout_dict[key]:
print(
"[migration] shout already exists, but differs in %s"
% key
)
bump = True
else:
print("[migration] shout already exists, but lacks %s" % key)
bump = True
if bump:
s.update(shout_dict)
else:
print("[migration] something went wrong with shout: \n%r" % shout_dict)
raise e
session.commit()
except Exception as e:
print(e)
print(s)
raise Exception
return user
# shout topics aftermath
shout_dict["topics"] = []
for tpc in r["topics"]:
async def resolve_create_shout(shout_dict, userslug):
with local_session() as session:
s = session.query(Shout).filter(Shout.slug == shout_dict["slug"]).first()
bump = False
if s:
if s.authors[0] != userslug:
# create new with different slug
shout_dict["slug"] += '-' + shout_dict["layout"]
try:
await create_shout(shout_dict, userslug)
except IntegrityError as e:
print(e)
bump = True
else:
# update old
for key in shout_dict:
if key in s.__dict__:
if s.__dict__[key] != shout_dict[key]:
print(
"[migration] shout already exists, but differs in %s"
% key
)
bump = True
else:
print("[migration] shout already exists, but lacks %s" % key)
bump = True
if bump:
s.update(shout_dict)
else:
print("[migration] something went wrong with shout: \n%r" % shout_dict)
raise Exception("")
session.commit()
async def topics_aftermath(entry, storage):
r = []
for tpc in filter(lambda x: bool(x), entry["topics"]):
oldslug = tpc
newslug = storage["replacements"].get(oldslug, oldslug)
if newslug:
with local_session() as session:
shout_topic_old = (
session.query(ShoutTopic)
.filter(ShoutTopic.shout == shout_dict["slug"])
.filter(ShoutTopic.shout == entry["slug"])
.filter(ShoutTopic.topic == oldslug)
.first()
)
@@ -278,25 +313,27 @@ async def migrate(entry, storage):
else:
shout_topic_new = (
session.query(ShoutTopic)
.filter(ShoutTopic.shout == shout_dict["slug"])
.filter(ShoutTopic.shout == entry["slug"])
.filter(ShoutTopic.topic == newslug)
.first()
)
if not shout_topic_new:
try:
ShoutTopic.create(
**{"shout": shout_dict["slug"], "topic": newslug}
**{"shout": entry["slug"], "topic": newslug}
)
except Exception:
print("[migration] shout topic error: " + newslug)
session.commit()
if newslug not in shout_dict["topics"]:
shout_dict["topics"].append(newslug)
if newslug not in r:
r.append(newslug)
else:
print("[migration] ignored topic slug: \n%r" % tpc["slug"])
# raise Exception
return r
# content_item ratings to reactions
async def content_ratings_to_reactions(entry, slug):
try:
with local_session() as session:
for content_rating in entry.get("ratings", []):
@@ -316,7 +353,7 @@ async def migrate(entry, storage):
if content_rating["value"] > 0
else ReactionKind.DISLIKE,
"createdBy": reactedBy.slug,
"shout": shout_dict["slug"],
"shout": slug,
}
cts = content_rating.get("createdAt")
if cts:
@@ -340,11 +377,3 @@ async def migrate(entry, storage):
session.commit()
except Exception:
raise Exception("[migration] content_item.ratings error: \n%r" % content_rating)
# shout views
await ViewedStorage.increment(shout_dict["slug"], amount=entry.get("views", 1))
# del shout_dict['ratings']
shout_dict["oid"] = entry.get("_id")
storage["shouts"]["by_oid"][entry["_id"]] = shout_dict
storage["shouts"]["by_slug"][slug] = shout_dict
return shout_dict

View File

@@ -547,6 +547,7 @@
"poetry-slam": "poetry-slam",
"pokoy": "peace",
"police": "police",
"politicheskoe-fentezi": "political-fantasy",
"politics": "politics",
"politzaklyuchennye": "political-prisoners",
"polsha": "poland",

View File

@@ -1,5 +1,6 @@
from base.orm import local_session
from migration.extract import extract_md, html2text
from migration.extract import extract_md
from migration.html2text import html2text
from orm import Topic

View File

@@ -17,7 +17,7 @@ def migrate(entry):
"username": email,
"email": email,
"createdAt": parse(entry["createdAt"]),
"emailConfirmed": bool(entry["emails"][0]["verified"]),
"emailConfirmed": ("@discours.io" in email) or bool(entry["emails"][0]["verified"]),
"muted": False, # amnesty
"bio": entry["profile"].get("bio", ""),
"notifications": [],