normalize media fixed
This commit is contained in:
parent
9ca9859563
commit
9a4cd6ba06
|
@ -314,9 +314,6 @@ async def handle_auto():
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
if len(sys.argv) > 1:
|
if len(sys.argv) > 1:
|
||||||
cmd = sys.argv[1]
|
|
||||||
if type(cmd) == str:
|
|
||||||
print("[migration] command: " + cmd)
|
|
||||||
init_tables()
|
init_tables()
|
||||||
await handle_auto()
|
await handle_auto()
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -4,7 +4,7 @@ from datetime import datetime, timezone
|
||||||
|
|
||||||
import frontmatter
|
import frontmatter
|
||||||
|
|
||||||
from .extract import extract_html, prepare_html_body, extract_media
|
from .extract import extract_html, extract_media
|
||||||
from .utils import DateTimeEncoder
|
from .utils import DateTimeEncoder
|
||||||
|
|
||||||
OLD_DATE = "2016-03-05 22:22:00.350000"
|
OLD_DATE = "2016-03-05 22:22:00.350000"
|
||||||
|
@ -50,11 +50,12 @@ def export_mdx(r):
|
||||||
def export_body(shout, storage):
|
def export_body(shout, storage):
|
||||||
entry = storage["content_items"]["by_oid"][shout["oid"]]
|
entry = storage["content_items"]["by_oid"][shout["oid"]]
|
||||||
if entry:
|
if entry:
|
||||||
shout["body"] = prepare_html_body(entry) # prepare_md_body(entry)
|
body = extract_html(entry)
|
||||||
shout["media"] = extract_media(entry)
|
media = extract_media(entry)
|
||||||
|
shout["body"] = body # prepare_html_body(entry) # prepare_md_body(entry)
|
||||||
|
shout["media"] = media
|
||||||
export_mdx(shout)
|
export_mdx(shout)
|
||||||
print("[export] html for %s" % shout["slug"])
|
print("[export] html for %s" % shout["slug"])
|
||||||
body = extract_html(entry)
|
|
||||||
open(contentDir + shout["slug"] + ".html", "w").write(body)
|
open(contentDir + shout["slug"] + ".html", "w").write(body)
|
||||||
else:
|
else:
|
||||||
raise Exception("no content_items entry found")
|
raise Exception("no content_items entry found")
|
||||||
|
|
|
@ -3,6 +3,9 @@ import os
|
||||||
import re
|
import re
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
TOOLTIP_REGEX = r"(\/\/\/(.+)\/\/\/)"
|
TOOLTIP_REGEX = r"(\/\/\/(.+)\/\/\/)"
|
||||||
contentDir = os.path.join(
|
contentDir = os.path.join(
|
||||||
os.path.dirname(os.path.realpath(__file__)), "..", "..", "discoursio-web", "content"
|
os.path.dirname(os.path.realpath(__file__)), "..", "..", "discoursio-web", "content"
|
||||||
|
@ -343,59 +346,7 @@ def prepare_html_body(entry):
|
||||||
|
|
||||||
def extract_html(entry):
|
def extract_html(entry):
|
||||||
body_orig = (entry.get("body") or "").replace('\(', '(').replace('\)', ')')
|
body_orig = (entry.get("body") or "").replace('\(', '(').replace('\)', ')')
|
||||||
media = entry.get("media", [])
|
|
||||||
kind = entry.get("type") or ""
|
|
||||||
print("[extract] kind: " + kind)
|
|
||||||
mbodies = set([])
|
|
||||||
if media:
|
|
||||||
# print('[extract] media is found')
|
|
||||||
for m in media:
|
|
||||||
mbody = m.get("body", "")
|
|
||||||
addon = ""
|
|
||||||
if kind == "Literature":
|
|
||||||
mbody = m.get("literatureBody") or m.get("body", "")
|
|
||||||
elif kind == "Image":
|
|
||||||
cover = ""
|
|
||||||
if "thumborId" in entry:
|
|
||||||
cover = cdn + "/unsafe/1600x/" + entry["thumborId"]
|
|
||||||
if not cover:
|
|
||||||
if "image" in entry:
|
|
||||||
cover = entry["image"].get("url", "")
|
|
||||||
if "cloudinary" in cover:
|
|
||||||
cover = ""
|
|
||||||
# else: print('[extract] cover: ' + cover)
|
|
||||||
title = m.get("title", "").replace("\n", " ").replace(" ", " ")
|
|
||||||
u = m.get("thumborId") or cover or ""
|
|
||||||
if title:
|
|
||||||
addon += "<h4>" + title + "</h4>\n"
|
|
||||||
if not u.startswith("http"):
|
|
||||||
u = s3 + u
|
|
||||||
if not u:
|
|
||||||
print("[extract] no image url for " + str(m))
|
|
||||||
if "cloudinary" in u:
|
|
||||||
u = "img/lost.svg"
|
|
||||||
if u != cover or (u == cover and media.index(m) == 0):
|
|
||||||
addon += '<img src="' + u + '" alt="' + title + '" />\n'
|
|
||||||
if addon:
|
|
||||||
body_orig += addon
|
|
||||||
# print('[extract] item addon: ' + addon)
|
|
||||||
# if addon: print('[extract] addon: %s' % addon)
|
|
||||||
if mbody and mbody not in mbodies:
|
|
||||||
mbodies.add(mbody)
|
|
||||||
body_orig += mbody
|
|
||||||
if len(list(mbodies)) != len(media):
|
|
||||||
print(
|
|
||||||
"[extract] %d/%d media item bodies appended"
|
|
||||||
% (len(list(mbodies)), len(media))
|
|
||||||
)
|
|
||||||
# print('[extract] media items body: \n' + body_orig)
|
|
||||||
if not body_orig:
|
|
||||||
for up in entry.get("bodyHistory", []) or []:
|
|
||||||
body_orig = up.get("text", "") or ""
|
|
||||||
if body_orig:
|
|
||||||
print("[extract] got html body from history")
|
|
||||||
break
|
|
||||||
if not body_orig:
|
if not body_orig:
|
||||||
print("[extract] empty HTML body")
|
print("[extract] empty HTML body")
|
||||||
# body_html = str(BeautifulSoup(body_orig, features="html.parser"))
|
body_html = str(BeautifulSoup(body_orig, features="html.parser"))
|
||||||
return body_orig
|
return body_html
|
||||||
|
|
|
@ -4,7 +4,7 @@ from dateutil.parser import parse as date_parse
|
||||||
from sqlalchemy.exc import IntegrityError
|
from sqlalchemy.exc import IntegrityError
|
||||||
from transliterate import translit
|
from transliterate import translit
|
||||||
from base.orm import local_session
|
from base.orm import local_session
|
||||||
from migration.extract import prepare_html_body
|
from migration.extract import extract_html, extract_media
|
||||||
from orm.reaction import Reaction, ReactionKind
|
from orm.reaction import Reaction, ReactionKind
|
||||||
from orm.shout import Shout, ShoutTopic, ShoutReactionsFollower
|
from orm.shout import Shout, ShoutTopic, ShoutReactionsFollower
|
||||||
from orm.user import User
|
from orm.user import User
|
||||||
|
@ -195,7 +195,8 @@ async def migrate(entry, storage):
|
||||||
entry["cover"] = r["cover"]
|
entry["cover"] = r["cover"]
|
||||||
|
|
||||||
# body
|
# body
|
||||||
r["body"], media = prepare_html_body(entry)
|
r["body"] = extract_html(entry)
|
||||||
|
media = extract_media(entry)
|
||||||
if media:
|
if media:
|
||||||
r["media"] = json.dumps(media, ensure_ascii=True)
|
r["media"] = json.dumps(media, ensure_ascii=True)
|
||||||
# save shout to db
|
# save shout to db
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
from base.orm import local_session
|
from base.orm import local_session
|
||||||
from migration.extract import extract_md, html2text
|
from migration.extract import extract_md
|
||||||
|
from migration.html2text import html2text
|
||||||
from orm import Topic
|
from orm import Topic
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user