This commit is contained in:
Tony Rewin 2023-10-05 23:18:06 +03:00
parent 3d659caa6e
commit 5fedd007c7
23 changed files with 8 additions and 4488 deletions

View File

@ -1,292 +0,0 @@
""" cmd managed migration """
import asyncio
import gc
import json
import sys
from datetime import datetime, timezone
import bs4
from migration.export import export_mdx
from migration.tables.comments import migrate as migrateComment
from migration.tables.comments import migrate_2stage as migrateComment_2stage
from migration.tables.content_items import get_shout_slug
from migration.tables.content_items import migrate as migrateShout
from migration.tables.remarks import migrate as migrateRemark
from migration.tables.topics import migrate as migrateTopic
from migration.tables.users import migrate as migrateUser, post_migrate as users_post_migrate
from migration.tables.users import migrate_2stage as migrateUser_2stage
from orm import init_tables
from orm.reaction import Reaction
# UTC date stamp (YYYYMMDD) identifying this migration run
TODAY = datetime.strftime(datetime.now(tz=timezone.utc), "%Y%m%d")
# Fallback timestamp for legacy entries with no reliable date
OLD_DATE = "2016-03-05 22:22:00.350000"
async def users_handle(storage):
    """Migrate users first: stage 1 creates records, stage 2 links them."""
    migrated = 0
    oid_to_slug = {}
    users_data = storage["users"]["data"]
    print("[migration] migrating %d users" % len(users_data))
    for raw_user in users_data:
        user = migrateUser(raw_user)
        # keep the full record addressable by the old mongo id
        storage["users"]["by_oid"][raw_user["_id"]] = user
        # strip credentials / private fields before exposing the public record
        for private_field in ("password", "emailConfirmed", "username", "email"):
            del user[private_field]
        storage["users"]["by_slug"][user["slug"]] = user
        oid_to_slug[user["oid"]] = user["slug"]
        migrated += 1
    # second pass resolves cross-user references via the oid -> slug map
    stage2_count = 0
    for raw_user in users_data:
        stage2_count += migrateUser_2stage(raw_user, oid_to_slug)
    users_post_migrate()
async def topics_handle(storage):
    """Migrate topics built from legacy categories and tags."""
    migrated_count = 0
    all_topics = storage["topics"]["tags"] + storage["topics"]["cats"]
    replacements = storage["replacements"]
    for raw_topic in all_topics:
        slug = raw_topic["slug"]
        if slug not in replacements:
            # topics without a replacement mapping are skipped entirely
            print("[migration] topic " + raw_topic["slug"] + " ignored")
            continue
        raw_topic["slug"] = replacements[slug]
        topic = migrateTopic(raw_topic)
        storage["topics"]["by_oid"][raw_topic["_id"]] = topic
        storage["topics"]["by_slug"][raw_topic["slug"]] = topic
        migrated_count += 1
    # collapse renamed slugs onto their replacement topics
    for oldslug, newslug in replacements.items():
        if oldslug != newslug and oldslug in storage["topics"]["by_slug"]:
            oid = storage["topics"]["by_slug"].pop(oldslug)["_id"]
            storage["topics"]["by_oid"][oid] = storage["topics"]["by_slug"][newslug]
    print("[migration] " + str(migrated_count) + " topics migrated")
    print(
        "[migration] "
        + str(len(storage["topics"]["by_oid"].values()))
        + " topics by oid"
    )
    print(
        "[migration] "
        + str(len(storage["topics"]["by_slug"].values()))
        + " topics by slug"
    )
async def shouts_handle(storage, args):
    """Migrate content items ("shouts") one by one.

    storage: in-memory migration storage produced by data_load().
    args: CLI argv; may contain "-" plus a single slug to migrate only
    that shout, and "mdx" to export published shouts as MDX files.
    """
    import re  # local import: only used for dataset text normalization below

    counter = 0
    discours_author = 0
    anonymous_author = 0
    pub_counter = 0
    ignored = 0
    topics_dataset_bodies = []
    topics_dataset_tlist = []
    for entry in storage["shouts"]["data"]:
        gc.collect()
        slug = get_shout_slug(entry)
        # single slug mode: "-" flag means only migrate the named slug
        if "-" in args and slug not in args:
            continue
        shout_dict = await migrateShout(entry, storage)
        if not shout_dict:
            ignored += 1
            continue
        storage["shouts"]["by_oid"][entry["_id"]] = shout_dict
        storage["shouts"]["by_slug"][shout_dict["slug"]] = shout_dict
        if not shout_dict["topics"]:
            print("[migration] no topics!")
        author = shout_dict["authors"][0]
        if author["slug"] == "discours":
            discours_author += 1
        if author["slug"] == "anonymous":
            anonymous_author += 1
        if entry.get("published"):
            if "mdx" in args:
                export_mdx(shout_dict)
            pub_counter += 1
        counter += 1
        # FIX: previously printed (counter + 1) right after incrementing,
        # over-reporting progress by one
        print('[migration] shouts_handle %d: %s @%s' % (
            counter, shout_dict["slug"], author["slug"]
        ))
        # build an in-memory topics/body dataset (the np.savetxt dump
        # below is still commented out)
        b = bs4.BeautifulSoup(shout_dict["body"], "html.parser")
        # FIX: str.replace() was called with a regex pattern, which is a
        # no-op; use re.sub to actually strip non-letter characters
        texts = [re.sub(r"[^а-яА-Яa-zA-Z]", "", shout_dict["title"].lower())]
        texts = texts + b.find_all(text=True)
        topics_dataset_bodies.append(" ".join([x.strip().lower() for x in texts]))
        topics_dataset_tlist.append(shout_dict["topics"])
    # np.savetxt('topics_dataset.csv', (topics_dataset_bodies, topics_dataset_tlist), delimiter=',
    # ', fmt='%s')
    print("[migration] " + str(counter) + " content items were migrated")
    print("[migration] " + str(pub_counter) + " have been published")
    print("[migration] " + str(discours_author) + " authored by @discours")
    print("[migration] " + str(anonymous_author) + " authored by @anonymous")
async def remarks_handle(storage):
    """Migrate remarks from storage["remarks"]["data"]."""
    # FIX: header previously said "comments" (copy-paste from comments_handle)
    print("[migration] remarks")
    migrated = 0
    for entry_remark in storage["remarks"]["data"]:
        await migrateRemark(entry_remark, storage)
        migrated += 1
    print("[migration] " + str(migrated) + " remarks migrated")
async def comments_handle(storage):
    """Migrate comments into reactions (stage 1 create, stage 2 re-link)."""
    print("[migration] comments")
    id_map = {}
    ignored_counter = 0
    # shout-slug -> list of comments whose shout was not migrated
    missed_shouts = {}
    for oldcomment in storage["reactions"]["data"]:
        if not oldcomment.get("deleted"):
            reaction = await migrateComment(oldcomment, storage)
            # migrateComment returns the missing shout slug (str) when the
            # commented shout could not be found
            if isinstance(reaction, str):
                # FIX: collect every missed comment per shout instead of
                # overwriting, so the dropped count below is correct
                missed_shouts.setdefault(reaction, []).append(oldcomment)
            elif isinstance(reaction, Reaction):
                reaction = reaction.dict()
                id_map[reaction["oid"]] = reaction["id"]
            else:
                ignored_counter += 1
    for reaction in storage["reactions"]["data"]:
        migrateComment_2stage(reaction, id_map)
    print("[migration] " + str(len(id_map)) + " comments migrated")
    print("[migration] " + str(ignored_counter) + " comments ignored")
    print("[migration] " + str(len(missed_shouts.keys())) + " commented shouts missed")
    # FIX: previously summed the number of dict keys of a single comment
    # record; now counts the actual comments that were dropped
    missed_counter = sum(len(comments) for comments in missed_shouts.values())
    print("[migration] " + str(missed_counter) + " comments dropped")
async def all_handle(storage, args):
    """Run the full migration pipeline in dependency order."""
    print("[migration] handle everything")
    # users and topics must exist before shouts can reference them
    await users_handle(storage)
    await topics_handle(storage)
    print("[migration] users and topics are migrated")
    await shouts_handle(storage, args)
    # remarks migration is currently disabled:
    # print("[migration] remarks...")
    # await remarks_handle(storage)
    print("[migration] migrating comments")
    await comments_handle(storage)
    # export_email_subscriptions()
    print("[migration] done!")
def data_load():
    """Load every migration source JSON file into one storage dict.

    Returns the storage dict with raw data lists plus by_oid/by_slug
    lookup indexes filled in.
    """

    def _load_json(path):
        # context manager guarantees the handle is closed (the previous
        # code leaked open file objects); migration dumps are utf-8
        with open(path, encoding="utf-8") as f:
            return json.load(f)

    storage = {
        "content_items": {"by_oid": {}, "by_slug": {}},
        "shouts": {"by_oid": {}, "by_slug": {}, "data": []},
        "reactions": {"by_oid": {}, "by_slug": {}, "by_content": {}, "data": []},
        "topics": {"by_oid": {}, "by_slug": {}, "cats": [], "tags": []},
        "remarks": {"data": []},
        "users": {"by_oid": {}, "by_slug": {}, "data": []},
        "replacements": _load_json("migration/tables/replacements.json"),
    }
    users_data = _load_json("migration/data/users.json")
    storage["users"]["data"] = users_data
    print("[migration.load] " + str(len(users_data)) + " users ")
    tags_data = _load_json("migration/data/tags.json")
    storage["topics"]["tags"] = tags_data
    print("[migration.load] " + str(len(tags_data)) + " tags ")
    cats_data = _load_json("migration/data/content_item_categories.json")
    storage["topics"]["cats"] = cats_data
    print("[migration.load] " + str(len(cats_data)) + " cats ")
    comments_data = _load_json("migration/data/comments.json")
    storage["reactions"]["data"] = comments_data
    print("[migration.load] " + str(len(comments_data)) + " comments ")
    content_data = _load_json("migration/data/content_items.json")
    storage["shouts"]["data"] = content_data
    print("[migration.load] " + str(len(content_data)) + " content items ")
    remarks_data = _load_json("migration/data/remarks.json")
    storage["remarks"]["data"] = remarks_data
    print("[migration.load] " + str(len(remarks_data)) + " remarks data ")
    # fill out lookup indexes
    for x in users_data:
        storage["users"]["by_oid"][x["_id"]] = x
        # no user.slug yet, so no by_slug index at this point
    print(
        "[migration.load] "
        + str(len(storage["users"]["by_oid"].keys()))
        + " users by oid"
    )
    for x in tags_data + cats_data:
        storage["topics"]["by_oid"][x["_id"]] = x
        storage["topics"]["by_slug"][x["slug"]] = x
    print(
        "[migration.load] "
        + str(len(storage["topics"]["by_slug"].keys()))
        + " topics by slug"
    )
    for item in content_data:
        slug = get_shout_slug(item)
        storage["content_items"]["by_slug"][slug] = item
        storage["content_items"]["by_oid"][item["_id"]] = item
    print("[migration.load] " + str(len(content_data)) + " content items")
    for x in comments_data:
        storage["reactions"]["by_oid"][x["_id"]] = x
        cid = x["contentItem"]
        storage["reactions"]["by_content"][cid] = x
        ci = storage["content_items"]["by_oid"].get(cid, {})
        if "slug" in ci:
            storage["reactions"]["by_slug"][ci["slug"]] = x
    print(
        "[migration.load] "
        + str(len(storage["reactions"]["by_content"].keys()))
        + " with comments"
    )
    # NOTE: the pointless `try: ... except Exception as e: raise e`
    # wrapper was removed — it re-raised unchanged and closed nothing
    return storage
async def handling_migration():
    """Initialize DB tables, load source data and run the whole pipeline."""
    init_tables()
    storage = data_load()
    await all_handle(storage, sys.argv)
def process():
    """Entry point: run the async migration to completion."""
    # FIX: asyncio.run() creates and closes a fresh event loop;
    # get_event_loop()/run_until_complete is deprecated for this use
    asyncio.run(handling_migration())
# Script entry point: run the migration when executed directly.
if __name__ == "__main__":
    process()

View File

@ -1,32 +0,0 @@
import json
import os
import bson
import gc
from .utils import DateTimeEncoder
def json_tables():
    """Unpack dump/discours/*.bson dumps into migration/data/*.json files."""
    print("[migration] unpack dump/discours/*.bson to migration/data/*.json")
    # table name -> decoded documents; the key list doubles as the set of
    # dump files to process
    data = {
        "content_items": [],
        "content_item_categories": [],
        "tags": [],
        "email_subscriptions": [],
        "users": [],
        "comments": [],
        "remarks": []
    }
    for table in data.keys():
        print('[migration] bson2json for ' + table)
        gc.collect()
        lc = []
        # FIX: use context managers — the previous bare open().read()/write()
        # calls leaked file handles
        with open("dump/discours/" + table + ".bson", "rb") as dump_file:
            bs = dump_file.read()
        # decode consecutive BSON documents until the buffer is exhausted
        base = 0
        while base < len(bs):
            base, d = bson.decode_document(bs, base)
            lc.append(d)
        data[table] = lc
        with open(os.getcwd() + "/migration/data/" + table + ".json", "w") as out:
            out.write(json.dumps(lc, cls=DateTimeEncoder))

View File

@ -1,159 +0,0 @@
import json
import os
from datetime import datetime, timezone
import frontmatter
from .extract import extract_html, extract_media
from .utils import DateTimeEncoder
# Fallback timestamp for legacy entries with no reliable date
OLD_DATE = "2016-03-05 22:22:00.350000"
# Destination of JSON exports consumed by the web frontend
EXPORT_DEST = "../discoursio-web/data/"
# MDX/HTML content goes to the sibling discoursio-web checkout
parentDir = "/".join(os.getcwd().split("/")[:-1])
contentDir = parentDir + "/discoursio-web/content/"
# Default createdAt for exported metadata (UTC, captured at import time)
ts = datetime.now(tz=timezone.utc)
def get_metadata(r):
authors = []
for a in r["authors"]:
authors.append(
{ # a short version for public listings
"slug": a.slug or "discours",
"name": a.name or "Дискурс",
"userpic": a.userpic or "https://discours.io/static/img/discours.png",
}
)
metadata = {}
metadata["title"] = r.get("title", "").replace("{", "(").replace("}", ")")
metadata["authors"] = authors
metadata["createdAt"] = r.get("createdAt", ts)
metadata["layout"] = r["layout"]
metadata["topics"] = [topic for topic in r["topics"]]
metadata["topics"].sort()
if r.get("cover", False):
metadata["cover"] = r.get("cover")
return metadata
def export_mdx(r):
    """Write shout *r* as an .mdx file with frontmatter into contentDir."""
    # print('[export] mdx %s' % r['slug'])
    metadata = get_metadata(r)
    content = frontmatter.dumps(frontmatter.Post(r["body"], **metadata))
    # drop characters that cannot round-trip through utf-8
    bc = bytes(content, "utf-8").decode("utf-8", "ignore")
    filepath = contentDir + r["slug"] + ".mdx"
    # FIX: use a context manager — the bare open().write() leaked the handle
    # (the dead `content = ""` pre-assignment was removed as well)
    with open(filepath, "w") as f:
        f.write(bc)
def export_body(shout, storage):
    """Extract html body/media for *shout* and export it as MDX + HTML.

    Raises Exception when the original content item is missing.
    """
    entry = storage["content_items"]["by_oid"][shout["oid"]]
    if not entry:
        raise Exception("no content_items entry found")
    shout["body"] = extract_html(entry)  # prepare_html_body(entry) # prepare_md_body(entry)
    shout["media"] = extract_media(entry)
    export_mdx(shout)
    print("[export] html for %s" % shout["slug"])
    # FIX: context manager instead of a leaked bare open().write()
    with open(contentDir + shout["slug"] + ".html", "w") as f:
        f.write(shout["body"])
def export_slug(slug, storage):
    """Export one shout by slug; asserts the shout and its author exist."""
    # FIX: previously did storage["shouts"]["by_slug"][slug] first, which
    # raised KeyError before the .get()/assert fallback could ever run
    shout = storage["shouts"]["by_slug"].get(slug)
    assert shout, "[export] no shout found by slug: %s " % slug
    author = shout["authors"][0]
    assert author, "[export] no author error"
    export_body(shout, storage)
def export_email_subscriptions():
    """Load email subscriptions and report the count (no real export yet)."""
    # FIX: context manager + json.load instead of a leaked open().read()
    with open("migration/data/email_subscriptions.json") as f:
        email_subscriptions_data = json.load(f)
    for data in email_subscriptions_data:
        # TODO: migrate to mailgun list manually
        # migrate_email_subscription(data)
        pass
    print(
        "[migration] "
        + str(len(email_subscriptions_data))
        + " email subscriptions exported"
    )
def export_shouts(storage):
    """Export all shouts, reloading authors/articles JSON when empty.

    NOTE(review): this function reads storage keys "by_slugs" (plural),
    while the rest of the migration code populates "by_slug" — confirm
    the caller actually provides these keys before relying on it.
    """
    # update what was just migrated or load json again
    if len(storage["users"]["by_slugs"].keys()) == 0:
        # FIX: context manager + json.load instead of leaked open().read()
        with open(EXPORT_DEST + "authors.json") as f:
            storage["users"]["by_slugs"] = json.load(f)
        print(
            "[migration] "
            + str(len(storage["users"]["by_slugs"].keys()))
            + " exported authors "
        )
    if len(storage["shouts"]["by_slugs"].keys()) == 0:
        with open(EXPORT_DEST + "articles.json") as f:
            storage["shouts"]["by_slugs"] = json.load(f)
        print(
            "[migration] "
            + str(len(storage["shouts"]["by_slugs"].keys()))
            + " exported articles "
        )
    for slug in storage["shouts"]["by_slugs"].keys():
        export_slug(slug, storage)
def export_json(
    export_articles=None, export_authors=None, export_topics=None, export_comments=None
):
    """Dump the four export dicts as pretty-printed JSON files.

    FIX: the defaults were shared mutable dicts ({}); use None sentinels
    and create fresh dicts per call (backward compatible for callers).
    """
    export_articles = export_articles if export_articles is not None else {}
    export_authors = export_authors if export_authors is not None else {}
    export_topics = export_topics if export_topics is not None else {}
    export_comments = export_comments if export_comments is not None else {}

    def _dump(filename, payload):
        # one consistent serialization for all export files; the context
        # manager closes the handle (previously leaked)
        with open(EXPORT_DEST + filename, "w") as f:
            f.write(
                json.dumps(
                    payload,
                    cls=DateTimeEncoder,
                    indent=4,
                    sort_keys=True,
                    ensure_ascii=False,
                )
            )

    _dump("authors.json", export_authors)
    print("[migration] " + str(len(export_authors.items())) + " authors exported")
    _dump("topics.json", export_topics)
    print("[migration] " + str(len(export_topics.keys())) + " topics exported")
    _dump("articles.json", export_articles)
    print("[migration] " + str(len(export_articles.items())) + " articles exported")
    _dump("comments.json", export_comments)
    print(
        "[migration] "
        + str(len(export_comments.items()))
        + " exported articles with comments"
    )

View File

@ -1,434 +0,0 @@
import base64
import os
import re
import uuid
from bs4 import BeautifulSoup
# Matches ///tooltip text/// markers in legacy bodies
TOOLTIP_REGEX = r"(\/\/\/(.+)\/\/\/)"
# MDX/HTML output directory in the sibling discoursio-web checkout
contentDir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "..", "..", "discoursio-web", "content"
)
# legacy S3 bucket and current CDN hosts for media links
s3 = "https://discours-io.s3.amazonaws.com/"
cdn = "https://assets.discours.io"
def replace_tooltips(body):
    """Replace ///text/// markers in *body* with <Tooltip /> components.

    NOTE(review): the first regex match is skipped ([1:]) in the original;
    kept as-is — confirm that is intended.
    """
    # change if you prefer regexp
    newbody = body
    matches = list(re.finditer(TOOLTIP_REGEX, body, re.IGNORECASE | re.MULTILINE))[1:]
    for match in matches:
        # FIX: replacements were applied to the original `body` on every
        # iteration, so only the last match survived; accumulate on newbody
        newbody = newbody.replace(
            match.group(1), '<Tooltip text="' + match.group(2) + '" />'
        )
    if len(matches) > 0:
        print("[extract] found %d tooltips" % len(matches))
    return newbody
def extract_footnotes(body, shout_dict):
    """Extract &&&-delimited footnotes from *body* into Reaction records.

    Returns (new_body, placed). NOTE(review): the footnote branch below is
    broken as written (see inline notes) and would raise NameError if
    reached; kept byte-for-byte pending a real fix.
    """
    parts = body.split("&&&")
    lll = len(parts)
    newparts = list(parts)
    placed = False
    # an odd part count means the &&& markers are properly paired
    if lll & 1:
        if lll > 1:
            i = 1
            print("[extract] found %d footnotes in body" % (lll - 1))
            for part in parts[1:]:
                if i & 1:
                    placed = True
                    if 'a class="footnote-url" href=' in part:
                        print("[extract] footnote: " + part)
                        fn = 'a class="footnote-url" href="'
                        # NOTE(review): typo — assigned to `exxtracted_link`
                        # but printed as `extracted_link` (NameError below)
                        exxtracted_link = part.split(fn, 1)[1].split('"', 1)[0]
                        extracted_body = part.split(fn, 1)[1].split('>', 1)[1].split('</a>', 1)[0]
                        print("[extract] footnote link: " + extracted_link)
                        # NOTE(review): local_session, Reaction, ReactionKind
                        # and `link` are not defined in this module — this
                        # branch cannot run as-is; confirm intended imports
                        with local_session() as session:
                            Reaction.create({
                                "shout": shout_dict['id'],
                                "kind": ReactionKind.FOOTNOTE,
                                "body": extracted_body,
                                "range": str(body.index(fn + link) - len('<')) + ':' + str(body.index(extracted_body) + len('</a>'))
                            })
                        newparts[i] = "<a href='#'></a>"
                    else:
                        newparts[i] = part
                i += 1
    return ("".join(newparts), placed)
def place_tooltips(body):
    """Wrap &&&-delimited segments of *body* in <Tooltip> components.

    Returns (new_body, placed) where placed tells whether any tooltip was
    produced. Processing only happens when the &&& markers come in pairs
    (an odd number of split parts).
    """
    segments = body.split("&&&")
    rewritten = list(segments)
    placed = False
    total = len(segments)
    if total & 1 and total > 1:
        print("[extract] found %d tooltips" % (total - 1))
        marker = 'a class="footnote-url" href="'
        for idx, segment in enumerate(segments[1:], start=1):
            if idx & 1 == 0:
                # even segments are plain text, kept unchanged
                rewritten[idx] = segment
                continue
            placed = True
            if 'a class="footnote-url" href=' in segment:
                # footnote-style tooltip: carry the link over
                print("[extract] footnote: " + segment)
                href = segment.split(marker, 1)[1].split('"', 1)[0]
                stripped = segment.split(marker, 1)[0] + " " + segment.split("/", 1)[-1]
                rewritten[idx] = (
                    "<Tooltip"
                    + (' link="' + href + '" ' if href else "")
                    + ">"
                    + stripped
                    + "</Tooltip>"
                )
            else:
                rewritten[idx] = "<Tooltip>%s</Tooltip>" % segment
    return ("".join(rewritten), placed)
# Markdown inline image whose target is a base64 data URI
IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,((?:[A-Za-z\d+\/]{4})*(?:[A-Za-z\d+\/]{3}="
IMG_REGEX += r"|[A-Za-z\d+\/]{2}==)))\)"
# sibling web checkout where decoded images are written
parentDir = "/".join(os.getcwd().split("/")[:-1])
public = parentDir + "/discoursio-web/public"
# base64 payload -> stored image name, shared across extraction passes
cache = {}
def reextract_images(body, oid):
    """Decode inline base64 markdown images to files and rewrite links.

    NOTE(review): the final body.replace(...) result is discarded (str is
    immutable), so *body* is returned unchanged — already flagged by the
    original WARNING comment. The [1:] slice also skips the first match;
    confirm both before reuse.
    """
    # change if you prefer regexp
    matches = list(re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE))[1:]
    i = 0
    for match in matches:
        print("[extract] image " + match.group(1))
        ext = match.group(3)
        name = oid + str(i)
        link = public + "/upload/image-" + name + "." + ext
        img = match.group(4)
        title = match.group(1)  # NOTE: this is not the title
        if img not in cache:
            # "==" padding makes b64decode tolerant of stripped padding
            content = base64.b64decode(img + "==")
            print(str(len(img)) + " image bytes been written")
            open("../" + link, "wb").write(content)
            cache[img] = name
            i += 1
        else:
            print("[extract] image cached " + cache[img])
        body.replace(
            str(match), "![" + title + "](" + cdn + link + ")"
        )  # WARNING: this does not work
    return body
# data-URI mime prefix -> file extension for inline images
IMAGES = {
    "data:image/png": "png",
    "data:image/jpg": "jpg",
    "data:image/jpeg": "jpg",
}
# separator between the mime prefix and the base64 payload
b64 = ";base64,"
def extract_imageparts(bodyparts, prefix):
    """Recursively scan body fragments for base64 images and write them out.

    NOTE(review): `i` escapes the for loop and is used in the tail
    expression below (NameError for empty input), and the recursive call
    passes a joined *string* where a list is iterated — verify this
    function is actually exercised before relying on it.
    """
    # recursive loop
    newparts = list(bodyparts)
    for current in bodyparts:
        i = bodyparts.index(current)
        for mime in IMAGES.keys():
            # a fragment ending in a known mime prefix is followed by its payload
            if mime == current[-len(mime):] and (i + 1 < len(bodyparts)):
                print("[extract] " + mime)
                next = bodyparts[i + 1]
                ext = IMAGES[mime]
                b64end = next.index(")")
                b64encoded = next[:b64end]
                name = prefix + "-" + str(len(cache))
                link = "/upload/image-" + name + "." + ext
                print("[extract] name: " + name)
                print("[extract] link: " + link)
                print("[extract] %d bytes" % len(b64encoded))
                if b64encoded not in cache:
                    try:
                        content = base64.b64decode(b64encoded + "==")
                        open(public + link, "wb").write(content)
                        print(
                            "[extract] "
                            + str(len(content))
                            + " image bytes been written"
                        )
                        cache[b64encoded] = name
                    except Exception:
                        raise Exception
                        # raise Exception('[extract] error decoding image %r' %b64encoded)
                else:
                    # reuse the already-written file for duplicate payloads
                    print("[extract] cached link " + cache[b64encoded])
                    name = cache[b64encoded]
                    link = cdn + "/upload/image-" + name + "." + ext
                newparts[i] = (
                    current[:-len(mime)]
                    + current[-len(mime):]
                    + link
                    + next[-b64end:]
                )
                newparts[i + 1] = next[:-b64end]
                break
    return (
        extract_imageparts(
            newparts[i] + newparts[i + 1] + b64.join(bodyparts[(i + 2):]), prefix
        )
        if len(bodyparts) > (i + 1)
        else "".join(newparts)
    )
def extract_dataimages(parts, prefix):
    """Write base64 data-image payloads found in split *parts* to files.

    *parts* is body.split("data:image"); a part ending in "](" is followed
    by its payload ("/jpeg;base64,<data>)..."). Returns the re-joined body
    with payloads replaced by CDN links.
    """
    newparts = list(parts)
    for part in parts:
        i = parts.index(part)
        if part.endswith("]("):
            # NOTE(review): split() without maxsplit raises ValueError if
            # ";base64," occurs more than once in the payload — confirm
            [ext, rest] = parts[i + 1].split(b64)
            name = prefix + "-" + str(len(cache))
            if ext == "/jpeg":
                ext = "jpg"
            else:
                ext = ext.replace("/", "")
            link = "/upload/image-" + name + "." + ext
            print("[extract] filename: " + link)
            b64end = rest.find(")")
            if b64end != -1:
                b64encoded = rest[:b64end]
                print("[extract] %d text bytes" % len(b64encoded))
                # write if not cached
                if b64encoded not in cache:
                    try:
                        content = base64.b64decode(b64encoded + "==")
                        open(public + link, "wb").write(content)
                        print("[extract] " + str(len(content)) + " image bytes")
                        cache[b64encoded] = name
                    except Exception:
                        raise Exception
                        # raise Exception('[extract] error decoding image %r' %b64encoded)
                else:
                    print("[extract] 0 image bytes, cached for " + cache[b64encoded])
                    name = cache[b64encoded]
                # update link with CDN
                # NOTE(review): nesting reconstructed — this is read as
                # applying the CDN prefix to both fresh and cached images
                link = cdn + "/upload/image-" + name + "." + ext
                # patch newparts
                newparts[i + 1] = link + rest[b64end:]
            else:
                raise Exception("cannot find the end of base64 encoded string")
        else:
            print("[extract] dataimage skipping part " + str(i))
            continue
    return "".join(newparts)
# marker that precedes every inline base64 image payload
di = "data:image"
def extract_md_images(body, prefix):
    """Find markdown data-URI images in *body* and externalize them.

    *prefix* seeds generated file names. Returns the rewritten body.
    """
    # normalize the few malformed image-link spellings seen in the dumps
    normalized = (
        body.replace("\n! [](" + di, "\n ![](" + di)
        .replace("\n[](" + di, "\n![](" + di)
        .replace(" [](" + di, " ![](" + di)
    )
    chunks = normalized.split(di)
    if len(chunks) > 1:
        return extract_dataimages(chunks, prefix)
    return normalized
def cleanup_md(body):
    """Normalize a markdown body: drop stray markup and odd whitespace."""
    # FIX: the chain contained .replace("", "..."), which inserts "..."
    # between every character of the body; the intent was clearly the
    # ellipsis character "…" (likely lost to an encoding mangle)
    newbody = (
        body.replace("<", "")
        .replace(">", "")
        .replace("{", "(")
        .replace("}", ")")
        .replace("\u2026", "...")  # ellipsis
        .replace(" __ ", " ")
        .replace("_ _", " ")
        .replace("****", "")
        .replace("\u00a0", " ")  # non-breaking space (was listed twice)
        .replace("\u02c6", "^")
        .replace("\ufeff", "")
        .replace("\u200b", "")
        .replace("\u200c", "")
    )  # .replace('\u2212', '-')
    return newbody
def extract_md(body, shout_dict=None):
    """Run the markdown cleanup/image/footnote pipeline on *body*.

    Raises Exception when any stage unexpectedly empties the text.
    """
    newbody = body
    if newbody:
        newbody = cleanup_md(newbody)
        if not newbody:
            raise Exception("cleanup error")
        if shout_dict:
            # fall back to a random uuid when the shout has no id yet
            uid = shout_dict['id'] or uuid.uuid4()
            newbody = extract_md_images(newbody, uid)
            if not newbody:
                raise Exception("extract_images error")
            # FIX: footnotes were previously extracted from the original
            # *body*, silently discarding the cleanup and image extraction
            # performed above
            newbody, placed = extract_footnotes(newbody, shout_dict)
            if not newbody:
                raise Exception("extract_footnotes error")
    return newbody
def extract_media(entry):
    """Normalized media extraction: return a list of {url, pic, title, body}.

    Raises Exception when *entry* has no "type" (layout) field.
    """
    kind = entry.get("type")
    if not kind:
        print(entry)
        raise Exception("shout no layout")
    normalized = []
    for item in entry.get("media") or []:
        # title, optionally prefixed by the performer/artist
        title = item.get("title", "").replace("\n", " ").replace("&nbsp;", " ")
        artist = item.get("performer") or item.get("artist")
        if artist:
            title = artist + " - " + title
        # preview picture via the thumbor CDN when available
        pic = ""
        if item.get("thumborId"):
            pic = cdn + "/unsafe/1600x/" + item["thumborId"]
        # main url, with per-kind fallbacks when no explicit url is set
        url = item.get("fileUrl") or item.get("url", "")
        if not url:
            if kind == "Image":
                url = pic
            elif "youtubeId" in item:
                url = "https://youtube.com/?watch=" + item["youtubeId"]
            elif "vimeoId" in item:
                url = "https://vimeo.com/" + item["vimeoId"]
        body = item.get("body") or item.get("literatureBody") or ""
        normalized.append({
            "url": url,
            "pic": pic,
            "title": title,
            "body": body
        })
    return normalized
def prepare_html_body(entry):
    """Build html for *entry*, assembling video/audio embed markup.

    NOTE(review): `body` accumulates the media embeds below but is then
    overwritten by extract_html(entry) at the end, so every embed built
    here is discarded — confirm whether addon should instead be appended
    after that call.
    """
    # body modifications
    body = ""
    kind = entry.get("type")
    addon = ""
    if kind == "Video":
        addon = ""
        for m in entry.get("media") or []:
            if "youtubeId" in m:
                addon += '<iframe width="420" height="345" src="http://www.youtube.com/embed/'
                addon += m["youtubeId"]
                addon += '?autoplay=1" frameborder="0" allowfullscreen></iframe>\n'
            elif "vimeoId" in m:
                addon += '<iframe src="https://player.vimeo.com/video/'
                addon += m["vimeoId"]
                addon += ' width="420" height="345" frameborder="0" allow="autoplay; fullscreen"'
                addon += " allowfullscreen></iframe>"
            else:
                print("[extract] media is not supported")
                print(m)
        body += addon
    elif kind == "Music":
        addon = ""
        for m in entry.get("media") or []:
            artist = m.get("performer")
            trackname = ""
            if artist:
                trackname += artist + " - "
            if "title" in m:
                trackname += m.get("title", "")
            addon += "<figure><figcaption>"
            addon += trackname
            addon += '</figcaption><audio controls src="'
            addon += m.get("fileUrl", "")
            addon += '"></audio></figure>'
        body += addon
    body = extract_html(entry)
    # if body_orig: body += extract_md(html2text(body_orig), entry['_id'])
    return body
def cleanup_html(body: str) -> str:
    """Iteratively strip known junk markup until the html stops changing."""
    junk_patterns = [
        r"style=\"width:\s*\d+px;height:\s*\d+px;\"",
        r"style=\"width:\s*\d+px;\"",
        r"style=\"color: #000000;\"",
        r"style=\"float: none;\"",
        r"style=\"background: white;\"",
        r"class=\"Apple-interchange-newline\"",
        r"class=\"MsoNormalCxSpMiddle\"",
        r"class=\"MsoNormal\"",
        r"lang=\"EN-US\"",
        r"id=\"docs-internal-guid-[\w-]+\"",
        r"<p>\s*</p>",
        r"<span></span>",
        r"<i>\s*</i>",
        r"<b>\s*</b>",
        r"<h1>\s*</h1>",
        r"<h2>\s*</h2>",
        r"<h3>\s*</h3>",
        r"<h4>\s*</h4>",
        r"<div>\s*</div>",
    ]
    rewrite_patterns = {
        r"<br>\s*</p>": "</p>"
    }
    cleaned = body
    while True:
        # repeat until a full pass makes no change, so nested junk
        # exposed by an earlier pass gets removed too
        before = cleaned
        for pattern in junk_patterns:
            cleaned = re.sub(pattern, "", cleaned)
        for pattern, replacement in rewrite_patterns.items():
            cleaned = re.sub(pattern, replacement, cleaned)
        if cleaned == before:
            return cleaned
def extract_html(entry, shout_id=None, cleanup=False):
    """Turn a raw content entry into parsed (and optionally cleaned) html.

    NOTE(review): extract_footnotes is called here with *shout_id*, while
    its signature takes a shout dict — confirm which contract is right.
    """
    # FIX: '\(' and '\)' are invalid escape sequences (SyntaxWarning on
    # modern Python); '\\(' spells the same two-character string explicitly
    body_orig = (entry.get("body") or "").replace('\\(', '(').replace('\\)', ')')
    if cleanup:
        # we do that before bs parsing to catch the invalid html
        body_clean = cleanup_html(body_orig)
        if body_clean != body_orig:
            print(f"[migration] html cleaned for slug {entry.get('slug', None)}")
            body_orig = body_clean
    if shout_id:
        extract_footnotes(body_orig, shout_id)
    body_html = str(BeautifulSoup(body_orig, features="html.parser"))
    if cleanup:
        # we do that after bs parsing because it can add dummy tags
        body_clean_html = cleanup_html(body_html)
        if body_clean_html != body_html:
            print(f"[migration] html cleaned after bs4 for slug {entry.get('slug', None)}")
            body_html = body_clean_html
    return body_html

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +0,0 @@
# Package entry point: `python -m html2text` runs the CLI.
from .cli import main

main()

View File

@ -1,323 +0,0 @@
import argparse
import sys
from . import HTML2Text, __version__, config
# noinspection DuplicatedCode
def main() -> None:
    """html2text CLI: parse options, read html, print markdown to stdout."""
    baseurl = ""

    # ANSI color codes used only for the decode-error warning below
    class bcolors:
        HEADER = "\033[95m"
        OKBLUE = "\033[94m"
        OKGREEN = "\033[92m"
        WARNING = "\033[93m"
        FAIL = "\033[91m"
        ENDC = "\033[0m"
        BOLD = "\033[1m"
        UNDERLINE = "\033[4m"

    # --- option parsing: every flag defaults to the value in config ---
    p = argparse.ArgumentParser()
    p.add_argument(
        "--default-image-alt",
        dest="default_image_alt",
        default=config.DEFAULT_IMAGE_ALT,
        help="The default alt string for images with missing ones",
    )
    p.add_argument(
        "--pad-tables",
        dest="pad_tables",
        action="store_true",
        default=config.PAD_TABLES,
        help="pad the cells to equal column width in tables",
    )
    p.add_argument(
        "--no-wrap-links",
        dest="wrap_links",
        action="store_false",
        default=config.WRAP_LINKS,
        help="don't wrap links during conversion",
    )
    p.add_argument(
        "--wrap-list-items",
        dest="wrap_list_items",
        action="store_true",
        default=config.WRAP_LIST_ITEMS,
        help="wrap list items during conversion",
    )
    p.add_argument(
        "--wrap-tables",
        dest="wrap_tables",
        action="store_true",
        default=config.WRAP_TABLES,
        help="wrap tables",
    )
    p.add_argument(
        "--ignore-emphasis",
        dest="ignore_emphasis",
        action="store_true",
        default=config.IGNORE_EMPHASIS,
        help="don't include any formatting for emphasis",
    )
    p.add_argument(
        "--reference-links",
        dest="inline_links",
        action="store_false",
        default=config.INLINE_LINKS,
        help="use reference style links instead of inline links",
    )
    p.add_argument(
        "--ignore-links",
        dest="ignore_links",
        action="store_true",
        default=config.IGNORE_ANCHORS,
        help="don't include any formatting for links",
    )
    p.add_argument(
        "--ignore-mailto-links",
        action="store_true",
        dest="ignore_mailto_links",
        default=config.IGNORE_MAILTO_LINKS,
        help="don't include mailto: links",
    )
    p.add_argument(
        "--protect-links",
        dest="protect_links",
        action="store_true",
        default=config.PROTECT_LINKS,
        help="protect links from line breaks surrounding them with angle brackets",
    )
    p.add_argument(
        "--ignore-images",
        dest="ignore_images",
        action="store_true",
        default=config.IGNORE_IMAGES,
        help="don't include any formatting for images",
    )
    p.add_argument(
        "--images-as-html",
        dest="images_as_html",
        action="store_true",
        default=config.IMAGES_AS_HTML,
        help=(
            "Always write image tags as raw html; preserves `height`, `width` and "
            "`alt` if possible."
        ),
    )
    p.add_argument(
        "--images-to-alt",
        dest="images_to_alt",
        action="store_true",
        default=config.IMAGES_TO_ALT,
        help="Discard image data, only keep alt text",
    )
    p.add_argument(
        "--images-with-size",
        dest="images_with_size",
        action="store_true",
        default=config.IMAGES_WITH_SIZE,
        help=(
            "Write image tags with height and width attrs as raw html to retain "
            "dimensions"
        ),
    )
    p.add_argument(
        "-g",
        "--google-doc",
        action="store_true",
        dest="google_doc",
        default=False,
        help="convert an html-exported Google Document",
    )
    p.add_argument(
        "-d",
        "--dash-unordered-list",
        action="store_true",
        dest="ul_style_dash",
        default=False,
        help="use a dash rather than a star for unordered list items",
    )
    p.add_argument(
        "-e",
        "--asterisk-emphasis",
        action="store_true",
        dest="em_style_asterisk",
        default=False,
        help="use an asterisk rather than an underscore for emphasized text",
    )
    p.add_argument(
        "-b",
        "--body-width",
        dest="body_width",
        type=int,
        default=config.BODY_WIDTH,
        help="number of characters per output line, 0 for no wrap",
    )
    p.add_argument(
        "-i",
        "--google-list-indent",
        dest="list_indent",
        type=int,
        default=config.GOOGLE_LIST_INDENT,
        help="number of pixels Google indents nested lists",
    )
    p.add_argument(
        "-s",
        "--hide-strikethrough",
        action="store_true",
        dest="hide_strikethrough",
        default=False,
        help="hide strike-through text. only relevant when -g is " "specified as well",
    )
    p.add_argument(
        "--escape-all",
        action="store_true",
        dest="escape_snob",
        default=False,
        help=(
            "Escape all special characters. Output is less readable, but avoids "
            "corner case formatting issues."
        ),
    )
    p.add_argument(
        "--bypass-tables",
        action="store_true",
        dest="bypass_tables",
        default=config.BYPASS_TABLES,
        help="Format tables in HTML rather than Markdown syntax.",
    )
    p.add_argument(
        "--ignore-tables",
        action="store_true",
        dest="ignore_tables",
        default=config.IGNORE_TABLES,
        help="Ignore table-related tags (table, th, td, tr) " "while keeping rows.",
    )
    p.add_argument(
        "--single-line-break",
        action="store_true",
        dest="single_line_break",
        default=config.SINGLE_LINE_BREAK,
        help=(
            "Use a single line break after a block element rather than two line "
            "breaks. NOTE: Requires --body-width=0"
        ),
    )
    p.add_argument(
        "--unicode-snob",
        action="store_true",
        dest="unicode_snob",
        default=config.UNICODE_SNOB,
        help="Use unicode throughout document",
    )
    p.add_argument(
        "--no-automatic-links",
        action="store_false",
        dest="use_automatic_links",
        default=config.USE_AUTOMATIC_LINKS,
        help="Do not use automatic links wherever applicable",
    )
    p.add_argument(
        "--no-skip-internal-links",
        action="store_false",
        dest="skip_internal_links",
        default=config.SKIP_INTERNAL_LINKS,
        help="Do not skip internal links",
    )
    p.add_argument(
        "--links-after-para",
        action="store_true",
        dest="links_each_paragraph",
        default=config.LINKS_EACH_PARAGRAPH,
        help="Put links after each paragraph instead of document",
    )
    p.add_argument(
        "--mark-code",
        action="store_true",
        dest="mark_code",
        default=config.MARK_CODE,
        help="Mark program code blocks with [code]...[/code]",
    )
    p.add_argument(
        "--decode-errors",
        dest="decode_errors",
        default=config.DECODE_ERRORS,
        help=(
            "What to do in case of decode errors.'ignore', 'strict' and 'replace' are "
            "acceptable values"
        ),
    )
    p.add_argument(
        "--open-quote",
        dest="open_quote",
        default=config.OPEN_QUOTE,
        help="The character used to open quotes",
    )
    p.add_argument(
        "--close-quote",
        dest="close_quote",
        default=config.CLOSE_QUOTE,
        help="The character used to close quotes",
    )
    p.add_argument(
        "--version", action="version", version=".".join(map(str, __version__))
    )
    p.add_argument("filename", nargs="?")
    p.add_argument("encoding", nargs="?", default="utf-8")
    args = p.parse_args()

    # --- input: named file or stdin, read as raw bytes ---
    if args.filename and args.filename != "-":
        with open(args.filename, "rb") as fp:
            data = fp.read()
    else:
        data = sys.stdin.buffer.read()
    try:
        html = data.decode(args.encoding, args.decode_errors)
    except UnicodeDecodeError as err:
        # point the user at the flag that avoids this crash, then re-raise
        warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
        warning += " Use the " + bcolors.OKGREEN
        warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
        print(warning)
        raise err

    # --- copy parsed options onto the converter instance ---
    h = HTML2Text(baseurl=baseurl)
    # handle options
    if args.ul_style_dash:
        h.ul_item_mark = "-"
    if args.em_style_asterisk:
        h.emphasis_mark = "*"
        h.strong_mark = "__"
    h.body_width = args.body_width
    h.google_list_indent = args.list_indent
    h.ignore_emphasis = args.ignore_emphasis
    h.ignore_links = args.ignore_links
    h.ignore_mailto_links = args.ignore_mailto_links
    h.protect_links = args.protect_links
    h.ignore_images = args.ignore_images
    h.images_as_html = args.images_as_html
    h.images_to_alt = args.images_to_alt
    h.images_with_size = args.images_with_size
    h.google_doc = args.google_doc
    h.hide_strikethrough = args.hide_strikethrough
    h.escape_snob = args.escape_snob
    h.bypass_tables = args.bypass_tables
    h.ignore_tables = args.ignore_tables
    h.single_line_break = args.single_line_break
    h.inline_links = args.inline_links
    h.unicode_snob = args.unicode_snob
    h.use_automatic_links = args.use_automatic_links
    h.skip_internal_links = args.skip_internal_links
    h.links_each_paragraph = args.links_each_paragraph
    h.mark_code = args.mark_code
    h.wrap_links = args.wrap_links
    h.wrap_list_items = args.wrap_list_items
    h.wrap_tables = args.wrap_tables
    h.pad_tables = args.pad_tables
    h.default_image_alt = args.default_image_alt
    h.open_quote = args.open_quote
    h.close_quote = args.close_quote

    sys.stdout.write(h.handle(html))

View File

@ -1,164 +0,0 @@
import re
# Use Unicode characters instead of their ascii pseudo-replacements
UNICODE_SNOB = True
# Marker to use for marking tables for padding post processing
TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"
# Escape all special characters. Output is less readable, but avoids
# corner case formatting issues.
ESCAPE_SNOB = True
# Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = False
# Wrap long lines at position. 0 for no wrapping.
BODY_WIDTH = 0
# Don't show internal links (href="#local-anchor") -- corresponding link
# targets won't be visible in the plain text file anyway.
SKIP_INTERNAL_LINKS = False
# Use inline, rather than reference, formatting for images and links
INLINE_LINKS = True
# Protect links from line breaks surrounding them with angle brackets (in
# addition to their square brackets)
PROTECT_LINKS = True
WRAP_LINKS = True
# Wrap list items.
WRAP_LIST_ITEMS = False
# Wrap tables
WRAP_TABLES = False
# Number of pixels Google indents nested lists
GOOGLE_LIST_INDENT = 36
# Values Google and others may use to indicate bold text
BOLD_TEXT_STYLE_VALUES = ("bold", "700", "800", "900")
IGNORE_ANCHORS = False
IGNORE_MAILTO_LINKS = False
IGNORE_IMAGES = False
IMAGES_AS_HTML = False
IMAGES_TO_ALT = False
IMAGES_WITH_SIZE = False
IGNORE_EMPHASIS = False
MARK_CODE = True
DECODE_ERRORS = "strict"
DEFAULT_IMAGE_ALT = ""
PAD_TABLES = True
# Convert links with same href and text to <href> format
# if they are absolute links
USE_AUTOMATIC_LINKS = True
# For checking space-only lines on line 771
RE_SPACE = re.compile(r"\s\+")
RE_ORDERED_LIST_MATCHER = re.compile(r"\d+\.\s")
RE_UNORDERED_LIST_MATCHER = re.compile(r"[-\*\+]\s")
RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])")
RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")
# to find links in the text
RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")
# to find table separators
RE_TABLE = re.compile(r" \| ")
RE_MD_DOT_MATCHER = re.compile(
r"""
^ # start of line
(\s*\d+) # optional whitespace and a number
(\.) # dot
(?=\s) # lookahead assert whitespace
""",
re.MULTILINE | re.VERBOSE,
)
RE_MD_PLUS_MATCHER = re.compile(
r"""
^
(\s*)
(\+)
(?=\s)
""",
flags=re.MULTILINE | re.VERBOSE,
)
RE_MD_DASH_MATCHER = re.compile(
r"""
^
(\s*)
(-)
(?=\s|\-) # followed by whitespace (bullet list, or spaced out hr)
# or another dash (header or hr)
""",
flags=re.MULTILINE | re.VERBOSE,
)
RE_SLASH_CHARS = r"\`*_{}[]()#+-.!"
RE_MD_BACKSLASH_MATCHER = re.compile(
r"""
(\\) # match one slash
(?=[%s]) # followed by a char that requires escaping
"""
% re.escape(RE_SLASH_CHARS),
flags=re.VERBOSE,
)
UNIFIABLE = {
"rsquo": "'",
"lsquo": "'",
"rdquo": '"',
"ldquo": '"',
"copy": "(C)",
"mdash": "--",
"nbsp": " ",
"rarr": "->",
"larr": "<-",
"middot": "*",
"ndash": "-",
"oelig": "oe",
"aelig": "ae",
"agrave": "a",
"aacute": "a",
"acirc": "a",
"atilde": "a",
"auml": "a",
"aring": "a",
"egrave": "e",
"eacute": "e",
"ecirc": "e",
"euml": "e",
"igrave": "i",
"iacute": "i",
"icirc": "i",
"iuml": "i",
"ograve": "o",
"oacute": "o",
"ocirc": "o",
"otilde": "o",
"ouml": "o",
"ugrave": "u",
"uacute": "u",
"ucirc": "u",
"uuml": "u",
"lrm": "",
"rlm": "",
}
# Format tables in HTML rather than Markdown syntax
BYPASS_TABLES = False
# Ignore table-related tags (table, th, td, tr) while keeping rows
IGNORE_TABLES = False
# Use a single line break after a block element rather than two line breaks.
# NOTE: Requires body width setting to be 0.
SINGLE_LINE_BREAK = False
# Use double quotation marks when converting the <q> tag.
OPEN_QUOTE = '"'
CLOSE_QUOTE = '"'

View File

@ -1,18 +0,0 @@
from typing import Dict, Optional
class AnchorElement:
    """Bookkeeping record for one reference-style link anchor."""

    __slots__ = ["attrs", "count", "outcount"]

    def __init__(self, attrs: Dict[str, Optional[str]], count: int, outcount: int):
        # attrs are the original tag attributes; count/outcount are the
        # converter's anchor counters.
        self.attrs, self.count, self.outcount = attrs, count, outcount
class ListElement:
    """A list tag name paired with a numeric counter."""

    __slots__ = ["name", "num"]

    def __init__(self, name: str, num: int):
        self.name, self.num = name, num

View File

@ -1,3 +0,0 @@
class OutCallback:
    """Callback protocol: invoked with each chunk of generated text.

    Implementations accept a single string and return None.
    """

    def __call__(self, s: str) -> None:
        ...

View File

@ -1,287 +0,0 @@
import html.entities
from typing import Dict, List, Optional
from . import config
# Codepoint -> ascii replacement, derived from config.UNIFIABLE.
# "nbsp" is excluded here; presumably it is handled separately by the
# converter -- confirm in the caller.
unifiable_n = {
    html.entities.name2codepoint[k]: v
    for k, v in config.UNIFIABLE.items()
    if k != "nbsp"
}
def hn(tag: str) -> int:
    """Return the heading level of an ``h1``–``h9`` tag name, else 0.

    Fix over the original: the length check now runs before indexing, so an
    empty tag string returns 0 instead of raising IndexError.
    """
    if len(tag) == 2 and tag[0] == "h":
        n = tag[1]
        # Only the digit characters strictly between "0" and "9" qualify,
        # so "h0" and non-digit second characters fall through to 0.
        if "0" < n <= "9":
            return int(n)
    return 0
def dumb_property_dict(style: str) -> Dict[str, str]:
    """Parse an inline css declaration string into a property->value map.

    Both property names and values are lower-cased and stripped; later
    duplicates of a property overwrite earlier ones.
    """
    parsed: Dict[str, str] = {}
    for declaration in style.split(";"):
        if ":" not in declaration:
            continue
        prop, _, value = declaration.partition(":")
        parsed[prop.strip().lower()] = value.strip().lower()
    return parsed
def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
    """Parse a stylesheet into a selector -> {attribute: value} mapping.

    ``@import`` sentences are stripped first. On any malformed rule the
    whole result collapses to an empty dict (parsing here is best-effort).
    """
    # A trailing ";" guarantees every @import sentence has a terminator.
    data += ";"
    # remove @import sentences
    start = data.find("@import")
    while start != -1:
        end = data.find(";", start)
        data = data[:start] + data[end + 1 :]
        start = data.find("@import")
    selectors: Dict[str, Dict[str, str]] = {}
    try:
        for chunk in data.split("}"):
            if "{" not in chunk.strip():
                continue
            # A chunk with more than one "{" raises ValueError below,
            # matching the original all-or-nothing behaviour.
            selector, body = chunk.split("{")
            selectors[selector.strip()] = dumb_property_dict(body)
    except ValueError:
        selectors = {}  # not that important
    return selectors
def element_style(
    attrs: Dict[str, Optional[str]],
    style_def: Dict[str, Dict[str, str]],
    parent_style: Dict[str, str],
) -> Dict[str, str]:
    """Compute the effective style of an element.

    Starts from a copy of ``parent_style``, layers in the rules matching
    each css class in ``attrs["class"]``, then applies the inline
    ``style`` attribute, which has the highest priority.
    """
    computed = dict(parent_style)
    class_attr = attrs.get("class")
    if class_attr:
        for css_class in class_attr.split():
            computed.update(style_def.get("." + css_class, {}))
    inline = attrs.get("style")
    if inline:
        computed.update(dumb_property_dict(inline))
    return computed
def google_list_style(style: Dict[str, str]) -> str:
    """Classify a Google Docs list as unordered ("ul") or ordered ("ol")."""
    bullet_styles = ("disc", "circle", "square", "none")
    if style.get("list-style-type") in bullet_styles:
        return "ul"
    return "ol"
def google_has_height(style: Dict[str, str]) -> bool:
    """Report whether the element's style explicitly defines 'height'.

    Values are css strings, so a present key is never None.
    """
    return style.get("height") is not None
def google_text_emphasis(style: Dict[str, str]) -> List[str]:
    """List the emphasis modifiers set on an element.

    Checks, in order: text-decoration, font-style, font-weight.
    """
    relevant = ("text-decoration", "font-style", "font-weight")
    return [style[prop] for prop in relevant if prop in style]
def google_fixed_width_font(style: Dict[str, str]) -> bool:
    """Report whether the element's css selects a fixed-width font.

    Only "courier new" and "consolas" are recognized.
    """
    return style.get("font-family", "") in ("courier new", "consolas")
def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
    """Extract the zero-based start index from a list element's attributes.

    Returns 0 when the "start" attribute is missing, empty, or not an
    integer (html's start is 1-based, hence the -1).
    """
    start = attrs.get("start")
    if not start:
        return 0
    try:
        return int(start) - 1
    except ValueError:
        return 0
def skipwrap(
    para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
) -> bool:
    """Decide whether *para* must be exempt from line-wrapping.

    Fix over the original: indexing is done with slices (``para[0:1]``)
    so an empty paragraph returns instead of raising IndexError.
    """
    # If it appears to contain a link
    # don't wrap
    if not wrap_links and config.RE_LINK.search(para):
        return True
    # If the text begins with four spaces or one tab, it's a code block;
    # don't wrap
    if para[0:4] == "    " or para[0:1] == "\t":
        return True
    # If the text begins with only two "--", possibly preceded by
    # whitespace, that's an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False
    # I'm not sure what this is for; I thought it was to detect lists,
    # but there's a <br>-inside-<span> case in one of the tests that
    # also depends upon it.
    if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**":
        return not wrap_list_items
    # If text contains a pipe character it is likely a table
    if not wrap_tables and config.RE_TABLE.search(para):
        return True
    # If the text begins with a single -, *, or +, followed by a space,
    # or an integer, followed by a ., followed by a space (in either
    # case optionally proceeded by whitespace), it's a list; don't wrap.
    return bool(
        config.RE_ORDERED_LIST_MATCHER.match(stripped)
        or config.RE_UNORDERED_LIST_MATCHER.match(stripped)
    )
def escape_md(text: str) -> str:
    """
    Escapes markdown-sensitive characters within other markdown
    constructs.

    Only backslashes, brackets and parentheses are escaped here; the
    broader per-section escaping lives in escape_md_section.
    """
    return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)
def escape_md_section(text: str, snob: bool = False) -> str:
    """Escape markdown-sensitive characters across whole document sections.

    With ``snob`` set, every markdown special character is escaped, not
    just the ones that would break formatting.
    """
    # Backslashes that precede escapable characters must themselves be escaped.
    text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)
    if snob:
        text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text)
    # Neutralize line-leading "1.", "+" and "-" that would be read as lists.
    for matcher in (
        config.RE_MD_DOT_MATCHER,
        config.RE_MD_PLUS_MATCHER,
        config.RE_MD_DASH_MATCHER,
    ):
        text = matcher.sub(r"\1\\\2", text)
    return text
def reformat_table(lines: List[str], right_margin: int) -> List[str]:
    """
    Given the lines of a table
    padds the cells and returns the new lines

    Column widths are seeded from the first line and then grown to fit
    every row; rows with fewer cells than the widest row are padded with
    empty cells, rows with more cells extend the width list.
    """
    # find the maximum width of the columns
    max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")]
    max_cols = len(max_width)
    for line in lines:
        cols = [x.rstrip() for x in line.split("|")]
        num_cols = len(cols)
        # don't drop any data if colspan attributes result in unequal lengths
        if num_cols < max_cols:
            cols += [""] * (max_cols - num_cols)
        elif max_cols < num_cols:
            max_width += [
                len(x) + right_margin for x in cols[-(num_cols - max_cols) :]
            ]
            max_cols = num_cols
        # widen every tracked column to fit this row's cells
        max_width = [
            max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)
        ]
    # reformat
    new_lines = []
    for line in lines:
        cols = [x.rstrip() for x in line.split("|")]
        if set(line.strip()) == set("-|"):
            # separator row: pad with dashes instead of spaces
            filler = "-"
            new_cols = [
                x.rstrip() + (filler * (M - len(x.rstrip())))
                for x, M in zip(cols, max_width)
            ]
            new_lines.append("|-" + "|".join(new_cols) + "|")
        else:
            filler = " "
            new_cols = [
                x.rstrip() + (filler * (M - len(x.rstrip())))
                for x, M in zip(cols, max_width)
            ]
            new_lines.append("| " + "|".join(new_cols) + "|")
    return new_lines
def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
    """Pad every marked table in *text* so its columns line up.

    Table regions are delimited by config.TABLE_MARKER_FOR_PAD lines;
    the marker lines themselves are dropped, and a blank line is
    emitted after each reformatted table.
    """
    result: List[str] = []
    table_buffer: List[str] = []
    inside_table = False
    for line in text.split("\n"):
        if config.TABLE_MARKER_FOR_PAD in line:
            inside_table = not inside_table
            if not inside_table:
                # closing marker: flush the buffered table, padded
                result.extend(reformat_table(table_buffer, right_margin))
                table_buffer = []
                result.append("")
            continue
        if inside_table:
            table_buffer.append(line)
        else:
            result.append(line)
    return "\n".join(result)

View File

@ -1 +0,0 @@
# Public table-migration submodules re-exported by this package.
# Fix: the original wrapped the list in a one-element tuple (trailing
# comma), so __all__ contained a list instead of strings and
# "from ... import *" would fail.
__all__ = ["users", "topics", "content_items", "comments"]

View File

@ -1,209 +0,0 @@
from datetime import datetime, timezone
from dateutil.parser import parse as date_parse
from services.db import local_session
from migration.html2text import html2text
from orm.reaction import Reaction, ReactionKind
from orm.shout import ShoutReactionsFollower
from orm.topic import TopicFollower
from orm.user import User
from orm.shout import Shout
# Timestamp of this migration run; used below as the fallback createdAt
# for legacy comments that carry no timestamp.
ts = datetime.now(tz=timezone.utc)
def auto_followers(session, topics, reaction_dict):
    """Make the reaction's author auto-follow the shout and its topics.

    Idempotent: existing follower rows are reused, new ones are created
    with auto=True. Does not commit; the caller owns the transaction.
    """
    # creating shout's reactions following for reaction author
    following1 = (
        session.query(ShoutReactionsFollower)
        .where(ShoutReactionsFollower.follower == reaction_dict["createdBy"])
        .filter(ShoutReactionsFollower.shout == reaction_dict["shout"])
        .first()
    )
    if not following1:
        following1 = ShoutReactionsFollower.create(
            follower=reaction_dict["createdBy"], shout=reaction_dict["shout"], auto=True
        )
        session.add(following1)
    # creating topics followings for reaction author
    for t in topics:
        tf = (
            session.query(TopicFollower)
            .where(TopicFollower.follower == reaction_dict["createdBy"])
            .filter(TopicFollower.topic == t["id"])
            .first()
        )
        if not tf:
            topic_following = TopicFollower.create(
                follower=reaction_dict["createdBy"], topic=t["id"], auto=True
            )
            session.add(topic_following)
def migrate_ratings(session, entry, reaction_dict):
    """Convert a legacy comment's up/down ratings into LIKE/DISLIKE reactions.

    Each rating becomes a Reaction replying to the migrated comment
    (reaction_dict). Unknown raters fall back to user id 1.
    Commits once after processing all ratings.
    """
    for comment_rating_old in entry.get("ratings", []):
        rater = (
            session.query(User)
            .filter(User.oid == comment_rating_old["createdBy"])
            .first()
        )
        re_reaction_dict = {
            "shout": reaction_dict["shout"],
            "replyTo": reaction_dict["id"],
            "kind": ReactionKind.LIKE
            if comment_rating_old["value"] > 0
            else ReactionKind.DISLIKE,
            # fallback to user id 1 when the rater was not migrated
            "createdBy": rater.id if rater else 1,
        }
        cts = comment_rating_old.get("createdAt")
        if cts:
            re_reaction_dict["createdAt"] = date_parse(cts)
        try:
            # creating reaction from old rating
            rr = Reaction.create(**re_reaction_dict)
            following2 = (
                session.query(ShoutReactionsFollower)
                .where(ShoutReactionsFollower.follower == re_reaction_dict["createdBy"])
                .filter(ShoutReactionsFollower.shout == rr.shout)
                .first()
            )
            if not following2:
                following2 = ShoutReactionsFollower.create(
                    follower=re_reaction_dict["createdBy"], shout=rr.shout, auto=True
                )
                session.add(following2)
            session.add(rr)
        except Exception as e:
            print("[migration] comment rating error: %r" % re_reaction_dict)
            raise e
    session.commit()
async def migrate(entry, storage):
    """
    Migrate one legacy comment document into a Reaction row.

    Returns the created Reaction on success, or the legacy shout oid when
    the comment's shout has not been migrated yet (so the caller can retry).

    {
    "_id": "hdtwS8fSyFLxXCgSC",
    "body": "<p>",
    "contentItem": "mnK8KsJHPRi8DrybQ",
    "createdBy": "bMFPuyNg6qAD2mhXe",
    "thread": "01/",
    "createdAt": "2016-04-19 04:33:53+00:00",
    "ratings": [
    { "createdBy": "AqmRukvRiExNpAe8C", "value": 1 },
    { "createdBy": "YdE76Wth3yqymKEu5", "value": 1 }
    ],
    "rating": 2,
    "updatedAt": "2020-05-27 19:22:57.091000+00:00",
    "updatedBy": "0"
    }
    ->
    type Reaction {
    id: Int!
    shout: Shout!
    createdAt: DateTime!
    createdBy: User!
    updatedAt: DateTime
    deletedAt: DateTime
    deletedBy: User
    range: String # full / 0:2340
    kind: ReactionKind!
    body: String
    replyTo: Reaction
    stat: Stat
    old_id: String
    old_thread: String
    }
    """
    old_ts = entry.get("createdAt")
    reaction_dict = {
        # fall back to the migration-run timestamp for undated comments
        "createdAt": (ts if not old_ts else date_parse(old_ts)),
        "body": html2text(entry.get("body", "")),
        "oid": entry["_id"],
    }
    shout_oid = entry.get("contentItem")
    if shout_oid not in storage["shouts"]["by_oid"]:
        if len(storage["shouts"]["by_oid"]) > 0:
            # shout exists in the dump but was not migrated yet: defer
            return shout_oid
        else:
            print("[migration] no shouts migrated yet")
            raise Exception
        # NOTE(review): unreachable -- both branches above exit first
        return
    else:
        # "stage" tracks progress for the error message raised below
        stage = "started"
        reaction = None
        with local_session() as session:
            author = session.query(User).filter(User.oid == entry["createdBy"]).first()
            old_shout = storage["shouts"]["by_oid"].get(shout_oid)
            if not old_shout:
                raise Exception("no old shout in storage")
            else:
                stage = "author and old id found"
                try:
                    shout = (
                        session.query(Shout)
                        .where(Shout.slug == old_shout["slug"])
                        .one()
                    )
                    if shout:
                        reaction_dict["shout"] = shout.id
                        # fallback to user id 1 when the author was not migrated
                        reaction_dict["createdBy"] = author.id if author else 1
                        reaction_dict["kind"] = ReactionKind.COMMENT
                        # creating reaction from old comment
                        reaction = Reaction.create(**reaction_dict)
                        session.add(reaction)
                        # session.commit()
                        stage = "new reaction commited"
                        reaction_dict = reaction.dict()
                        topics = [t.dict() for t in shout.topics]
                        auto_followers(session, topics, reaction_dict)
                        migrate_ratings(session, entry, reaction_dict)
                    return reaction
                except Exception as e:
                    print(e)
                    print(reaction)
                    raise Exception(stage)
                # NOTE(review): unreachable -- try returns, except raises
                return
def migrate_2stage(old_comment, idmap):
    """Second migration pass for comments: wire up reply-to links and
    ensure the author follows the shout's reactions.

    :param old_comment: legacy comment document (expects "_id", "body",
        optionally "replyTo")
    :param idmap: mapping of legacy oid -> new Reaction id
    """
    if old_comment.get("body"):
        # Fix: the original computed idmap.get(old_comment.get("oid")) and
        # immediately overwrote it with the "_id" lookup; the dead first
        # lookup is removed (behavior unchanged).
        new_id = idmap.get(old_comment.get("_id"))
        if new_id:
            new_replyto_id = None
            old_replyto_id = old_comment.get("replyTo")
            if old_replyto_id:
                new_replyto_id = int(idmap.get(old_replyto_id, "0"))
            with local_session() as session:
                comment = session.query(Reaction).where(Reaction.id == new_id).first()
                try:
                    if new_replyto_id:
                        new_reply = (
                            session.query(Reaction)
                            .where(Reaction.id == new_replyto_id)
                            .first()
                        )
                        if not new_reply:
                            print(new_replyto_id)
                            raise Exception("cannot find reply by id!")
                        comment.replyTo = new_reply.id
                        session.add(comment)
                    srf = (
                        session.query(ShoutReactionsFollower)
                        .where(ShoutReactionsFollower.shout == comment.shout)
                        .filter(ShoutReactionsFollower.follower == comment.createdBy)
                        .first()
                    )
                    if not srf:
                        srf = ShoutReactionsFollower.create(
                            shout=comment.shout, follower=comment.createdBy, auto=True
                        )
                        session.add(srf)
                    session.commit()
                except Exception as e:
                    # Fix: chain the original failure instead of discarding it
                    raise Exception("cannot find a comment by oldid") from e

View File

@ -1,420 +0,0 @@
from datetime import datetime, timezone
import json
from dateutil.parser import parse as date_parse
from sqlalchemy.exc import IntegrityError
from transliterate import translit
from services.db import local_session
from migration.extract import extract_html, extract_media
from orm.reaction import Reaction, ReactionKind
from orm.shout import Shout, ShoutTopic, ShoutReactionsFollower
from orm.user import User
from orm.topic import TopicFollower, Topic
from services.viewed import ViewedStorage
import re
# Date assigned to entries that lack "createdAt" in the legacy dump.
OLD_DATE = "2016-03-05 22:22:00.350000"
# Timestamp of this migration run; fallback for missing "updatedAt".
ts = datetime.now(tz=timezone.utc)
# Legacy content type -> new Shout.layout value.
type2layout = {
    "Article": "article",
    "Literature": "literature",
    "Music": "audio",
    "Video": "video",
    "Image": "image",
}
# Fallback author profiles used when the real creator cannot be resolved.
anondict = {"slug": "anonymous", "id": 1, "name": "Аноним"}
discours = {"slug": "discours", "id": 2, "name": "Дискурс"}
def get_shout_slug(entry):
    """Derive a url-safe slug for a legacy content item.

    Prefers entry["slug"]; otherwise takes the first non-empty slug from
    entry["friendlySlugs"]. Non-alphanumeric runs become "-".
    """
    slug = entry.get("slug", "")
    if not slug:
        for friend in entry.get("friendlySlugs", []):
            slug = friend.get("slug", "")
            if slug:
                break
    return re.sub("[^0-9a-zA-Z]+", "-", slug)
def create_author_from_app(app):
    """Create (or find) a User from a legacy application record.

    Tries progressively: existing user by email, by transliterated-name
    slug, by email local-part slug, by "<slug>-author"; creates a new
    user only if all lookups fail. Returns the user as a dict.

    Raises when the application record has no email.
    """
    user = None
    userdata = None
    # check if email is used
    if app["email"]:
        with local_session() as session:
            user = session.query(User).where(User.email == app["email"]).first()
            if not user:
                # print('[migration] app %r' % app)
                name = app.get("name")
                # NOTE(review): if name is falsy, "slug" below is unbound and
                # the "if slug:" check raises UnboundLocalError -- confirm
                # every legacy application record has a name.
                if name:
                    slug = translit(name, "ru", reversed=True).lower()
                    slug = re.sub("[^0-9a-zA-Z]+", "-", slug)
                    print("[migration] created slug %s" % slug)
                # check if slug is used
                if slug:
                    user = session.query(User).where(User.slug == slug).first()
                    # get slug from email
                    if user:
                        slug = app["email"].split("@")[0]
                        user = session.query(User).where(User.slug == slug).first()
                        # one more try
                        if user:
                            slug += "-author"
                            user = (
                                session.query(User).where(User.slug == slug).first()
                            )
                # create user with application data
                if not user:
                    userdata = {
                        "username": app["email"],
                        "email": app["email"],
                        "name": app.get("name", ""),
                        "bio": app.get("bio", ""),
                        "emailConfirmed": False,
                        "slug": slug,
                        "createdAt": ts,
                        "lastSeen": ts,
                    }
                    # print('[migration] userdata %r' % userdata)
                    user = User.create(**userdata)
                    session.add(user)
                    session.commit()
                    userdata["id"] = user.id
            userdata = user.dict()
        return userdata
    else:
        raise Exception("app is not ok", app)
async def create_shout(shout_dict):
    """Persist a new Shout and make its first author follow its reactions.

    Returns the created Shout instance.
    """
    s = Shout.create(**shout_dict)
    author = s.authors[0]
    with local_session() as session:
        srf = (
            session.query(ShoutReactionsFollower)
            .where(ShoutReactionsFollower.shout == s.id)
            .filter(ShoutReactionsFollower.follower == author.id)
            .first()
        )
        if not srf:
            srf = ShoutReactionsFollower.create(
                shout=s.id, follower=author.id, auto=True
            )
            session.add(srf)
            session.commit()
    return s
async def get_user(entry, storage):
    """Resolve the author of a legacy content item.

    Resolution order: application record -> "0" (system "discours" user)
    -> migrated user by oid -> anonymous fallback. Slugs are normalized
    to url-safe form before the user is processed.

    Returns (user, legacy user oid or None).
    """
    app = entry.get("application")
    userdata = None
    user_oid = None
    if app:
        userdata = create_author_from_app(app)
    else:
        user_oid = entry.get("createdBy")
        if user_oid == "0":
            userdata = discours
        elif user_oid:
            userdata = storage["users"]["by_oid"].get(user_oid)
        if not userdata:
            print("no userdata by oid, anonymous")
            userdata = anondict
            print(app)
    # cleanup slug
    if userdata:
        slug = userdata.get("slug", "")
        if slug:
            slug = re.sub("[^0-9a-zA-Z]+", "-", slug)
            userdata["slug"] = slug
    else:
        userdata = anondict
    user = await process_user(userdata, storage, user_oid)
    return user, user_oid
async def migrate(entry, storage):
    """Migrate one legacy content item into a Shout.

    Builds the new shout dict from the legacy entry, persists it (falling
    back to conflict resolution on slug collisions), migrates topics,
    ratings and view counts, and caches the result in storage.

    Returns the migrated shout as a dict.
    """
    author, user_oid = await get_user(entry, storage)
    r = {
        "layout": type2layout[entry["type"]],
        "title": entry["title"],
        "authors": [
            author,
        ],
        "slug": get_shout_slug(entry),
        "cover": (
            "https://assets.discours.io/unsafe/1600x/" + entry["thumborId"]
            if entry.get("thumborId")
            else entry.get("image", {}).get("url")
        ),
        # unpublished entries stay visible to their authors only
        "visibility": "public" if entry.get("published") else "authors",
        "publishedAt": date_parse(entry.get("publishedAt"))
        if entry.get("published")
        else None,
        "deletedAt": date_parse(entry.get("deletedAt"))
        if entry.get("deletedAt")
        else None,
        "createdAt": date_parse(entry.get("createdAt", OLD_DATE)),
        "updatedAt": date_parse(entry["updatedAt"]) if "updatedAt" in entry else ts,
        "topics": await add_topics_follower(entry, storage, author),
        "body": extract_html(entry, cleanup=True),
    }
    # main topic patch
    r["mainTopic"] = r["topics"][0]
    # published author auto-confirm
    if entry.get("published"):
        with local_session() as session:
            # update user.emailConfirmed if published
            author.emailConfirmed = True
            session.add(author)
            session.commit()
    # media
    media = extract_media(entry)
    r["media"] = json.dumps(media, ensure_ascii=True) if media else None
    # ----------------------------------- copy
    shout_dict = r.copy()
    # "topics" is not a column; it is handled by topics_aftermath below
    del shout_dict["topics"]
    try:
        # save shout to db
        shout_dict["oid"] = entry.get("_id", "")
        shout = await create_shout(shout_dict)
    except IntegrityError as e:
        print("[migration] create_shout integrity error", e)
        shout = await resolve_create_shout(shout_dict)
    except Exception as e:
        raise Exception(e)
    # udpate data
    shout_dict = shout.dict()
    shout_dict["authors"] = [
        author.dict(),
    ]
    # shout topics aftermath
    shout_dict["topics"] = await topics_aftermath(r, storage)
    # content_item ratings to reactions
    await content_ratings_to_reactions(entry, shout_dict["slug"])
    # shout views
    await ViewedStorage.increment(
        shout_dict["slug"], amount=entry.get("views", 1), viewer="old-discours"
    )
    # del shout_dict['ratings']
    storage["shouts"]["by_oid"][entry["_id"]] = shout_dict
    storage["shouts"]["by_slug"][shout_dict["slug"]] = shout_dict
    return shout_dict
async def add_topics_follower(entry, storage, user):
    """Collect the entry's topic slugs and subscribe the author to them.

    Topic slugs come from the legacy category plus tags; the (replaced)
    category slug is moved to the front of the returned list as the main
    topic.
    """
    topics = set([])
    category = entry.get("category")
    topics_by_oid = storage["topics"]["by_oid"]
    oids = [
        category,
    ] + entry.get("tags", [])
    for toid in oids:
        tslug = topics_by_oid.get(toid, {}).get("slug")
        if tslug:
            topics.add(tslug)
    ttt = list(topics)
    # add author as TopicFollower
    with local_session() as session:
        for tpcslug in topics:
            try:
                tpc = session.query(Topic).where(Topic.slug == tpcslug).first()
                if tpc:
                    tf = (
                        session.query(TopicFollower)
                        .where(TopicFollower.follower == user.id)
                        .filter(TopicFollower.topic == tpc.id)
                        .first()
                    )
                    if not tf:
                        tf = TopicFollower.create(
                            topic=tpc.id, follower=user.id, auto=True
                        )
                        session.add(tf)
                        session.commit()
            except IntegrityError:
                # NOTE(review): "tpc" may be None/stale here if the query
                # itself failed -- confirm the failure mode before relying
                # on this message.
                print("[migration.shout] hidden by topic " + tpc.slug)
    # main topic
    maintopic = storage["replacements"].get(topics_by_oid.get(category, {}).get("slug"))
    if maintopic in ttt:
        ttt.remove(maintopic)
    # NOTE(review): inserted unconditionally, so a missing replacement puts
    # None at position 0 -- callers use topics[0] as mainTopic; confirm.
    ttt.insert(0, maintopic)
    return ttt
async def process_user(userdata, storage, oid):
    """Fetch or create the User row for migrated author data.

    Falls back to the anonymous profile (id 1) when userdata has no id.
    Updates the storage caches (by_slug / by_oid) and returns the User.
    """
    with local_session() as session:
        uid = userdata.get("id")  # anonymous as
        if not uid:
            print(userdata)
            print("has no id field, set it @anonymous")
            userdata = anondict
            uid = 1
        user = session.query(User).filter(User.id == uid).first()
        if not user:
            try:
                slug = userdata["slug"].lower().strip()
                slug = re.sub("[^0-9a-zA-Z]+", "-", slug)
                userdata["slug"] = slug
                user = User.create(**userdata)
                session.add(user)
                session.commit()
            except IntegrityError:
                print(f"[migration] user creating with slug {userdata['slug']}")
                print("[migration] from userdata")
                print(userdata)
                raise Exception(
                    "[migration] cannot create user in content_items.get_user()"
                )
        # debug tracers for two specific accounts in the legacy dump
        if user.id == 946:
            print("[migration] ***************** ALPINA")
        if user.id == 2:
            print("[migration] +++++++++++++++++ DISCOURS")
        userdata["id"] = user.id
        userdata["createdAt"] = user.createdAt
        storage["users"]["by_slug"][userdata["slug"]] = userdata
        storage["users"]["by_oid"][oid] = userdata
        if not user:
            raise Exception("could not get a user")
        return user
async def resolve_create_shout(shout_dict):
    """Resolve a slug collision hit while creating a shout.

    If an existing shout with the same slug has a different createdAt it
    is treated as a different item and re-created under "<slug>-<layout>";
    otherwise the existing row is updated when any field differs.
    Returns the existing (possibly updated) shout.
    """
    with local_session() as session:
        s = session.query(Shout).filter(Shout.slug == shout_dict["slug"]).first()
        bump = False
        if s:
            if s.createdAt != shout_dict["createdAt"]:
                # create new with different slug
                shout_dict["slug"] += "-" + shout_dict["layout"]
                try:
                    await create_shout(shout_dict)
                except IntegrityError as e:
                    print(e)
                    bump = True
            else:
                # update old
                for key in shout_dict:
                    if key in s.__dict__:
                        if s.__dict__[key] != shout_dict[key]:
                            print(
                                "[migration] shout already exists, but differs in %s"
                                % key
                            )
                            bump = True
                    else:
                        print("[migration] shout already exists, but lacks %s" % key)
                        bump = True
                if bump:
                    s.update(shout_dict)
        else:
            # reached only if the IntegrityError had another cause
            print("[migration] something went wrong with shout: \n%r" % shout_dict)
            raise Exception("")
        session.commit()
        return s
async def topics_aftermath(entry, storage):
    """Re-link a migrated shout's topics to their replacement slugs.

    For each legacy topic slug, applies the replacements table, moves the
    existing ShoutTopic row to the new topic (or creates one), and returns
    the list of final topic slugs.
    """
    r = []
    for tpc in filter(lambda x: bool(x), entry["topics"]):
        oldslug = tpc
        newslug = storage["replacements"].get(oldslug, oldslug)
        if newslug:
            with local_session() as session:
                shout = session.query(Shout).where(Shout.slug == entry["slug"]).first()
                new_topic = session.query(Topic).where(Topic.slug == newslug).first()
                shout_topic_old = (
                    session.query(ShoutTopic)
                    .join(Shout)
                    .join(Topic)
                    .filter(Shout.slug == entry["slug"])
                    .filter(Topic.slug == oldslug)
                    .first()
                )
                if shout_topic_old:
                    # re-point the existing link at the replacement topic
                    shout_topic_old.update({"topic": new_topic.id})
                else:
                    shout_topic_new = (
                        session.query(ShoutTopic)
                        .join(Shout)
                        .join(Topic)
                        .filter(Shout.slug == entry["slug"])
                        .filter(Topic.slug == newslug)
                        .first()
                    )
                    if not shout_topic_new:
                        try:
                            ShoutTopic.create(
                                **{"shout": shout.id, "topic": new_topic.id}
                            )
                        except Exception:
                            print("[migration] shout topic error: " + newslug)
                session.commit()
            if newslug not in r:
                r.append(newslug)
        else:
            print("[migration] ignored topic slug: \n%r" % tpc["slug"])
            # raise Exception
    return r
async def content_ratings_to_reactions(entry, slug):
    """Convert a legacy content item's ratings into LIKE/DISLIKE reactions.

    A repeated rating by the same user on the same shout is upgraded to
    AGREE/DISAGREE instead of creating a duplicate row.
    """
    try:
        with local_session() as session:
            for content_rating in entry.get("ratings", []):
                rater = (
                    session.query(User)
                    .filter(User.oid == content_rating["createdBy"])
                    .first()
                ) or User.default_user
                shout = session.query(Shout).where(Shout.slug == slug).first()
                cts = content_rating.get("createdAt")
                reaction_dict = {
                    "createdAt": date_parse(cts) if cts else None,
                    "kind": ReactionKind.LIKE
                    if content_rating["value"] > 0
                    else ReactionKind.DISLIKE,
                    "createdBy": rater.id,
                    "shout": shout.id,
                }
                reaction = (
                    session.query(Reaction)
                    .filter(Reaction.shout == reaction_dict["shout"])
                    .filter(Reaction.createdBy == reaction_dict["createdBy"])
                    .filter(Reaction.kind == reaction_dict["kind"])
                    .first()
                )
                if reaction:
                    # same rating already exists: escalate to AGREE/DISAGREE
                    k = (
                        ReactionKind.AGREE
                        if content_rating["value"] > 0
                        else ReactionKind.DISAGREE
                    )
                    reaction_dict["kind"] = k
                    reaction.update(reaction_dict)
                    session.add(reaction)
                else:
                    rea = Reaction.create(**reaction_dict)
                    session.add(rea)
                # shout_dict['ratings'].append(reaction_dict)
            session.commit()
    except Exception:
        # NOTE(review): "content_rating" is unbound here if the exception
        # occurred before the first loop iteration -- this print can itself
        # raise; confirm before relying on it.
        print("[migration] content_item.ratings error: \n%r" % content_rating)

View File

@ -1,34 +0,0 @@
from services.db import local_session
from migration.extract import extract_md
from migration.html2text import html2text
from orm.reaction import Reaction, ReactionKind
def migrate(entry, storage):
    """Migrate one legacy remark into a REMARK Reaction on its shout.

    Returns the created reaction as a dict, or None when the remark's
    shout was not migrated.
    """
    post_oid = entry["contentItem"]
    print(post_oid)
    shout_dict = storage["shouts"]["by_oid"].get(post_oid)
    if shout_dict:
        print(shout_dict["body"])
        remark = {
            "shout": shout_dict["id"],
            "body": extract_md(html2text(entry["body"]), shout_dict),
            "kind": ReactionKind.REMARK,
        }
        if entry.get("textBefore"):
            # "range" is "<start>:<end>" character offsets into the shout body.
            # NOTE(review): str.index raises ValueError when the quoted text
            # is not found verbatim in the migrated body -- confirm inputs.
            remark["range"] = (
                str(shout_dict["body"].index(entry["textBefore"] or ""))
                + ":"
                + str(
                    shout_dict["body"].index(entry["textAfter"] or "")
                    + len(entry["textAfter"] or "")
                )
            )
        with local_session() as session:
            rmrk = Reaction.create(**remark)
            # NOTE(review): no session.add(rmrk) before commit -- presumably
            # Reaction.create registers the row itself; confirm. Also the
            # dict-style delete below assumes Reaction supports __delitem__.
            session.commit()
            del rmrk["_sa_instance_state"]
            return rmrk
    return

View File

@ -1,828 +0,0 @@
{
"207": "207",
"1990-e": "90s",
"2000-e": "2000s",
"90-e": "90s",
"Georgia": "georgia",
"Japan": "japan",
"Sweden": "sweden",
"abstraktsiya": "abstract",
"absurdism": "absurdism",
"acclimatization": "acclimatisation",
"activism": "activism",
"adolf-gitler": "adolf-hitler",
"afrika": "africa",
"agata-kristi": "agatha-christie",
"agressivnoe-povedenie": "agression",
"agressiya": "agression",
"aktsii": "actions",
"aktsionizm": "actionism",
"alber-kamyu": "albert-kamus",
"albomy": "albums",
"aleksandr-griboedov": "aleksander-griboedov",
"aleksandr-pushkin": "aleksander-pushkin",
"aleksandr-solzhenitsyn": "aleksander-solzhenitsyn",
"aleksandr-vvedenskiy": "aleksander-vvedensky",
"aleksey-navalnyy": "alexey-navalny",
"alfavit": "alphabet",
"alkogol": "alcohol",
"alternativa": "alternative",
"alternative": "alternative",
"alternativnaya-istoriya": "alternative-history",
"amerika": "america",
"anarhizm": "anarchism",
"anatoliy-mariengof": "anatoly-mariengof",
"ancient-russia": "ancient-russia",
"andegraund": "underground",
"andrey-platonov": "andrey-platonov",
"andrey-rodionov": "andrey-rodionov",
"andrey-tarkovskiy": "andrey-tarkovsky",
"angliyskie-istorii": "english-stories",
"angliyskiy-yazyk": "english-langugae",
"ango": "ango",
"animation": "animation",
"animatsiya": "animation",
"anime": "anime",
"anri-volohonskiy": "anri-volohonsky",
"antifashizm": "anti-faschism",
"antiquity": "antiquity",
"antiutopiya": "dystopia",
"anton-dolin": "anton-dolin",
"antropology": "antropology",
"antropotsen": "antropocenus",
"architecture": "architecture",
"arheologiya": "archeology",
"arhetipy": "archetypes",
"arhiv": "archive",
"aristokraty": "aristocracy",
"aristotel": "aristotle",
"arktika": "arctic",
"armiya": "army",
"armiya-1": "army",
"art": "art",
"art-is": "art-is",
"artists": "artists",
"ateizm": "atheism",
"audio-poetry": "audio-poetry",
"audiopoeziya": "audio-poetry",
"audiospektakl": "audio-spectacles",
"auktsyon": "auktsyon",
"avangard": "avantgarde",
"avtofikshn": "autofiction",
"avtorskaya-pesnya": "bardsongs",
"azbuka-immigratsii": "immigration-basics",
"aziatskiy-kinematograf": "asian-cinema",
"b-movie": "b-movie",
"bannye-chteniya": "sauna-reading",
"bardsongs": "bardsongs",
"bdsm": "bdsm",
"beecake": "beecake",
"belarus": "belarus",
"belgiya": "belgium",
"bertold-breht": "berttold-brecht",
"bezumie": "madness",
"biography": "biography",
"biologiya": "biology",
"bipolyarnoe-rasstroystvo": "bipolar-disorder",
"bitniki": "beatnics",
"biznes": "business",
"blizhniy-vostok": "middle-east",
"blizost": "closeness",
"blocked-in-russia": "blocked-in-russia",
"blokada": "blockade",
"bob-dilan": "bob-dylan",
"bog": "god",
"bol": "pain",
"bolotnoe-delo": "bolotnaya-case",
"books": "books",
"boris-eltsin": "boris-eltsin",
"boris-godunov": "boris-godunov",
"boris-grebenschikov": "boris-grebenschikov",
"boris-nemtsov": "boris-nemtsov",
"boris-pasternak": "boris-pasternak",
"brak": "marriage",
"bret-iston-ellis": "bret-iston-ellis",
"buddizm": "buddhism",
"bullying": "bullying",
"bunt": "riot",
"burning-man": "burning-man",
"bytie": "being",
"byurokratiya": "bureaucracy",
"capitalism": "capitalism",
"censored-in-russia": "censored-in-russia",
"ch-rno-beloe": "black-and-white",
"ch-rnyy-yumor": "black-humour",
"chapters": "chapters",
"charity": "charity",
"chayldfri": "childfree",
"chechenskaya-voyna": "chechen-war",
"chechnya": "chechnya",
"chelovek": "male",
"chernobyl": "chernobyl",
"chernyy-yumor": "black-humour",
"children": "children",
"china": "china",
"chinovniki": "bureaucracy",
"chukotka": "chukotka",
"chuma": "plague",
"church": "church",
"cinema": "cinema",
"city": "city",
"civil-position": "civil-position",
"clips": "clips",
"collage": "collage",
"comics": "comics",
"conspiracy-theory": "conspiracy-theory",
"contemporary-art": "contemporary-art",
"contemporary-poetry": "poetry",
"contemporary-prose": "prose",
"coronavirus": "coronavirus",
"corruption": "corruption",
"creative-writing-school": "creative-writing-school",
"crime": "crime",
"criticism": "criticism",
"critiques": "reviews",
"culture": "culture",
"dadaizm": "dadaism",
"daniel-defo": "daniel-defoe",
"daniil-harms": "daniil-kharms",
"dante-aligeri": "dante-alighieri",
"darkveyv": "darkwave",
"death": "death",
"debaty": "debats",
"delo-seti": "seti-case",
"democracy": "democracy",
"demografiya": "demographics",
"demonstrations": "demonstrations",
"depression": "depression",
"derevnya": "village",
"derrida": "derrida",
"design": "design",
"detskie-doma": "orphanages",
"detstvo": "childhood",
"devid-linch": "david-linch",
"devyanostye": "90s",
"dialog": "dialogue",
"digital": "digital",
"digital-art": "digital-art",
"dinozavry": "dinosaurs",
"directing": "directing",
"diskurs": "discours",
"diskurs-1": "discourse",
"diskurs-analiz": "discourse-analytics",
"dissidenty": "dissidents",
"diy": "diy",
"dmitriy-donskoy": "dmitriy-donskoy",
"dmitriy-prigov": "dmitriy-prigov",
"dnevnik-1": "dairy",
"dnevniki": "dairies",
"documentary": "documentary",
"dokumentalnaya-poema": "documentary-poem",
"dokumentalnaya-poeziya": "documentary-poetry",
"dokumenty": "doсuments",
"domashnee-nasilie": "home-terror",
"donald-tramp": "donald-trump",
"donbass": "donbass",
"donbass-diary": "donbass-diary",
"donorstvo": "donation",
"dozhd": "rain",
"drama": "drama",
"dramaturgy": "dramaturgy",
"drawing": "drawing",
"drevo-zhizni": "tree-of-life",
"drugs": "drugs",
"duh": "spirit",
"dzhaz": "jazz",
"dzhek-keruak": "jack-keruak",
"dzhim-morrison": "jim-morrison",
"dzhordzh-romero": "george-romero",
"dzhordzho-agamben": "giorgio-agamben",
"ecology": "ecology",
"economics": "economics",
"eda": "food",
"editorial-statements": "editorial-statements",
"eduard-limonov": "eduard-limonov",
"education": "education",
"egor-letov": "egor-letov",
"ekspat": "expat",
"eksperiment": "experiments",
"eksperimentalnaya-muzyka": "experimental-music",
"ekspressionizm": "expressionism",
"ekstremizm": "extremism",
"ekzistentsializm-1": "existentialism",
"ekzistentsiya": "existence",
"elections": "elections",
"electronic": "electronics",
"electronics": "electronics",
"elena-glinskaya": "elena-glinskaya",
"elena-guro": "elena-guro",
"elizaveta-mnatsakanova": "elizaveta-mnatsakanova",
"embient": "ambient",
"emigration": "emigration",
"emil-dyurkgeym": "emile-durkheim",
"emotsii": "emotions",
"empiric": "empiric",
"epidemiya": "pandemic",
"erich-von-neff": "erich-von-neff",
"erotika": "erotics",
"essay": "essay",
"estetika": "aestetics",
"etika": "ethics",
"etno": "ethno",
"etnos": "ethnics",
"everyday-life": "everyday-life",
"evgeniy-onegin": "eugene-onegin",
"evolyutsiya": "evolution",
"exhibitions": "exhibitions",
"experience": "experiences",
"experimental": "experimental",
"experimental-music": "experimental-music",
"explanation": "explanation",
"faktcheking": "fact-checking",
"falsifikatsii": "falsifications",
"family": "family",
"fanfiki": "fan-fiction",
"fantastika": "sci-fi",
"fatalizm": "fatalism",
"fedor-dostoevskiy": "fedor-dostoevsky",
"fedor-ioannovich": "fedor-ioannovich",
"feleton": "feuilleton",
"feminism": "feminism",
"fenomenologiya": "phenomenology",
"fentezi": "fantasy",
"festival": "festival",
"festival-territoriya": "festival-territory",
"folk": "folk",
"folklor": "folklore",
"fotoreportazh": "photoreports",
"france": "france",
"frants-kafka": "franz-kafka",
"frederik-begbeder": "frederick-begbeder",
"freedom": "freedom",
"friendship": "friendship",
"fsb": "fsb",
"futbol": "footbool",
"future": "future",
"futuristy": "futurists",
"futurizm": "futurism",
"galereya": "gallery",
"galereya-anna-nova": "gallery-anna-nova",
"gdr": "gdr",
"gender": "gender",
"gendernyy-diskurs": "gender",
"gennadiy-aygi": "gennadiy-aygi",
"gerhard-rihter": "gerhard-rihter",
"germaniya": "germany",
"germenevtika": "hermeneutics",
"geroi": "heroes",
"girls": "girls",
"gkchp": "gkchp",
"glitch": "glitch",
"globalizatsiya": "globalisation",
"gollivud": "hollywood",
"gonzo": "gonzo",
"gore-ot-uma": "woe-from-wit",
"graffiti": "graffiti",
"graficheskaya-novella": "graphic-novell",
"graphics": "graphics",
"gravyura": "engraving",
"grazhdanskaya-oborona": "grazhdanskaya-oborona",
"gretsiya": "greece",
"griby": "mushrooms",
"gruziya-2": "georgia",
"gulag": "gulag",
"han-batyy": "khan-batyy",
"hayku": "haiku",
"health": "health",
"himiya": "chemistry",
"hip-hop": "hip-hop",
"history": "history",
"history-of-russia": "history-of-russia",
"holokost": "holocaust",
"horeografiya": "choreography",
"horror": "horror",
"hospis": "hospice",
"hristianstvo": "christianity",
"humans": "humans",
"humour": "humour",
"ideologiya": "ideology",
"idm": "idm",
"igil": "isis",
"igor-pomerantsev": "igor-pomerantsev",
"igra": "game",
"igra-prestolov": "game-of-throne",
"igry": "games",
"iisus-hristos": "jesus-christ",
"illness": "illness",
"illustration-history": "illustration-history",
"illustrations": "illustrations",
"imazhinizm": "imagism",
"immanuil-kant": "immanuel-kant",
"impressionizm": "impressionism",
"improvizatsiya": "improvisation",
"indi": "indie",
"individualizm": "individualism",
"infografika": "infographics",
"informatsiya": "information",
"ingmar-bergman": "ingmar-bergman",
"inklyuziya": "inclusion",
"installyatsiya": "installation",
"internet": "internet",
"interview": "interview",
"invalidnost": "disability",
"investigations": "investigations",
"iosif-brodskiy": "joseph-brodsky",
"iosif-stalin": "joseph-stalin",
"iskusstvennyy-intellekt": "artificial-intelligence",
"islam": "islam",
"istoriya-moskvy": "moscow-history",
"istoriya-nauki": "history-of-sceince",
"istoriya-o-medsestre": "nurse-story",
"istoriya-teatra": "theatre-history",
"italiya": "italy",
"italyanskiy-yazyk": "italian-language",
"iudaika": "judaica",
"ivan-groznyy": "ivan-grozny",
"ivan-iii-gorbatyy": "ivan-iii-gorbaty",
"ivan-kalita": "ivan-kalita",
"ivan-krylov": "ivan-krylov",
"izobreteniya": "inventions",
"izrail-1": "israel",
"jazz": "jazz",
"john-lennon": "john-lennon",
"journalism": "journalism",
"justice": "justice",
"k-pop": "k-pop",
"kalligrafiya": "calligraphy",
"karikatura": "caricatures",
"kartochki-rubinshteyna": "rubinstein-cards",
"katrin-nenasheva": "katrin-nenasheva",
"kavarga": "kavarga",
"kavkaz": "caucasus",
"kazan": "kazan",
"kiberbezopasnost": "cybersecurity",
"kinoklub": "cinema-club",
"kinokritika": "film-criticism",
"kirill-serebrennikov": "kirill-serebrennikov",
"kladbische": "cemetery",
"klassika": "classic",
"kollektivnoe-bessoznatelnoe": "сollective-unconscious",
"komediya": "comedy",
"kommunikatsii": "communications",
"kommunizm": "communism",
"kommuny": "communes",
"kompyuternye-igry": "computer-games",
"konets-vesny": "end-of-spring",
"konservatizm": "conservatism",
"kontrkultura": "counter-culture",
"kontseptualizm": "conceptualism",
"korotkometrazhka": "cinema-shorts",
"kosmos": "cosmos",
"kraudfanding": "crowdfunding",
"kriptovalyuty": "cryptocurrencies",
"krizis": "crisis",
"krov": "blood",
"krym": "crimea",
"kulturologiya": "culturology",
"kulty": "cults",
"kurdistan": "kurdistan",
"kurt-kobeyn": "kurt-cobain",
"kurt-vonnegut": "kurt-vonnegut",
"kvir": "queer",
"laboratoriya": "lab",
"language": "languages",
"lars-fon-trier": "lars-fon-trier",
"laws": "laws",
"lectures": "lectures",
"leto": "summer",
"lev-tolstoy": "leo-tolstoy",
"lgbt": "lgbt",
"liberalizm": "liberalism",
"libertarianstvo": "libertarianism",
"life": "life",
"likbez": "likbez",
"lingvistika": "linguistics",
"lirika": "lirics",
"literary-studies": "literary-studies",
"literature": "literature",
"literaturnyykaver": "literature-cover",
"lo-fi": "lo-fi",
"lomonosov": "lomonosov",
"love": "love",
"luzha-goluboy-krovi": "luzha-goluboy-krovi",
"lyudvig-vitgenshteyn": "ludwig-wittgenstein",
"lzhedmitriy": "false-dmitry",
"lzhenauka": "pseudoscience",
"magiya": "magic",
"maks-veber": "max-weber",
"manifests": "manifests",
"manipulyatsii-soznaniem": "mind-manipulation",
"marina-abramovich": "marina-abramovich",
"marketing": "marketing",
"marksizm": "marxism",
"marsel-dyushan": "marchel-duchamp",
"marsel-prust": "marcel-proust",
"martin-haydegger": "martin-hidegger",
"matematika": "maths",
"mayakovskiy": "vladimir-mayakovsky",
"media": "media",
"medicine": "medicine",
"memuary": "memoirs",
"menedzhment": "management",
"menty": "police",
"merab-mamardashvili": "merab-mamardashvili",
"mest": "revenge",
"metamodernizm": "metamodern",
"metavselennaya": "metaverse",
"metro": "metro",
"mifologiya": "mythology",
"mify": "myth",
"mihael-haneke": "michael-haneke",
"mihail-baryshnikov": "mihail-baryshnikov",
"mihail-bulgakov": "mihail-bulgakov",
"mikrotonalnaya-muzyka": "mikrotone-muzyka",
"minimalizm": "minimalism",
"minkult-privet": "minkult-privet",
"mir": "world",
"mirovozzrenie": "mindsets",
"mishel-fuko": "michel-foucault",
"mistika": "mystics",
"mitropolit-makariy": "mitropolit-makariy",
"mlm": "mlm",
"mobilizatsiya": "mobilisation",
"moda": "fashion",
"modernizm": "modernism",
"mokyumentari": "mockumentary",
"molodezh": "youth",
"moloko-plus": "moloko-plus",
"money": "money",
"monologs": "monologues",
"monstratsiya": "monstration",
"moralnaya-otvetstvennost": "moral-responsibility",
"more": "sea",
"moscow": "moscow",
"moshennichestvo": "frauds",
"moskovskiy-romanticheskiy-kontseptualizm": "moscow-romantic-conceptualism",
"moskovskoe-delo": "moscow-case",
"movies": "movies",
"mozg": "brain",
"multiplikatsiya": "animation",
"music": "music",
"musulmanstvo": "islam",
"muzei": "museum",
"muzey": "museum",
"muzhchiny": "man",
"myshlenie": "thinking",
"nagornyy-karabah": "nagorno-karabakh",
"nasilie-1": "violence",
"natsionalizm": "nationalism",
"natsionalnaya-ideya": "national-idea",
"natsizm": "nazism",
"natyurmort": "nature-morte",
"nauchpop": "pop-science",
"nbp": "nbp",
"nenavist": "hate",
"neofitsialnaya-literatura": "unofficial-literature",
"neoklassika": "neoclassic",
"neprozrachnye-smysly": "hidden-meanings",
"neravenstvo": "inequality",
"net-voyne": "no-war",
"new-year": "new-year",
"neyronauka": "neuro-science",
"neyroseti": "neural-networks",
"niu-vshe": "hse",
"nizhniy-novgorod": "nizhny-novgorod",
"nko": "nonprofits",
"nlo": "ufo",
"nobelevskaya-premiya": "nobel-prize",
"noize-mc": "noize-mc",
"nonkonformizm": "nonconformism",
"notforall": "notforall",
"novaya-drama": "new-drama",
"novosti": "news",
"noyz": "noise",
"nuar": "noir",
"oberiu": "oberiu",
"ocherk": "etudes",
"ochevidnyy-nuar": "ochevidnyy-nuar",
"odinochestvo": "loneliness",
"odna-kniga-odna-istoriya": "one-book-one-story",
"okrainy": "outskirts",
"omon": "swat",
"opinions": "opinions",
"oppozitsiya": "opposition",
"orhan-pamuk": "orhan-pamuk",
"ornitologiya": "ornitology",
"osen": "autumn",
"osip-mandelshtam": "osip-mandelshtam",
"oskar-uayld": "oscar-wilde",
"osoznanie": "awareness",
"otnosheniya": "relationship",
"pablo-pikasso": "pablo-picasso",
"painting": "painting",
"paintings": "painting",
"pamyat": "memory",
"pandemiya": "pandemic",
"parizh": "paris",
"patriotizm": "patriotism",
"patsifizm": "pacifism",
"paul-tselan": "paul-tselan",
"per-burd": "pierre-bourdieu",
"perezhivaniya": "worries",
"performance": "performance",
"peyzazh": "landscape",
"philology": "philology",
"philosophy": "philosophy",
"photo": "photography",
"photography": "photography",
"photoprojects": "photoprojects",
"plakaty": "posters",
"plastilin": "plasticine",
"plays": "plays",
"podrostki": "teenagers",
"poema": "poem",
"poems": "poems",
"poeticheskaya-proza": "poetic-prose",
"poetry": "poetry",
"poetry-of-squares": "poetry-of-squares",
"poetry-slam": "poetry-slam",
"pokoy": "peace",
"police": "police",
"politicheskoe-fentezi": "political-fantasy",
"politics": "politics",
"politzaklyuchennye": "political-prisoners",
"polsha": "poland",
"pomosch": "help",
"pop-art": "pop-art",
"pop-culture": "pop-culture",
"populyarnaya-psihologiya": "popular-psychology",
"pornografiya": "pornography",
"portret": "portrait",
"poslovitsy": "proverbs",
"post-pank": "post-punk",
"post-rok": "post-rock",
"postmodernism": "postmodernism",
"povest": "novells",
"povsednevnost": "everyday-life",
"power": "power",
"pravo": "right",
"pravoslavie": "orthodox",
"pravozaschitniki": "human-rights-activism",
"prazdnik": "holidays",
"predatelstvo": "betrayal",
"predprinimatelstvo": "entrepreneurship",
"premera": "premier",
"premiya-oskar": "oscar-prize",
"pribaltika-1": "baltic",
"priroda": "nature",
"prison": "prison",
"pritcha": "parable",
"privatnost": "privacy",
"progress": "progress",
"projects": "projects",
"prokrastinatsiya": "procrastination",
"propaganda": "propaganda",
"proschenie": "forgiveness",
"prose": "prose",
"proshloe": "past",
"prostitutsiya": "prostitution",
"prosveschenie": "enlightenment",
"protests": "protests",
"psalmy": "psalms",
"psihoanaliz": "psychoanalysis",
"psihodeliki": "psychodelics",
"pskov": "pskov",
"psychiatry": "psychiatry",
"psychology": "psychology",
"ptitsy": "birds",
"punk": "punk",
"r-b": "rnb",
"rasizm": "racism",
"realizm": "realism",
"redaktura": "editing",
"refleksiya": "reflection",
"reggi": "reggae",
"religion": "religion",
"rene-zhirar": "rene-girard",
"renesanss": "renessance",
"renovatsiya": "renovation",
"rep": "rap",
"reportage": "reportage",
"reportazh-1": "reportage",
"repressions": "repressions",
"research": "research",
"retroveyv": "retrowave",
"review": "review",
"revolution": "revolution",
"rezo-gabriadze": "rezo-gabriadze",
"risunki": "painting",
"roboty": "robots",
"rock": "rock",
"roditeli": "parents",
"romantizm": "romantism",
"romany": "novell",
"ronald-reygan": "ronald-reygan",
"roskomnadzor": "roskomnadzor",
"rossiyskoe-kino": "russian-cinema",
"rouling": "rowling",
"rozhava": "rojava",
"rpts": "rpts",
"rus-na-grani-sryva": "rus-na-grani-sryva",
"russia": "russia",
"russian-language": "russian-language",
"russian-literature": "russian-literature",
"russkaya-toska": "russian-toska",
"russkiy-mir": "russkiy-mir",
"salo": "lard",
"salvador-dali": "salvador-dali",
"samoidentifikatsiya": "self-identity",
"samoopredelenie": "self-definition",
"sankt-peterburg": "saint-petersburg",
"sasha-skochilenko": "sasha-skochilenko",
"satira": "satiric",
"saund-art": "sound-art",
"schaste": "happiness",
"school": "school",
"science": "science",
"sculpture": "sculpture",
"second-world-war": "second-world-war",
"sekond-hend": "second-hand",
"seksprosvet": "sex-education",
"seksualizirovannoe-nasilie": "sexualized-violence",
"seksualnoe-nasilie": "sexualized-violence",
"sekty": "sects",
"semi": "semi",
"semiotics": "semiotics",
"serbiya": "serbia",
"sergey-bodrov-mladshiy": "sergey-bodrov-junior",
"sergey-solov-v": "sergey-solovyov",
"serialy": "series",
"sever": "north",
"severnaya-koreya": "north-korea",
"sex": "sex",
"shotlandiya": "scotland",
"shugeyz": "shoegaze",
"siloviki": "siloviki",
"simeon-bekbulatovich": "simeon-bekbulatovich",
"simvolizm": "simbolism",
"siriya": "siria",
"skulptura": "sculpture",
"slavoy-zhizhek": "slavoj-zizek",
"smert-1": "death",
"smysl": "meaning",
"sny": "dreams",
"sobytiya": "events",
"social": "society",
"society": "society",
"sociology": "sociology",
"sofya-paleolog": "sofya-paleolog",
"sofya-vitovtovna": "sofya-vitovtovna",
"soobschestva": "communities",
"soprotivlenie": "resistence",
"sotsializm": "socialism",
"sotsialnaya-filosofiya": "social-philosophy",
"sotsiologiya-1": "sociology",
"sotsseti": "social-networks",
"sotvorenie-tretego-rima": "third-rome",
"sovremennost": "modernity",
"spaces": "spaces",
"spektakl": "spectacles",
"spetseffekty": "special-fx",
"spetsoperatsiya": "special-operation",
"spetssluzhby": "special-services",
"sport": "sport",
"srednevekove": "middle-age",
"state": "state",
"statistika": "statistics",
"stendap": "stand-up",
"stihi": "poetry",
"stoitsizm": "stoicism",
"stories": "stories",
"stoyanie-na-ugre": "stoyanie-na-ugre",
"strah": "fear",
"street-art": "street-art",
"stsenarii": "scenarios",
"sud": "court",
"summary": "summary",
"supergeroi": "superheroes",
"svetlana-aleksievich": "svetlana-aleksievich",
"svobodu-ivanu-golunovu": "free-ivan-golunov",
"syurrealizm": "surrealism",
"tales": "tales",
"tanets": "dance",
"tataro-mongolskoe-igo": "mongol-tatar-yoke",
"tatuirovki": "tattoo",
"technology": "technology",
"televidenie": "television",
"telo": "body",
"telo-kak-iskusstvo": "body-as-art",
"terrorizm": "terrorism",
"tests": "tests",
"text": "texts",
"the-beatles": "the-beatles",
"theater": "theater",
"theory": "theory",
"tokio": "tokio",
"torture": "torture",
"totalitarizm": "totalitarism",
"traditions": "traditions",
"tragicomedy": "tragicomedy",
"transgendernost": "transgender",
"translation": "translation",
"transport": "transport",
"travel": "travel",
"travma": "trauma",
"trendy": "trends",
"tretiy-reyh": "third-reich",
"triller": "thriller",
"tsar": "central-african-republic",
"tsar-edip": "oedipus",
"tsarevich-dmitriy": "tsarevich-dmitry",
"tsennosti": "values",
"tsenzura": "censorship",
"tseremonii": "ceremonies",
"turizm": "tourism",
"tvorchestvo": "creativity",
"ugnetennyy-zhilischnyy-klass": "oppressed-housing-class",
"uilyam-shekspir": "william-shakespeare",
"ukraina-2": "ukraine",
"ukraine": "ukraine",
"university": "university",
"urban-studies": "urban-studies",
"uroki-literatury": "literature-lessons",
"usa": "usa",
"ussr": "ussr",
"utopiya": "utopia",
"utrata": "loss",
"valter-benyamin": "valter-benyamin",
"varlam-shalamov": "varlam-shalamov",
"vasiliy-ii-temnyy": "basil-ii-temnyy",
"vasiliy-iii": "basil-iii",
"vdnh": "vdnh",
"vechnost": "ethernety",
"velikobritaniya": "great-britain",
"velimir-hlebnikov": "velimir-hlebnikov",
"velkom-tu-greyt-britn": "welcome-to-great-britain",
"venedikt-erofeev": "venedikt-erofeev",
"venetsiya": "veneece",
"vengriya": "hungary",
"verlibry": "free-verse",
"veschi": "things",
"vessels": "vessels",
"veterany": "veterans",
"video": "video",
"videoart": "videoart",
"videoklip": "clips",
"videopoeziya": "video-poetry",
"viktor-astafev": "viktor-astafev",
"viktor-pelevin": "viktor-pelevin",
"vilgelm-rayh": "wilhelm-reich",
"vinzavod": "vinzavod",
"violence": "violence",
"visual-culture": "visual-culture",
"vizualnaya-poeziya": "visual-poetry",
"vladimir-lenin": "vladimir-lenin",
"vladimir-mayakovskiy": "vladimir-mayakovsky",
"vladimir-nabokov": "vladimir-nabokov",
"vladimir-putin": "vladimir-putin",
"vladimir-sorokin": "vladimir-sorokin",
"vladimir-voynovich": "vladimir-voynovich",
"vnutrenniy-opyt": "inner-expirience",
"volga": "volga",
"volontery": "volonteurs",
"vong-karvay": "wong-karwai",
"vospominaniya": "memories",
"vostok": "east",
"voyna-na-ukraine": "war-in-ukraine",
"voyna-v-ukraine": "war-in-ukraine",
"vremya": "time",
"vudi-allen": "woody-allen",
"vynuzhdennye-otnosheniya": "forced-relationship",
"war": "war",
"war-in-ukraine-images": "war-in-ukrahine-images",
"women": "women",
"work": "work",
"writers": "writers",
"xx-century": "xx-century",
"yakob-yordans": "yakob-yordans",
"yan-vermeer": "yan-vermeer",
"yanka-dyagileva": "yanka-dyagileva",
"yaponskaya-literatura": "japan-literature",
"yazychestvo": "paganism",
"youth": "youth",
"yozef-rot": "yozef-rot",
"yurgen-habermas": "jorgen-habermas",
"za-liniey-mannergeyma": "behind-mannerheim-line",
"zabota": "care",
"zahar-prilepin": "zahar-prilepin",
"zakonodatelstvo": "laws",
"zakony-mira": "world-laws",
"zametki": "notes",
"zhelanie": "wish",
"zhivotnye": "animals",
"zhoze-saramago": "jose-saramago",
"zigmund-freyd": "sigmund-freud",
"zolotaya-orda": "golden-horde",
"zombi": "zombie",
"zombi-simpsony": "zombie-simpsons"
}

View File

@ -1,32 +0,0 @@
from services.db import local_session
from migration.extract import extract_md
from migration.html2text import html2text
from orm import Topic
def migrate(entry):
    """Create or refresh a Topic row from a legacy category/tag document.

    Returns the resulting topic as a plain dict (without SQLAlchemy state).
    """
    description_html = entry.get("description", "").replace("&nbsp;", " ")
    topic_dict = {
        "slug": entry["slug"],
        "oid": entry["_id"],
        "title": entry["title"].replace("&nbsp;", " "),
        "body": extract_md(html2text(description_html)),
    }
    with local_session() as session:
        topic_slug = topic_dict["slug"]
        existing = session.query(Topic).filter(Topic.slug == topic_slug).first()
        topic = existing or Topic.create(**topic_dict)
        if not topic:
            raise Exception("no topic!")
        if topic:
            # NOTE(review): title is replaced when the existing one is
            # LONGER, while body is replaced when the existing one is
            # SHORTER — confirm the title condition is intended.
            if len(topic.title) > len(topic_dict["title"]):
                Topic.update(topic, {"title": topic_dict["title"]})
            if len(topic.body) < len(topic_dict["body"]):
                Topic.update(topic, {"body": topic_dict["body"]})
            session.commit()
        # print(topic.__dict__)
        plain = topic.__dict__.copy()
        del plain["_sa_instance_state"]
        return plain

View File

@ -1,167 +0,0 @@
import re
from bs4 import BeautifulSoup
from dateutil.parser import parse
from sqlalchemy.exc import IntegrityError
from services.db import local_session
from orm.user import AuthorFollower, User, UserRating
def migrate(entry):
    """Migrate a legacy user document into a User row.

    Builds a user dict from the legacy entry (slug, bio, userpic, name,
    social links), creates the User, and on a slug collision updates the
    existing row instead.  Returns the migrated user dict with the new
    ``id`` added.

    :param entry: legacy user document (dict)
    :raises Exception: when the user can neither be created nor found
    """
    if "subscribedTo" in entry:
        del entry["subscribedTo"]
    email = entry["emails"][0]["address"]
    user_dict = {
        "oid": entry["_id"],
        "roles": [],
        "ratings": [],
        "username": email,
        "email": email,
        "createdAt": parse(entry["createdAt"]),
        "emailConfirmed": ("@discours.io" in email)
        or bool(entry["emails"][0]["verified"]),
        "muted": False,  # amnesty
        "bio": entry["profile"].get("bio", ""),
        "links": [],
        "name": "anonymous",
        "password": entry["services"]["password"].get("bcrypt"),
    }
    if "updatedAt" in entry:
        user_dict["updatedAt"] = parse(entry["updatedAt"])
    # FIX: the key checked here used to be the misspelled "wasOnineAt",
    # so the branch either never fired or raised KeyError on the read below.
    if "wasOnlineAt" in entry:
        user_dict["lastSeen"] = parse(entry["wasOnlineAt"])
    if entry.get("profile"):
        # slug: normalized from the legacy profile path
        slug = entry["profile"].get("path").lower()
        slug = re.sub(r"[^0-9a-zA-Z]+", "-", slug).strip()
        user_dict["slug"] = slug
        # bio: long bios go to "about", short ones stay in "bio"
        # (raw strings: the legacy data contains literal "\(" / "\)")
        bio = (
            (entry.get("profile", {"bio": ""}).get("bio") or "")
            .replace(r"\(", "(")
            .replace(r"\)", ")")
        )
        bio_text = BeautifulSoup(bio, features="lxml").text
        if len(bio_text) > 120:
            user_dict["about"] = bio_text
        else:
            user_dict["bio"] = bio_text
        # userpic: prefer the thumbor asset, fall back to the raw image URL
        try:
            user_dict["userpic"] = (
                "https://assets.discours.io/unsafe/100x/"
                + entry["profile"]["thumborId"]
            )
        except KeyError:
            try:
                user_dict["userpic"] = entry["profile"]["image"]["url"]
            except KeyError:
                user_dict["userpic"] = ""
        # name: "first last", falling back to slug, then profile path
        fn = entry["profile"].get("firstName", "")
        ln = entry["profile"].get("lastName", "")
        name = fn if fn else ""
        name = (name + " " + ln) if ln else name
        if not name:
            name = slug if slug else "anonymous"
        name = (
            entry["profile"]["path"].lower().strip().replace(" ", "-")
            if len(name) < 2
            else name
        )
        user_dict["name"] = name
        # links: collect whichever social profiles are present
        fb = entry["profile"].get("facebook", False)
        if fb:
            user_dict["links"].append(fb)
        vk = entry["profile"].get("vkontakte", False)
        if vk:
            user_dict["links"].append(vk)
        tr = entry["profile"].get("twitter", False)
        if tr:
            user_dict["links"].append(tr)
        ws = entry["profile"].get("website", False)
        if ws:
            user_dict["links"].append(ws)
    # some checks
    # FIX: use .get() — the "slug" key is absent when the entry has no
    # profile, and the plain subscript raised KeyError here.
    if not user_dict.get("slug") and len(user_dict["links"]) > 0:
        user_dict["slug"] = user_dict["links"][0].split("/")[-1]
    user_dict["slug"] = user_dict.get("slug", user_dict["email"].split("@")[0])
    oid = user_dict["oid"]
    user_dict["slug"] = user_dict["slug"].lower().strip().replace(" ", "-")
    try:
        user = User.create(**user_dict.copy())
    except IntegrityError:
        print("[migration] cannot create user " + user_dict["slug"])
        # slug collision: update the existing row in place
        with local_session() as session:
            old_user = (
                session.query(User).filter(User.slug == user_dict["slug"]).first()
            )
            old_user.oid = oid
            old_user.password = user_dict["password"]
            session.commit()
            user = old_user
            if not user:
                print("[migration] ERROR: cannot find user " + user_dict["slug"])
                raise Exception
    user_dict["id"] = user.id
    return user_dict
def post_migrate():
    """Create the service account that owns view counters from the old site."""
    legacy_account = {
        "slug": "old-discours",
        "username": "old-discours",
        "email": "old@discours.io",
        "name": "Просмотры на старой версии сайта",
    }
    with local_session() as session:
        session.add(User.create(**legacy_account))
        session.commit()
def migrate_2stage(entry, id_map):
    """Second pass: attach user ratings (and auto-follows) once all users exist.

    :param entry: legacy user document with a "ratings" list
    :param id_map: mapping of legacy oid -> new slug for all migrated users
    :return: number of ratings whose rater could not be resolved
    """
    missing_raters = 0
    author_oid = entry["_id"]
    for rating in entry.get("ratings", []):
        rater_slug = id_map.get(rating["createdBy"])
        if not rater_slug:
            missing_raters += 1
            # print(rating)
            continue
        author_slug = id_map.get(author_oid)
        with local_session() as session:
            try:
                rater = session.query(User).where(User.slug == rater_slug).one()
                rated = session.query(User).where(User.slug == author_slug).one()
                rating_values = {
                    "value": rating["value"],
                    "rater": rater.id,
                    "user": rated.id,
                }
                rating_row = UserRating.create(**rating_values)
                if rating_values["value"] > 0:
                    # a positive rating also creates an automatic follow
                    follow = AuthorFollower.create(
                        author=rated.id, follower=rater.id, auto=True
                    )
                    session.add(follow)
                session.add(rating_row)
                session.commit()
            except IntegrityError:
                print("[migration] cannot rate " + author_slug + "`s by " + rater_slug)
            except Exception as e:
                print(e)
    return missing_raters

View File

@ -1,10 +0,0 @@
from datetime import datetime
from json import JSONEncoder
class DateTimeEncoder(JSONEncoder):
    """JSON encoder that serialises datetime objects via ``str()``."""

    def default(self, z):
        """Stringify datetimes; defer every other type to JSONEncoder."""
        if not isinstance(z, datetime):
            return super().default(z)
        return str(z)

View File

@ -6,7 +6,7 @@ pyjwt>=2.6.0
git+https://github.com/encode/starlette.git#main git+https://github.com/encode/starlette.git#main
sqlalchemy>=1.4.41 sqlalchemy>=1.4.41
graphql-core>=3.0.3 graphql-core>=3.0.3
gql~=3.4.0 gql[httpx]
uvicorn>=0.18.3 uvicorn>=0.18.3
pydantic>=1.10.2 pydantic>=1.10.2
passlib~=1.7.4 passlib~=1.7.4
@ -14,14 +14,11 @@ itsdangerous
authlib>=1.1.0 authlib>=1.1.0
httpx>=0.23.0 httpx>=0.23.0
psycopg2-binary psycopg2-binary
transliterate~=1.10.2
bcrypt>=4.0.0 bcrypt>=4.0.0
websockets websockets
bson~=0.5.10
flake8 flake8
DateTime~=4.7 DateTime~=4.7
python-dateutil~=2.8.2 python-dateutil~=2.8.2
beautifulsoup4~=4.11.1
lxml lxml
sentry-sdk>=1.14.0 sentry-sdk>=1.14.0
boto3~=1.28.2 boto3~=1.28.2

View File

@ -2,18 +2,16 @@ import asyncio
import time import time
from datetime import timedelta, timezone, datetime from datetime import timedelta, timezone, datetime
from os import environ, path from os import environ, path
from ssl import create_default_context
from gql import Client, gql from gql import Client, gql
from gql.transport.aiohttp import AIOHTTPTransport from gql.transport.httpx import HTTPXAsyncTransport
from services.db import local_session from services.db import local_session
from orm import Topic from orm import Topic
from orm.shout import ShoutTopic, Shout from orm.shout import ShoutTopic, Shout
load_facts = gql( load_facts = gql(
""" """ query getDomains {
query getDomains {
domains { domains {
id id
title title
@ -23,14 +21,11 @@ query getDomains {
viewsMonth viewsMonth
viewsYear viewsYear
} }
} } } """
}
"""
) )
load_pages = gql( load_pages = gql(
""" """ query getDomains {
query getDomains {
domains { domains {
title title
statistics { statistics {
@ -41,10 +36,9 @@ query getDomains {
value value
} }
} }
} } } """
}
"""
) )
schema_str = open(path.dirname(__file__) + "/ackee.graphql").read() schema_str = open(path.dirname(__file__) + "/ackee.graphql").read()
token = environ.get("ACKEE_TOKEN", "") token = environ.get("ACKEE_TOKEN", "")
@ -52,9 +46,8 @@ token = environ.get("ACKEE_TOKEN", "")
def create_client(headers=None, schema=None): def create_client(headers=None, schema=None):
return Client( return Client(
schema=schema, schema=schema,
transport=AIOHTTPTransport( transport=HTTPXAsyncTransport(
url="https://ackee.discours.io/api", url="https://ackee.discours.io/api",
ssl=create_default_context(),
headers=headers, headers=headers,
), ),
) )