core/migration/__init__.py

279 lines
10 KiB
Python
Raw Normal View History

2022-09-03 10:50:14 +00:00
""" cmd managed migration """
2022-11-23 14:09:35 +00:00
from datetime import datetime, timezone
from migration.export import export_mdx
from migration.tables.comments import migrate as migrateComment
from migration.tables.comments import migrate_2stage as migrateComment_2stage
2022-11-10 05:40:32 +00:00
from migration.tables.content_items import get_shout_slug
from migration.tables.content_items import migrate as migrateShout
2023-10-26 20:38:31 +00:00
# from migration.tables.remarks import migrate as migrateRemark
from migration.tables.topics import migrate as migrateTopic
2023-10-26 17:56:42 +00:00
from migration.tables.users import migrate as migrateUser
from migration.tables.users import migrate_2stage as migrateUser_2stage
2023-10-26 17:56:42 +00:00
from migration.tables.users import post_migrate as users_post_migrate
2022-11-19 11:35:34 +00:00
from orm import init_tables
from orm.reaction import Reaction
2022-08-11 09:59:35 +00:00
2023-10-26 20:38:31 +00:00
import asyncio
import bs4
import gc
import json
import sys
2022-11-23 14:09:35 +00:00
# UTC date of this migration run, formatted as YYYYMMDD.
TODAY = datetime.now(tz=timezone.utc).strftime("%Y%m%d")

# Fixed timestamp string; nothing in the visible part of this file reads it.
OLD_DATE = "2016-03-05 22:22:00.350000"
2022-08-11 09:14:12 +00:00
async def users_handle(storage):
    """Migrate all users, then resolve their cross-references.

    The first pass converts every raw entry of ``storage["users"]["data"]``
    with ``migrateUser`` and indexes the result in
    ``storage["users"]["by_oid"]`` (keyed by the raw ``_id``) and in
    ``storage["users"]["by_slug"]``.  The second pass hands every raw entry to
    ``migrateUser_2stage`` together with the oid -> slug map collected in the
    first pass.  ``users_post_migrate`` runs last.

    :param storage: shared migration storage; its ``users`` section is
        updated in place.
    """
    users = storage["users"]
    raw_users = users["data"]
    id_map = {}
    print("[migration] migrating %d users" % (len(raw_users)))

    for entry in raw_users:
        oid = entry["_id"]
        user = migrateUser(entry)
        users["by_oid"][oid] = user
        # NOTE(review): the original comments call the by_oid record "full"
        # and the by_slug record "public", but both indexes hold the very same
        # dict, so the private fields are stripped from both. Kept as is;
        # confirm whether by_oid was meant to keep an unstripped copy.
        for private_key in ("password", "emailConfirmed", "username", "email"):
            # pop() tolerates records that never had the field.
            user.pop(private_key, None)
        users["by_slug"][user["slug"]] = user
        id_map[user["oid"]] = user["slug"]

    # Second stage needs the complete oid -> slug map, hence a separate loop.
    for entry in raw_users:
        migrateUser_2stage(entry, id_map)

    users_post_migrate()
2022-08-11 09:14:12 +00:00
async def topics_handle(storage):
    """Build topics out of tags and categories.

    Only items whose slug is a key of ``storage["replacements"]`` are
    migrated; their slug is rewritten to the replacement first.  Afterwards
    every replaced old slug still present in the slug index is dropped and its
    oid is pointed at the topic stored under the new slug.

    :param storage: shared migration storage; ``topics`` is updated in place.
    """
    topics = storage["topics"]
    replacements = storage["replacements"]
    by_oid = topics["by_oid"]
    by_slug = topics["by_slug"]
    migrated = 0

    for item in topics["tags"] + topics["cats"]:
        if item["slug"] not in replacements:
            print("[migration] topic " + item["slug"] + " ignored")
            continue
        item["slug"] = replacements[item["slug"]]
        topic = migrateTopic(item)
        by_oid[item["_id"]] = topic
        by_slug[item["slug"]] = topic
        migrated += 1

    for oldslug, newslug in replacements.items():
        if oldslug == newslug or oldslug not in by_slug:
            continue
        # Re-point the old record's oid at the topic kept under the new slug.
        oid = by_slug[oldslug]["_id"]
        del by_slug[oldslug]
        by_oid[oid] = by_slug[newslug]

    print(f"[migration] {migrated} topics migrated")
    print(f"[migration] {len(by_oid)} topics by oid")
    print(f"[migration] {len(by_slug)} topics by slug")
2022-08-11 09:14:12 +00:00
2022-08-18 06:12:46 +00:00
async def shouts_handle(storage, args):
    """Migrate content items ("shouts") one by one.

    Every entry of ``storage["shouts"]["data"]`` is passed to ``migrateShout``.
    A truthy result is indexed in ``storage["shouts"]["by_oid"]`` (by the raw
    ``_id``) and in ``storage["shouts"]["by_slug"]``; a falsy result is counted
    as ignored.  Summary counters are printed at the end.

    :param storage: shared migration storage, updated in place.
    :param args: command-line style arguments.  If ``"-"`` is among them, only
        entries whose slug is also among the arguments are migrated; if
        ``"mdx"`` is among them, published shouts are passed to ``export_mdx``.
    """
    counter = 0
    discours_author = 0
    anonymous_author = 0
    pub_counter = 0
    ignored = 0
    topics_dataset_bodies = []
    topics_dataset_tlist = []

    for entry in storage["shouts"]["data"]:
        # A full collection is forced on every iteration, as in the original.
        gc.collect()

        slug = get_shout_slug(entry)

        # Single-slug mode: skip everything that was not asked for.
        if "-" in args and slug not in args:
            continue

        shout_dict = await migrateShout(entry, storage)
        if not shout_dict:
            ignored += 1
            continue

        storage["shouts"]["by_oid"][entry["_id"]] = shout_dict
        storage["shouts"]["by_slug"][shout_dict["slug"]] = shout_dict

        if not shout_dict["topics"]:
            print("[migration] no topics!")

        # Only the first author is looked at for the statistics.
        author = shout_dict["authors"][0]
        if author["slug"] == "discours":
            discours_author += 1
        if author["slug"] == "anonymous":
            anonymous_author += 1

        if entry.get("published"):
            if "mdx" in args:
                export_mdx(shout_dict)
            pub_counter += 1

        # counter is incremented first, so it already is the 1-based index of
        # this shout; the original printed counter + 1 and was off by one.
        counter += 1
        print(
            "[migration] shouts_handle %d: %s @%s"
            % (counter, shout_dict["slug"], author["slug"])
        )

        b = bs4.BeautifulSoup(shout_dict["body"], "html.parser")
        # NOTE(review): str.replace() takes a literal substring, not a regex,
        # so this pattern-looking argument only matches that exact text. Left
        # unchanged because the collected dataset is not consumed below.
        texts = [shout_dict["title"].lower().replace(r"[^а-яА-Яa-zA-Z]", "")]
        texts = texts + b.findAll(text=True)
        topics_dataset_bodies.append(" ".join([x.strip().lower() for x in texts]))
        topics_dataset_tlist.append(shout_dict["topics"])

    # The topics dataset collected above is not written anywhere in this
    # function (the former np.savetxt export is disabled).
    print("[migration] " + str(counter) + " content items were migrated")
    print("[migration] " + str(pub_counter) + " have been published")
    print("[migration] " + str(discours_author) + " authored by @discours")
    print("[migration] " + str(anonymous_author) + " authored by @anonymous")
    print("[migration] " + str(ignored) + " content items were ignored")
2022-08-11 09:14:12 +00:00
2023-10-26 20:38:31 +00:00
# async def remarks_handle(storage):
# print("[migration] comments")
# c = 0
# for entry_remark in storage["remarks"]["data"]:
# remark = await migrateRemark(entry_remark, storage)
# c += 1
# print("[migration] " + str(c) + " remarks migrated")
2023-01-17 06:19:12 +00:00
2022-08-18 06:12:46 +00:00
async def comments_handle(storage):
    """Migrate comments into reactions and report what happened.

    Every non-deleted entry of ``storage["reactions"]["data"]`` is passed to
    ``migrateComment``.  A ``Reaction`` result contributes an oid -> id pair to
    the map that is later given to ``migrateComment_2stage`` for every raw
    entry.  A string result is collected as a missed shout (presumably it
    identifies the shout that could not be found - confirm against
    ``migrateComment``).  Any other result counts as ignored.

    :param storage: shared migration storage.
    """
    print("[migration] comments")
    id_map = {}
    ignored_counter = 0
    # missed shout key -> list of the comments that pointed at it
    missed_shouts = {}

    for oldcomment in storage["reactions"]["data"]:
        if oldcomment.get("deleted"):
            continue
        reaction = await migrateComment(oldcomment, storage)
        if isinstance(reaction, str):
            # Collect every comment of a missed shout; the original kept only
            # the last one, which made the "dropped" total meaningless.
            missed_shouts.setdefault(reaction, []).append(oldcomment)
        elif isinstance(reaction, Reaction):
            reaction_dict = reaction.dict()
            id_map[reaction_dict["oid"]] = reaction_dict["id"]
        else:
            ignored_counter += 1

    # Second stage runs over all raw entries, deleted ones included.
    for reaction in storage["reactions"]["data"]:
        migrateComment_2stage(reaction, id_map)

    missed_counter = sum(len(comments) for comments in missed_shouts.values())
    print("[migration] " + str(len(id_map)) + " comments migrated")
    print("[migration] " + str(ignored_counter) + " comments ignored")
    print("[migration] " + str(len(missed_shouts)) + " commented shouts missed")
    print("[migration] " + str(missed_counter) + " comments dropped")
2022-08-11 09:14:12 +00:00
2022-08-18 06:12:46 +00:00
async def all_handle(storage, args):
    """Run every migration stage in order: users, topics, shouts, comments.

    :param storage: shared migration storage handed to each stage.
    :param args: command-line style arguments, forwarded to ``shouts_handle``.
    """
    print("[migration] handle everything")

    # Users and topics only need the storage and must be done before shouts.
    for stage in (users_handle, topics_handle):
        await stage(storage)
    print("[migration] users and topics are migrated")

    await shouts_handle(storage, args)

    # The remarks stage is currently disabled.
    print("[migration] migrating comments")
    await comments_handle(storage)

    # The e-mail subscriptions export is currently disabled.
    print("[migration] done!")
2022-08-11 09:14:12 +00:00
def _load_json(path):
    """Parse the JSON document stored at ``path``.

    The file is opened as UTF-8 and closed as soon as it has been read.
    """
    with open(path, encoding="utf-8") as source:
        return json.load(source)


def data_load():
    """Load the JSON dumps and build the in-memory migration storage.

    Reads ``migration/tables/replacements.json`` and the dumps under
    ``migration/data/`` (paths are relative to the current working directory),
    prints how many records each dump holds and fills the lookup indexes:
    users by oid, topics (tags and categories) by oid and slug, content items
    by oid and slug, comments by oid, by content item and - when the content
    item carries a ``slug`` key - by that slug.

    :return: the storage dict consumed by the ``*_handle`` coroutines.
    :raises OSError: if one of the files cannot be opened.
    :raises json.JSONDecodeError: if one of the files is not valid JSON.
    """
    storage = {
        "content_items": {"by_oid": {}, "by_slug": {}},
        "shouts": {"by_oid": {}, "by_slug": {}, "data": []},
        "reactions": {"by_oid": {}, "by_slug": {}, "by_content": {}, "data": []},
        "topics": {"by_oid": {}, "by_slug": {}, "cats": [], "tags": []},
        "remarks": {"data": []},
        "users": {"by_oid": {}, "by_slug": {}, "data": []},
        "replacements": _load_json("migration/tables/replacements.json"),
    }

    users_data = _load_json("migration/data/users.json")
    storage["users"]["data"] = users_data
    print("[migration.load] " + str(len(users_data)) + " users ")

    tags_data = _load_json("migration/data/tags.json")
    storage["topics"]["tags"] = tags_data
    print("[migration.load] " + str(len(tags_data)) + " tags ")

    cats_data = _load_json("migration/data/content_item_categories.json")
    storage["topics"]["cats"] = cats_data
    print("[migration.load] " + str(len(cats_data)) + " cats ")

    comments_data = _load_json("migration/data/comments.json")
    storage["reactions"]["data"] = comments_data
    print("[migration.load] " + str(len(comments_data)) + " comments ")

    content_data = _load_json("migration/data/content_items.json")
    storage["shouts"]["data"] = content_data
    print("[migration.load] " + str(len(content_data)) + " content items ")

    remarks_data = _load_json("migration/data/remarks.json")
    storage["remarks"]["data"] = remarks_data
    print("[migration.load] " + str(len(remarks_data)) + " remarks data ")

    # Users get no slug index here: raw records have no slug yet.
    users_by_oid = storage["users"]["by_oid"]
    for user in users_data:
        users_by_oid[user["_id"]] = user
    print("[migration.load] " + str(len(users_by_oid)) + " users by oid")

    # Tags first, then categories, so a category wins on a key clash.
    topics = storage["topics"]
    for topic in tags_data + cats_data:
        topics["by_oid"][topic["_id"]] = topic
        topics["by_slug"][topic["slug"]] = topic
    print("[migration.load] " + str(len(topics["by_slug"])) + " topics by slug")

    content_items = storage["content_items"]
    for item in content_data:
        slug = get_shout_slug(item)
        content_items["by_slug"][slug] = item
        content_items["by_oid"][item["_id"]] = item
    print("[migration.load] " + str(len(content_data)) + " content items")

    reactions = storage["reactions"]
    for comment in comments_data:
        reactions["by_oid"][comment["_id"]] = comment
        cid = comment["contentItem"]
        # One slot per content item: a later comment replaces an earlier one.
        reactions["by_content"][cid] = comment
        ci = content_items["by_oid"].get(cid, {})
        if "slug" in ci:
            reactions["by_slug"][ci["slug"]] = comment
    print(
        "[migration.load] "
        + str(len(reactions["by_content"]))
        + " with comments"
    )

    return storage
2022-08-11 09:14:12 +00:00
2022-12-25 05:45:13 +00:00
async def handling_migration():
    """Initialise the tables, load the dumps and run the full migration.

    The process arguments (``sys.argv``) are forwarded to ``all_handle``.
    """
    init_tables()
    storage = data_load()
    await all_handle(storage, sys.argv)
2022-11-29 11:51:06 +00:00
2022-12-25 05:45:13 +00:00
def process():
    """Run the whole migration to completion on the current event loop.

    The loop returned by ``asyncio.get_event_loop()`` is used, as before.
    On interpreters where that call raises ``RuntimeError`` because no current
    loop exists, a new loop is created and registered instead.
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # No current event loop in this thread: make one and register it so
        # that later get_event_loop() calls see the same loop.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    loop.run_until_complete(handling_migration())
2022-12-25 05:45:13 +00:00
# Run the migration when this module is executed as a script.
if __name__ == "__main__":
    process()