This commit is contained in:
Igor Lobanov
2023-10-26 22:38:31 +02:00
parent 1c49780cd4
commit c2cc428abe
64 changed files with 631 additions and 626 deletions

View File

@@ -1,18 +1,12 @@
""" cmd managed migration """
import asyncio
import gc
import json
import sys
from datetime import datetime, timezone
import bs4
from migration.export import export_mdx
from migration.tables.comments import migrate as migrateComment
from migration.tables.comments import migrate_2stage as migrateComment_2stage
from migration.tables.content_items import get_shout_slug
from migration.tables.content_items import migrate as migrateShout
from migration.tables.remarks import migrate as migrateRemark
# from migration.tables.remarks import migrate as migrateRemark
from migration.tables.topics import migrate as migrateTopic
from migration.tables.users import migrate as migrateUser
from migration.tables.users import migrate_2stage as migrateUser_2stage
@@ -20,6 +14,12 @@ from migration.tables.users import post_migrate as users_post_migrate
from orm import init_tables
from orm.reaction import Reaction
import asyncio
import bs4
import gc
import json
import sys
TODAY = datetime.strftime(datetime.now(tz=timezone.utc), "%Y%m%d")
OLD_DATE = "2016-03-05 22:22:00.350000"
@@ -111,7 +111,7 @@ async def shouts_handle(storage, args):
# print main counter
counter += 1
print(
'[migration] shouts_handle %d: %s @%s'
"[migration] shouts_handle %d: %s @%s"
% ((counter + 1), shout_dict["slug"], author["slug"])
)
@@ -132,13 +132,13 @@ async def shouts_handle(storage, args):
print("[migration] " + str(anonymous_author) + " authored by @anonymous")
async def remarks_handle(storage):
    """Migrate every remark entry found in ``storage["remarks"]["data"]``.

    Each entry is handed to ``migrateRemark(entry, storage)``; the number of
    processed entries is printed when the loop finishes.

    :param storage: migration storage mapping; only ``storage["remarks"]["data"]``
        (an iterable of remark entries) is read here.
    :return: None
    """
    # Local import keeps this handler self-contained.
    import inspect

    # The banner previously said "comments", which mislabelled this stage.
    print("[migration] remarks")
    migrated = 0
    for entry_remark in storage["remarks"]["data"]:
        result = migrateRemark(entry_remark, storage)
        # NOTE(review): migrateRemark looks like a plain (non-async) function;
        # awaiting a non-awaitable return value raises TypeError, so only
        # await when the call actually produced an awaitable.
        if inspect.isawaitable(result):
            await result
        migrated += 1
    print("[migration] " + str(migrated) + " remarks migrated")
# async def remarks_handle(storage):
# print("[migration] comments")
# c = 0
# for entry_remark in storage["remarks"]["data"]:
# remark = await migrateRemark(entry_remark, storage)
# c += 1
# print("[migration] " + str(c) + " remarks migrated")
async def comments_handle(storage):
@@ -149,9 +149,9 @@ async def comments_handle(storage):
for oldcomment in storage["reactions"]["data"]:
if not oldcomment.get("deleted"):
reaction = await migrateComment(oldcomment, storage)
if type(reaction) == str:
if isinstance(reaction, str):
missed_shouts[reaction] = oldcomment
elif type(reaction) == Reaction:
elif isinstance(reaction, Reaction):
reaction = reaction.dict()
rid = reaction["id"]
oid = reaction["oid"]

View File

@@ -1,11 +1,10 @@
from .utils import DateTimeEncoder
import bson
import gc
import json
import os
import bson
from .utils import DateTimeEncoder
def json_tables():
print("[migration] unpack dump/discours/*.bson to migration/data/*.json")
@@ -19,7 +18,7 @@ def json_tables():
"remarks": [],
}
for table in data.keys():
print('[migration] bson2json for ' + table)
print("[migration] bson2json for " + table)
gc.collect()
lc = []
bs = open("dump/discours/" + table + ".bson", "rb").read()

View File

@@ -1,11 +1,10 @@
import json
import os
from .extract import extract_html, extract_media
from .utils import DateTimeEncoder
from datetime import datetime, timezone
import frontmatter
from .extract import extract_html, extract_media
from .utils import DateTimeEncoder
import json
import os
OLD_DATE = "2016-03-05 22:22:00.350000"
EXPORT_DEST = "../discoursio-web/data/"

View File

@@ -1,9 +1,11 @@
from bs4 import BeautifulSoup
import base64
import os
import re
import uuid
from bs4 import BeautifulSoup
# import uuid
TOOLTIP_REGEX = r"(\/\/\/(.+)\/\/\/)"
contentDir = os.path.join(
@@ -26,40 +28,40 @@ def replace_tooltips(body):
return newbody
def extract_footnotes(body, shout_dict):
    """Split *body* on ``&&&`` markers and store footnote anchors as reactions.

    The body is only processed when splitting yields an odd number of parts
    greater than one (i.e. the markers come in pairs). Every odd-indexed part
    is text enclosed between two markers; when it contains a
    ``<a class="footnote-url" href="...">`` anchor, a FOOTNOTE reaction is
    created for the shout and the part is replaced by an empty anchor.

    :param body: text to scan.
    :param shout_dict: shout data; ``shout_dict["id"]`` is attached to each
        created reaction.
    :return: tuple ``(new_body, placed)`` - the parts re-joined without the
        ``&&&`` markers, and whether any enclosed part was encountered.
    """
    # NOTE(review): local_session, Reaction and ReactionKind are not among the
    # imports visible at the top of this module - confirm they are in scope.
    parts = body.split("&&&")
    newparts = list(parts)
    placed = False
    total = len(parts)
    if total & 1 and total > 1:
        fn = 'a class="footnote-url" href="'
        print("[extract] found %d footnotes in body" % (total - 1))
        for i, part in enumerate(parts[1:], start=1):
            if not i & 1:
                # even-indexed parts are plain text between marker pairs
                continue
            placed = True
            if fn not in part:
                continue
            print("[extract] footnote: " + part)
            anchor_tail = part.split(fn, 1)[1]
            # was assigned to a misspelled name and then read under the
            # correct one, which raised NameError
            extracted_link = anchor_tail.split('"', 1)[0]
            extracted_body = anchor_tail.split(">", 1)[1].split("</a>", 1)[0]
            print("[extract] footnote link: " + extracted_link)
            # NOTE(review): the end offset does not add len(extracted_body);
            # arithmetic kept as it was - confirm the intended range.
            range_start = body.index(fn + extracted_link) - len("<")
            range_end = body.index(extracted_body) + len("</a>")
            with local_session():
                # keyword form, matching the other ``.create(**...)`` calls
                Reaction.create(
                    **{
                        "shout": shout_dict["id"],
                        "kind": ReactionKind.FOOTNOTE,
                        "body": extracted_body,
                        "range": str(range_start) + ":" + str(range_end),
                    }
                )
            newparts[i] = "<a href='#'></a>"
    return ("".join(newparts), placed)
# def extract_footnotes(body, shout_dict):
# parts = body.split("&&&")
# lll = len(parts)
# newparts = list(parts)
# placed = False
# if lll & 1:
# if lll > 1:
# i = 1
# print("[extract] found %d footnotes in body" % (lll - 1))
# for part in parts[1:]:
# if i & 1:
# placed = True
# if 'a class="footnote-url" href=' in part:
# print("[extract] footnote: " + part)
# fn = 'a class="footnote-url" href="'
# # exxtracted_link = part.split(fn, 1)[1].split('"', 1)[0]
# extracted_body = part.split(fn, 1)[1].split(">", 1)[1].split("</a>", 1)[0]
# print("[extract] footnote link: " + extracted_link)
# with local_session() as session:
# Reaction.create(
# {
# "shout": shout_dict["id"],
# "kind": ReactionKind.FOOTNOTE,
# "body": extracted_body,
# "range": str(body.index(fn + link) - len("<"))
# + ":"
# + str(body.index(extracted_body) + len("</a>")),
# }
# )
# newparts[i] = "<a href='#'></a>"
# else:
# newparts[i] = part
# i += 1
# return ("".join(newparts), placed)
def place_tooltips(body):
@@ -228,7 +230,6 @@ di = "data:image"
def extract_md_images(body, prefix):
newbody = ""
body = (
body.replace("\n! [](" + di, "\n ![](" + di)
.replace("\n[](" + di, "\n![](" + di)
@@ -236,10 +237,10 @@ def extract_md_images(body, prefix):
)
parts = body.split(di)
if len(parts) > 1:
newbody = extract_dataimages(parts, prefix)
new_body = extract_dataimages(parts, prefix)
else:
newbody = body
return newbody
new_body = body
return new_body
def cleanup_md(body):
@@ -262,28 +263,28 @@ def cleanup_md(body):
return newbody
def extract_md(body, shout_dict=None):
    """Run the markdown post-processing steps over *body*.

    A falsy body is returned unchanged. Otherwise the body goes through
    ``cleanup_md``; when *shout_dict* is given it is additionally passed
    through ``extract_md_images`` and ``extract_footnotes``.

    :param body: markdown text (may be empty or None).
    :param shout_dict: optional shout data; ``shout_dict["id"]`` is used as the
        image prefix, with a random UUID as fallback when it is falsy.
    :return: the processed body.
    :raises Exception: when any step yields an empty result.
    """
    if not body:
        return body

    newbody = cleanup_md(body)
    if not newbody:
        raise Exception("cleanup error")

    if shout_dict:
        # Local import so the fallback does not depend on a module-level import.
        import uuid

        uid = shout_dict["id"] or uuid.uuid4()
        newbody = extract_md_images(newbody, uid)
        if not newbody:
            raise Exception("extract_images error")

        # Feed the already processed text forward; passing the untouched
        # ``body`` here discarded the cleanup and image-extraction results.
        newbody, _placed = extract_footnotes(newbody, shout_dict)
        if not newbody:
            raise Exception("extract_footnotes error")

    return newbody
# def extract_md(body, shout_dict=None):
# newbody = body
# if newbody:
# newbody = cleanup_md(newbody)
# if not newbody:
# raise Exception("cleanup error")
#
# if shout_dict:
# uid = shout_dict["id"] or uuid.uuid4()
# newbody = extract_md_images(newbody, uid)
# if not newbody:
# raise Exception("extract_images error")
#
# newbody, placed = extract_footnotes(body, shout_dict)
# if not newbody:
# raise Exception("extract_footnotes error")
#
# return newbody
def extract_media(entry):
'''normalized media extraction method'''
"""normalized media extraction method"""
# media [ { title pic url body } ]}
kind = entry.get("type")
if not kind:
@@ -398,16 +399,14 @@ def cleanup_html(body: str) -> str:
return new_body
def extract_html(entry, shout_id=None, cleanup=False):
body_orig = (entry.get("body") or "").replace('\(', '(').replace('\)', ')')
def extract_html(entry, cleanup=False):
body_orig = (entry.get("body") or "").replace(r"\(", "(").replace(r"\)", ")")
if cleanup:
# we do that before bs parsing to catch the invalid html
body_clean = cleanup_html(body_orig)
if body_clean != body_orig:
print(f"[migration] html cleaned for slug {entry.get('slug', None)}")
body_orig = body_clean
if shout_id:
extract_footnotes(body_orig, shout_id)
body_html = str(BeautifulSoup(body_orig, features="html.parser"))
if cleanup:
# we do that after bs parsing because it can add dummy tags

View File

@@ -1,13 +1,5 @@
"""html2text: Turn HTML into equivalent Markdown-structured text."""
import html.entities
import html.parser
import re
import string
import urllib.parse as urlparse
from textwrap import wrap
from typing import Dict, List, Optional, Tuple, Union
from . import config
from .elements import AnchorElement, ListElement
from .typing import OutCallback
@@ -26,6 +18,14 @@ from .utils import (
skipwrap,
unifiable_n,
)
from textwrap import wrap
from typing import Dict, List, Optional, Tuple, Union
import html.entities
import html.parser
import re
import string
import urllib.parse as urlparse
__version__ = (2020, 1, 16)

View File

@@ -1,8 +1,8 @@
from . import __version__, config, HTML2Text
import argparse
import sys
from . import HTML2Text, __version__, config
# noinspection DuplicatedCode
def main() -> None:

View File

@@ -1,7 +1,7 @@
import html.entities
from . import config
from typing import Dict, List, Optional
from . import config
import html.entities
unifiable_n = {
html.entities.name2codepoint[k]: v for k, v in config.UNIFIABLE.items() if k != "nbsp"

View File

@@ -1,8 +1,6 @@
from datetime import datetime, timezone
from dateutil.parser import parse as date_parse
from base.orm import local_session
from datetime import datetime, timezone
from dateutil.parser import parse as date_parse
from migration.html2text import html2text
from orm.reaction import Reaction, ReactionKind
from orm.shout import Shout, ShoutReactionsFollower
@@ -30,12 +28,12 @@ def auto_followers(session, topics, reaction_dict):
tf = (
session.query(TopicFollower)
.where(TopicFollower.follower == reaction_dict["createdBy"])
.filter(TopicFollower.topic == t['id'])
.filter(TopicFollower.topic == t["id"])
.first()
)
if not tf:
topic_following = TopicFollower.create(
follower=reaction_dict["createdBy"], topic=t['id'], auto=True
follower=reaction_dict["createdBy"], topic=t["id"], auto=True
)
session.add(topic_following)
@@ -57,13 +55,13 @@ def migrate_ratings(session, entry, reaction_dict):
rr = Reaction.create(**re_reaction_dict)
following2 = (
session.query(ShoutReactionsFollower)
.where(ShoutReactionsFollower.follower == re_reaction_dict['createdBy'])
.where(ShoutReactionsFollower.follower == re_reaction_dict["createdBy"])
.filter(ShoutReactionsFollower.shout == rr.shout)
.first()
)
if not following2:
following2 = ShoutReactionsFollower.create(
follower=re_reaction_dict['createdBy'], shout=rr.shout, auto=True
follower=re_reaction_dict["createdBy"], shout=rr.shout, auto=True
)
session.add(following2)
session.add(rr)
@@ -160,9 +158,9 @@ async def migrate(entry, storage):
def migrate_2stage(old_comment, idmap):
if old_comment.get('body'):
new_id = idmap.get(old_comment.get('oid'))
new_id = idmap.get(old_comment.get('_id'))
if old_comment.get("body"):
new_id = idmap.get(old_comment.get("oid"))
new_id = idmap.get(old_comment.get("_id"))
if new_id:
new_replyto_id = None
old_replyto_id = old_comment.get("replyTo")

View File

@@ -1,18 +1,17 @@
import json
import re
from datetime import datetime, timezone
from dateutil.parser import parse as date_parse
from sqlalchemy.exc import IntegrityError
from transliterate import translit
from base.orm import local_session
from datetime import datetime, timezone
from dateutil.parser import parse as date_parse
from migration.extract import extract_html, extract_media
from orm.reaction import Reaction, ReactionKind
from orm.shout import Shout, ShoutReactionsFollower, ShoutTopic
from orm.topic import Topic, TopicFollower
from orm.user import User
from services.stat.viewed import ViewedStorage
from sqlalchemy.exc import IntegrityError
from transliterate import translit
import json
import re
OLD_DATE = "2016-03-05 22:22:00.350000"
ts = datetime.now(tz=timezone.utc)
@@ -35,7 +34,7 @@ def get_shout_slug(entry):
slug = friend.get("slug", "")
if slug:
break
slug = re.sub('[^0-9a-zA-Z]+', '-', slug)
slug = re.sub("[^0-9a-zA-Z]+", "-", slug)
return slug
@@ -43,27 +42,27 @@ def create_author_from_app(app):
user = None
userdata = None
# check if email is used
if app['email']:
if app["email"]:
with local_session() as session:
user = session.query(User).where(User.email == app['email']).first()
user = session.query(User).where(User.email == app["email"]).first()
if not user:
# print('[migration] app %r' % app)
name = app.get('name')
name = app.get("name")
if name:
slug = translit(name, "ru", reversed=True).lower()
slug = re.sub('[^0-9a-zA-Z]+', '-', slug)
print('[migration] created slug %s' % slug)
slug = re.sub("[^0-9a-zA-Z]+", "-", slug)
print("[migration] created slug %s" % slug)
# check if slug is used
if slug:
user = session.query(User).where(User.slug == slug).first()
# get slug from email
if user:
slug = app['email'].split('@')[0]
slug = app["email"].split("@")[0]
user = session.query(User).where(User.slug == slug).first()
# one more try
if user:
slug += '-author'
slug += "-author"
user = session.query(User).where(User.slug == slug).first()
# create user with application data
@@ -81,7 +80,7 @@ def create_author_from_app(app):
user = User.create(**userdata)
session.add(user)
session.commit()
userdata['id'] = user.id
userdata["id"] = user.id
userdata = user.dict()
return userdata
@@ -119,14 +118,14 @@ async def get_user(entry, storage):
elif user_oid:
userdata = storage["users"]["by_oid"].get(user_oid)
if not userdata:
print('no userdata by oid, anonymous')
print("no userdata by oid, anonymous")
userdata = anondict
print(app)
# cleanup slug
if userdata:
slug = userdata.get("slug", "")
if slug:
slug = re.sub('[^0-9a-zA-Z]+', '-', slug)
slug = re.sub("[^0-9a-zA-Z]+", "-", slug)
userdata["slug"] = slug
else:
userdata = anondict
@@ -160,7 +159,7 @@ async def migrate(entry, storage):
}
# main topic patch
r['mainTopic'] = r['topics'][0]
r["mainTopic"] = r["topics"][0]
# published author auto-confirm
if entry.get("published"):
@@ -183,7 +182,7 @@ async def migrate(entry, storage):
shout_dict["oid"] = entry.get("_id", "")
shout = await create_shout(shout_dict)
except IntegrityError as e:
print('[migration] create_shout integrity error', e)
print("[migration] create_shout integrity error", e)
shout = await resolve_create_shout(shout_dict)
except Exception as e:
raise Exception(e)
@@ -202,7 +201,7 @@ async def migrate(entry, storage):
# shout views
await ViewedStorage.increment(
shout_dict["slug"], amount=entry.get("views", 1), viewer='old-discours'
shout_dict["slug"], amount=entry.get("views", 1), viewer="old-discours"
)
# del shout_dict['ratings']
@@ -240,7 +239,7 @@ async def add_topics_follower(entry, storage, user):
session.add(tf)
session.commit()
except IntegrityError:
print('[migration.shout] hidden by topic ' + tpc.slug)
print("[migration.shout] hidden by topic " + tpc.slug)
# main topic
maintopic = storage["replacements"].get(topics_by_oid.get(category, {}).get("slug"))
if maintopic in ttt:
@@ -261,7 +260,7 @@ async def process_user(userdata, storage, oid):
if not user:
try:
slug = userdata["slug"].lower().strip()
slug = re.sub('[^0-9a-zA-Z]+', '-', slug)
slug = re.sub("[^0-9a-zA-Z]+", "-", slug)
userdata["slug"] = slug
user = User.create(**userdata)
session.add(user)
@@ -289,9 +288,9 @@ async def resolve_create_shout(shout_dict):
s = session.query(Shout).filter(Shout.slug == shout_dict["slug"]).first()
bump = False
if s:
if s.createdAt != shout_dict['createdAt']:
if s.createdAt != shout_dict["createdAt"]:
# create new with different slug
shout_dict["slug"] += '-' + shout_dict["layout"]
shout_dict["slug"] += "-" + shout_dict["layout"]
try:
await create_shout(shout_dict)
except IntegrityError as e:

View File

@@ -5,24 +5,24 @@ from orm.reaction import Reaction, ReactionKind
def migrate(entry, storage):
post_oid = entry['contentItem']
post_oid = entry["contentItem"]
print(post_oid)
shout_dict = storage['shouts']['by_oid'].get(post_oid)
shout_dict = storage["shouts"]["by_oid"].get(post_oid)
if shout_dict:
print(shout_dict['body'])
print(shout_dict["body"])
remark = {
"shout": shout_dict['id'],
"body": extract_md(html2text(entry['body']), shout_dict),
"shout": shout_dict["id"],
"body": extract_md(html2text(entry["body"]), shout_dict),
"kind": ReactionKind.REMARK,
}
if entry.get('textBefore'):
remark['range'] = (
str(shout_dict['body'].index(entry['textBefore'] or ''))
+ ':'
if entry.get("textBefore"):
remark["range"] = (
str(shout_dict["body"].index(entry["textBefore"] or ""))
+ ":"
+ str(
shout_dict['body'].index(entry['textAfter'] or '')
+ len(entry['textAfter'] or '')
shout_dict["body"].index(entry["textAfter"] or "")
+ len(entry["textAfter"] or "")
)
)

View File

@@ -1,11 +1,10 @@
import re
from base.orm import local_session
from bs4 import BeautifulSoup
from dateutil.parser import parse
from orm.user import AuthorFollower, User, UserRating
from sqlalchemy.exc import IntegrityError
from base.orm import local_session
from orm.user import AuthorFollower, User, UserRating
import re
def migrate(entry):
@@ -33,12 +32,12 @@ def migrate(entry):
if entry.get("profile"):
# slug
slug = entry["profile"].get("path").lower()
slug = re.sub('[^0-9a-zA-Z]+', '-', slug).strip()
slug = re.sub("[^0-9a-zA-Z]+", "-", slug).strip()
user_dict["slug"] = slug
bio = (
(entry.get("profile", {"bio": ""}).get("bio") or "")
.replace('\(', '(')
.replace('\)', ')')
.replace(r"\(", "(")
.replace(r"\)", ")")
)
bio_text = BeautifulSoup(bio, features="lxml").text
@@ -144,7 +143,7 @@ def migrate_2stage(entry, id_map):
}
user_rating = UserRating.create(**user_rating_dict)
if user_rating_dict['value'] > 0:
if user_rating_dict["value"] > 0:
af = AuthorFollower.create(author=user.id, follower=rater.id, auto=True)
session.add(af)
session.add(user_rating)