'''cmd managed migration'''

import sys
import json
import pprint
import base64
import re

import frontmatter

from migration.tables.users import migrate as migrateUser
from migration.tables.content_items import get_metadata, migrate as migrateShout
from migration.tables.content_item_categories import migrate as migrateCategory
from migration.tables.tags import migrate as migrateTag
from migration.tables.comments import migrate as migrateComment
from migration.tables.comments import migrate_2stage as migrateComment_2stage
from migration.utils import DateTimeEncoder
from orm import Community, Topic, User
from orm.base import local_session
from dateutil.parser import parse as date_parse

# pretty-print every diagnostic line this script emits
print = pprint.pprint

# matches inline markdown images with a base64 data URI:
# group 1 - alt text, group 2 - the whole data URI,
# group 3 - image extension, group 4 - the base64 payload
IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,(.*?))\)"
OLD_DATE = '2016-03-05 22:22:00.350000'
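
# extract_images (below) rewrites an embedded image such as
#   ![alt](data:image/png;base64,iVBORw0...)
# into a static file link:
#   ![alt](/static/upload/image-<old_id><n>.png)
# and writes the decoded bytes to '..' + link on disk.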
def extract_images(article):
    '''extract base64-encoded images from markdown in article body'''
    body = article['body']
    images = []
    matches = re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE)
    for i, match in enumerate(matches, start=1):
        ext = match.group(3)
        link = '/static/upload/image-' + article['old_id'] + str(i) + '.' + ext
        img = match.group(4)
        if img not in images:
            open('..' + link, 'wb').write(base64.b64decode(img))
            images.append(img)
        body = body.replace(match.group(2), link)
        print(link)
    article['body'] = body
    return article

def users(users_by_oid, users_by_slug, users_data):
    '''migrate users first'''
    # limiting
    limit = len(users_data)
    if len(sys.argv) > 2:
        limit = int(sys.argv[2])
    print('migrating %d users...' % limit)
    counter = 0
    for entry in users_data[:limit]:
        oid = entry['_id']
        user = migrateUser(entry)
        users_by_oid[oid] = user  # full
        # keep only public fields for the export
        del user['password']
        del user['notifications']
        # del user['oauth']
        del user['emailConfirmed']
        del user['username']
        del user['email']
        users_by_slug[user['slug']] = user  # public
        counter += 1
    # the ten top-rated authors
    export_authors = dict(sorted(users_by_slug.items(), key=lambda item: item[1]['rating'])[-10:])
    try:
        open('migration/data/users.old_id.json', 'w').write(json.dumps(users_by_oid, cls=DateTimeEncoder))  # NOTE: by old_id
        open('migration/data/users.slug.json', 'w').write(json.dumps(users_by_slug, cls=DateTimeEncoder))  # NOTE: by slug
        print(str(len(users_by_slug.items())) + ' users migrated')
    except Exception:
        print('json dump error')
        print(users_by_oid)

def topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data):
    '''topics from categories and tags'''
    # limiting
    limit = len(cats_data) + len(tags_data)
    if len(sys.argv) > 2:
        limit = int(sys.argv[2])
    print('migrating %d topics...' % limit)
    counter = 0
    for cat in cats_data:
        old_id = cat["createdBy"]
        # cat["createdBy"] = user_id_map[old_id]
        topic = migrateCategory(cat)
        topics_by_cat[topic['cat_id']] = topic
        topics_by_slug[topic['slug']] = topic
        counter += 1
    for tag in tags_data:
        old_id = tag["createdBy"]
        tag["createdBy"] = user_id_map.get(old_id, 0)
        topic = migrateTag(tag)
        topics_by_tag[topic['tag_id']] = topic
        if not topics_by_slug.get(topic['slug']):
            topics_by_slug[topic['slug']] = topic
        counter += 1
    export_topics = dict(topics_by_slug.items())  # NOTE: sorting by createdAt does not work here
    open('migration/data/topics.slug.json', 'w').write(json.dumps(topics_by_slug,
        cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False))
    open('migration/data/topics.cat_id.json', 'w').write(json.dumps(topics_by_cat,
        cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False))

def shouts(content_data, shouts_by_slug, shouts_by_oid):
    '''migrate content items one by one'''
    # limiting
    try:
        limit = int(sys.argv[2]) if len(sys.argv) > 2 else len(content_data)
    except ValueError:
        limit = len(content_data)
    print('migrating %d content items...' % limit)
    counter = 0
    discours_author = 0
    errored = []

    # ensure the default community exists
    with local_session() as session:
        community = session.query(Community).filter(Community.id == 0).first()
        if not community:
            Community.create(**{
                'id': 0,
                'slug': 'discours.io',
                'name': 'Дискурс',
                'pic': 'https://discours.io/images/logo-min.svg',
                'createdBy': '0',
                'createdAt': date_parse(OLD_DATE)
            })

    # load topics from the db if the topics command did not just fill them
    if not topics_by_cat:
        with local_session() as session:
            topics = session.query(Topic).all()
            print("loaded %s topics" % len(topics))
            for topic in topics:
                topics_by_cat[topic.cat_id] = topic

    for entry in content_data[:limit]:
        try:
            shout = migrateShout(entry, users_by_oid, topics_by_cat)
            author = shout['authors'][0]
            shout['authors'] = [author.id, ]
            shouts_by_slug[shout['slug']] = shout
            shouts_by_oid[entry['_id']] = shout
            line = str(counter + 1) + ': ' + shout['slug'] + " @" + str(author.slug)
            counter += 1
            if author.slug == 'discours':
                discours_author += 1
            print(line)
            # open('./shouts.id.log', 'a').write(line + '\n')
        except Exception as e:
            print(entry['_id'])
            errored.append(entry)
            raise e
    open('migration/data/shouts.old_id.json', 'w').write(json.dumps(shouts_by_oid, cls=DateTimeEncoder))
    open('migration/data/shouts.slug.json', 'w').write(json.dumps(shouts_by_slug, cls=DateTimeEncoder))
    print(str(counter) + '/' + str(len(content_data)) + ' content items were migrated')
    print(str(discours_author) + ' authored by @discours')

def export_shouts(shouts_by_slug, export_articles, export_authors, content_dict):
    # update what was just migrated or load the jsons again
    if len(export_authors.keys()) == 0:
        export_authors = json.loads(open('../src/data/authors.json').read())
        print(str(len(export_authors.items())) + ' exported authors loaded')
    if len(export_articles.keys()) == 0:
        export_articles = json.loads(open('../src/data/articles.json').read())
        print(str(len(export_articles.items())) + ' exported articles loaded')
    # limiting
    limit = 33
    if len(sys.argv) > 2:
        limit = int(sys.argv[2])
    print('exporting %d articles to json...' % limit)
    # filter
    export_list = [i for i in shouts_by_slug.items() if i[1]['layout'] == 'article']
    export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)
    print(str(len(export_list)) + ' filtered')
    export_list = export_list[:limit or len(export_list)]
    for (slug, article) in export_list:
        if article['layout'] == 'article':
            export_slug(slug, export_articles, export_authors, content_dict)
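
# export_body (below) writes two files per article: a markdown file whose
# YAML frontmatter is built from get_metadata() (the exact fields are defined
# there, not here), and an .html file with the original body taken from the
# content dump keyed by old_id.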
def export_body(article, content_dict):
    article = extract_images(article)
    metadata = get_metadata(article)
    content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
    slug = article['slug']
    open('../content/discours.io/' + slug + '.md', 'w').write(content)
    open('../content/discours.io/' + slug + '.html', 'w').write(content_dict[article['old_id']]['body'])

def export_slug(slug, export_articles, export_authors, content_dict):
    print('exporting %s' % slug)
    if export_authors == {}:
        export_authors = json.loads(open('../src/data/authors.json').read())
        print(str(len(export_authors.items())) + ' exported authors loaded')
    if export_articles == {}:
        export_articles = json.loads(open('../src/data/articles.json').read())
        print(str(len(export_articles.items())) + ' exported articles loaded')
    shout = shouts_by_slug.get(slug, False)
    assert shout, 'no data error'
    author = users_by_slug.get(shout['authors'][0]['slug'], None)
    export_authors.update({shout['authors'][0]['slug']: author})
    export_articles.update({shout['slug']: shout})
    export_body(shout, content_dict)
    # migrate this article's raw comments, keyed by the old content item id
    comments(comments_by_post.get(shout['old_id'], []))

def comments(comments_data):
    '''two-stage comment migration: the first pass migrates each comment and
    builds a map of old ids to new ids, the second pass reruns every comment
    through migrateComment_2stage with that map'''
    id_map = {}
    for comment in comments_data:
        comment = migrateComment(comment)
        new_id = comment.get('id')
        old_id = comment.get('old_id')
        id_map[old_id] = new_id
    for comment in comments_data:
        migrateComment_2stage(comment, id_map)
    print(str(len(id_map)) + ' comments exported')

def export_finish(export_articles={}, export_authors={}, export_topics={}, export_comments={}):
    open('../src/data/authors.json', 'w').write(json.dumps(export_authors,
        cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False))
    print(str(len(export_authors.items())) + ' authors exported')
    open('../src/data/topics.json', 'w').write(json.dumps(export_topics,
        cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False))
    print(str(len(export_topics.keys())) + ' topics exported')
    open('../src/data/articles.json', 'w').write(json.dumps(export_articles,
        cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False))
    print(str(len(export_articles.items())) + ' articles exported')
    open('../src/data/comments.json', 'w').write(json.dumps(export_comments,
        cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False))
    print(str(len(export_comments.items())) + ' articles with comments exported')
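
# NOTE: export_finish is only referenced by the commented-out call in the
# 'slug' branch below.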

if __name__ == '__main__':
    if len(sys.argv) > 1:
        cmd = sys.argv[1]
        if cmd == "bson":
            # decode bson
            from migration import bson2json
            bson2json.json_tables()
        else:
            # preparing data
            users_data = json.loads(open('migration/data/users.json').read())
            # users_dict = { x['_id']: x for x in users_data }  # by id
            print(str(len(users_data)) + ' users loaded')
            users_by_oid = {}
            users_by_slug = {}
            with local_session() as session:
                default_user = session.query(User).filter(User.id == 0).first()
                if not default_user:
                    default_user = User.create(id=0, email="discours@discours.io", username="discours", slug="default", old_id=0)
            user_id_map = {}
            with local_session() as session:
                users_list = session.query(User).all()
                for user in users_list:
                    user_id_map[user.old_id] = user.id
                    users_by_oid[user.old_id] = vars(user)
            tags_data = json.loads(open('migration/data/tags.json').read())
            print(str(len(tags_data)) + ' tags loaded')
            cats_data = json.loads(open('migration/data/content_item_categories.json').read())
            print(str(len(cats_data)) + ' cats loaded')
            topics_by_cat = {}
            topics_by_tag = {}
            topics_by_slug = {}
            content_data = json.loads(open('migration/data/content_items.json').read())
            content_dict = {x['_id']: x for x in content_data}
            print(str(len(content_data)) + ' content items loaded')
            shouts_by_slug = {}
            shouts_by_oid = {}
            comments_data = json.loads(open('migration/data/comments.json').read())
            print(str(len(comments_data)) + ' comments loaded')
            comments_by_post = {}
            # group comments by old post id
            for old_comment in comments_data:
                cid = old_comment['contentItem']
                comments_by_post[cid] = comments_by_post.get(cid, [])
                # collect only comments without a deletedAt mark
                if not old_comment.get('deletedAt', True):
                    comments_by_post[cid].append(old_comment)
            print(str(len(comments_by_post.keys())) + ' articles with comments')
            export_articles = {}  # slug: shout
            export_authors = {}  # slug: user
            export_comments = {}  # shout-slug: comment[] (list)
            export_topics = {}  # slug: topic
            ##################### COMMANDS #####################
            if cmd == "users":
                users(users_by_oid, users_by_slug, users_data)
            elif cmd == "topics":
                topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data)
            elif cmd == "shouts":
                shouts(content_data, shouts_by_slug, shouts_by_oid)  # NOTE: respects the limit argument
            elif cmd == "comments":
                comments(comments_data)
            elif cmd == "export_shouts":
                export_shouts(shouts_by_slug, export_articles, export_authors, content_dict)
            elif cmd == "all":
                users(users_by_oid, users_by_slug, users_data)
                topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data)
                shouts(content_data, shouts_by_slug, shouts_by_oid)
                comments(comments_data)
            elif cmd == 'slug':
                export_slug(sys.argv[2], export_articles, export_authors, content_dict)
            # export_finish(export_articles, export_authors, export_topics, export_comments)
    else:
        print('''
usage: python migrate.py <command>

commands:
    bson
    topics <limit>
    users <limit>
    shouts <limit>
    export_shouts <limit>
    comments
    slug <slug>
    all
''')
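
# example invocations (data paths are relative to the working directory):
#   python migrate.py bson       # decode the bson dumps first (see migration/bson2json)
#   python migrate.py users 10   # migrate only the first 10 users
#   python migrate.py all        # run users, topics, shouts, then comments in order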