From c3e0c5720ab6199d691bd5ff5fc645e7e26ab530 Mon Sep 17 00:00:00 2001
From: Untone <anton.rewin@gmail.com>
Date: Fri, 8 Oct 2021 07:42:59 +0300
Subject: [PATCH] upgrade migration

---
 migrate.py                        | 186 ++++++++++++++++++++----------
 migration/tables/content_items.py |  70 ++++++-----
 migration/tables/tags.py          |  28 ++++-
 migration/tables/users.py         | 150 ++++++++++++------------
 4 files changed, 260 insertions(+), 174 deletions(-)

diff --git a/migrate.py b/migrate.py
index a8ce0e75..5ed76323 100644
--- a/migrate.py
+++ b/migrate.py
@@ -1,18 +1,45 @@
+''' cmd managed migration '''
 import json
+import base64
+import re
 from migration.tables.users import migrate as migrateUser
 from migration.tables.content_items import migrate as migrateShout
-from migration.tables.content_item_categories import migrate as migrateTopic
+from migration.tables.content_item_categories import migrate as migrateCategory
+from migration.tables.tags import migrate as migrateTag
 from migration.utils import DateTimeEncoder
 from orm import Community
 
-def users(limit):
+
+IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,(.*?))\)"
+OLD_DATE = '2016-03-05 22:22:00.350000'
+
+
+def extract_images(article):
+    ''' extract b64 encoded images from markdown in article body,
+    write them under ../static/upload and link them from the body '''
+    body = article['body']
+    images = set()  # payloads already written, so a repeated image is saved once
+    matches = re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE)
+    for i, match in enumerate(matches, start=1):
+        link = '/static/upload/image-' + article['old_id'] + str(i) + '.' + match.group(3)
+        img = match.group(4)
+        if img not in images:
+            with open('..' + link, 'wb') as image_file:  # closed even on error
+                image_file.write(base64.b64decode(img))
+            images.add(img)
+        body = body.replace(match.group(2), link)  # group(2) is the whole data URI
+        print(link)
+    article['body'] = body
+    return article
+
+
+def users():
+    ''' migrating users first '''
     print('migrating users...')
-    data = json.loads(open('migration/data/users.json').read())
     newdata = {}
-    exportData = {}
+    data = json.loads(open('migration/data/users.json').read())
     counter = 0
-    # limit = 100
-    #try:
+    export_data = {}
     for entry in data:
         oid = entry['_id']
         user = migrateUser(entry)
@@ -23,96 +50,127 @@ def users(limit):
         del user['emailConfirmed']
         del user['username']
         del user['email']
-        exportData[user['slug']] = user
+        export_data[user['slug']] = user
         counter += 1
-        if counter > limit:
-            break
-    #except Exception:
-    #    print(str(counter) + '/' + str(len(data)) + ' users entries were migrated')
-    #    print('try to remove database first')
-    open('migration/data/users.dict.json','w').write( json.dumps(newdata, cls=DateTimeEncoder) )
-    open('../src/data/authors.json','w').write( json.dumps(exportData, cls=DateTimeEncoder) )
-    print(str(counter) + ' users entries were migrated')
+    export_list = sorted(export_data.items(),
+                        key=lambda item: item[1]['rating'])[-10:]
+    open('migration/data/users.dict.json',
+         'w').write(json.dumps(newdata, cls=DateTimeEncoder))  # NOTE: by old_id
+    open('../src/data/authors.json', 'w').write(json.dumps(dict(export_list),
+                                                           cls=DateTimeEncoder,
+                                                           indent=4,
+                                                           sort_keys=True,
+                                                           ensure_ascii=False))
+    print(str(len(newdata.items())) + ' user accounts were migrated')
+    print(str(len(export_list)) + ' authors were exported')
 
 
 def topics():
+    ''' build topics from categories and tags, export the latest ten '''
     print('migrating topics...')
-    data = json.loads(open('migration/data/content_item_categories.json').read())
+    with open('migration/data/content_item_categories.json') as cat_file, \
+            open('migration/data/tags.json') as tag_file:
+        cat_data, tag_data = json.load(cat_file), json.load(tag_file)
     newdata = {}
-    exportData = {}
     counter = 0
     try:
-        for entry in data:
-            oid = entry['_id']
-            newdata[oid] = migrateTopic(entry)
-            exportData[entry['slug']] = newdata[oid]
+        for cat in cat_data:
+            topic = migrateCategory(cat)
+            newdata[topic['slug']] = topic
             counter += 1
     except Exception:
-        print(str(counter) + '/' + str(len(data)) + ' topics were migrated')
-        print('try to remove database first')
-    open('migration/data/topics.dict.json','w').write( json.dumps(newdata, cls=DateTimeEncoder) )
-    open('../src/data/topics.json','w').write( json.dumps(exportData, cls=DateTimeEncoder) )
-    print(str(counter) + ' topics were migrated')
+        print('cats exception, try to remove database first')
+    try:
+        for tag in tag_data:
+            topic = migrateTag(tag)
+            newdata[topic['slug']] = topic
+            counter += 1
+    except Exception:
+        print('tags exception, try to remove database first')
+        raise  # keep the original error and its traceback
+    export_list = sorted(newdata.items(), key=lambda item: str(
+        item[1]['createdAt']))[-10:]
+    with open('migration/data/topics.dict.json', 'w') as dict_file:
+        dict_file.write(json.dumps(newdata, cls=DateTimeEncoder))
+    with open('../src/data/topics.json', 'w', encoding='utf-8') as export_file:  # ensure_ascii=False emits raw unicode
+        export_file.write(json.dumps(dict(export_list), cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False))
+    print(str(counter) + ' from ' + str(len(cat_data)) +
+          ' cats and ' + str(len(tag_data)) + ' tags were migrated')
+    print(str(len(export_list)) + ' topics were exported')
 
-def shouts(limit):
+
+def shouts():
+    ''' migrate content items one by one, then export the article files '''
     print('loading shouts...')
     counter = 0
-    discoursAuthor = 0
-    data = json.loads(open('migration/data/content_items.json').read())
+    discours_author = 0
+    content_data = json.loads(open('migration/data/content_items.json').read())
     newdata = {}
-    print(str(len(data)) + ' entries loaded. now migrating...')
+    print(str(len(content_data)) + ' entries loaded. now migrating...')
+    contents = {}  # slug -> content returned by migrateShout for that shout
     errored = []
-    exportData = {}
-    for entry in data:
+    for entry in content_data:
         try:
-            oid = entry['_id']
-            shout = migrateShout(entry)
-            newdata[oid] = shout
-            author = newdata[oid]['authors'][0]['slug']
-            line = str(counter) + ': ' + newdata[oid]['slug'] + " @" + str(author)
-            if shout['layout'] == 'article':
-                counter += 1
-                exportData[shout['slug']] = shout
-                print(line)
-            # counter += 1
+            (shout, content) = migrateShout(entry)
+            newdata[shout['slug']] = shout
+            contents[shout['slug']] = content
+            author = shout['authors'][0]['slug']
+            line = str(counter+1) + ': ' + shout['slug'] + " @" + str(author)
+            print(line)
+            counter += 1
             if author == 'discours.io':
-                discoursAuthor += 1
-            open('./shouts.id.log','a').write(line + '\n')
-            if counter > limit:
-                break
+                discours_author += 1
+            with open('./shouts.id.log', 'a') as log_file:
+                log_file.write(line + '\n')
         except Exception:
             print(entry['_id'])
             errored.append(entry)
-            raise Exception
+            raise  # re-raise the original error instead of a bare Exception
+    try:
+        limit = int(sys.argv[2]) if len(sys.argv) > 2 else len(content_data)
+    except ValueError:
+        limit = len(content_data)
+    export_list = sorted(newdata.items(), key=lambda item: str(
+        item[1]['createdAt']) if item[1]['layout'] == 'article' else OLD_DATE)[:limit]
+    export_clean = {}
+    for slug, a in export_list:
+        export_clean[slug] = extract_images(a)
+        with open('../content/discours.io/'+slug+'.md', 'w', encoding='utf-8') as md_file:
+            md_file.write(contents[slug])  # this shout's own content, not the last migrated one
+    with open('migration/data/shouts.dict.json', 'w') as dict_file:
+        dict_file.write(json.dumps(newdata, cls=DateTimeEncoder))
+    with open('../src/data/articles.json', 'w', encoding='utf-8') as export_file:
+        export_file.write(json.dumps(export_clean, cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False))
+    print(str(counter) + '/' + str(len(content_data)) + ' content items were migrated')
+    print(str(len(export_list)) + ' shouts were exported')
+    print(str(discours_author) + ' from them by @discours.io')
 
-    open('migration/data/shouts.dict.json','w').write( json.dumps(newdata, cls=DateTimeEncoder) )
-    open('../src/data/articles.json','w').write( json.dumps(exportData, cls=DateTimeEncoder) )
-    print(str(counter) + ' shouts were migrated')
-    print(str(discoursAuthor) + ' from them by @discours.io')
-    print(str(len(errored)) + ' shouts without authors')
 
 if __name__ == '__main__':
     import sys
     if len(sys.argv) > 1:
         if sys.argv[1] == "users":
-            users(668)
+            users()
         elif sys.argv[1] == "topics":
             topics()
         elif sys.argv[1] == "shouts":
-            Community.create(**{
-                'slug': 'discours.io',
-                'name': 'Дискурс',
-                'pic': 'https://discours.io/images/logo-min.svg',
-                'createdBy': '0',
-                'createdAt': ts
+            try:
+                Community.create(**{
+                    'slug': 'discours.io',
+                    'name': 'Дискурс',
+                    'pic': 'https://discours.io/images/logo-min.svg',
+                    'createdBy': '0',
+                    'createdAt': OLD_DATE
                 })
-            shouts(3626)
+            except Exception as err:  # best effort: report the failure, then go on
+                print('community was not created: ' + str(err))
+            shouts()
         elif sys.argv[1] == "all":
+            users()
             topics()
-            users(668)
-            shouts(3626)
+            shouts()
         elif sys.argv[1] == "bson":
-            import migration.bson2json
+            from migration import bson2json
             bson2json.json_tables()
     else:
-        print('usage: python migrate.py <all|topics|users|shouts|comments>')
\ No newline at end of file
+        print('usage: python migrate.py <bson|all|topics|users|shouts>')
diff --git a/migration/tables/content_items.py b/migration/tables/content_items.py
index d878b80d..7a1d16e4 100644
--- a/migration/tables/content_items.py
+++ b/migration/tables/content_items.py
@@ -16,8 +16,9 @@ users_dict['0'] = {
     'id': 9999999,
     'slug': 'discours.io',
     'name': 'Дискурс',
-    'userpic': 'https://discours.io/images/logo-mini.svg'
-    }
+    'userpic': 'https://discours.io/images/logo-mini.svg',
+    'createdAt': '2016-03-05 22:22:00.350000'
+}
 
 ts = datetime.now()
 
@@ -29,8 +30,9 @@ type2layout = {
     'Image': 'image'
 }
 
-def migrate(entry, limit=3626, start=0):
-    '''  
+
+def migrate(entry):
+    '''
     type Shout {
         slug: String!
         author: Int!
@@ -41,7 +43,7 @@ def migrate(entry, limit=3626, start=0):
         deletedBy: Int
         rating: Int
         ratigns: [Rating]
-        published: Bool! 
+        published: Bool!
         publishedAt: DateTime # if there is no published field - it is not published
         replyTo: String # another shout
         tags: [String] # actual values
@@ -53,17 +55,19 @@ def migrate(entry, limit=3626, start=0):
         views: Int
     }
     '''
+    content = ''
     r = {
-            'layout': type2layout[entry['type']],
-            'title': entry['title'],
-            'community': 0,
-            'authors': [],
-            'topics': [],
-            'published': entry.get('published', False),
-            'views': entry.get('views', 0),
-            'rating': entry.get('rating', 0),
-            'ratings': []
-        }
+        'layout': type2layout[entry['type']],
+        'title': entry['title'],
+        'community': 0,
+        'authors': [],
+        'topics': [],
+        'published': entry.get('published', False),
+        'views': entry.get('views', 0),
+        'rating': entry.get('rating', 0),
+        'ratings': [],
+        'createdAt': '2016-03-05 22:22:00.350000'
+    }
     r['slug'] = entry.get('slug', '')
     body_orig = entry.get('body', '')
     if not r['slug'] and entry.get('friendlySlugs') is not None:
@@ -88,7 +92,8 @@ def migrate(entry, limit=3626, start=0):
             if body_orig == '':
                 print('EMPTY BODY!')
             else:
-                body_html = str(BeautifulSoup(body_orig, features="html.parser"))
+                body_html = str(BeautifulSoup(
+                    body_orig, features="html.parser"))
                 r['body'] = html2text(body_html).replace('****', '**')
                 r['old_id'] = entry.get('_id')
         else:
@@ -103,20 +108,20 @@ def migrate(entry, limit=3626, start=0):
         if videoUrl == '#':
             print(entry.get('media', 'NO MEDIA!'))
             # raise Exception
-        r['body'] = '<ShoutVideo src=\"' + videoUrl + '\" />' + html2text(m.get('body', '')) # FIXME
+        r['body'] = '<ShoutVideo src=\"' + videoUrl + \
+            '\" />' + html2text(m.get('body', ''))  # FIXME
     elif entry.get('type') == 'Music':
-        r['body'] = '<ShoutMusic media={\"' + json.dumps(entry['media']) +'\"} />' # FIXME
-
+        r['body'] = '<ShoutMusic media={\"' + \
+            json.dumps(entry['media']) + '\"} />'  # FIXME
     if r.get('body') is None:
         body_orig = entry.get('body', '')
         body_html = str(BeautifulSoup(body_orig, features="html.parser"))
         r['body'] = html2text(body_html).replace('****', '**')
         r['old_id'] = entry.get('_id')
-        
     body = r.get('body')
     user = None
     try:
-        userdata = users_dict[entry['createdBy']]
+        userdata = users_dict.get(entry['createdBy'], users_dict['0'])
         slug = userdata['slug']
         name = userdata['name']
         userpic = userdata['userpic']
@@ -137,10 +142,11 @@ def migrate(entry, limit=3626, start=0):
                 user = User.create(**authordata)
             except IntegrityError:
                 with local_session() as session:
-                    user = session.query(User).filter(User.email == authordata['email']).first()
+                    user = session.query(User).filter(
+                        User.email == authordata['email']).first()
                     if user is None:
-                        user = session.query(User).filter(User.slug == authordata['slug']).first()
-                    
+                        user = session.query(User).filter(
+                            User.slug == authordata['slug']).first()
             slug = user['slug']
             name = user['name']
             userpic = user.userpic
@@ -167,15 +173,15 @@ def migrate(entry, limit=3626, start=0):
     post = frontmatter.Post(body, **metadata)
     dumped = frontmatter.dumps(post)
 
-    if entry['published']: 
-        #if r.get('old_id', None):
+    if entry['published']:
+        # if r.get('old_id', None):
         #    ext = 'html'
         #    content = str(body).replace('<p></p>', '').replace('<p> </p>', '')
-        #else:
+        # else:
         ext = 'md'
         content = dumped
-        open('migration/content/' + metadata['layout'] + '/' + r['slug'] + '.' + ext, 'w').write(content)
-
+        open('migration/content/' +
+             metadata['layout'] + '/' + r['slug'] + '.' + ext, 'w').write(content)
 
     try:
         shout_dict = r.copy()
@@ -190,8 +196,8 @@ def migrate(entry, limit=3626, start=0):
             else:
                 shout_dict['publishedAt'] = ts
         del shout_dict['published']
-        del shout_dict['views'] # FIXME
-        del shout_dict['rating'] # FIXME
+        del shout_dict['views']  # FIXME
+        del shout_dict['rating']  # FIXME
         del shout_dict['ratings']
         try:
             s = Shout.create(**shout_dict)
@@ -203,4 +209,4 @@ def migrate(entry, limit=3626, start=0):
         print(r)
         # print(s)
         raise Exception
-    return r
+    return (r, content)
diff --git a/migration/tables/tags.py b/migration/tables/tags.py
index 6e6d80f0..620f7bd1 100644
--- a/migration/tables/tags.py
+++ b/migration/tables/tags.py
@@ -1,20 +1,36 @@
+import json
+
+from os.path import abspath
+from datetime import datetime
+
+users_dict = json.loads(open(abspath('migration/data/users.dict.json')).read())
+users_dict['0'] = {
+    'id': 9999999,
+    'slug': 'discours.io',
+    'name': 'Дискурс',
+    'userpic': 'https://discours.io/images/logo-mini.svg',
+    'createdAt': '2016-03-05 22:22:00.350000'
+    }
+
+ts = datetime.now()
+
 def migrate(entry):
-    ```
+    ''' build a topic dict from a legacy tag entry
     type Topic {
         slug: String! # ID
         createdBy: Int! # User
         createdAt: DateTime!
-        value: String
+        title: String
         parents: [String] # NOTE: topic can have parent topics
         children: [String] # and children
     }
-    ```
-    creator = get_new_user_id(entry['createdBy'])
+    '''
+    creator = users_dict.get(entry['createdBy'], users_dict['0'])  # unknown creators fall back to the discours.io record
     return {
         'slug': entry['slug'],
-        'createdBy': creator_id, # NOTE: uses an old user id
+        'createdBy': creator['id'], # NOTE: id from the users.dict.json record found by entry['createdBy']
         'createdAt': entry['createdAt'],
-        'title': entry['value'].lower(),
+        'title': entry['title'].lower(),
         'parents': [],
         'children': []
     }
\ No newline at end of file
diff --git a/migration/tables/users.py b/migration/tables/users.py
index 5443f783..22917a92 100644
--- a/migration/tables/users.py
+++ b/migration/tables/users.py
@@ -7,76 +7,82 @@ from migration.html2text import html2text
 counter = 0
 
 def migrate(entry, limit=668):
-        '''
-        
-        type User {
-            username: String! # email
-            createdAt: DateTime!
-            email: String
-            password: String
-            oauth: String # provider:token
-            name: String # to display
-            userpic: String
-            links: [String]
-            emailConfirmed: Boolean # should contain all emails too
-            id: Int!
-            muted: Boolean
-            rating: Int
-            roles: [Role]
-            updatedAt: DateTime
-            wasOnlineAt: DateTime
-            ratings: [Rating]
-            slug: String
-            bio: String
-            notifications: [Int] 
-        }
+    ''' create a User from a legacy account entry and return the migrated dict (limit is unused)
 
-        '''
-        res = {}
-        res['old_id'] = entry['_id']
-        res['password'] = entry['services']['password'].get('bcrypt', '')
-        res['username'] = entry['emails'][0]['address']
-        res['email'] = res['username']
-        res['wasOnlineAt'] = parse(entry.get('loggedInAt', entry['createdAt']))
-        res['emailConfirmed'] = entry['emails'][0]['verified']
-        res['createdAt'] = parse(entry['createdAt'])
-        res['rating'] = entry['rating'] # number
-        res['roles'] = [] # entry['roles'] # roles by community
-        res['ratings'] = [] # entry['ratings']
-        res['notifications'] = []
-        res['links'] = []
-        res['muted'] = False
-        res['bio'] = html2text(entry.get('bio', ''))
-        if entry['profile']:
-            res['slug'] = entry['profile'].get('path')
-            res['userpic'] = entry['profile'].get('image', {'thumborId': ''}).get('thumborId', '') # adding 'https://assets.discours.io/unsafe/1600x' in web ui
-            fn = entry['profile'].get('firstName', '')
-            ln = entry['profile'].get('lastName', '')
-            name = res['slug'] if res['slug'] else 'anonymous'
-            name = fn if fn else name
-            name = (name + ' ' + ln) if ln else name
-            name = entry['profile']['path'] if len(name) < 2 else name
-            res['name'] = name
-            fb = entry['profile'].get('facebook', False)
-            if fb:
-                res['links'].append(fb)
-            vk = entry['profile'].get('vkontakte', False)
-            if vk:
-                res['links'].append(vk)
-            tr = entry['profile'].get('twitter', False)
-            if tr:
-                res['links'].append(tr)
-            ws = entry['profile'].get('website', False)
-            if ws:
-                res['links'].append(ws)
-            if not res['slug']:
-                res['slug'] = res['links'][0].split('/')[-1]
-        if not res['slug']:
-            res['slug'] = res['email'].split('@')[0]
-        else:
-            old = res['old_id']
-            del res['old_id']
-            user = User.create(**res.copy())
-            res['id'] = user.id
-            res['old_id'] = old
-            return res
\ No newline at end of file
+    type User {
+        username: String! # email
+        createdAt: DateTime!
+        email: String
+        password: String
+        oauth: String # provider:token
+        name: String # to display
+        userpic: String
+        links: [String]
+        emailConfirmed: Boolean # should contain all emails too
+        id: Int!
+        muted: Boolean
+        rating: Int
+        roles: [Role]
+        updatedAt: DateTime
+        wasOnlineAt: DateTime
+        ratings: [Rating]
+        slug: String
+        bio: String
+        notifications: [Int]
+    }
+
+    '''
+    res = {}
+    res['old_id'] = entry['_id']
+    res['password'] = entry['services']['password'].get('bcrypt', '')
+    res['username'] = entry['emails'][0]['address']
+    res['email'] = res['username']
+    res['wasOnlineAt'] = parse(entry.get('loggedInAt', entry['createdAt']))
+    res['emailConfirmed'] = entry['emails'][0]['verified']
+    res['createdAt'] = parse(entry['createdAt'])
+    res['rating'] = entry['rating']  # number
+    res['roles'] = []  # entry['roles'] # roles by community
+    res['ratings'] = []  # entry['ratings']
+    res['notifications'] = []
+    res['links'] = []
+    res['muted'] = False
+    res['bio'] = html2text(entry.get('bio', ''))
+    # default, so that an entry with an empty profile cannot raise KeyError below
+    res['slug'] = ''
+    if entry['profile']:
+        res['slug'] = entry['profile'].get('path')
+        try:  # NOTE(review): thumborId is read from the profile itself here - confirm
+            res['userpic'] = 'https://assets.discours.io/unsafe/100x/' + entry['profile']['thumborId']
+        except KeyError:
+            try:
+                res['userpic'] = entry['profile']['image']['url']
+            except KeyError:
+                res['userpic'] = ''
+        fn = entry['profile'].get('firstName', '')
+        ln = entry['profile'].get('lastName', '')
+        name = res['slug'] if res['slug'] else 'anonymous'
+        name = fn if fn else name
+        name = (name + ' ' + ln) if ln else name
+        name = entry['profile']['path'] if len(name) < 2 else name
+        res['name'] = name
+        # collect the profile links that are present, in a fixed order
+        for network in ('facebook', 'vkontakte', 'twitter', 'website'):
+            link = entry['profile'].get(network, False)
+            if link:
+                res['links'].append(link)
+        # the last path segment of the first link is a fallback slug
+        if not res['slug'] and res['links']:
+            res['slug'] = res['links'][0].split('/')[-1]
+    if not res['slug']:
+        # last resort: the local part of the email address
+        res['slug'] = res['email'].split('@')[0]
+    res.setdefault('name', res['slug'])
+    res.setdefault('userpic', '')
+    # users with an email-derived slug are created too (they used to be skipped)
+    # old_id is held back from the create() call and restored afterwards
+    old = res['old_id']
+    del res['old_id']
+    user = User.create(**res.copy())
+    res['id'] = user.id
+    res['old_id'] = old
+    return res