export separated

2021-10-09 11:36:14 +03:00 · 2021-10-09 11:36:14 +03:00 · 1714a60e99
commit 1714a60e99
parent fe28c3918c
3 changed files with 33 additions and 19 deletions
--- a/migrate.py
+++ b/migrate.py
@@ -114,6 +114,7 @@ def shouts():
    counter = 0
    discours_author = 0
    content_data = json.loads(open('migration/data/content_items.json').read())
+    content_dict = { x['_id']:x for x in content_data }
    newdata = {}
    print(str(len(content_data)) + ' entries loaded. now migrating...')
    errored = []
@@ -125,7 +126,7 @@ def shouts():
            line = str(counter+1) + ': ' + shout['slug'] + " @" + str(author)
            print(line)
            counter += 1
-            if author == 'discours.io':
+            if author == 'discours':
                discours_author += 1
            open('./shouts.id.log', 'a').write(line + '\n')
        except Exception:
@@ -136,25 +137,35 @@ def shouts():
        limit = int(sys.argv[2]) if len(sys.argv) > 2 else len(content_data)
    except ValueError:
        limit = len(content_data)
-    export_list = [i for i in newdata.items() if i[1]['layout'] == 'article' and i[1]['published']]
-    export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)[:limit]
-    export_clean = {}
-    for (slug, a) in export_list:
-        export_clean[a['slug']] = extract_images(a)
-        metadata = get_metadata(a)
-        content = frontmatter.dumps(frontmatter.Post(a['body'], **metadata))
-        open('../content/discours.io/'+a['slug']+'.md', 'w').write(content)
    open('migration/data/shouts.dict.json',
         'w').write(json.dumps(newdata, cls=DateTimeEncoder))
+    print(str(counter) + '/' + str(len(content_data)) +
+          ' content items were migrated')
+    print(str(discours_author) + ' from them by @discours')
+
+def export_shouts(limit):
+    print('reading json...')
+    newdata = json.loads(open('migration/data/shouts.dict.json', 'r').read())
+    print(str(len(newdata.keys())) + ' loaded')
+    export_list = [i for i in newdata.items() if i[1]['layout'] == 'article' and i[1]['published']]
+    export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)
+    print(str(len(export_list)) + ' filtered')
+    export_list = export_list[:limit or len(export_list)]
+    export_clean = {}
+    for (slug, article) in export_list:
+        if article['layout'] == 'article':
+            export_clean[article['slug']] = extract_images(article)
+            metadata = get_metadata(article)
+            content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
+            open('../content/discours.io/'+slug+'.md', 'w').write(content)
+            # print(slug)
+            # open('../content/discours.io/'+slug+'.html', 'w').write(content_dict[article['old_id']]['body'])
    open('../src/data/articles.json', 'w').write(json.dumps(dict(export_clean),
                                                            cls=DateTimeEncoder,
                                                            indent=4,
                                                            sort_keys=True,
                                                            ensure_ascii=False))
-    print(str(counter) + '/' + str(len(content_data)) +
-          ' content items were migrated')
-    print(str(len(export_list)) + ' shouts were exported')
-    print(str(discours_author) + ' from them by @discours.io')
+    print(str(len(export_clean.items())) + ' exported')


 if __name__ == '__main__':
@@ -176,6 +187,9 @@ if __name__ == '__main__':
            except Exception:
                pass
            shouts()
+        elif sys.argv[1] == "export_shouts":
+          limit = int(sys.argv[2]) if len(sys.argv) > 2 else None
+          export_shouts(limit)
        elif sys.argv[1] == "all":
            users()
            topics()
--- a/migration/html2text.py
+++ b/migration/html2text.py
@@ -463,7 +463,7 @@ class HTML2Text(HTMLParser.HTMLParser):
            if start:
                if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')):
                    self.astack.append(attrs)
-                    self.maybe_automatic_link = attrs['href']
+                    self.maybe_automatic_link = attrs['href'][:2000]
                else:
                    self.astack.append(None)
            else:
--- a/migration/tables/content_items.py
+++ b/migration/tables/content_items.py
@@ -15,7 +15,7 @@ users_dict = json.loads(open(abspath('migration/data/users.dict.json')).read())
 topics_dict = json.loads(open(abspath('migration/data/topics.dict.json')).read()) # old_id keyed
 users_dict['0'] = {
    'id': 9999999,
-    'slug': 'discours.io',
+    'slug': 'discours',
    'name': 'Дискурс',
    'userpic': 'https://discours.io/images/logo-mini.svg',
    'createdAt': '2016-03-05 22:22:00.350000'
@@ -109,7 +109,7 @@ def migrate(entry):
            else:
                body_html = str(BeautifulSoup(
                    body_orig, features="html.parser"))
-                r['body'] = html2text(body_html).replace('****', '**')
+                r['body'] = body_html # html2text(body_html).replace('****', '**')
                r['old_id'] = entry.get('_id')
        else:
            print(r['slug'] + ': literature has no media')
@@ -131,7 +131,7 @@ def migrate(entry):
    if r.get('body') is None:
        body_orig = entry.get('body', '')
        body_html = str(BeautifulSoup(body_orig, features="html.parser"))
-        r['body'] = html2text(body_html).replace('****', '**')
+        r['body'] = body_html # html2text(body_html).replace('****', '**')
        r['old_id'] = entry.get('_id')
    body = r.get('body')
    user = None
@@ -167,7 +167,7 @@ def migrate(entry):
            userpic = user.userpic
        else:
            # no application, no author!
-            slug = 'discours.io'
+            slug = 'discours'
            name = 'Дискурс'
            userpic = 'https://discours.io/images/logo-mini.svg'
    with local_session() as session: