discours content decode

This commit is contained in:
Untone 2021-10-16 10:19:39 +03:00
parent 14fdfe71e5
commit 2a6baa7404
3 changed files with 60 additions and 36 deletions

View File

@@ -21,7 +21,7 @@ if __name__ == '__main__':
import sys
users_data = json.loads(open('migration/data/users.json').read())
users_dict = { x['_id']: x for x in users_data } # by id
# users_dict = { x['_id']: x for x in users_data } # by id
print(str(len(users_data)) + ' users loaded')
users_by_oid = {}
users_by_slug = {}
@@ -49,7 +49,8 @@ if __name__ == '__main__':
for old_comment in comments_data:
cid = old_comment['contentItem']
comments_by_post[cid] = comments_by_post.get(cid, [])
comments_by_post[cid].append(old_comment)
if 'deletedAt' not in old_comment:
comments_by_post[cid].append(old_comment)
print(str(len(comments_by_post.keys())) + ' articles with comments')
export_articles = {} # slug: shout
@@ -77,7 +78,7 @@ if __name__ == '__main__':
return article
def users():
def users(users_by_oid, users_by_slug, users_data):
''' migrating users first '''
# limiting
limit = len(users_data)
@@ -102,7 +103,7 @@ if __name__ == '__main__':
print(str(len(users_by_slug.items())) + ' users migrated')
def topics():
def topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data):
''' topics from categories and tags '''
# limiting
limit = len(cats_data) + len(tags_data)
@@ -133,7 +134,7 @@ if __name__ == '__main__':
sort_keys=True,
ensure_ascii=False))
def shouts():
def shouts(content_data, shouts_by_slug, shouts_by_oid):
''' migrating content items one by one '''
# limiting
limit = len(content_data)
@@ -168,7 +169,7 @@ if __name__ == '__main__':
print(str(counter) + '/' + str(len(content_data)) + ' content items were migrated')
print(str(discours_author) + ' authored by @discours')
def export_shouts(shouts_by_slug, export_articles, export_authors):
def export_shouts(shouts_by_slug, export_articles, export_authors, content_dict):
# update what was just migrated or load json again
if len(export_authors.keys()) == 0:
export_authors = json.loads(open('../src/data/authors.json').read())
@@ -190,33 +191,33 @@ if __name__ == '__main__':
for (slug, article) in export_list:
if article['layout'] == 'article':
export_slug(slug, export_articles, export_authors)
export_slug(slug, export_articles, export_authors, content_dict)
def export_body(article):
def export_body(article, content_dict):
article = extract_images(article)
metadata = get_metadata(article)
content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
open('../content/discours.io/'+slug+'.md', 'w').write(content)
open('../content/discours.io/'+slug+'.html', 'w').write(content_dict[article['old_id']]['body'])
def export_slug(slug, export_articles, export_authors):
if exported_authors == {}:
exported_authors = json.loads(open('../src/data/authors.json').read())
print(str(len(exported_authors.items())) + ' exported authors loaded')
if exported_articles == {}:
exported_articles = json.loads(open('../src/data/articles.json').read())
print(str(len(exported_articles.items())) + ' exported articles loaded')
def export_slug(slug, export_articles, export_authors, content_dict):
print('exporting %s ' % slug)
if export_authors == {}:
export_authors = json.loads(open('../src/data/authors.json').read())
print(str(len(export_authors.items())) + ' exported authors loaded')
if export_articles == {}:
export_articles = json.loads(open('../src/data/articles.json').read())
print(str(len(export_articles.items())) + ' exported articles loaded')
shout = shouts_by_slug.get(slug, False)
assert shout, 'no data error'
author = users_by_slug.get(shout['authors'][0]['slug'], None)
exported_authors.update({shout['authors'][0]['slug']: author})
exported_articles.update({shout['slug']: shout})
export_body(shout)
export_authors.update({shout['authors'][0]['slug']: author})
export_articles.update({shout['slug']: shout})
export_body(shout, content_dict)
comments([slug, ])
def comments(sluglist = []):
def comments(sluglist, export_comments, export_articles, shouts_by_slug, content_dict):
''' migrating comments on content items one '''
if len(sluglist) == 0:
export_articles = json.loads(open('../src/data/articles.json').read())
@@ -224,7 +225,8 @@ if __name__ == '__main__':
if len(sluglist) == 0: sluglist = list(export_articles.keys())
if len(sluglist) > 0:
print('exporting comments for exact articles...')
print('exporting comments for: ')
print(' '.join(sluglist))
for slug in sluglist:
shout = shouts_by_slug[slug]
old_id = shout['old_id']
@@ -282,9 +284,9 @@ if __name__ == '__main__':
if len(sys.argv) > 1:
cmd = sys.argv[1]
if cmd == "users":
users(users_by_oid, users_by_slug, users_data, users_dict)
users(users_by_oid, users_by_slug, users_data)
elif cmd == "topics":
topics(topics_by_cat, topics_by_tag, topics_by_slug)
topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data)
elif cmd == "shouts":
try:
Community.create(**{
@@ -298,19 +300,23 @@ if __name__ == '__main__':
pass
shouts(shouts_by_slug, shouts_by_oid) # NOTE: listens limit
elif cmd == "comments":
comments()
cl = sys.argv[2] if len(sys.argv) > 2 else 10
topCommented = sorted([ c[0] for c in comments_by_post.items()], reverse=True, key=lambda i: len(i[1]))[-cl:]
comments(topCommented, export_comments, export_articles, shouts_by_slug, content_dict)
elif cmd == "export_shouts":
export_shouts(shouts_by_slug, export_articles, export_authors)
export_shouts(shouts_by_slug, export_articles, export_authors, content_dict)
elif cmd == "all":
users()
topics()
shouts()
comments()
users(users_by_oid, users_by_slug, users_data)
topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data)
shouts(content_data, shouts_by_slug, shouts_by_oid)
cl = sys.argv[2] if len(sys.argv) > 2 else 10
topCommented = sorted([ c[0] for c in comments_by_post.items()], reverse=True, key=lambda i: len(i[1]))[-cl:]
comments(topCommented, export_comments, export_articles, shouts_by_slug, content_dict)
elif cmd == "bson":
from migration import bson2json
bson2json.json_tables()
elif cmd == 'slug':
export_slug(sys.argv[2], export_articles, export_authors)
export_slug(sys.argv[2], export_articles, export_authors, content_dict)
export_finish(export_articles, export_authors, export_topics, export_comments)
else:
print('''

View File

@@ -86,6 +86,9 @@ class HTML2Text(html.parser.HTMLParser):
self.tag_callback = None
self.open_quote = config.OPEN_QUOTE # covered in cli
self.close_quote = config.CLOSE_QUOTE # covered in cli
self.header_id = None
self.span_hightlight = False
self.span_lead = False
if out is None:
self.out = self.outtextf
@@ -347,18 +350,34 @@ class HTML2Text(html.parser.HTMLParser):
self.space = False
self.o(hn(tag) * "#" + " ")
self.o("[")
else:
self.p_p = 0 # don't break up link name
self.inheader = False
return # prevent redundant emphasis marks on headers
self.header_id = attrs.get('id')
else:
self.p()
if start:
self.inheader = True
self.o(hn(tag) * "#" + " ")
if self.header_id:
self.o(' {#' + self.header_id + '}')
self.header_id = None
else:
self.inheader = False
return # prevent redundant emphasis marks on headers
if tag == 'span':
if start and 'class' in attrs:
if attrs['class'] == 'highlight':
self.o('`') # NOTE: same as <code>
self.span_hightlight = True
elif attrs['class'] == 'lead':
self.o('==') # NOTE: but CriticMarkup uses {== ==}
self.span_lead = True
else:
if self.span_hightlight:
self.o('`')
self.span_hightlight = False
elif self.span_lead:
self.o('==')
self.span_lead = False
if tag in ["p", "div"]:
if self.google_doc:

View File

@@ -17,7 +17,7 @@ BODY_WIDTH = 78
# Don't show internal links (href="#local-anchor") -- corresponding link
# targets won't be visible in the plain text file anyway.
SKIP_INTERNAL_LINKS = True
SKIP_INTERNAL_LINKS = False
# Use inline, rather than reference, formatting for images and links
INLINE_LINKS = True
@@ -25,7 +25,6 @@ INLINE_LINKS = True
# Protect links from line breaks surrounding them with angle brackets (in
# addition to their square brackets)
PROTECT_LINKS = False
# WRAP_LINKS = True
WRAP_LINKS = True
# Wrap list items.