export separated

This commit is contained in:
Untone 2021-10-09 11:36:14 +03:00
parent fe28c3918c
commit 1714a60e99
3 changed files with 33 additions and 19 deletions

View File

@@ -114,6 +114,7 @@ def shouts():
counter = 0
discours_author = 0
content_data = json.loads(open('migration/data/content_items.json').read())
content_dict = { x['_id']:x for x in content_data }
newdata = {}
print(str(len(content_data)) + ' entries loaded. now migrating...')
errored = []
@@ -125,7 +126,7 @@ def shouts():
line = str(counter+1) + ': ' + shout['slug'] + " @" + str(author)
print(line)
counter += 1
if author == 'discours.io':
if author == 'discours':
discours_author += 1
open('./shouts.id.log', 'a').write(line + '\n')
except Exception:
@@ -136,25 +137,35 @@ def shouts():
limit = int(sys.argv[2]) if len(sys.argv) > 2 else len(content_data)
except ValueError:
limit = len(content_data)
export_list = [i for i in newdata.items() if i[1]['layout'] == 'article' and i[1]['published']]
export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)[:limit]
export_clean = {}
for (slug, a) in export_list:
export_clean[a['slug']] = extract_images(a)
metadata = get_metadata(a)
content = frontmatter.dumps(frontmatter.Post(a['body'], **metadata))
open('../content/discours.io/'+a['slug']+'.md', 'w').write(content)
open('migration/data/shouts.dict.json',
'w').write(json.dumps(newdata, cls=DateTimeEncoder))
print(str(counter) + '/' + str(len(content_data)) +
' content items were migrated')
print(str(discours_author) + ' from them by @discours')
def export_shouts(limit):
print('reading json...')
newdata = json.loads(open('migration/data/shouts.dict.json', 'r').read())
print(str(len(newdata.keys())) + ' loaded')
export_list = [i for i in newdata.items() if i[1]['layout'] == 'article' and i[1]['published']]
export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)
print(str(len(export_list)) + ' filtered')
export_list = export_list[:limit or len(export_list)]
export_clean = {}
for (slug, article) in export_list:
if article['layout'] == 'article':
export_clean[article['slug']] = extract_images(article)
metadata = get_metadata(article)
content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
open('../content/discours.io/'+slug+'.md', 'w').write(content)
# print(slug)
# open('../content/discours.io/'+slug+'.html', 'w').write(content_dict[article['old_id']]['body'])
open('../src/data/articles.json', 'w').write(json.dumps(dict(export_clean),
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print(str(counter) + '/' + str(len(content_data)) +
' content items were migrated')
print(str(len(export_list)) + ' shouts were exported')
print(str(discours_author) + ' from them by @discours.io')
print(str(len(export_clean.items())) + ' exported')
if __name__ == '__main__':
@@ -176,6 +187,9 @@ if __name__ == '__main__':
except Exception:
pass
shouts()
elif sys.argv[1] == "export_shouts":
limit = int(sys.argv[2]) if len(sys.argv) > 2 else None
export_shouts(limit)
elif sys.argv[1] == "all":
users()
topics()

View File

@@ -463,7 +463,7 @@ class HTML2Text(HTMLParser.HTMLParser):
if start:
if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')):
self.astack.append(attrs)
self.maybe_automatic_link = attrs['href']
self.maybe_automatic_link = attrs['href'][:2000]
else:
self.astack.append(None)
else:

View File

@@ -15,7 +15,7 @@ users_dict = json.loads(open(abspath('migration/data/users.dict.json')).read())
topics_dict = json.loads(open(abspath('migration/data/topics.dict.json')).read()) # old_id keyed
users_dict['0'] = {
'id': 9999999,
'slug': 'discours.io',
'slug': 'discours',
'name': 'Дискурс',
'userpic': 'https://discours.io/images/logo-mini.svg',
'createdAt': '2016-03-05 22:22:00.350000'
@@ -109,7 +109,7 @@ def migrate(entry):
else:
body_html = str(BeautifulSoup(
body_orig, features="html.parser"))
r['body'] = html2text(body_html).replace('****', '**')
r['body'] = body_html # html2text(body_html).replace('****', '**')
r['old_id'] = entry.get('_id')
else:
print(r['slug'] + ': literature has no media')
@@ -131,7 +131,7 @@ def migrate(entry):
if r.get('body') is None:
body_orig = entry.get('body', '')
body_html = str(BeautifulSoup(body_orig, features="html.parser"))
r['body'] = html2text(body_html).replace('****', '**')
r['body'] = body_html # html2text(body_html).replace('****', '**')
r['old_id'] = entry.get('_id')
body = r.get('body')
user = None
@@ -167,7 +167,7 @@ def migrate(entry):
userpic = user.userpic
else:
# no application, no author!
slug = 'discours.io'
slug = 'discours'
name = 'Дискурс'
userpic = 'https://discours.io/images/logo-mini.svg'
with local_session() as session: