2021-08-20 09:27:19 +00:00
|
|
|
from html.parser import HTMLParser
|
|
|
|
import os
|
|
|
|
import codecs
|
|
|
|
from typing import Tuple
|
|
|
|
|
|
|
|
|
|
|
|
class Converter(HTMLParser):
    """Incremental HTML-to-Markdown converter built on ``HTMLParser``.

    Feed an HTML document with ``feed()``; the generated Markdown accumulates
    in ``md_file``.  Link targets that are neither ``http`` URLs, ``html``
    pages nor mail-like references, and every image ``src``, are collected in
    ``related_data``.

    Tables are not converted: their raw HTML is copied verbatim from the
    parser buffer.  That copy relies on ``getpos()`` line numbers matching the
    buffer, so the document should be passed in a single ``feed()`` call.
    """

    md_file: str
    temp_tag: str
    code_box: bool
    div_count: int
    span_count: int
    code_box_div_num: int
    ol_count: int
    related_data: list
    is_link: bool
    link_ref: str
    ignore_data: bool
    class_div_count: int
    ignore_div: bool
    table_start: Tuple[int, int]

    # Markdown prefix written when a heading-like element opens.
    _HEADING_PREFIX = {
        'title': '# ',
        'h1': '# ',
        'h2': '## ',
        'h3': '### ',
        'h4': '#### ',
        'h5': '##### ',
        'h6': '###### ',
    }
    # Elements whose whole content is skipped (a table is copied as raw HTML).
    _SKIPPED_ELEMENTS = frozenset(('style', 'symbol', 'svg', 'path', 'table'))

    def __init__(self):
        """Initialise the parser and reset all conversion state."""
        super().__init__()
        self.md_file = ''
        self.code_box = False
        self.div_count = 0
        self.span_count = 0
        self.code_box_div_num = 0
        self.ol_count = 0
        self.temp_tag = ''
        self.related_data = []
        self.is_link = False
        self.link_ref = ''
        self.ignore_data = False
        self.class_div_count = 0
        self.ignore_div = False
        self.table_start = (1, 0)
        # Tag that opened the current skipped region and its nesting depth,
        # so only the matching end tag leaves the region.
        self._skip_tag = ''
        self._skip_depth = 0
        # One [kind, item_count] entry per open <ul>/<ol>; keeps list context
        # intact even when other tags overwrite ``temp_tag``.
        self._list_stack = []
        # Number of currently open <pre>/<code> elements.
        self._code_depth = 0

    # ------------------------------------------------------------------ #
    # helpers
    # ------------------------------------------------------------------ #
    def _line_start(self, lineno):
        """Return the buffer index at which 1-based line ``lineno`` starts."""
        index = 0
        for _ in range(lineno - 1):
            newline = self.rawdata.find('\n', index)
            if newline == -1:
                break
            index = newline + 1
        return index

    def _record_image(self, attrs_dict):
        """Remember the image ``src`` in ``related_data``; return it (or None)."""
        src = attrs_dict.get('src')
        if src:
            self.related_data.append(src)
        return src

    def _emit_image(self, attrs_dict, label_attr, fallback):
        """Write Markdown for an image, wrapped in the open link if any.

        ``label_attr`` names the attribute used as the image label and
        ``fallback`` is the label used when that attribute is absent.
        Images without a ``src`` produce no output.
        """
        src = self._record_image(attrs_dict)
        if not src:
            return
        label = attrs_dict.get(label_attr, fallback) or ''
        image = f'![{label}]({src})'
        if self.is_link:
            image = f'[{image}]({self.link_ref})'
        self.md_file += image

    def _close_fence(self):
        """Terminate the open code fence, putting it on a line of its own."""
        if not self.md_file.endswith('\n'):
            self.md_file += '\n'
        self.md_file += '```\n'
        self.code_box = False

    def _close_skipped(self, tag):
        """Handle an end tag seen while inside a skipped region."""
        if tag != self._skip_tag:
            return
        self._skip_depth -= 1
        if self._skip_depth > 0:
            # Closing a nested element of the same kind; still skipping.
            return
        if tag == 'table':
            start_line, start_col = self.table_start
            end_line, end_col = self.getpos()
            first = self._line_start(start_line) + start_col
            # getpos() points at the '<' of the end tag; step over '</table>'.
            last = self._line_start(end_line) + end_col + len(tag) + 3
            self.md_file += '\n' + self.rawdata[first:last]
        self.ignore_data = False
        self._skip_tag = ''
        self._skip_depth = 0

    # ------------------------------------------------------------------ #
    # HTMLParser callbacks
    # ------------------------------------------------------------------ #
    def handle_starttag(self, tag, attrs):
        """Translate an opening tag into Markdown or update parser state."""
        if self.ignore_data:
            if tag == self._skip_tag:
                self._skip_depth += 1
            elif tag == 'img':
                # The raw table HTML still references the image.
                self._record_image(dict(attrs))
            return None

        if tag in self._HEADING_PREFIX:
            self.md_file += self._HEADING_PREFIX[tag]
        elif tag == 'sup':
            self.md_file += '<sup>'
        elif tag == 'p':
            self.temp_tag = 'p'
            self.md_file += '\n'
        elif tag == 'i':
            self.temp_tag = 'i'
            self.md_file += '*'
        elif tag == 'wbr':
            self.temp_tag = 'wbr'
        elif tag == 'span':
            self.temp_tag = 'span'
            self.span_count += 1
            self.md_file += ' '
        elif tag == 'figcaption':
            pass
        elif tag == 'br':
            self.md_file += ' \n'
        elif tag == 'hr':
            self.md_file += '\n*** \n'
        elif tag == 'b' or tag == 'strong':
            self.md_file += '**'
        elif tag == 'ul':
            self.temp_tag = 'ul'
            self._list_stack.append(['ul', 0])
            self.md_file += ' \n'
        elif tag == 'ol':
            self.ol_count = 0
            self.temp_tag = 'ol'
            self._list_stack.append(['ol', 0])
            self.md_file += ' \n'
        elif tag == 'li':
            if self._list_stack:
                current = self._list_stack[-1]
                if current[0] == 'ul':
                    self.md_file += '* '
                else:
                    current[1] += 1
                    self.ol_count = current[1]
                    self.md_file += f'{self.ol_count}. '
        elif tag == 'div':
            self.div_count += 1
            attrs_dict = dict(attrs)
            # A valueless ``style`` attribute is reported as None.
            style = attrs_dict.get('style') or ''
            if 'codeblock' in style:
                if not self.code_box:
                    self.code_box_div_num = self.div_count
                    self.code_box = True
                    self.md_file += '```\n'
            elif 'class' in attrs_dict:
                self.class_div_count = self.div_count
                self.ignore_div = True
        elif tag == 'pre' or tag == 'code':
            self._code_depth += 1
            # Only the outermost element opens a fence (<pre><code> is one block).
            if not self.code_box:
                self.code_box = True
                self.md_file += '\n```\n'
        elif tag == 'a':
            self.is_link = True
            href = dict(attrs).get('href', '#')
            # A valueless ``href`` attribute is reported as None.
            self.link_ref = '#' if href is None else href
            ref = self.link_ref
            if not ref.startswith('http') and not ref.endswith('html') and '@' not in ref:
                self.related_data.append(ref)
        elif tag in self._SKIPPED_ELEMENTS:
            self.ignore_data = True
            self._skip_tag = tag
            self._skip_depth = 1
            if tag == 'table':
                self.table_start = self.getpos()
        elif tag == 'img':
            self._emit_image(dict(attrs), 'alt', 'x')
        else:
            print('<' + tag + '>')

    def get_rawdata(self, start, stop, offset):
        """Return ``[start:stop]`` of the buffer counted from line ``offset``.

        ``offset`` is a 1-based line number; ``start`` and ``stop`` are slice
        bounds relative to the first character of that line.
        """
        return self.rawdata[self._line_start(offset):][start:stop]

    def handle_endtag(self, tag):
        """Translate a closing tag into Markdown or update parser state."""
        if self.ignore_data:
            # Inside a skipped region only its own end tag has any effect.
            self._close_skipped(tag)
            return

        if tag == 'b' or tag == 'strong':
            self.md_file += '** '
        elif tag == 'sup':
            self.md_file += '</sup>'
        elif tag in self._HEADING_PREFIX:
            self.md_file += '\n'
        elif tag == 'span':
            self.span_count -= 1
            self.md_file += ' '
        elif tag == 'figcaption':
            self.md_file += '\n'
        elif tag == 'i':
            self.md_file += '* '
        elif tag == 'p':
            self.md_file += '\n'
        elif tag == 'div':
            if self.code_box_div_num and self.code_box_div_num == self.div_count:
                self.code_box_div_num = 0
                self._close_fence()
            elif self.ignore_div and self.class_div_count == self.div_count:
                self.ignore_div = False
            else:
                self.md_file += ' \n'
            # A stray </div> must not drive the depth counter negative.
            if self.div_count > 0:
                self.div_count -= 1
        elif tag == 'pre' or tag == 'code':
            if self._code_depth > 0:
                self._code_depth -= 1
                # Close only a fence opened by <pre>/<code>, not by a div.
                if self._code_depth == 0 and self.code_box and not self.code_box_div_num:
                    self._close_fence()
        elif tag == 'a':
            self.is_link = False
        elif tag == 'li':
            self.md_file += ' \n'
        elif tag == 'ul' or tag == 'ol':
            if self._list_stack:
                self._list_stack.pop()
            if self._list_stack and self._list_stack[-1][0] == 'ol':
                # Resume the numbering of the enclosing ordered list.
                self.ol_count = self._list_stack[-1][1]
        elif tag in self._SKIPPED_ELEMENTS or tag in ('iframe', 'wbr', 'br'):
            # Unmatched end tag outside any skipped region: nothing to emit.
            pass
        else:
            print('</' + tag + '>')

    def handle_startendtag(self, tag, attrs):
        """Translate a self-closing tag (``<tag />``) into Markdown."""
        if self.ignore_data:
            if tag == 'img':
                self._record_image(dict(attrs))
            return

        if tag == 'br':
            self.md_file += ' \n'
        elif tag == 'wbr':
            pass
        elif tag == 'hr':
            self.md_file += '\n*** \n'
        elif tag == 'img':
            self._emit_image(dict(attrs), 'data-filename', 'image')
        else:
            print("<" + tag + " />")

    def handle_data(self, data):
        """Append text content, as a Markdown link when inside ``<a>``."""
        if self.ignore_data:
            return
        if self.is_link:
            self.md_file += f'[{data}]({self.link_ref})'
        else:
            self.md_file += data
|