from html.parser import HTMLParser
import os
import codecs
from typing import Tuple


class Converter(HTMLParser):
    """Incrementally convert an HTML document into Markdown text.

    Feed the HTML with ``feed()``; the generated Markdown accumulates in
    ``md_file`` and every image source / non-http, non-html, non-mail link
    target seen along the way is collected in ``related_data``.

    Tables are not converted: the raw ``<table>...</table>`` markup is copied
    into the output verbatim.  The contents of ``<style>``, ``<svg>``,
    ``<symbol>`` and ``<path>`` elements are dropped.

    NOTE: raw table extraction maps ``getpos()`` line/column pairs onto
    ``self.rawdata``, so it assumes the document is passed in a single
    ``feed()`` call (line numbers keep counting across calls while the
    buffer does not).
    """

    md_file: str
    temp_tag: str
    code_box: bool
    div_count: int
    span_count: int
    code_box_div_num: int
    ol_count: int
    related_data: list
    is_link: bool
    link_ref: str
    ignore_data: bool
    class_div_count: int
    ignore_div: bool
    table_start: Tuple[int, int]

    # Markdown emitted when a "simple" tag opens.
    _OPEN_MARKUP = {
        'sup': '',
        'p': '\n',
        'i': '*',
        'wbr': '',
        'span': ' ',
        'figcaption': '',
        'hr': '\n*** \n',
        'title': '# ',
        'h1': '# ',
        'h2': '## ',
        'h3': '### ',
        'h4': '#### ',
        'h5': '##### ',
        'h6': '###### ',
        'b': '**',
        'strong': '**',
    }

    # Markdown emitted when a "simple" tag closes.
    _CLOSE_MARKUP = {
        'b': '** ',
        'strong': '** ',
        'sup': '',
        'wbr': '',
        'title': '\n',
        'h1': '\n',
        'h2': '\n',
        'h3': '\n',
        'h4': '\n',
        'h5': '\n',
        'h6': '\n',
        'span': ' ',
        'figcaption': '\n',
        'i': '* ',
        'p': '\n',
        'li': ' \n',
    }

    # Tags recorded in ``temp_tag`` when they open.
    _TRACKED_TAGS = frozenset({'p', 'i', 'wbr', 'span'})

    # Tags whose whole subtree is skipped (tables are re-emitted raw).
    _IGNORED_TAGS = frozenset({'style', 'symbol', 'svg', 'path', 'table'})

    def __init__(self):
        super().__init__()
        self.md_file = ''
        self.code_box = False
        self.div_count = 0
        self.span_count = 0
        self.code_box_div_num = 0
        self.ol_count = 0
        self.temp_tag = ''
        self.related_data = []
        self.is_link = False
        self.link_ref = ''
        self.ignore_data = False
        self.class_div_count = 0
        self.ignore_div = False
        self.table_start = (1, 0)
        # Tag that switched ``ignore_data`` on, and how many of that same tag
        # are currently open, so only the matching closer ends the region.
        self._ignore_tag = ''
        self._ignore_depth = 0
        # Number of nested elements that asked for a code fence; only the
        # outermost one actually writes the fence markers.
        self._fence_depth = 0
        # One ``[kind, item_counter]`` entry per currently open list.
        self._list_stack = []

    # ------------------------------------------------------------------
    # HTMLParser callbacks
    # ------------------------------------------------------------------
    def handle_starttag(self, tag, attrs):
        """Emit the Markdown that corresponds to an opening *tag*."""
        if self.ignore_data:
            # Count nested occurrences of the ignoring tag (e.g. a table in
            # a table) so the region ends at the right closing tag.
            if tag == self._ignore_tag:
                self._ignore_depth += 1
            return None

        if tag in self._IGNORED_TAGS:
            self._begin_ignore(tag)
            return None

        if tag in self._TRACKED_TAGS:
            self.temp_tag = tag
        if tag == 'span':
            self.span_count += 1

        if tag in self._OPEN_MARKUP:
            self.md_file += self._OPEN_MARKUP[tag]
        elif tag == 'ul' or tag == 'ol':
            self.temp_tag = tag
            if tag == 'ol':
                self.ol_count = 0
            self._list_stack.append([tag, 0])
            self.md_file += ' \n'
        elif tag == 'li':
            self.md_file += self._next_list_marker()
        elif tag == 'div':
            self.div_count += 1
            attrs_dict = dict(attrs)
            # A valueless ``style`` attribute is reported as None.
            style = attrs_dict.get('style') or ''
            if 'codeblock' in style:
                if self._fence_depth == 0:
                    self.code_box_div_num = self.div_count
                    self._open_fence('```\n')
            elif 'class' in attrs_dict:
                self.class_div_count = self.div_count
                self.ignore_div = True
        elif tag == 'pre' or tag == 'code':
            self._open_fence('\n```\n')
        elif tag == 'a':
            self.is_link = True
            href = dict(attrs).get('href')
            # Missing or valueless href falls back to '#'.
            self.link_ref = '#' if href is None else href
            if (not self.link_ref.startswith('http')
                    and not self.link_ref.endswith('html')
                    and '@' not in self.link_ref):
                self.related_data.append(self.link_ref)
        elif tag == 'img':
            self._append_image(attrs, 'alt', 'x', linkable=True)
        else:
            print('<' + tag + '>')
        return None

    def get_rawdata(self, start, stop, offset):
        """Return ``[start:stop]`` of the raw buffer counted from line *offset*.

        *offset* is a 1-based line number; *start* and *stop* are slice
        bounds relative to the beginning of that line.
        """
        return self.rawdata[self._absolute_index(offset, 0):][start:stop]

    def handle_endtag(self, tag):
        """Emit the Markdown that corresponds to a closing *tag*."""
        if self.ignore_data:
            # Inside an ignored region only the closer that matches the
            # opening tag matters; everything else must leave no trace.
            if tag == self._ignore_tag:
                self._ignore_depth -= 1
                if self._ignore_depth <= 0:
                    self._end_ignore(tag)
            return

        if tag == 'span':
            self.span_count -= 1

        if tag in self._CLOSE_MARKUP:
            self.md_file += self._CLOSE_MARKUP[tag]
        elif tag == 'div':
            if self.code_box and self.code_box_div_num == self.div_count:
                # The div owns the fence: close it even if an inner
                # <pre>/<code> was left unclosed.
                self._close_fence(force=True)
            elif self.ignore_div and self.class_div_count == self.div_count:
                self.ignore_div = False
            else:
                self.md_file += ' \n'
            self.div_count -= 1
        elif tag == 'pre' or tag == 'code':
            self._close_fence()
        elif tag == 'a':
            self.is_link = False
        elif tag == 'ul' or tag == 'ol':
            if self._list_stack:
                self._list_stack.pop()
        elif tag in self._IGNORED_TAGS or tag == 'iframe':
            # Stray closer with no ignored region open: nothing to undo.
            pass
        else:
            print('</' + tag + '>')

    def handle_startendtag(self, tag, attrs):
        """Emit the Markdown for a self-closing tag such as ``<br />``."""
        if self.ignore_data:
            return
        if tag == 'br':
            self.md_file += ' \n'
        elif tag == 'wbr':
            self.md_file += ''
        elif tag == 'hr':
            self.md_file += '\n*** \n'
        elif tag == 'img':
            self._append_image(attrs, 'data-filename', 'image', linkable=False)
        else:
            print("<" + tag + " />")

    def handle_data(self, data):
        """Append text content, wrapped as a link when inside ``<a>``."""
        if self.ignore_data:
            return
        if self.is_link:
            self.md_file += f'[{data}]({self.link_ref})'
        else:
            self.md_file += data

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _begin_ignore(self, tag):
        """Start skipping everything up to the closer matching *tag*."""
        self.ignore_data = True
        self._ignore_tag = tag
        self._ignore_depth = 1
        if tag == 'table':
            self.table_start = self.getpos()

    def _end_ignore(self, tag):
        """Stop skipping; a finished table is copied to the output raw."""
        if tag == 'table':
            self.md_file += '\n' + self._raw_table(tag)
        self.ignore_data = False
        self._ignore_tag = ''
        self._ignore_depth = 0

    def _absolute_index(self, lineno, column):
        """Translate a ``getpos()``-style pair into an index of ``rawdata``."""
        index = 0
        for _ in range(lineno - 1):
            newline = self.rawdata.find('\n', index)
            if newline == -1:
                break
            index = newline + 1
        return index + column

    def _raw_table(self, tag):
        """Return the raw markup from the recorded table start to its closer."""
        start_line, start_col = self.table_start
        end_line, end_col = self.getpos()
        begin = self._absolute_index(start_line, start_col)
        closer = self._absolute_index(end_line, end_col)
        bracket = self.rawdata.find('>', closer)
        # Fall back to the nominal length of "</tag>" if '>' is not found.
        finish = bracket + 1 if bracket != -1 else closer + len(tag) + 3
        return self.rawdata[begin:finish]

    def _open_fence(self, opening):
        """Enter a code block, writing *opening* only for the outermost one."""
        self._fence_depth += 1
        if self._fence_depth == 1:
            self.code_box = True
            self.md_file += opening

    def _close_fence(self, force=False):
        """Leave a code block, writing the closing fence on the last exit."""
        if self._fence_depth == 0:
            return
        self._fence_depth = 0 if force else self._fence_depth - 1
        if self._fence_depth == 0:
            self.code_box = False
            # The closing fence has to start on a line of its own.
            if self.md_file and not self.md_file.endswith('\n'):
                self.md_file += '\n'
            self.md_file += '```\n'

    def _next_list_marker(self):
        """Return the bullet or number for the next item of the open list."""
        if self._list_stack and self._list_stack[-1][0] == 'ol':
            self._list_stack[-1][1] += 1
            self.ol_count = self._list_stack[-1][1]
            return f'{self.ol_count}. '
        return '* '

    def _append_image(self, attrs, name_attr, default_name, linkable):
        """Emit an image, using *name_attr* (or *default_name*) as its label.

        Images without a ``src`` are skipped.  When *linkable* is true and
        the image sits inside an anchor, it is wrapped in that link.
        """
        attrs_dict = dict(attrs)
        img_ref = attrs_dict.get('src')
        if img_ref is None:
            return
        name = attrs_dict.get(name_attr)
        if name is None:
            name = default_name
        self.related_data.append(img_ref)
        if linkable and self.is_link:
            self.md_file += f'[![{name}]({img_ref})]({self.link_ref})'
        else:
            self.md_file += f'![{name}]({img_ref})'