configured isort, black, flake8
This commit is contained in:
@@ -33,7 +33,7 @@ __version__ = (2020, 1, 16)
|
||||
# TODO: Support decoded entities with UNIFIABLE.
|
||||
|
||||
|
||||
class HTML2Text(html.parser.HTMLParser):
|
||||
class HTML2Text(html.parser.HTMLParser): # noqa: C901
|
||||
def __init__(
|
||||
self,
|
||||
out: Optional[OutCallback] = None,
|
||||
@@ -85,7 +85,7 @@ class HTML2Text(html.parser.HTMLParser):
|
||||
self.tag_callback = None
|
||||
self.open_quote = config.OPEN_QUOTE # covered in cli
|
||||
self.close_quote = config.CLOSE_QUOTE # covered in cli
|
||||
self.header_id = None
|
||||
self.header_id: str | None = None
|
||||
self.span_highlight = False
|
||||
self.span_lead = False
|
||||
|
||||
@@ -119,9 +119,7 @@ class HTML2Text(html.parser.HTMLParser):
|
||||
self.lastWasList = False
|
||||
self.style = 0
|
||||
self.style_def = {} # type: Dict[str, Dict[str, str]]
|
||||
self.tag_stack = (
|
||||
[]
|
||||
) # type: List[Tuple[str, Dict[str, Optional[str]], Dict[str, str]]]
|
||||
self.tag_stack = [] # type: List[Tuple[str, Dict[str, Optional[str]], Dict[str, str]]]
|
||||
self.emphasis = 0
|
||||
self.drop_white_space = 0
|
||||
self.inheader = False
|
||||
@@ -227,7 +225,7 @@ class HTML2Text(html.parser.HTMLParser):
|
||||
return i
|
||||
return None
|
||||
|
||||
def handle_emphasis(
|
||||
def handle_emphasis( # noqa: C901
|
||||
self, start: bool, tag_style: Dict[str, str], parent_style: Dict[str, str]
|
||||
) -> None:
|
||||
"""
|
||||
@@ -300,7 +298,7 @@ class HTML2Text(html.parser.HTMLParser):
|
||||
if strikethrough:
|
||||
self.quiet -= 1
|
||||
|
||||
def handle_tag(
|
||||
def handle_tag( # noqa: C901
|
||||
self, tag: str, attrs: Dict[str, Optional[str]], start: bool
|
||||
) -> None:
|
||||
self.current_tag = tag
|
||||
@@ -333,9 +331,7 @@ class HTML2Text(html.parser.HTMLParser):
|
||||
tag_style = element_style(attrs, self.style_def, parent_style)
|
||||
self.tag_stack.append((tag, attrs, tag_style))
|
||||
else:
|
||||
dummy, attrs, tag_style = (
|
||||
self.tag_stack.pop() if self.tag_stack else (None, {}, {})
|
||||
)
|
||||
dummy, attrs, tag_style = self.tag_stack.pop() if self.tag_stack else (None, {}, {})
|
||||
if self.tag_stack:
|
||||
parent_style = self.tag_stack[-1][2]
|
||||
|
||||
@@ -385,11 +381,7 @@ class HTML2Text(html.parser.HTMLParser):
|
||||
):
|
||||
self.o("`") # NOTE: same as <code>
|
||||
self.span_highlight = True
|
||||
elif (
|
||||
self.current_class == "lead"
|
||||
and not self.inheader
|
||||
and not self.span_highlight
|
||||
):
|
||||
elif self.current_class == "lead" and not self.inheader and not self.span_highlight:
|
||||
# self.o("==") # NOTE: CriticMarkup {==
|
||||
self.span_lead = True
|
||||
else:
|
||||
@@ -479,11 +471,7 @@ class HTML2Text(html.parser.HTMLParser):
|
||||
and not self.span_lead
|
||||
and not self.span_highlight
|
||||
):
|
||||
if (
|
||||
start
|
||||
and self.preceding_data
|
||||
and self.preceding_data[-1] == self.strong_mark[0]
|
||||
):
|
||||
if start and self.preceding_data and self.preceding_data[-1] == self.strong_mark[0]:
|
||||
strong = " " + self.strong_mark
|
||||
self.preceding_data += " "
|
||||
else:
|
||||
@@ -548,13 +536,8 @@ class HTML2Text(html.parser.HTMLParser):
|
||||
"href" in attrs
|
||||
and not attrs["href"].startswith("#_ftn")
|
||||
and attrs["href"] is not None
|
||||
and not (
|
||||
self.skip_internal_links and attrs["href"].startswith("#")
|
||||
)
|
||||
and not (
|
||||
self.ignore_mailto_links
|
||||
and attrs["href"].startswith("mailto:")
|
||||
)
|
||||
and not (self.skip_internal_links and attrs["href"].startswith("#"))
|
||||
and not (self.ignore_mailto_links and attrs["href"].startswith("mailto:"))
|
||||
):
|
||||
self.astack.append(attrs)
|
||||
self.maybe_automatic_link = attrs["href"]
|
||||
@@ -591,7 +574,7 @@ class HTML2Text(html.parser.HTMLParser):
|
||||
|
||||
if tag == "img" and start and not self.ignore_images:
|
||||
# skip cloudinary images
|
||||
if "src" in attrs and "cloudinary" not in attrs["src"]:
|
||||
if "src" in attrs and ("cloudinary" not in attrs["src"]):
|
||||
assert attrs["src"] is not None
|
||||
if not self.images_to_alt:
|
||||
attrs["href"] = attrs["src"]
|
||||
@@ -638,9 +621,7 @@ class HTML2Text(html.parser.HTMLParser):
|
||||
self.o("![" + escape_md(alt) + "]")
|
||||
if self.inline_links:
|
||||
href = attrs.get("href") or ""
|
||||
self.o(
|
||||
"(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")"
|
||||
)
|
||||
self.o("(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")")
|
||||
else:
|
||||
i = self.previousIndex(attrs)
|
||||
if i is not None:
|
||||
@@ -696,9 +677,7 @@ class HTML2Text(html.parser.HTMLParser):
|
||||
# WARNING: does not line up <ol><li>s > 9 correctly.
|
||||
parent_list = None
|
||||
for list in self.list:
|
||||
self.o(
|
||||
" " if parent_list == "ol" and list.name == "ul" else " "
|
||||
)
|
||||
self.o(" " if parent_list == "ol" and list.name == "ul" else " ")
|
||||
parent_list = list.name
|
||||
|
||||
if li.name == "ul":
|
||||
@@ -787,7 +766,7 @@ class HTML2Text(html.parser.HTMLParser):
|
||||
self.pbr()
|
||||
self.br_toggle = " "
|
||||
|
||||
def o(
|
||||
def o( # noqa: C901
|
||||
self, data: str, puredata: bool = False, force: Union[bool, str] = False
|
||||
) -> None:
|
||||
"""
|
||||
@@ -864,9 +843,7 @@ class HTML2Text(html.parser.HTMLParser):
|
||||
self.out(" ")
|
||||
self.space = False
|
||||
|
||||
if self.a and (
|
||||
(self.p_p == 2 and self.links_each_paragraph) or force == "end"
|
||||
):
|
||||
if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
|
||||
if force == "end":
|
||||
self.out("\n")
|
||||
|
||||
@@ -925,11 +902,7 @@ class HTML2Text(html.parser.HTMLParser):
|
||||
|
||||
if self.maybe_automatic_link is not None:
|
||||
href = self.maybe_automatic_link
|
||||
if (
|
||||
href == data
|
||||
and self.absolute_url_matcher.match(href)
|
||||
and self.use_automatic_links
|
||||
):
|
||||
if href == data and self.absolute_url_matcher.match(href) and self.use_automatic_links:
|
||||
self.o("<" + data + ">")
|
||||
self.empty_link = False
|
||||
return
|
||||
@@ -980,7 +953,7 @@ class HTML2Text(html.parser.HTMLParser):
|
||||
|
||||
return nest_count
|
||||
|
||||
def optwrap(self, text: str) -> str:
|
||||
def optwrap(self, text: str) -> str: # noqa: C901
|
||||
"""
|
||||
Wrap all paragraphs in the provided text.
|
||||
|
||||
@@ -1000,9 +973,7 @@ class HTML2Text(html.parser.HTMLParser):
|
||||
self.inline_links = False
|
||||
for para in text.split("\n"):
|
||||
if len(para) > 0:
|
||||
if not skipwrap(
|
||||
para, self.wrap_links, self.wrap_list_items, self.wrap_tables
|
||||
):
|
||||
if not skipwrap(para, self.wrap_links, self.wrap_list_items, self.wrap_tables):
|
||||
indent = ""
|
||||
if para.startswith(" " + self.ul_item_mark):
|
||||
# list item continuation: add a double indent to the
|
||||
@@ -1043,12 +1014,10 @@ class HTML2Text(html.parser.HTMLParser):
|
||||
return result
|
||||
|
||||
|
||||
def html2text(
|
||||
html: str, baseurl: str = "", bodywidth: Optional[int] = config.BODY_WIDTH
|
||||
) -> str:
|
||||
def html2text(html: str, baseurl: str = "", bodywidth: int = config.BODY_WIDTH) -> str:
|
||||
h = html.strip() or ""
|
||||
if h:
|
||||
h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth)
|
||||
h = h.handle(html.strip())
|
||||
h2t = HTML2Text(baseurl=baseurl, bodywidth=bodywidth)
|
||||
h = h2t.handle(html.strip())
|
||||
# print('[html2text] %d bytes' % len(html))
|
||||
return h
|
||||
|
@@ -117,10 +117,7 @@ def main() -> None:
|
||||
dest="images_with_size",
|
||||
action="store_true",
|
||||
default=config.IMAGES_WITH_SIZE,
|
||||
help=(
|
||||
"Write image tags with height and width attrs as raw html to retain "
|
||||
"dimensions"
|
||||
),
|
||||
help=("Write image tags with height and width attrs as raw html to retain " "dimensions"),
|
||||
)
|
||||
p.add_argument(
|
||||
"-g",
|
||||
@@ -260,9 +257,7 @@ def main() -> None:
|
||||
default=config.CLOSE_QUOTE,
|
||||
help="The character used to close quotes",
|
||||
)
|
||||
p.add_argument(
|
||||
"--version", action="version", version=".".join(map(str, __version__))
|
||||
)
|
||||
p.add_argument("--version", action="version", version=".".join(map(str, __version__)))
|
||||
p.add_argument("filename", nargs="?")
|
||||
p.add_argument("encoding", nargs="?", default="utf-8")
|
||||
args = p.parse_args()
|
||||
|
@@ -4,9 +4,7 @@ from typing import Dict, List, Optional
|
||||
from . import config
|
||||
|
||||
unifiable_n = {
|
||||
html.entities.name2codepoint[k]: v
|
||||
for k, v in config.UNIFIABLE.items()
|
||||
if k != "nbsp"
|
||||
html.entities.name2codepoint[k]: v for k, v in config.UNIFIABLE.items() if k != "nbsp"
|
||||
}
|
||||
|
||||
|
||||
@@ -68,12 +66,14 @@ def element_style(
|
||||
:rtype: dict
|
||||
"""
|
||||
style = parent_style.copy()
|
||||
if attrs.get("class"):
|
||||
for css_class in attrs["class"].split():
|
||||
attrs_class = attrs.get("class")
|
||||
if attrs_class:
|
||||
for css_class in attrs_class.split():
|
||||
css_style = style_def.get("." + css_class, {})
|
||||
style.update(css_style)
|
||||
if attrs.get("style"):
|
||||
immediate_style = dumb_property_dict(attrs["style"])
|
||||
attrs_style = attrs.get("style")
|
||||
if attrs_style:
|
||||
immediate_style = dumb_property_dict(attrs_style)
|
||||
style.update(immediate_style)
|
||||
|
||||
return style
|
||||
@@ -147,18 +147,17 @@ def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
|
||||
|
||||
:rtype: int or None
|
||||
"""
|
||||
if attrs.get("start"):
|
||||
attrs_start = attrs.get("start")
|
||||
if attrs_start:
|
||||
try:
|
||||
return int(attrs["start"]) - 1
|
||||
return int(attrs_start) - 1
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def skipwrap(
|
||||
para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
|
||||
) -> bool:
|
||||
def skipwrap(para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool) -> bool:
|
||||
# If it appears to contain a link
|
||||
# don't wrap
|
||||
if not wrap_links and config.RE_LINK.search(para):
|
||||
@@ -236,9 +235,7 @@ def reformat_table(lines: List[str], right_margin: int) -> List[str]:
|
||||
max_width += [len(x) + right_margin for x in cols[-(num_cols - max_cols) :]]
|
||||
max_cols = num_cols
|
||||
|
||||
max_width = [
|
||||
max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)
|
||||
]
|
||||
max_width = [max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)]
|
||||
|
||||
# reformat
|
||||
new_lines = []
|
||||
@@ -247,15 +244,13 @@ def reformat_table(lines: List[str], right_margin: int) -> List[str]:
|
||||
if set(line.strip()) == set("-|"):
|
||||
filler = "-"
|
||||
new_cols = [
|
||||
x.rstrip() + (filler * (M - len(x.rstrip())))
|
||||
for x, M in zip(cols, max_width)
|
||||
x.rstrip() + (filler * (M - len(x.rstrip()))) for x, M in zip(cols, max_width)
|
||||
]
|
||||
new_lines.append("|-" + "|".join(new_cols) + "|")
|
||||
else:
|
||||
filler = " "
|
||||
new_cols = [
|
||||
x.rstrip() + (filler * (M - len(x.rstrip())))
|
||||
for x, M in zip(cols, max_width)
|
||||
x.rstrip() + (filler * (M - len(x.rstrip()))) for x, M in zip(cols, max_width)
|
||||
]
|
||||
new_lines.append("| " + "|".join(new_cols) + "|")
|
||||
return new_lines
|
||||
|
Reference in New Issue
Block a user