-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmicronify.py
136 lines (109 loc) · 5.02 KB
/
micronify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from markdownify import MarkdownConverter, chomp
import posixpath
class MicronConverter(MarkdownConverter):
    """Markdownify converter that emits micron markup instead of Markdown.

    Links and images are rewritten so that site-relative targets are routed
    through ``reader_path`` (with the real target passed in the ``p`` field),
    while ``http`` links are passed through unchanged.
    """

    # Path of the page being converted; relative hrefs are resolved against it.
    current_path = "/"
    # Micron page that every rewritten link points at, so the links actually
    # lead where we want them to.
    reader_path = "/page/zr.mu"
    # Extra "|key=value" fields appended to every rewritten link.
    url_suffix = ""

    def convert_a(self, el, text, convert_as_inline):
        """Render an ``<a>`` element as a coloured micron link."""
        prefix, suffix, text = chomp(text)
        if not text:
            return ''
        href = el.get('href')
        title = el.get('title')
        # For the replacement see markdownify #29: underscores in text nodes
        # are escaped, so undo that before comparing against the href.
        if (self.options['autolinks']
                and text.replace(r'\_', '_') == href
                and not title
                and not self.options['default_title']):
            # Shortcut syntax: the link text is the target itself.
            return '`[%s]' % href
        if not href:
            return text
        # The title attribute is not emitted: the link format below has no
        # slot for it.
        micron_link = self.rewrite_link(href)
        return '`F44a%s`[%s`%s]%s`f' % (prefix, text, micron_link, suffix)

    def rewrite_link(self, link):
        """Rewrite a link so it actually goes where we want.

        Returns ``''`` for a missing/empty link, the link itself when it
        starts with ``http``, and otherwise a ``:<reader_path>`p=<path>`` link
        (plus ``url_suffix``) where ``<path>`` is the absolute target path.
        """
        # TODO if the HTML has a nomadnet link? how can we detect that?
        if not link:
            return ''
        # Nothing we can do with a link like this out to HTTP.
        if link.startswith("http"):
            return link
        # Absolute path: pass it through as-is.
        if link.startswith("/"):
            return ":" + self.reader_path + "`p=" + link + self.url_suffix
        # Relative path: resolve against the directory of the current page.
        if self.current_path.endswith("/"):
            current_dir = self.current_path
        else:
            current_dir = posixpath.dirname(self.current_path)
        new_link = posixpath.normpath(posixpath.join(current_dir, link))
        # normpath drops a trailing slash; restore it so directory links
        # stay directory links.
        if link.endswith("/") and not new_link.endswith("/"):
            new_link += "/"
        return ":" + self.reader_path + "`p=" + new_link + self.url_suffix

    def convert_b(self, el, text, convert_as_inline):
        """Wrap bold text in "`!" markers."""
        return "`!" + text + "`!"

    # markdownify binds convert_strong to its own convert_b on the base class,
    # so <strong> must be re-aliased here or it would still render as Markdown.
    convert_strong = convert_b

    def convert_em(self, el, text, convert_as_inline):
        """Wrap emphasised text in "`*" markers."""
        return "`*" + text + "`*"

    convert_i = convert_em

    def _convert_hn(self, n, el, text, convert_as_inline):
        """ Method name prefixed with _ to prevent <hn> to call this """
        if convert_as_inline:
            return text
        # Workaround since links in headers aren't clickable right now: push
        # the heading text onto its own line when it directly contains a link.
        href_workaround = any(x.name == "a" for x in el.children)
        prefix = " \n" if href_workaround else ""
        # Prevent MemoryErrors in case of very large n.
        n = max(1, min(8, n))
        # NOTE(review): this emits n separate lines each starting with ">"
        # rather than a single ">" * n marker - confirm this is intended.
        return '\n>' * n + prefix + text + "\n"

    def convert_hr(self, el, text, convert_as_inline):
        """Render ``<hr>`` as a line containing a single "-"."""
        return '\n-\n'

    def convert_img(self, el, text, convert_as_inline):
        """Render ``<img>`` as a link to the (rewritten) image source.

        The label is the title (with the alt text appended when both exist),
        else the alt text, else the raw ``src``.
        """
        alt = el.attrs.get('alt', None) or ''
        src = el.attrs.get('src', None) or ''
        title = el.attrs.get('title', None) or ''
        new_src = self.rewrite_link(src)
        if title:
            label = title + (f"〚alt:{alt}〛" if alt else '')
        elif alt:
            label = alt
        else:
            label = src
        return '`F44a`[(🖻:%s)`%s]`f' % (label, new_src)

    def convert_soup(self, soup):
        """Strip unwanted elements from *soup*, then convert it as usual."""
        self._clean_soup(soup)
        return super().convert_soup(soup)

    def _clean_soup(self, soup):
        """Remove page furniture and non-content tags from *soup* in place."""
        # References and external links for some articles can be quite long
        # and will be on their own separate page isolated from the main
        # article content.
        bad_classes = {
            "sidebar-list",
            "reflist",
            "references",
            "mw-references-wrap",
            "mw-reference-columns",
            "navbox",
            "infobox",
            "sidebar",
            "hatnote",
            "external",
        }
        for ele in soup.find_all(class_=lambda c: c in bad_classes):
            ele.decompose()
        related = soup.find_all(attrs={"aria-labelledby": "Links_to_related_articles"})
        for r in related:
            r.decompose()
        for tag in soup(["script", "style", "sup"]):
            tag.decompose()
# the good stuff here
def html_to_micron(html, current_path=None, extra_get_params=None):
    """Convert an HTML string to micron markup.

    Args:
        html: HTML source text. Backticks are removed before conversion.
        current_path: Path of the page being converted, used to resolve
            relative links. When ``None`` the converter's default is kept.
        extra_get_params: Optional mapping of extra fields appended to every
            rewritten link as ``|key=value``. An ``"L"`` entry holding
            ``current_path`` is added when absent. The mapping passed in is
            not modified.

    Returns:
        The converted text with runs of blank lines reduced and leading and
        trailing whitespace and ``<|>#-`` characters stripped.
    """
    converter = MicronConverter(wrap=False, wrap_width=180, escape_underscore=False)
    # Set the current path for href rewriting.
    if current_path is not None:
        converter.current_path = current_path
    if extra_get_params is not None:
        # Work on a copy so the caller's dict does not pick up an "L" entry
        # that would go stale if the dict is reused for another page.
        params = dict(extra_get_params)
        # Set the last path for "back" functionality.
        params.setdefault("L", current_path)
        converter.url_suffix = "|" + "|".join(f"{k}={v}" for k, v in params.items())
    # Just remove literal backticks; escaping them is broken.
    result = converter.convert(html.replace("`", "")) or ""
    # Clean up lots of empty "\n" from the HTML.
    return (
        result.strip(" \n\r")
        .replace("\n\n\n", "\n")
        .replace("\n\n\n", "\n")
        .strip("<|>#-")
    )