Skip to content

Commit 16b741e

Browse files
committed
rebasing PR 644
adding import __version__ to the htmlformat.py
1 parent 28530c4 commit 16b741e

5 files changed

Lines changed: 1635 additions & 2 deletions

File tree

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
(* HTML Importer *)

(* Registers the "HTML" import format.  Each named element is mapped to an
   importer symbol in the HTML` context; the trailing bare symbol in the
   importer list (HTML`PlaintextImport) is presumably the default importer,
   which matches DefaultElement -> "Plaintext" below. *)

Begin["System`Convert`HTML`"]

ImportExport`RegisterImport[
    "HTML",
    {
        "Data"       :> HTML`DataImport,
        "FullData"   :> HTML`FullDataImport,
        "Hyperlinks" :> HTML`HyperlinksImport,
        "ImageLinks" :> HTML`ImageLinksImport,
        "Plaintext"  :> HTML`PlaintextImport,
        "Source"     :> HTML`SourceImport,
        "Title"      :> HTML`TitleImport,
        "XMLObject"  :> HTML`XMLObjectImport,
        HTML`PlaintextImport
    },
    (* no post-importers *)
    {},
    AvailableElements -> {"Data", "FullData", "Hyperlinks", "ImageLinks", "Plaintext", "Source", "Title", "XMLObject"},
    DefaultElement -> "Plaintext",
    (* importers are handed a file name rather than an open stream *)
    FunctionChannels -> {"FileNames"}
]

End[]

mathics/builtin/arithmetic.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
# -*- coding: utf-8 -*-
2-
# cython: language_level=3
32

43
"""
54
Mathematical Functions

mathics/builtin/htmlformat.py

Lines changed: 366 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,366 @@
1+
# -*- coding: utf-8 -*-
2+
# cython: language_level=3
3+
4+
"""
5+
HTML
6+
7+
Basic implementation for a HTML importer
8+
9+
"""
10+
11+
# ``from __future__`` imports must be the first statement after the module
# docstring; putting any other import in front of them is a SyntaxError.
from __future__ import unicode_literals

from mathics.version import __version__  # noqa used in loading to check consistency.

from mathics.builtin.base import Builtin
from mathics.builtin.files_io.files import mathics_open
from mathics.core.expression import Expression, String, Symbol, from_python
from mathics.builtin.base import MessageException

from io import BytesIO
import re
import sys

# lxml is optional at import time: a missing package is tolerated here, and
# the builtins in this module declare ``requires = ("lxml",)``.
try:
    import lxml.html as lhtml
except ImportError:
    pass
30+
31+
32+
def node_to_xml_element(node, strip_whitespace=True):
    """Convert an lxml element into a one-item list holding an XMLElement.

    The resulting expression has the shape
    ``XMLElement[tag, {attribute rules...}, {children...}]`` where the
    children are the element's text fragments (as ``String``) interleaved
    with its recursively converted child elements, in document order.

    Parameters:
        node: the lxml element to convert.
        strip_whitespace: when true, text fragments are stripped and
            fragments that become empty are dropped.

    Returns:
        A list with exactly one ``Expression`` (a list so callers can splat
        it into a parent's children).
    """

    def clean(fragment):
        # Leaves None/"" untouched so the caller's truthiness test drops them.
        if fragment and strip_whitespace:
            return fragment.strip()
        return fragment

    def children():
        leading = clean(node.text)
        if leading:
            yield String(leading)
        for child in node:
            for element in node_to_xml_element(child, strip_whitespace):
                yield element
            # In lxml, ``child.tail`` is the text that follows the child's
            # end tag, so it belongs to *this* node's content, right after
            # the child -- not inside the child, and not to ``node.tail``.
            trailing = clean(child.tail)
            if trailing:
                yield String(trailing)

    # NOTE(review): non-element nodes (e.g. HTML comments) are also iterated
    # as children and their ``tag`` may not be a plain string -- confirm how
    # they should be represented.
    attribute_rules = [
        Expression("Rule", from_python(name), from_python(value))
        for name, value in node.attrib.items()
    ]

    return [
        Expression(
            "XMLElement",
            String(node.tag),
            Expression("List", *attribute_rules),
            Expression("List", *children()),
        )
    ]
62+
63+
64+
def xml_object(tree):
    """Wrap a parsed lxml tree as an ``XMLObject["Document"]`` expression.

    The document expression has two parts: a list holding a single
    ``XMLObject["Declaration"]`` built from ``tree.docinfo`` (version,
    standalone flag, encoding), followed by the converted root element.
    """
    info = tree.docinfo

    version_rule = Expression(
        "Rule", String("Version"), String(info.xml_version or "1.0")
    )
    standalone_rule = Expression(
        "Rule", String("Standalone"), String("yes" if info.standalone else "no")
    )
    encoding_rule = Expression("Rule", String("Encoding"), String(info.encoding))

    declaration = Expression(
        Expression("XMLObject", String("Declaration")),
        version_rule,
        standalone_rule,
        encoding_rule,
    )

    root_elements = node_to_xml_element(tree.getroot())
    return Expression(
        Expression("XMLObject", String("Document")),
        Expression("List", declaration),
        *root_elements
    )
85+
86+
87+
class ParseError(Exception):
    """Exception type for HTML parsing failures.

    Nothing in the visible part of this module raises or catches it.
    """
89+
90+
91+
def parse_html_stream(f):
    """Parse HTML from the binary file-like object ``f`` with lxml.

    On PyPy an explicit ``HTMLParser(encoding="utf8")`` is supplied; on
    other interpreters lxml's default parser is used.

    Returns whatever ``lhtml.parse`` returns (the parsed tree).
    """
    if "__pypy__" in sys.builtin_module_names:
        utf8_parser = lhtml.HTMLParser(encoding="utf8")
        return lhtml.parse(f, utf8_parser)
    return lhtml.parse(f)
102+
103+
104+
def parse_html_file(filename):
    """Open ``filename`` in binary mode via ``mathics_open`` and parse it as HTML."""
    with mathics_open(filename, "rb") as stream:
        tree = parse_html_stream(stream)
    return tree
107+
108+
109+
def parse_html(parse, text, evaluation):
    """Run ``parse`` on the string value of ``text``, reporting failures.

    Parameters:
        parse: callable taking the plain string taken from ``text``.
        text: object exposing ``get_string_value()``.
        evaluation: used to issue messages when parsing fails.

    Returns:
        The result of ``parse`` on success.  On ``IOError`` a
        ``General::noopen`` message is issued; on ``MessageException`` the
        exception's own message is issued; in both cases ``$Failed`` is
        returned.
    """
    try:
        return parse(text.get_string_value())
    except IOError:
        evaluation.message("General", "noopen", text.get_string_value())
    except MessageException as error:
        error.message(evaluation)
    # Only reached after one of the handlers above has reported the problem.
    return Symbol("$Failed")
118+
119+
120+
class _HTMLBuiltin(Builtin):
    # Shared base for the HTML builtins in this module: they depend on lxml
    # and are placed in the HTML` context unless a subclass overrides it.
    requires = ("lxml",)

    context = "HTML`"
124+
125+
126+
class _TagImport(_HTMLBuiltin):
    # Base for importers that parse an HTML file and return a single
    # ``{tag_name -> value}`` rule.  Subclasses provide ``tag_name`` and
    # ``_import``.

    def _import(self, tree):
        """Produce the value for this element from the parsed ``tree``."""
        raise NotImplementedError()

    def apply(self, text, evaluation):
        """%(name)s[text_String]"""
        parsed = parse_html(parse_html_file, text, evaluation)
        if not isinstance(parsed, Symbol):
            rule = Expression("Rule", self.tag_name, self._import(parsed))
            return Expression("List", rule)
        # parse_html handed back a Symbol ($Failed?); pass it through as is.
        return parsed
136+
137+
138+
class _Get(_HTMLBuiltin):
    # Base for the HTML`Parser` builtins: subclasses supply ``_parse`` and
    # the parsed tree is returned as an XMLObject document.
    context = "HTML`Parser`"

    messages = {
        "prserr": "``.",
    }

    def apply(self, text, evaluation):
        """%(name)s[text_String]"""
        parsed = parse_html(self._parse, text, evaluation)
        if isinstance(parsed, Symbol):  # $Failed?
            return parsed
        return xml_object(parsed)
152+
153+
154+
class HTMLGet(_Get):
    # Treats its argument as a file name.
    def _parse(self, text):
        """Parse the HTML file named by ``text``."""
        tree = parse_html_file(text)
        return tree
157+
158+
159+
class HTMLGetString(_Get):
    """
    #> Head[HTML`Parser`HTMLGetString["<a></a>"]]
     = XMLObject[Document]

    #> Head[HTML`Parser`HTMLGetString["<a><b></a>"]]
     = XMLObject[Document]
    """

    def _parse(self, text):
        """Parse ``text`` itself as HTML source (UTF-8 encoded in memory)."""
        # The buffer is created already filled and positioned at offset 0.
        with BytesIO(text.encode("utf8")) as buffer:
            return parse_html_stream(buffer)
173+
174+
175+
class _DataImport(_TagImport):
    # Shared implementation of the "Data" and "FullData" elements; subclasses
    # set ``full_data`` to choose between the two collection policies.

    def _import(self, tree):
        """Collect table and list contents from ``tree`` into a List.

        The tree is walked recursively.  ``table`` elements are read row by
        row (``tr``) and cell by cell (``th``/``td``); ``ul``/``ol`` elements
        are read item by item; every other element is simply descended into.
        With ``self.full_data`` set every collected value is kept as is;
        otherwise missing values are skipped and single-item results are
        flattened into their container.
        """
        keep_everything = self.full_data

        if keep_everything:

            def add_data(target, item):
                target.append(item)
                return target

        else:

            def add_data(target, item):
                if item is None:
                    return target
                if target is None:
                    return [item]
                if len(item) == 1:
                    target.extend(item)
                elif item:
                    target.append(Expression("List", *item))
                return target

        whitespace_run = re.compile(r"\s+")

        def add_text(target, node):
            nested = traverse(node)
            if nested:  # if there's data, we ignore any text
                add_data(target, nested)
                return
            fragments = list(node.xpath(".//text()"))
            if fragments or keep_everything:
                # Join all text below the node and collapse whitespace runs.
                joined = " ".join(fragments)
                target.append(String(whitespace_run.sub(" ", joined)))

        def traverse(parent):
            collected = [] if keep_everything else None

            for node in parent:
                tag = node.tag
                if tag == "table":
                    rows = []
                    for tr in node.xpath("tr"):
                        cells = []
                        for td in tr.xpath("th|td"):
                            add_text(cells, td)
                        add_data(rows, cells)
                    collected = add_data(collected, rows)
                elif tag in ("ul", "ol"):
                    entries = []
                    for child in node:
                        nested = traverse(child)
                        if nested:
                            add_data(entries, nested)
                        elif child.tag == "li":
                            add_text(entries, child)
                    collected = add_data(collected, entries)
                else:
                    collected = add_data(collected, traverse(node))

            # A lone result is unwrapped so it does not add a nesting level.
            if collected and len(collected) == 1:
                collected = collected[0]

            return collected

        result = traverse(tree.getroot())
        return Expression("List", *([] if result is None else result))
249+
250+
251+
class DataImport(_DataImport):
    """
    >> Import["ExampleData/PrimeMeridian.html", "Data"][[1, 1, 2, 3]]
     = {Washington, D.C., 77°03′56.07″ W (1897) or 77°04′02.24″ W (NAD 27) or 77°04′01.16″ W (NAD 83), New Naval Observatory meridian}

    #> Length[Import["ExampleData/PrimeMeridian.html", "Data"]]
     = 3
    """

    # Name of the rule returned by the inherited ``apply``.
    tag_name = "Data"
    # Selects the compacting collection policy of ``_DataImport._import``.
    full_data = False
262+
263+
264+
class FullDataImport(_DataImport):
    # Name of the rule returned by the inherited ``apply``.
    tag_name = "FullData"
    # Selects the keep-everything collection policy of ``_DataImport._import``.
    full_data = True
267+
268+
269+
class _LinksImport(_TagImport):
    # Base for importers whose value is a List of link targets; subclasses
    # implement ``_links``.

    def _links(self, root):
        """Yield the link targets found in the parsed document."""
        raise NotImplementedError()

    def _import(self, tree):
        """Return every target produced by ``_links`` wrapped in a List."""
        targets = self._links(tree)
        return Expression("List", *targets)
275+
276+
277+
class HyperlinksImport(_LinksImport):
    """
    >> Import["ExampleData/PrimeMeridian.html", "Hyperlinks"][[1]]
     = /wiki/Prime_meridian_(Greenwich)
    """

    tag_name = "Hyperlinks"

    def _links(self, tree):
        """Yield the ``href`` of each ``<a>``, skipping empty and ``#`` links."""
        for anchor in tree.xpath("//a"):
            target = anchor.get("href")
            if not target or target.startswith("#"):
                continue
            yield target
290+
291+
292+
class ImageLinksImport(_LinksImport):
    """
    >> Import["ExampleData/PrimeMeridian.html", "ImageLinks"][[6]]
     = //upload.wikimedia.org/wikipedia/commons/thumb/d/d5/Prime_meridian.jpg/180px-Prime_meridian.jpg
    """

    tag_name = "ImageLinks"

    def _links(self, tree):
        """Yield the non-empty ``src`` attribute of each ``<img>`` element."""
        for image in tree.xpath("//img"):
            location = image.get("src")
            if not location:
                continue
            yield location
305+
306+
307+
class PlaintextImport(_TagImport):
    """
    >> DeleteDuplicates[StringCases[Import["ExampleData/PrimeMeridian.html"], RegularExpression["Wiki[a-z]+"]]]
     = {Wikipedia, Wikidata, Wikibase, Wikimedia}
    """

    tag_name = "Plaintext"

    def _import(self, tree):
        """Join all non-blank text nodes of the document, one per line."""
        stripped = (fragment.strip() for fragment in tree.xpath("//text()"))
        return String("\n".join(piece for piece in stripped if piece))
323+
324+
325+
class SourceImport(_HTMLBuiltin):
    """
    >> DeleteDuplicates[StringCases[Import["ExampleData/PrimeMeridian.html", "Source"], RegularExpression["<t[a-z]+>"]]]
     = {<title>, <tr>, <th>, <td>}
    """

    def apply(self, text, evaluation):
        """%(name)s[text_String]"""

        # The file is not parsed at all: its raw text (read as UTF-8) is the
        # value.  parse_html is used only for its error reporting.
        def read_source(path):
            with mathics_open(path, "r", encoding="UTF-8") as stream:
                contents = String(stream.read())
            return Expression("List", Expression("Rule", "Source", contents))

        return parse_html(read_source, text, evaluation)
341+
342+
343+
class TitleImport(_TagImport):
    """
    >> Import["ExampleData/PrimeMeridian.html", "Title"]
     = Prime meridian - Wikipedia
    """

    tag_name = "Title"

    def _import(self, tree):
        """Return the text of the first ``<title>``, or "" if there is none."""
        titles = tree.xpath("//title")
        if not titles:
            return String("")
        return String(titles[0].text_content())
355+
356+
357+
class XMLObjectImport(_HTMLBuiltin):
    """
    >> Part[Import["ExampleData/PrimeMeridian.html", "XMLObject"], 2, 3, 1, 3, 2]
     = XMLElement[title, {}, {Prime meridian - Wikipedia}]
    """

    def apply(self, text, evaluation):
        """%(name)s[text_String]"""
        # Delegate the parsing to HTML`Parser`HTMLGet and wrap whatever it
        # evaluates to in a {"XMLObject" -> ...} rule.
        document = Expression("HTML`Parser`HTMLGet", text).evaluate(evaluation)
        rule = Expression("Rule", "XMLObject", document)
        return Expression("List", rule)

0 commit comments

Comments
 (0)