|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +# cython: language_level=3 |
| 3 | + |
| 4 | +""" |
| 5 | +HTML |
| 6 | +
|
| 7 | +Basic implementation for an HTML importer |
| 8 | +
|
| 9 | +""" |
| 10 | + |
| 11 | +from mathics.version import __version__ # noqa used in loading to check consistency. |
| 12 | + |
| 13 | + |
| 14 | +from __future__ import unicode_literals |
| 15 | + |
| 16 | + |
| 17 | +from mathics.builtin.base import Builtin |
| 18 | +from mathics.builtin.files_io.files import mathics_open |
| 19 | +from mathics.core.expression import Expression, String, Symbol, from_python |
| 20 | +from mathics.builtin.base import MessageException |
| 21 | + |
| 22 | +from io import BytesIO |
| 23 | +import re |
| 24 | +import sys |
| 25 | + |
| 26 | +try: |
| 27 | + import lxml.html as lhtml |
| 28 | +except ImportError: |
| 29 | + pass |
| 30 | + |
| 31 | + |
def node_to_xml_element(node, strip_whitespace=True):
    """Convert an lxml element into a one-item list holding an ``XMLElement``.

    The resulting expression has the form
    ``XMLElement[tag, {attribute rules...}, {children...}]`` where the
    children are the element's leading text, its child elements (converted
    recursively) and the text that follows each child, in document order.

    :param node: lxml element to convert.
    :param strip_whitespace: when true, text is stripped and pieces that
        become empty are dropped.
    :return: a list containing the single ``XMLElement`` expression.
    """

    def text_leaf(text):
        # Turn raw text into a String leaf; None when nothing remains.
        if text and strip_whitespace:
            text = text.strip()
        return String(text) if text else None

    def children():
        leading = text_leaf(node.text)
        if leading is not None:
            yield leading
        for child in node:
            for element in node_to_xml_element(child, strip_whitespace):
                yield element
            # lxml stores the text that follows a child's closing tag on the
            # child itself (``tail``).  That text is content of *this* node,
            # positioned right after the child, not content of the child.
            trailing = text_leaf(child.tail)
            if trailing is not None:
                yield trailing

    def attributes():
        for name, value in node.attrib.items():
            yield Expression("Rule", from_python(name), from_python(value))

    return [
        Expression(
            "XMLElement",
            String(node.tag),
            Expression("List", *list(attributes())),
            Expression("List", *list(children())),
        )
    ]
| 62 | + |
| 63 | + |
def xml_object(tree):
    """Build the ``XMLObject["Document"]`` expression for a parsed tree.

    The document consists of a one-item list with the declaration
    (``Version``, ``Standalone`` and ``Encoding`` rules read from
    ``tree.docinfo``) followed by the converted root element.
    """
    docinfo = tree.docinfo

    version_rule = Expression(
        "Rule", String("Version"), String(docinfo.xml_version or "1.0")
    )

    if docinfo.standalone:
        standalone_value = String("yes")
    else:
        standalone_value = String("no")
    standalone_rule = Expression("Rule", String("Standalone"), standalone_value)

    encoding_rule = Expression("Rule", String("Encoding"), String(docinfo.encoding))

    declaration = Expression(
        Expression("XMLObject", String("Declaration")),
        version_rule,
        standalone_rule,
        encoding_rule,
    )

    root_elements = node_to_xml_element(tree.getroot())
    document_head = Expression("XMLObject", String("Document"))
    return Expression(document_head, Expression("List", declaration), *root_elements)
| 85 | + |
| 86 | + |
class ParseError(Exception):
    """Exception type for parse failures.

    NOTE(review): nothing in the visible part of this module raises or
    catches it -- confirm whether it is still needed.
    """
| 89 | + |
| 90 | + |
# Two variants of parse_html_stream are defined; which one is used is decided
# once, at import time, by whether the interpreter lists "__pypy__" among its
# built-in modules.
if "__pypy__" not in sys.builtin_module_names:

    def parse_html_stream(f):
        """Parse the HTML read from the file-like object *f* with lxml."""
        return lhtml.parse(f)


else:

    def parse_html_stream(f):
        """Parse the HTML read from *f* using a parser with encoding "utf8"."""
        return lhtml.parse(f, lhtml.HTMLParser(encoding="utf8"))
| 102 | + |
| 103 | + |
def parse_html_file(filename):
    """Open *filename* in binary mode via ``mathics_open`` and parse it as HTML."""
    with mathics_open(filename, "rb") as stream:
        tree = parse_html_stream(stream)
        return tree
| 107 | + |
| 108 | + |
def parse_html(parse, text, evaluation):
    """Run *parse* on the string value of *text*, reporting failures.

    :param parse: callable taking the plain string obtained from *text*.
    :param text: object providing ``get_string_value()``.
    :param evaluation: receives the messages issued on failure.
    :return: whatever *parse* returns, or ``Symbol("$Failed")`` after an
        ``IOError`` (reported as ``General::noopen``) or a
        ``MessageException`` (which reports itself).
    """
    try:
        result = parse(text.get_string_value())
    except IOError:
        evaluation.message("General", "noopen", text.get_string_value())
    except MessageException as exc:
        exc.message(evaluation)
    else:
        return result
    return Symbol("$Failed")
| 118 | + |
| 119 | + |
class _HTMLBuiltin(Builtin):
    # Common base of the builtins defined in this module.

    # Declares the dependency on the "lxml" package.
    requires = ("lxml",)

    # Context in which these builtins are placed.
    context = "HTML`"
| 124 | + |
| 125 | + |
class _TagImport(_HTMLBuiltin):
    # Base for importers that parse a file and return {tag_name -> value}.
    # Subclasses supply ``tag_name`` and implement ``_import``.

    def _import(self, tree):
        # Hook: compute the value for ``tag_name`` from the parsed tree.
        raise NotImplementedError

    def apply(self, text, evaluation):
        """%(name)s[text_String]"""
        parsed = parse_html(parse_html_file, text, evaluation)
        if isinstance(parsed, Symbol):
            # parse_html hands back the $Failed symbol on error; pass it on.
            return parsed
        rule = Expression("Rule", self.tag_name, self._import(parsed))
        return Expression("List", rule)
| 136 | + |
| 137 | + |
class _Get(_HTMLBuiltin):
    # Base for the parser entry points; subclasses implement ``_parse`` to
    # turn the string argument into a parsed tree.

    context = "HTML`Parser`"

    messages = {
        "prserr": "``.",
    }

    def apply(self, text, evaluation):
        """%(name)s[text_String]"""
        parsed = parse_html(self._parse, text, evaluation)
        if isinstance(parsed, Symbol):
            # $Failed coming back from parse_html is returned unchanged.
            return parsed
        return xml_object(parsed)
| 152 | + |
| 153 | + |
class HTMLGet(_Get):
    def _parse(self, text):
        # The string argument is handed to parse_html_file as the file name.
        tree = parse_html_file(text)
        return tree
| 157 | + |
| 158 | + |
class HTMLGetString(_Get):
    """
    #> Head[HTML`Parser`HTMLGetString["<a></a>"]]
     = XMLObject[Document]

    #> Head[HTML`Parser`HTMLGetString["<a><b></a>"]]
     = XMLObject[Document]
    """

    def _parse(self, text):
        # The string argument is the HTML itself: expose its UTF-8 bytes as
        # an in-memory stream positioned at the start and parse that.
        with BytesIO(text.encode("utf8")) as stream:
            return parse_html_stream(stream)
| 173 | + |
| 174 | + |
class _DataImport(_TagImport):
    # Shared implementation of the "Data" and "FullData" importers.
    # Subclasses set ``full_data`` (bool) and ``tag_name``.

    def _import(self, tree):
        # Walk the tree from its root, collecting the cell contents of
        # <table> elements and the items of <ul>/<ol> lists, and return the
        # collected data as a List expression.
        full_data = self.full_data

        if full_data:

            # Full mode: every entry is appended as-is, including empty ones.
            def add_data(l, x):
                l.append(x)
                return l

        else:

            # Compact mode:
            #  - a None entry is ignored,
            #  - the accumulator is created on demand when it is still None,
            #  - an entry of length 1 is merged into the accumulator,
            #  - a longer entry is appended wrapped in a List expression,
            #  - an empty entry is dropped.
            def add_data(l, x):
                if x is None:
                    return l
                if l is None:
                    return [x]
                elif len(x) == 1:
                    l.extend(x)
                elif x:
                    l.append(Expression("List", *x))
                return l

        # Matches any run of whitespace; used to collapse it to one space.
        newline = re.compile(r"\s+")

        def add_text(l, node):
            # Append the content of ``node`` to the list ``l``: nested
            # table/list data if there is any, otherwise its text.
            deep_data = traverse(node)
            if deep_data:  # if there's data, we ignore any text
                add_data(l, deep_data)
            else:
                t = []
                # ".//text()" selects every text node below ``node``.
                for s in node.xpath(".//text()"):
                    t.append(s)
                # In full mode an empty string is recorded even without text.
                if t or full_data:
                    l.append(String(newline.sub(" ", " ".join(t))))

        def traverse(parent):
            # Collect the data found in the children of ``parent``.  In
            # compact mode the result stays None until something is found.
            if full_data:
                data = []
            else:
                data = None

            for node in parent:
                tag = node.tag
                if tag == "table":
                    row_data = []
                    # The relative path "tr" only matches rows that are
                    # direct children of the table element.
                    for tr in node.xpath("tr"):
                        col_data = []
                        for td in tr.xpath("th|td"):
                            add_text(col_data, td)
                        add_data(row_data, col_data)
                    data = add_data(data, row_data)
                elif tag in ("ul", "ol"):
                    list_data = []
                    for child in node:
                        # Nested data inside an item takes precedence over
                        # the item's own text.
                        deep_data = traverse(child)
                        if deep_data:
                            add_data(list_data, deep_data)
                        elif child.tag == "li":
                            add_text(list_data, child)
                    data = add_data(data, list_data)
                else:
                    # Any other element is searched recursively.
                    data = add_data(data, traverse(node))

            # A result with exactly one entry is replaced by that entry.
            if data and len(data) == 1:
                data = data[0]

            return data

        result = traverse(tree.getroot())
        if result is None:
            result = []

        return Expression("List", *result)
| 249 | + |
| 250 | + |
class DataImport(_DataImport):
    """
    >> Import["ExampleData/PrimeMeridian.html", "Data"][[1, 1, 2, 3]]
     = {Washington, D.C., 77°03′56.07″ W (1897) or 77°04′02.24″ W (NAD 27) or 77°04′01.16″ W (NAD 83), New Naval Observatory meridian}

    #> Length[Import["ExampleData/PrimeMeridian.html", "Data"]]
     = 3
    """

    # Name used on the left-hand side of the returned rule.
    tag_name = "Data"

    # Selects the compact collection mode of _DataImport._import.
    full_data = False
| 262 | + |
| 263 | + |
class FullDataImport(_DataImport):
    # Name used on the left-hand side of the returned rule.
    tag_name = "FullData"

    # Selects the full collection mode of _DataImport._import.
    full_data = True
| 267 | + |
| 268 | + |
class _LinksImport(_TagImport):
    # Base for importers that return a List of links; subclasses implement
    # ``_links`` to produce the individual link strings.

    def _import(self, tree):
        found = self._links(tree)
        return Expression("List", *found)

    def _links(self, root):
        # Hook: yield the links found in the parsed document.
        raise NotImplementedError
| 275 | + |
| 276 | + |
class HyperlinksImport(_LinksImport):
    """
    >> Import["ExampleData/PrimeMeridian.html", "Hyperlinks"][[1]]
     = /wiki/Prime_meridian_(Greenwich)
    """

    tag_name = "Hyperlinks"

    def _links(self, tree):
        # Yield the href of every <a> element, skipping anchors without an
        # href and those whose href starts with "#".
        for anchor in tree.xpath("//a"):
            target = anchor.get("href")
            if not target or target.startswith("#"):
                continue
            yield target
| 290 | + |
| 291 | + |
class ImageLinksImport(_LinksImport):
    """
    >> Import["ExampleData/PrimeMeridian.html", "ImageLinks"][[6]]
     = //upload.wikimedia.org/wikipedia/commons/thumb/d/d5/Prime_meridian.jpg/180px-Prime_meridian.jpg
    """

    tag_name = "ImageLinks"

    def _links(self, tree):
        # Yield the src of every <img> element that has a non-empty one.
        for image in tree.xpath("//img"):
            location = image.get("src")
            if not location:
                continue
            yield location
| 305 | + |
| 306 | + |
class PlaintextImport(_TagImport):
    """
    >> DeleteDuplicates[StringCases[Import["ExampleData/PrimeMeridian.html"], RegularExpression["Wiki[a-z]+"]]]
     = {Wikipedia, Wikidata, Wikibase, Wikimedia}
    """

    tag_name = "Plaintext"

    def _import(self, tree):
        # Strip every text node of the document, drop the ones that end up
        # empty and join the rest with newlines.
        stripped = (chunk.strip() for chunk in tree.xpath("//text()"))
        return String("\n".join(piece for piece in stripped if piece))
| 323 | + |
| 324 | + |
class SourceImport(_HTMLBuiltin):
    """
    >> DeleteDuplicates[StringCases[Import["ExampleData/PrimeMeridian.html", "Source"], RegularExpression["<t[a-z]+>"]]]
     = {<title>, <tr>, <th>, <td>}
    """

    def apply(self, text, evaluation):
        """%(name)s[text_String]"""

        def read_source(filename):
            # Read the whole file as UTF-8 text and wrap it in {"Source" -> ...}.
            with mathics_open(filename, "r", encoding="UTF-8") as stream:
                contents = stream.read()
            return Expression("List", Expression("Rule", "Source", String(contents)))

        # parse_html supplies the shared error reporting ($Failed on failure).
        return parse_html(read_source, text, evaluation)
| 341 | + |
| 342 | + |
class TitleImport(_TagImport):
    """
    >> Import["ExampleData/PrimeMeridian.html", "Title"]
     = Prime meridian - Wikipedia
    """

    tag_name = "Title"

    def _import(self, tree):
        # Use the text content of the first <title> element; an empty
        # string when the document has none.
        titles = tree.xpath("//title")
        if not titles:
            return String("")
        return String(titles[0].text_content())
| 355 | + |
| 356 | + |
class XMLObjectImport(_HTMLBuiltin):
    """
    >> Part[Import["ExampleData/PrimeMeridian.html", "XMLObject"], 2, 3, 1, 3, 2]
     = XMLElement[title, {}, {Prime meridian - Wikipedia}]
    """

    def apply(self, text, evaluation):
        """%(name)s[text_String]"""
        # Delegate the parsing to HTML`Parser`HTMLGet and wrap its result in
        # a {"XMLObject" -> ...} rule list.
        getter = Expression("HTML`Parser`HTMLGet", text)
        document = getter.evaluate(evaluation)
        return Expression("List", Expression("Rule", "XMLObject", document))
0 commit comments