Skip to content

Commit a58537e

Browse files
authored
Merge pull request #16 from xStrom/anchor
Add more robust support for HTML5 anchor tags.
2 parents 931105c + 66c9ccd commit a58537e

2 files changed

Lines changed: 24 additions & 3 deletions

File tree

html2text.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ const (
1616

1717
var legacyLBR = WIN_LBR
1818
var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)
19-
var linkTagRE = regexp.MustCompile(`a.*href=('([^']*?)'|"([^"]*?)")`)
19+
var linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s).*(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`)
2020
var badLinkHrefRE = regexp.MustCompile(`javascript:`)
2121
var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)
2222
var numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)
@@ -261,10 +261,13 @@ func HTML2TextWithOptions(html string, reqOpts ...Option) string {
261261
// parse link href
262262
// add special handling for a tags
263263
m := linkTagRE.FindStringSubmatch(tag)
264-
if len(m) == 4 {
264+
if len(m) == 5 {
265265
link := m[2]
266266
if len(link) == 0 {
267267
link = m[3]
268+
if len(link) == 0 {
269+
link = m[4]
270+
}
268271
}
269272

270273
if opts.linksInnerText && !badLinkHrefRE.MatchString(link) {
@@ -280,10 +283,13 @@ func HTML2TextWithOptions(html string, reqOpts ...Option) string {
280283
if !opts.linksInnerText {
281284
// parse link href
282285
m := linkTagRE.FindStringSubmatch(tag)
283-
if len(m) == 4 {
286+
if len(m) == 5 {
284287
link := m[2]
285288
if len(link) == 0 {
286289
link = m[3]
290+
if len(link) == 0 {
291+
link = m[4]
292+
}
287293
}
288294

289295
if !badLinkHrefRE.MatchString(link) {

html2text_test.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@ func TestHTML2Text(t *testing.T) {
1515

1616
// the original behavior
1717
So(HTML2Text(`click <a href="test">here</a>`), ShouldEqual, "click test")
18+
So(HTML2Text(`click <A hRef="test">here</A>`), ShouldEqual, "click test")
19+
So(HTML2Text(`click <a href='test'>here</a>`), ShouldEqual, "click test")
20+
So(HTML2Text(`click <a href=test>here</a>`), ShouldEqual, "click test")
21+
So(HTML2Text(`click <a href =test>here</a>`), ShouldEqual, "click test")
22+
So(HTML2Text(`click <a href = test>here</a>`), ShouldEqual, "click test")
23+
So(HTML2Text(`click <a href = test>here</a>`), ShouldEqual, "click test")
24+
So(HTML2Text(`click <a href = test target="_blank">here</a>`), ShouldEqual, "click test")
1825
So(HTML2Text(`click <a class="x" href="test">here</a>`), ShouldEqual, "click test")
1926
So(HTML2Text(`click <a href="ents/&apos;x&apos;">here</a>`), ShouldEqual, "click ents/'x'")
2027
So(HTML2Text(`click <a href="javascript:void(0)">here</a>`), ShouldEqual, "click ")
@@ -24,6 +31,13 @@ func TestHTML2Text(t *testing.T) {
2431

2532
// with inner text
2633
So(HTML2TextWithOptions(`click <a href="test">here</a>`, WithLinksInnerText()), ShouldEqual, "click here <test>")
34+
So(HTML2TextWithOptions(`click <A hRef="test">here</A>`, WithLinksInnerText()), ShouldEqual, "click here <test>")
35+
So(HTML2TextWithOptions(`click <a href='test'>here</a>`, WithLinksInnerText()), ShouldEqual, "click here <test>")
36+
So(HTML2TextWithOptions(`click <a href=test>here</a>`, WithLinksInnerText()), ShouldEqual, "click here <test>")
37+
So(HTML2TextWithOptions(`click <a href =test>here</a>`, WithLinksInnerText()), ShouldEqual, "click here <test>")
38+
So(HTML2TextWithOptions(`click <a href = test>here</a>`, WithLinksInnerText()), ShouldEqual, "click here <test>")
39+
So(HTML2TextWithOptions(`click <a href = test>here</a>`, WithLinksInnerText()), ShouldEqual, "click here <test>")
40+
So(HTML2TextWithOptions(`click <a href = test target="_blank">here</a>`, WithLinksInnerText()), ShouldEqual, "click here <test>")
2741
So(HTML2TextWithOptions(`click <a class="x" href="test">here</a>`, WithLinksInnerText()), ShouldEqual, "click here <test>")
2842
So(HTML2TextWithOptions(`click <a href="ents/&apos;x&apos;">here</a>`, WithLinksInnerText()), ShouldEqual, "click here <ents/'x'>")
2943
So(HTML2TextWithOptions(`click <a href="javascript:void(0)">here</a>`, WithLinksInnerText()), ShouldEqual, "click here")
@@ -87,6 +101,7 @@ func TestHTML2Text(t *testing.T) {
87101
Convey("Full HTML structure", func() {
88102
So(HTML2Text(``), ShouldEqual, "")
89103
So(HTML2Text(`<html><head><title>Good</title></head><body>x</body>`), ShouldEqual, "x")
104+
So(HTML2Text(`<html><head href="foo"><title>Good</title></head><body>x</body>`), ShouldEqual, "x")
90105
So(HTML2Text(`we are not <script type="javascript"></script>interested in scripts`),
91106
ShouldEqual, "we are not interested in scripts")
92107
})

0 commit comments

Comments
 (0)