Skip to content

Commit 35493e6

Browse files
devstein authored and Mario Hros committed
Change behavior to output link after inner a tag text
1 parent bb16444 commit 35493e6

2 files changed

Lines changed: 26 additions & 13 deletions

File tree

html2text.go

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ const (
1313
)
1414

1515
var lbr = WIN_LBR
16-
var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)
16+
var badTagnamesRE = regexp.MustCompile(`^(head|script|style)($|\s+)`)
1717
var linkTagRE = regexp.MustCompile(`a.*href=('([^']*?)'|"([^"]*?)")`)
1818
var badLinkHrefRE = regexp.MustCompile(`javascript:`)
1919
var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)
@@ -118,6 +118,8 @@ func HTML2Text(html string) string {
118118
inEnt := false
119119
badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head>
120120
shouldOutput := true
121+
// keep the pending <a> tag hrefs; each one is written after its tag's inner text (note: consumed first-in, first-out when </a> is seen)
122+
hrefs := []string{}
121123
// new line cannot be printed at the beginning or
122124
// for <p> after a new line created by previous <p></p>
123125
canPrintNewline := false
@@ -199,11 +201,18 @@ func HTML2Text(html string) string {
199201
outBuf.WriteString(lbr + lbr)
200202
}
201203
canPrintNewline = false
202-
} else if badTagnamesRE.MatchString(tagNameLowercase) {
203-
// unwanted block
204-
badTagStackDepth++
205-
204+
} else if tagNameLowercase == "/a" {
205+
// end of link
206+
// hrefs can be empty here, e.g. when the link's href matched badLinkHrefRE and was never recorded
207+
if len(hrefs) > 0 {
208+
outBuf.WriteString(" <")
209+
outBuf.WriteString(HTMLEntitiesToText(hrefs[0]))
210+
outBuf.WriteString(">")
211+
hrefs = hrefs[1:]
212+
}
213+
} else if linkTagRE.MatchString(tagNameLowercase) {
206214
// parse link href
215+
// add special handling for a tags
207216
m := linkTagRE.FindStringSubmatch(tag)
208217
if len(m) == 4 {
209218
link := m[2]
@@ -212,9 +221,12 @@ func HTML2Text(html string) string {
212221
}
213222

214223
if !badLinkHrefRE.MatchString(link) {
215-
outBuf.WriteString(HTMLEntitiesToText(link))
224+
hrefs = append(hrefs, link)
216225
}
217226
}
227+
} else if badTagnamesRE.MatchString(tagNameLowercase) {
228+
// unwanted block
229+
badTagStackDepth++
218230
} else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' &&
219231
badTagnamesRE.MatchString(tagNameLowercase[1:]) {
220232
// end of unwanted block

html2text_test.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,14 @@ func TestHTML2Text(t *testing.T) {
1212
Convey("Links", func() {
1313
So(HTML2Text(`<div></div>`), ShouldEqual, "")
1414
So(HTML2Text(`<div>simple text</div>`), ShouldEqual, "simple text")
15-
So(HTML2Text(`click <a href="test">here</a>`), ShouldEqual, "click test")
16-
So(HTML2Text(`click <a class="x" href="test">here</a>`), ShouldEqual, "click test")
17-
So(HTML2Text(`click <a href="ents/&apos;x&apos;">here</a>`), ShouldEqual, "click ents/'x'")
18-
So(HTML2Text(`click <a href="javascript:void(0)">here</a>`), ShouldEqual, "click ")
19-
So(HTML2Text(`click <a href="test"><span>here</span> or here</a>`), ShouldEqual, "click test")
20-
So(HTML2Text(`click <a href="http://bit.ly/2n4wXRs">news</a>`), ShouldEqual, "click http://bit.ly/2n4wXRs")
21-
So(HTML2Text(`<a rel="mw:WikiLink" href="/wiki/yet#English" title="yet">yet</a>, <a rel="mw:WikiLink" href="/wiki/not_yet#English" title="not yet">not yet</a>`), ShouldEqual, "/wiki/yet#English, /wiki/not_yet#English")
15+
So(HTML2Text(`click <a href="test">here</a>`), ShouldEqual, "click here <test>")
16+
So(HTML2Text(`click <a class="x" href="test">here</a>`), ShouldEqual, "click here <test>")
17+
So(HTML2Text(`click <a href="ents/&apos;x&apos;">here</a>`), ShouldEqual, "click here <ents/'x'>")
18+
So(HTML2Text(`click <a href="javascript:void(0)">here</a>`), ShouldEqual, "click here")
19+
So(HTML2Text(`click <a href="test"><span>here</span> or here</a>`), ShouldEqual, "click here or here <test>")
20+
So(HTML2Text(`click <a href="http://bit.ly/2n4wXRs">news</a>`), ShouldEqual, "click news <http://bit.ly/2n4wXRs>")
21+
So(HTML2Text(`<a rel="mw:WikiLink" href="/wiki/yet#English" title="yet">yet</a>, <a rel="mw:WikiLink" href="/wiki/not_yet#English" title="not yet">not yet</a>`), ShouldEqual, "yet </wiki/yet#English>, not yet </wiki/not_yet#English>")
22+
So(HTML2Text(`click <a href="one">here<a href="two"> or</a><span> here</span></a>`), ShouldEqual, "click here or <one> here <two>")
2223
})
2324

2425
Convey("Inlines", func() {

0 commit comments

Comments
 (0)