Skip to content

Commit 52104c9

Browse files
author
Mario Hros
committed
fix: allow # in link URLs and prevent unnecessary space, fixes #11
1 parent 62431c4 commit 52104c9

2 files changed

Lines changed: 7 additions & 4 deletions

File tree

html2text.go

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ const (
1515
var lbr = WIN_LBR
1616
var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s*)`)
1717
var linkTagRE = regexp.MustCompile(`a.*href=('([^']*?)'|"([^"]*?)")`)
18-
var badLinkHrefRE = regexp.MustCompile(`#|javascript:`)
18+
var badLinkHrefRE = regexp.MustCompile(`javascript:`)
1919
var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)
2020
var numericEntityRE = regexp.MustCompile(`^#([0-9]+)$`)
2121

@@ -124,12 +124,14 @@ func HTML2Text(html string) string {
124124
// skip new lines and spaces adding a single space if not there yet
125125
case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
126126
r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
127-
writeSpace(outBuf)
127+
if shouldOutput && badTagStackDepth == 0 && !inEnt {
128+
//outBuf.WriteString(fmt.Sprintf("{DBG r:%c, inEnt:%t, tag:%s}", r, inEnt, html[tagStart:i]))
129+
writeSpace(outBuf)
130+
}
128131
continue
129132

130133
case r == ';' && inEnt: // end of html entity
131134
inEnt = false
132-
shouldOutput = true
133135
continue
134136

135137
case r == '&' && shouldOutput: // possible html entity
@@ -156,7 +158,6 @@ func HTML2Text(html string) string {
156158
if ent, isEnt := parseHTMLEntity(entName); isEnt {
157159
outBuf.WriteString(ent)
158160
inEnt = true
159-
shouldOutput = false
160161
continue
161162
}
162163
}

html2text_test.go

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,9 @@ func TestHTML2Text(t *testing.T) {
1616
So(HTML2Text(`click <a class="x" href="test">here</a>`), ShouldEqual, "click test")
1717
So(HTML2Text(`click <a href="ents/&apos;x&apos;">here</a>`), ShouldEqual, "click ents/'x'")
1818
So(HTML2Text(`click <a href="javascript:void(0)">here</a>`), ShouldEqual, "click ")
19+
So(HTML2Text(`click <a href="test"><span>here</span> or here</a>`), ShouldEqual, "click test")
1920
So(HTML2Text(`click <a href="http://bit.ly/2n4wXRs">news</a>`), ShouldEqual, "click http://bit.ly/2n4wXRs")
21+
So(HTML2Text(`<a rel="mw:WikiLink" href="/wiki/yet#English" title="yet">yet</a>, <a rel="mw:WikiLink" href="/wiki/not_yet#English" title="not yet">not yet</a>`), ShouldEqual, "/wiki/yet#English, /wiki/not_yet#English")
2022
})
2123

2224
Convey("Inlines", func() {

0 commit comments

Comments
 (0)