Skip to content

Commit 9556150

Browse files
authored
Merge pull request #8 from merlincox/master
add support for numeric HTML entities and Unix style line breaks
2 parents eef0eb5 + a307896 commit 9556150

2 files changed

Lines changed: 47 additions & 6 deletions

File tree

html2text.go

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,47 @@ import (
44
"bytes"
55
"regexp"
66
"strings"
7+
"strconv"
78
)
89

10+
// Line-break sequences that can be written to the converted text output.
const (
11+
// WIN_LBR is the Windows-style line break (carriage return + line feed).
WIN_LBR = "\r\n"
12+
// UNIX_LBR is the Unix-style line break (a single line feed).
UNIX_LBR = "\n"
13+
)
14+
15+
// lbr is the line-break sequence currently written to the output buffer.
// It starts as WIN_LBR and is reassigned by SetUnixLbr.
var lbr = WIN_LBR
916
// badTagnamesRE matches tag text whose element name is exactly head, script,
// style or a, optionally followed by attributes or a self-closing slash.
//
// The name must be followed by end-of-string, whitespace or '/'. The previous
// trailing group `($|\s*)` could match the empty string, which turned the
// pattern into a bare prefix test and caused unrelated elements such as
// abbr, article, address or header to be classified as bad tags.
var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|[\s/])`)
1017
// linkTagRE finds an href attribute value in tag text. Group 1 is the quoted
// value including its quotes, group 2 is the content of a single-quoted
// value and group 3 the content of a double-quoted value.
// NOTE(review): the pattern is unanchored, so it matches any text that
// contains an "a" followed later by href= — confirm callers only pass
// anchor tags.
var linkTagRE = regexp.MustCompile(`a.*href=('([^']*?)'|"([^"]*?)")`)
1118
// badLinkHrefRE matches any text containing "#" or "javascript:".
// NOTE(review): presumably used to reject link targets that should not be
// printed; the call site is not visible here — confirm.
var badLinkHrefRE = regexp.MustCompile(`#|javascript:`)
1219
// headersRE matches tag text that begins with an optional "/" followed by
// h1 through h6, i.e. opening or closing heading tags.
var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)
20+
var numericEntityRE = regexp.MustCompile(`^#([0-9]+)$`)
1321

1422
func parseHTMLEntity(entName string) (string, bool) {
1523
if r, ok := entity[entName]; ok {
1624
return string(r), true
1725
}
26+
27+
if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 {
28+
digits := match[1]
29+
n, err := strconv.Atoi(digits)
30+
if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) {
31+
return string(rune(n)), true
32+
}
33+
}
34+
1835
return "", false
1936
}
2037

38+
// SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n")
39+
// with argument false sets Windows-style line-breaks in output ("\r\n", the default)
40+
func SetUnixLbr(b bool) {
41+
if b {
42+
lbr = UNIX_LBR
43+
} else {
44+
lbr = WIN_LBR
45+
}
46+
}
47+
2148
// HTMLEntitiesToText decodes HTML entities inside a provided
2249
// string and returns decoded text
2350
func HTMLEntitiesToText(htmlEntsText string) string {
@@ -96,7 +123,7 @@ func HTML2Text(html string) string {
96123
switch {
97124
// skip new lines and spaces adding a single space if not there yet
98125
case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
99-
r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
126+
r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
100127
writeSpace(outBuf)
101128
continue
102129

@@ -144,20 +171,20 @@ func HTML2Text(html string) string {
144171
tagName := strings.ToLower(html[tagStart:i])
145172

146173
if tagName == "/ul" {
147-
outBuf.WriteString("\r\n")
174+
outBuf.WriteString(lbr)
148175
} else if tagName == "li" || tagName == "li/" {
149-
outBuf.WriteString("\r\n")
176+
outBuf.WriteString(lbr)
150177
} else if headersRE.MatchString(tagName) {
151178
if canPrintNewline {
152-
outBuf.WriteString("\r\n\r\n")
179+
outBuf.WriteString(lbr + lbr)
153180
}
154181
canPrintNewline = false
155182
} else if tagName == "br" || tagName == "br/" {
156183
// new line
157-
outBuf.WriteString("\r\n")
184+
outBuf.WriteString(lbr)
158185
} else if tagName == "p" || tagName == "/p" {
159186
if canPrintNewline {
160-
outBuf.WriteString("\r\n\r\n")
187+
outBuf.WriteString(lbr + lbr)
161188
}
162189
canPrintNewline = false
163190
} else if badTagnamesRE.MatchString(tagName) {

html2text_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,25 @@ func TestHTML2Text(t *testing.T) {
6464
So(HTMLEntitiesToText("&abcdefghij;"), ShouldEqual, "&abcdefghij;")
6565
})
6666

67+
Convey("Numeric HTML Entities", func() {
68+
So(HTMLEntitiesToText("&#39;single quotes&#39; and &#52765;"), ShouldEqual, "'single quotes' and 츝")
69+
})
70+
6771
Convey("Full HTML structure", func() {
6872
So(HTML2Text(``), ShouldEqual, "")
6973
So(HTML2Text(`<html><head><title>Good</title></head><body>x</body>`), ShouldEqual, "x")
7074
So(HTML2Text(`we are not <script type="javascript"></script>interested in scripts`),
7175
ShouldEqual, "we are not interested in scripts")
7276
})
77+
78+
Convey("Switching Unix and Windows line breaks", func() {
79+
SetUnixLbr(true)
80+
So(HTML2Text(`two<br>line<br/>breaks`), ShouldEqual, "two\nline\nbreaks")
81+
So(HTML2Text(`<p>two</p><p>paragraphs</p>`), ShouldEqual, "two\n\nparagraphs")
82+
SetUnixLbr(false)
83+
So(HTML2Text(`two<br>line<br/>breaks`), ShouldEqual, "two\r\nline\r\nbreaks")
84+
So(HTML2Text(`<p>two</p><p>paragraphs</p>`), ShouldEqual, "two\r\n\r\nparagraphs")
85+
})
86+
7387
})
7488
}

0 commit comments

Comments
 (0)