Skip to content

Commit 34a52b5

Browse files
authored
Merge pull request #1 from merlincox/number_ents
Support numeric entities and Unix style line breaks
2 parents 2cdb1fa + 14f368a commit 34a52b5

2 files changed

Lines changed: 46 additions & 6 deletions

File tree

html2text.go

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,46 @@ import (
44
"bytes"
55
"regexp"
66
"strings"
7+
"strconv"
78
)
89

10+
// Line break sequences that can be emitted between blocks of text.
// The names are kept as-is because they are part of the exported API.
const (
	// WIN_LBR is the Windows style line break (carriage return + line feed).
	WIN_LBR = "\r\n"
	// UNIX_LBR is the Unix style line break (line feed only).
	UNIX_LBR = "\n"
)

// lbr is the line break sequence currently written to the output.
// It starts out as WIN_LBR.
var lbr = WIN_LBR

var (
	// badTagnamesRE matches tags whose content must not appear in the text
	// output. The tag name has to be followed by the end of the string or by
	// whitespace (the start of the attribute list). The previous `\s*` could
	// match the empty string, which turned the pattern into a prefix match
	// and wrongly caught tags such as "header", "article" or "abbr".
	badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)

	// linkTagRE captures the href value of an anchor tag, in either single
	// quotes (submatch 2) or double quotes (submatch 3).
	linkTagRE = regexp.MustCompile(`a.*href=('([^']*?)'|"([^"]*?)")`)

	// badLinkHrefRE matches href values containing "#" or "javascript:".
	badLinkHrefRE = regexp.MustCompile(`#|javascript:`)

	// headersRE matches opening and closing h1..h6 tag names.
	headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)

	// numericEntityRE matches a decimal numeric entity name such as "#39"
	// and captures its digits.
	numericEntityRE = regexp.MustCompile(`^#([0-9]+)$`)
)
1321

1422
func parseHTMLEntity(entName string) (string, bool) {
23+
1524
if r, ok := entity[entName]; ok {
1625
return string(r), true
1726
}
27+
28+
if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 {
29+
digits := match[1]
30+
n, err := strconv.Atoi(digits)
31+
if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) {
32+
return string(rune(n)), true
33+
}
34+
}
35+
1836
return "", false
1937
}
2038

39+
func SetUnixLbr(b bool) {
40+
if b {
41+
lbr = UNIX_LBR
42+
} else {
43+
lbr = WIN_LBR
44+
}
45+
}
46+
2147
// HTMLEntitiesToText decodes HTML entities inside a provided
2248
// string and returns decoded text
2349
func HTMLEntitiesToText(htmlEntsText string) string {
@@ -96,7 +122,7 @@ func HTML2Text(html string) string {
96122
switch {
97123
// skip new lines and spaces adding a single space if not there yet
98124
case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
99-
r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
125+
r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
100126
writeSpace(outBuf)
101127
continue
102128

@@ -144,20 +170,20 @@ func HTML2Text(html string) string {
144170
tagName := strings.ToLower(html[tagStart:i])
145171

146172
if tagName == "/ul" {
147-
outBuf.WriteString("\r\n")
173+
outBuf.WriteString(lbr)
148174
} else if tagName == "li" || tagName == "li/" {
149-
outBuf.WriteString("\r\n")
175+
outBuf.WriteString(lbr)
150176
} else if headersRE.MatchString(tagName) {
151177
if canPrintNewline {
152-
outBuf.WriteString("\r\n\r\n")
178+
outBuf.WriteString(lbr + lbr)
153179
}
154180
canPrintNewline = false
155181
} else if tagName == "br" || tagName == "br/" {
156182
// new line
157-
outBuf.WriteString("\r\n")
183+
outBuf.WriteString(lbr)
158184
} else if tagName == "p" || tagName == "/p" {
159185
if canPrintNewline {
160-
outBuf.WriteString("\r\n\r\n")
186+
outBuf.WriteString(lbr + lbr)
161187
}
162188
canPrintNewline = false
163189
} else if badTagnamesRE.MatchString(tagName) {

html2text_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,25 @@ func TestHTML2Text(t *testing.T) {
6464
So(HTMLEntitiesToText("&abcdefghij;"), ShouldEqual, "&abcdefghij;")
6565
})
6666

67+
Convey("Numeric HTML Entities", func() {
68+
So(HTMLEntitiesToText("&#39;single quotes&#39; and &#52765;"), ShouldEqual, "'single quotes' and 츝")
69+
})
70+
6771
Convey("Full HTML structure", func() {
6872
So(HTML2Text(``), ShouldEqual, "")
6973
So(HTML2Text(`<html><head><title>Good</title></head><body>x</body>`), ShouldEqual, "x")
7074
So(HTML2Text(`we are not <script type="javascript"></script>interested in scripts`),
7175
ShouldEqual, "we are not interested in scripts")
7276
})
77+
78+
Convey("Switching Unix and Windows line breaks", func() {
79+
SetUnixLbr(true)
80+
So(HTML2Text(`two<br>line<br/>breaks`), ShouldEqual, "two\nline\nbreaks")
81+
So(HTML2Text(`<p>two</p><p>paragraphs</p>`), ShouldEqual, "two\n\nparagraphs")
82+
SetUnixLbr(false)
83+
So(HTML2Text(`two<br>line<br/>breaks`), ShouldEqual, "two\r\nline\r\nbreaks")
84+
So(HTML2Text(`<p>two</p><p>paragraphs</p>`), ShouldEqual, "two\r\n\r\nparagraphs")
85+
})
86+
7387
})
7488
}

0 commit comments

Comments
 (0)