Skip to content

Commit 51179b0

Browse files
author
Mario Hros
committed
support hexadecimal numeric html entities, fixes #13
1 parent a9ab4df commit 51179b0

4 files changed

Lines changed: 32 additions & 3 deletions

File tree

go.mod

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
module github.com/k3a/html2text
2+
3+
go 1.16
4+
5+
require github.com/smartystreets/goconvey v1.6.4 // indirect

go.sum

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8=
2+
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
3+
github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
4+
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
5+
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM=
6+
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
7+
github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s=
8+
github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA=
9+
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
10+
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
11+
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
12+
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
13+
golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=

html2text.go

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,26 @@ var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)
1717
var linkTagRE = regexp.MustCompile(`a.*href=('([^']*?)'|"([^"]*?)")`)
1818
var badLinkHrefRE = regexp.MustCompile(`javascript:`)
1919
var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)
20-
var numericEntityRE = regexp.MustCompile(`^#([0-9]+)$`)
20+
var numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)
2121

2222
func parseHTMLEntity(entName string) (string, bool) {
2323
if r, ok := entity[entName]; ok {
2424
return string(r), true
2525
}
2626

2727
if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 {
28-
digits := match[1]
29-
n, err := strconv.Atoi(digits)
28+
var (
29+
err error
30+
n int64
31+
digits = match[1]
32+
)
33+
34+
if digits != "" && (digits[0] == 'x' || digits[0] == 'X') {
35+
n, err = strconv.ParseInt(digits[1:], 16, 64)
36+
} else {
37+
n, err = strconv.ParseInt(digits, 10, 64)
38+
}
39+
3040
if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) {
3141
return string(rune(n)), true
3242
}

html2text_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ func TestHTML2Text(t *testing.T) {
6161
So(HTML2Text(`fish &amp; chips`), ShouldEqual, "fish & chips")
6262
So(HTML2Text(`&quot;I'm sorry, Dave. I'm afraid I can't do that.&quot; – HAL, 2001: A Space Odyssey`), ShouldEqual, "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey")
6363
So(HTML2Text(`Google &reg;`), ShouldEqual, "Google ®")
64+
So(HTML2Text(`&#8268; decimal and hex entities supported &#x204D;`), ShouldEqual, "⁌ decimal and hex entities supported ⁍")
6465
})
6566

6667
Convey("Large Entity", func() {

0 commit comments

Comments
 (0)