@@ -4,20 +4,46 @@ import (
44 "bytes"
55 "regexp"
66 "strings"
7+ "strconv"
78)
89
// Line-break sequences written to the output for tags such as li, br, p and
// headings.
//
// The ALL_CAPS spelling is kept because these names are exported and renaming
// them would break callers.
const (
	// WIN_LBR is the Windows-style line break: carriage return + line feed.
	WIN_LBR = "\r\n"
	// UNIX_LBR is the Unix-style line break: a single line feed.
	UNIX_LBR = "\n"
)

// lbr is the line break currently in effect. It starts as WIN_LBR and is
// reassigned by SetUnixLbr.
var lbr = WIN_LBR
var (
	// badTagnamesRE matches tag text whose tag name is exactly head, script,
	// style or a. The name must be followed by the end of the text,
	// whitespace (i.e. attributes) or a slash (self-closing form). A trailing
	// `\s*` would also match the empty string and so accept every tag that
	// merely starts with one of those names (abbr, area, article, header...).
	badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|[\s/])`)

	// linkTagRE finds a quoted href attribute. Submatch 2 holds the value
	// when it is single-quoted, submatch 3 when it is double-quoted.
	linkTagRE = regexp.MustCompile(`a.*href=('([^']*?)'|"([^"]*?)")`)

	// badLinkHrefRE matches text containing '#' or "javascript:".
	// NOTE(review): presumably applied to href values that should not be
	// printed; the use site is outside this view.
	badLinkHrefRE = regexp.MustCompile(`#|javascript:`)

	// headersRE matches tag text that begins with h1..h6, optionally preceded
	// by a slash (closing tag).
	headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)

	// numericEntityRE matches a decimal numeric entity name such as "#169"
	// and captures its digits in submatch 1.
	numericEntityRE = regexp.MustCompile(`^#([0-9]+)$`)
)
1321
1422func parseHTMLEntity (entName string ) (string , bool ) {
23+
1524 if r , ok := entity [entName ]; ok {
1625 return string (r ), true
1726 }
27+
28+ if match := numericEntityRE .FindStringSubmatch (entName ); len (match ) == 2 {
29+ digits := match [1 ]
30+ n , err := strconv .Atoi (digits )
31+ if err == nil && (n == 9 || n == 10 || n == 13 || n > 31 ) {
32+ return string (rune (n )), true
33+ }
34+ }
35+
1836 return "" , false
1937}
2038
39+ func SetUnixLbr (b bool ) {
40+ if b {
41+ lbr = UNIX_LBR
42+ } else {
43+ lbr = WIN_LBR
44+ }
45+ }
46+
2147// HTMLEntitiesToText decodes HTML entities inside a provided
2248// string and returns decoded text
2349func HTMLEntitiesToText (htmlEntsText string ) string {
@@ -96,7 +122,7 @@ func HTML2Text(html string) string {
96122 switch {
97123 // skip new lines and spaces adding a single space if not there yet
98124 case r <= 0xD , r == 0x85 , r == 0x2028 , r == 0x2029 , // new lines
99- r == ' ' , r >= 0x2008 && r <= 0x200B : // spaces
125+ r == ' ' , r >= 0x2008 && r <= 0x200B : // spaces
100126 writeSpace (outBuf )
101127 continue
102128
@@ -144,20 +170,20 @@ func HTML2Text(html string) string {
144170 tagName := strings .ToLower (html [tagStart :i ])
145171
146172 if tagName == "/ul" {
147- outBuf .WriteString (" \r \n " )
173+ outBuf .WriteString (lbr )
148174 } else if tagName == "li" || tagName == "li/" {
149- outBuf .WriteString (" \r \n " )
175+ outBuf .WriteString (lbr )
150176 } else if headersRE .MatchString (tagName ) {
151177 if canPrintNewline {
152- outBuf .WriteString (" \r \n \r \n " )
178+ outBuf .WriteString (lbr + lbr )
153179 }
154180 canPrintNewline = false
155181 } else if tagName == "br" || tagName == "br/" {
156182 // new line
157- outBuf .WriteString (" \r \n " )
183+ outBuf .WriteString (lbr )
158184 } else if tagName == "p" || tagName == "/p" {
159185 if canPrintNewline {
160- outBuf .WriteString (" \r \n \r \n " )
186+ outBuf .WriteString (lbr + lbr )
161187 }
162188 canPrintNewline = false
163189 } else if badTagnamesRE .MatchString (tagName ) {
0 commit comments