@@ -4,20 +4,47 @@ import (
44 "bytes"
55 "regexp"
66 "strings"
7+ "strconv"
78)
89
// Line-break sequences that can be written to the output.
const (
	// WIN_LBR is the Windows-style line break (carriage return + line feed).
	WIN_LBR = "\r\n"
	// UNIX_LBR is the Unix-style line break (line feed only).
	UNIX_LBR = "\n"
)

// lbr is the line-break sequence currently written to the output.
// It starts out as WIN_LBR and is switched by SetUnixLbr.
var lbr = WIN_LBR

var (
	// badTagnamesRE is matched against lower-cased tag names; it matches names
	// that start with head, script, style or a.
	// NOTE(review): the trailing `($|\s*)` can match the empty string, so any
	// tag name merely beginning with one of these words (e.g. "abbr") also
	// matches - confirm this is intended.
	badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s*)`)

	// linkTagRE matches an "a" followed by an href attribute whose value is
	// single- or double-quoted; the unquoted value is captured in group 2
	// (single quotes) or group 3 (double quotes).
	linkTagRE = regexp.MustCompile(`a.*href=('([^']*?)'|"([^"]*?)")`)

	// badLinkHrefRE matches text containing "#" or "javascript:" anywhere.
	badLinkHrefRE = regexp.MustCompile(`#|javascript:`)

	// headersRE matches tag names beginning with h1..h6, optionally preceded
	// by "/" (closing tag).
	headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)

	// numericEntityRE matches a decimal numeric entity name such as "#65" and
	// captures the digits in group 1.
	numericEntityRE = regexp.MustCompile(`^#([0-9]+)$`)
)
1321
1422func parseHTMLEntity (entName string ) (string , bool ) {
1523 if r , ok := entity [entName ]; ok {
1624 return string (r ), true
1725 }
26+
27+ if match := numericEntityRE .FindStringSubmatch (entName ); len (match ) == 2 {
28+ digits := match [1 ]
29+ n , err := strconv .Atoi (digits )
30+ if err == nil && (n == 9 || n == 10 || n == 13 || n > 31 ) {
31+ return string (rune (n )), true
32+ }
33+ }
34+
1835 return "" , false
1936}
2037
38+ // SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n")
39+ // with argument false sets Windows-style line-breaks in output ("\r\n", the default)
40+ func SetUnixLbr (b bool ) {
41+ if b {
42+ lbr = UNIX_LBR
43+ } else {
44+ lbr = WIN_LBR
45+ }
46+ }
47+
2148// HTMLEntitiesToText decodes HTML entities inside a provided
2249// string and returns decoded text
2350func HTMLEntitiesToText (htmlEntsText string ) string {
@@ -96,7 +123,7 @@ func HTML2Text(html string) string {
96123 switch {
97124 // skip new lines and spaces adding a single space if not there yet
98125 case r <= 0xD , r == 0x85 , r == 0x2028 , r == 0x2029 , // new lines
99- r == ' ' , r >= 0x2008 && r <= 0x200B : // spaces
126+ r == ' ' , r >= 0x2008 && r <= 0x200B : // spaces
100127 writeSpace (outBuf )
101128 continue
102129
@@ -144,20 +171,20 @@ func HTML2Text(html string) string {
144171 tagName := strings .ToLower (html [tagStart :i ])
145172
146173 if tagName == "/ul" {
147- outBuf .WriteString (" \r \n " )
174+ outBuf .WriteString (lbr )
148175 } else if tagName == "li" || tagName == "li/" {
149- outBuf .WriteString (" \r \n " )
176+ outBuf .WriteString (lbr )
150177 } else if headersRE .MatchString (tagName ) {
151178 if canPrintNewline {
152- outBuf .WriteString (" \r \n \r \n " )
179+ outBuf .WriteString (lbr + lbr )
153180 }
154181 canPrintNewline = false
155182 } else if tagName == "br" || tagName == "br/" {
156183 // new line
157- outBuf .WriteString (" \r \n " )
184+ outBuf .WriteString (lbr )
158185 } else if tagName == "p" || tagName == "/p" {
159186 if canPrintNewline {
160- outBuf .WriteString (" \r \n \r \n " )
187+ outBuf .WriteString (lbr + lbr )
161188 }
162189 canPrintNewline = false
163190 } else if badTagnamesRE .MatchString (tagName ) {
0 commit comments