@@ -53,6 +53,9 @@ func HTMLEntitiesToText(htmlEntsText string) string {
5353
5454 for i , r := range htmlEntsText {
5555 switch {
56+ case r == '\n' , r == '\r' : // skip new lines
57+ continue
58+
5659 case r == ';' && inEnt :
5760 inEnt = false
5861 continue
@@ -96,21 +99,30 @@ func HTMLEntitiesToText(htmlEntsText string) string {
9699
97100// HTML2Text converts html into a text form
98101func HTML2Text (html string ) string {
102+ inLen := len (html )
99103 tagStart := 0
100104 inEnt := false
101105 badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head>
102106 shouldOutput := true
107+ // new line cannot be printed at the beginning or
108+ // for <p> after a new line created by previous <p></p>
109+ canPrintNewline := false
103110
104111 outBuf := bytes .NewBufferString ("" )
105112
106113 for i , r := range html {
114+ if inLen > 0 && i == inLen - 1 {
115+ // prevent new line at the end of the document
116+ canPrintNewline = false
117+ }
118+
107119 switch {
108- case r == ';' && inEnt :
120+ case r == ';' && inEnt : // end of html entity
109121 inEnt = false
110122 shouldOutput = true
111123 continue
112124
113- case r == '&' && shouldOutput : //possible html entity
125+ case r == '&' && shouldOutput : // possible html entity
114126 entName := ""
115127 isEnt := false
116128
@@ -139,15 +151,25 @@ func HTML2Text(html string) string {
139151 }
140152 }
141153
142- case r == '<' :
154+ case r == '<' : // start of a tag
143155 tagStart = i + 1
144156 shouldOutput = false
145157 continue
146- case r == '>' :
158+
159+ case r == '>' : // end of a tag
147160 shouldOutput = true
148161 tagName := strings .ToLower (html [tagStart :i ])
149162
150- if badTagnamesRE .MatchString (tagName ) {
163+ if tagName == "br" || tagName == "br/" {
164+ // new line
165+ outBuf .WriteString ("\r \n " )
166+ } else if tagName == "p" || tagName == "/p" {
167+ if canPrintNewline {
168+ outBuf .WriteString ("\r \n " )
169+ }
170+ canPrintNewline = false
171+ } else if badTagnamesRE .MatchString (tagName ) {
172+ // unwanted block
151173 badTagStackDepth ++
152174
153175 // parse link href
@@ -162,16 +184,17 @@ func HTML2Text(html string) string {
162184 outBuf .WriteString (HTMLEntitiesToText (link ))
163185 }
164186 }
165-
166187 } else if len (tagName ) > 0 && tagName [0 ] == '/' &&
167188 badTagnamesRE .MatchString (tagName [1 :]) {
189+ // end of unwanted block
168190 badTagStackDepth --
169191 }
170-
171192 continue
172- }
193+
194+ } // switch end
173195
174196 if shouldOutput && badTagStackDepth == 0 && ! inEnt {
197+ canPrintNewline = true
175198 outBuf .WriteRune (r )
176199 }
177200 }
0 commit comments