Skip to content

Commit 514e0f4

Browse files
author
Mario Hros
committed
handling <br> and paragraphs
1 parent 8cc96f9 commit 514e0f4

2 files changed

Lines changed: 37 additions & 8 deletions

File tree

html2text.go

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@ func HTMLEntitiesToText(htmlEntsText string) string {
5353

5454
for i, r := range htmlEntsText {
5555
switch {
56+
case r == '\n', r == '\r': // skip new lines
57+
continue
58+
5659
case r == ';' && inEnt:
5760
inEnt = false
5861
continue
@@ -96,21 +99,30 @@ func HTMLEntitiesToText(htmlEntsText string) string {
9699

97100
// HTML2Text converts html into a text form
98101
func HTML2Text(html string) string {
102+
inLen := len(html)
99103
tagStart := 0
100104
inEnt := false
101105
badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head>
102106
shouldOutput := true
107+
// new line cannot be printed at the beginning or
108+
// for <p> after a new line created by previous <p></p>
109+
canPrintNewline := false
103110

104111
outBuf := bytes.NewBufferString("")
105112

106113
for i, r := range html {
114+
if inLen > 0 && i == inLen-1 {
115+
// prevent new line at the end of the document
116+
canPrintNewline = false
117+
}
118+
107119
switch {
108-
case r == ';' && inEnt:
120+
case r == ';' && inEnt: // end of html entity
109121
inEnt = false
110122
shouldOutput = true
111123
continue
112124

113-
case r == '&' && shouldOutput: //possible html entity
125+
case r == '&' && shouldOutput: // possible html entity
114126
entName := ""
115127
isEnt := false
116128

@@ -139,15 +151,25 @@ func HTML2Text(html string) string {
139151
}
140152
}
141153

142-
case r == '<':
154+
case r == '<': // start of a tag
143155
tagStart = i + 1
144156
shouldOutput = false
145157
continue
146-
case r == '>':
158+
159+
case r == '>': // end of a tag
147160
shouldOutput = true
148161
tagName := strings.ToLower(html[tagStart:i])
149162

150-
if badTagnamesRE.MatchString(tagName) {
163+
if tagName == "br" || tagName == "br/" {
164+
// new line
165+
outBuf.WriteString("\r\n")
166+
} else if tagName == "p" || tagName == "/p" {
167+
if canPrintNewline {
168+
outBuf.WriteString("\r\n")
169+
}
170+
canPrintNewline = false
171+
} else if badTagnamesRE.MatchString(tagName) {
172+
// unwanted block
151173
badTagStackDepth++
152174

153175
// parse link href
@@ -162,16 +184,17 @@ func HTML2Text(html string) string {
162184
outBuf.WriteString(HTMLEntitiesToText(link))
163185
}
164186
}
165-
166187
} else if len(tagName) > 0 && tagName[0] == '/' &&
167188
badTagnamesRE.MatchString(tagName[1:]) {
189+
// end of unwanted block
168190
badTagStackDepth--
169191
}
170-
171192
continue
172-
}
193+
194+
} // switch end
173195

174196
if shouldOutput && badTagStackDepth == 0 && !inEnt {
197+
canPrintNewline = true
175198
outBuf.WriteRune(r)
176199
}
177200
}

html2text_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,11 @@ func TestHTML2Text(t *testing.T) {
2020
So(HTML2Text(`some <div id="a" class="b">div</div>`), ShouldEqual, "some div")
2121
})
2222

23+
Convey("Line breaks", func() {
24+
So(HTML2Text(`two<br>line<br/>breaks`), ShouldEqual, "two\r\nline\r\nbreaks")
25+
So(HTML2Text(`<p>two</p><p>paragraphs</p>`), ShouldEqual, "two\r\nparagraphs")
26+
})
27+
2328
Convey("HTML entities", func() {
2429
So(HTML2Text(`two&nbsp;&nbsp;spaces`), ShouldEqual, "two spaces")
2530
So(HTML2Text(`&copy; 2017 K3A`), ShouldEqual, "© 2017 K3A")
@@ -31,6 +36,7 @@ func TestHTML2Text(t *testing.T) {
3136
})
3237

3338
Convey("Full HTML structure", func() {
39+
So(HTML2Text(``), ShouldEqual, "")
3440
So(HTML2Text(`<html><head><title>Good</title></head><body>x</body>`), ShouldEqual, "x")
3541
So(HTML2Text(`we are not <script type="javascript"></script>interested in scripts`),
3642
ShouldEqual, "we are not interested in scripts")

0 commit comments

Comments
 (0)