Skip to content

Commit 056b075

Browse files
author
Mario Hros
committed
handle more spaces in succession, handle new lines as a space
1 parent dd6806c commit 056b075

2 files changed

Lines changed: 14 additions & 2 deletions

File tree

html2text.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,13 @@ func HTMLEntitiesToText(htmlEntsText string) string {
6767
return outBuf.String()
6868
}
6969

70+
// writeSpace appends a single space to outBuf, but only when the buffer
// already holds some output and that output does not end with a space.
// Calling it repeatedly therefore never produces two spaces in a row and
// never produces a leading space at the very start of the buffer.
func writeSpace(outBuf *bytes.Buffer) {
	written := outBuf.Bytes()

	// Nothing has been written yet, so there is nothing to separate.
	if len(written) == 0 {
		return
	}

	// The output already ends in a space; do not add another one.
	if last := written[len(written)-1]; last == ' ' {
		return
	}

	outBuf.WriteString(" ")
}
76+
7077
// HTML2Text converts html into a text form
7178
func HTML2Text(html string) string {
7279
inLen := len(html)
@@ -87,7 +94,10 @@ func HTML2Text(html string) string {
8794
}
8895

8996
switch {
90-
case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029: // skip new lines
97+
// skip new lines and spaces adding a single space if not there yet
98+
case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
99+
r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
100+
writeSpace(outBuf)
91101
continue
92102

93103
case r == ';' && inEnt: // end of html entity

html2text_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,10 @@ func TestHTML2Text(t *testing.T) {
2323
So(HTML2Text(`some <div id="a" class="b">div</div>`), ShouldEqual, "some div")
2424
})
2525

26-
Convey("Line breaks", func() {
26+
Convey("Line breaks and spaces", func() {
27+
So(HTML2Text("should ignore more spaces"), ShouldEqual, "should ignore more spaces")
2728
So(HTML2Text("should \nignore \r\nnew lines"), ShouldEqual, "should ignore new lines")
29+
So(HTML2Text("a\nb\nc"), ShouldEqual, "a b c")
2830
So(HTML2Text(`two<br>line<br/>breaks`), ShouldEqual, "two\r\nline\r\nbreaks")
2931
So(HTML2Text(`<p>two</p><p>paragraphs</p>`), ShouldEqual, "two\r\n\r\nparagraphs")
3032
})

0 commit comments

Comments
 (0)