Skip to content

Commit 26fd74f

Browse files
authored
Merge branch 'master' into unordered-lists
2 parents 90bb105 + ff40519 commit 26fd74f

2 files changed

Lines changed: 30 additions & 2 deletions

File tree

html2text.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
// badTagnamesRE matches tag text whose element name is exactly head, script,
// style or a, either alone or followed by whitespace (i.e. attributes).
// The name must be terminated by end-of-string or whitespace: a trailing `\s*`
// would also accept the empty string and turn this into a prefix match that
// catches unrelated elements such as article, abbr, address or header.
var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s)`)
1010
// linkTagRE extracts the href value from the text of an anchor tag.
// Submatch 1 is the quoted value including its quotes, submatch 2 the content
// of a single-quoted value, submatch 3 the content of a double-quoted value.
// The pattern is anchored to an element named exactly "a" so that other tags
// that merely contain the letter "a" before an href attribute (base, area,
// ...) are not mistaken for links. (?s:.*) lets the attributes preceding href
// span several lines.
// NOTE(review): assumes the input starts with the tag name, without the
// leading '<' - confirm at the call site.
var linkTagRE = regexp.MustCompile(`^a\s(?s:.*)href=('([^']*?)'|"([^"]*?)")`)
1111
// badLinkHrefRE matches href values that start (after optional whitespace)
// with "#" or with the "javascript:" scheme in any letter case.
// The match is anchored at the start so that ordinary URLs which merely
// contain a fragment, such as http://host/page#section, are not matched.
// NOTE(review): presumably used to decide which hrefs are omitted from the
// output - confirm at the call site.
var badLinkHrefRE = regexp.MustCompile(`(?i)^\s*(?:#|javascript:)`)
12+
// headersRE matches the text of an opening or closing heading tag h1-h6,
// either alone or followed by whitespace (i.e. attributes). Submatch 1 is "/"
// for a closing tag. The trailing boundary keeps names that merely start with
// a heading name, such as "h10" or "h1x", from being treated as headings.
var headersRE = regexp.MustCompile(`^(/)?h[1-6](?:$|\s)`)
1213

1314
func parseHTMLEntity(entName string) (string, bool) {
1415
entName = strings.ToLower(entName)
@@ -164,12 +165,17 @@ func HTML2Text(html string) string {
164165
outBuf.WriteString("\r\n")
165166
} else if tagName == "li" || tagName == "li/" {
166167
outBuf.WriteString("\r\n")
168+
} else if headersRE.MatchString(tagName) {
169+
if canPrintNewline {
170+
outBuf.WriteString("\r\n\r\n")
171+
}
172+
canPrintNewline = false
167173
} else if tagName == "br" || tagName == "br/" {
168174
// new line
169175
outBuf.WriteString("\r\n")
170176
} else if tagName == "p" || tagName == "/p" {
171177
if canPrintNewline {
172-
outBuf.WriteString("\r\n")
178+
outBuf.WriteString("\r\n\r\n")
173179
}
174180
canPrintNewline = false
175181
} else if badTagnamesRE.MatchString(tagName) {

html2text_test.go

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,22 @@ func TestHTML2Text(t *testing.T) {
2626
Convey("Line breaks", func() {
2727
So(HTML2Text("should \nignore \r\nnew lines"), ShouldEqual, "should ignore new lines")
2828
So(HTML2Text(`two<br>line<br/>breaks`), ShouldEqual, "two\r\nline\r\nbreaks")
29-
So(HTML2Text(`<p>two</p><p>paragraphs</p>`), ShouldEqual, "two\r\nparagraphs")
29+
So(HTML2Text(`<p>two</p><p>paragraphs</p>`), ShouldEqual, "two\r\n\r\nparagraphs")
30+
})
31+
32+
Convey("Headings", func() {
33+
So(HTML2Text("<h1>First</h1>main text"), ShouldEqual, "First\r\n\r\nmain text")
34+
So(HTML2Text("First<h2>Second</h2>next section"), ShouldEqual, "First\r\n\r\nSecond\r\n\r\nnext section")
35+
So(HTML2Text("<h2>Second</h2>next section"), ShouldEqual, "Second\r\n\r\nnext section")
36+
So(HTML2Text("Second<h3>Third</h3>next section"), ShouldEqual, "Second\r\n\r\nThird\r\n\r\nnext section")
37+
So(HTML2Text("<h3>Third</h3>next section"), ShouldEqual, "Third\r\n\r\nnext section")
38+
So(HTML2Text("Third<h4>Fourth</h4>next section"), ShouldEqual, "Third\r\n\r\nFourth\r\n\r\nnext section")
39+
So(HTML2Text("<h4>Fourth</h4>next section"), ShouldEqual, "Fourth\r\n\r\nnext section")
40+
So(HTML2Text("Fourth<h5>Fifth</h5>next section"), ShouldEqual, "Fourth\r\n\r\nFifth\r\n\r\nnext section")
41+
So(HTML2Text("<h5>Fifth</h5>next section"), ShouldEqual, "Fifth\r\n\r\nnext section")
42+
So(HTML2Text("Fifth<h6>Sixth</h6>next section"), ShouldEqual, "Fifth\r\n\r\nSixth\r\n\r\nnext section")
43+
So(HTML2Text("<h6>Sixth</h6>next section"), ShouldEqual, "Sixth\r\n\r\nnext section")
44+
So(HTML2Text("<h7>Not Header</h7>next section"), ShouldEqual, "Not Headernext section")
3045
})
3146

3247
Convey("HTML entities", func() {
@@ -38,6 +53,13 @@ func TestHTML2Text(t *testing.T) {
3853
So(HTML2Text(`Tom & Jerry is not an entity`), ShouldEqual, "Tom & Jerry is not an entity")
3954
So(HTML2Text(`this &neither; as you see`), ShouldEqual, "this &neither; as you see")
4055
So(HTML2Text(`list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>`), ShouldEqual, "list of items\r\nOne\r\nTwo\r\nThree\r\n")
56+
So(HTML2Text(`fish &amp; chips`), ShouldEqual, "fish & chips")
57+
So(HTML2Text(`&quot;I'm sorry, Dave. I'm afraid I can't do that.&quot; – HAL, 2001: A Space Odyssey`), ShouldEqual, "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey")
58+
So(HTML2Text(`Google &reg;`), ShouldEqual, "Google ®")
59+
})
60+
61+
Convey("Large Entity", func() {
62+
So(HTMLEntitiesToText("&abcdefghij;"), ShouldEqual, "&abcdefghij;")
4163
})
4264

4365
Convey("Full HTML structure", func() {

0 commit comments

Comments
 (0)