1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // This program takes an HTML file and outputs a corresponding article file in
6 // present format. See: golang.org/x/tools/present
7 package main // import "golang.org/x/tools/cmd/html2article"
21 "golang.org/x/net/html"
22 "golang.org/x/net/html/atom"
28 err := convert(os.Stdout, os.Stdin)
34 func convert(w io.Writer, r io.Reader) error {
35 root, err := html.Parse(r)
40 style := find(root, isTag(atom.Style))
41 if err := parseStyles(style); err != nil {
42 log.Printf("couldn't parse all styles: %v", err)
45 body := find(root, isTag(atom.Body))
47 return errors.New("couldn't find body")
49 article := limitNewlineRuns(makeHeadings(strings.TrimSpace(text(body))))
50 _, err = fmt.Fprintf(w, "Title\n\n%s", article)
62 var cssRules = make(map[string]Style)
64 func parseStyles(style *html.Node) error {
65 if style == nil || style.FirstChild == nil {
66 return errors.New("couldn't find styles")
69 styles := style.FirstChild.Data
70 readUntil := func(end rune) (string, bool) {
71 i := strings.IndexRune(styles, end)
81 sel, ok := readUntil('{')
85 return fmt.Errorf("could not parse selector %q", styles)
88 value, ok := readUntil('}')
90 return fmt.Errorf("couldn't parse style body for %s", sel)
93 case strings.Contains(value, "italic"):
94 cssRules[sel] = Italic
95 case strings.Contains(value, "bold"):
97 case strings.Contains(value, "Consolas") || strings.Contains(value, "Courier New"):
// newlineRun matches any run of two or more consecutive newline characters.
var newlineRun = regexp.MustCompile(`\n\n+`)

// limitNewlineRuns collapses every run of two or more consecutive newlines in
// s into exactly two, so the result never contains more than one blank line
// in a row. Single newlines are left untouched.
func limitNewlineRuns(s string) string {
	return newlineRun.ReplaceAllString(s, "\n\n")
}
110 func makeHeadings(body string) string {
111 buf := new(bytes.Buffer)
112 lines := strings.Split(body, "\n")
113 for i, s := range lines {
114 if i == 0 && !isBoldTitle(s) {
115 buf.WriteString("* Introduction\n\n")
118 s = strings.TrimSpace(strings.Replace(s, "*", " ", -1))
// isBoldTitle reports whether s looks like a line made up entirely of bold
// text: it starts and ends with "*", contains no literal space (highlight
// writes spaces inside emphasised text as the marker character), and holds at
// least one character other than "*".
//
// makeHeadings uses it to recognise title lines.
func isBoldTitle(s string) bool {
	if strings.Contains(s, " ") {
		return false
	}
	if !strings.HasPrefix(s, "*") || !strings.HasSuffix(s, "*") {
		return false
	}
	// A line consisting only of asterisks ("*", "**", ...) has no title text;
	// treating it as a title would yield an empty heading.
	return strings.Trim(s, "*") != ""
}
133 func indent(buf *bytes.Buffer, s string) {
134 for _, l := range strings.Split(s, "\n") {
143 func unwrap(buf *bytes.Buffer, s string) {
145 for _, l := range strings.Split(s, "\n") {
146 l = strings.TrimSpace(l)
163 func text(n *html.Node) string {
165 walk(n, func(n *html.Node) bool {
168 buf.WriteString(n.Data)
170 case html.ElementNode:
178 case hasStyle(Code)(n):
180 case hasStyle(Bold)(n):
182 case hasStyle(Italic)(n):
190 unwrap(&buf, childText(n))
191 buf.WriteString("\n\n")
193 buf.WriteString("- ")
194 unwrap(&buf, childText(n))
197 indent(&buf, childText(n))
200 href, text := attr(n, "href"), childText(n)
201 // Skip links with no text.
202 if strings.TrimSpace(text) == "" {
205 // Don't emit empty links.
206 if strings.TrimSpace(href) == "" {
207 buf.WriteString(text)
210 // Use original url for Google Docs redirections.
211 if u, err := url.Parse(href); err != nil {
212 log.Printf("parsing url %q: %v", href, err)
213 } else if u.Host == "www.google.com" && u.Path == "/url" {
214 href = u.Query().Get("q")
216 fmt.Fprintf(&buf, "[[%s][%s]]", href, text)
218 buf.WriteString(highlight(n, "`"))
220 buf.WriteString(highlight(n, "*"))
222 buf.WriteString(highlight(n, "_"))
224 src := attr(n, "src")
225 fmt.Fprintf(&buf, ".image %s\n", src)
227 src, w, h := attr(n, "src"), attr(n, "width"), attr(n, "height")
228 fmt.Fprintf(&buf, "\n.iframe %s %s %s\n", src, h, w)
230 if attr(n, "name") == "movie" {
231 // Old style YouTube embed.
232 u := attr(n, "value")
233 u = strings.Replace(u, "/v/", "/embed/", 1)
234 if i := strings.Index(u, "&"); i >= 0 {
237 fmt.Fprintf(&buf, "\n.iframe %s 540 304\n", u)
248 func childText(node *html.Node) string {
250 for n := node.FirstChild; n != nil; n = n.NextSibling {
251 fmt.Fprint(&buf, text(n))
256 func highlight(node *html.Node, char string) string {
257 t := strings.Replace(childText(node), " ", char, -1)
258 return fmt.Sprintf("%s%s%s", char, t, char)
261 type selector func(*html.Node) bool
263 func isTag(a atom.Atom) selector {
264 return func(n *html.Node) bool {
265 return n.DataAtom == a
269 func hasClass(name string) selector {
270 return func(n *html.Node) bool {
271 for _, a := range n.Attr {
272 if a.Key == "class" {
273 for _, c := range strings.Fields(a.Val) {
284 func hasStyle(s Style) selector {
285 return func(n *html.Node) bool {
286 for rule, s2 := range cssRules {
290 if strings.HasPrefix(rule, ".") && hasClass(rule[1:])(n) {
293 if n.DataAtom.String() == rule {
301 func attr(node *html.Node, key string) (value string) {
302 for _, attr := range node.Attr {
310 func find(n *html.Node, fn selector) *html.Node {
311 var result *html.Node
312 walk(n, func(n *html.Node) bool {
325 func walk(n *html.Node, fn selector) {
327 for c := n.FirstChild; c != nil; c = c.NextSibling {