Ex 5.1
Change the findlinks program to traverse the n.FirstChild linked list using recursive calls to visit instead of a loop.
package main
import (
"fmt"
"os"
"golang.org/x/net/html"
)
// main parses an HTML document from standard input and prints every
// link discovered by visit, one per line.
func main() {
	doc, err := html.Parse(os.Stdin)
	if err != nil {
		fmt.Fprintf(os.Stderr, "findlinks1: %v\n", err)
		os.Exit(1)
	}
	links := visit(nil, doc)
	for _, l := range links {
		fmt.Println(l)
	}
}
// visit appends to links each link found in n and returns the result.
// visit appends to links each link found in the tree rooted at n and
// returns the resulting slice. Per Exercise 5.1 it recurses over both
// the child list and the sibling list instead of looping.
func visit(links []string, n *html.Node) []string {
	// Base case: recursion bottoms out at a nil child/sibling pointer.
	if n == nil {
		return links
	}
	if n.Type == html.ElementNode && n.Data == "a" {
		for _, a := range n.Attr {
			if a.Key == "href" {
				links = append(links, a.Val)
			}
		}
	}
	links = visit(links, n.FirstChild) // descend into children first
	return visit(links, n.NextSibling) // then continue with siblings
}
$ ./ex1.7 http://www.baidu.com | ./ex5.1
/
javascript:;
javascript:;
javascript:;
/
javascript:;
https://passport.baidu.com/v2/?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2F
http://news.baidu.com
http://www.hao123.com
http://map.baidu.com
http://v.baidu.com
http://tieba.baidu.com
http://xueshu.baidu.com
https://passport.baidu.com/v2/?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2F
http://www.baidu.com/gaoji/preferences.html
http://www.baidu.com/more/
http://news.baidu.com/ns?cl=2&rn=20&tn=news&word=
http://tieba.baidu.com/f?kw=&fr=wwwt
http://zhidao.baidu.com/q?ct=17&pn=0&tn=ikaslist&rn=10&word=&fr=wwwt
http://music.baidu.com/search?fr=ps&ie=utf-8&key=
http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=
http://v.baidu.com/v?ct=301989888&rn=20&pn=0&db=0&s=25&ie=utf-8&word=
http://map.baidu.com/m?word=&fr=ps01000
http://wenku.baidu.com/search?word=&lm=0&od=0&ie=utf-8
//www.baidu.com/more/
//www.baidu.com/cache/sethelp/help.html
http://home.baidu.com
http://ir.baidu.com
http://e.baidu.com/?refer=888
http://www.baidu.com/duty/
http://jianyi.baidu.com/
http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=11000002000001
Ex 5.2
Write a function to populate a mapping from element names—p, div, span, and so on—to the number of elements with that name in an HTML document tree.
package main
import (
"fmt"
"os"
"golang.org/x/net/html"
)
// main parses HTML from standard input and prints how many times each
// element name occurs in the document tree.
func main() {
	doc, err := html.Parse(os.Stdin)
	if err != nil {
		fmt.Fprintf(os.Stderr, "findlinks1: %v\n", err)
		os.Exit(1)
	}
	counts := check(make(map[string]int), doc)
	for name, n := range counts {
		fmt.Printf("%s:%d\n", name, n)
	}
}
// check records in m the number of occurrences of each element name in
// the tree rooted at n and returns m (Exercise 5.2).
func check(m map[string]int, n *html.Node) map[string]int {
	// Early return replaces the original else-after-return shape.
	if n == nil {
		return m
	}
	if n.Type == html.ElementNode {
		m[n.Data]++
	}
	m = check(m, n.FirstChild)
	return check(m, n.NextSibling)
}
$ ./ex1.7 http://www.baidu.com | ./ex5.2
body:1
img:2
map:1
area:1
a:32
i:3
head:1
title:1
style:3
ul:1
meta:4
link:11
form:1
span:5
li:4
p:3
html:1
script:13
noscript:1
div:21
input:14
b:2
Ex 5.5
Implement countWordsAndImages. (See Exercise 4.9 for word-splitting.)
package main
import (
"bufio"
"fmt"
"net/http"
"strings"
"golang.org/x/net/html"
)
// main fetches a fixed URL and reports how many words and images its
// HTML document contains.
func main() {
	url := "http://www.baidu.com"
	w, i, err := CountWordsAndImages(url)
	if err != nil {
		fmt.Println("CountWordsAndImages error: ", err)
		// Bug fix: don't fall through and print meaningless zero counts.
		return
	}
	fmt.Printf("words = %d,images = %d\n", w, i)
}
// CountWordsAndImages does an HTTP GET request for the HTML document url
// and returns the number of words and images it contains.
func CountWordsAndImages(url string) (words, images int, err error) {
	var resp *http.Response
	resp, err = http.Get(url)
	if err != nil {
		return words, images, err
	}
	var doc *html.Node
	doc, err = html.Parse(resp.Body)
	resp.Body.Close()
	if err != nil {
		err = fmt.Errorf("parsing HTML: %s", err)
		return words, images, err
	}
	words, images = countWordsAndImages(doc)
	return words, images, nil
}
// countWordsAndImages walks the tree rooted at n, counting <img>
// elements and the whitespace-separated words inside text nodes.
func countWordsAndImages(n *html.Node) (words, images int) {
	if n == nil {
		return
	}
	// A switch is the idiomatic replacement for the if/else-if chain.
	switch n.Type {
	case html.ElementNode:
		if n.Data == "img" {
			images++
		}
	case html.TextNode:
		// Split the text into words as in Exercise 4.9.
		scanner := bufio.NewScanner(strings.NewReader(n.Data))
		scanner.Split(bufio.ScanWords)
		for scanner.Scan() {
			words++
		}
	}
	w, i := countWordsAndImages(n.FirstChild)
	words, images = words+w, images+i
	w, i = countWordsAndImages(n.NextSibling)
	return words + w, images + i
}
$ go run countwi.go
words = 2805,images = 2
Ex 5.6
Modify the corner function in gopl.io/ch3/surface (§3.2) to use named results and a bare return statement.
package main
import (
"fmt"
"math"
"os"
)
// Canvas geometry and projection parameters for the SVG surface plot.
const (
width, height = 600, 320 // canvas size in pixels
cells = 100 // number of grid cells
xyrange = 30.0 // axis ranges (-xyrange..+xyrange)
xyscale = width / 2 / xyrange // pixels per x or y unit
zscale = height * 0.4 // pixels per z unit
angle = math.Pi / 6 // angle of x, y axes (=30°)
)
// Precomputed sine and cosine of the projection angle.
var sin30, cos30 = math.Sin(angle), math.Cos(angle) // sin(30°), cos(30°)
// main is intentionally a stub: Exercise 5.6 only asks for the corner
// rewrite below, so no SVG output is produced here.
func main() {
//fmt.Printf("")
}
// corner returns the 2-D canvas coordinates (sx, sy) of the corner of
// grid cell (i,j), using named results and a bare return as required
// by Exercise 5.6.
func corner(i, j int) (sx, sy float64) {
// Find point (x,y) at corner of cell (i,j).
x := xyrange * (float64(i)/cells - 0.5)
y := xyrange * (float64(j)/cells - 0.5)
// Compute surface height z.
z := f(x, y)
// Project (x,y,z) isometrically onto 2-D SVG canvas (sx,sy).
sx = width/2 + (x-y)*cos30*xyscale
sy = height/2 + (x+y)*sin30*xyscale - z*zscale
// Bare return: the named results sx and sy are returned implicitly.
return
}
// f is the surface height function sin(r)/r, where r is the distance
// from the origin. At r == 0 the expression is 0/0 (NaN), which would
// produce invalid SVG coordinates, so we return the limit value 1
// (lim r→0 sin(r)/r = 1) instead — the fix from Exercise 3.1.
func f(x, y float64) float64 {
	r := math.Hypot(x, y) // distance from (0,0)
	if r == 0 {
		return 1
	}
	return math.Sin(r) / r
}
Ex 5.7
Develop startElement and endElement into a general HTML pretty-printer. Print comment nodes, text nodes, and the attributes of each element (`<a href='...'>`). Use short forms like `<img/>` instead of `<img></img>` when an element has no children. Write a test to ensure that the output can be parsed successfully. (See Chapter 11.)
package main
import (
"fmt"
"net/http"
"os"
"golang.org/x/net/html"
)
// main pretty-prints an outline for each URL given on the command line.
func main() {
	args := os.Args[1:]
	for _, target := range args {
		outline(target)
	}
}
// outline fetches url, parses the response as HTML, and pretty-prints
// the document tree via startElement/endElement.
func outline(url string) error {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	doc, parseErr := html.Parse(resp.Body)
	if parseErr != nil {
		return parseErr
	}
	forEachNode(doc, startElement, endElement)
	return nil
}
// forEachNode calls pre(x) before visiting the children of x and
// post(x) afterwards, for every node x in the tree rooted at n.
// Either callback may be nil.
func forEachNode(n *html.Node, pre, post func(n *html.Node)) {
	if pre != nil {
		pre(n)
	}
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		forEachNode(child, pre, post)
	}
	if post != nil {
		post(n)
	}
}
// depth is the indentation level shared by startElement and endElement.
var depth int

// startElement prints an opening (or self-closing) tag for element,
// text, and comment nodes, indented two spaces per tree level.
func startElement(n *html.Node) {
	if n.Type == html.ElementNode ||
		n.Type == html.TextNode ||
		n.Type == html.CommentNode {
		// Skip nodes whose tag atom is unrecognized (e.g. whitespace-only
		// text nodes), as in the original author's filtering trick.
		if n.DataAtom != 0 {
			var attr string
			for _, v := range n.Attr {
				attr += " " + v.Key + "='" + v.Val + "'"
			}
			if n.FirstChild == nil {
				// Bug fix: the original dropped the attributes in the
				// self-closing short form.
				fmt.Printf("%*s<%s%s/>\n", depth*2, "", n.Data, attr)
			} else {
				fmt.Printf("%*s<%s%s>\n", depth*2, "", n.Data, attr)
			}
			depth++
		}
	}
}
// endElement prints the matching closing tag for nodes that printed a
// non-self-closing opening tag in startElement.
func endElement(n *html.Node) {
	if n.Type == html.ElementNode ||
		n.Type == html.TextNode ||
		n.Type == html.CommentNode {
		if n.DataAtom != 0 {
			depth--
			if n.FirstChild != nil {
				// Bug fix: emit a real closing tag "</name>"; the original
				// printed "name>", which no HTML parser would accept.
				fmt.Printf("%*s</%s>\n", depth*2, "", n.Data)
			}
		}
	}
}
$ ./ex5.7 http://gopl.io
- When traversing TextNodes, many of them are empty (really just whitespace that the parser still treats as a node); perhaps such non-nodes should be discarded at the parse stage. In this exercise I temporarily borrow the element-node field
DataAtom atom.Atom
— if it is 0, the node's tag could not be recognized.
// A Node consists of a NodeType and some Data (tag name for element nodes, // content for text) and are part of a tree of Nodes. Element nodes may also // have a Namespace and contain a slice of Attributes. Data is unescaped, so // that it looks like "a&lt;b" rather than "a&amp;lt;b".
Ex 5.8
Modify forEachNode so that the pre and post functions return a boolean result indicating whether to continue the traversal. Use it to write a function ElementByID with the following signature that finds the first HTML element with the specified id attribute. The function should stop the traversal as soon as a match is found.
func ElementByID(doc *html.Node, id string) *html.Node
package main
import (
"fmt"
"log"
"net/http"
"os"
"golang.org/x/net/html"
)
// custID is the id attribute value that getElementByID searches for.
var custID = "css_index_result"
//var custID = "toc"
// main runs the ElementByID search against every URL argument.
func main() {
	for _, arg := range os.Args[1:] {
		outline(arg)
	}
}
// outline fetches url, parses the HTML, and prints the first node whose
// id attribute equals custID (nil if there is none).
func outline(url string) error {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	doc, err := html.Parse(resp.Body)
	if err != nil {
		return err
	}
	//forEachNode(doc, "", startElement, endElement)
	match := getElementByID(doc, custID)
	fmt.Printf("%#v:\n", match)
	return nil
}
// forEachNode calls pre(x) and post(x) for each node x in the tree
// rooted at n. A callback returning false stops the traversal, and the
// node that stopped it is returned (nil if the walk completed).
func forEachNode(n *html.Node, id string, pre, post func(n *html.Node, id string) bool) *html.Node {
	var rtn *html.Node
	if pre != nil {
		if !pre(n, id) {
			rtn = n
		}
	}
	for c := n.FirstChild; c != nil && rtn == nil; c = c.NextSibling {
		rtn = forEachNode(c, id, pre, post)
	}
	if post != nil {
		// post is always invoked so its side effects (depth bookkeeping)
		// stay correct. Bug fix: but once a match was found deeper in the
		// tree, an ancestor's post returning false must not overwrite it.
		if !post(n, id) && rtn == nil {
			rtn = n
		}
	}
	return rtn
}
// depth is the indentation level shared by startElement and endElement.
var depth int

// startElement pretty-prints an opening tag for the node and reports
// whether the traversal should continue: it returns false as soon as an
// element carrying the target id attribute is found.
func startElement(n *html.Node, id string) bool {
	if n.Type == html.ElementNode ||
		n.Type == html.TextNode ||
		n.Type == html.CommentNode {
		// Skip nodes whose tag atom is unrecognized, such as
		// whitespace-only text nodes.
		if n.DataAtom != 0 {
			var attr string
			for _, v := range n.Attr {
				attr += " " + v.Key + "='" + v.Val + "'"
			}
			if n.FirstChild == nil {
				// Bug fix: keep the attributes in the self-closing form.
				fmt.Printf("%*s<%s%s/>\n", depth*2, "", n.Data, attr)
			} else {
				fmt.Printf("%*s<%s%s>\n", depth*2, "", n.Data, attr)
			}
			depth++
		}
		if n.Type == html.ElementNode {
			for _, v := range n.Attr {
				if v.Key == "id" && v.Val == id {
					log.Printf("%#v\n", n)
					return false // match found: stop the traversal
				}
			}
		}
	}
	return true
}
// endElement prints the closing tag for nodes that printed a
// non-self-closing opening tag; it always lets the traversal continue.
func endElement(n *html.Node, id string) bool {
	if n.Type == html.ElementNode ||
		n.Type == html.TextNode ||
		n.Type == html.CommentNode {
		if n.DataAtom != 0 {
			depth--
			if n.FirstChild != nil {
				// Bug fix: emit "</name>" rather than the malformed "name>".
				fmt.Printf("%*s</%s>\n", depth*2, "", n.Data)
			}
		}
	}
	return true
}
// getElementByID returns the first node in doc whose id attribute
// equals id, or nil if there is none (Exercise 5.8's ElementByID).
func getElementByID(doc *html.Node, id string) *html.Node {
return forEachNode(doc, id, startElement, endElement)
}
$ ./ex5.7 http://www.baidu.com
&html.Node{Parent:(*html.Node)(0xc4201322a0), FirstChild:(*html.Node)(0xc4201336c0), LastChild:(*html.Node)(0xc4201336c0), PrevSibling:(*html.Node)(0xc4201335e0), NextSibling:(*html.Node)(0xc420133730), Type:0x3, DataAtom:0x6f905, Data:"style", Namespace:"", Attr:[]html.Attribute{html.Attribute{Namespace:"", Key:"data-for", Val:"result"}, html.Attribute{Namespace:"", Key:"id", Val:"css_index_result"}, html.Attribute{Namespace:"", Key:"type", Val:"text/css"}}}:
Ex 5.12
The startElement and endElement functions in gopl.io/ch5/outline2 (§5.5) share a global variable, depth. Turn them into anonymous functions that share a variable local to the outline function.
// Copyright © 2016 Alan A. A. Donovan & Brian W. Kernighan.
// License: https://creativecommons.org/licenses/by-nc-sa/4.0/
// See page 133.
// Outline prints the outline of an HTML document tree.
package main
import (
"fmt"
"net/http"
"os"
"golang.org/x/net/html"
)
// main prints the outline of each URL named on the command line.
func main() {
	for _, target := range os.Args[1:] {
		outline(target)
	}
}
// outline fetches url and prints its HTML element tree as indented
// open/close tags. The indentation depth is a local variable captured
// by the two anonymous callbacks (Exercise 5.12) instead of a global.
func outline(url string) error {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	doc, err := html.Parse(resp.Body)
	if err != nil {
		return err
	}
	var depth int // shared by the two closures below
	startElement := func(n *html.Node) {
		if n.Type == html.ElementNode {
			fmt.Printf("%*s<%s>\n", depth*2, "", n.Data)
			depth++
		}
	}
	endElement := func(n *html.Node) {
		if n.Type == html.ElementNode {
			depth--
			// Bug fix: print a proper closing tag "</name>"; the original
			// printed the malformed "name>".
			fmt.Printf("%*s</%s>\n", depth*2, "", n.Data)
		}
	}
	//!+call
	forEachNode(doc, startElement, endElement)
	//!-call
	return nil
}
//!+forEachNode

// forEachNode invokes pre(x) before visiting the children of x and
// post(x) afterwards, for every node x in the tree rooted at n.
// Either function may be nil.
func forEachNode(n *html.Node, pre, post func(n *html.Node)) {
	if pre != nil {
		pre(n)
	}
	for kid := n.FirstChild; kid != nil; kid = kid.NextSibling {
		forEachNode(kid, pre, post)
	}
	if post != nil {
		post(n)
	}
}
Ex 5.13
Modify crawl to make local copies of the pages it finds, creating directories as necessary. Don’t make copies of pages that come from a different domain. For example, if the original page comes from golang.org, save all files from there, but exclude ones from vimeo.com.
package main
import (
"fmt"
"io"
"log"
"net/http"
"net/url"
"os"
"strconv"
"strings"
"time"
"golang.org/x/net/html"
"gopl.io/ch5/links"
)
// Extract makes an HTTP GET request to the specified URL, parses
// the response as HTML, and returns the links in the HTML document.
func Extract(urlstr string) ([]string, error) {
	resp, err := http.Get(urlstr)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode != http.StatusOK {
		resp.Body.Close()
		return nil, fmt.Errorf("getting %s: %s", urlstr, resp.Status)
	}
	doc, err := html.Parse(resp.Body)
	resp.Body.Close()
	if err != nil {
		return nil, fmt.Errorf("parsing %s as HTML: %v", urlstr, err)
	}
	var links []string
	collect := func(n *html.Node) {
		if n.Type != html.ElementNode || n.Data != "a" {
			return
		}
		for _, a := range n.Attr {
			if a.Key != "href" {
				continue
			}
			// Resolve the href relative to the request URL; bad URLs
			// are silently ignored.
			if link, err := resp.Request.URL.Parse(a.Val); err == nil {
				links = append(links, link.String())
			}
		}
	}
	forEachNode(doc, collect, nil)
	return links, nil
}
//!-Extract
// Copied from gopl.io/ch5/outline2.
// forEachNode applies pre before and post after visiting each node's
// children; nil callbacks are skipped.
func forEachNode(n *html.Node, pre, post func(n *html.Node)) {
	if pre != nil {
		pre(n)
	}
	for c := n.FirstChild; c != nil; {
		forEachNode(c, pre, post)
		c = c.NextSibling
	}
	if post != nil {
		post(n)
	}
}
// breadthFirst calls f for each item in the worklist in breadth-first
// order. Items returned by f are appended to the worklist; each item is
// processed at most once. The hosts of the initial worklist URLs are
// passed through to f so it can restrict work to the original domains.
func breadthFirst(f func(item string, host []string) []string, worklist []string) {
	visited := make(map[string]bool)
	hosts := make([]string, len(worklist))
	for i, raw := range worklist {
		parsed, err := url.Parse(raw)
		if err != nil {
			fmt.Println("initial url parse failed")
			return
		}
		hosts[i] = parsed.Host
	}
	for len(worklist) > 0 {
		batch := worklist
		worklist = nil
		for _, item := range batch {
			if visited[item] {
				continue
			}
			visited[item] = true
			worklist = append(worklist, f(item, hosts)...)
		}
	}
}
//!-breadthFirst
//!+crawl
// crawl prints urlstr, saves a local copy of the page when its host is
// in sl (the hosts of the original command-line URLs), and returns the
// links the page contains. Extraction errors are logged, not fatal.
func crawl(urlstr string, sl []string) []string {
fmt.Println(urlstr)
copycontent(urlstr, sl)
list, err := links.Extract(urlstr)
if err != nil {
log.Print(err)
}
return list
}
// copycontent saves a local copy of the page at s, but only when its
// host is one of the hosts in sl — pages from other domains are skipped
// (Exercise 5.13).
func copycontent(s string, sl []string) {
	u, err := url.Parse(s)
	if err != nil {
		fmt.Println("url parse failed")
		return
	}
	for _, v := range sl {
		if u.Host == v {
			savePage(s, u)
		}
	}
}

// savePage downloads s and writes it beneath <cwd>/<host><dirpart>,
// deriving the directory and file name from the URL path; paths with no
// file component get a timestamped ".html" name. Resources are released
// via defer, fixing the original's leaks: the created file was never
// closed, and the response body leaked on the create/copy error paths.
func savePage(s string, u *url.URL) {
	resp, err := http.Get(s)
	if err != nil {
		fmt.Println(err)
		return
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return
	}
	dir, _ := os.Getwd() // current working directory
	var filename, filepart, dirpart string
	ns := strings.Count(u.Path, "/")
	if ns == 0 || ns == 1 && len(u.Path) == 1 {
		// Root page ("" or "/"): invent a timestamped file name.
		filepart = ""
		filename = strconv.FormatInt(time.Now().Unix(), 10) + ".html"
		dirpart = "/"
	} else {
		filepart = u.Path[strings.LastIndex(u.Path, "/")+1:]
		dirpart = u.Path[:strings.LastIndex(u.Path, "/")+1]
		if strings.Contains(filepart, ".") {
			filename = filepart
		} else {
			// The path names a directory: store a timestamped file in it.
			dirpart = u.Path
			filename = strconv.FormatInt(time.Now().Unix(), 10) + ".html"
		}
	}
	fullpath := dir + "/" + u.Host + dirpart
	if _, err := os.Stat(fullpath); err != nil {
		// Create the directory tree on first use.
		if err := os.MkdirAll(fullpath, os.ModePerm); err != nil {
			fmt.Println("create folder failed! ", fullpath, err)
			return
		}
	}
	filename = dir + "/" + u.Host + dirpart + "/" + filename
	f, err := os.Create(filename)
	if err != nil {
		fmt.Println("create file error:", err, s)
		return
	}
	defer f.Close() // fix: the original never closed the created file
	if _, err := io.Copy(f, resp.Body); err != nil {
		fmt.Println("failed in copy")
		return
	}
}
// main crawls the web breadth-first starting from the command-line
// URLs, saving local copies of pages served from the same hosts.
func main() {
breadthFirst(crawl, os.Args[1:])
}