用Golang写爬虫(六) – 使用colly

❯ go get -u github.com/gocolly/colly/...
package main

import (
	"fmt"

	"github.com/gocolly/colly"
)

// main crawls the Douban Top 250 movie list, following pagination links,
// and prints a message at each stage of the request lifecycle.
func main() {
	c := colly.NewCollector(
		// Use the Googlebot UA string so Douban serves the page content.
		colly.UserAgent("Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"),
	)

	// OnRequest: called before a request is sent.
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	// OnError: called when an error occurs during the request.
	c.OnError(func(_ *colly.Response, err error) {
		fmt.Println("Something went wrong:", err)
	})

	// OnResponse: called after a response is received.
	c.OnResponse(func(r *colly.Response) {
		fmt.Println("Visited", r.Request.URL)
	})

	// OnHTML: follow every pagination link so all pages get visited.
	c.OnHTML(".paginator a", func(e *colly.HTMLElement) {
		e.Request.Visit(e.Attr("href"))
	})

	// OnScraped: called after the OnHTML/OnXML callbacks have finished.
	c.OnScraped(func(r *colly.Response) {
		fmt.Println("Finished", r.Request.URL)
	})

	// Kick off the crawl from the first page; report a startup failure.
	if err := c.Visit("https://movie.douban.com/top250?start=0&filter="); err != nil {
		fmt.Println("Something went wrong:", err)
	}
}
❯ go run colly/doubanCrawler1.go
Visiting https://movie.douban.com/top250?start=0&filter=
Visited https://movie.douban.com/top250?start=0&filter=
Visiting https://movie.douban.com/top250?start=25&filter=
Visited https://movie.douban.com/top250?start=25&filter=

Finished https://movie.douban.com/top250?start=25&filter=
Finished https://movie.douban.com/top250?start=0&filter=
1. OnRequest:请求前调用。
2. OnError:请求过程中发生错误时调用。
3. OnResponse:收到响应后调用。
4. OnHTML:如果收到的响应内容是 HTML,调用它。
5. OnXML:如果收到的响应内容是 XML,调用它。写爬虫基本用不到,所以上面我没有使用它。
6. OnScraped:在 OnXML/OnHTML 回调完成后调用。不过官网写的是 Called after OnXML callbacks,实际上对于 OnHTML 也有效,大家可以注意一下。

  1. ….

package main

import (
	"log"
	"strings"

	"github.com/gocolly/colly"
)

// main crawls the Douban Top 250 list asynchronously, printing each movie's
// ID and title, with parallelism limited for douban domains.
func main() {
	c := colly.NewCollector(
		// Async(true) makes Visit non-blocking; pair it with c.Wait().
		colly.Async(true),
		colly.UserAgent("Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"),
	)

	// Cap concurrent requests to douban hosts at 5.
	if err := c.Limit(&colly.LimitRule{DomainGlob: "*.douban.*", Parallelism: 5}); err != nil {
		log.Println("Something went wrong:", err)
	}

	c.OnRequest(func(r *colly.Request) {
		log.Println("Visiting", r.URL)
	})

	c.OnError(func(_ *colly.Response, err error) {
		log.Println("Something went wrong:", err)
	})

	// Each ".hd" element holds one movie entry: extract the numeric movie ID
	// (the 5th path segment of the detail URL) and the first title span.
	c.OnHTML(".hd", func(e *colly.HTMLElement) {
		log.Println(strings.Split(e.ChildAttr("a", "href"), "/")[4],
			strings.TrimSpace(e.DOM.Find("span.title").Eq(0).Text()))
	})

	// Follow pagination links to reach every page of the list.
	c.OnHTML(".paginator a", func(e *colly.HTMLElement) {
		e.Request.Visit(e.Attr("href"))
	})

	if err := c.Visit("https://movie.douban.com/top250?start=0&filter="); err != nil {
		log.Println("Something went wrong:", err)
	}
	// Block until all asynchronous requests have completed.
	c.Wait()
}
c.OnHTML(“.hd”, func(e *colly.HTMLElement) {
log.Println(strings.Split(e.ChildAttr(“a”, “href”), “/”)[4],
strings.TrimSpace(e.DOM.Find(“span.title”).Eq(0).Text()))
})
import “github.com/antchfx/htmlquery”
c.OnResponse(func(r *colly.Response) {
doc, err := htmlquery.Parse(strings.NewReader(string(r.Body)))
if err != nil {
log.Fatal(err)
}
nodes := htmlquery.Find(doc, `//ol[@class=”grid_view”]/li//div[@class=”hd”]`)
for _, node := range nodes {
url := htmlquery.FindOne(node, “./a/@href”)
title := htmlquery.FindOne(node, `.//span[@class=”title”]/text()`)
log.Println(strings.Split(htmlquery.InnerText(url), “/”)[4],
htmlquery.InnerText(title))
}
})
用Golang写爬虫(五) – 使用XPath[1]
这个地址[2]
最多200字,当前共字