Go语言-爬虫框架Colly-官方示例

摘要

文章内容是本人基于官方文档以及源码的学习,在学习过程中记录整理。

官方示例文档

允许访问的域名配置

通过colly.AllowedDomains()方法控制可以访问的域名,不在白名单内的域名就不会发起请求了

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
package main

import (
"github.com/gocolly/colly/v2"
"log"
)

// main crawls https://c.isme.pub, following links up to two levels deep.
// Domains outside the allowlist never receive a request.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		colly.MaxDepth(2),
		// Only URLs on this domain are fetched; everything else is skipped.
		colly.AllowedDomains("c.isme.pub"),
	)

	// Log every successfully fetched page.
	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL)
	})

	// Follow every hyperlink; Visit enforces the depth limit and allowlist.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Errors such as "URL already visited" are expected here and ignored.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
}

异常处理

可以对访问失败的页面进行处理,以下情况视为异常:

  • 请求目标地址失败
  • 状态码大于202(colly 将 203 及以上的状态码视为错误)
    • response为空
  • 页面解析失败(OnHTML、OnXML)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
package main

import (
"github.com/gocolly/colly/v2"
"log"
)

// main demonstrates colly's OnError callback: any failed request (bad
// status code, empty response, transport error) is reported there.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		colly.MaxDepth(2),
		colly.AllowedDomains("c.isme.pub"),
	)

	// Invoked for every request that fails; r may carry a partial response.
	c.OnError(func(r *colly.Response, err error) {
		log.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	// Follow every hyperlink found on the page.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// "already visited" and filtered-URL errors are expected; ignore them.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
}

登录自动保存session

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
package main

import (
"github.com/gocolly/colly/v2"
"log"
)

// main logs in via POST first; colly's cookie jar keeps the session for
// all subsequent requests made by the same collector.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		colly.MaxDepth(2),
		colly.AllowedDomains("c.isme.pub"),
	)

	// The session cookie set by the login response is stored automatically.
	err := c.Post("https://c.isme.pub/login", map[string]string{"username": "admin", "password": "admin"})
	if err != nil {
		log.Fatal(err)
	}

	// Crawl links using the authenticated session.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Duplicate/filtered-URL errors are expected here; ignore them.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
}

请求深度

通过colly.MaxDepth()可以设置爬虫自动请求深度

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
package main

import (
"github.com/gocolly/colly/v2"
"log"
)

// main shows colly.MaxDepth: recursive link-following stops once the
// crawl is two levels away from the start URL.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		// Pages deeper than 2 links from the start URL are not requested.
		colly.MaxDepth(2),
		colly.AllowedDomains("c.isme.pub"),
	)

	// Log every page that was fetched successfully.
	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL)
	})

	// Follow every hyperlink; Visit enforces the depth limit itself.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Max-depth and already-visited errors are expected; ignore them.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
}

开启异步请求并设置并发度

  • 通过配置colly.Async(true)可以开启异步并发请求,但同时需要通过c.Wait()阻塞爬取任务退出。
  • 在开启并发请求后可以通过c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 4})配置设置并发数。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
package main

import (
"github.com/gocolly/colly/v2"
"log"
)

// main runs an asynchronous crawl: colly.Async(true) makes every Visit
// non-blocking, so c.Wait() is required before the program may exit.
// The LimitRule caps concurrency at 4 parallel requests per domain.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		colly.MaxDepth(2),
		colly.AllowedDomains("c.isme.pub"),
		colly.Async(true), // requests run concurrently; must call c.Wait()
	)

	// At most 4 requests in flight for any domain matching the glob.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 4,
	}); err != nil {
		log.Fatal(err)
	}

	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL)
	})

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Already-visited/filtered errors are expected here; ignore them.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
	// Block until every in-flight asynchronous request has finished.
	c.Wait()
}

代理轮询

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
package main

import (
"bytes"
"log"

"github.com/gocolly/colly"
"github.com/gocolly/colly/proxy"
)

// main rotates requests across two SOCKS5 proxies using colly's
// round-robin proxy switcher, fetching httpbin.org/ip five times to
// show the alternating source address.
func main() {
	// AllowURLRevisit lets the same URL be fetched repeatedly.
	c := colly.NewCollector(colly.AllowURLRevisit())

	// Rotate two socks5 proxies
	rp, err := proxy.RoundRobinProxySwitcher("socks5://127.0.0.1:1337", "socks5://127.0.0.1:1338")
	if err != nil {
		log.Fatal(err)
	}
	c.SetProxyFunc(rp)

	// Print the response body with newlines stripped.
	c.OnResponse(func(r *colly.Response) {
		log.Printf("%s\n", bytes.ReplaceAll(r.Body, []byte("\n"), nil))
	})

	// Fetch httpbin.org/ip five times
	for i := 0; i < 5; i++ {
		if err := c.Visit("https://httpbin.org/ip"); err != nil {
			log.Println("visit failed:", err)
		}
	}
}

队列

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
package main

import (
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/queue"
"log"
)

// main crawls via a request queue: discovered links are enqueued instead
// of visited directly, and three consumer threads drain the queue.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		colly.MaxDepth(2),
		colly.AllowedDomains("c.isme.pub"),
	)

	// 3 consumer threads, backed by in-memory storage of up to 10000 requests.
	q, err := queue.New(3, &queue.InMemoryQueueStorage{MaxSize: 10000})
	if err != nil {
		log.Fatal(err)
	}

	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL)
	})

	// Enqueue each discovered link as an absolute URL.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		if err := q.AddURL(e.Request.AbsoluteURL(link)); err != nil {
			log.Println("enqueue failed:", err)
		}
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
	// Run blocks until the queue is empty.
	if err := q.Run(c); err != nil {
		log.Fatal(err)
	}
}

设置请求延迟

通过设置RandomDelay可以让每次请求目标地址前延迟一段时间,延迟的时间为小于RandomDelay设置值的随机时间,通过设置此参数可以在一定程度上避免触发反爬机制。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
package main

import (
"github.com/gocolly/colly/v2"
"log"
"time"
)

// main demonstrates RandomDelay: every request waits a random duration
// (uniform in [0, 5s)) before being sent, which helps avoid triggering
// anti-scraping defenses.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		colly.MaxDepth(1),
		colly.AllowedDomains("c.isme.pub"),
	)

	// Random pre-request delay of up to 5 seconds for all domains.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		RandomDelay: 5 * time.Second,
	}); err != nil {
		log.Fatal(err)
	}

	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL)
	})

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Depth-limit/already-visited errors are expected here; ignore them.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
}

使用redis作为存储、队列

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
package main

import (
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/queue"
"github.com/gocolly/redisstorage"
"log"
)

// main uses Redis both as the collector's visit/cookie storage and as the
// backing store for the request queue, so crawl state survives restarts
// and can be shared between processes.
func main() {
	url := "https://c.isme.pub/"

	c := colly.NewCollector(
		colly.MaxDepth(1),
		colly.AllowedDomains("c.isme.pub"),
	)

	// create the redis storage
	storage := &redisstorage.Storage{
		Address:  "127.0.0.1:6379",
		Password: "",
		DB:       2,
		Prefix:   "isme",
	}

	// add storage to the collector
	if err := c.SetStorage(storage); err != nil {
		log.Fatal(err)
	}

	// delete previous data from storage
	if err := storage.Clear(); err != nil {
		log.Fatal(err)
	}

	// close redis client on normal return (note: not run if log.Fatal fires)
	defer storage.Client.Close()

	// create a new request queue with redis storage backend
	q, err := queue.New(2, storage)
	if err != nil {
		log.Fatal(err)
	}

	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL)
	})

	// Enqueue each discovered link as an absolute URL.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		if err := q.AddURL(e.Request.AbsoluteURL(link)); err != nil {
			log.Println("enqueue failed:", err)
		}
	})

	// add the seed URL to the queue
	if err := q.AddURL(url); err != nil {
		log.Fatal(err)
	}
	// consume requests until the queue is drained
	if err := q.Run(c); err != nil {
		log.Fatal(err)
	}
}

操作上下文

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
package main

import (
"github.com/gocolly/colly/v2"
"log"
)

// main shows request context: a value stored in OnRequest via r.Ctx.Put
// is readable later in OnResponse for the same request.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		colly.MaxDepth(2),
		colly.AllowedDomains("c.isme.pub"),
	)

	// Attach a custom value to every outgoing request's context.
	c.OnRequest(func(r *colly.Request) {
		r.Ctx.Put("custom", "isme")
	})

	// The same context travels with the response.
	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL, r.Ctx.Get("custom"))
	})

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Already-visited/filtered errors are expected here; ignore them.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
}

启动爬虫web服务

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
package main

import (
"encoding/json"
"log"
"net/http"

"github.com/gocolly/colly"
)

// pageInfo is the JSON payload produced by handler: the crawl's final
// HTTP status code plus a count of occurrences per absolute link URL.
type pageInfo struct {
	StatusCode int            // status code of the crawled page (or its error response)
	Links      map[string]int // absolute link URL -> number of times it appeared
}

// handler crawls the page named by the "url" query parameter and writes a
// JSON pageInfo (status code + link counts) to the client. A missing
// parameter yields 400; a serialization failure yields 500.
func handler(w http.ResponseWriter, r *http.Request) {
	URL := r.URL.Query().Get("url")
	if URL == "" {
		log.Println("missing URL argument")
		// Previously this returned 200 with an empty body; report the
		// client error explicitly instead.
		http.Error(w, "missing url query parameter", http.StatusBadRequest)
		return
	}
	log.Println("visiting", URL)

	c := colly.NewCollector()

	p := &pageInfo{Links: make(map[string]int)}

	// count links
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Request.AbsoluteURL(e.Attr("href"))
		if link != "" {
			p.Links[link]++
		}
	})

	// extract status code (from success or error responses alike)
	c.OnResponse(func(r *colly.Response) {
		log.Println("response received", r.StatusCode)
		p.StatusCode = r.StatusCode
	})
	c.OnError(func(r *colly.Response, err error) {
		log.Println("error:", r.StatusCode, err)
		p.StatusCode = r.StatusCode
	})

	if err := c.Visit(URL); err != nil {
		// OnError already recorded the status; just log the failure.
		log.Println("visit failed:", err)
	}

	// dump results
	b, err := json.Marshal(p)
	if err != nil {
		log.Println("failed to serialize response:", err)
		http.Error(w, "failed to serialize response", http.StatusInternalServerError)
		return
	}
	w.Header().Add("Content-Type", "application/json")
	w.Write(b)
}

// main serves the crawler API on port 7171, routing all paths to handler.
// example usage: curl -s 'http://127.0.0.1:7171/?url=http://go-colly.org/'
func main() {
	const addr = ":7171"

	http.HandleFunc("/", handler)

	log.Println("listening on", addr)
	log.Fatal(http.ListenAndServe(addr, nil))
}

设置路由过滤条件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
package main

import (
"github.com/gocolly/colly/v2"
"log"
"regexp"
)

// main restricts the crawl with colly.URLFilters: only URLs matching the
// regular expression are ever requested.
func main() {
	url := "https://c.isme.pub/page/2/"

	c := colly.NewCollector(
		colly.MaxDepth(3),
		colly.AllowedDomains("c.isme.pub"),
		colly.URLFilters(
			// Raw string avoids double-escaping the regex backslashes.
			regexp.MustCompile(`https://c\.isme\.pub/page/`),
		),
	)

	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL)
	})

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Filtered/already-visited errors are expected here; ignore them.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
}

爬取商品价格网站并将信息保存在文件中

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
package main

import (
"encoding/csv"
"log"
"os"

"github.com/gocolly/colly"
)

// main scrapes the coinmarketcap "all coins" table and writes one CSV row
// per coin (name, symbol, price, volume, market cap, 1h/24h/7d change).
func main() {
	fName := "cryptocoinmarketcap.csv"
	file, err := os.Create(fName)
	if err != nil {
		// log.Fatalf exits; the old unreachable `return` after it is removed.
		log.Fatalf("Cannot create file %q: %s\n", fName, err)
	}
	defer file.Close()
	writer := csv.NewWriter(file)
	defer writer.Flush()

	// Write CSV header
	writer.Write([]string{"Name", "Symbol", "Price (USD)", "Volume (USD)", "Market capacity (USD)", "Change (1h)", "Change (24h)", "Change (7d)"})

	// Instantiate default collector
	c := colly.NewCollector()

	// One row per table row that has a non-empty coin name.
	c.OnHTML("table tbody tr", func(e *colly.HTMLElement) {
		if e.ChildText(".cmc-table__column-name--name") != "" {
			writer.Write([]string{
				e.ChildText(".cmc-table__column-name--name"),
				e.ChildText(".cmc-table__cell--sort-by__symbol"),
				e.ChildText(".cmc-table__cell--sort-by__price"),
				e.ChildText(".cmc-table__cell--sort-by__volume-24-h"),
				e.ChildText(".cmc-table__cell--sort-by__market-cap"),
				e.ChildText(".cmc-table__cell--sort-by__percent-change-1-h"),
				e.ChildText(".cmc-table__cell--sort-by__percent-change-24-h"),
				e.ChildText(".cmc-table__cell--sort-by__percent-change-7-d"),
			})
		}
	})

	if err := c.Visit("https://coinmarketcap.com/all/views/all/"); err != nil {
		log.Fatal(err)
	}

	// csv.Writer buffers errors; surface any write failure before declaring success.
	writer.Flush()
	if err := writer.Error(); err != nil {
		log.Fatalf("CSV write failed: %s", err)
	}

	log.Printf("Scraping finished, check file %q for results\n", fName)
}

设置本地缓存目录

通过colly.CacheDir("./isme") 方法可以在指定路径下生成缓存,下次请求同一个页面时会直接读取缓存数据,而不再发起网络请求

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
package main

import (
"github.com/gocolly/colly/v2"
"log"
)

// main enables an on-disk response cache via colly.CacheDir: repeated
// requests for the same page are served from "./isme" instead of the
// network.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		colly.MaxDepth(2),
		colly.AllowedDomains("c.isme.pub"),
		// Responses are cached under this directory and reused on re-visit.
		colly.CacheDir("./isme"),
	)

	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL)
	})

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Already-visited/filtered errors are expected here; ignore them.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
}