Go语言-爬虫框架Colly-官方示例

摘要

文章内容是本人基于官方文档以及源码的学习,在学习过程中记录整理。

官方示例文档

允许访问的域名配置

通过colly.AllowedDomains()方法控制可以访问的域名,不在白名单内的域名就不会发起请求了

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
package main

import (
"github.com/gocolly/colly/v2"
"log"
)

// main crawls https://c.isme.pub, following links up to two levels deep.
// Domains outside the allowlist never receive a request.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		colly.MaxDepth(2),
		// Only URLs on this domain are fetched; everything else is skipped.
		colly.AllowedDomains("c.isme.pub"),
	)

	// Log every successfully fetched page.
	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL)
	})

	// Follow every hyperlink; Visit enforces the depth limit and allowlist.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Errors such as "URL already visited" are expected here and ignored.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
}

异常处理

可以对访问失败的页面进行处理,以下情况视为异常:

  • 请求目标地址失败
  • 状态码大于202(colly 将 203 及以上的状态码视为错误)
    • response为空
  • 页面解析失败(OnHTML、OnXML)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
package main

import (
"github.com/gocolly/colly/v2"
"log"
)

// main demonstrates colly's OnError callback: any failed request (bad
// status code, empty response, transport error) is reported there.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		colly.MaxDepth(2),
		colly.AllowedDomains("c.isme.pub"),
	)

	// Invoked for every request that fails; r may carry a partial response.
	c.OnError(func(r *colly.Response, err error) {
		log.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	// Follow every hyperlink found on the page.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// "already visited" and filtered-URL errors are expected; ignore them.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
}

登录自动保存session

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
package main

import (
"github.com/gocolly/colly/v2"
"log"
)

// main logs in via POST first; colly's cookie jar keeps the session for
// all subsequent requests made by the same collector.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		colly.MaxDepth(2),
		colly.AllowedDomains("c.isme.pub"),
	)

	// The session cookie set by the login response is stored automatically.
	err := c.Post("https://c.isme.pub/login", map[string]string{"username": "admin", "password": "admin"})
	if err != nil {
		log.Fatal(err)
	}

	// Crawl links using the authenticated session.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Duplicate/filtered-URL errors are expected here; ignore them.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
}

请求深度

通过colly.MaxDepth()可以设置爬虫自动请求深度

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
package main

import (
"github.com/gocolly/colly/v2"
"log"
)

// main shows colly.MaxDepth: recursive link-following stops once the
// crawl is two levels away from the start URL.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		// Pages deeper than 2 links from the start URL are not requested.
		colly.MaxDepth(2),
		colly.AllowedDomains("c.isme.pub"),
	)

	// Log every page that was fetched successfully.
	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL)
	})

	// Follow every hyperlink; Visit enforces the depth limit itself.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Max-depth and already-visited errors are expected; ignore them.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
}

开启异步请求并设置并发度

  • 通过配置colly.Async(true)可以开启异步并发请求,但同时需要通过c.Wait()阻塞爬取任务退出。
  • 在开启并发请求后可以通过c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 4})配置设置并发数。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
package main

import (
"github.com/gocolly/colly/v2"
"log"
)

// main runs an asynchronous crawl: colly.Async(true) makes every Visit
// non-blocking, so c.Wait() is required before the program may exit.
// The LimitRule caps concurrency at 4 parallel requests per domain.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		colly.MaxDepth(2),
		colly.AllowedDomains("c.isme.pub"),
		colly.Async(true), // requests run concurrently; must call c.Wait()
	)

	// At most 4 requests in flight for any domain matching the glob.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 4,
	}); err != nil {
		log.Fatal(err)
	}

	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL)
	})

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Already-visited/filtered errors are expected here; ignore them.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
	// Block until every in-flight asynchronous request has finished.
	c.Wait()
}

代理轮询

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
package main

import (
"bytes"
"log"

"github.com/gocolly/colly"
"github.com/gocolly/colly/proxy"
)

// main rotates requests across two SOCKS5 proxies using colly's
// round-robin proxy switcher, fetching httpbin.org/ip five times to
// show the alternating source address.
func main() {
	// AllowURLRevisit lets the same URL be fetched repeatedly.
	c := colly.NewCollector(colly.AllowURLRevisit())

	// Rotate two socks5 proxies
	rp, err := proxy.RoundRobinProxySwitcher("socks5://127.0.0.1:1337", "socks5://127.0.0.1:1338")
	if err != nil {
		log.Fatal(err)
	}
	c.SetProxyFunc(rp)

	// Print the response body with newlines stripped.
	c.OnResponse(func(r *colly.Response) {
		log.Printf("%s\n", bytes.ReplaceAll(r.Body, []byte("\n"), nil))
	})

	// Fetch httpbin.org/ip five times
	for i := 0; i < 5; i++ {
		if err := c.Visit("https://httpbin.org/ip"); err != nil {
			log.Println("visit failed:", err)
		}
	}
}

队列

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
package main

import (
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/queue"
"log"
)

// main crawls via a request queue: discovered links are enqueued instead
// of visited directly, and three consumer threads drain the queue.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		colly.MaxDepth(2),
		colly.AllowedDomains("c.isme.pub"),
	)

	// 3 consumer threads, backed by in-memory storage of up to 10000 requests.
	q, err := queue.New(3, &queue.InMemoryQueueStorage{MaxSize: 10000})
	if err != nil {
		log.Fatal(err)
	}

	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL)
	})

	// Enqueue each discovered link as an absolute URL.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		if err := q.AddURL(e.Request.AbsoluteURL(link)); err != nil {
			log.Println("enqueue failed:", err)
		}
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
	// Run blocks until the queue is empty.
	if err := q.Run(c); err != nil {
		log.Fatal(err)
	}
}

设置请求延迟

通过设置RandomDelay可以让每次请求目标地址前延迟一段时间,延迟的时间为小于RandomDelay设置值的随机时间,通过设置此参数可以在一定程度上避免触发反爬机制。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
package main

import (
"github.com/gocolly/colly/v2"
"log"
"time"
)

// main demonstrates RandomDelay: every request waits a random duration
// (uniform in [0, 5s)) before being sent, which helps avoid triggering
// anti-scraping defenses.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		colly.MaxDepth(1),
		colly.AllowedDomains("c.isme.pub"),
	)

	// Random pre-request delay of up to 5 seconds for all domains.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		RandomDelay: 5 * time.Second,
	}); err != nil {
		log.Fatal(err)
	}

	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL)
	})

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Depth-limit/already-visited errors are expected here; ignore them.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
}

使用redis作为存储、队列

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
package main

import (
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/queue"
"github.com/gocolly/redisstorage"
"log"
)

// main uses Redis both as the collector's visit/cookie storage and as the
// backing store for the request queue, so crawl state survives restarts
// and can be shared between processes.
func main() {
	url := "https://c.isme.pub/"

	c := colly.NewCollector(
		colly.MaxDepth(1),
		colly.AllowedDomains("c.isme.pub"),
	)

	// create the redis storage
	storage := &redisstorage.Storage{
		Address:  "127.0.0.1:6379",
		Password: "",
		DB:       2,
		Prefix:   "isme",
	}

	// add storage to the collector
	if err := c.SetStorage(storage); err != nil {
		log.Fatal(err)
	}

	// delete previous data from storage
	if err := storage.Clear(); err != nil {
		log.Fatal(err)
	}

	// close redis client on normal return (note: not run if log.Fatal fires)
	defer storage.Client.Close()

	// create a new request queue with redis storage backend
	q, err := queue.New(2, storage)
	if err != nil {
		log.Fatal(err)
	}

	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL)
	})

	// Enqueue each discovered link as an absolute URL.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		if err := q.AddURL(e.Request.AbsoluteURL(link)); err != nil {
			log.Println("enqueue failed:", err)
		}
	})

	// add the seed URL to the queue
	if err := q.AddURL(url); err != nil {
		log.Fatal(err)
	}
	// consume requests until the queue is drained
	if err := q.Run(c); err != nil {
		log.Fatal(err)
	}
}

操作上下文

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
package main

import (
"github.com/gocolly/colly/v2"
"log"
)

// main shows request context: a value stored in OnRequest via r.Ctx.Put
// is readable later in OnResponse for the same request.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		colly.MaxDepth(2),
		colly.AllowedDomains("c.isme.pub"),
	)

	// Attach a custom value to every outgoing request's context.
	c.OnRequest(func(r *colly.Request) {
		r.Ctx.Put("custom", "isme")
	})

	// The same context travels with the response.
	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL, r.Ctx.Get("custom"))
	})

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Already-visited/filtered errors are expected here; ignore them.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
}

启动爬虫web服务

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
package main

import (
"encoding/json"
"log"
"net/http"

"github.com/gocolly/colly"
)

// pageInfo is the JSON payload produced by handler: the crawl's final
// HTTP status code plus a count of occurrences per absolute link URL.
type pageInfo struct {
	StatusCode int            // status code of the crawled page (or its error response)
	Links      map[string]int // absolute link URL -> number of times it appeared
}

// handler crawls the page named by the "url" query parameter and writes a
// JSON pageInfo (status code + link counts) to the client. A missing
// parameter yields 400; a serialization failure yields 500.
func handler(w http.ResponseWriter, r *http.Request) {
	URL := r.URL.Query().Get("url")
	if URL == "" {
		log.Println("missing URL argument")
		// Previously this returned 200 with an empty body; report the
		// client error explicitly instead.
		http.Error(w, "missing url query parameter", http.StatusBadRequest)
		return
	}
	log.Println("visiting", URL)

	c := colly.NewCollector()

	p := &pageInfo{Links: make(map[string]int)}

	// count links
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Request.AbsoluteURL(e.Attr("href"))
		if link != "" {
			p.Links[link]++
		}
	})

	// extract status code (from success or error responses alike)
	c.OnResponse(func(r *colly.Response) {
		log.Println("response received", r.StatusCode)
		p.StatusCode = r.StatusCode
	})
	c.OnError(func(r *colly.Response, err error) {
		log.Println("error:", r.StatusCode, err)
		p.StatusCode = r.StatusCode
	})

	if err := c.Visit(URL); err != nil {
		// OnError already recorded the status; just log the failure.
		log.Println("visit failed:", err)
	}

	// dump results
	b, err := json.Marshal(p)
	if err != nil {
		log.Println("failed to serialize response:", err)
		http.Error(w, "failed to serialize response", http.StatusInternalServerError)
		return
	}
	w.Header().Add("Content-Type", "application/json")
	w.Write(b)
}

// main serves the crawler API on port 7171, routing all paths to handler.
// example usage: curl -s 'http://127.0.0.1:7171/?url=http://go-colly.org/'
func main() {
	const addr = ":7171"

	http.HandleFunc("/", handler)

	log.Println("listening on", addr)
	log.Fatal(http.ListenAndServe(addr, nil))
}

设置路由过滤条件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
package main

import (
"github.com/gocolly/colly/v2"
"log"
"regexp"
)

// main restricts the crawl with colly.URLFilters: only URLs matching the
// regular expression are ever requested.
func main() {
	url := "https://c.isme.pub/page/2/"

	c := colly.NewCollector(
		colly.MaxDepth(3),
		colly.AllowedDomains("c.isme.pub"),
		colly.URLFilters(
			// Raw string avoids double-escaping the regex backslashes.
			regexp.MustCompile(`https://c\.isme\.pub/page/`),
		),
	)

	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL)
	})

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Filtered/already-visited errors are expected here; ignore them.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
}

爬取商品价格网站并将信息保存在文件中

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
package main

import (
"encoding/csv"
"log"
"os"

"github.com/gocolly/colly"
)

// main scrapes the coinmarketcap "all coins" table and writes one CSV row
// per coin (name, symbol, price, volume, market cap, 1h/24h/7d change).
func main() {
	fName := "cryptocoinmarketcap.csv"
	file, err := os.Create(fName)
	if err != nil {
		// log.Fatalf exits; the old unreachable `return` after it is removed.
		log.Fatalf("Cannot create file %q: %s\n", fName, err)
	}
	defer file.Close()
	writer := csv.NewWriter(file)
	defer writer.Flush()

	// Write CSV header
	writer.Write([]string{"Name", "Symbol", "Price (USD)", "Volume (USD)", "Market capacity (USD)", "Change (1h)", "Change (24h)", "Change (7d)"})

	// Instantiate default collector
	c := colly.NewCollector()

	// One row per table row that has a non-empty coin name.
	c.OnHTML("table tbody tr", func(e *colly.HTMLElement) {
		if e.ChildText(".cmc-table__column-name--name") != "" {
			writer.Write([]string{
				e.ChildText(".cmc-table__column-name--name"),
				e.ChildText(".cmc-table__cell--sort-by__symbol"),
				e.ChildText(".cmc-table__cell--sort-by__price"),
				e.ChildText(".cmc-table__cell--sort-by__volume-24-h"),
				e.ChildText(".cmc-table__cell--sort-by__market-cap"),
				e.ChildText(".cmc-table__cell--sort-by__percent-change-1-h"),
				e.ChildText(".cmc-table__cell--sort-by__percent-change-24-h"),
				e.ChildText(".cmc-table__cell--sort-by__percent-change-7-d"),
			})
		}
	})

	if err := c.Visit("https://coinmarketcap.com/all/views/all/"); err != nil {
		log.Fatal(err)
	}

	// csv.Writer buffers errors; surface any write failure before declaring success.
	writer.Flush()
	if err := writer.Error(); err != nil {
		log.Fatalf("CSV write failed: %s", err)
	}

	log.Printf("Scraping finished, check file %q for results\n", fName)
}

设置本地缓存目录

通过colly.CacheDir("./isme") 方法可以在指定路径下生成缓存,下次请求同一个页面时会直接读取缓存数据,而不再发起网络请求

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
package main

import (
"github.com/gocolly/colly/v2"
"log"
)

// main enables an on-disk response cache via colly.CacheDir: repeated
// requests for the same page are served from "./isme" instead of the
// network.
func main() {
	url := "https://c.isme.pub"

	c := colly.NewCollector(
		colly.MaxDepth(2),
		colly.AllowedDomains("c.isme.pub"),
		// Responses are cached under this directory and reused on re-visit.
		colly.CacheDir("./isme"),
	)

	c.OnResponse(func(r *colly.Response) {
		log.Println("Visited", r.Request.URL)
	})

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Already-visited/filtered errors are expected here; ignore them.
		e.Request.Visit(link)
	})

	if err := c.Visit(url); err != nil {
		log.Fatal(err)
	}
}