Colly

Fast and Elegant Scraping Framework for Gophers

Colly provides a clean interface to write any kind of crawler/scraper/spider.

With Colly you can easily extract structured data from websites, which can be used for a wide range of applications, like data mining, data processing or archiving.

Install

1	`go get -u github.com/gocolly/colly/...`

Getting started

1	`import "github.com/gocolly/colly"`

Collector

Colly’s main entity.

Manages the network communication and is responsible for executing the attached callbacks while a collector job is running.

To work with colly, you have to initialize a Collector:

1	`c := colly.NewCollector()`

Callbacks

Attach different types of callback functions to a Collector to control a collecting job or retrieve information.

Add callbacks to a Collector

// Register a request callback; it runs every time a request is about to be sent.
c.OnRequest(func(r *colly.Request) {
    fmt.Println("Visiting", r.URL)
})

// Register an error callback; it runs when an error occurs while a request is being made.
c.OnError(func(_ *colly.Response, err error) {
    log.Println("Something went wrong:", err)
})
// Register a response callback; it runs every time a response is received.
c.OnResponse(func(r *colly.Response) {
    fmt.Println("Visited", r.Request.URL)
})
// Register an HTML callback; it runs for every <a> element that has an href
// attribute and visits the link target.
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
    e.Request.Visit(e.Attr("href"))
})
// OnHTML takes a CSS selector; this one matches the first <td> of each table row.
c.OnHTML("tr td:nth-of-type(1)", func(e *colly.HTMLElement) {
    fmt.Println("First column of a table row:", e.Text)
})
// Called after OnHTML when the received content is HTML or XML (optional).
// The query "//h1" is an XPath expression rather than a CSS selector.
c.OnXML("//h1", func(e *colly.XMLElement) {
    fmt.Println(e.Text)
})
// Called after the OnXML callbacks (optional).
c.OnScraped(func(r *colly.Response) {
    fmt.Println("Finished", r.Request.URL)
})

Hands-on Example

豆瓣

package main

import (
	"fmt"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly"
	"github.com/gocolly/colly/extensions"
)

// main crawls the Douban Top 250 movie list and prints, for every movie it
// finds, a running number, the title, the director/cast line and the one-line
// quote, followed by the total elapsed time.
func main() {
	t := time.Now()

	// The collector runs asynchronously, so the OnHTML callbacks of different
	// pages may execute concurrently. mu guards number and keeps each
	// print-and-increment step atomic, so no index is skipped or duplicated.
	var mu sync.Mutex
	number := 1

	// Create the collector.
	c := colly.NewCollector(func(c *colly.Collector) {
		extensions.RandomUserAgent(c) // send a random User-Agent header
		c.Async = true
	},
		// URL filter: only URLs of the form
		// https://movie.douban.com/top250?start=<digit...>&filter= are visited.
		colly.URLFilters(
			regexp.MustCompile("^(https://movie\\.douban\\.com/top250)\\?start=[0-9].*&filter="),
		),
	)

	// For HTML responses, follow the links found in the page.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		//fmt.Printf("find link: %s\n", e.Request.AbsoluteURL(link))

		// Most links on a page do not match the URL filter or were already
		// visited; Visit reports those as errors, which are expected here and
		// therefore deliberately ignored.
		_ = c.Visit(e.Request.AbsoluteURL(link))
	})

	// Extract the movie information from every div.info element.
	c.OnHTML("div.info", func(e *colly.HTMLElement) {
		e.DOM.Each(func(_ int, selection *goquery.Selection) {
			movies := selection.Find("span.title").First().Text()
			// Fields + Join collapses the line breaks and runs of whitespace
			// in the paragraph into single spaces.
			director := strings.Join(strings.Fields(selection.Find("div.bd p").First().Text()), " ")
			quote := selection.Find("p.quote span.inq").Text()

			mu.Lock()
			defer mu.Unlock()
			fmt.Printf("%d --> %s:%s %s\n", number, movies, director, quote)
			number++
		})
	})

	// Print any error reported for a request.
	c.OnError(func(_ *colly.Response, err error) {
		fmt.Println(err)
	})

	// Finally, call c.Visit to start with the first page. A failure here means
	// the crawl never starts, so report it instead of dropping it silently.
	if err := c.Visit("https://movie.douban.com/top250?start=0&filter="); err != nil {
		fmt.Println(err)
	}
	// Wait for the asynchronous requests to finish before reporting the time.
	c.Wait()
	fmt.Printf("花费时间:%s", time.Since(t))
}

Step1 创建收集器

// Create the collector. The first option is a function that configures the
// new collector; the second restricts which URLs may be visited.
c := colly.NewCollector(func(c *colly.Collector) {
		extensions.RandomUserAgent(c) // send a random User-Agent header
		c.Async = true
	},
		// URL filter: drop every URL that is not of the form
		// https://movie.douban.com/top250?start=0&filter=
		colly.URLFilters(
			// Regular expression: the URL must start with
			// https://movie.douban.com/top250?start=<digit> and contain "&filter=".
			regexp.MustCompile("^(https://movie\\.douban\\.com/top250)\\?start=[0-9].*&filter="),
		),
	)

Step2 HTML回调

// For HTML responses, follow the links found in the page.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		//fmt.Printf("find link: %s\n", e.Request.AbsoluteURL(link))
		// Resolve the (possibly relative) href against the request URL before visiting it.
		c.Visit(e.Request.AbsoluteURL(link))
	})
	// Extract the movie information from every div.info element.
	// NOTE(review): number is shared by all callbacks; the collector is created
	// with c.Async = true, so this counter looks unsynchronized — confirm and guard it.
	c.OnHTML("div.info", func(e *colly.HTMLElement) {
		e.DOM.Each(func(i int, selection *goquery.Selection) {
			// Text of the first span.title inside the element.
			movies := selection.Find("span.title").First().Text()
			// Text of the first paragraph in div.bd, with every run of
			// whitespace collapsed into a single space.
			director := strings.Join(strings.Fields(selection.Find("div.bd p").First().Text()), " ")
			// The one-line quote.
			quote := selection.Find("p.quote span.inq").Text()
			fmt.Printf("%d --> %s:%s %s\n", number, movies, director, quote)
			number += 1
		})
	})

<div class="info">
                    <div class="hd">
                        <a href="https://movie.douban.com/subject/1291546/" class="">
                            <span class="title">霸王别姬</span>
                                <span class="other">&nbsp;/&nbsp;再见，我的妾  /  Farewell My Concubine</span>
                        </a>


                            <span class="playable">[可播放]</span>
                    </div>
                    <div class="bd">
                        <p class="">
                            导演: 陈凯歌 Kaige Chen&nbsp;&nbsp;&nbsp;主演: 张国荣 Leslie Cheung / 张丰毅 Fengyi Zha...<br>
                            1993&nbsp;/&nbsp;中国大陆 中国香港&nbsp;/&nbsp;剧情 爱情 同性
                        </p>

                        
                        <div class="star">
                                <span class="rating5-t"></span>
                                <span class="rating_num" property="v:average">9.6</span>
                                <span property="v:best" content="10.0"></span>
                                <span>2008403人评价</span>
                        </div>

                            <p class="quote">
                                <span class="inq">风华绝代。</span>
                            </p>
                    </div>
                </div>

Step3 错误处理，回调页面

// Print any error reported for a request.
c.OnError(func(response *colly.Response, err error) {
		fmt.Println(err)
	})
	// Start the crawl from the first page of the list.
	c.Visit("https://movie.douban.com/top250?start=0&filter=")
	// Wait for the collector's outstanding requests to finish
	// (the collector was created with c.Async = true).
	c.Wait()
	// Print the time elapsed since t; the label means "time spent".
	fmt.Printf("花费时间:%s", time.Since(t))

Colly

http://example.com/2022/10/08/Colly/

Author

WYX

Posted on

October 8, 2022

Licensed under

debug调试工具使用总结 Previous

第2章寻址方式 Next