Go 爬虫框架 Colly 的使用
// Example 1: a minimal recursive crawler that visits every link reachable
// from the start page and prints each URL as it is requested.
package main

import (
	"errors"
	"fmt"

	"github.com/gocolly/colly"
)

func main() {
	// Restrict the crawl to the start site. Without AllowedDomains the
	// collector follows links to every external host it discovers and the
	// crawl never terminates in practice.
	c := colly.NewCollector(
		colly.AllowedDomains("docs.dbsgw.com"),
	)

	// Follow every link found on a fetched page. Relative hrefs are resolved
	// against the page URL by Request.Visit.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		err := e.Request.Visit(link)
		switch {
		case err == nil:
		case errors.Is(err, colly.ErrAlreadyVisited),
			errors.Is(err, colly.ErrForbiddenDomain),
			errors.Is(err, colly.ErrMissingURL):
			// Expected while crawling: duplicate links, off-site links and
			// pure "#fragment" anchors are skipped without noise.
		default:
			fmt.Println("skipping", link, ":", err)
		}
	})

	// Log each request just before it is sent.
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	if err := c.Visit("https://docs.dbsgw.com/"); err != nil {
		fmt.Println("visit failed:", err)
	}
}
2.爬取博客首页和文章页
// Example 2: scrape the blog home page with one collector, then fetch every
// absolute link found in the sidebar tag widget with a second, asynchronous
// and rate-limited collector that prints the article titles of each page.
package main

import (
	"fmt"
	"strings"
	"time"

	"github.com/gocolly/colly"
)

func main() {
	c := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"),
		colly.MaxDepth(1),
	)

	// c1 is a copy of c's configuration without its callbacks; it fetches
	// the linked pages asynchronously.
	c1 := c.Clone()
	c1.Async = true

	// Rate limit c1. Limit rules are matched against the request host only
	// (never the path), so the glob must describe the host being crawled;
	// a pattern containing a path such as "*.jianshu.com/p/*" never matches
	// and the limit would silently not apply.
	if err := c1.Limit(&colly.LimitRule{
		DomainGlob:  "*.dbsgw.com",
		Delay:       10 * time.Second,
		Parallelism: 1,
	}); err != nil {
		fmt.Println(err.Error())
		return
	}

	// Home page: collect the links inside the sidebar tag widget.
	c.OnHTML("body > section > div.sidebar > div.widget.widget_ui_tags.wow.article-categories.fadeInUp > div", func(e *colly.HTMLElement) {
		e.ForEach("a", func(i int, element *colly.HTMLElement) {
			href := element.Attr("href")
			// Only absolute http(s) links are handed to c1. A prefix check
			// is used because a substring check would also accept relative
			// paths that merely contain "http".
			if !strings.HasPrefix(href, "http://") && !strings.HasPrefix(href, "https://") {
				fmt.Println(element.Text, "<<<<<<不包含")
				return
			}
			fmt.Println(href, ">>>>>包含", i)
			// In async mode Visit returns right after queuing the request;
			// an error here means the request was rejected up front
			// (for example because the URL was already visited).
			if err := c1.Visit(href); err != nil {
				fmt.Println(href, err.Error())
			}
		})
	})

	// Linked pages: print the title of each matching article excerpt.
	c1.OnHTML("body > section > div.content-wrap > div > article.excerpt.excerpt-1.wow.fadeInUp > header > h2 > a", func(e *colly.HTMLElement) {
		fmt.Println(e.Text)
	})

	if err := c.Visit("https://blog.dbsgw.com/"); err != nil {
		fmt.Println(err.Error())
	}

	// Block until every asynchronous request issued through c1 has finished.
	c1.Wait()
}
3.爬取需要登录的网页(通过设置cookie实现登录)
// Example 3: scrape a page that requires login by pre-loading the session
// cookies of an already authenticated browser session into the collector.
package main

import (
	"fmt"
	"net/http"

	"github.com/gocolly/colly"
	"github.com/gocolly/colly/debug"
	"github.com/gocolly/colly/extensions"
)

func main() {
	adminURL := "https://blog.dbsgw.com/admin/"

	c := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"),
		colly.MaxDepth(1),
		colly.Debugger(&debug.LogDebugger{}),
	)

	// Print the server-info list shown on the admin dashboard.
	c.OnHTML("#admindex_servinfo > ul", func(e *colly.HTMLElement) {
		fmt.Println(e.Text, "---------------")
	})

	// Use a random User-Agent for each request. This sets the User-Agent
	// header per request and therefore takes precedence over the fixed
	// colly.UserAgent option passed to NewCollector above.
	extensions.RandomUserAgent(c)

	// Login cookies. NOTE: these are session credentials copied from a
	// logged-in browser; they expire and must be replaced with your own.
	// Do not commit real session values to source control.
	cookies := []*http.Cookie{
		{
			Name:     "PHPSESSID",
			Value:    "3ur579rq6lindrkolomq24br17",
			Path:     "/",
			Domain:   "blog.dbsgw.com",
			Secure:   true,
			HttpOnly: true,
		},
		{
			Name:     "EM_AUTHCOOKIE_95yUDyuGoTu7WYejbLXmH8m6O630lMsd",
			Value:    "admin%7C%7Ca7e91e464d86db572e6e588ef9dd5815",
			Path:     "/",
			Domain:   "blog.dbsgw.com",
			Secure:   true,
			HttpOnly: true,
		},
	}
	// Without the cookies the request would not be authenticated, so stop
	// here if they cannot be installed.
	if err := c.SetCookies(adminURL, cookies); err != nil {
		fmt.Println(err.Error())
		return
	}

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("爬取页面:", r.URL)
	})

	// Report failed requests together with the response and the error.
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	if err := c.Visit(adminURL); err != nil {
		fmt.Println(err.Error())
	}
}
