Go 爬虫框架 Colly 的使用
package main
import (
"fmt"
"github.com/gocolly/colly"
)
// main crawls https://docs.dbsgw.com/ and follows the href of every anchor
// it encounters, printing each URL just before it is requested.
func main() {
	c := colly.NewCollector()

	// Follow the href of every <a> element on each fetched page.
	c.OnHTML("a", func(e *colly.HTMLElement) {
		// Visit returns an error for links the collector declines to fetch
		// (colly documents e.g. already-visited URLs); that is routine while
		// crawling, so the result is deliberately discarded here.
		_ = e.Request.Visit(e.Attr("href"))
	})

	// Log every request before it is sent.
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	// Report a failed start instead of exiting silently.
	if err := c.Visit("https://docs.dbsgw.com/"); err != nil {
		fmt.Println(err.Error())
	}
}
2.爬取博客首页和文章页
package main
import (
"fmt"
"github.com/gocolly/colly"
"strings"
"time"
)
// main visits the blog home page with collector c, hands every sidebar tag
// link whose href contains "http" to the asynchronous clone c1, and prints
// the text of each article-excerpt title link found on those pages.
func main() {
	c := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"),
		colly.MaxDepth(1),
	)

	// c1 is a clone of c that issues its requests asynchronously.
	c1 := c.Clone()
	c1.Async = true

	// Rate limit: one request at a time with a 10s pause.
	// Limit rules are matched against the request host, so the glob must
	// describe the crawled domain and must not contain a URL path; the
	// previous "*.jianshu.com/p/*" pattern could never match.
	if err := c1.Limit(&colly.LimitRule{
		DomainGlob:  "*dbsgw.com*",
		Delay:       10 * time.Second,
		Parallelism: 1,
	}); err != nil {
		fmt.Println(err.Error())
		return
	}

	// Home page: extract the links from the tag widget in the sidebar.
	c.OnHTML("body > section > div.sidebar > div.widget.widget_ui_tags.wow.article-categories.fadeInUp > div", func(e *colly.HTMLElement) {
		e.ForEach("a", func(i int, element *colly.HTMLElement) {
			href := element.Attr("href")
			if !strings.Contains(href, "http") {
				fmt.Println(element.Text, "<<<<<<不包含")
				return
			}
			fmt.Println(href, ">>>>>包含", i)
			// Queue the link on the async collector and surface any error
			// it reports instead of dropping it.
			if err := c1.Request("GET", href, nil, nil, nil); err != nil {
				fmt.Println(err.Error())
			}
		})
	})

	// Linked pages: print the title of each article excerpt.
	c1.OnHTML("body > section > div.content-wrap > div > article.excerpt.excerpt-1.wow.fadeInUp > header > h2 > a", func(e *colly.HTMLElement) {
		fmt.Println(e.Text)
	})

	if err := c.Visit("https://blog.dbsgw.com/"); err != nil {
		fmt.Println(err.Error())
	}

	// Block until every asynchronous request issued through c1 has finished.
	c1.Wait()
}
3.爬取需要登录的网页(通过设置cookie实现登录)
package main
import (
"fmt"
"github.com/gocolly/colly"
"github.com/gocolly/colly/debug"
"github.com/gocolly/colly/extensions"
_ "github.com/gocolly/colly/extensions"
"net/http"
)
// main fetches the blog admin page as a logged-in user by attaching
// previously obtained session cookies to the collector, then prints the text
// of the "#admindex_servinfo > ul" element.
func main() {
	url := "https://blog.dbsgw.com/admin/"
	c := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"),
		colly.MaxDepth(1),
		colly.Debugger(&debug.LogDebugger{}),
	)

	c.OnHTML("#admindex_servinfo > ul", func(e *colly.HTMLElement) {
		fmt.Println(e.Text, "---------------")
	})

	// Use a random User-Agent.
	extensions.RandomUserAgent(c)

	// Set the login cookies. Both cookies go in a single call, and a failure
	// is reported because the crawl would otherwise run unauthenticated.
	// NOTE(review): these are hard-coded session values (credentials);
	// replace them with your own session and keep real values out of source.
	if err := c.SetCookies(url, []*http.Cookie{
		{
			Name:     "PHPSESSID",
			Value:    "3ur579rq6lindrkolomq24br17",
			Path:     "/",
			Domain:   "blog.dbsgw.com",
			Secure:   true,
			HttpOnly: true,
		},
		{
			Name:     "EM_AUTHCOOKIE_95yUDyuGoTu7WYejbLXmH8m6O630lMsd",
			Value:    "admin%7C%7Ca7e91e464d86db572e6e588ef9dd5815",
			Path:     "/",
			Domain:   "blog.dbsgw.com",
			Secure:   true,
			HttpOnly: true,
		},
	}); err != nil {
		fmt.Println(err.Error())
		return
	}

	// Log every page before it is requested.
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("爬取页面:", r.URL)
	})

	// Print the failing URL, the response and the error for failed requests.
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	if err := c.Visit(url); err != nil {
		fmt.Println(err.Error())
	}
}