vlambda博客
学习文章列表

Go 爬虫框架 Colly 的使用

package main
import ( "fmt" "github.com/gocolly/colly")
// main crawls https://docs.dbsgw.com/ and follows every <a href> link found on
// each fetched page, printing each URL before it is requested.
//
// The collector is created with default options only: no domain filter and no
// depth limit are configured in this program.
func main() {
	c := colly.NewCollector()

	// Queue every link found on a fetched page.
	c.OnHTML("a", func(e *colly.HTMLElement) {
		// The result is intentionally discarded, as in the original: in a
		// follow-everything crawl a rejected link (presumably an
		// already-visited or otherwise filtered URL — confirm against the
		// colly docs) is expected and not worth reporting.
		_ = e.Request.Visit(e.Attr("href"))
	})

	// Log every request right before it is sent.
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	// Report a failed start instead of exiting silently.
	if err := c.Visit("https://docs.dbsgw.com/"); err != nil {
		fmt.Println(err.Error())
	}
}

2.爬取博客首页和文章页

package main
import ( "fmt" "github.com/gocolly/colly" "strings" "time")
func main() {
c := colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"), colly.MaxDepth(1))
c1 := c.Clone() c1.Async = true //限速 c1.Limit(&colly.LimitRule{ DomainRegexp: "", DomainGlob: "*.jianshu.com/p/*", Delay: 10 * time.Second, RandomDelay: 0, Parallelism: 1, }) // 首页提取链接 c.OnHTML("body > section > div.sidebar > div.widget.widget_ui_tags.wow.article-categories.fadeInUp > div", func(e *colly.HTMLElement) {
e.ForEach("a", func(i int, element *colly.HTMLElement) { href := element.Attr("href") if strings.Contains(href, "http") { fmt.Println(href,">>>>>包含",i) c1.Request("GET",href,nil,nil,nil)
}else { fmt.Println(element.Text,"<<<<<<不包含") } })
})
// 从链接里面 异步提取详情 c1.OnHTML("body > section > div.content-wrap > div > article.excerpt.excerpt-1.wow.fadeInUp > header > h2 > a", func(e *colly.HTMLElement) { fmt.Println(e.Text)
})
err := c.Visit("https://blog.dbsgw.com/") if err != nil { fmt.Println(err.Error()) } c1.Wait()}

3.爬取需要登录的网页(通过设置cookie实现登录)

package main
import ( "fmt" "github.com/gocolly/colly" "github.com/gocolly/colly/debug" "github.com/gocolly/colly/extensions" _ "github.com/gocolly/colly/extensions" "net/http")
func main() { url := "https://blog.dbsgw.com/admin/"
c := colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"), colly.MaxDepth(1), colly.Debugger(&debug.LogDebugger{})) c.OnHTML("#admindex_servinfo > ul", func(e *colly.HTMLElement) { fmt.Println(e.Text,"---------------") })
//设置随机useragent extensions.RandomUserAgent(c) //设置登录cookie c.SetCookies(url, []*http.Cookie{ &http.Cookie{ Name: "PHPSESSID", Value: "3ur579rq6lindrkolomq24br17", Path: "/", Domain: "blog.dbsgw.com", Secure: true, HttpOnly: true, }, }) c.SetCookies(url, []*http.Cookie{ &http.Cookie{ Name: "EM_AUTHCOOKIE_95yUDyuGoTu7WYejbLXmH8m6O630lMsd", Value: "admin%7C%7Ca7e91e464d86db572e6e588ef9dd5815", Path: "/", Domain: "blog.dbsgw.com", Secure: true, HttpOnly: true, }, })

c.OnRequest(func(r *colly.Request) { fmt.Println("爬取页面:", r.URL) })
c.OnError(func(r *colly.Response, err error) { fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err) }) err := c.Visit(url) if err != nil { fmt.Println(err.Error()) }}