goquery是一个用Go语言编写的HTML解析库,可以让你以类似jQuery的方式操作DOM文档,使用起来非常简便。
闲话少说,直接上代码:
package main
import (
"gopkg.in/mgo.v2"
"github.com/PuerkitoBio/goquery"
"log"
"gopkg.in/mgo.v2/bson"
)
// Article is one scraped article record. getArticle fills it from the page
// markup and inserts it into the MongoDB "article" collection.
//
// The field order is significant: getArticle builds values with a positional
// struct literal.
type Article struct {
	Title string // text of the article's title link
	// Decription holds the summary text. The misspelling is kept on purpose:
	// renaming the field would change the name callers (and stored documents)
	// refer to.
	Decription string
	Image      string // value of the thumbnail image's "src" attribute
	Author     string // text of the publishing account element
	Time       string // raw value of the "t" attribute on the page element
}
// main is the program entry point; it runs a single scrape-and-store pass
// by calling getArticle.
func main() {
	getArticle()
}
// getArticle fetches http://weixin.sogou.com/, walks every ".news-list li"
// element, and inserts each article whose title is not yet present into the
// "article" collection of the "test" MongoDB database.
//
// Error handling: a failed MongoDB connection, a failed lookup (other than
// "not found"), or a failed insert panics; a failed page fetch terminates the
// process through log.Fatal.
func getArticle() {
	// Connect to MongoDB.
	session, err := mgo.Dial("")
	if err != nil {
		panic(err)
	}
	defer session.Close()
	session.SetMode(mgo.Monotonic, true)
	c := session.DB("test").C("article")

	// Fetch and parse the page with goquery.
	doc, err := goquery.NewDocument("http://weixin.sogou.com/")
	if err != nil {
		// log.Fatal exits the process without running deferred calls, so the
		// session has to be closed explicitly here.
		session.Close()
		log.Fatal(err)
	}

	// Iterate over every article node.
	doc.Find(".news-list li").Each(func(i int, contentSelection *goquery.Selection) {
		txtBox := contentSelection.Find(".txt-box")

		// Article title.
		title := txtBox.Find("h3 a").Text()
		decription := txtBox.Find(".txt-info").Text()
		author := txtBox.Find(".account").Text()
		// Attr's second result only reports whether the attribute exists; a
		// missing attribute yields "", which is stored as-is.
		image, _ := contentSelection.Find(".img-box img").Attr("src")
		timeAttr, _ := txtBox.Find(".s-p").Attr("t")

		// Check whether this article has already been saved.
		var existing Article
		lookupErr := c.Find(bson.M{"title": title}).One(&existing)
		switch {
		case lookupErr == nil:
			// Already stored; nothing to insert.
		case lookupErr == mgo.ErrNotFound:
			log.Println("第", i+1, "篇文章:", title)
			// Save to the database.
			article := Article{
				Title:      title,
				Decription: decription,
				Image:      image,
				Author:     author,
				Time:       timeAttr,
			}
			if insertErr := c.Insert(&article); insertErr != nil {
				panic(insertErr)
			}
		default:
			// Any other lookup failure (connection loss, decode error, ...)
			// must not be mistaken for "article not stored yet".
			panic(lookupErr)
		}

		// Logs the stored document when one was found; for a freshly inserted
		// article this is still the zero Article.
		log.Println(existing)
	})
}
在MongoDB中查看爬取到的文章: