goquery是一个用Go语言编写的HTML解析库,可以让你以类似jQuery的方式操作DOM文档,使用起来非常简便。

闲话少说,直接上代码:

package main

import (
    "gopkg.in/mgo.v2"
    "github.com/PuerkitoBio/goquery"
    "log"
    "gopkg.in/mgo.v2/bson"
)

// Article is one scraped article entry as it is written to, and read back
// from, the MongoDB "article" collection by getArticle.
type Article struct {
    // Title is the text of the entry's ".txt-box h3 a" node. getArticle also
    // looks articles up by the "title" key to detect already-stored entries.
    Title string

    // Decription is the text of the entry's ".txt-box .txt-info" node.
    // NOTE(review): the name is misspelled ("Description"); renaming it
    // presumably changes the stored document key, so confirm before fixing.
    Decription string

    // Image is the "src" attribute of the entry's ".img-box img" node.
    Image string

    // Author is the text of the entry's ".txt-box .account" node.
    Author string

    // Time is the raw value of the "t" attribute on the entry's
    // ".txt-box .s-p" node (presumably a publish timestamp; verify).
    Time string
}

// main is the program entry point; it performs a single getArticle run.
func main() {
    getArticle()
}

// getArticle scrapes the article list from http://weixin.sogou.com/ and saves
// every article that is not yet present in the "article" collection of the
// "test" MongoDB database.
//
// An article counts as already saved when a document with the same title
// exists. List items without a title are skipped. The function panics when
// the database cannot be reached or when a lookup or insert fails, and
// terminates the process through log.Fatal when the page cannot be fetched.
func getArticle() {
    // Connect to MongoDB.
    session, err := mgo.Dial("")
    if err != nil {
        panic(err)
    }
    defer session.Close()

    session.SetMode(mgo.Monotonic, true)
    c := session.DB("test").C("article")

    // Fetch and parse the page with goquery.
    doc, err := goquery.NewDocument("http://weixin.sogou.com/")
    if err != nil {
        // log.Fatal exits the process without running deferred calls, so the
        // session has to be closed by hand here.
        session.Close()
        log.Fatal(err)
    }

    // Walk over every article node in the news list.
    doc.Find(".news-list li").Each(func(i int, item *goquery.Selection) {
        title := item.Find(".txt-box h3 a").Text()
        if title == "" {
            // The title is the de-duplication key; an entry without one
            // would be stored as an empty record and then shadow every
            // other title-less entry.
            log.Println("skipping list item", i+1, "without a title")
            return
        }

        decription := item.Find(".txt-box .txt-info").Text()
        author := item.Find(".txt-box .account").Text()
        // The "exists" flag of Attr is ignored on purpose: a missing
        // attribute is simply stored as an empty string.
        image, _ := item.Find(".img-box img").Attr("src")
        timeAttr, _ := item.Find(".txt-box .s-p").Attr("t")

        // Check whether this article has been saved before.
        var stored Article
        switch err := c.Find(bson.M{"title": title}).One(&stored); err {
        case nil:
            // Already in the database: just report the stored record.
            log.Println(stored)
        case mgo.ErrNotFound:
            log.Println("第", i+1, "篇文章:", title)
            // Save the new article.
            article := Article{
                Title:      title,
                Decription: decription,
                Image:      image,
                Author:     author,
                Time:       timeAttr,
            }
            if err := c.Insert(&article); err != nil {
                panic(err)
            }
        default:
            // Any other error means the lookup itself failed; inserting now
            // could create duplicates, so fail like the insert path does.
            panic(err)
        }
    })
}

在MongoDB中查看爬取到的文章: