In the previous two posts we put together a small Scrapy demo. This post finishes the job: a complete Scrapy crawl of every free course on 慕课网 (imooc.com), capturing each course's title, image, URL, learner count, difficulty, direction, category, duration, rating, and review count.

It's actually quite simple: a few adjustments to the code from last time are all we need.

Spider

# -*- coding: utf-8 -*-

import scrapy
import json
from urllib import parse as urlparse
from scrapyDemo.ImoocCourseItem import ImoocCourseItem

# Spider for imooc.com
class ImoocSpider(scrapy.Spider):
    # The spider's name is how Scrapy locates (and instantiates) it, so it must be unique
    name = "imooc"

    # Start URLs
    start_urls = ['http://www.imooc.com/course/list']
    # URLs outside these domains will not be crawled.
    allowed_domains = ['www.imooc.com']

    def parse(self, response):
        # Course-type tabs (the third .course-nav-row on the list page)
        types = response.css('div.course-content .course-nav-row')[2].css(
            '.bd ul li a')
        # Skip the first tab (the unfiltered "all" link) and follow the rest
        for type_node in types[1:]:
            course_type = type_node.css('::text').extract_first()
            # URL of this type's course list
            type_url = type_node.css('::attr(href)').extract_first()
            yield scrapy.Request(
                url=urlparse.urljoin(response.url, type_url),
                callback=self.parse_by_type,
                meta={
                    'course_type': course_type
                })

    # Crawl the course list for one type
    def parse_by_type(self, response):
        course_type = response.meta['course_type']
        learn_nodes = response.css('a.course-card')
        # Walk every course card on this page
        for learn_node in learn_nodes:
            # One fresh item per course
            item = ImoocCourseItem()
            item['course_type'] = course_type
            course_url = learn_node.css("::attr(href)").extract_first()
            # Build the absolute detail-page URL
            course_url = urlparse.urljoin(response.url, course_url)
            # Course URL
            item['course_url'] = course_url
            # Course image
            item['image'] = learn_node.css(
                "img.course-banner::attr(src)").extract_first()
            # Categories (a course can carry several labels)
            cate = learn_node.css("div.course-label label::text").extract()
            item['cate'] = ','.join(cate)
            # Follow into the course detail page, carrying the item along
            yield scrapy.Request(
                url=course_url, callback=self.parse_learn, meta={'item': item})

        # Follow the next-page link ("下一页" means "next page")
        next_page_url = response.css(
            'div.page a:contains("下一页")::attr(href)').extract_first()
        if next_page_url:
            yield scrapy.Request(
                url=urlparse.urljoin(response.url, next_page_url),
                callback=self.parse_by_type,
                meta={
                    'course_type': course_type
                })

    # Course detail page
    def parse_learn(self, response):
        item = response.meta['item']
        # Course title
        item['title'] = response.xpath(
            '//h2[@class="l"]/text()').extract_first()
        # Course description
        item['brief'] = response.xpath(
            '//div[@class="course-brief"]/p/text()').extract_first()

        staticItem = response.css(
            'div#main div.statics div.static-item span.meta-value::text'
        ).extract()
        # Difficulty level
        item['difficulty_level'] = staticItem[0]
        # Course duration
        item['duration'] = staticItem[1]
        # Overall rating
        item['overall_rating'] = staticItem[2]
        # Review count; strip the "人评价" ("people rated") suffix
        item['evaluation_number'] = response.css(
            'a.person-num span.person-num::text').extract_first('').replace(
                '人评价', '')
        # Teacher id
        item['teacher_id'] = response.css(
            'div.teacher-info a img::attr(data-userid)').extract_first()
        # The learner count comes from an AJAX endpoint; the course id is
        # the last path segment of the detail URL
        ids = response.url.split('/')[-1]
        yield scrapy.Request(
            url=urlparse.urljoin(response.url,
                                 '/course/AjaxCourseMembers?ids=' + ids),
            callback=self.parse_learn_num,
            meta={'item': item})

    # Fetch the learner count
    def parse_learn_num(self, response):
        item = response.meta['item']
        data = json.loads(response.text)
        # Learner count
        item['learn_num'] = data['data'][0]['numbers']
        yield item
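
For reference, parse_learn_num assumes the AjaxCourseMembers endpoint returns JSON roughly shaped like the snippet below. The field names are inferred from the parsing code (data['data'][0]['numbers']), not from any official API documentation, so treat the exact structure as an assumption:

import json

# Hypothetical payload for /course/AjaxCourseMembers?ids=75;
# only the keys the spider actually reads are shown with confidence
payload = '{"result": 0, "data": [{"id": "75", "numbers": "123456"}]}'
data = json.loads(payload)
print(data['data'][0]['numbers'])  # -> 123456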

Item

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class ImoocCourseItem(scrapy.Item):
    # Fields for one course
    title = scrapy.Field()
    image = scrapy.Field()
    brief = scrapy.Field()
    cate = scrapy.Field()
    course_type = scrapy.Field()
    course_url = scrapy.Field()
    learn_num = scrapy.Field()
    difficulty_level = scrapy.Field()
    duration = scrapy.Field()
    overall_rating = scrapy.Field()
    evaluation_number = scrapy.Field()
    teacher_id = scrapy.Field()

Pipeline

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapyDemo.db.dbhelper import DBHelper

class ScrapydemoPipeline(object):
    # Open the database connection pool
    def __init__(self):
        self.db = DBHelper()

    def process_item(self, item, spider):
        # Write the item to the database
        self.db.insert(item)
        return item
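
Neither of these classes works until the pipeline is registered and the MySQL credentials exist in settings.py. A minimal sketch; the ITEM_PIPELINES module path assumes the scrapyDemo project layout used in this series, and the connection values are placeholders to replace with your own:

# settings.py (excerpt)
ITEM_PIPELINES = {
    'scrapyDemo.pipelines.ScrapydemoPipeline': 300,
}

# Keys read by DBHelper via get_project_settings()
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'scrapy_demo'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'your-password'
MYSQL_PORT = 3306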

Saving to the database

Here's the table-creation statement as well:

CREATE TABLE `imooc_courses` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) DEFAULT NULL,
  `cate` varchar(255) DEFAULT NULL,
  `type` varchar(11) DEFAULT NULL,
  `image` varchar(255) DEFAULT NULL,
  `brief` varchar(255) DEFAULT NULL,
  `course_url` varchar(255) DEFAULT NULL,
  `learn_num` int(11) DEFAULT '0',
  `difficulty_level` varchar(255) DEFAULT NULL,
  `duration` varchar(255) DEFAULT NULL,
  `overall_rating` varchar(255) DEFAULT NULL,
  `evaluation_number` int(11) DEFAULT '0',
  `teacher_id` int(11) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=792 DEFAULT CHARSET=utf8mb4;

And here's the class that writes to the database:

# -*- coding: utf-8 -*-

import pymysql
from twisted.enterprise import adbapi
from scrapy.utils.project import get_project_settings  # read the project settings

class DBHelper():
    '''Reads the connection details from settings.py; adjust to your setup.'''

    def __init__(self):
        settings = get_project_settings()  # load the project settings

        dbparams = dict(
            host=settings['MYSQL_HOST'],  # values come from settings.py
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            port=settings['MYSQL_PORT'],
            charset='utf8mb4',  # match the table's charset, or Chinese text may be garbled
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        # ** expands the dict into keyword arguments (host=..., db=..., ...)
        dbpool = adbapi.ConnectionPool('pymysql', **dbparams)

        self.dbpool = dbpool

    def connect(self):
        return self.dbpool

    # Insert one course row (runs asynchronously on the connection pool)
    def insert(self, item):
        sql = ("insert into imooc_courses"
               "(title,type,cate,image,brief,course_url,learn_num,"
               "difficulty_level,duration,overall_rating,evaluation_number,"
               "teacher_id) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        # Schedule the insert on the pool
        query = self.dbpool.runInteraction(self._conditional_insert, sql, item)
        # Attach the error handler
        query.addErrback(self._handle_error)

        return item

    # Executed on a pool thread with a live cursor
    def _conditional_insert(self, tx, sql, item):
        params = (item["title"], item['course_type'], item['cate'],
                  item['image'], item['brief'], item['course_url'],
                  item['learn_num'], item['difficulty_level'],
                  item['duration'], item['overall_rating'],
                  item['evaluation_number'], item['teacher_id'])
        tx.execute(sql, params)

    # Error handler for failed inserts
    def _handle_error(self, failure):
        print('--------------database operation exception!!-----------------')
        print(failure)

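Note that insert returns the item immediately while the actual INSERT runs later on the pool, so Scrapy never waits for (or hears about) the write beyond the printed errback. Scrapy also accepts a Deferred returned from process_item, so if you'd rather have the pipeline wait for each write, a small variant works. This sketch assumes you change insert to return the Deferred from runInteraction (return query) instead of the item:

class ScrapydemoPipeline(object):
    def __init__(self):
        self.db = DBHelper()

    def process_item(self, item, spider):
        d = self.db.insert(item)       # Deferred for the pending INSERT
        d.addCallback(lambda _: item)  # hand the item back once the write lands
        return d
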
All done

scrapy crawl imooc
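
To sanity-check the spider without touching MySQL, Scrapy's built-in feed export can write the items to a file instead:

scrapy crawl imooc -o courses.json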

That's it: we now have every free course on 慕课网. Happy learning!