因为我的电脑上之前有安装过pip,可以直接使用pip install scrapy
安装好scrapy之后,就可以用scrapy命令初始化项目
scrapy startproject pachong
创建一个爬虫项目,名称为pachong
scrapy.cfg – 项目的配置文件
pachong/ – 该项目的python模块,之后您将在此加入代码
pachong/items.py – 项目中的item文件
pachong/pipelines.py – 项目中的pipelines文件
pachong/settings.py – 项目的设置文件
pachong/spiders/ – 放置spider代码的目录
定义Item容器
写爬虫
在命令行运行爬虫 scrapy crawl pachong
运行后可以看到数据显示在terminal上面;在settings.py中添加配置,可将爬到的数据输出到json。网上也有博文提到使用pipeline输出,我这里暂时先不用pipeline的方式,先简单一点...
这个例子是参照网上博文抓取的慕课网,稍后再试一下抓取前程无忧
以 imooc.com 为例,各文件代码如下:
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class PachongItem(scrapy.Item):
    """Container for one scraped imooc.com course.

    Each field is populated by MySpider.parse; `image_path` is filled in
    later (e.g. by an image pipeline) rather than by the spider itself.
    """

    # Course title text.
    title = scrapy.Field()
    # Absolute URL of the course detail page.
    url = scrapy.Field()
    # Absolute URL of the course cover image.
    image_url = scrapy.Field()
    # Short course description/blurb.
    introduction = scrapy.Field()
    # Number of enrolled students (scraped as text).
    student = scrapy.Field()
    # Local path of the downloaded cover image.
    image_path = scrapy.Field()
MySpider.py
#coding=utf-8
import scrapy
from pachong.items import PachongItem
class MySpider(scrapy.Spider):
    """Spider that crawls the imooc.com course list and yields PachongItem objects.

    Starts at the course-list page, extracts one item per course card,
    then follows the "next page" link until pagination ends.
    """

    # Unique spider name, used on the command line: `scrapy crawl pachong`.
    name = "pachong"
    # Domains this spider is allowed to visit.
    allowed_domains = ['imooc.com']
    # Entry-point URL(s) for the crawl.
    start_urls = ['https://www.imooc.com/course/list']

    def parse(self, response):
        """Parse one course-list page.

        Yields:
            PachongItem: one item per course card on the page.
            scrapy.Request: a follow-up request for the next page, if any.
        """
        # Each course lives in its own card <div>; XPaths below are relative
        # to that card and depend on imooc.com's page structure.
        for box in response.xpath('//div[@class="course-card-container"]'):
            # BUG FIX: create a fresh item per course. The original code
            # instantiated a single PachongItem before the loop and mutated
            # it on every iteration, so every yielded item was the same
            # mutable object and later assignments could overwrite items
            # still being processed downstream.
            item = PachongItem()
            # Course detail-page URL (first href inside the card).
            item['url'] = 'http://www.imooc.com' + box.xpath('.//@href').extract()[0]
            # Course title.
            item['title'] = box.xpath('.//h3[@class="course-card-name"]/text()').extract()[0].strip()
            # Cover image URL (page uses protocol-relative //... src values).
            item['image_url'] = 'http:' + box.xpath('.//@src').extract()[0]
            # Student count is the second <span> in the card.
            item['student'] = box.xpath('.//span/text()').extract()[1].strip()
            # Course blurb.
            item['introduction'] = box.xpath('.//p/text()').extract()[0].strip()
            yield item

        # Pagination: follow the link whose text contains "下一页" ("next page").
        url = response.xpath("//a[contains(text(),'下一页')]/@href").extract()
        if url:
            # Build the absolute URL of the next page and re-enter parse().
            page = 'http://www.imooc.com' + url[0]
            yield scrapy.Request(page, callback=self.parse)
# url跟进结束
settings.py
添加两行配置,用于输出数据到json
# -*- coding: utf-8 -*-
# Scrapy settings for pachong project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'pachong'
SPIDER_MODULES = ['pachong.spiders']
NEWSPIDER_MODULE = 'pachong.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'pachong (+http://www.yourdomain.com)'
# The two settings below export scraped items to ./data/<spider name>/<timestamp>.json
# NOTE(review): FEED_URI / FEED_FORMAT are deprecated in newer Scrapy releases
# in favour of the FEEDS dict — confirm against the installed Scrapy version.
FEED_URI = 'data/%(name)s/%(time)s.json'
FEED_FORMAT = 'json'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'pachong.middlewares.PachongSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'pachong.middlewares.PachongDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'pachong.pipelines.PachongPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
抓51job二级页面时遇到如下错误
File "/anaconda3/lib/python3.7/site-packages/scrapy/core/downloader/middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
twisted.web._newclient.ResponseNeverReceived: [<twisted.python.failure.Failure twisted.internet.error.ConnectionLost: Connection to the other side was lost in a non-clean fashion: Connection lost.>]
2019-03-12 14:04:08 [scrapy.core.scraper] ERROR: Error downloading <GET https://jobs.51job.com/beijing-hdq/111409291.html?s=01&t=0>
Traceback (most recent call last):
File "/anaconda3/lib/python3.7/site-packages/scrapy/core/downloader/middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
twisted.web._newclient.ResponseNeverReceived: [<twisted.python.failure.Failure twisted.internet.error.ConnectionLost: Connection to the other side was lost in a non-clean fashion: Connection lost.>]
解决方法:在settings.py中添加 user-agent配置即可
# Identify the crawler with a real-browser user-agent string; some sites
# (e.g. 51job, per the traceback above) drop connections from Scrapy's
# default user-agent, causing ConnectionLost errors.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1'