scrapy startproject getblog
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class BlogItem(Item):
    """Scraped-item model with two declared fields: ``title`` and ``desc``."""

    # Declared field named "title".
    title = Field()
    # Declared field named "desc".
    desc = Field()
# coding=utf-8
from scrapy.spider import Spider
from getblog.items import BlogItem
from scrapy.selector import Selector
class BlogSpider(Spider):
    """Spider that turns each post block on the start page into a BlogItem."""

    # Identifier used to refer to this spider.
    name = 'blog'
    # URLs the crawl starts from.
    start_urls = ['http://www.cnblogs.com/']

    def parse(self, response):
        """Return a list with one BlogItem per matched post block.

        ``title`` and ``desc`` each hold the list of text nodes extracted
        from the block (the raw result of ``extract()``).
        """
        # XPath selector: for every <div> whose class attribute is exactly
        # "post_item", take its second child <div>.
        post_bodies = Selector(response).xpath('//div[@class="post_item"]/div[2]')
        return [
            BlogItem(
                # Text content of the <a> inside the <h3>.
                title=post.xpath('h3/a/text()').extract(),
                # Text content of the <p class="post_item_summary">.
                desc=post.xpath('p[@class="post_item_summary"]/text()').extract(),
            )
            for post in post_bodies
        ]
scrapy crawl blog # 即可
# Output file location.
FEED_URI = 'blog.xml'
# Output file format; can be json, xml or csv.
FEED_FORMAT = 'xml'
dizzy@dizzy-pc:~$ scrapy shell "http://www.baidu.com/"
2014-08-21 04:09:11+0800 [scrapy] INFO: Scrapy 0.24.4 started (bot: scrapybot)
2014-08-21 04:09:11+0800 [scrapy] INFO: Optional features available: ssl, http11, django
2014-08-21 04:09:11+0800 [scrapy] INFO: Overridden settings: {'LOGSTATS_INTERVAL': 0}
2014-08-21 04:09:11+0800 [scrapy] INFO: Enabled extensions: TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2014-08-21 04:09:11+0800 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2014-08-21 04:09:11+0800 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2014-08-21 04:09:11+0800 [scrapy] INFO: Enabled item pipelines:
2014-08-21 04:09:11+0800 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6024
2014-08-21 04:09:11+0800 [scrapy] DEBUG: Web service listening on 127.0.0.1:6081
2014-08-21 04:09:11+0800 [default] INFO: Spider opened
2014-08-21 04:09:12+0800 [default] DEBUG: Crawled (200) <GET http://www.baidu.com/> (referer: None)
[s] Available Scrapy objects:
[s] crawler <scrapy.crawler.Crawler object at 0xa483cec>
[s] item {}
[s] request <GET http://www.baidu.com/>
[s] response <200 http://www.baidu.com/>
[s] settings <scrapy.settings.Settings object at 0xa0de78c>
[s] spider <Spider 'default' at 0xa78086c>
[s] Useful shortcuts:
[s] shelp() Shell help (print this help)
[s] fetch(req_or_url) Fetch request (or URL) and update local objects
[s] view(response) View response in a browser
>>>
# response.body 返回的所有内容
# response.xpath('//ul/li') 可以测试所有的xpath内容
More important, if you type response.selector you will access a selector object you can use to
query the response, and convenient shortcuts like response.xpath() and response.css() mapping to
response.selector.xpath() and response.selector.css()
scrapy shell 'http://scrapy.org' --nolog # 参数 --nolog 没有日志
from scrapy import Spider
from scrapy_test.items import DmozItem
class DmozSpider(Spider):
    """Spider that yields one DmozItem per ``<li>`` found inside a ``<ul>``."""

    name = 'dmoz'
    allowed_domains = ['dmoz.org']
    # Each URL is its own list element; the comma belongs between the string
    # literals, not inside them.
    start_urls = [
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Books/',
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/',
    ]

    def parse(self, response):
        """Yield a DmozItem for every ``//ul/li`` node in the response.

        Each field holds the list returned by ``extract()``: the link text
        (``title``), the link target (``link``) and the list item's own
        text nodes (``desc``).
        """
        for sel in response.xpath('//ul/li'):
            item = DmozItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            item['desc'] = sel.xpath('text()').extract()
            yield item
scrapy crawl dmoz -o 'a.json' -t 'json'
scrapy genspider baidu baidu.com
# -*- coding: utf-8 -*-
import scrapy
class BaiduSpider(scrapy.Spider):
    """Skeleton spider for baidu.com; the parse callback is still a stub."""

    name = "baidu"
    allowed_domains = ["baidu.com"]
    start_urls = ('http://www.baidu.com/',)

    def parse(self, response):
        """Receive a downloaded response; currently does nothing."""
        return None
#coding=utf-8
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
import scrapy
class _TestItem(scrapy.Item):
    """Item returned by TestSpider.parse_item.

    A bare ``scrapy.Item()`` declares no fields, so assigning ``item['id']``
    on it raises ``KeyError``; the fields must be declared up front.
    """

    id = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()


class TestSpider(CrawlSpider):
    """Crawl example.com by rule and scrape the pages matching 'item.php'."""

    name = 'test'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/']
    # ``rules`` is a tuple of Rule objects.
    rules = (
        # Extract links matching 'category.php' but not 'subsection.php';
        # no callback is attached to this rule.
        Rule(LinkExtractor(allow=('category.php', ), deny=('subsection.php', ))),
        # Extract links matching 'item.php' and hand each response to
        # parse_item. The callback string must match the method name exactly.
        Rule(LinkExtractor(allow=('item.php', )), callback='parse_item'),
    )

    def parse_item(self, response):
        """Build an item from the id, name and description table cells.

        ``id`` is the list of digit groups captured after the literal
        ``ID:``; ``name`` and ``description`` are the extracted text-node
        lists of their cells.
        """
        self.log('item page : %s' % response.url)
        item = _TestItem()
        # Raw string with an escaped \d so the pattern matches digits rather
        # than the literal letter "d".
        item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID:(\d+)')
        item['name'] = response.xpath('//td[@id="item_name"]/text()').extract()
        item['description'] = response.xpath('//td[@id="item_description"]/text()').extract()
        return item
>>> from scrapy.selector import Selector
>>> from scrapy.http import HtmlResponse
from scrapy.exceptions import DropItem
class PricePipeline(object):
    """Item pipeline that applies VAT to prices and drops unpriced items."""

    # Multiplier applied to prices that are flagged as excluding VAT.
    vat_factor = 1.5

    def process_item(self, item, spider):
        """Return ``item`` with VAT applied where needed.

        Args:
            item: mapping with ``price`` and ``price_excludes_vat`` entries.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item`` object, with ``price`` multiplied by
            ``vat_factor`` when ``price_excludes_vat`` is truthy.

        Raises:
            DropItem: if ``item['price']`` is falsy.
        """
        if not item['price']:
            raise DropItem('Missing price in %s' % item)
        if item['price_excludes_vat']:
            item['price'] *= self.vat_factor
        # The item must be returned; otherwise the method yields None and
        # the priced item never reaches later pipeline stages.
        return item
import json
class JsonWriterPipeline(object):
    """Item pipeline that appends every item to ``json.jl`` as one JSON line."""

    def __init__(self):
        # Text mode: json.dumps returns str, which a binary-mode file
        # rejects on Python 3.
        self.file = open('json.jl', 'w')

    def process_item(self, item, spider):
        """Serialize ``item`` as a single JSON line and return it unchanged.

        Args:
            item: mapping-like item; it is converted with ``dict(item)``.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item`` object.
        """
        # A real newline terminator, so each item occupies its own line.
        line = json.dumps(dict(item)) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file so buffered lines are flushed to disk.

        NOTE(review): relies on Scrapy invoking this hook when the spider
        closes, per the item pipeline documentation.
        """
        self.file.close()
from scrapy.exceptions import DropItem
class Duplicates(object):
    """Item pipeline that drops any item whose ``id`` has been seen before."""

    def __init__(self):
        # ids of all items passed through so far.
        self.ids_seen = set()

    def process_item(self, item, spider):
        """Return ``item`` the first time its id appears.

        Raises:
            DropItem: if an item with the same ``item['id']`` was already
                processed by this pipeline instance.
        """
        item_id = item['id']
        if item_id not in self.ids_seen:
            self.ids_seen.add(item_id)
            return item
        raise DropItem('Duplicate item found : %s' % item)
机械节能产品生产企业官网模板...
大气智能家居家具装修装饰类企业通用网站模板...
礼品公司网站模板
宽屏简约大气婚纱摄影影楼模板...
蓝白WAP手机综合医院类整站源码(独立后台)...苏ICP备2024110244号-2 苏公网安备32050702011978号 增值电信业务经营许可证编号:苏B2-20251499 | Copyright 2018 - 2025 源码网商城 (www.ymwmall.com) 版权所有