<div class="content"> <div class="widget-gallery"> <ul class="pagelist-wrapper"> <li class="gallery-item...
<div class="content"> <div class="widget-gallery"></div> </div>
https://tuchong.com/rest/tags/美女/posts?page=1&count=20&order=weekly&before_timestamp=
{
"postList": [
{
"post_id": "15624611",
"type": "multi-photo",
"url": "https://weishexi.tuchong.com/15624611/",
"site_id": "443122",
"author_id": "443122",
"published_at": "2017-10-28 18:01:03",
"excerpt": "10月18日",
"favorites": 4052,
"comments": 353,
"rewardable": true,
"parent_comments": "165",
"rewards": "2",
"views": 52709,
"title": "微风不燥 秋意正好",
"image_count": 15,
"images": [
{
"img_id": 11585752,
"user_id": 443122,
"title": "",
"excerpt": "",
"width": 5016,
"height": 3840
},
{
"img_id": 11585737,
"user_id": 443122,
"title": "",
"excerpt": "",
"width": 3840,
"height": 5760
},
...
],
"title_image": null,
"tags": [
{
"tag_id": 131,
"type": "subject",
"tag_name": "人像",
"event_type": "",
"vote": ""
},
{
"tag_id": 564,
"type": "subject",
"tag_name": "美女",
"event_type": "",
"vote": ""
}
],
"favorite_list_prefix": [],
"reward_list_prefix": [],
"comment_list_prefix": [],
"cover_image_src": "https://photo.tuchong.com/443122/g/11585752.webp",
"is_favorite": false
}
],
"siteList": {...},
"following": false,
"coverUrl": "https://photo.tuchong.com/443122/ft640/11585752.webp",
"tag_name": "美女",
"tag_id": "564",
"url": "https://tuchong.com/tags/%E7%BE%8E%E5%A5%B3/",
"more": true,
"result": "SUCCESS"
}
(PROJECT)
│  scrapy.cfg
│
└─tuchong
    │  items.py
    │  middlewares.py
    │  pipelines.py
    │  settings.py
    │  __init__.py
    │
    ├─spiders
    │  │  photo.py
    │  │  __init__.py
    │  │
    │  └─__pycache__
    │          __init__.cpython-36.pyc
    │
    └─__pycache__
            settings.cpython-36.pyc
            __init__.cpython-36.pyc
import scrapy


class TuchongItem(scrapy.Item):
    """Scrapy item holding the fields collected for one Tuchong post.

    Each attribute is declared as a ``scrapy.Field()`` so the spider can
    assign it with ``item['<name>'] = value``.
    """

    post_id = scrapy.Field()
    site_id = scrapy.Field()
    title = scrapy.Field()
    type = scrapy.Field()
    url = scrapy.Field()
    image_count = scrapy.Field()
    images = scrapy.Field()
    tags = scrapy.Field()
    excerpt = scrapy.Field()
    # NOTE(review): the listing is truncated at this point in the source;
    # further fields may follow in the real file.
    ...
import scrapy


class PhotoSpider(scrapy.Spider):
    """Skeleton spider named ``photo``; ``parse`` is still a placeholder."""

    name = 'photo'
    allowed_domains = ['tuchong.com']
    start_urls = ['http://tuchong.com/']

    def parse(self, response):
        """Callback for downloaded responses; currently does nothing."""
        pass
import scrapy, json
from ..items import TuchongItem
class PhotoSpider(scrapy.Spider):
    """Spider that reads Tuchong's tag REST endpoint and emits TuchongItem objects.

    Requests are generated in ``start_requests`` rather than from
    ``start_urls``; every JSON response is handled by ``parse``.
    """

    name = 'photo'
    # allowed_domains = ['tuchong.com']
    # start_urls = ['http://tuchong.com/']

    def start_requests(self):
        """Yield one request per result page (pages 1 through 10).

        The URL asks for 20 posts per page, so up to 200 posts are requested
        in total. ``parse`` is registered as the callback for each request.
        """
        url = 'https://tuchong.com/rest/tags/%s/posts?page=%d&count=20&order=weekly'
        for page in range(1, 11):
            yield scrapy.Request(url=url % ('美女', page), callback=self.parse)

    def parse(self, response):
        """Convert one JSON response into a list of TuchongItem objects.

        :param response: text response whose body is a JSON document with a
            ``postList`` array.
        :returns: list of populated ``TuchongItem`` instances; empty when the
            payload has no ``postList``.
        """
        # response.text replaces the deprecated body_as_unicode().
        body = json.loads(response.text)
        items = []
        # A missing or null postList is treated as "no posts" instead of
        # raising KeyError/TypeError.
        for post in body.get('postList') or []:
            item = TuchongItem()
            item['type'] = post['type']
            item['post_id'] = post['post_id']
            item['site_id'] = post['site_id']
            item['title'] = post['title']
            item['url'] = post['url']
            item['excerpt'] = post['excerpt']
            item['image_count'] = int(post['image_count'])
            # Map each image id to its download URL: {img_id: img_url}.
            item['images'] = {}
            for img in post.get('images') or []:
                img_id = img['img_id']
                url = 'https://photo.tuchong.com/%s/f/%s.jpg' % (item['site_id'], img_id)
                item['images'][img_id] = url
            # Reduce the tag objects to a plain list of tag names.
            item['tags'] = [tag['tag_name'] for tag in post.get('tags') or []]
            items.append(item)
        return items
...
def process_item(self, item, spider):
    """Filter scraped items, printing and passing through the ones that qualify.

    :param item: mapping with at least ``image_count``, ``type`` and ``url``.
    :param spider: the spider that produced the item (unused here).
    :returns: the unchanged ``item`` when it has at least 3 images and its
        type is ``'multi-photo'``.
    :raises DropItem: when the item has fewer than 3 images, or when its
        type is not ``'multi-photo'``.
    """
    if int(item['image_count']) < 3:
        raise DropItem("美女太少: " + item['url'])
    if item['type'] != 'multi-photo':
        # The original had a stray unary "+" before item['url'], which raised
        # TypeError on a str instead of the intended DropItem.
        raise DropItem("格式不对: " + item['url'])
    print(item['url'])
    return item
...
# Enabled item pipelines, as {dotted pipeline path: order value};
# pipelines with lower numbers run first.
ITEM_PIPELINES = {'tuchong.pipelines.TuchongPipeline': 300}
scrapy crawl photo
[scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 491,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 10224,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 11, 27, 7, 20, 24, 414201),
'item_dropped_count': 5,
'item_dropped_reasons_count/DropItem': 5,
'item_scraped_count': 15,
'log_count/DEBUG': 18,
'log_count/INFO': 8,
'log_count/WARNING': 5,
'response_received_count': 2,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2017, 11, 27, 7, 20, 23, 867300)}
scrapy crawl photo -o output.json # 输出为JSON文件
scrapy crawl photo -o output.csv # 输出为CSV文件
... def process_item(self, item, spider): ... else: print(item['url']) self.myblog.add_post(item) # myblog 是一个数据库类,用于处理数据库操作 return item ...
机械节能产品生产企业官网模板...
大气智能家居家具装修装饰类企业通用网站模板...
礼品公司网站模板
宽屏简约大气婚纱摄影影楼模板...
蓝白WAP手机综合医院类整站源码(独立后台)...苏ICP备2024110244号-2 苏公网安备32050702011978号 增值电信业务经营许可证编号:苏B2-20251499 | Copyright 2018 - 2025 源码网商城 (www.ymwmall.com) 版权所有