$ npm install crawl-pet -g --production
$ crawl-pet
> Set project dir: ./test-crawl-pet
> Create crawl-pet in ./test-crawl-pet [y/n]: y
> Set target url: http://foodshot.co/
> Set save rule [url/simple/group]: url
> Set file type limit:
> The limit: not limit
> Set parser rule module:
> The module: use default crawl-pet.parser
$ crawl-pet -o ./test-crawl-pet
$ crawl-pet --create-parser ./test-crawl-pet/parser.js
// crawl-pet supports using cheerio for page parsing, if you need it
const cheerio = require("cheerio")
/*
 * The header function is called before a request is sent. It can be used to
 * configure the request headers; returning false aborts the request.
 *
 * Parameters:
 *   options:        the request options, see https://github.com/request/request for details
 *   crawler_handle: the object used to communicate with the queue, described below
 *
 * The header function is optional and may be omitted.
 */
exports.header = function(options, crawler_handle) {
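  // Illustrative example only (not part of the generated template): options follows
  // the request module's options format, so custom request headers could be set here, e.g.
  //   options.headers = Object.assign({}, options.headers, { "User-Agent": "Mozilla/5.0" })
  // Returning false from this function would abort the request.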
}
/*
 * The body function is called after the response is received and is used to
 * parse the returned result.
 *
 * Parameters:
 *   url:            the requested url
 *   body:           the response body, as a string
 *   response:       the response object, see https://github.com/request/request for details
 *   crawler_handle: the object used to communicate with the queue; it provides:
 *     .info               : crawl-pet's configuration info
 *     .uri                : the uri info of the current request
 *     .addPage(url)       : add a page to the queue to be parsed
 *     .addDown(url)       : add a file to the queue to be downloaded
 *     .save(content, ext) : save text locally; ext sets the file extension
 *     .over()             : finish the current item and fetch the next one from the queue
 */
exports.body = function(url, body, response, crawler_handle) {
  const re = /\b(href|src)\s*=\s*["']([^'"#]+)/ig
  let m = null
  while ((m = re.exec(body))) {
    let href = m[2]
    if (/\.(png|gif|jpg|jpeg|mp4)\b/i.test(href)) {
      // this adds a download task to the queue
      crawler_handle.addDown(href)
    } else if (!/\.(css|js|json|xml|svg)/.test(href)) {
      // this adds a page to the queue to be parsed
      crawler_handle.addPage(href)
    }
  }
  // remember to always call this once parsing is finished
  crawler_handle.over()
}
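Because parser.js already requires cheerio, the same link extraction can be written with selectors instead of a regular expression. The sketch below is illustrative only: it relies on the crawler_handle callbacks documented above, and the selectors and the commented-out .save() call are assumptions, not part of the generated template.

exports.body = function(url, body, response, crawler_handle) {
  const $ = cheerio.load(body)
  // visit every element that carries an href or src attribute
  $("[href], [src]").each(function() {
    const link = $(this).attr("href") || $(this).attr("src")
    if (!link || link.charAt(0) === "#") {
      return
    }
    if (/\.(png|gif|jpg|jpeg|mp4)\b/i.test(link)) {
      // media file: queue it for download
      crawler_handle.addDown(link)
    } else if (!/\.(css|js|json|xml|svg)\b/i.test(link)) {
      // anything else that looks like a page: queue it for parsing
      crawler_handle.addPage(link)
    }
  })
  // the page title could also be kept as a local text file, e.g.:
  // crawler_handle.save($("title").text(), ".txt")
  // always release the current queue item when parsing is done
  crawler_handle.over()
}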
$ crawl-pet -f ./test-crawl-pet/photos.foodshot.co/*.jpg
$ crawl-pet -l queue
$ crawl-pet --help
Crawl-pet options help:
 -u, --url        string           Destination address
 -o, --outdir     string           Save directory; defaults to the current working directory
 -r, --restart                     Reload all pages
 --clear                           Clear the queue
 --save           string           File saving rule, one of the following options
                                     = url:    save under a path matching the url
                                     = simple: save files directly in the project path
                                     = group:  save 500 files per folder
 --types          array            Limit the downloaded file types
 --limit          number=5         Concurrency limit
 --sleep          number=200       Interval between requests (ms)
 --timeout        number=180000    Queue timeout (ms)
 --proxy          string           Set up a proxy
 --parser         string           Set the crawl rule module (a js file path);
                                   defaults to parser.js in the project path
 --maxsize        number           Limit the maximum size of downloaded files
 --minwidth       number           Limit the minimum width of downloaded files
 --minheight      number           Limit the minimum height of downloaded files
 -i, --info                        View the configuration file
 -l, --list       array            View queue data, e.g. [page/down/queue],0,-1
 -f, --find       array            Find the download URL of a local file
 --json                            Print results in json format
 -v, --version                     View version
 -h, --help                        View help
$ crawl-pet -u https://www.reddit.com/r/funny/ -o reddit --save group
{
"url": "https://www.reddit.com/r/funny/",
"outdir": ".",
"save": "group",
"types": "",
"limit": "5",
"parser": "my_parser.js",
"sleep": "200",
"timeout": "180000",
"proxy": "",
"maxsize": 0,
"minwidth": 0,
"minheight": 0,
"cookie": "over18=1"
}
exports.body = function(url, body, response, crawler_handle) {
const re = /\b(data-url|href|src)\s*=\s*["']([^'"#]+)/ig
  let m = null
  while ((m = re.exec(body))) {
    let href = m[2]
    if (/thumb|user|icon|\.(css|json|js|xml|svg)\b/i.test(href)) {
      // skip thumbnails, avatars, icons and static assets
      continue
    }
    if (/\.(png|gif|jpg|jpeg|mp4)\b/i.test(href)) {
      // media file: queue it for download
      crawler_handle.addDown(href)
      continue
    }
    if (/reddit\.com\/r\//i.test(href)) {
      // only follow links that stay inside a subreddit
      crawler_handle.addPage(href)
    }
  }
crawler_handle.over()
}
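With the custom parser in place, the crawl can be inspected and resumed using the flags from the help output above. The commands below are illustrative combinations of documented options (exact output omitted):

$ crawl-pet -o reddit -i
$ crawl-pet -o reddit -l queue,0,-1
$ crawl-pet -o reddit -r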