easy_install beautifulsoup4
pip install beautifulsoup4
apt-get install python-bs4
python setup.py install
# coding=utf-8
'''
Download the images of a Baidu Tieba thread with BeautifulSoup.
'''
from contextlib import closing

try:
    # Python 3: the URL helpers live in urllib.request
    from urllib.request import urlopen, urlretrieve
except ImportError:
    # Python 2: the same helpers are plain functions of the urllib module
    from urllib import urlopen, urlretrieve

from bs4 import BeautifulSoup

url = 'http://tieba.baidu.com/p/3537654215'

# Download the page; closing() releases the connection even if read() fails
with closing(urlopen(url)) as html:
    content = html.read()

# Match the images with BeautifulSoup.  The parser is named explicitly so the
# result does not depend on which optional parsers happen to be installed.
html_soup = BeautifulSoup(content, 'html.parser')

# The image markup was analysed in "Python crawler basics 1 -- urllib"
# (http://blog.xiaolud.com/2015/01/22/spider-1st/).
# Compared with matching by regular expression, BeautifulSoup offers a
# simpler and more flexible way to pick the elements out.
all_img_links = html_soup.find_all('img', class_='BDE_Image')

# Then the usual routine: save each image as 1.jpg, 2.jpg, ...
for img_counter, img_link in enumerate(all_img_links, 1):
    img_name = '%s.jpg' % img_counter
    urlretrieve(img_link['src'], img_name)
<div id="video-summary-content">
<div class="video-summary"> <!-- first video -->
<div class="thumbnail-data">...</div>
<div class="video-summary-data">
<div>
<strong><a href="#link to video page#">#title#</a></strong>
</div>
</div>
</div>
<div class="video-summary"> <!-- second video -->
...
</div>
...
</div>
import requests
import bs4

# Fetch the PyCon US 2014 index page
response = requests.get('http://pyvideo.org/category/50/pycon-us-2014')

# Name the parser explicitly so the parse tree does not depend on which
# optional parser libraries are installed
soup = bs4.BeautifulSoup(response.text, 'html.parser')

# Anchors inside a video-summary-data <div> whose href starts with "/video".
# The attribute value is quoted: an unquoted value containing "/" is not a
# valid CSS selector.
links = soup.select('div.video-summary-data a[href^="/video"]')
import requests
import bs4
# Base address of the site; the category index URL is built on top of it
root_url = 'http://pyvideo.org'
index_url = ''.join((root_url, '/category/50/pycon-us-2014'))
def get_video_page_urls():
    """Return the href of every video link found on the index page.

    Downloads ``index_url`` and collects the ``href`` attribute of each
    anchor inside a ``div.video-summary-data`` whose href starts with
    ``/video``, so the returned URLs are relative to the site root.
    """
    response = requests.get(index_url)
    # Explicit parser: keeps the parse independent of optional libraries
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    # The attribute value must be quoted; a bare "/video" is not valid CSS
    anchors = soup.select('div.video-summary-data a[href^="/video"]')
    return [a.attrs.get('href') for a in anchors]


print(get_video_page_urls())
def get_video_data(video_page_url):
    """Scrape title, speakers and YouTube link from one video page.

    Args:
        video_page_url: URL of the video page, appended to ``root_url``.

    Returns:
        A dict with the keys ``title``, ``speakers`` (list of names) and
        ``youtube_url``.

    Raises:
        IndexError: if the page has no title heading or no YouTube link.
    """
    video_data = {}
    response = requests.get(root_url + video_page_url)
    # Explicit parser: keeps the parse independent of optional libraries
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    video_data['title'] = soup.select('div#videobox h3')[0].get_text()
    # Attribute values are quoted: "/" and ":" are not valid in a bare value
    video_data['speakers'] = [
        a.get_text() for a in soup.select('div#sidebar a[href^="/speaker"]')]
    # NOTE(review): this takes the link *text*, which presumably equals the
    # URL on this site -- confirm, otherwise read the href attribute instead.
    video_data['youtube_url'] = soup.select(
        'div#sidebar a[href^="http://www.youtube.com"]')[0].get_text()
    # Hand the collected fields back; without this the function returns None
    return video_data
def get_video_data(video_page_url):
    """Extend the scraped video data with YouTube view/like/dislike counts.

    This listing shows only the second half of the function; the elided
    part builds ``video_data`` with its ``youtube_url`` entry.  The module
    must import ``re`` for the count parsing below.

    Raises:
        IndexError: if one of the counter elements is missing from the page.
        ValueError: if a counter contains no digits.
    """
    # ... (video_data with title, speakers and youtube_url is built here)
    response = requests.get(video_data['youtube_url'])
    # Explicit parser: keeps the parse independent of optional libraries
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    def _count(selector):
        # First whitespace-separated token of the element text, reduced to
        # its digits (drops thousands separators and similar decoration)
        token = soup.select(selector)[0].get_text().split()[0]
        return int(re.sub('[^0-9]', '', token))

    video_data['views'] = _count('.watch-view-count')
    video_data['likes'] = _count('.likes-count')
    video_data['dislikes'] = _count('.dislikes-count')
    return video_data
def show_video_stats():
    """Print the scraped data dict of every video, one page at a time."""
    video_page_urls = get_video_page_urls()
    for video_page_url in video_page_urls:
        # print() with one argument behaves the same on Python 2 and 3 and
        # matches the call style used by the other listings
        print(get_video_data(video_page_url))
from multiprocessing import Pool


def show_video_stats(options):
    """Scrape all video pages in parallel with a pool of 8 workers.

    Args:
        options: accepted for interface compatibility; not read here.
    """
    pool = Pool(8)
    try:
        video_page_urls = get_video_page_urls()
        # map() blocks until every page is scraped and keeps input order
        results = pool.map(get_video_data, video_page_urls)
    finally:
        # Shut the workers down instead of leaving them to the interpreter
        pool.close()
        pool.join()
import argparse
import re
from multiprocessing import Pool
import requests
import bs4
# Base address of the site; the category index URL is built on top of it
root_url = 'http://pyvideo.org'
index_url = ''.join((root_url, '/category/50/pycon-us-2014'))
def get_video_page_urls():
    """Return the href of every video link found on the index page.

    Downloads ``index_url`` and collects the ``href`` attribute of each
    anchor inside a ``div.video-summary-data`` whose href starts with
    ``/video``, so the returned URLs are relative to the site root.
    """
    response = requests.get(index_url)
    # Explicit parser: keeps the parse independent of optional libraries
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    # The attribute value must be quoted; a bare "/video" is not valid CSS
    anchors = soup.select('div.video-summary-data a[href^="/video"]')
    return [a.attrs.get('href') for a in anchors]
def get_video_data(video_page_url):
    """Scrape one video page and its YouTube page into a dict.

    Args:
        video_page_url: URL of the video page, appended to ``root_url``.

    Returns:
        A dict with the keys ``title``, ``speakers`` (list of names),
        ``youtube_url``, ``views``, ``likes`` and ``dislikes`` (ints).

    Raises:
        IndexError: if an expected element is missing from either page.
        ValueError: if a counter element contains no digits.
    """
    video_data = {}

    # --- video page: title, speakers, link to YouTube ---
    response = requests.get(root_url + video_page_url)
    # Explicit parser: keeps the parse independent of optional libraries
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    video_data['title'] = soup.select('div#videobox h3')[0].get_text()
    # Attribute values are quoted: "/" and ":" are not valid in a bare value
    video_data['speakers'] = [
        a.get_text() for a in soup.select('div#sidebar a[href^="/speaker"]')]
    # NOTE(review): this takes the link *text*, which presumably equals the
    # URL on this site -- confirm, otherwise read the href attribute instead.
    video_data['youtube_url'] = soup.select(
        'div#sidebar a[href^="http://www.youtube.com"]')[0].get_text()

    # --- YouTube page: view / like / dislike counters ---
    response = requests.get(video_data['youtube_url'])
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    def _count(selector):
        # First whitespace-separated token of the element text, reduced to
        # its digits (drops thousands separators and similar decoration)
        token = soup.select(selector)[0].get_text().split()[0]
        return int(re.sub('[^0-9]', '', token))

    video_data['views'] = _count('.watch-view-count')
    video_data['likes'] = _count('.likes-count')
    video_data['dislikes'] = _count('.dislikes-count')
    return video_data
def parse_args(argv=None):
    """Parse the command-line options of the scraper.

    Args:
        argv: optional list of argument strings.  When None (the default)
            argparse reads ``sys.argv[1:]``, exactly as before.

    Returns:
        An ``argparse.Namespace`` with ``sort`` (one of views/likes/dislikes,
        default views), ``max`` (int or None), ``csv`` (bool, default False)
        and ``workers`` (int, default 8).
    """
    parser = argparse.ArgumentParser(description='Show PyCon 2014 video statistics.')
    parser.add_argument('--sort', metavar='FIELD', choices=['views', 'likes', 'dislikes'],
                        default='views',
                        help='sort by the specified field. Options are views, likes and dislikes.')
    parser.add_argument('--max', metavar='MAX', type=int, help='show the top MAX entries only.')
    parser.add_argument('--csv', action='store_true', default=False,
                        help='output the data in CSV format.')
    parser.add_argument('--workers', type=int, default=8,
                        help='number of workers to use, 8 by default.')
    return parser.parse_args(argv)
def show_video_stats(options):
    """Scrape every video in parallel and print a sorted report.

    Args:
        options: namespace with ``workers`` (pool size), ``sort`` (key of the
            video dict to sort by, descending), ``max`` (row limit or None)
            and ``csv`` (True for CSV output, False for a plain table).
    """
    def _csv_text(value):
        # Double embedded quotes so a quoted CSV field stays well-formed
        return value.replace('"', '""')

    pool = Pool(options.workers)
    try:
        video_page_urls = get_video_page_urls()
        results = sorted(pool.map(get_video_data, video_page_urls),
                         key=lambda video: video[options.sort], reverse=True)
    finally:
        # Shut the workers down instead of leaving them to the interpreter
        pool.close()
        pool.join()

    # "limit" rather than "max" so the builtin is not shadowed.  A negative
    # limit prints nothing, as the index loop over range(max) did.
    limit = options.max
    if limit is not None:
        results = results[:max(0, limit)]

    if options.csv:
        # Header without the stray space that used to precede "views"
        print(u'"title","speakers","views","likes","dislikes"')
    else:
        print(u'Views +1 -1 Title (Speakers)')

    for video in results:
        speakers = ', '.join(video['speakers'])
        if options.csv:
            print(u'"{0}","{1}",{2},{3},{4}'.format(
                _csv_text(video['title']), _csv_text(speakers), video['views'],
                video['likes'], video['dislikes']))
        else:
            print(u'{0:5d} {1:3d} {2:3d} {3} ({4})'.format(
                video['views'], video['likes'], video['dislikes'], video['title'],
                speakers))
# Run the report only when executed as a script, not when imported
if __name__ == '__main__':
    show_video_stats(parse_args())
(venv) $ python pycon-scraper.py --sort views --max 25 --workers 8
Views +1 -1 Title (Speakers)
3002 27 0 Keynote - Guido Van Rossum (Guido Van Rossum)
2564 21 0 Computer science fundamentals for self-taught programmers (Justin Abrahms)
2369 17 0 Ansible - Python-Powered Radically Simple IT Automation (Michael Dehaan)
2165 27 6 Analyzing Rap Lyrics with Python (Julie Lavoie)
2158 24 3 Exploring Machine Learning with Scikit-learn (Jake Vanderplas, Olivier Grisel)
2065 13 0 Fast Python, Slow Python (Alex Gaynor)
2024 24 0 Getting Started with Django, a crash course (Kenneth Love)
1986 47 0 It's Dangerous to Go Alone: Battling the Invisible Monsters in Tech (Julie Pagano)
1843 24 0 Discovering Python (David Beazley)
1672 22 0 All Your Ducks In A Row: Data Structures in the Standard Library and Beyond (Brandon Rhodes)
1558 17 1 Keynote - Fernando Pérez (Fernando Pérez)
1449 6 0 Descriptors and Metaclasses - Understanding and Using Python's More Advanced Features (Mike Müller)
1402 12 0 Flask by Example (Miguel Grinberg)
1342 6 0 Python Epiphanies (Stuart Williams)
1219 5 0 0 to 00111100 with web2py (G. Clifford Williams)
1169 18 0 Cheap Helicopters In My Living Room (Ned Jackson Lovely)
1146 11 0 IPython in depth: high productivity interactive and parallel python (Fernando Perez)
1127 5 0 2D/3D graphics with Python on mobile platforms (Niko Skrypnik)
1081 8 0 Generators: The Final Frontier (David Beazley)
1067 12 0 Designing Poetic APIs (Erik Rose)
1064 6 0 Keynote - John Perry Barlow (John Perry Barlow)
1029 10 0 What Is Async, How Does It Work, And When Should I Use It? (A. Jesse Jiryu Davis)
981 11 0 The Sorry State of SSL (Hynek Schlawack)
961 12 2 Farewell and Welcome Home: Python in Two Genders (Naomi Ceder)
958 6 0 Getting Started Testing (Ned Batchelder)
机械节能产品生产企业官网模板...
大气智能家居家具装修装饰类企业通用网站模板...
礼品公司网站模板
宽屏简约大气婚纱摄影影楼模板...
蓝白WAP手机综合医院类整站源码(独立后台)...苏ICP备2024110244号-2 苏公网安备32050702011978号 增值电信业务经营许可证编号:苏B2-20251499 | Copyright 2018 - 2025 源码网商城 (www.ymwmall.com) 版权所有