<!DOCTYPE html>
<!-- Sample page: two lists (ids "like" and "hate") and a div (id "url") holding two links, all inside the div with id "content". -->
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Test Html</title>
</head>
<body>
<div id="content">
<ul id="like">
<li>like one</li>
<li>like two</li>
<li>like three</li>
</ul>
<ul id="hate">
<li>hate one</li>
<li>hate two</li>
<li>hate three</li>
</ul>
<div id="url">
<a href="http://www.baidu.com">百度一下</a>
<a href="http://www.hao123.com">好123</a>
</div>
</div>
</body></html>
# coding=utf-8
from lxml import etree
# Load the sample page. The context manager closes the file even when
# read() raises, which the manual open()/close() pair did not guarantee.
with open('myHtml.html', 'r') as f:
    html = f.read()

# Parse the markup so it can be queried with XPath.
selector = etree.HTML(html)

# Text of every <li> that is a direct child of the element with id "like".
content = selector.xpath('//*[@id="like"]/li/text()')
for each in content:
    print(each)
like one like two like three
# href attribute of every <a> that is a direct child of the element with
# id "url"; the attribute values are printed one per line.
content = selector.xpath('//*[@id="url"]/a/@href')
for href in content:
    print(href)
http://www.baidu.com http://www.hao123.com
<!DOCTYPE html>
<!-- Sample page: three sibling divs whose ids all begin with "like". -->
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<div id="likeone">like one</div>
<div id="liketwo">like two</div>
<div id="likethree">like three</div>
</body>
</html>
<!DOCTYPE html>
<!-- Sample page: the div with id "text" holds one <p> whose text is split across the <p> itself and nested <b> and <font> elements. -->
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<div id="content">
<div id="text">
<p>hello
<b> world
<font color="#ffe4c4">
Python
</font>
</b>
</p>
</div>
</div>
</body>
</html>
# text() selects only the text nodes that are direct children of <p>;
# text held inside child elements of <p> is not part of this result.
content = selector.xpath('//*[@id="text"]/p/text()')
for fragment in content:
    print(fragment)
# Take the first <p> under the element with id "text" ...
content = selector.xpath('//*[@id="text"]/p')[0]
# ... and let XPath's string(.) concatenate all text inside it, including
# the text of nested elements.
info = content.xpath('string(.)')
# Drop every newline and space character before printing.
data = ''.join(ch for ch in info if ch not in '\n ')
print(data)
# coding=utf-8
import requests
from multiprocessing.dummy import Pool as ThreadPool
import time
def getsource(url):
    """Fetch ``url`` with an HTTP GET and return the response.

    Args:
        url: Address of the page to download.

    Returns:
        The object returned by ``requests.get(url)``, so that callers such
        as ``pool.map`` collect the fetched pages instead of ``None``.
    """
    return requests.get(url)
if __name__ == '__main__':
    # Listing pages with pn = 50, 100, ..., 450.
    urls = ['http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=' + str(i)
            for i in range(50, 500, 50)]

    # Time the single-threaded run: fetch the pages one after another.
    time1 = time.time()
    for i in urls:
        print(i)
        getsource(i)
    time2 = time.time()
    print('单线程耗时 : ' + str(time2 - time1) + ' s')

    # Time the multi-threaded run: the same pages fetched through a pool of
    # four worker threads.
    pool = ThreadPool(4)
    time3 = time.time()
    try:
        results = pool.map(getsource, urls)
    finally:
        # Shut the workers down even if one of the fetches raised.
        pool.close()
        pool.join()
    time4 = time.time()
    print('多线程耗时 : ' + str(time4 - time3) + ' s')
http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50 http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100 http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=150 http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=200 http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=250 http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=300 http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=350 http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=400 http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=450 单线程耗时 : 7.26399993896 s 多线程耗时 : 2.49799990654 s
# coding=utf8
import requests
import re
import time
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import sys
# Python 2 only: switch the interpreter's default string encoding to UTF-8.
# NOTE(review): presumably needed so the scraped text can be concatenated
# with '\n' and written to info.txt without explicit .encode() calls --
# confirm before removing or porting to Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
def changepage(url, total):
    """Build the result-page URLs from the page named in ``url`` up to ``total``.

    Args:
        url: A URL containing a ``page_index=<number>`` parameter; that
            number is the first page generated.
        total: Last page number to generate (inclusive).

    Returns:
        A list of URLs, one per page number from the starting page through
        ``total``, each equal to ``url`` with the ``page_index`` value
        replaced. The list is empty when the starting page is greater than
        ``total``.

    Raises:
        ValueError: If ``url`` has no ``page_index=<number>`` parameter.
    """
    page_param = re.compile(r'page_index=(\d+)')

    # Read the starting page from the page_index parameter itself rather
    # than from the first run of digits anywhere in the URL.
    found = page_param.search(url)
    if found is None:
        raise ValueError(
            'url has no page_index=<number> parameter: %r' % (url,))
    nowpage = int(found.group(1))

    # No extra positional argument to sub(): the fourth positional
    # parameter of re.sub is ``count``, not ``flags``.
    return [page_param.sub('page_index=%d' % i, url)
            for i in range(nowpage, total + 1)]
def spider(url):
    """Download one result page, extract its titles and details, and save them.

    Args:
        url: Address of the result page to fetch.

    The two extracted lists are handed to ``saveinfo``; nothing is returned.
    """
    page = requests.get(url).text
    tree = etree.HTML(page)

    # Both queries start from the <li> children of the same list element.
    items = '//*[@id="component_0__0__6612"]/li'
    title = tree.xpath(items + '/a/@title')
    detail = tree.xpath(items + '/p[3]/span[1]/text()')

    saveinfo(title, detail)
def saveinfo(title, detail):
    """Write each title/detail pair to the module-level output file ``f``.

    Every record is written as the title on one line, the detail on the
    next line, and then a blank line.

    Args:
        title: Sequence of title strings.
        detail: Sequence of detail strings, paired with ``title`` by
            position.
    """
    # zip() pairs every title with its detail, including the last one, and
    # stops at the shorter list instead of raising IndexError when the two
    # lists differ in length.
    records = [t + '\n' + d + '\n\n' for t, d in zip(title, detail)]

    # One write() for the whole batch; writelines() on a string would write
    # it one character at a time.
    f.write(''.join(records))
if __name__ == '__main__':
    url = 'http://search.dangdang.com/?key=Java&act=input&page_index=1'
    # Result pages from the one named in ``url`` through page 80.
    urls = changepage(url, 80)

    pool = ThreadPool(4)
    # ``f`` is bound at module level on purpose: saveinfo() writes to it.
    # The with-block closes the file even if the crawl raises.
    with open('info.txt', 'a') as f:
        time1 = time.time()
        try:
            pool.map(spider, urls)
        finally:
            # Shut the workers down whether or not a page failed.
            pool.close()
            pool.join()
        # Stop the clock as soon as the workers are done, before any
        # status output.
        time2 = time.time()

    print('爬取成功!')
    print('多线程耗时 : ' + str(time2 - time1) + 's')
# time1 = time.time()
# for each in urls:
# spider(each)
# time2 = time.time()
# f.close()
# print '单线程耗时 : ' + str(time2 - time1) + 's'
机械节能产品生产企业官网模板...
大气智能家居家具装修装饰类企业通用网站模板...
礼品公司网站模板
宽屏简约大气婚纱摄影影楼模板...
蓝白WAP手机综合医院类整站源码(独立后台)...苏ICP备2024110244号-2 苏公网安备32050702011978号 增值电信业务经营许可证编号:苏B2-20251499 | Copyright 2018 - 2025 源码网商城 (www.ymwmall.com) 版权所有