import re
findall(string[, pos[, endpos]]) | re.findall(pattern, string[, flags])
compile(pattern[,flags] )
>>> import re >>> string="A1.45,b5,6.45,8.82" >>> regex = re.compile(r"\d+\.?\d*") >>> print regex.findall(string) ['1.45', '5', '6.45', '8.82'] >>>
match(string[, pos[, endpos]]) | re.match(pattern, string[, flags])
search(string[, pos[, endpos]]) | re.search(pattern, string[, flags])
# coding=utf-8
# Fetch the Baidu home page and print its <title> text, first with a
# capturing group and then with lookbehind/lookahead assertions.
# NOTE: urllib.urlopen is the Python 2 API; on Python 3 the equivalent is
# urllib.request.urlopen, whose read() returns bytes that must be decoded.
import re
import urllib

url = "http://www.baidu.com/"
content = urllib.urlopen(url).read()

# Variant 1: capture the text between <title> and </title>.
title = re.findall(r'<title>(.*?)</title>', content)
print(title[0])
# Expected output: 百度一下,你就知道

# Variant 2: match only the text itself; the tags are asserted by the
# lookbehind/lookahead and are not part of the match.
pat = r'(?<=<title>).*?(?=</title>)'
ex = re.compile(pat, re.M | re.S)
obj = ex.search(content)
title = obj.group()
print(title)
# Expected output: 百度一下,你就知道
# coding=utf-8
# Fetch the Baidu home page and print every hyperlink, then the text inside
# every hyperlink.
# NOTE: urllib.urlopen is the Python 2 API (urllib.request.urlopen on 3).
import re
import urllib

url = "http://www.baidu.com/"
content = urllib.urlopen(url).read()

# Complete hyperlinks, from "<a" through the closing "</a>".
res = r"<a.*?href=.*?<\/a>"
urls = re.findall(res, content)
for u in urls:
    # The page bytes are UTF-8; decode so the text prints as characters.
    print(u.decode('utf-8'))

# Text between <a ...> and </a>; re.S lets "." span line breaks.
res = r'<a .*?>(.*?)</a>'
texts = re.findall(res, content, re.S | re.M)
for t in texts:
    print(t.decode('utf-8'))
#获取完整超链接 <a href="http://news.baidu.com" rel="external nofollow" rel="external nofollow" name="tj_trnews" class="mnav">新闻</a> <a href="http://www.hao123.com" rel="external nofollow" rel="external nofollow" name="tj_trhao123" class="mnav">hao123</a> <a href="http://map.baidu.com" rel="external nofollow" rel="external nofollow" name="tj_trmap" class="mnav">地图</a> <a href="http://v.baidu.com" rel="external nofollow" rel="external nofollow" name="tj_trvideo" class="mnav">视频</a> ... #获取超链接<a>和</a>之间内容 新闻 hao123 地图 视频 ...
<html> <head><title>表格</title></head> <body> <table border=1> <tr><th>学号</th><th>姓名</th></tr> <tr><td>1001</td><td>杨秀璋</td></tr> <tr><td>1002</td><td>严娜</td></tr> </table> </body> </html>
# coding=utf-8
# Read the local sample page test.html and print its table rows, header
# cells and data cells using regular expressions.
import re
import urllib

content = urllib.urlopen("test.html").read()  # open the local file

# Contents between <tr> and </tr>.
res = r'<tr>(.*?)</tr>'
texts = re.findall(res, content, re.S | re.M)
for m in texts:
    print(m)

# Contents between <th> and </th>, searched row by row.
res_th = r'<th>(.*?)</th>'  # loop-invariant, so defined once
for m in texts:
    m_th = re.findall(res_th, m, re.S | re.M)
    for t in m_th:
        print(t)

# Contents of each pair of adjacent <td> cells, taken straight from the page.
res = r'<td>(.*?)</td><td>(.*?)</td>'
texts = re.findall(res, content, re.S | re.M)
for m in texts:
    # Formatted explicitly so the output is "a b" on Python 2 and 3 alike.
    print("%s %s" % (m[0], m[1]))
>>> <th>学号</th><th>姓名</th> <td>1001</td><td>杨秀璋</td> <td>1002</td><td>严娜</td> 学号 姓名 1001 杨秀璋 1002 严娜 >>>
# coding=utf-8
# Extract the URL of every href attribute from a fragment of HTML.
import re

content = '''
<a href="http://news.baidu.com" rel="external nofollow" rel="external nofollow" name="tj_trnews" class="mnav">新闻</a>
<a href="http://www.hao123.com" rel="external nofollow" rel="external nofollow" name="tj_trhao123" class="mnav">hao123</a>
<a href="http://map.baidu.com" rel="external nofollow" rel="external nofollow" name="tj_trmap" class="mnav">地图</a>
<a href="http://v.baidu.com" rel="external nofollow" rel="external nofollow" name="tj_trvideo" class="mnav">视频</a>
'''

# Two alternatives: a double-quoted href value or a single-quoted one.  The
# quotes are asserted by lookbehind/lookahead, so only the URL is returned.
res = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
urls = re.findall(res, content, re.I | re.S | re.M)
for url in urls:
    print(url)
>>> http://news.baidu.com http://www.hao123.com http://map.baidu.com http://v.baidu.com >>>
# Extract the src attribute value of an <img> tag.
content = '''<img alt="Python" src="http://www..csdn.net/eastmount.jpg" />'''
urls = re.findall(r'src="(.*?)"', content, re.I | re.S | re.M)
print(urls)
# ['http://www..csdn.net/eastmount.jpg']
# Take the file name of an image from its URL: the part after the last "/".
content = '''<img alt="Python" src="http://www..csdn.net/eastmount.jpg" />'''
urls = 'http://www..csdn.net/eastmount.jpg'
# Only the last component is needed, so split once from the right.
name = urls.rsplit('/', 1)[-1]
print(name)
# eastmount.jpg
# Slice the first infobox table out of the page text held in `content`
# (the closing </table> tag itself is not included).
start = content.find('<table class="infobox"')  # start position
# Look for the closing tag only after the table opens, so an earlier
# </table> on the page cannot be picked up as the end position.
end = content.find('</table>', start) if start != -1 else -1  # end position
# str.find returns -1 on a miss; slicing with -1 would silently yield the
# wrong span, so fall back to an empty string instead.
infobox = content[start:end] if end != -1 else ''
print(infobox)
# coding=utf-8
# Extract each pair of adjacent <td> cells from a table fragment.  The cell
# text is printed as found, so leftover markup such as <br /> and <B> shows.
import re

content = '''
<tr><td>1001</td><td>杨秀璋<br /></td></tr>
<tr><td>1002</td><td>颜 娜</td></tr>
<tr><td>1003</td><td><B>Python</B></td></tr>
'''

res = r'<td>(.*?)</td><td>(.*?)</td>'
texts = re.findall(res, content, re.S | re.M)
for m in texts:
    # Formatted explicitly so the output is "a b" on Python 2 and 3 alike.
    print("%s %s" % (m[0], m[1]))
>>> 1001 杨秀璋<br /> 1002 颜 娜 1003 <B>Python</B> >>>
# coding=utf-8
# Extract each pair of adjacent <td> cells from a table fragment and clean
# the cell text: drop <br /> tags and spaces, and unwrap <B>...</B>.
import re

content = '''
<tr><td>1001</td><td>杨秀璋<br /></td></tr>
<tr><td>1002</td><td>颜 娜</td></tr>
<tr><td>1003</td><td><B>Python</B></td></tr>
'''

res = r'<td>(.*?)</td><td>(.*?)</td>'
texts = re.findall(res, content, re.S | re.M)

rows = []  # cleaned (first cell, second cell) pairs, in page order
for m in texts:
    # Remove '<br />' before the spaces: the tag itself contains a space and
    # would no longer match once spaces are stripped.
    value0 = m[0].replace('<br />', '').replace(' ', '')
    value1 = m[1].replace('<br />', '').replace(' ', '')
    # Unwrap bold text.  Checking the match list (rather than just testing
    # for '<B>') avoids an IndexError when the closing tag is missing.
    bold = re.findall(r'<B>(.*?)</B>', value1, re.S | re.M)
    if bold:
        value1 = bold[0]
    rows.append((value0, value1))
    print("%s %s" % (value0, value1))
>>> 1001 杨秀璋 1002 颜娜 1003 Python >>>
<div class="essay"> <h1 style="text-align:center"> <a href="http://blog.csdn.net/eastmount/.../52201984" rel="external nofollow" > 再见北理工:忆北京研究生的编程时光 </a> </h1> <p style="text-indent: 2em;"> 两年前,我本科毕业写了这样一篇文章:《 回忆自己的大学四年得与失 》,感慨了自己在北理软院四年的所得所失;两年后,我离开了帝都,回到了贵州家乡,准备开启一段新的教师生涯,在此也写一篇文章纪念下吧! 还是那句话:这篇文章是写给自己的,希望很多年之后,回想起自己北京的六年时光,也是美好的回忆。文章可能有点长,但希望大家像读小说一样耐心品读,.... </p> </div>
# Fetch the blog home page and print the value of every src="..." attribute.
# NOTE: urllib.urlopen is the Python 2 API (urllib.request.urlopen on 3).
import re
import urllib

url = "http://www.eastmountyxz.com/"
content = urllib.urlopen(url).read()
urls = re.findall(r'src="(.*?)"', content)
# A separate loop name keeps the page address in `url` from being overwritten.
for img_url in urls:
    print(img_url)
# Fetch the blog home page and print the HTML of the first essay: the text
# from the "essay" div up to (not including) the "essay1" div.
# NOTE: urllib.urlopen is the Python 2 API (urllib.request.urlopen on 3).
import re
import urllib

url = "http://www.eastmountyxz.com/"
content = urllib.urlopen(url).read()
start = content.find(r'<div class="essay">')
end = content.find(r'<div class="essay1">')
print(content[start:end])
# Fetch the blog home page and print the link, title and summary of the
# first essay (the span between the "essay" and "essay1" divs).
# NOTE: urllib.urlopen is the Python 2 API (urllib.request.urlopen on 3).
import re
import urllib

url = "http://www.eastmountyxz.com/"
content = urllib.urlopen(url).read()
start = content.find(r'<div class="essay">')
end = content.find(r'<div class="essay1">')
page = content[start:end]

# href value in double or single quotes; the quotes are asserted by
# lookbehind/lookahead and are not part of the match.
res = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
t1 = re.findall(res, page)  # hyperlink
print(t1[0])
t2 = re.findall(r'<a .*?>(.*?)</a>', page)  # title
print(t2[0])
# re.S lets "." span the line breaks inside the paragraph.
t3 = re.findall('<p style=.*?>(.*?)</p>', page, re.M | re.S)  # summary
print(t3[0])
>>> http://blog.csdn.net/eastmount/article/details/52201984 再见北理工:忆北京研究生的编程时光 两年前,我本科毕业写了这样一篇文章:《 回忆自己的大学四年得与失 》,感慨了自己在北理软院四年的所得所失;两年后,我离开了帝都,回到了贵州家乡,准备开启一段新的教师生涯,在此也写一篇文章纪念下吧! 还是那句话:这篇文章是写给自己的,希望很多年之后,回想起自己北京的六年时光,也是美好的回忆。文章可能有点长,但希望大家像读小说一样耐心品读,.... >>>
#coding:utf-8
# Crawl the blog home page: print its title, every image address, and the
# link, title and summary of the first two essays.
# NOTE: urllib.urlopen is the Python 2 API (urllib.request.urlopen on 3).
import re
import urllib


def extract_essay(html, start_marker, end_marker):
    """Return (link, title, summary) for the essay between two markers.

    ``html`` is sliced from the first occurrence of ``start_marker`` up to
    (not including) the first occurrence of ``end_marker``.  The first href
    value, the first ``<a ...>`` text and the first ``<p style=...>`` body
    found in that slice are returned.

    Raises IndexError when any of the three is not found in the slice.
    """
    start = html.find(start_marker)
    end = html.find(end_marker)
    page = html[start:end]
    # href value in double or single quotes; the quotes are asserted by
    # lookbehind/lookahead and are not part of the match.
    res = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
    links = re.findall(res, page)
    titles = re.findall(r'<a .*?>(.*?)</a>', page)
    # re.S lets "." span the line breaks inside the paragraph.
    summaries = re.findall('<p style=.*?>(.*?)</p>', page, re.M | re.S)
    return links[0], titles[0], summaries[0]


url = "http://www.eastmountyxz.com/"
content = urllib.urlopen(url).read()

# Page title
title = re.findall(r'<title>(.*?)</title>', content)
print(title[0])

# Image addresses
urls = re.findall(r'src="(.*?)"', content)
# A separate loop name keeps the page address in `url` from being overwritten.
for img_url in urls:
    print(img_url)

# Essay content: first essay, a blank separator line, then the second essay.
for field in extract_essay(content, r'<div class="essay">', r'<div class="essay1">'):
    print(field)
print('')
for field in extract_essay(content, r'<div class="essay1">', r'<div class="essay2">'):
    print(field)
机械节能产品生产企业官网模板...
大气智能家居家具装修装饰类企业通用网站模板...
礼品公司网站模板
宽屏简约大气婚纱摄影影楼模板...
蓝白WAP手机综合医院类整站源码(独立后台)...苏ICP备2024110244号-2 苏公网安备32050702011978号 增值电信业务经营许可证编号:苏B2-20251499 | Copyright 2018 - 2025 源码网商城 (www.ymwmall.com) 版权所有