pip install beautifulsoup4
<html> <head> <title>hello, world</title> </head> <body> <h1>BeautifulSoup</h1> <p>如何使用BeautifulSoup</p> </body> </html>
from bs4 import BeautifulSoup text = """ <html> <head> <title >hello, world</title> </head> <body> <h1>BeautifulSoup</h1> <p class="bold">如何使用BeautifulSoup</p> <p class="big" id="key1"> 第二个p标签</p> <a href="http://foofish.net" rel="external nofollow">python</a> </body> </html> """ soup = BeautifulSoup(text, "html.parser") # title 标签 >>> soup.title <title>hello, world</title> # p 标签 >>> soup.p <p class="bold">\u5982\u4f55\u4f7f\u7528BeautifulSoup</p> # p 标签的内容 >>> soup.p.string u'\u5982\u4f55\u4f7f\u7528BeautifulSoup'
>>> type(soup) <class 'bs4.BeautifulSoup'> >>> type(soup.h1) <class 'bs4.element.Tag'> >>> type(soup.p.string) <class 'bs4.element.NavigableString'>
>>> soup.h1.name u'h1' >>> soup.p.name u'p'
>>> soup.p['class'] [u'bold']
>>> soup.p.string u'\u5982\u4f55\u4f7f\u7528BeautifulSoup' >>> type(soup.p.string) <class 'bs4.element.NavigableString'> >>> unicode_str = unicode(soup.p.string) >>> unicode_str u'\u5982\u4f55\u4f7f\u7528BeautifulSoup'
>>> soup.body <body>\n<h1>BeautifulSoup</h1>\n<p class="bold">\u5982\u4f55\u4f7f\u7528BeautifulSoup</p>\n</body>
>>> soup.body.p <p class="bold">\u5982\u4f55\u4f7f\u7528BeautifulSoup</p>
>>> soup.body.p.string u'\u5982\u4f55\u4f7f\u7528BeautifulSoup'
find_all( name , attrs , recursive , text , **kwargs )
# 找到所有标签名为title的节点
>>> soup.find_all("title")
[<title>hello, world</title>]
>>> soup.find_all("p")
[<p class="bold">\u5982\u4f55\u4f7f\u7528BeautifulSoup</p>,
<p class="big" id="key1"> \u7b2c\u4e8c\u4e2ap\u6807\u7b7e</p>]
# 找到所有class属性为big的p标签
>>> soup.find_all("p", "big")
[<p class="big" id="key1"> \u7b2c\u4e8c\u4e2ap\u6807\u7b7e</p>]
>>> soup.find_all("p", class_="big")
[<p class="big" id="key1"> \u7b2c\u4e8c\u4e2ap\u6807\u7b7e</p>]
>>> soup.find_all(href="http://foofish.net") [<a href="http://foofish.net" rel="external nofollow">python</a>]
>>> import re
>>> soup.find_all(href=re.compile("^http"))
[<a href="http://foofish.net" rel="external nofollow">python</a>]
>>> soup.find_all(id="key1") [<p class="big" id="key1"> \u7b2c\u4e8c\u4e2ap\u6807\u7b7e</p>] >>> soup.find_all(id=True) [<p class="big" id="key1"> \u7b2c\u4e8c\u4e2ap\u6807\u7b7e</p>]
>>> body_tag = soup.body
>>> body_tag.find_all("a")
[<a href="http://foofish.net" rel="external nofollow">python</a>]
>>> body_tag.find("a")
<a href="http://foofish.net" rel="external nofollow">python</a>
>>> body_tag.find("p")
<p class="bold">\u5982\u4f55\u4f7f\u7528BeautifulSoup</p>
>>> p1 = body_tag.find('p').get_text()
>>> type(p1)
<type 'unicode'>
>>> p1
u'\u5982\u4f55\u4f7f\u7528BeautifulSoup'
>>> p2 = body_tag.find("p").string
>>> type(p2)
<class 'bs4.element.NavigableString'>
>>> p2
u'\u5982\u4f55\u4f7f\u7528BeautifulSoup'
>>>
机械节能产品生产企业官网模板...
大气智能家居家具装修装饰类企业通用网站模板...
礼品公司网站模板
宽屏简约大气婚纱摄影影楼模板...
蓝白WAP手机综合医院类整站源码(独立后台)...苏ICP备2024110244号-2 苏公网安备32050702011978号 增值电信业务经营许可证编号:苏B2-20251499 | Copyright 2018 - 2025 源码网商城 (www.ymwmall.com) 版权所有