# -*- coding: utf-8 -*-
import urllib2
import re
import chardet
class Book_Spider:
    """Download one chapter of a novel from quanben.com and print it.

    Attributes:
        pages: chapters loaded so far, each a dict with ``'title'`` and
            ``'content'`` keys.
        url: address of the chapter page that ``GetPage`` downloads.
    """

    # Chapter downloaded when the caller does not supply a URL.
    DEFAULT_URL = "http://www.quanben.com/xiaoshuo/0/910/59302.html"

    def __init__(self, url=DEFAULT_URL):
        self.pages = []
        self.url = url

    def GetPage(self):
        """Fetch ``self.url`` and return the chapter it contains.

        Returns:
            ``{'title': ..., 'content': ...}`` on success, or ``False`` when
            the title or the chapter markup cannot be found in the page.

        Raises:
            Network and decoding errors are not handled here; they propagate
            to the caller (``LoadPage`` reports them).
        """
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        request = urllib2.Request(self.url, headers=headers)
        myResponse = urllib2.urlopen(request)
        try:
            myPage = myResponse.read()
        finally:
            # Release the connection even if the read fails.
            myResponse.close()
        return self._parse_page(self._decode_page(myPage))

    def _decode_page(self, raw):
        """Turn the downloaded bytes into unicode text.

        The encoding is guessed with chardet. Anything not reported as UTF-8
        is decoded as GB2312, silently dropping bytes that do not decode.
        """
        charset = chardet.detect(raw)['encoding']
        if charset and charset.lower() == 'utf-8':
            return raw.decode('utf-8')
        return raw.decode('gb2312', 'ignore')

    def _parse_page(self, unicodePage):
        """Extract the chapter title and body from decoded page text.

        Returns the chapter dict, or ``False`` (after printing a notice) when
        either piece of markup is missing.
        """
        # Chapter title: the first <h1> element.
        title_match = re.search('<h1>(.*?)</h1>', unicodePage, re.S)
        if title_match is None:
            print('标题 HTML 变化,请重新分析!')
            return False

        # Chapter body: everything inside the "htmlContent" div, up to the
        # next opening <div.
        content_match = re.search(
            '<div.*?id="htmlContent" class="contentbox">(.*?)<div',
            unicodePage, re.S)
        if content_match is None:
            print("内容 HTML 变化,请重新分析!")
            return False

        # Convert the HTML line breaks and non-breaking-space entities that
        # appear inside the body into plain text.
        my_content = content_match.group(1)
        my_content = my_content.replace("<br />", "\n")
        my_content = my_content.replace("&nbsp;", " ")

        # One chapter is stored as a dict of title and content.
        return {'title': title_match.group(1), 'content': my_content}

    def LoadPage(self):
        """Download the chapter and append it to ``self.pages``.

        Returns:
            ``True`` when a chapter was stored, ``False`` when the download
            or the parsing failed (a notice is printed in both failure cases).
        """
        try:
            myPage = self.GetPage()
        except Exception:
            # Best effort: any failure while fetching is reported, not raised.
            print('无法连接服务器!')
            return False
        if myPage is False:
            print('抓取失败!')
            return False
        self.pages.append(myPage)
        return True

    def ShowPage(self, curPage):
        """Print the title and then the content of one chapter dict."""
        print(curPage['title'])
        print(curPage['content'])

    def Start(self):
        """Load the chapter and, if that succeeded, display it."""
        print(u'开始阅读......\n')
        # Bring the chapter into self.pages.
        self.LoadPage()
        # Only show something when a chapter was actually stored.
        if self.pages:
            nowPage = self.pages[0]
            self.ShowPage(nowPage)
# ----------- program entry point -----------
# Show the banner, wait for the user to press Enter, then load and display
# the chapter.
print(u"""
---------------------------------------
程序:阅读呼叫转移
版本:0.1
作者:angryrookie
日期:2014-07-05
语言:Python 2.7
功能:按下回车浏览章节
---------------------------------------
""")
print(u'请按下回车:')
raw_input()
myBook = Book_Spider()
myBook.Start()
<div id="footlink"> <script type="text/javascript" charset="utf-8" src="/scripts/style5.js"></script> <a href="http://www.quanben.com/xiaoshuo/0/910/59301.html">上一页</a> <a href="http://www.quanben.com/xiaoshuo/0/910/">返回目录</a> <a href="http://www.quanben.com/xiaoshuo/0/910/59303.html">下一页</a> </div>
# -*- coding: utf-8 -*-
import urllib2
import re
import thread
import chardet
class Book_Spider:
    """Read a novel from quanben.com chapter by chapter.

    A background thread (``LoadPage``) keeps a few chapters buffered in
    ``self.pages`` while ``Start`` shows them one at a time.

    Attributes:
        pages: chapters downloaded so far, each a dict with ``'title'`` and
            ``'content'`` keys.
        page: 1-based number of the chapter to show next.
        flag: ``True`` while reading continues; set to ``False`` on errors or
            when the user types ``quit``.
        url: address of the next chapter page to download.
    """

    # Chapter the reader starts from when the caller does not supply a URL.
    DEFAULT_URL = "http://www.quanben.com/xiaoshuo/10/10412/2095096.html"

    # Seconds to wait between polls instead of spinning at full CPU.
    _POLL_INTERVAL = 0.05

    def __init__(self, url=DEFAULT_URL):
        self.pages = []
        self.page = 1
        self.flag = True
        self.url = url

    def GetPage(self):
        """Fetch ``self.url``, return its chapter and advance ``self.url``.

        Returns:
            ``{'title': ..., 'content': ...}`` on success, or ``False`` when
            the title, the chapter body or the footer links cannot be found.

        Raises:
            Network and decoding errors are not handled here; they propagate
            to the caller (``LoadPage`` reports them).
        """
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib2.Request(self.url, headers=headers)
        myResponse = urllib2.urlopen(req)
        try:
            myPage = myResponse.read()
        finally:
            # Release the connection even if the read fails.
            myResponse.close()
        return self._parse_page(self._decode_page(myPage))

    def _decode_page(self, raw):
        """Turn the downloaded bytes into unicode text.

        The encoding is guessed with chardet. Anything not reported as UTF-8
        is decoded as GB2312, silently dropping bytes that do not decode.
        """
        charset = chardet.detect(raw)['encoding']
        if charset and charset.lower() == 'utf-8':
            return raw.decode('utf-8')
        return raw.decode('gb2312', 'ignore')

    def _parse_page(self, unicodePage):
        """Extract one chapter from decoded page text and store the next URL.

        Returns the chapter dict, or ``False`` (after printing a notice) when
        any expected piece of markup is missing. ``self.url`` is only updated
        when the whole page parsed successfully.
        """
        # Chapter title: the first <h1> element.
        title_match = re.search('<h1>(.*?)</h1>', unicodePage, re.S)
        if title_match is None:
            print('标题 HTML 变化,请重新分析!')
            return False

        # Chapter body: everything inside the "htmlContent" div, up to the
        # next opening <div.
        content_match = re.search(
            '<div.*?id="htmlContent" class="contentbox">(.*?)<div',
            unicodePage, re.S)
        if content_match is None:
            print("内容 HTML 变化,请重新分析!")
            return False

        # Convert the HTML line breaks and non-breaking-space entities that
        # appear inside the body into plain text.
        my_content = content_match.group(1)
        my_content = my_content.replace("<br />", "\n")
        my_content = my_content.replace("&nbsp;", " ")
        onePage = {'title': title_match.group(1), 'content': my_content}

        # Navigation area at the bottom of the page; the third link in it is
        # taken to be the "next chapter" link.
        foot_match = re.search(
            '<div.*?class="chapter_Turnpage">(.*?)</div>', unicodePage, re.S)
        links = []
        if foot_match is not None:
            links = re.findall(
                u'<a.*?href="(.*?)".*?>(.*?)</a>', foot_match.group(1), re.S)
        if len(links) < 3:
            print("底部链接变化,请重新分析!")
            return False

        # Remember where the next download should start.
        self.url = links[2][0]
        return onePage

    def LoadPage(self):
        """Keep ``self.pages`` up to three chapters ahead of the reader.

        Runs until ``self.flag`` becomes ``False``. A failed download or
        parse prints a notice, clears ``self.flag`` and stops the loop; the
        failed result is never appended to ``self.pages``.
        """
        import time

        while self.flag:
            if len(self.pages) - self.page >= 3:
                # Enough chapters buffered; wait for the reader to catch up.
                time.sleep(self._POLL_INTERVAL)
                continue
            try:
                myPage = self.GetPage()
            except Exception:
                # Best effort: any failure while fetching is reported, not raised.
                print('无法连接网页!')
                self.flag = False
                break
            if myPage is False:
                print('抓取失败!')
                self.flag = False
                break
            self.pages.append(myPage)

    def ShowPage(self, curPage):
        """Print one chapter and ask whether to continue.

        Typing ``quit`` at the prompt clears ``self.flag``; any other input
        (including just Enter) continues to the next chapter.
        """
        print(curPage['title'])
        print(curPage['content'])
        print("\n")
        user_input = raw_input("当前是第 %d 章,回车读取下一章或者输入 quit 退出:" % self.page)
        if user_input == 'quit':
            self.flag = False
        print("\n")

    def Start(self):
        """Start the background loader and show chapters until stopped."""
        import time

        print(u'开始阅读......\n')
        # Download chapters on a separate thread.
        thread.start_new_thread(self.LoadPage, ())
        while self.flag:
            if self.page <= len(self.pages):
                # The chapter to show has been downloaded already.
                nowPage = self.pages[self.page - 1]
                self.ShowPage(nowPage)
                self.page += 1
            else:
                # Nothing to show yet; wait for the loader.
                time.sleep(self._POLL_INTERVAL)
        print(u"本次阅读结束")
# ----------- program entry point -----------
# Show the banner, wait for the user to press Enter, then start reading
# chapter by chapter.
print(u"""
---------------------------------------
程序:阅读呼叫转移
版本:0.2
作者:angryrookie
日期:2014-07-07
语言:Python 2.7
功能:按下回车浏览下一章节
---------------------------------------
""")
print(u'请按下回车:')
raw_input(' ')
myBook = Book_Spider()
myBook.Start()
# -*- coding:utf-8 -*-
import urllib2
import urllib
import re
import thread
import chardet
class Book_Spider:
    """Download a whole novel from quanben.com into a text file.

    Attributes:
        pages: unused by the download flow; kept from the earlier versions.
        page: unused by the download flow; kept from the earlier versions.
        flag: ``True`` while downloading continues; cleared on errors or
            after the last chapter.
        url: address of the next chapter page to download.
    """

    # Chapter the download starts from when the caller does not supply a URL.
    DEFAULT_URL = "http://www.quanben.com/xiaoshuo/0/910/59302.html"

    def __init__(self, url=DEFAULT_URL):
        self.pages = []
        self.page = 1
        self.flag = True
        self.url = url

    def GetPage(self):
        """Fetch ``self.url``, return its chapter and advance ``self.url``.

        Returns:
            ``{'title': ..., 'content': ...}`` on success, or ``False`` when
            the title, the chapter body or the footer links cannot be found.

        Raises:
            Network and decoding errors are not handled here; they propagate
            to the caller (``downloadPage`` reports them).
        """
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib2.Request(self.url, headers=headers)
        myResponse = urllib2.urlopen(req)
        try:
            myPage = myResponse.read()
        finally:
            # Release the connection even if the read fails.
            myResponse.close()
        return self._parse_page(self._decode_page(myPage))

    def _decode_page(self, raw):
        """Turn the downloaded bytes into unicode text.

        The encoding is guessed with chardet. Anything not reported as UTF-8
        is decoded as GB2312, silently dropping bytes that do not decode.
        """
        charset = chardet.detect(raw)['encoding']
        if charset and charset.lower() == 'utf-8':
            return raw.decode('utf-8')
        return raw.decode('gb2312', 'ignore')

    def _parse_page(self, unicodePage):
        """Extract one chapter from decoded page text and store the next URL.

        Returns the chapter dict, or ``False`` (after printing a notice) when
        any expected piece of markup is missing. When the "next" link equals
        the table-of-contents link, ``self.flag`` is cleared so that this
        chapter is the last one downloaded.
        """
        # Chapter title: the first <h1> element.
        title_match = re.search('<h1>(.*?)</h1>', unicodePage, re.S)
        if title_match is None:
            print('标题 HTML 变化,请重新分析!')
            return False

        # Chapter body: everything inside the "htmlContent" div, up to the
        # next opening <div.
        content_match = re.search(
            '<div.*?id="htmlContent" class="contentbox">(.*?)<div',
            unicodePage, re.S)
        if content_match is None:
            print("内容 HTML 变化,请重新分析!")
            return False

        # Convert the HTML line breaks and non-breaking-space entities that
        # appear inside the body into plain text.
        my_content = content_match.group(1)
        my_content = my_content.replace("<br />", "\n")
        my_content = my_content.replace("&nbsp;", " ")
        onePage = {'title': title_match.group(1), 'content': my_content}

        # Navigation area at the bottom of the page: the second link is taken
        # to be the table of contents and the third the next chapter.
        foot_match = re.search(
            '<div.*?class="chapter_Turnpage">(.*?)</div>', unicodePage, re.S)
        links = []
        if foot_match is not None:
            links = re.findall(
                u'<a.*?href="(.*?)".*?>(.*?)</a>', foot_match.group(1), re.S)
        if len(links) < 3:
            print("底部链接变化,请重新分析!")
            return False

        dir_url = links[1][0]
        nextUrl = links[2][0]
        # Remember where the next download should start.
        self.url = nextUrl
        if dir_url == nextUrl:
            # "Next" points back at the table of contents: no more chapters.
            self.flag = False
        return onePage

    def downloadPage(self, filename=u"斗罗大陆.txt"):
        """Download chapters until ``self.flag`` is cleared, writing each one.

        Each chapter is written to ``filename`` (UTF-8) as its title, a blank
        line, the content and two blank lines. A failed download or parse
        prints a notice, clears ``self.flag`` and stops without writing
        anything for that chapter.

        Args:
            filename: path of the text file to create or overwrite.

        Raises:
            Errors from opening or writing the file propagate to the caller;
            the file is closed in every case.
        """
        import io

        with io.open(filename, 'w', encoding='utf-8') as f_txt:
            while self.flag:
                try:
                    myPage = self.GetPage()
                except Exception:
                    # Best effort: any failure while fetching is reported, not raised.
                    print('无法连接服务器!')
                    self.flag = False
                    break
                if myPage is False:
                    print('抓取失败!')
                    self.flag = False
                    break
                f_txt.write(myPage['title'] + u'\n\n')
                f_txt.write(myPage['content'])
                f_txt.write(u'\n\n\n')
                # One unicode string: joining a byte-string label with the
                # unicode title would force an implicit ASCII decode.
                print(u"已下载  " + myPage['title'])

    def Start(self):
        """Run the download from the current URL and report completion."""
        print(u'开始下载......\n')
        self.downloadPage()
        print(u"下载完成")
# ----------- program entry point -----------
# Show the banner, wait for the user to press Enter, then download the
# novel.
print(u"""
---------------------------------------
程序:阅读呼叫转移
版本:0.3
作者:angryrookie
日期:2014-07-08
语言:Python 2.7
功能:按下回车开始下载
---------------------------------------
""")
print(u'请按下回车:')
raw_input(' ')
myBook = Book_Spider()
myBook.Start()
机械节能产品生产企业官网模板...
大气智能家居家具装修装饰类企业通用网站模板...
礼品公司网站模板
宽屏简约大气婚纱摄影影楼模板...
蓝白WAP手机综合医院类整站源码(独立后台)...苏ICP备2024110244号-2 苏公网安备32050702011978号 增值电信业务经营许可证编号:苏B2-20251499 | Copyright 2018 - 2025 源码网商城 (www.ymwmall.com) 版权所有