Python multithreading example: scraping Tianya post content

  • Posted: 2021-07-01 15:15
Summary: a Python multithreading example that scrapes the content of a Tianya forum post.
This example uses the re, urllib, and threading modules to download a Tianya post across multiple threads. Set url to the first page of the Tianya post you want to scrape, and set file_name to the name of the output file.
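For reference, each page's URL is built from the first-page URL by dropping the trailing '1.shtml' (seven characters) and appending the page number, exactly as main() in the listing below does:

    url = 'http://bbs.tianya.cn/post-16-996521-1.shtml'
    # url[:-7] drops the trailing '1.shtml'; page 3 becomes:
    myurl = '%s%s.shtml' % (url[:-7], 3)
    # myurl == 'http://bbs.tianya.cn/post-16-996521-3.shtml'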
down_tianya.py
#coding:utf-8
import urllib
import re
import threading
import os


class Down_Tianya(threading.Thread):
    """Worker thread that downloads one page of the post."""

    def __init__(self, url, num, page_dict):
        threading.Thread.__init__(self)
        self.url = url
        self.num = num
        self.txt_dict = page_dict

    def run(self):
        print 'downloading from %s' % self.url
        self.down_text()

    def down_text(self):
        """Scrape the post bodies from self.url and store them in the
        shared dict, keyed by page number."""
        html_content = urllib.urlopen(self.url).read()
        text_pattern = re.compile(r'<div class="atl-item".*?<span>时间:(.*?)</span>.*?<!-- <div class="host-ico">楼主</div> -->.*?<div class="bbs-content.*?>\s*(.*?)</div>', re.DOTALL)
        text = text_pattern.findall(html_content)
        text_join = ['\r\n\r\n\r\n\r\n'.join(item) for item in text]
        self.txt_dict[self.num] = text_join


def page(url):
    """Read the total page count from the first page's pagination links."""
    html_page = urllib.urlopen(url).read()
    page_pattern = re.compile(r'<a href="\S*?">(\d*)</a>\s*<a href="\S*?" class="\S*?">下页</a>')
    page_result = page_pattern.search(html_page)
    if page_result:
        page_num = int(page_result.group(1))
        return page_num


def write_text(page_dict, fn):
    """Write the dict to a text file page by page; each value is the
    list of post bodies for that page."""
    tx_file = open(fn, 'w+')
    pn = len(page_dict)
    for i in range(1, pn + 1):
        tx_list = page_dict[i]
        for tx in tx_list:
            tx = tx.replace('<br>', '\r\n').replace('<br />', '\r\n').replace(' ', '')
            tx_file.write(tx.strip() + '\r\n' * 4)
    tx_file.close()


def main():
    url = 'http://bbs.tianya.cn/post-16-996521-1.shtml'
    file_name = 'abc.txt'
    my_page = page(url)
    my_dict = {}
    print 'page num is : %s' % my_page
    threads = []
    # Build one URL per page and download all pages in parallel.
    for num in range(1, my_page + 1):
        myurl = '%s%s.shtml' % (url[:-7], num)
        downlist = Down_Tianya(myurl, num, my_dict)
        downlist.start()
        threads.append(downlist)
    # Wait for every download to finish before writing the file.
    for t in threads:
        t.join()
    write_text(my_dict, file_name)
    print 'All download finished. Save file at directory: %s' % os.getcwd()


if __name__ == '__main__':
    main()
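The listing above is Python 2 (print statements, urllib.urlopen). Below is a minimal Python 3 sketch of the same approach; it keeps the two regexes and the sample URL from the original but swaps the hand-rolled threading.Thread subclass for concurrent.futures.ThreadPoolExecutor. Function names such as fetch_page, the worker count, and the utf-8 decoding are assumptions for illustration, not part of the original article.

    # coding: utf-8
    # Python 3 sketch: urllib.request + ThreadPoolExecutor.
    # Regexes and sample URL come from the listing above; everything
    # else (fetch_page, max_workers=8, utf-8 decoding) is illustrative.
    import os
    import re
    from concurrent.futures import ThreadPoolExecutor
    from urllib.request import urlopen

    TEXT_PATTERN = re.compile(
        r'<div class="atl-item".*?<span>时间:(.*?)</span>.*?'
        r'<!-- <div class="host-ico">楼主</div> -->.*?'
        r'<div class="bbs-content.*?>\s*(.*?)</div>', re.DOTALL)
    PAGE_PATTERN = re.compile(
        r'<a href="\S*?">(\d*)</a>\s*<a href="\S*?" class="\S*?">下页</a>')

    def fetch_page(url):
        """Download one page and return the decoded HTML.
        Assumes UTF-8; adjust the codec if the site serves GBK."""
        return urlopen(url).read().decode('utf-8', errors='replace')

    def total_pages(first_page_url):
        """Read the page count from the first page's pagination links."""
        match = PAGE_PATTERN.search(fetch_page(first_page_url))
        return int(match.group(1)) if match else 1

    def extract_posts(url):
        """Return the list of (timestamp, body) tuples on one page."""
        return TEXT_PATTERN.findall(fetch_page(url))

    def main():
        url = 'http://bbs.tianya.cn/post-16-996521-1.shtml'
        file_name = 'abc.txt'
        pages = total_pages(url)
        print('page num is: %s' % pages)
        # Same URL construction as the original: strip the trailing
        # '1.shtml' and append the page number.
        urls = ['%s%s.shtml' % (url[:-7], n) for n in range(1, pages + 1)]
        with ThreadPoolExecutor(max_workers=8) as pool:
            results = pool.map(extract_posts, urls)  # keeps page order
        with open(file_name, 'w', encoding='utf-8') as fh:
            for posts in results:
                for stamp, body in posts:
                    body = body.replace('<br>', '\n').replace('<br />', '\n')
                    fh.write(stamp + '\n' + body.strip() + '\n' * 4)
        print('All downloads finished. Saved in: %s' % os.getcwd())

    if __name__ == '__main__':
        main()

Because pool.map yields results in the order the URLs were submitted, the shared dict keyed by page number is no longer needed to keep the pages sorted.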