#!/usr/bin/python
#-*- coding: utf-8 -*-
'''''
Created on 2014-03-16
@author: Kris
'''
import urllib2, re, cookielib
def httpCrawler(url):
'''''
@summary: 网页抓取
'''
content = httpRequest(url)
title = parseHtml(content)
saveData(title)
def httpRequest(url):
'''''
@summary: 网络请求
'''
try:
ret = None
SockFile = None
request = urllib2.Request(url)
request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322)')
request.add_header('Pragma', 'no-cache')
opener = urllib2.build_opener()
SockFile = opener.open(request)
ret = SockFile.read()
finally:
if SockFile:
SockFile.close()
return ret
def parseHtml(html):
'''''
@summary: 抓取结构化数据
'''
content = None
pattern = '<title>([^<]*?)</title>'
temp = re.findall(pattern, html)
if temp:
content = temp[0]
return content
def saveData(data):
'''''
@summary: 数据存储
'''
f = open('test', 'wb')
f.write(data)
f.close()
if __name__ == '__main__':
url = 'http://www.baidu.com'
httpCrawler(url)
ckjar = cookielib.MozillaCookieJar()
cookies = urllib2.HTTPCookieProcessor(ckjar) #定义cookies对象
def httpRequest(url):
'''''
@summary: 网络请求
'''
try:
ret = None
SockFile = None
request = urllib2.Request(url)
request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322)')
request.add_header('Pragma', 'no-cache')
opener = urllib2.build_opener(cookies) #传递cookies对象
SockFile = opener.open(request)
ret = SockFile.read()
finally:
if SockFile:
SockFile.close()
return ret
content = content.decode('gbk', 'ignore') #将gbk编码转为unicode编码
content = content.encode('utf-8', 'ignore') #将unicode编码转为utf-8编码
机械节能产品生产企业官网模板...
大气智能家居家具装修装饰类企业通用网站模板...
礼品公司网站模板
宽屏简约大气婚纱摄影影楼模板...
蓝白WAP手机综合医院类整站源码(独立后台)...苏ICP备2024110244号-2 苏公网安备32050702011978号 增值电信业务经营许可证编号:苏B2-20251499 | Copyright 2018 - 2025 源码网商城 (www.ymwmall.com) 版权所有