源码网商城,靠谱的源码在线交易网站 我的订单 购物车 帮助

源码网商城

Python读取HTML中指定元素生成Excel文件示例

  • 时间:2021-06-06 03:15 编辑: 来源: 阅读:
  • 扫一扫,手机访问
摘要:Python读取HTML中指定元素生成Excel文件示例
用Python 2.7编写的脚本:读取HTML中的指定元素,并生成Excel文件
复制代码 代码如下:
#coding=gbk import string import codecs import os,time import xlwt import xlrd from bs4 import BeautifulSoup from xlrd import open_workbook class LogMsg:         def __init__(self,logfile,Level=0):                 try:                         import logging                         #self.logger = None                         self.logger = logging.getLogger()                         self.hdlr = logging.FileHandler(logfile)                         formatter = logging.Formatter("[%(asctime)s]: %(message)s","%Y%m%d %H:%M:%S")                         self.hdlr.setFormatter(formatter)                         self.logger.addHandler(self.hdlr)                         #logger.setLevel()                         if Level == 10:                                 self.logger.setLevel(logging.DEBUG)                         elif Level == 20:                                 self.logger.setLevel(logging.INFO)                         elif Level == 30:                                 self.logger.setLevel(logging.WARNING)                         elif Level == 40:                                 self.logger.setLevel(logging.ERROR)                         elif Level == 50:                                 self.logger.setLevel(logging.CRITICAL)                         else:                                 self.logger.setLevel(logging.NOTSET)                 except:                         print "log init error!"                         
exit(1)         def output(self,logInfo):                 Level = self.logger.getEffectiveLevel()                 try:                         if Level == 10:                                 self.logger.debug(logInfo)                         elif Level == 20:                                 self.logger.info(logInfo)                         elif Level == 30:                                 self.logger.warning(logInfo)                         elif Level == 40:                                 self.logger.error(logInfo)                         elif Level == 50:                                 self.logger.critical(logInfo)                         else:                                 self.logger.info(logInfo)                 except:                         print "log output error!"                         exit(1)         def close(self):                 try:                 #logging.shutdown([self.hdlr])                         self.logger.removeHandler(self.hdlr)                 except:                         print "log closed error!"                         
exit(1) Logtime = time.strftime("%Y%m%d%H%M%S",time.localtime()) logFileTime = time.strftime("%Y%m%d",time.localtime()) Logfile = '/data/pyExample/logs/htmlparser_%s.log' % logFileTime log = LogMsg(Logfile,20) DATAPATH = '/data/pyExample/' XLSname = 'dangjian_'+Logtime+'.xls' if __name__ == '__main__':         wbk = xlwt.Workbook(encoding = 'gbk')     sheet = wbk.add_sheet('基本内容导入模板')     sheet.write(0,0,'内容类型 ')     sheet.write(0,1,'栏目名称')     sheet.write(0,2,'栏目编号')     sheet.write(0,3,'内容名称')     sheet.write(0,4,'时长')     sheet.write(0,5,'关键字')     sheet.write(0,6,'看点')     sheet.write(0,7,'作者')     sheet.write(0,8,'来源')     sheet.write(0,9,'子内容1')     sheet.write(0,10,'子内容2')     xlsContent = []       files = os.listdir(DATAPATH)     k = 0     for f in files:          if os.path.splitext(f)[1] == '.html':             content=[]             log.output('当前文件:'+f)             htmlFile =codecs.open(DATAPATH+f,'r','gbk')             lines = htmlFile.readlines()             if not lines:                 log.output ('not line')             for line in lines:                 if line.strip()=='\n':                     log.output('该处是空行')                 else:                     line = line.replace(' ','')                     soup  = BeautifulSoup(line)                     for tdd in soup.findAll('td'):                          #print tdd.text.encode("gbk")                         content.append(tdd.text.encode("gbk"))                       #print line.encode('gbk')             htmlFile.close()                for i in content:                 print content.index(i),',',i                 log.output(i)                 log.output(content.index(i))             print '----------------------------------------'                         folderName =  content[6]             contentName=  content[4]                   duration =    filter(str.isdigit, content[16])             int_duration = string.atoi(duration)*60             str_duration = "%i"%int_duration             keyWord =  
   content[6]             desciption =  content[36]             videoName_1 = content[10]             print folderName             print contentName             print str_duration             print keyWord             print desciption             print videoName_1             log.output('输出xls数据:'+','+folderName+',,'+contentName+','+str_duration+','+keyWord+','+desciption+',管理员,华数编辑,'+videoName_1+',,')             print k                        sheet.write(k+1,0,'')             sheet.write(k+1,1,folderName)             sheet.write(k+1,2,'')             sheet.write(k+1,3,contentName)             sheet.write(k+1,4,str_duration)             sheet.write(k+1,5,keyWord)             sheet.write(k+1,6,desciption)             sheet.write(k+1,7,'管理员')             sheet.write(k+1,8,'华数编辑')             sheet.write(k+1,9,videoName_1)             sheet.write(k+1,10,'')             k+=1     wbk.save(DATAPATH + XLSname)            print '=========================================' 
  • 全部评论(0)
联系客服
客服电话:
400-000-3129
微信版

扫一扫进微信版
返回顶部