源码网商城,靠谱的源码在线交易网站 我的订单 购物车 帮助

源码网商城

python实现爬虫下载漫画示例

  • 时间:2022-08-05 00:50 编辑: 来源: 阅读:
  • 扫一扫,手机访问
摘要:python实现爬虫下载漫画示例
复制代码 代码如下:
#!/usr/bin/python3.2
"""Download the chapters of a comic from manhua.ali213.net.

Usage:
    downloadmanhua weburl floder [chapterbegin=0] [threadnum=6]

``weburl`` is the comic's index page, ``floder`` the target folder prefix,
``chapterbegin`` the index of the first chapter to fetch and ``threadnum`` the
maximum number of concurrent picture downloads.
"""
import os
import re
import socket
import sys
import threading
import time
import urllib
import urllib.request

# Run-time configuration.  The defaults are overwritten by main() from the
# command line; they live at module level so the worker functions (and any
# importer) can read them.
weburl = ''
floder = ''
chapterbegin = 0
currentthreadnum = 0          # number of picture downloads currently running
threadcount = 6               # maximum number of concurrent downloads
manhuaweb = r'http://manhua.ali213.net'
mutex = threading.Lock()      # guards currentthreadnum
mutex2 = threading.Lock()     # serialises creation of the target folder

# Characters that urlparse() treats as single-character tokens.
_TOKEN_CHARS = frozenset('0123456789abcdef')
# Everything from the "}(" that starts the packed script's argument list.
_PACKED_ARGS_RE = re.compile(r"(?=\}\().*", re.IGNORECASE)
# Trailing run of [0-9a-z.] characters of a URL (the picture's file name).
_URL_TAIL_RE = re.compile(r'[0-9a-z.]*\Z')


def jin(i, jinzhi):
    """Return the non-negative integer *i* written in base *jinzhi*.

    Digits above 9 are rendered as lowercase letters starting at 'a'.

    Raises:
        ValueError: if *i* is negative.
    """
    if i < 0:
        raise ValueError('jin() needs a non-negative integer, got %r' % (i,))
    digits = []
    while True:
        # Integer division: float division loses precision for large values.
        i, rem = divmod(i, jinzhi)
        if rem > 9:
            digits.append(chr(ord('a') + (rem - 10)))
        else:
            digits.append(str(rem))
        if i == 0:
            break
    return ''.join(reversed(digits))


def urlparse(p, a, c, k):
    """Expand the single-character tokens of the packed string *p*.

    Token ``jin(n, 36)`` (for ``n < c``) is replaced by ``k[n]``, or kept
    as-is when ``k[n]`` is empty.  *a* is accepted for signature
    compatibility and is not used.

    NOTE(review): this looks like a simplified port of the JavaScript
    ``function(p,a,c,k,e,d)`` unpacker.  Only the characters 0-9 and a-f are
    substituted, one character at a time, so multi-character tokens are not
    handled -- confirm against real payloads before relying on it for
    ``c > 16``.
    """
    table = {}
    for index in range(c - 1, -1, -1):
        token = jin(index, 36)
        table[token] = k[index] or token
    pieces = []
    for ch in p:
        if ch in _TOKEN_CHARS:
            # Characters without a table entry pass through unchanged
            # instead of raising KeyError.
            pieces.append(table.get(ch, ch))
        else:
            pieces.append(ch)
    return ''.join(pieces)


def meispower(s):
    """Extract the picture path prefix from the packed ``eval(...)`` script *s*.

    Raises:
        IndexError: if *s* has no "}(" argument list or the unpacked text
            has no ``imgpath=`` assignment.
    """
    packed = _PACKED_ARGS_RE.findall(s)[0]
    # Drop the trailing "'.split('|'),0,{}))" (19 characters).
    packed = packed[:-19]
    par = packed.split(',')
    # par[3] still carries the opening quote of the word list.
    words = par[3][1:].split('|')
    chapterpath = urlparse(par[0], int(par[1]), int(par[2]), words)
    allurl = re.findall('imgpath=[^;]*', chapterpath)[0]
    # Strip the leading "imgpath=" plus two quoting characters, and the two
    # quoting characters at the end.
    return allurl[10:-2]


def pictofile(weburl, filename, loop=100):
    """Download *weburl* into *filename*, retrying up to *loop* more times.

    Nothing is downloaded when *filename* already exists.  Responses shorter
    than 2048 bytes are treated as failed downloads and retried.  A message
    is printed when every attempt has failed (or *loop* is negative).
    """
    # loop + 1 attempts in total, as an iteration instead of recursion.
    for _ in range(loop + 1):
        if os.path.exists(filename):
            return
        try:
            url = urllib.request.urlopen(weburl)
            try:
                data = url.read()
            finally:
                # Close the response even when read() raises.
                url.close()
            if len(data) < 2048:
                continue
            print('download from %s name is %s\n' % (weburl, filename))
            with open(filename, 'wb') as myfile:
                myfile.write(data)
            return
        except socket.timeout:
            print('timeout')
        except Exception as e:
            print('error', e)
    print("can't download the picture %s" % weburl)


def downloadpic(url, loadpicdir, num):
    """Worker: download the picture at *url* into the folder *loadpicdir*.

    The file is named ``<num>-manhua<tail of url>``.  The folder is created
    when missing.  The slot taken in ``currentthreadnum`` by the caller is
    always released, also on failure, so the scheduler cannot stall.
    """
    global currentthreadnum
    try:
        try:
            # No os.chdir(): the working directory is process-wide and would
            # break relative paths used by the other worker threads.
            with mutex2:
                os.makedirs(loadpicdir, exist_ok=True)
        except OSError:
            print("can't create floder %s" % loadpicdir)
            return
        name = _URL_TAIL_RE.findall(url)
        filename = 'manhua' + name[0]
        pictofile(url, os.path.join(loadpicdir, str(num) + '-' + filename))
    finally:
        with mutex:
            currentthreadnum = currentthreadnum - 1


def downloadchapter(url, loadpicdir, num, begin=0):
    """Download pictures *begin* .. *num* - 1 of the chapter page *url*.

    *url* is relative to ``manhuaweb``.  Pictures are stored in the folder
    ``loadpicdir + <chapter title>``; at most ``threadcount`` downloads run
    at the same time.
    """
    global currentthreadnum
    chapterpage = manhuaweb + url
    print(chapterpage)
    response = urllib.request.urlopen(chapterpage)
    try:
        webdata = response.read().decode('UTF-8')
    finally:
        response.close()
    # Title text up to the first underscore, without the "<title>" tag.
    chaptername = re.findall(r'<title>[^_]*', webdata)[0][7:]
    webscrip = re.findall(r'eval.*[^<>]', webdata)
    chapterurl = 'http://mhimg.ali213.net' + meispower(webscrip[0])
    # NOTE(review): plain concatenation, so *loadpicdir* is expected to end
    # with a path separator -- confirm with the callers.
    targetdir = loadpicdir + chaptername
    for i in range(begin, num):
        # Wait for a free download slot.
        while currentthreadnum >= threadcount:
            time.sleep(0.5)
        with mutex:
            currentthreadnum = currentthreadnum + 1
        try:
            threading.Thread(
                target=downloadpic,
                args=(r'%s%d.jpg' % (chapterurl, i), targetdir, num),
            ).start()
        except socket.error:
            # The worker never started: give the slot back and go on with
            # the next picture.
            with mutex:
                currentthreadnum = currentthreadnum - 1
        except Exception as error:
            with mutex:
                currentthreadnum = currentthreadnum - 1
            print(error, 'break')
            print('download chapter %d of picture make a error' % i)
            break


def main(argv=None):
    """Parse the command line and download every requested chapter."""
    global weburl, floder, chapterbegin, threadcount
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6")
        sys.exit(0)
    weburl = argv[1]
    floder = argv[2]
    if len(argv) >= 4:
        chapterbegin = int(argv[3])
    if len(argv) >= 5:
        threadcount = int(argv[4])

    socket.setdefaulttimeout(60.0)
    webfile = urllib.request.urlopen(weburl)
    try:
        webdata = webfile.read().decode('UTF-8')
    finally:
        webfile.close()
    meshmode = re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
    meshdata = meshmode.findall(webdata)[0]
    indexdata = re.findall(r'([0-9]*页)', meshdata)      # page counts
    picurldata = re.findall(r'/comic/[0-9/]*.html', meshdata)  # chapter links
    chapterlength = len(picurldata)
    nummode = re.compile(r'[\d]+')
    for i in range(chapterbegin, chapterlength):
        # Chapter i is taken from the end of the list (reverse page order).
        position = chapterlength - i - 1
        pagecount = int(nummode.findall(indexdata[position])[0])
        downloadchapter(picurldata[position], floder, pagecount)


if __name__ == '__main__':
    main()
  • 全部评论(0)
联系客服
客服电话:
400-000-3129
微信版

扫一扫进微信版
返回顶部