百度图片爬虫-python版-阿里云开发者社区

# coding:utf-8
"""
Created on 2015-9-17

@author: huangxie
"""
12
import cookielib
import json
import math
import os
import re
import sys
import threading
import time
import urllib
import urllib2
import uuid
from Queue import Empty, Queue
from threading import Thread

import MySQLdb as mdb
from bs4 import BeautifulSoup
from MySQLdb.constants.REFRESH import STATUS

import imitate_browser
import utils
40
# Python 2 only: site.py removes sys.setdefaultencoding at start-up, so the
# module has to be reloaded to get it back.  Switching the default codec to
# UTF-8 lets the byte-string literals in this file (which contain Chinese
# text) mix with the unicode values returned by json / BeautifulSoup.
reload(sys)
sys.setdefaultencoding('utf-8')
44
45
46
# MySQL connection settings.
# NOTE(review): credentials are hard-coded; consider reading them from the
# environment or a config file before deploying anywhere shared.
DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

# Proxy mapping handed to urllib2.ProxyHandler.  It is rebound at runtime
# (see BaiduImage.get_pic) when a request fails and another proxy is available.
proxy = {u'http': u'222.39.64.13:8118'}

# Baidu image-search JSON endpoint: {word} = query, {pn} = result offset,
# {rn} = number of results per page.
TOP_URL = "http://image.baidu.com/i?tn=resultjsonavatarnew&ie=utf-8&word={word}&pn={pn}&rn={rn}"

# Baidu web-search page for a keyword; used to harvest related keywords.
KEYWORD_URL = "https://www.baidu.com/s?ie=utf-8&f=8&tn=baidu&wd={wd}"

# A fuller header set that was tried earlier, kept for reference:
#
# i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
#              'Accept': 'json;q=0.9,*/*;q=0.8',
#              'Accept-Charset': 'utf-8;q=0.7,*;q=0.3',
#              'Accept-Encoding': 'gzip',
#              'Connection': 'close',
#              'Referer': None  # if crawling still fails, set this to the host of the target site
#              }

# Request headers sent with every urllib2 request made by the crawler.
i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
80
81
82
def GetDateString():
    """Return today's local date as ``'<year>-<month>-<day>'``.

    Month and day are not zero-padded (e.g. ``'2015-9-17'``).  The result is
    used as the name of the per-day download folder.
    """
    today = time.localtime()
    return "%d-%d-%d" % (today.tm_year, today.tm_mon, today.tm_mday)
90
91
92
class BaiduImage(threading.Thread):
    """Multi-threaded Baidu image crawler.

    Pending search pages live in the MySQL table ``info``
    (``word, status, page_num, left_num, how_many``).  Worker threads started
    by :meth:`start_work` alternate between :meth:`get_pic` (fetch one queued
    result page and download its images) and :meth:`prepare_request` (move the
    next ``status=0`` row into the in-memory queue).  Downloaded images are
    written below ``E://sosogif`` and described in the table ``pic_info``.

    One DB connection/cursor is shared by all workers, so every cursor access
    is guarded by ``self.mutex``.
    """

    # Number of results requested per Baidu result page.
    PAGE_SIZE = 24

    def __init__(self):
        """Create the queues and counters and open the MySQL connection."""
        Thread.__init__(self)
        self.browser = imitate_browser.BrowserBase()
        self.chance = 0    # incremented when fetching a result page fails
        self.chance1 = 0   # incremented when downloading an image raises IOError
        self.request_queue = Queue()   # (id, word, page_num) tuples awaiting get_pic
        self.wait_ana_queue = Queue()
        self.count = 0     # images handled so far; spreads files over sub-folders
        # Re-entrant lock so that a thread already holding it may acquire it again.
        self.mutex = threading.RLock()
        self.commit_count = 0   # inserts into pic_info since the last commit
        self.ID = 500           # ID of the `proxy` table row to load next
        self.next_proxy_set = set()   # (protocol, "ip:port") candidates
        self.dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'sosogif', charset='utf8')
        self.dbconn.autocommit(False)
        self.dbcurr = self.dbconn.cursor()
        self.dbcurr.execute('SET NAMES utf8')

    def work(self, item):
        """Worker loop: fetch one queued page, then queue the next one, forever.

        :param item: index of the worker, only used for the start-up message.
        """
        print("start thread %s" % (item,))
        while True:
            self.get_pic()
            self.prepare_request()

    def format_keyword_url(self, keyword):
        """Return the UTF-8 encoded Baidu web-search URL for *keyword*."""
        return KEYWORD_URL.format(wd=keyword).encode('utf-8')

    def generateSeed(self, url):
        """Harvest related keywords from a Baidu search page into ``info``.

        Every link text found in the ``div#rs`` table that contains "动态图" or
        "gif" and is not yet present in ``info`` is inserted as a new, unvisited
        row.  Parsing and DB errors are reported and otherwise ignored
        (best effort).

        :param url: search-result page to open with ``self.browser``.
        """
        html = self.browser.openurl(url).read()
        if not html:
            return
        try:
            soup = BeautifulSoup(html)
            related = soup.find('div', id='rs')
            table = related.find('table') if related is not None else None
            if table is None:
                return
            for tr in table.find_all('tr'):   # every row of the related-search table
                for th in tr.find_all('th'):
                    links = th.find_all('a')
                    if not links:
                        continue
                    keyword = links[0].text.strip()
                    if "动态图" in keyword or "gif" in keyword:
                        print("keyword %s" % (keyword,))
                        # Parameters must be a real sequence: (keyword,) not (keyword).
                        self.dbcurr.execute('select id from info where word=%s', (keyword,))
                        if not self.dbcurr.fetchone():
                            self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,0,0,0,0)', (keyword,))
                self.dbconn.commit()
        except Exception as err:
            # Best effort: a malformed page must not stop the worker, but say so.
            print("generateSeed error %s" % (err,))

    @staticmethod
    def _page_count(how_many, per_page=24, fraction=100):
        """Return how many pages cover the first 1/*fraction* of *how_many* hits.

        Rounds up; returns 0 for a zero, negative or missing hit count.
        """
        if not how_many or how_many < 0:
            return 0
        denominator = per_page * fraction
        return (how_many + denominator - 1) // denominator

    def prepare_request(self):
        """Queue the next unvisited ``info`` row and expand brand-new keywords.

        The row is pushed onto ``request_queue``.  If it is a fresh keyword
        (``page_num``, ``left_num`` and ``how_many`` all 0) its related keywords
        are harvested, the total hit count is fetched, one ``info`` row per
        further result page is inserted and the row is marked visited.
        """
        self.lock()
        try:
            self.dbcurr.execute('select * from info where status=0')
            result = self.dbcurr.fetchone()
            if not result:
                return
            row_id, word, _status, page_num, left_num, how_many = result
            self.request_queue.put((row_id, word, page_num))
            if not (page_num == 0 and left_num == 0 and how_many == 0):
                return

            self.generateSeed(self.format_keyword_url(word))
            html = ""
            try:
                url = self.format_top_url(word, page_num, self.PAGE_SIZE)
                html = self.browser.openurl(url).read()
            except Exception as err:
                print("err %s" % (err,))
            if not html:
                return

            total = self.how_many(html)
            print("how_many %s" % (total,))
            if total is None:
                total = 0
            # Only the first 1/100 of the results is wanted.  The previous
            # formula (how_many/24*100) multiplied instead of dividing.
            num = self._page_count(total, self.PAGE_SIZE)
            for i in xrange(0, num - 1):
                self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,%s,%s,%s,%s)', (word, 0, i * self.PAGE_SIZE, num - i, total))
            # Mark the seed row as visited.
            self.dbcurr.execute('update info SET status=1 WHERE id=%s', (row_id,))
            self.dbconn.commit()
        finally:
            # Always release, otherwise one DB/network error blocks every worker.
            self.unlock()

    def start_work(self, req_max):
        """Start *req_max* daemon threads, each running :meth:`work`."""
        for item in xrange(req_max):
            t = threading.Thread(target=self.work, args=(item,))
            t.daemon = True
            t.start()

    def lock(self):
        """Acquire the shared re-entrant lock."""
        self.mutex.acquire()

    def unlock(self):
        """Release the shared re-entrant lock."""
        self.mutex.release()

    def get_para(self, url, key):
        """Return the value of query parameter *key* in *url*.

        :returns: the text after the first ``=`` of the first matching
            parameter, or ``None`` if the parameter is absent or has no ``=``.
        """
        query = url.split('?')[-1]
        for pair in query.split('&'):
            name, sep, value = pair.partition('=')
            if name == key:
                return value if sep else None
        return None

    def makeDateFolder(self, par, child):
        """Return ``par//<today>//child``, creating both levels if needed.

        If *par* is not an existing directory nothing is created and *par*
        itself is returned.
        """
        if not os.path.isdir(par):
            return par
        path = par + '//' + GetDateString()
        newFolderName = path + '//' + child
        for folder in (path, newFolderName):
            if not os.path.isdir(folder):
                os.mkdir(folder)
        return newFolderName

    def _fetch_and_store(self, n):
        """Download one image entry, save it to disk and record it in the DB.

        Must be called with the lock held (it uses the shared cursor and
        counters).  Entries whose ``objURL`` is already in ``pic_info`` are
        skipped.  Network errors propagate as IOError to the caller; errors
        while writing the file are reported and swallowed.
        """
        obj_url = n['objURL']
        self.dbcurr.execute('select ID from pic_info where objURL=%s', (obj_url,))
        if self.dbcurr.fetchone():
            print("database exist")
            return

        real_extension = utils.get_extension(obj_url)
        req = urllib2.Request(obj_url, headers=i_headers)
        resp = urllib2.urlopen(req, None, 5)
        try:
            dataimg = resp.read()
        finally:
            resp.close()

        name = str(uuid.uuid1())
        # Anything longer than ".xxx" is not a plausible extension: assume gif.
        if len(real_extension) > 4:
            real_extension = ".gif"
        real_extension = real_extension.lower()
        # gifs are spread over 60 "d*" folders, everything else over 20 "o*" folders.
        if real_extension == ".gif":
            bucket = "d" + str(self.count % 60)
        else:
            bucket = "o" + str(self.count % 20)
        filename = self.makeDateFolder("E://sosogif", bucket) + "//" + name + "-www.sosogif.com-搜搜gif贡献" + real_extension
        self.count += 1

        try:
            if os.path.exists(filename):
                print("file exist")
                return
            with open(filename, 'w+b') as file_object:
                file_object.write(dataimg)
            self.anaylis_info(n, filename, real_extension)   # store metadata
        except IOError as e1:
            print("e1= %s" % (e1,))

    def parse_json(self, data):
        """Download every image listed in a Baidu JSON result page.

        :param data: JSON text whose ``imgs`` member is a list of image
            entries (dicts with at least ``objURL``).
        :raises ValueError: if *data* is not valid JSON.

        An IOError while downloading an image increments ``self.chance1`` and
        the loop continues; any other error ends the page and is printed.
        """
        ipdata = json.loads(data)
        try:
            images = ipdata['imgs']
            if not images:
                return
            # The proxy is the same for the whole page: build the opener once.
            urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(proxy)))
            for n in images:   # one entry per image
                if not n.get('objURL'):
                    continue
                try:
                    # NOTE(review): the download itself runs under the lock, as
                    # before, which serialises the workers; narrowing the lock
                    # to the DB/file section would need a de-duplication rethink.
                    self.lock()
                    try:
                        self._fetch_and_store(n)
                    finally:
                        # Released on every path; previously a failed download
                        # left the lock held and starved all other threads.
                        self.unlock()
                except IOError:
                    self.chance1 += 1
        except Exception as parse_error:
            print("parse_error %s" % (parse_error,))

    def title_dealwith(self, title):
        """Return *title* with ``<strong>``/``</strong>`` markers removed and stripped."""
        return title.replace("<strong>", "").replace("</strong>", "").strip()

    def anaylis_info(self, n, filename, real_extension):
        """Insert the metadata of a saved image into ``pic_info``.

        Nothing is inserted if a row with the same ``cs`` value already exists.
        The transaction is committed after every 10 inserts.

        :param n: image entry from the Baidu JSON (``objURL``, ``fromURLHost``,
            ``width``, ``height``, ``di``, ``type``, ``fromPageTitle``, ``cs``,
            ``os``).
        :param filename: path the image was written to.
        :param real_extension: lower-cased file extension that was used.
        """
        print("success.")
        objURL = n['objURL']              # image address
        fromURLHost = n['fromURLHost']    # site the image came from
        width = n['width']
        height = n['height']
        di = n['di']                      # used as a unique identifier
        img_type = n['type']              # format reported by Baidu
        keyword = self.title_dealwith(n['fromPageTitle'])   # title of the source page
        cs = n['cs']                      # meaning unknown
        os_field = n['os']                # meaning unknown
        acTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())   # crawl time

        self.dbcurr.execute('select ID from pic_info where cs=%s', (cs,))
        if self.dbcurr.fetchone():
            return
        print("add pic %s" % (filename,))
        self.commit_count += 1
        self.dbcurr.execute('INSERT INTO pic_info(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', (objURL, fromURLHost, width, height, di, img_type, keyword, cs, os_field, acTime, filename, real_extension))
        if self.commit_count >= 10:
            self.dbconn.commit()
            self.commit_count = 0

    def format_top_url(self, word, pn, rn):
        """Return the UTF-8 encoded image-search URL for *word* at offset *pn* with *rn* results."""
        return TOP_URL.format(word=word, pn=pn, rn=rn).encode('utf-8')

    def how_many(self, data):
        """Return the ``displayNum`` hit count from a Baidu JSON response.

        :returns: the count as ``int`` (0 if it is not positive), or ``None``
            if *data* cannot be parsed or has no usable ``displayNum``.
        """
        try:
            count = int(json.loads(data)['displayNum'])
        except (ValueError, KeyError, TypeError):
            return None
        return count if count > 0 else 0

    def _queue_next_proxy(self):
        """Load the ``proxy`` row with the current ID as a candidate and advance.

        Must be called with the lock held.  Every 100th ID the table size is
        checked and the ID wraps back to 50 once it is past the end.  Both
        failure counters are reset.
        """
        if self.ID % 100 == 0:
            self.dbcurr.execute("select count(*) from proxy")
            count = self.dbcurr.fetchone()[0]
            if self.ID > count:
                self.ID = 50
        self.dbcurr.execute("select * from proxy where ID=%s", (self.ID,))
        for r in self.dbcurr.fetchall():
            # r[1] = protocol, r[2] = ip, r[3] = port; %s also copes with a numeric port.
            self.next_proxy_set.add((r[1], "%s:%s" % (r[2], r[3])))
        self.chance = 0
        self.chance1 = 0
        self.ID += 1

    def get_pic(self):
        """Fetch one queued result page and download the images it lists.

        Takes one ``(id, word, page_num)`` job from ``request_queue`` (returns
        at once if the queue is empty), marks the row visited, and requests the
        page through the current global ``proxy``.  If the request or parsing
        fails, ``self.chance`` is incremented and, when a candidate is
        available, the global ``proxy`` is switched.
        """
        global proxy
        try:
            print("size of queue %s" % (self.request_queue.qsize(),))
            try:
                # Non-blocking: checking qsize() and then calling get() can block
                # if another worker takes the item in between.
                row_id, word, page_num = self.request_queue.get_nowait()
            except Empty:
                return
            u = self.format_top_url(word, page_num, self.PAGE_SIZE)

            self.lock()
            try:
                self.dbcurr.execute('update info SET status=1 WHERE id=%s', (row_id,))
                self.dbconn.commit()
                # Any recorded failure triggers loading another proxy candidate.
                if self.chance > 0 or self.chance1 > 1:
                    self._queue_next_proxy()
            finally:
                self.unlock()

            urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(proxy)))
            try:
                req = urllib2.Request(u, headers=i_headers)
                response = urllib2.urlopen(req, None, 5)
                html = response.read()
                if html:
                    self.parse_json(html)
            except Exception:
                # Deliberately quiet (with many workers this is very noisy);
                # the failure is counted and may trigger a proxy switch.
                self.chance += 1
                if self.chance > 0 or self.chance1 > 1:
                    if len(self.next_proxy_set) > 0:
                        protocol, socket = self.next_proxy_set.pop()
                        proxy = {protocol: socket}
                        print("change proxy finished<< %s %s" % (proxy, self.ID))
        except Exception as e:
            print("error1 %s" % (e,))
748
749
750
if __name__ == '__main__':
    app = BaiduImage()
    app.start_work(80)
    # The workers are daemon threads, so the main thread has to stay alive.
    # Sleep instead of spinning on `pass`, which pinned one CPU core at 100%.
    while True:
        time.sleep(1)

若转载请注明出处！若有疑问，请回复交流！

百度图片爬虫-python版

热门文章

最新文章

相关课程

相关电子书

相关实验场景