123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139 |
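- '''
- Created on 2016-03-09
- This is the approach based on a direct database connection.
- All image file names use .jpg with content_type="image/jpeg",
- because in binary mode png and jpg can be switched freely.
- @author: ChenHao
- '''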
- from pymongo.mongo_client import MongoClient
- from proxy import ip_pool
- from util_common import html_downloader, Constant
- import threading
- import uuid
- import gridfs
class ImgFileMain(object):
    """Download component images and store them in MongoDB GridFS.

    Tasks are documents of ``spider.component_original`` whose ``imgTask``
    field equals ``Constant.TODO``.  Every task is fetched by a
    ``CrawerThread``; the downloaded payload is written to GridFS on the
    ``pictures`` database under a generated ``<uuid>.jpg`` file name with
    content type ``image/jpeg``.

    The download callbacks run on the worker threads, so the counters are
    only modified while holding ``self._lock``.

    NOTE(review): nothing in this class updates ``imgTask`` or stores the
    generated image URL on the task document, so a finished task still
    matches the TODO query -- confirm where a task is meant to be marked as
    done.
    """

    # Payloads whose length is below this value are treated as failed
    # downloads (the original code hard-coded 5000).
    MIN_IMAGE_BYTES = 5000

    def __init__(self, userName=None, maxThread=100):
        """Connect to MongoDB and reset the counters.

        :param userName: value looked up (or inserted) in ``spider.user``.
        :param maxThread: maximum number of concurrently running downloads.
        """
        # Keep the client so that close() can release the connection.
        self._client = MongoClient(Constant.MONGODB_URL)
        self.db = self._client.spider
        self.fs = gridfs.GridFS(self._client.pictures)
        self.user = self._get_user(userName)
        self.pool = ip_pool.Pool()
        # Guards activeThread / successed / failured / total.
        self._lock = threading.Lock()
        self.activeThread = 0
        self.maxThread = maxThread
        self.successed = 0
        self.failured = 0
        self.total = 0

        # Set by hasNext() when exactly one TODO task is left.
        self.isLast = False

    def _get_user(self, userName):
        """Return the user document named ``userName``, creating it if missing."""
        rs_user = self.db.user.find_one({"name": userName})
        if rs_user is None:
            print(userName, ":is new for this task, Welcome!")
            rs_user = {"name": userName, "starttime": 0}
            # insert() returns only the new _id and adds it to the dict in
            # place; return the dict so both branches yield a document.
            self.db.user.insert(rs_user)
        return rs_user

    def _get_task(self, size=1):
        """Return a list of at most ``size`` TODO task documents.

        An empty list is returned when ``size`` is not positive, when no
        task is found, or when the query fails.
        """
        if size <= 0:
            # limit(0) would mean "no limit" to MongoDB.
            return []
        query = {"imgTask": Constant.TODO}
        try:
            if self.isLast:
                last = self.db.component_original.find_one(
                    query, {"_id": True, "img_url_mouser": True})
                # find_one() yields None when nothing matches.
                return [] if last is None else [last]
            # Materialise the cursor here so that query errors are caught
            # by this handler instead of surfacing while iterating.
            return list(self.db.component_original.find(query).limit(size))
        except Exception as e:
            print("fetching tasks failed: %s" % (e,))
            return []

    def _get_imgName_and_img_url_uu(self):
        """Return ``(imgName, img_url_uu)`` built from a fresh ``uuid1``."""
        imgName = uuid.uuid1().hex + ".jpg"
        return imgName, Constant.IMG_URL_HEADER + imgName

    def hasNext(self):
        """Return True while TODO tasks remain; also refreshes ``isLast``.

        When the count query fails, True is returned so the caller keeps
        polling.
        """
        try:
            count = self.db.component_original.find(
                {"imgTask": Constant.TODO}).count()
        except Exception as e:
            print("counting tasks failed: %s" % (e,))
            return True
        self.isLast = count == 1
        return count > 0

    def _save_one_result(self, _id, cont_file):
        """Store ``cont_file`` in GridFS; return True on success, else False.

        ``_id`` is accepted for interface compatibility but is not used.
        """
        try:
            imgName, img_url_uu = self._get_imgName_and_img_url_uu()
            print(imgName, img_url_uu)
            self.fs.put(cont_file, content_type="image/jpeg", filename=imgName)
        except Exception as e:
            # Best effort: report the problem instead of hiding it.
            print("saving image failed: %s" % (e,))
            return False
        return True

    def _get_one_proxy(self):
        """Return one proxy from the pool."""
        return self.pool.get()

    def _remove_one_proxy(self, proxy):
        """Remove ``proxy`` from the pool; ``None`` (no proxy used) is ignored."""
        if proxy is not None:
            self.pool.remove(proxy)

    def _on_download_success(self, _id, cont_file, proxy):
        """Worker callback: the download of task ``_id`` returned ``cont_file``."""
        with self._lock:
            self.activeThread -= 1

        saved = False
        if cont_file is None or len(cont_file) < self.MIN_IMAGE_BYTES:
            self._remove_one_proxy(proxy)
        else:
            saved = self._save_one_result(_id, cont_file)

        # Count a success only when the payload was actually stored.
        with self._lock:
            if saved:
                self.successed += 1
            else:
                self.failured += 1

    def _on_download_error(self, e, url, proxy):
        """Worker callback: the download of ``url`` raised ``e``."""
        with self._lock:
            self.activeThread -= 1
            self.failured += 1
        self._remove_one_proxy(proxy)

    def craw(self):
        """Start one download thread per free slot, up to ``maxThread``."""
        with self._lock:
            free_slots = self.maxThread - self.activeThread
        if free_slots <= 0:
            return
        for task in self._get_task(free_slots):
            # No proxy is assigned here: the worker receives None.
            crawer = CrawerThread(task["_id"], task["img_url_mouser"], None,
                                  self._on_download_success,
                                  self._on_download_error)
            # Count the worker before it starts, so a fast worker cannot
            # decrement the counter before it was incremented.
            with self._lock:
                self.activeThread += 1
                self.total += 1
            try:
                crawer.start()
            except RuntimeError:
                # The thread never ran; undo the bookkeeping.
                with self._lock:
                    self.activeThread -= 1
                    self.total -= 1
                raise

    def statistic(self):
        """Return ``(successed, failured, activeThread, total)``."""
        with self._lock:
            return self.successed, self.failured, self.activeThread, self.total

    def close(self):
        """Close the MongoDB client created by ``__init__``."""
        self._client.close()
class CrawerThread(threading.Thread):
    """Thread that downloads one image and reports the outcome via callbacks.

    ``success(_id, cont_file, proxy)`` is called when the download returns;
    ``error(exception, img_url_mouser, proxy)`` is called when the download
    raises.  Either callback may be ``None``, in which case it is skipped.
    """

    def __init__(self, _id, img_url_mouser, proxy, success, error):
        """Remember the task and the callbacks; the thread is not started here."""
        threading.Thread.__init__(self)
        self.downloader = html_downloader.HtmlDownloader()
        self._id = _id
        self.img_url_mouser = img_url_mouser
        self.proxy = proxy
        self.success = success
        self.error = error

    def run(self):
        """Download the image and invoke exactly one of the callbacks."""
        try:
            cont_file = self.downloader.download_file(self.img_url_mouser, self.proxy)
        except Exception as e:
            if self.error is not None:
                self.error(e, self.img_url_mouser, self.proxy)
            return
        # Called outside the try block: an exception raised by the success
        # callback must not be reported as a download error as well.
        if self.success is not None:
            self.success(self._id, cont_file, self.proxy)
if __name__ == '__main__':
    # Script entry point: keep starting downloads while TODO tasks remain.
    task = ImgFileMain(maxThread=1)
    try:
        # NOTE(review): this loop polls hasNext() without pausing between
        # iterations -- confirm whether a short delay is wanted here.
        while task.hasNext():
            task.craw()

        # Let downloads that are still running finish before cleaning up.
        for worker in threading.enumerate():
            if isinstance(worker, CrawerThread):
                worker.join()
    finally:
        # ImgFileMain may not provide close(); call it only when present so
        # the script does not end with an AttributeError.
        close = getattr(task, "close", None)
        if callable(close):
            close()
|