# coding=utf-8
'''
Created on 2016-03-09

Image crawler that talks to the database directly.

Every image file name uses the ".jpg" extension together with
content_type="image/jpeg", because the payload is stored as raw bytes and
png/jpg are interchangeable at that level.

@author: ChenHao
'''
from pymongo.mongo_client import MongoClient
from proxy import ip_pool
from util_common import html_downloader, Constant
import threading
import uuid
import gridfs


class ImgFileMain():
    """Download pending component images and store them in GridFS.

    Tasks are documents of ``spider.component_original`` whose ``imgTask``
    field equals ``Constant.TODO``.  Each task is handed to a
    ``CrawerThread``; the thread reports back through
    ``_on_download_success`` / ``_on_download_error``.
    """

    def __init__(self, userName=None, maxThread=100):
        """Connect to MongoDB and initialise the bookkeeping counters.

        userName  -- name looked up (or created) in the ``user`` collection.
        maxThread -- upper bound on concurrently running download threads.
        """
        cli = MongoClient(Constant.MONGODB_URL)
        # Keep the client so that close() can release the connection.
        self._client = cli
        self.db = cli.spider
        self.fs = gridfs.GridFS(cli.pictures)
        self.user = self._get_user(userName)
        self.pool = ip_pool.Pool()
        # The counters below are updated from worker threads, so every
        # read-modify-write goes through this lock.
        self._lock = threading.Lock()
        # Worker threads that may still be running; joined by close().
        self._threads = []
        self.activeThread = 0
        self.maxThread = maxThread
        self.successed = 0
        self.failured = 0
        self.total = 0
        # True when hasNext() saw exactly one remaining task.
        self.isLast = False

    def _get_user(self, userName):
        """Return the user document for ``userName``, creating it if absent."""
        rs_user = self.db.user.find_one({"name": userName})
        if rs_user is None:
            # First time this user takes part in the task.
            print(userName, ":is new for this task, Welcome!")
            # Return the document itself: the insert call only yields an
            # id/result object, while callers index the user by "name".
            rs_user = {"name": userName, "starttime": 0}
            self.db.user.insert_one(rs_user)
        return rs_user

    # Fetch up to N tasks.
    def _get_task(self, size=1):
        """Return a list of at most ``size`` pending task documents.

        Returns an empty list when nothing is pending or the query fails.
        """
        query = {"imgTask": Constant.TODO}
        try:
            if self.isLast:
                task = self.db.component_original.find_one(
                    query, {"_id": True, "img_url_mouser": True})
                # find_one() yields None when the last task has vanished.
                return [] if task is None else [task]
            # Materialise the cursor here so that query errors are raised
            # inside this try block instead of in the caller's loop.
            return list(self.db.component_original.find(query).limit(size))
        except Exception as e:
            print("failed to fetch tasks: %s" % (e,))
            return []

    # Generate the image file name.
    def _get_imgName_and_img_url_uu(self):
        """Return ``(imgName, img_url_uu)`` built from a fresh uuid1."""
        uuid_str = str(uuid.uuid1())
        imgName = uuid_str.replace("-", "") + ".jpg"
        img_url_uu = Constant.IMG_URL_HEADER + imgName
        return imgName, img_url_uu

    def hasNext(self):
        """Return True while at least one task is pending.

        Also refreshes ``isLast``.  A failing query is reported and treated
        as "keep going" so a transient database error does not end the run.
        """
        try:
            # NOTE(review): Cursor.count() is the PyMongo 3.x API; it would
            # need count_documents() on newer drivers.
            count = self.db.component_original.find({"imgTask": Constant.TODO}).count()
            self.isLast = count == 1
            return count > 0
        except Exception as e:
            print("failed to count pending tasks: %s" % (e,))
            return True

    def _save_one_result(self, _id, cont_file):
        """Store ``cont_file`` in GridFS under a generated name.

        Returns True when the file was stored, False otherwise.
        """
        try:
            imgName, img_url_uu = self._get_imgName_and_img_url_uu()
            print(imgName, img_url_uu)
            self.fs.put(cont_file, content_type="image/jpeg", filename=imgName)
            # NOTE(review): the statement that marks the task as done is
            # disabled, so the task stays TODO and hasNext() never turns
            # False.  Confirm whether it should be re-enabled:
            # self.db.component_original.update_one({'_id': _id}, {'$set': {'imgTask': Constant.DONE, 'img_url_uu': img_url_uu, 'img_download_user': self.user["name"]}})
            return True
        except Exception as e:
            print("failed to save image for %s: %s" % (_id, e))
            return False

    def _get_one_proxy(self):
        """Return one proxy from the pool."""
        return self.pool.get()

    def _remove_one_proxy(self, proxy):
        """Drop ``proxy`` from the pool; no-op for a direct connection."""
        # craw() starts its threads with proxy=None, so there is nothing
        # to remove in that case.
        if proxy is not None:
            self.pool.remove(proxy)

    def _on_download_success(self, _id, cont_file, proxy):
        """Callback run by a worker thread after a download returned."""
        saved = False
        try:
            # A payload below 5000 bytes (or no payload at all) is treated
            # as an invalid download.
            if cont_file is not None and len(cont_file) >= 5000:
                saved = self._save_one_result(_id, cont_file)
            else:
                self._remove_one_proxy(proxy)
        finally:
            # Always release the slot, even if saving/removing raised.
            with self._lock:
                self.activeThread -= 1
                if saved:
                    self.successed += 1
                else:
                    self.failured += 1

    def _on_download_error(self, e, url, proxy):
        """Callback run by a worker thread when the download raised."""
        try:
            self._remove_one_proxy(proxy)
        finally:
            with self._lock:
                self.activeThread -= 1
                self.failured += 1
        # print("failed! proxy ", proxy, ", url: ", url, e)

    def craw(self):
        """Start one download thread per free slot, if any task is pending."""
        # Forget workers that have already finished.
        self._threads = [t for t in self._threads if t.is_alive()]
        with self._lock:
            free_slots = self.maxThread - self.activeThread
        if free_slots <= 0:
            return
        for task in self._get_task(free_slots):
            crawer = CrawerThread(task["_id"], task["img_url_mouser"], None,
                                  self._on_download_success,
                                  self._on_download_error)
            # Count the thread before it starts; otherwise a fast worker
            # could decrement activeThread before it was incremented.
            with self._lock:
                self.activeThread += 1
                self.total += 1
            self._threads.append(crawer)
            crawer.start()

    def statistic(self):
        """Return ``(successed, failured, activeThread, total)``."""
        with self._lock:
            return self.successed, self.failured, self.activeThread, self.total

    def close(self):
        """Wait for running workers, then close the MongoDB connection."""
        for worker in self._threads:
            worker.join()
        self._threads = []
        self._client.close()


class CrawerThread(threading.Thread):
    """Thread that downloads one image and reports through callbacks."""

    def __init__(self, _id, img_url_mouser, proxy, success, error):
        """Remember the task and the callbacks.

        success -- called as ``success(_id, cont_file, proxy)`` or None.
        error   -- called as ``error(exception, url, proxy)`` or None.
        """
        threading.Thread.__init__(self)
        self.downloader = html_downloader.HtmlDownloader()
        self._id = _id
        self.img_url_mouser = img_url_mouser
        self.proxy = proxy
        self.success = success
        self.error = error

    def run(self):
        """Download the image and invoke exactly one of the callbacks."""
        try:
            cont_file = self.downloader.download_file(self.img_url_mouser, self.proxy)
        except Exception as e:
            if self.error is not None:
                self.error(e, self.img_url_mouser, self.proxy)
        else:
            # Kept outside the try block: an exception raised by the
            # success callback must not trigger the error callback too.
            if self.success is not None:
                self.success(self._id, cont_file, self.proxy)


if __name__ == '__main__':
    task = ImgFileMain(maxThread=1)
    try:
        while task.hasNext():
            task.craw()
    finally:
        task.close()