# coding=utf-8
'''
Created on 2016-06-17

@author: Pro1
'''
from pymongo.mongo_client import MongoClient
import math
import datetime
from util_common import Constant
from concurrent2.propertyValue import Converter
import time


class PooledThread(object):
    """Start ``Converter`` workers while capping how many run at once.

    Started workers are tracked in ``self.threads``; once ``poolSize`` of
    them are tracked, ``waitFor`` blocks until at least one has finished.
    """

    def __init__(self, poolSize=50):
        """Create an empty pool.

        Args:
            poolSize: maximum number of workers tracked at the same time.
        """
        self.poolSize = poolSize
        self.threads = list()

    def waitFor(self):
        """Block until fewer than ``poolSize`` workers are still tracked.

        Returns immediately when the pool has spare capacity.  Otherwise
        finished workers are dropped from ``self.threads`` and, if the pool
        is still full, the call sleeps one second and checks again.
        """
        # Loop instead of recursing: one stack frame per second of waiting
        # would eventually exceed the interpreter's recursion limit.
        while len(self.threads) >= self.poolSize:
            # Build the filtered list instead of calling remove() while
            # iterating, which skips the element after each removal.
            # is_alive() replaces isAlive(), which was removed in Python 3.9.
            # NOTE(review): assumes workers expose the threading.Thread API.
            self.threads[:] = [
                worker for worker in self.threads if worker.is_alive()
            ]
            if len(self.threads) >= self.poolSize:
                time.sleep(1)

    def startThread(self, currentPage, pageSize):
        """Start a ``Converter`` for one page, then wait for pool capacity.

        Args:
            currentPage: page number handed to ``Converter``.
            pageSize: page size handed to ``Converter``.
        """
        converter = Converter(currentPage, pageSize)
        converter.start()
        self.threads.append(converter)
        # Throttle the caller so no more than poolSize workers are tracked.
        self.waitFor()


if __name__ == '__main__':
    cli = MongoClient(Constant.MONGODB_URL)
    try:
        db = cli.spider
        # 65149825
        totalElements = db.propertyvalue_0614.find({}, {'_id': True}).count()
    finally:
        # Release the client even when the count query raises.
        cli.close()

    pageSize = 10000
    # 6520
    # Force float division and cast back to int so the page count is a
    # correctly rounded-up integer on both Python 2 and Python 3.
    pages = int(math.ceil(totalElements / float(pageSize)))
    pool = PooledThread(poolSize=50)
    print("totalElements: %s, pages: %s, task start at %s" % (totalElements, pages, datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    # Upper bound is pages + 1 so the final page is dispatched too;
    # range(1, pages) stopped one page short.
    # NOTE(review): assumes Converter treats currentPage as 1-based - confirm.
    for currentPage in range(1, pages + 1):
        pool.startThread(currentPage, pageSize)