123456789101112131415161718192021222324252627282930313233343536373839404142434445464748 |
- # coding=utf-8
- '''
- Created on 2016-06-17
- @author: Pro1
- '''
- from pymongo.mongo_client import MongoClient
- import math
- import datetime
- from util_common import Constant
- from concurrent2.propertyValue import Converter
- import time
class PooledThread(object):
    """Throttle that caps how many worker threads run concurrently.

    Each call to ``startThread`` launches one ``Converter`` and then blocks
    until the number of tracked threads is below ``poolSize`` again.

    NOTE(review): ``Converter`` is assumed to be a ``threading.Thread``
    subclass (it is started with ``start()`` and polled with ``is_alive()``);
    confirm against ``concurrent2.propertyValue``.
    """

    def __init__(self, poolSize=50):
        """Create an empty pool.

        Args:
            poolSize: Maximum number of tracked threads before
                ``waitFor`` starts blocking.
        """
        self.poolSize = poolSize
        self.threads = list()

    def waitFor(self):
        """Block until the pool has room for another thread.

        Returns immediately (without pruning) while the pool is below
        capacity. Otherwise finished threads are dropped from
        ``self.threads`` and, if the pool is still full, the call sleeps one
        second and re-checks.
        """
        # Iterative polling: the original recursed once per second of waiting
        # and would eventually hit the interpreter's recursion limit.
        while len(self.threads) >= self.poolSize:
            # Rebuild the list instead of calling remove() while iterating it,
            # which skipped the element following every removed one.
            # is_alive() replaces isAlive(), which was removed in Python 3.9.
            self.threads[:] = [t for t in self.threads if t.is_alive()]
            if not self.threads:
                # Nothing left to wait on (only reachable when poolSize <= 0);
                # sleeping further could never free a slot.
                return
            if len(self.threads) >= self.poolSize:
                time.sleep(1)

    def startThread(self, currentPage, pageSize):
        """Start a ``Converter`` for one page and wait for pool capacity.

        Args:
            currentPage: Page number handed to ``Converter``.
            pageSize: Page size handed to ``Converter``.
        """
        converter = Converter(currentPage, pageSize)
        converter.start()
        self.threads.append(converter)
        # Throttle the caller so at most poolSize threads are tracked at once.
        self.waitFor()
if __name__ == '__main__':
    # Script entry point: count the documents in spider.propertyvalue_0614,
    # split them into fixed-size pages and run one Converter thread per page,
    # throttled by PooledThread.
    cli = MongoClient(Constant.MONGODB_URL)
    try:
        db = cli.spider
        # Original author's note: 65149825 (presumably the count observed
        # when the script was written).
        totalElements = db.propertyvalue_0614.find({}, {'_id': True}).count()
    finally:
        # Release the connection even if the count query fails.
        cli.close()

    pageSize = 10000
    # Original author's note: 6520 (presumably the page count observed then).
    pages = int(math.ceil(totalElements / pageSize))
    pool = PooledThread(poolSize=50)
    print("totalElements: %s, pages: %s, task start at %s" % (totalElements, pages, datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))

    # range(1, pages) dispatched only pages - 1 pages and dropped the last
    # (usually partial) one; include it.
    # NOTE(review): assumes Converter page numbers are 1-based - confirm
    # against concurrent2.propertyValue.Converter.
    for currentPage in range(1, pages + 1):
        pool.startThread(currentPage, pageSize)
|