"""Refresh the pool of free HTTP proxies stored in MongoDB.

Candidate proxies are scraped from a public listing page, each one is probed
against a test URL, and the working ones replace the contents of the
``spider.proxy_free`` collection.
"""
import random

from lxml import etree
from pymongo.mongo_client import MongoClient
import requests

from util_common import Constant

# Seconds to wait for the proxy listing page before giving up.
_LISTING_TIMEOUT = 10
# Seconds to wait for a single proxy probe before treating the proxy as dead.
_PROBE_TIMEOUT = 6


def _get_proxies_from_site():
    """Scrape candidate proxy addresses from the public listing page.

    Returns:
        A list of non-empty, whitespace-stripped strings taken from the table
        cells matched by the XPath below (presumably ``host:port`` entries --
        confirm against the live page).

    Raises:
        requests.RequestException: if the listing page cannot be fetched
            within ``_LISTING_TIMEOUT`` seconds.
    """
    url = 'http://proxy.ipcn.org/country/'
    xpath = '/html/body/div[last()]/table[last()]/tr/td/text()'
    # A timeout keeps the whole refresh from hanging on an unresponsive site.
    r = requests.get(url, timeout=_LISTING_TIMEOUT)
    tree = etree.HTML(r.text)
    if tree is None:
        # Nothing parseable came back, so there are no candidates.
        return []
    # Drop whitespace-only cells; they can never be usable proxy addresses.
    return [cell.strip() for cell in tree.xpath(xpath) if cell.strip()]


# http://lwons.com/wx
def _get_valid_proxies(proxies, count):
    """Probe *proxies* in order and return those that work.

    A proxy counts as working when a GET of the test URL through it returns
    exactly the body ``'default'``.

    Args:
        proxies: iterable of proxy addresses (without the ``http://`` scheme).
        count: stop probing once this many working proxies have been found.

    Returns:
        A list of distinct working proxies, in the order they were found.
    """
    url = 'http://lwons.com/wx'
    valid = []
    seen = set()
    for p in proxies:
        if p in seen:
            # Do not spend a probe (or a slot) on a duplicate listing entry.
            continue
        seen.add(p)
        proxy = {'http': 'http://' + p}
        try:
            r = requests.get(url, proxies=proxy, timeout=_PROBE_TIMEOUT)
        except requests.RequestException:
            # Dead, slow or misbehaving proxy: skip it, this is best-effort.
            continue
        if r.text != 'default':
            continue
        print('succeed:', p)
        valid.append(p)
        if len(valid) >= count:
            break
    return valid


def refresh(size=100):
    """Replace the stored free-proxy pool with freshly validated proxies.

    Args:
        size: maximum number of working proxies to collect and store.

    The ``proxy_free`` collection of the ``spider`` database is emptied and
    then filled with one document per working proxy. Each document carries a
    ``random`` float in [0, 1) (presumably so consumers can sample a random
    proxy -- confirm against the readers of this collection).
    """
    ips = _get_valid_proxies(_get_proxies_from_site(), size)
    docs = [
        {'type': "http", 'proxy': ip, 'random': random.random()}
        for ip in ips
    ]

    cli = MongoClient(Constant.MONGODB_URL)
    try:
        db = cli.spider
        db.proxy_free.delete_many({})
        if docs:
            # insert_many() rejects an empty list, so only call it with data.
            db.proxy_free.insert_many(docs)
    finally:
        cli.close()
    print(len(docs))


if __name__ == '__main__':
    refresh()