| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556 |
- import random
- from pymongo.mongo_client import MongoClient
- import requests
- from lxml import etree
- from util_common import Constant
def _get_proxies_from_site():
    """Scrape candidate proxy strings from the proxy.ipcn.org country page.

    Returns:
        list[str]: The stripped, non-empty text of every ``td`` cell found
        in the last table of the last top-level ``div`` of the page. The
        entries are not validated here.

    Raises:
        requests.RequestException: If the page cannot be fetched (including
            when the request exceeds the timeout).
    """
    url = 'http://proxy.ipcn.org/country/'
    xpath = '/html/body/div[last()]/table[last()]/tr/td/text()'
    # Bound the request so an unresponsive listing site cannot hang forever.
    r = requests.get(url, timeout=10)
    tree = etree.HTML(r.text)
    results = tree.xpath(xpath)
    # Whitespace-only text nodes strip down to '', which is never a usable
    # proxy address, so drop them instead of handing them to the validator.
    stripped = (line.strip() for line in results)
    return [line for line in stripped if line]
# Probe endpoint used to validate proxies: http://lwons.com/wx
- def _get_valid_proxies(proxies, count):
- url = 'http://lwons.com/wx'
- results = set()
- cur = 0
- for p in proxies:
- proxy = {'http': 'http://' + p}
- succeed = False
- try:
- r = requests.get(url, proxies=proxy, timeout=6)
- if r.text == 'default':
- succeed = True
- except:
- succeed = False
- if succeed:
- print('succeed:', p)
- results.append(p)
- cur += 1
- if cur >= count:
- break
- return results
def refresh(size=100):
    """Replace the stored free-proxy pool with freshly validated proxies.

    Scrapes candidates, keeps up to ``size`` that pass validation, then
    clears the ``proxy_free`` collection of the ``spider`` database and
    inserts one document per proxy. The number of stored proxies is printed.

    Args:
        size: Maximum number of validated proxies to store.
    """
    ips = _get_valid_proxies(_get_proxies_from_site(), size)
    # The 'random' field stores a random float alongside each proxy.
    docs = [
        {'type': "http", 'proxy': ip, 'random': random.random()}
        for ip in ips
    ]
    cli = MongoClient(Constant.MONGODB_URL)
    try:
        collection = cli.spider.proxy_free
        # delete_many/insert_many replace the legacy remove()/save() calls,
        # which were removed in PyMongo 4 (save() also took one document,
        # not a list).
        collection.delete_many({})
        if docs:
            # insert_many rejects an empty list, so only call it with data.
            collection.insert_many(docs)
    finally:
        cli.close()
    print(len(docs))
# Script entry point: rebuild the proxy pool with the default size.
if __name__ == "__main__":
    refresh()
|