# ip.py — scrape free HTTP proxies, validate them, and store the working
# ones in the MongoDB `spider.proxy_free` collection.
import random

import requests
from lxml import etree
from pymongo.mongo_client import MongoClient

from util_common import Constant
  6. def _get_proxies_from_site():
  7. url = 'http://proxy.ipcn.org/country/'
  8. xpath = '/html/body/div[last()]/table[last()]/tr/td/text()'
  9. r = requests.get(url)
  10. tree = etree.HTML(r.text)
  11. results = tree.xpath(xpath)
  12. proxies = [line.strip() for line in results]
  13. return proxies
  14. # http://lwons.com/wx
  15. def _get_valid_proxies(proxies, count):
  16. url = 'http://lwons.com/wx'
  17. results = set()
  18. cur = 0
  19. for p in proxies:
  20. proxy = {'http': 'http://' + p}
  21. succeed = False
  22. try:
  23. r = requests.get(url, proxies=proxy, timeout=6)
  24. if r.text == 'default':
  25. succeed = True
  26. except:
  27. succeed = False
  28. if succeed:
  29. print('succeed:', p)
  30. results.append(p)
  31. cur += 1
  32. if cur >= count:
  33. break
  34. return results
  35. def refresh(size=100):
  36. ips = _get_valid_proxies(_get_proxies_from_site(), size)
  37. _ips = []
  38. for ip in ips:
  39. _ips.append({'type': "http", 'proxy': ip, 'random': random.random()})
  40. cli = MongoClient(Constant.MONGODB_URL)
  41. db = cli.spider
  42. db.proxy_free.remove()
  43. db.proxy_free.save(_ips)
  44. print(len(_ips))
# Run as a script: rebuild the proxy pool with the default size (100).
if __name__ == '__main__':
    refresh()