首页 » Python » 使用python获得代理ip并且验证是否有效

使用python获得代理ip并且验证是否有效

 

闯哥提供的IP代理来源

1)国内外HTTP代理服务器提供商:
https://dash.scrapinghub.com/crawlera/#/down/usage
http://www.iprent.cn/
...

2)258IP等站群服务器:
http://cn.raksmart.com/zhanqun
...

3)国内部署多台机器做分布式:
奏鸣部署30台阿里云
...

4)自己扫IP,或购买其他人扫的IP:
http://ip.zdaye.com/
...

5)ADSL拨号重连换IP

6)代理通道(如google翻译等)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#coding:utf-8

import urllib2,zlib,json,time,re,threading

# strftime pattern used for the timestamp written to the output file:
# YYYY-MM-DD followed by the locale's time representation (%X).
ISOTIMEFORMAT = '%Y-%m-%d %X'

rawProxyList = []        # every proxy fetched from the API, before validation
checkedProxyList = []    # (proxy, seconds) tuples for proxies that passed validation

# Start-up banner; runs at import time as well as when executed as a script.
print "开始获取http代理>>>>>>>>>>>>>>>>>>"

# Regex extraction helper
def search(req,line):
    """Return capture group 1 of the first match of ``req`` inside ``line``.

    ``req`` is a regular expression that must define at least one group.
    When the pattern does not match at all, the literal string 'no' is
    returned instead.
    """
    found = re.search(req,line)
    return found.group(1) if found else 'no'

# Download the proxy list from the provider API and fill rawProxyList.
# NOTE(review): the response is assumed to be plain text with one proxy per
# line, optionally followed by ",<extra fields>" -- confirm against the API.
url = 'https://proxyapi.mimvp.com/api/fetchopen.php?orderid=863000629212080122&num=20&result_fields=1,2'

request = urllib2.Request(url)
opener = urllib2.build_opener()
response = opener.open(request)
try:
    html = response.read()
finally:
    # Release the connection even if reading the body fails.
    response.close()

for ip in html.split('\n'):
    # Keep only the text before the first comma. str.strip() returns a new
    # string, so its result has to be used (the bare call was a no-op and
    # left stray '\r' / spaces on the entries).
    ip = ip.split(',', 1)[0].strip()
    if not ip:
        # Blank lines (e.g. the trailing newline of the response) are not proxies.
        continue
    rawProxyList.append(ip)
# Report the entries actually collected, not the raw line count.
print('已获取%s个代理' % len(rawProxyList))

class ProxyCheck(threading.Thread):
    """Worker thread that validates a slice of proxies.

    Every proxy in ``proxyList`` is used to fetch ``testUrl``. A proxy that
    returns the expected page title in under 5 seconds is recorded in the
    module-level ``checkedProxyList`` as a ``(proxy, seconds)`` tuple.
    """

    # Shared by all workers: guards every mutation of checkedProxyList so the
    # in-place sort of one thread cannot race with an append from another.
    _resultLock = threading.Lock()

    def __init__(self,proxyList):
        """Store the proxies to test; ``proxyList`` is a list of proxy strings."""
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 10   # timeout passed to each request, in seconds
        self.testUrl = "http://www.baidu.com/"  # page fetched through every proxy

    def checkProxy(self):
        """Probe each proxy in ``self.proxyList`` and record the ones that pass.

        Any exception raised while testing a proxy is printed and that proxy
        is skipped, so one bad proxy never stops the worker.
        """
        for proxy in self.proxyList:
            proxyHandler = urllib2.ProxyHandler({"http" : r'http://%s' % proxy})
            opener = urllib2.build_opener(proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36')]
            t1 = time.time()        # start time of the request

            try:
                req = opener.open(self.testUrl, timeout=self.timeout)
                try:
                    result = req.read()
                    timeused = time.time() - t1     # time spent fetching the page
                finally:
                    # Always release the connection, even when read() fails.
                    req.close()

                title = re.search(r'<title>(.*?)</title>',result)
                if title:
                    title = title.group(1)
                else:
                    title = '无法打开网页'

                # Single formatted write: same text as printing the three
                # values separated by spaces.
                print('%s %s %s' % (timeused, title, proxy))

                # A proxy passes when the page came back in under 5 seconds
                # and its title is exactly the expected one.
                # NOTE(review): confirm this literal matches the real page
                # title byte-for-byte (including the comma character).
                if (timeused < 5) and title == '百度一下,你就知道':
                    with self._resultLock:
                        checkedProxyList.append((proxy,timeused))

            except Exception as e:
                # Best-effort: report the failure and move on to the next proxy.
                print(e)
                continue

    def sort(self):
        """Sort the shared ``checkedProxyList`` in place, fastest proxy first.

        ``sorted()`` returns a new list, so calling it and discarding the
        result (as before) left the shared list unsorted; ``list.sort``
        reorders the shared list itself.
        """
        with self._resultLock:
            checkedProxyList.sort(key=lambda entry: entry[1])

    def run(self):
        """Thread entry point: validate this worker's proxies, then sort results."""
        self.checkProxy()
        self.sort()
             
if __name__ == "__main__":
    checkThreads = []

    n = 50  # number of worker threads
    # Ceiling division (explicit floor-division operator, so the slice bounds
    # stay integers): every proxy lands in exactly one slice, and workers past
    # the end of the list simply receive an empty slice.
    chunk = (len(rawProxyList) + (n - 1)) // n
    for i in range(n):
        t = ProxyCheck(rawProxyList[chunk * i:chunk * (i + 1)])
        checkThreads.append(t)

    for t in checkThreads:
        t.start()

    # Wait for every worker before reading the shared result list.
    for t in checkThreads:
        t.join()

    # All workers are done, so the shared list can be ordered safely here:
    # fastest proxy first.
    checkedProxyList.sort(key=lambda entry: entry[1])

    print(".......................总共%s个代理,共有%s个通过校验......................." % (len(rawProxyList),len(checkedProxyList)))

    dangqian_time = time.strftime( ISOTIMEFORMAT, time.localtime() )

    # The with-block closes the file even if a write fails part-way through.
    with open("daili.txt",'w') as f:
        # Header line: update timestamp and number of validated proxies.
        f.write("代理更新于:%s,共计%s个通过验证\n\n"  % (dangqian_time,len(checkedProxyList)))

        # One validated proxy per line; the elapsed time is not written.
        for proxy in checkedProxyList:
            f.write(proxy[0]+"\n")

有些API比较特殊(比如下面这个免费接口),返回的并不是干净的代理列表。这种情况下,可以先用正则把返回内容整理成想要的格式。

1
2
3
4
5
6
7
8
# Fetch the free proxy API page and reduce its HTML to one entry per line.
url = 'http://www.89ip.cn/tqdl.html?api=1&num=200&port=&address=&isp='

request = urllib2.Request(url)
opener = urllib2.build_opener()
response = opener.open(request)
html = response.read()

# Clean-up passes, applied in this exact order:
#   1. drop every newline so the following patterns can span the whole page,
#   2. remove the span running from "<a" through the second "script>",
#   3. turn each "<br>" into a line break.
for pattern, replacement in (
    ("\n", ""),
    ("<a.*?script>.*?script>", ""),
    ("<br>", "\n"),
):
    html = re.sub(pattern, replacement, html)
print(html)

原文链接:使用python获得代理ip并且验证是否有效,转载请注明来源!

0