def build_list_urls_5(page=12):
    """Return the www.ipfree.cn listing URLs for pages 1 through ``page``."""
    template = 'http://www.ipfree.cn/index2.asp?page=%(num)01d'
    return [template % {'num': number} for number in range(1, page + 1)]
460
461 def parse_page_5(html=''):
462 matches=re.findall(r"<font color=black>([^<]*)</font>",html)
463 ret=[]
464 for index, match in enumerate(matches):
465 if (index%3==0):
466 ip=matches[index+1]
467 port=matches[index+2]
468 type=-1 #该网站未提供代理服务器类型
469 area=unicode(match, 'cp936')
470 area=area.encode('utf8')
471 if indebug:print '5',ip,port,type,area
472 ret.append([ip,port,type,area])
473 else:
474 continue
475 return ret
476
def build_list_urls_6(page=3):
    """Return the www.cnproxy.com listing URLs for pages 1 through ``page``."""
    template = 'http://www.cnproxy.com/proxy%(num)01d.html'
    return [template % {'num': number} for number in range(1, page + 1)]
486
487 def parse_page_6(html=''):
488 matches=re.findall(r'''<tr>
489 <td>([^&]+) #ip
490 ‌
491 \:([^<]+) #port
492 </td>
493 <td>HTTP</td>
494 <td>[^<]+</td>
495 <td>([^<]+)</td> #area
496 </tr>''',html,re.VERBOSE)
497 ret=[]
498 for match in matches:
499 ip=match[0]
500 port=match[1]
501 type=-1 #该网站未提供代理服务器类型
502 area=match[2]
503 area=unicode(area, 'cp936')
504 area=area.encode('utf8')
505 ret.append([ip,port,type,area])
506 if indebug:print '6',ip,port,type,area
507 return ret
508
def build_list_urls_7(page=1):
    """Return the single proxylists.net high-anonymity list URL.

    ``page`` is accepted but not used: the list is one plain-text file.
    """
    urls = ['http://www.proxylists.net/http_highanon.txt']
    return urls
517
518 def parse_page_7(html=''):
519 matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})',html)
520 ret=[]
521 for match in matches:
522 ip=match[0]
523 port=match[1]
524 type=2
525 area='--'
526 ret.append([ip,port,type,area])
527 if indebug:print '7',ip,port,type,area
528 return ret
529
536
def build_list_urls_8(page=1):
    """Return the single proxylists.net general HTTP list URL.

    ``page`` is accepted but not used: the list is one plain-text file.
    """
    urls = ['http://www.proxylists.net/http.txt']
    return urls
539
540 def parse_page_8(html=''):
541 matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})',html)
542 ret=[]
543 for match in matches:
544 ip=match[0]
545 port=match[1]
546 type=-1
547 area='--'
548 ret.append([ip,port,type,area])
549 if indebug:print '8',ip,port,type,area
550 return ret
555
556
def build_list_urls_9(page=6):
    """Return the proxylist.sakura.ne.jp URLs for pages 0 through ``page``.

    Numbering starts at 0, so ``page + 1`` URLs are returned.
    """
    template = 'http://proxylist.sakura.ne.jp/index.htm?pages=%(n)01d'
    return [template % {'n': number} for number in range(0, page + 1)]
563
564 def parse_page_9(html=''):
565 matches=re.findall(r'''
566 (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip
567 \:(\d{2,5}) #port
568 <\/TD>[\s\r\n]*
569 <TD>([^<]+)</TD> #area
570 [\s\r\n]*
571 <TD>([^<]+)</TD> #type
572 ''',html,re.VERBOSE)
573 ret=[]
574 for match in matches:
575 ip=match[0]
576 port=match[1]
577 type=match[3]
578 area=match[2]
579 if (type=='Anonymous'):
580 type=1
581 else:
582 type=-1
583 ret.append([ip,port,type,area])
584 if indebug:print '9',ip,port,type,area
585 return ret
586
588
def build_list_urls_10(page=5):
    """Return the www.publicproxyservers.com URLs for pages 1 through ``page``."""
    template = 'http://www.publicproxyservers.com/page%(n)01d.html'
    return [template % {'n': number} for number in range(1, page + 1)]
595
596 def parse_page_10(html=''):
597 matches=re.findall(r'''
598 (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip
599 <\/td>[\s\r\n]*
600 <td[^>]+>(\d{2,5})<\/td> #port
601 [\s\r\n]*
602 <td>([^<]+)<\/td> #type
603 [\s\r\n]*
604 <td>([^<]+)<\/td> #area
605 ''',html,re.VERBOSE)
606 ret=[]
607 for match in matches:
608 ip=match[0]
609 port=match[1]
610 type=match[2]
611 area=match[3]
612 if (type=='high anonymity'):
613 type=2
614 elif (type=='anonymous'):
615 type=1
616 elif (type=='transparent'):
617 type=0
618 else:
619 type=-1
620 ret.append([ip,port,type,area])
621 if indebug:print '10',ip,port,type,area
622 return ret
623
625
626
627
def build_list_urls_11(page=10):
    """Return the www.my-proxy.com list URLs.

    Numbered lists 1 through ``page`` come first, followed by the three
    fixed lists ``s1``, ``s2`` and ``s3``.
    """
    base = 'http://www.my-proxy.com/list/proxy.php?list='
    numbered = [base + '%(n)01d' % {'n': number} for number in range(1, page + 1)]
    fixed = [base + 's1', base + 's2', base + 's3']
    return numbered + fixed
638
639 def parse_page_11(html=''):
640 matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})',html)
641 ret=[]
642
643 if (html.find('(Level 1)')>0):
644 type=2
645 elif (html.find('(Level 2)')>0):
646 type=1
647 elif (html.find('(Level 3)')>0):
648 type=0
649 else:
650 type=-1
651
652 for match in matches:
653 ip=match[0]
654 port=match[1]
655 area='--'
656 ret.append([ip,port,type,area])
657 if indebug:print '11',ip,port,type,area
658 return ret
659
661
662
663
def build_list_urls_12(page=4):
    """Return the four fixed www.cybersyndrome.net list URLs.

    ``page`` is accepted but not used.
    """
    return [
        'http://www.cybersyndrome.net/plr4.html',
        'http://www.cybersyndrome.net/pla4.html',
        'http://www.cybersyndrome.net/pld4.html',
        'http://www.cybersyndrome.net/pls4.html',
    ]
671
672 def parse_page_12(html=''):
673 matches=re.findall(r'''
674 onMouseOver\=
675 "s\(\'(\w\w)\'\)" #area
676 \sonMouseOut\="d\(\)"\s?c?l?a?s?s?\=?"?
677 (\w?) #type
678 "?>
679 (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip
680 \:(\d{2,5}) #port
681 ''',html,re.VERBOSE)
682 ret=[]
683 for match in matches:
684 ip=match[2]
685 port=match[3]
686 area=match[0]
687 type=match[1]
688 if (type=='A'):
689 type=2
690 elif (type=='B'):
691 type=1
692 else:
693 type=0
694 ret.append([ip,port,type,area])
695 if indebug:print '12',ip,port,type,area
696 return ret
699
700
def build_list_urls_13(page=3):
    """Discover the www.checkedproxylists.com list URLs from its front page.

    Downloads the front page and collects every link whose text is
    'high_anonymous proxy list', 'anonymous proxy list' or
    'transparent proxy list'; each href is appended to the site root.
    ``page`` is accepted but not used.
    """
    url = 'http://www.checkedproxylists.com/'
    front_page = get_html(url)
    links = re.findall(r"""
        href\='([^']+)'>(?:high_anonymous|anonymous|transparent)
        \sproxy\slist<\/a>""", front_page, re.VERBOSE)
    return [url + link for link in links]
708
709 def parse_page_13(html=''):
710 html_matches=re.findall(r"eval\(unescape\('([^']+)'\)",html)
711 if (len(html_matches)>0):
712 conent=urllib.unquote(html_matches[0])
713 matches=re.findall(r"""<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})<\/td>
714 <td>(\d{2,5})<\/td><\/tr>""",conent,re.VERBOSE)
715 ret=[]
716 if (html.find('<title>Checked Proxy Lists - proxylist_high_anonymous_')>0):
717 type=2
718 elif (html.find('<title>Checked Proxy Lists - proxylist_anonymous_')>0):
719 type=1
720 elif (html.find('<title>Checked Proxy Lists - proxylist_transparent_')>0):
721 type=0
722 else:
723 type=-1
724
725 for match in matches:
726 ip=match[0]
727 port=match[1]
728 area='--'
729 ret.append([ip,port,type,area])
730 if indebug:print '13',ip,port,type,area
731 return ret
732
734
735
736
# Worker thread class

class TEST(threading.Thread):
    """Worker thread that either scrapes one website or checks proxies.

    ``action`` selects the job: 'getproxy' scrapes the website numbered
    ``index``; any other value checks the proxies in ``checklist``.
    """

    def __init__(self, action, index=None, checklist=None):
        threading.Thread.__init__(self)
        self.action = action
        self.index = index
        self.checklist = checklist

    def run(self):
        """Run the job selected by ``self.action``."""
        if self.action != 'getproxy':
            check_proxy(self.index, self.checklist)
            return
        get_proxy_one_website(self.index)
751
752
def check_proxy(index, checklist=None):
    """Check every proxy in ``checklist`` with ``check_one_proxy``.

    ``index`` is the worker number; it is accepted but not used here.
    ``checklist`` is an iterable of rows whose first two items are ip and
    port.  ``None`` (the default that TEST forwards) is treated as empty
    instead of raising TypeError.
    """
    for item in checklist or ():
        check_one_proxy(item[0], item[1])
756
757
758 def patch_check_proxy(threadCount,action=''):
759 global check_in_one_call,skip_check_in_hour,conn
760 threads=[]
761 if (action=='checknew'): #检查所有新加入,并且从未被检查过的
762 orderby=' `time_added` desc '
763 strwhere=' `active` is null '
764 elif (action=='checkok'): #再次检查 以前已经验证成功的 代理
765 orderby=' `time_checked` asc '
766 strwhere=' `active`=1 '
767 elif (action=='checkfail'): #再次检查以前验证失败的代理
768 orderby=' `time_checked` asc '
769 strwhere=' `active`=0 '
770 else: #检查所有的
771 orderby=' `time_checked` asc '
772 strwhere=' 1=1 '
773 sql="""
774 select `ip`,`port` FROM `proxier` where
775 `time_checked` < (unix_timestamp()-%(skip_time)01s)
776 and %(strwhere)01s
777 order by %(order)01s
778 limit %(num)01d
779 """%{ 'num':check_in_one_call,
780 'strwhere':strwhere,
781 'order':orderby,
782 'skip_time':skip_check_in_hour*3600}
783 conn.execute(sql)
784 rows = conn.fetchall()
785
786 check_in_one_call=len(rows)
787
788 #计算每个线程将要检查的代理个数
789 if len(rows)>=threadCount:
790 num_in_one_thread=len(rows)/threadCount
791 else:
792 num_in_one_thread=1
793
794 threadCount=threadCount+1
795 print "现在开始验证以下代理服务器....."
796 for index in range(1,threadCount):
797 #分配每个线程要检查的checklist,并把那些剩余任务留给最后一个线程
798 checklist=rows[(index-1)*num_in_one_thread:index*num_in_one_thread]
799 if (index+1==threadCount):
800 checklist=rows[(index-1)*num_in_one_thread:]
801
802 t=TEST(action,index,checklist)
803 t.setDaemon(True)
804 t.start()
805 threads.append((t))
806 for thread in threads:
807 thread.join(60)
808 update_proxies() #把所有的检查结果更新到数据库
809
810
811 def get_proxy_one_website(index):
812 global proxy_array
813 func='build_list_urls_'+str(index)
814 parse_func=eval('parse_page_'+str(index))
815 urls=eval(func+'()')
816 for url in urls:
817 html=get_html(url)
818 print url
819 proxylist=parse_func(html)
820 for proxy in proxylist:
821 ip=string.strip(proxy[0])
822 port=string.strip(proxy[1])
823 if (re.compile("^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$").search(ip)):
824 type=str(proxy[2])
825 area=string.strip(proxy[3])
826 proxy_array.append([ip,port,type,area])
827
828
829 def get_all_proxies():
830 global web_site_count,conn,skip_get_in_hour
831
832 #检查最近添加代理是什么时候,避免短时间内多次抓取
833 rs=conn.execute("select max(`time_added`) from `proxier` limit 1")
834 last_add=rs.fetchone()[0]
835 if (last_add and my_unix_timestamp()-last_add<skip_get_in_hour*3600):
836 print """
837 放弃抓取代理列表!
838 因为最近一次抓取代理的时间是: %(t)1s
839 这个时间距离现在的时间小于抓取代理的最小时间间隔: %(n)1d 小时
840 如果一定要现在抓取代理,请修改全局变量: skip_get_in_hour 的值
841 """%{'t':formattime(last_add),'n':skip_get_in_hour}
842 return
843
844 print "现在开始从以下"+str(web_site_count)+"个网站抓取代理列表...."
845 threads=[]
846 count=web_site_count+1
847 for index in range(1,count):
848 t=TEST('getproxy',index)
849 t.setDaemon(True)
850 t.start()
851 threads.append((t))
852 for thread in threads:
853 thread.join(60)
854 add_proxies_to_db()
855
856 def add_proxies_to_db():
857 global proxy_array
858 count=len(proxy_array)
859 for i in range(count):
860 item=proxy_array[i]
861 sql="""insert into `proxier` (`ip`,`port`,`type`,`time_added`,`area`) values('
862 """+item[0]+"',"+item[1]+","+item[2]+",unix_timestamp(),'"+clean_string(item[3])+"')"
863 try:
864 conn.execute(sql)
865 print "%(num)2.1f\%\t"%{'num':100*(i+1)/count},item[0],":",item[1]
866 except:
867 pass
868
869
def update_proxies():
    """Write the check results collected in ``update_array`` to the database.

    Each item is read as ``(ip, port, active, speed)``.  The matching row
    gets ``time_checked`` set to now plus the new ``active`` and ``speed``
    (rounded to three decimals).  A failing update is skipped so the
    remaining results are still written.
    """
    global update_array
    sql = ("update `proxier` set `time_checked`=unix_timestamp(), "
           "`active`=?, `speed`=? where `ip`=? and `port`=?")
    for item in update_array:
        # Conversions stay outside the try block, as the old string
        # formatting did, so malformed items are not silently hidden.
        params = (int(item[2]), round(float(item[3]), 3), item[0], int(item[1]))
        try:
            conn.execute(sql, params)
        except Exception:
            # Best effort, but no longer swallows KeyboardInterrupt/SystemExit.
            continue
883
# sqlite does not support the unix_timestamp function, so we implement it ourselves
def my_unix_timestamp():
    """Return the current time as whole seconds since the epoch."""
    now = time.time()
    return int(now)
887
def clean_string(s):
    """Return ``s`` with quotes, commas, slashes and whitespace normalised.

    Every single quote, comma, backslash, forward slash and whitespace
    character becomes a space, then each run of whitespace is collapsed to
    one space.
    """
    unsafe = re.compile(r"['\,\s\\\/]")
    blanks = re.compile(r"\s+")
    return blanks.sub(' ', unsafe.sub(' ', s))
891
def formattime(t):
    """Format epoch seconds ``t`` with ``%c`` after adding eight hours."""
    shifted = time.gmtime(t + 8 * 3600)
    return time.strftime('%c', shifted)
894
895
896 def open_database():
897 global db,conn,day_keep,dbfile
898
899 try:
900 from pysqlite2 import dbapi2 as sqlite
901 except:
902 print """
903 本程序使用 sqlite 做数据库来保存数据,运行本程序需要 pysqlite的支持
904 python 访问 sqlite 需要到下面地址下载这个模块 pysqlite, 272kb
905 http://initd.org/tracker/pysqlite/wiki/pysqlite#Downloads
906 下载(Windows binaries for Python 2.x)
907 """
908 raise SystemExit
909
910 try:
911 db = sqlite.connect(dbfile,isolation_level=None)
912 db.create_function("unix_timestamp", 0, my_unix_timestamp)
913 conn = db.cursor()
914 except:
915 print "操作sqlite数据库失败,请确保脚本所在目录具有写权限"
916 raise SystemExit
917
918 sql="""
919 /* ip: 只要纯ip地址(xxx.xxx.xxx.xxx)的代理 */
920 /* type: 代理类型 2:高匿 1:普匿 0:透明 -1: 未知 */
921 /* status: 这个字段本程序还没有用到,留在这里作以后扩展*/
922 /* active: 代理是否可用 1:可用 0:不可用 */
923 /* speed: 请求相应时间,speed越小说明速度越快 */
924
925 CREATE TABLE IF NOT EXISTS `proxier` (
926 `ip` varchar(15) NOT NULL default '',
927 `port` int(6) NOT NULL default '0',
928 `type` int(11) NOT NULL default '-1',
929 `status` int(11) default '0',
930 `active` int(11) default NULL,
931 `time_added` int(11) NOT NULL default '0',
932 `time_checked` int(11) default '0',
933 `time_used` int(11) default '0',
934 `speed` float default NULL,
935 `area` varchar(120) default '--', /* 代理服务器所在位置 */
936 PRIMARY KEY (`ip`)
937 );
938 /*
939 CREATE INDEX IF NOT EXISTS `type` ON proxier(`type`);
940 CREATE INDEX IF NOT EXISTS `time_used` ON proxier(`time_used`);
941 CREATE INDEX IF NOT EXISTS `speed` ON proxier(`speed`);
942 CREATE INDEX IF NOT EXISTS `active` ON proxier(`active`);
943 */
944 PRAGMA encoding = "utf-8"; /* 数据库用 utf-8编码保存 */
945 """
946 conn.executescript(sql)
947 conn.execute("""DELETE FROM `proxier`
948 where `time_added`< (unix_timestamp()-?)
949 and `active`=0""",(day_keep*86400,))
950
951 conn.execute("select count(`ip`) from `proxier`")
952 m1=conn.fetchone()[0]
953 if m1 is None:return
954
955 conn.execute("""select count(`time_checked`)
956 from `proxier` where `time_checked`>0""")
957 m2=conn.fetchone()[0]
958
959 if m2==0:
960 m3,m4,m5=0,"尚未检查","尚未检查"
961 else:
962 conn.execute("select count(`active`) from `proxier` where `active`=1")
963 m3=conn.fetchone()[0]
964 conn.execute("""select max(`time_checked`), min(`time_checked`)
965 from `proxier` where `time_checked`>0 limit 1""")
966 rs=conn.fetchone()
967 m4,m5=rs[0],rs[1]
968 m4=formattime(m4)
969 m5=formattime(m5)
970 print """
971 共%(m1)1d条代理,其中%(m2)1d个代理被验证过,%(m3)1d个代理验证有效。
972 最近一次检查时间是:%(m4)1s
973 最远一次检查时间是: %(m5)1s
974 提示:对于检查时间超过24小时的代理,应该重新检查其有效性
975 """%{'m1':m1,'m2':m2,'m3':m3,'m4':m4,'m5':m5}
976
977
978
def close_database():
    """Close the cursor and the connection, then reset both globals to None."""
    global db, conn
    conn.close()
    db.close()
    conn = db = None
985
986 if __name__ == '__main__':
987 open_database()
988 get_all_proxies()
989 patch_check_proxy(thread_num)
990 output_file()
991 close_database()
992 print "所有工作已经完成"