# Every database column needs its own treatment before it is written out:
# Chinese text has to be encoded and the time columns have to be converted.
def formatitem(value, colnum):
    """Prepare one database cell for output.

    Args:
        value: raw cell value; may be None.
        colnum: index of the column the cell belongs to.

    Returns:
        '' (or ' ' when the module-level ``output_type`` is 'htm') for
        empty cells; the cp936-encoded text for column 9; the result of
        ``formattime`` for the time columns 5, 6 and 7 when the numeric
        value is at least 1; otherwise the value unchanged.
    """
    if value is None:
        # Test for None before anything else: previously a None in column 9
        # reached .encode() and raised AttributeError.
        value = ''
    elif colnum == 9:
        value = value.encode('cp936')

    # Columns 5-7 are the time_xxxed columns.  An empty cell is skipped
    # because converting '' to a float raises ValueError.
    if colnum in (5, 6, 7) and value != '':
        value = float(value)  # same conversion string.atof performs
        if value < 1:
            value = ''
        else:
            value = formattime(value)

    # NOTE(review): SOURCE shows a single plain space here; if the original
    # used an HTML non-breaking-space entity it was lost -- confirm.
    if value == '' and output_type == 'htm':
        value = ' '
    return value
247
248
249
250 def check_one_proxy(ip,port):
251 global update_array
252 global check_in_one_call
253 global target_url,target_string,target_timeout
254
255 url=target_url
256 checkstr=target_string
257 timeout=target_timeout
258 ip=string.strip(ip)
259 proxy=ip+':'+str(port)
260 proxies = {'http': 'http://'+proxy+'/'}
261 opener = urllib.FancyURLopener(proxies)
262 opener.addheaders = [
263 ('User-agent','Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)')
264 ]
265 t1=time.time()
266
267 if (url.find("?")==-1):
268 url=url+'?rnd='+str(random.random())
269 else:
270 url=url+'&rnd='+str(random.random())
271
272 try:
273 f = opener.open(url)
274 s= f.read()
275 pos=s.find(checkstr)
276 except:
277 pos=-1
278 pass
279 t2=time.time()
280 timeused=t2-t1
281 if (timeused<timeout and pos>0):
282 active=1
283 else:
284 active=0
285 update_array.append([ip,port,active,timeused])
286 print len(update_array),' of ',check_in_one_call," ",ip,':',port,'--',int(timeused)
287
288
def get_html(url=''):
    """Download *url* without a proxy and return the response body.

    A random ``rnd`` query parameter is appended to the URL before the
    request is made.

    Args:
        url: address to fetch.

    Returns:
        The body returned by ``read()``, or '' when the request fails.
    """
    opener = urllib.FancyURLopener({})  # empty mapping: do not use a proxy
    # www.my-proxy.com needs the cookie below to be fetched properly.
    opener.addheaders = [
        ('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)'),
        ('Cookie', 'permission=1')
    ]
    # Use '&' when the URL already carries a query string.
    if '?' in url:
        url = url + '&rnd=' + str(random.random())
    else:
        url = url + '?rnd=' + str(random.random())

    try:
        response = opener.open(url)
        try:
            return response.read()
        finally:
            # Release the connection whether or not read() succeeded.
            response.close()
    except Exception:
        # Best effort: callers receive '' for any fetch failure, while
        # KeyboardInterrupt and SystemExit still propagate.
        return ''
306
307
308
309
310 ################################################################################
#
## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
#
################################################################################
311
def build_list_urls_1(page=5):
    """Return the proxy4free.com list-page URLs numbered 1 through *page*."""
    template = 'http://proxy4free.com/page%(num)01d.html'
    return [template % {'num': number} for number in range(1, page + 1)]
318
319 def parse_page_1(html=''):
320 matches=re.findall(r'''
321 <td>([\d\.]+)<\/td>[\s\n\r]* #ip
322 <td>([\d]+)<\/td>[\s\n\r]* #port
323 <td>([^\<]*)<\/td>[\s\n\r]* #type
324 <td>([^\<]*)<\/td> #area
325 ''',html,re.VERBOSE)
326 ret=[]
327 for match in matches:
328 ip=match[0]
329 port=match[1]
330 type=match[2]
331 area=match[3]
332 if (type=='anonymous'):
333 type=1
334 elif (type=='high anonymity'):
335 type=2
336 elif (type=='transparent'):
337 type=0
338 else:
339 type=-1
340 ret.append([ip,port,type,area])
341 if indebug:print '1',ip,port,type,area
342 return ret
343
344 ################################################################################
#
## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
#
################################################################################
345
346
def build_list_urls_2(page=1):
    """Return the one-element list of the digitalcybersoft.com proxy page.

    *page* is accepted for signature parity with the other builders but
    is not used.
    """
    fresh_list = 'http://www.digitalcybersoft.com/ProxyList/fresh-proxy-list.shtml'
    return [fresh_list]
349
350 def parse_page_2(html=''):
351 matches=re.findall(r'''
352 ((?:[\d]{1,3}\.){3}[\d]{1,3})\:([\d]+) #ip:port
353 \s+(Anonymous|Elite Proxy)[+\s]+ #type
354 (.+)\r?\n #area
355 ''',html,re.VERBOSE)
356 ret=[]
357 for match in matches:
358 ip=match[0]
359 port=match[1]
360 type=match[2]
361 area=match[3]
362 if (type=='Anonymous'):
363 type=1
364 else:
365 type=2
366 ret.append([ip,port,type,area])
367 if indebug:print '2',ip,port,type,area
368 return ret
369
370
371 ################################################################################
#
## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
#
################################################################################
372
373
def build_list_urls_3(page=15):
    """Return the samair.ru list-page URLs numbered 01 through *page*.

    Page numbers are zero-padded to two digits.
    """
    template = 'http://www.samair.ru/proxy/proxy-%(num)02d.htm'
    return [template % {'num': number} for number in range(1, page + 1)]
380
381 def parse_page_3(html=''):
382 matches=re.findall(r'''
383 <tr><td><span\sclass\="\w+">(\d{1,3})<\/span>\. #ip(part1)
384 <span\sclass\="\w+">
385 (\d{1,3})<\/span> #ip(part2)
386 (\.\d{1,3}\.\d{1,3}) #ip(part3,part4)
387
388 \:\r?\n(\d{2,5})<\/td> #port
389 <td>([^<]+)</td> #type
390 <td>[^<]+<\/td>
391 <td>([^<]+)<\/td> #area
392 <\/tr>''',html,re.VERBOSE)
393 ret=[]
394 for match in matches:
395 ip=match[0]+"."+match[1]+match[2]
396 port=match[3]
397 type=match[4]
398 area=match[5]
399 if (type=='anonymous proxy server'):
400 type=1
401 elif (type=='high-anonymous proxy server'):
402 type=2
403 elif (type=='transparent proxy'):
404 type=0
405 else:
406 type=-1
407 ret.append([ip,port,type,area])
408 if indebug:print '3',ip,port,type,area
409 return ret
410
411
412
413 ################################################################################
#
## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
#
################################################################################
414
def build_list_urls_4(page=3):
    """Return the pass-e.com list-page URLs for pages 1 through *page*."""
    template = 'http://www.pass-e.com/proxy/index.php?page=%(n)01d'
    return [template % {'n': number} for number in range(1, page + 1)]
421
422 def parse_page_4(html=''):
423 matches=re.findall(r"""
424 list
425 \('(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' #ip
426 \,'(\d{2,5})' #port
427 \,'(\d)' #type
428 \,'([^']+)'\) #area
429 \;\r?\n""",html,re.VERBOSE)
430 ret=[]
431 for match in matches:
432 ip=match[0]
433 port=match[1]
434 type=match[2]
435 area=match[3]
436 area=unicode(area, 'cp936')
437 area=area.encode('utf8')
438 if (type=='1'): #type的判断可以查看抓回来的网页的javascript部分
439 type=1
440 elif (type=='3'):
441 type=2
442 elif (type=='2'):
443 type=0
444 else:
445 type=-1
446 ret.append([ip,port,type,area])
447 if indebug:print '4',ip,port,type,area
448 return ret
449