python编写的URL采集工具

本文最后更新于 2024年8月23日 下午

0、简介

适用于百度的url采集工具,采集url的同时,可以配合burp+xray联动检测。

1、代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/python3

'''
Blog:https://www.imortal.icu
Author:imortal
'''

import re
import sys
import time
import requests
import threading

#限制线程的最大数量为100
sem=threading.Semaphore(100)
#正则匹配url地址
url_re = re.compile(r'<a href="(.*?)" target="_blank"')
head = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'BAIDUID=4E1F48863DE6180CB275FC38826CD741:FG=1; BIDUPSID=4E1F48863DE6180C0425C90DEB64DA88; PSTM=1657768645; BD_UPN=13314752; ZFY=A7LYxpn3VZCuGnOtwfGfP1T1OYErt87SDFQGsLE6aaw:C; COOKIE_SESSION=129306_0_4_4_1_4_1_0_4_4_0_0_0_0_2_0_1658385541_0_1658385539%7C5%230_0_1658385539%7C1; baikeVisitId=51e786eb-5497-4b5b-bd36-485d969cc770; Hm_lvt_aec699bb6442ba076c8981c6dc490771=1658256230; B64_BOT=1; H_PS_645EC=cd791WcJIjH2j2IzGRQKOe%2BzrQxF2yOtETeX3vsZq40XJjzyhhFRwEhI24C3QYesfKDN; BA_HECTOR=242h8kag8la42l80242kle1d1hdht4417; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; ab_sr=1.0.1_ZmExYTE0Zjg5ODJlYjM2Y2EzOTY5N2ViZWZjYTA5Y2E5MzU2ZmYxNTdlM2ZkMGUyODNlYzE3MWE3OGU1MTgyZGM3YTllNjBhMjNhZjJiMDA5MjIxMTQwNzRiNWVmMjY3MDM4OTdiYzM3ZDQxZmJmYzhhM2RmMWY5MTVmMWNjOTgyMWUwNDFhNjlhZTRkNWI4ZGIzYTA2MDM4YTZlYjMwZA==; BD_HOME=1; H_PS_PSSID=36547_36756_36255_36726_36455_36413_36846_36452_36690_36165_36816_36570_36776_36773_36637_36746_36760_36768_36766_26350_36861',
'Host': 'www.baidu.com',
'Referer': 'https://www.baidu.com/',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62',
'sec-ch-ua': '" Not;A Brand";v="99", "Microsoft Edge";v="103", "Chromium";v="103"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
#代理地址,方便配合burp使用
proxies={
'http':'127.0.0.1:8080','https':'127.0.0.1:8080'
}
#保存url地址
def result_url(target):
result = open('url.txt', 'a+')
with open('url.txt') as f:
if target not in f.read():
result.write(target +'\n')
result.close()
#保存网站地址
def result_domain(target):
target = 'http://' + target.split('/')[2]
result = open('domain.txt', 'a+')
with open('domain.txt') as f:
if target not in f.read():
result.write(target +'\n')
result.close()

def spider(url):
with sem:
try:
req = requests.get(url=url,headers=head,timeout=5)
urls = url_re.findall(req.text)
#遍历获取到的url地址
for url in urls:
###百度采集到的url,二次请求后可以获取真实的网站地址 req.url
req = requests.get(url=url,timeout=5)
target = req.url
#简单做一下过滤
if 'baidu' not in target and '51cto' not in target and 'csdn' not in target and 'jiameng' not in target:
print(target)
#开启:联动burp 关闭:只采集url地址
#req = requests.get(url=target,proxies=proxies,timeout=5)
result_url(target)
result_domain(target)
except:
pass
time.sleep(0.1)


if __name__ == '__main__':
if len(sys.argv)!= 2:
print('python url_spider.py "inurl:login title:系统"')
sys.exit(-1)
else:
keyword = sys.argv[1]
for i in range(0,750,50):
url_start = 'https://www.baidu.com/s?wd=' + keyword + '&rn=50&pn='
url = url_start+str(i)
t=threading.Thread(target=spider,args=(url,))
t.start()

2、截图

img