代码:
# -*- coding: utf-8 -*-
"""Scrape the index page of www.why403.cn and save each linked article
as a local .html file named after the article title."""
import os
import re
import sys

# Index page to scrape and directory where article pages are stored.
INDEX_URL = "http://www.why403.cn/"
SAVE_DIR = "D:/桌面/why403/"
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.36"
    )
}


def extract_pages(html):
    """Return a list of (article_url, page_name) pairs parsed from the index HTML.

    The two regexes mirror the site's markup: article links carry
    class="continue-reading-link", titles sit in the bookmark anchor.
    Pairs are zipped positionally, so the lists are assumed to line up
    one-to-one on the index page.
    """
    urls = re.findall(
        r'<a class="continue-reading-link" href="(.*?)">.*?</a>', html, re.S
    )
    names = re.findall(
        r'<a href=.*? itemprop="mainEntityOfPage" rel="bookmark">(.*?)</a>', html, re.S
    )
    return list(zip(urls, names))


def sanitize_filename(name):
    """Replace characters that are illegal in Windows filenames with '_'.

    The original code used the raw title as a filename, which crashes
    open() for titles containing / \\ : * ? " < > |.
    """
    return re.sub(r'[\\/:*?"<>|]', "_", name).strip()


def main():
    # requests is third-party; importing it lazily keeps the module
    # importable (and the helpers testable) without the package installed.
    import requests

    try:
        res = requests.get(INDEX_URL, headers=HEADERS, timeout=10)
        res.raise_for_status()  # treat HTTP 4xx/5xx as failures, not HTML
    except requests.RequestException as e:
        # Bug fix: the original printed the error and fell through,
        # then crashed with NameError on the undefined page text.
        print(e)
        sys.exit(1)

    # Bug fix: the original assumed the directory already existed.
    os.makedirs(SAVE_DIR, exist_ok=True)

    for page_url, page_name in extract_pages(res.text):
        print(page_name)
        storing_src = SAVE_DIR + sanitize_filename(page_name) + ".html"
        print(storing_src)
        try:
            page_res = requests.get(page_url, headers=HEADERS, timeout=10)
            # Save the fetched article page to disk.
            with open(storing_src, "w", encoding="utf-8") as f:
                f.write(page_res.text)
            print(page_name + "存储完成!")
        except (requests.RequestException, OSError) as e:
            # Best-effort per page: report and continue with the next article.
            print(e)


if __name__ == "__main__":
    main()
测试样例:
target url:www.why403.cn
local path:D:/桌面/why403/