# Scraping Alibaba's logistics quotes is comparatively low-risk (as long as the
# query frequency stays reasonable) -- this Alibaba page is meant to be crawled.
import random
import requests
import re
import os
import csv
import time
def dicBuild(path):
    """Create the directory *path* (whitespace-trimmed) if it does not already exist.

    :param path: target directory path; leading/trailing whitespace is stripped.
    """
    # Remove leading/trailing whitespace from the path
    path = path.strip()
    # exist_ok=True makes creation race-free: no separate exists() check that
    # could go stale between the test and the makedirs() call.
    os.makedirs(path, exist_ok=True)
def csvBuild(path, lineInfo):
    """Write logistics quotes to a UTF-8 CSV file at *path*.

    :param path: output CSV file path (overwritten if it exists).
    :param lineInfo: iterable of 3-tuples
        (company name, heavy-cargo price per kg, light-cargo price per cubic meter).
    """
    fieldnames = ['物流公司', '重货单价(元/公斤)', '轻货单价(元/立方)']
    # "with" guarantees the handle is closed even if a write raises
    # (the original leaked the file on any exception before close()).
    with open(path, mode='w', encoding='utf-8', newline='') as file:
        csvWriter = csv.DictWriter(file, fieldnames=fieldnames)
        csvWriter.writeheader()
        for item in lineInfo:
            csvWriter.writerow({
                fieldnames[0]: item[0],
                fieldnames[1]: item[1],
                fieldnames[2]: item[2],
            })
# Ensure the output directory exists before any CSVs are written.
dicBuild('StataDic')

# Hub page listing every route departing from Nanchang (city code 360100).
urlOrigin = 'https://56.1688.com/routes/nanchang360100.htm'
# 原cookie已被博主刻意篡改,仅用作格式核对,请使用正确的cookie值
# (The cookie below was deliberately mangled by the author and only shows the
#  expected format -- substitute a valid cookie before running.)
headers = {
    'cookie': 'cna=T7hCG73j4lYCAbZkXXcHClmF;...l=eBE74xDeTcgk64jLBOfanurza77OSIRGVhsiMR3z0a8y9BeYBqQAonxvTpyduh-Mmn',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.56'
}

# NOTE(review): `requests` has no DEFAULT_RETRIES attribute, so this assignment
# is a no-op, and the session `s` is never used (requests.get() below opens a
# fresh connection per call).  Both are kept verbatim from the original script.
requests.DEFAULT_RETRIES = 5
s = requests.session()
s.keep_alive = False

# Compile the patterns once, outside the loop, instead of re-parsing each pass.
cityPattern = re.compile('<a href="//56.1688.com/routes/nanchang360100-(.*?).htm"')
interPattern = re.compile('阿里物流南昌(.*?)更多南昌市相关的热门线路', re.S)
linePattern = re.compile(
    '<a title=".*?" target="_blank" class="choseItem".*?南昌.*?'
    '<a title="(.*?)" target="_blank".*?'
    '<strong class="F14">(.*?)</strong>元/公斤.*?'
    '<strong class="F14">(.*?)</strong>元/立方',
    re.S)

# Fetch the hub page and extract the destination-city slugs.
response = requests.get(url=urlOrigin, headers=headers, timeout=300)
cityNameInfo = cityPattern.findall(response.text)

for cityName in cityNameInfo:
    url = 'https://56.1688.com/routes/nanchang360100-' + cityName + '.htm'
    response2 = requests.get(url=url, headers=headers, timeout=300)
    # Narrow the page down to the quote table between the two marker strings.
    lineInfoInter = interPattern.findall(response2.text)
    try:
        lineInfo = linePattern.findall(str(lineInfoInter[0]))
    except IndexError:
        # Markers not found (layout change / anti-bot page) -- skip this city.
        continue
    csvBuild('StataDic/' + cityName + '.csv', lineInfo)
    # Randomized delay keeps the query rate polite and less bot-like.
    time.sleep(random.randint(5, 10))
# 单条请求测试代码
# url = 'https://56.1688.com/routes/nanchang360100-guangzhou.htm'
# response2 = requests.get(url=url, headers=headers, timeout=300)
#
# lineInfoInter = re.findall('阿里物流南昌(.*?)更多南昌市相关的热门线路', response2.text, re.S)
# print(lineInfoInter)
# lineInfo = re.findall('<a title=".*?" target="_blank" class="choseItem".*?南昌.*?<a title="(.*?)" target="_blank".*?<strong class="F14">(.*?)</strong>元/公斤.*?<strong class="F14">(.*?)</strong>元/立方', str(lineInfoInter[0]), re.S)
# csvBuild('StataDic/guangzhou.csv', lineInfo)