# Scraping Alibaba's logistics quotes is comparatively low-risk (as long as the
# query frequency stays reasonable) -- this Alibaba page is meant to be crawled.
import random
import requests
import re
import os
import csv
import time
def dicBuild(path):
    """Create the directory *path* (whitespace-trimmed) if it does not already exist.

    :param path: target directory path; leading/trailing whitespace is stripped.
    """
    # Remove leading/trailing whitespace from the path
    path = path.strip()
    # exist_ok=True makes creation race-free: no separate exists() check that
    # could go stale between the test and the makedirs() call.
    os.makedirs(path, exist_ok=True)
def csvBuild(path, lineInfo):
    """Write logistics quotes to a UTF-8 CSV file at *path*.

    :param path: output CSV file path (overwritten if it exists).
    :param lineInfo: iterable of 3-tuples
        (company name, heavy-cargo price per kg, light-cargo price per cubic meter).
    """
    fieldnames = ['物流公司', '重货单价(元/公斤)', '轻货单价(元/立方)']
    # "with" guarantees the handle is closed even if a write raises
    # (the original leaked the file on any exception before close()).
    with open(path, mode='w', encoding='utf-8', newline='') as file:
        csvWriter = csv.DictWriter(file, fieldnames=fieldnames)
        csvWriter.writeheader()
        for item in lineInfo:
            csvWriter.writerow({
                fieldnames[0]: item[0],
                fieldnames[1]: item[1],
                fieldnames[2]: item[2],
            })
# Ensure the output directory exists before any CSVs are written.
dicBuild('StataDic')

# Hub page listing every route departing from Nanchang (city code 360100).
urlOrigin = 'https://56.1688.com/routes/nanchang360100.htm'
# 原cookie已被博主刻意篡改,仅用作格式核对,请使用正确的cookie值
# (The cookie below was deliberately mangled by the author and only shows the
#  expected format -- substitute a valid cookie before running.)
headers = {
    'cookie': 'cna=T7hCG73j4lYCAbZkXXcHClmF;...l=eBE74xDeTcgk64jLBOfanurza77OSIRGVhsiMR3z0a8y9BeYBqQAonxvTpyduh-Mmn',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.56'
}

# NOTE(review): `requests` has no DEFAULT_RETRIES attribute, so this assignment
# is a no-op, and the session `s` is never used (requests.get() below opens a
# fresh connection per call).  Both are kept verbatim from the original script.
requests.DEFAULT_RETRIES = 5
s = requests.session()
s.keep_alive = False

# Compile the patterns once, outside the loop, instead of re-parsing each pass.
cityPattern = re.compile('<a href="//56.1688.com/routes/nanchang360100-(.*?).htm"')
interPattern = re.compile('阿里物流南昌(.*?)更多南昌市相关的热门线路', re.S)
linePattern = re.compile(
    '<a title=".*?" target="_blank" class="choseItem".*?南昌.*?'
    '<a title="(.*?)" target="_blank".*?'
    '<strong class="F14">(.*?)</strong>元/公斤.*?'
    '<strong class="F14">(.*?)</strong>元/立方',
    re.S)

# Fetch the hub page and extract the destination-city slugs.
response = requests.get(url=urlOrigin, headers=headers, timeout=300)
cityNameInfo = cityPattern.findall(response.text)

for cityName in cityNameInfo:
    url = 'https://56.1688.com/routes/nanchang360100-' + cityName + '.htm'
    response2 = requests.get(url=url, headers=headers, timeout=300)
    # Narrow the page down to the quote table between the two marker strings.
    lineInfoInter = interPattern.findall(response2.text)
    try:
        lineInfo = linePattern.findall(str(lineInfoInter[0]))
    except IndexError:
        # Markers not found (layout change / anti-bot page) -- skip this city.
        continue
    csvBuild('StataDic/' + cityName + '.csv', lineInfo)
    # Randomized delay keeps the query rate polite and less bot-like.
    time.sleep(random.randint(5, 10))
# 单条请求测试代码
# url = 'https://56.1688.com/routes/nanchang360100-guangzhou.htm'
# response2 = requests.get(url=url, headers=headers, timeout=300)
#
# lineInfoInter = re.findall('阿里物流南昌(.*?)更多南昌市相关的热门线路', response2.text, re.S)
# print(lineInfoInter)
# lineInfo = re.findall('<a title=".*?" target="_blank" class="choseItem".*?南昌.*?<a title="(.*?)" target="_blank".*?<strong class="F14">(.*?)</strong>元/公斤.*?<strong class="F14">(.*?)</strong>元/立方', str(lineInfoInter[0]), re.S)
# csvBuild('StataDic/guangzhou.csv', lineInfo)