+
80
-

使用 Selenium 打开网页时,如何下载该网页的所有静态资源文件(JS、CSS、图片等)?

使用 Selenium 打开网页时,如何下载该网页的所有静态资源文件(JS、CSS、图片等)?


网友回复

+
0
-

1、创建一个http代理

2、浏览器设置代理服务器

3、获取js、css、图片等资源url并下载到本地

+
0
-

附上示例代码:

import socket

import select

import threading

import requests

import os

from urllib.parse import urlparse, urljoin

# Address and port the proxy server listens on.
host, port = "127.0.0.1", 8081

# Passed to listen() below as the backlog of pending connections.
max_connections = 100

# Number of bytes requested from a socket per recv() call.
buffer_size = 4096

# Create the listening TCP socket. SO_REUSEADDR is set before bind() so the
# option is in effect when the address is claimed.
proxy_server = socket.socket(family=socket.AF_INET, type=socket.SOCK_STREAM)
proxy_server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
proxy_server.bind((host, port))
proxy_server.listen(max_connections)

print(f"代理服务器已启动,监听地址:{host}:{port}")


def _as_header_mapping(headers):
    """Return ``headers`` in a form ``requests`` accepts.

    Mappings (anything with ``.items``) and ``None`` pass through untouched.
    A sequence of raw ``"Name: value"`` lines is converted to a dict; lines
    that are not header fields (such as an HTTP request line) are ignored.
    """
    if headers is None or hasattr(headers, "items"):
        return headers

    # These describe the framing/encoding of the original exchange, not of
    # the fresh GET issued here, so forwarding them would be misleading.
    skipped = {
        "content-length",
        "transfer-encoding",
        "proxy-connection",
        "accept-encoding",
    }
    mapping = {}
    for line in headers:
        name, sep, value = line.partition(":")
        name = name.strip()
        # A real field name contains no whitespace; this rejects the request
        # line, whose URL also contains a colon.
        if not sep or not name or " " in name or "\t" in name:
            continue
        if name.lower() not in skipped:
            mapping[name] = value.strip()
    return mapping


def download_file(url, folder, headers):
    """Download ``url`` and store it under ``folder``, mirroring the URL path.

    Args:
        url: Absolute URL of the resource to fetch.
        folder: Root directory for the mirrored files.
        headers: Request headers, either a mapping or a sequence of raw
            ``"Name: value"`` lines; ``None`` sends no extra headers.

    Returns:
        The path of the written file, or ``None`` when the server did not
        answer with status 200 (nothing is written in that case).

    Raises:
        requests.RequestException: If the request fails or times out.
        OSError: If the directory or file cannot be created.
    """
    url_path = urlparse(url).path

    # Dropping "", "." and ".." segments stops a "/../" in the URL from
    # climbing out of ``folder``.
    segments = [
        part for part in url_path.split("/") if part not in ("", ".", "..")
    ]
    if not segments or url_path.endswith("/"):
        # A directory-style URL has no file name of its own.
        filename = "index.html"
    else:
        filename = segments.pop()

    local_path = os.path.join(folder, *segments)
    local_file = os.path.join(local_path, filename)

    # Without a timeout a stalled server would block this thread forever.
    response = requests.get(
        url, headers=_as_header_mapping(headers), timeout=30
    )
    if response.status_code != 200:
        return None

    # exist_ok avoids the check-then-create race between concurrent threads.
    os.makedirs(local_path, exist_ok=True)
    with open(local_file, "wb") as f:
        f.write(response.content)
    return local_file



# Handle one proxied HTTP request.

def _header_fields(header_lines):
    """Turn raw ``"Name: value"`` lines into a dict, skipping other lines."""
    fields = {}
    for line in header_lines:
        name, sep, value = line.partition(":")
        name = name.strip()
        if sep and name:
            fields[name] = value.strip()
    return fields


def _read_until_idle(sock, idle_timeout=3):
    """Read from ``sock`` until the peer closes or it stays idle too long.

    Returns everything received as one bytes object. ``idle_timeout`` is the
    number of seconds without readable data after which reading stops.
    """
    chunks = []
    while True:
        ready_to_read, _, _ = select.select([sock], [], [], idle_timeout)
        if not ready_to_read:
            break
        data = sock.recv(buffer_size)
        if not data:
            break
        chunks.append(data)
    return b"".join(chunks)


def handle_request(client_socket, client_address):
    """Relay one plain-HTTP request to its server, then mirror the resource.

    A single chunk of at most ``buffer_size`` bytes is read from the client
    and forwarded unchanged, so anything beyond that first chunk is not
    relayed. The collected response is sent back to the client, both sockets
    are closed, and the requested URL is then saved under ``"test"`` via
    ``download_file``.

    Empty requests are dropped silently. Malformed requests get a 400 reply,
    CONNECT (HTTPS tunnelling) gets 501, and an unreachable server gets 502.

    Args:
        client_socket: Connected socket of the client; always closed here.
        client_address: Peer address of the client (not used).

    Raises:
        Whatever ``download_file`` raises; the sockets are already closed by
        then.
    """
    remote_socket = None
    try:
        raw_request = client_socket.recv(buffer_size)
        if not raw_request:
            # The client connected and closed without sending anything.
            return

        # ISO-8859-1 maps every byte to a code point, so decoding the head
        # cannot fail; the raw bytes are what gets forwarded.
        head = raw_request.split(b"\r\n\r\n", 1)[0].decode("iso-8859-1")
        headers = head.split("\r\n")
        try:
            method, url, protocol = headers[0].split(" ")
        except ValueError:
            client_socket.sendall(
                b"HTTP/1.1 400 Bad Request\r\nConnection: close\r\n\r\n"
            )
            return

        if method.upper() == "CONNECT":
            # Tunnelling is not implemented; relaying the CONNECT line to
            # port 80 would only produce garbage.
            client_socket.sendall(
                b"HTTP/1.1 501 Not Implemented\r\nConnection: close\r\n\r\n"
            )
            return

        fields = _header_fields(headers[1:])

        # Prefer the authority in the request URL; fall back to the Host
        # header wherever it appears among the header lines.
        host_value = urlparse(url).netloc or next(
            (v for k, v in fields.items() if k.lower() == "host"), ""
        )
        try:
            authority = urlparse("//" + host_value)
            target_host = authority.hostname
            target_port = authority.port or 80
        except ValueError:
            target_host = None
        if not target_host:
            client_socket.sendall(
                b"HTTP/1.1 400 Bad Request\r\nConnection: close\r\n\r\n"
            )
            return

        print(f"get http req:{method} {url} {protocol} {host_value}")
        print(f"请求头部:{headers[:2]}")

        try:
            remote_socket = socket.create_connection(
                (target_host, target_port), timeout=10
            )
        except OSError:
            client_socket.sendall(
                b"HTTP/1.1 502 Bad Gateway\r\nConnection: close\r\n\r\n"
            )
            return

        # sendall keeps writing until every byte is out; send may stop early.
        remote_socket.sendall(raw_request)
        response = _read_until_idle(remote_socket)

        print(f"收到HTTP响应:{len(response)} bytes")
        # The 100-byte slice may cut a multi-byte character or hit binary
        # data, so undecodable bytes are replaced instead of raising.
        print(f"响应头部:{response[:100].decode('utf-8', errors='replace')}")

        client_socket.sendall(response)
    finally:
        if remote_socket is not None:
            remote_socket.close()
        client_socket.close()

    # download_file needs an absolute URL; build one when the request line
    # carried only a path.
    if urlparse(url).scheme:
        absolute_url = url
    else:
        absolute_url = urljoin(f"http://{host_value}/", url)
    download_file(absolute_url, "test", fields)


# Accept client connections forever, handling each one on its own thread.
try:
    while True:
        client_socket, client_address = proxy_server.accept()
        print(f"收到来自 {client_address} 的连接")
        threading.Thread(
            target=handle_request, args=(client_socket, client_address)
        ).start()
except KeyboardInterrupt:
    # Ctrl-C is the only way out of the loop; leave without a traceback.
    pass
finally:
    # Release the listening port once the loop ends for any reason.
    proxy_server.close()

我知道答案,我要回答