+
7
-

回答

Selenium 默认获取的就是 JS 渲染后的真实 DOM,但关键在于等待时机。如果在 JS 执行完成前调用获取方法,拿到的仍是初始 HTML。

下面提供一套生产环境常用的 Python 示例,包含显式等待、DOM 获取方式及注意事项:

完整示例代码(Python + Selenium 4)

from urllib.parse import urljoin, urlparse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_rendered_dom_with_resolved_urls(
    target_url: str,
    wait_selector: tuple = (By.TAG_NAME, "body"),
    timeout: float = 15,
) -> tuple:
    """Load a page in headless Chrome and return its JavaScript-rendered DOM.

    Args:
        target_url: URL to open in the browser.
        wait_selector: ``(By.<strategy>, value)`` locator of an element whose
            presence signals that client-side rendering has finished. Replace
            the default with an element specific to the page being scraped.
        timeout: Maximum number of seconds to wait for ``wait_selector``.

    Returns:
        A ``(rendered_html, base_url, links)`` tuple: the serialized DOM after
        the wait, the ``scheme://netloc`` of the final page URL, and the list
        of absolute URLs taken from every ``<a href>`` on the page.

    Raises:
        selenium.common.exceptions.TimeoutException: If ``wait_selector`` does
            not appear within ``timeout`` seconds.
    """
    chrome_options = webdriver.ChromeOptions()
    for argument in (
        "--headless",
        # NOTE(review): a fixed DevTools port presumably prevents running
        # several browser instances side by side -- confirm it is needed.
        "--remote-debugging-port=9222",
        "--no-sandbox",
        "--disable-gpu",
        "--disable-dev-shm-usage",
        "--window-size=1920,1080",
    ):
        chrome_options.add_argument(argument)

    # Selenium 4 takes the options object via ``options=``; the legacy
    # ``chrome_options=`` keyword and the positional driver path were removed.
    # Selenium 4.6+ locates a matching chromedriver on its own.
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(target_url)

        # 1. Explicitly wait until the marker element is present in the DOM,
        #    i.e. until the page's JavaScript has rendered it.
        WebDriverWait(driver, timeout=timeout, poll_frequency=0.5).until(
            EC.presence_of_element_located(wait_selector)
        )

        # 2. Grab the full DOM as it stands after rendering.
        rendered_html = driver.page_source

        # 3. Read the final URL once (each access is a WebDriver call) and
        #    derive the site root from it.
        current_url = driver.current_url
        parsed = urlparse(current_url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"

        # Collect every <a href>. urljoin leaves absolute URLs untouched and
        # resolves any relative ones against the final page URL.
        hrefs = (
            anchor.get_attribute("href")
            for anchor in driver.find_elements(By.TAG_NAME, "a")
        )
        links = [urljoin(current_url, href) for href in hrefs if href]

        return rendered_html, base_url, links

    finally:
        # Always shut the browser down, even when the wait times out.
        driver.quit()

# Usage example
if __name__ == "__main__":
    url = "https://example.com"
    html, base, links = get_rendered_dom_with_resolved_urls(
        url,
        # Replace with an element that the target page loads dynamically.
        wait_selector=(By.CSS_SELECTOR, "h1"),
    )
    print(f"✅ 渲染后 HTML 长度: {len(html)}")
    # Report the remaining two return values: the site root and the number
    # of links that were collected.
    print(f"🔗 基础 URL: {base}")
    print(f"📎 链接数量: {len(links)}")

网友回复

我知道答案,我要回答