We can use Selenium to scrape a search engine's results. Taking Baidu as an example, here is the script we run:
#!/usr/local/python3/bin/python3
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_argument("--headless")
chromeOptions.add_argument("--remote-debugging-port=9222")
chromeOptions.add_argument("--no-sandbox")
browser = webdriver.Chrome(service=Service('/usr/bin/chromedriver'), options=chromeOptions)

browser.get("https://www.baidu.com/")  # open Baidu
browser.find_element(By.ID, 'kw').send_keys('ai发展前景')  # type the query into the search box
browser.find_element(By.ID, 'su').click()  # click the search button

try:
    # wait for the <h3 class="c-title t t tts-title"> result titles to appear
    WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "h3.c-title.t.tts-title"))
    )
    # parse the result page with BeautifulSoup
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    # find all result-title <h3> tags
    h3_tags = soup.find_all('h3', class_='c-title t t tts-title')
    # extract the link and title from each result
    for h3 in h3_tags:
        a_tag = h3.find('a')
        if a_tag:
            link = a_tag['href']
            title = a_tag.get_text(strip=True)
            print(f'Link: {link}')
            print(f'Title: {title}')
            print('-' * 50)
finally:
    # close the browser exactly once
    browser.quit()

After this runs, we still need to extract the body text behind each result link and aggregate it. Note that the href values Baidu returns are redirect links of the form http://www.baidu.com/link?url=..., not the destination URLs.
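A quick way to confirm where one of these redirect links points is to request it without following redirects and read the Location header. A minimal sketch (the sample link is the same one used in the next script):

import requests

baidu_link = 'http://www.baidu.com/link?url=zaOHenZSbFjbJHo18QmDvv4ATubM0W0iUPaYr0Ij69UkVOFgmrKMyGDXozf2D7Mp'
# ask Baidu for the redirect target without following it
resp = requests.get(baidu_link, allow_redirects=False)
print(resp.status_code)               # typically 302 for a redirect
print(resp.headers.get('Location'))   # the real destination URL

With the real URL in hand, we can fetch each page's body text through the r.jina.ai reader service, as the following Python code does: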
import requests

# the Baidu result link to resolve
baidu_link = 'http://www.baidu.com/link?url=zaOHenZSbFjbJHo18QmDvv4ATubM0W0iUPaYr0Ij69UkVOFgmrKMyGDXozf2D7Mp'
# send the request and follow the redirect to the real URL
response = requests.get(baidu_link, allow_redirects=True)
# prefix the resolved URL with the r.jina.ai reader endpoint
real_url = "https://r.jina.ai/" + response.url
# ask the reader service for JSON output
headers = {
    'Accept': 'application/json'
}
try:
    # send the GET request
    response = requests.get(real_url, headers=headers)
    # raise an exception if the status code is not 200
    response.raise_for_status()
    # parse the JSON payload
    json_data = response.json()
    print('JSON Data:', json_data)
except requests.exceptions.HTTPError as http_err:
    print(f'HTTP error occurred: {http_err}')
except requests.exceptions.ConnectionError as conn_err:
    print(f'Connection error occurred: {conn_err}')
except requests.exceptions.Timeout as timeout_err:
    print(f'Timeout error occurred: {timeout_err}')
except requests.exceptions.RequestException as req_err:
    print(f'An error occurred: {req_err}')

Good. Finally, let's put everything together:
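For reference, the combined script below assumes the reader service returns JSON shaped roughly like this; the shape is inferred from the two fields the script actually reads (code and data.content), so verify it against a real response:

# assumed response shape (hypothetical values); only 'code' and the
# nested 'content' field are read by the combined script below
example_response = {
    "code": 200,
    "data": {
        "title": "...",
        "url": "...",
        "content": "extracted body text of the page",
    },
}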
#!/usr/local/python3/bin/python3
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import requests

def fetch_json_from_baidu_link(baidu_link):
    # follow the Baidu redirect to get the real URL
    response = requests.get(baidu_link, allow_redirects=True)
    # prefix the resolved URL with the r.jina.ai reader endpoint
    real_url = "https://r.jina.ai/" + response.url
    # ask the reader service for JSON output
    headers = {
        'Accept': 'application/json'
    }
    try:
        response = requests.get(real_url, headers=headers)
        # raise an exception if the status code is not 200
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except requests.exceptions.ConnectionError as conn_err:
        print(f'Connection error occurred: {conn_err}')
    except requests.exceptions.Timeout as timeout_err:
        print(f'Timeout error occurred: {timeout_err}')
    except requests.exceptions.RequestException as req_err:
        print(f'An error occurred: {req_err}')
    return None

chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_argument("--headless")
chromeOptions.add_argument("--remote-debugging-port=9222")
chromeOptions.add_argument("--no-sandbox")
browser = webdriver.Chrome(service=Service('/usr/bin/chromedriver'), options=chromeOptions)

browser.get("https://www.baidu.com/")  # open Baidu
list_result = []
browser.find_element(By.ID, 'kw').send_keys('ai发展前景')  # type the query into the search box
browser.find_element(By.ID, 'su').click()  # click the search button

try:
    # wait for the result titles to appear
    WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "h3.c-title.t.tts-title"))
    )
    # parse the result page with BeautifulSoup
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    h3_tags = soup.find_all('h3', class_='c-title t t tts-title')
    # resolve each result link and collect the page body
    for h3 in h3_tags:
        a_tag = h3.find('a')
        if a_tag:
            link = a_tag['href']
            title = a_tag.get_text(strip=True)
            res = fetch_json_from_baidu_link(link)
            # guard against failed fetches before reading the payload
            if res and res.get('code') == 200:
                list_result.append(res['data']['content'])
    print(list_result)
finally:
    # close the browser exactly once
    browser.quit()
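Once list_result is filled, you will usually want to merge the collected page bodies into a single document, for example as input to a summarizer. A minimal sketch (summary_input.txt is just an example file name):

# join the collected bodies into one text blob with visible separators
merged = '\n\n----\n\n'.join(list_result)
with open('summary_input.txt', 'w', encoding='utf-8') as f:
    f.write(merged)
print(f'Saved {len(list_result)} article bodies to summary_input.txt')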