We can use Selenium to scrape a search engine's results. Taking Baidu as an example, here is the script:

#!/usr/local/python3/bin/python3
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--remote-debugging-port=9222")
chrome_options.add_argument("--no-sandbox")
browser = webdriver.Chrome(service=Service('/usr/bin/chromedriver'), options=chrome_options)
browser.get("https://www.baidu.com/")  # Open the Baidu homepage

browser.find_element(By.ID, 'kw').send_keys('ai发展前景')  # Type the query into the search box
browser.find_element(By.ID, 'su').click()  # Click the search button
try:
    sleep(1)
    # Alternatively, wait explicitly for the <h3 class="c-title t t tts-title"> tags to appear:
    # WebDriverWait(browser, 10).until(
    #     EC.presence_of_element_located((By.CSS_SELECTOR, "h3.c-title.t.tts-title"))
    # )
    # Grab the rendered page source
    html_content = browser.page_source
    # Parse it with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find all result-title <h3> tags
    h3_tags = soup.find_all('h3', class_='c-title t t tts-title')
    # Extract the <a> link and title from each one
    for h3 in h3_tags:
        a_tag = h3.find('a')
        if a_tag:
            link = a_tag['href']
            title = a_tag.get_text(strip=True)
            print(f'Link: {link}')
            print(f'Title: {title}')
            print('-' * 50)
finally:
    # Close the browser
    browser.quit()
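If the fixed sleep(1) turns out to be flaky, the explicit wait hinted at in the comments is the more robust option. A minimal sketch (the CSS selector mirrors the h3 class used above and may need updating whenever Baidu changes its markup):

# Wait up to 10 seconds for at least one result title to render before parsing
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "h3.c-title.t.tts-title"))
)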
After running this, we still need to fetch the body text behind each link and aggregate it all. We can do that with the following Python code:
import requests

# A Baidu redirect link taken from the search results above
baidu_link = 'http://www.baidu.com/link?url=zaOHenZSbFjbJHo18QmDvv4ATubM0W0iUPaYr0Ij69UkVOFgmrKMyGDXozf2D7Mp'
# Send a request and follow the redirect to the real URL
response = requests.get(baidu_link, allow_redirects=True)
# Prefix the real URL with the r.jina.ai reader service
real_url = "https://r.jina.ai/" + response.url
# Ask the service for JSON output
headers = {
    'Accept': 'application/json'
}
try:
    # Send the GET request
    response = requests.get(real_url, headers=headers, timeout=30)
    # Raise an exception for any non-2xx status code
    response.raise_for_status()
    # Parse the JSON payload
    json_data = response.json()
    print('JSON Data:', json_data)
except requests.exceptions.HTTPError as http_err:
    print(f'HTTP error occurred: {http_err}')
except requests.exceptions.ConnectionError as conn_err:
    print(f'Connection error occurred: {conn_err}')
except requests.exceptions.Timeout as timeout_err:
    print(f'Timeout error occurred: {timeout_err}')
except requests.exceptions.RequestException as req_err:
    print(f'An error occurred: {req_err}')
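As a side note, downloading the whole target page just to learn its real URL is wasteful. A minimal sketch of reading the redirect target from the Location header instead, under the assumption that Baidu answers /link?url=... with an HTTP 3xx redirect:

response = requests.get(baidu_link, allow_redirects=False)
# A 3xx response carries the destination URL in the Location header;
# fall back to the original link if no redirect happened
real_url = response.headers.get('Location', baidu_link)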
OK, with that working, let's put everything together:

#!/usr/local/python3/bin/python3
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
def fetch_json_from_baidu_link(baidu_link):
    # Send a request and follow the redirect to the real URL
    response = requests.get(baidu_link, allow_redirects=True)
    # Prefix the real URL with the r.jina.ai reader service
    real_url = "https://r.jina.ai/" + response.url
    # Ask the service for JSON output
    headers = {
        'Accept': 'application/json'
    }
    try:
        # Send the GET request
        response = requests.get(real_url, headers=headers, timeout=30)
        # Raise an exception for any non-2xx status code
        response.raise_for_status()
        # Parse and return the JSON payload
        json_data = response.json()
        return json_data
    except requests.exceptions.HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except requests.exceptions.ConnectionError as conn_err:
        print(f'Connection error occurred: {conn_err}')
    except requests.exceptions.Timeout as timeout_err:
        print(f'Timeout error occurred: {timeout_err}')
    except requests.exceptions.RequestException as req_err:
        print(f'An error occurred: {req_err}')
    return None
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--remote-debugging-port=9222")
chrome_options.add_argument("--no-sandbox")
browser = webdriver.Chrome(service=Service('/usr/bin/chromedriver'), options=chrome_options)
browser.get("https://www.baidu.com/")  # Open the Baidu homepage

list_result = []
browser.find_element(By.ID, 'kw').send_keys('ai发展前景')  # Type the query into the search box
browser.find_element(By.ID, 'su').click()  # Click the search button
try:
    sleep(1)
    # Alternatively, wait explicitly for the <h3 class="c-title t t tts-title"> tags to appear:
    # WebDriverWait(browser, 10).until(
    #     EC.presence_of_element_located((By.CSS_SELECTOR, "h3.c-title.t.tts-title"))
    # )
    # Grab the rendered page source
    html_content = browser.page_source
    # Parse it with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find all result-title <h3> tags
    h3_tags = soup.find_all('h3', class_='c-title t t tts-title')
    # Extract the <a> link from each result and fetch its body text
    for h3 in h3_tags:
        a_tag = h3.find('a')
        if a_tag:
            link = a_tag['href']
            title = a_tag.get_text(strip=True)
            res = fetch_json_from_baidu_link(link)
            # Guard against failed requests (res is None) before indexing
            if res and res.get('code') == 200:
                list_result.append(res['data']['content'])
    print(list_result)
finally:
    # Close the browser
    browser.quit()
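At this point list_result holds the extracted passages. A minimal sketch of one way to persist the aggregate (the summary.txt file name and the blank-line separator are arbitrary choices, not part of the original script):

# Join the collected passages and write them to disk
summary = '\n\n'.join(list_result)
with open('summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary)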