可以使用 Selenium 爬取搜索引擎的搜索结果,我们以百度为例,下面是示例代码:
#!/usr/local/python3/bin/python3 # -*- coding: utf-8 -* from selenium import webdriver from selenium.webdriver.common.keys import Keys from time import sleep from bs4 import BeautifulSoup from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC chromeOptions = webdriver.ChromeOptions() chromeOptions.add_argument("--headless") chromeOptions.add_argument("--remote-debugging-port=9222") chromeOptions.add_argument('--no-sandbox') browser = webdriver.Chrome('/usr/bin/chromedriver',chrome_options=chromeOptions) browser.get("https://www.baidu.com/") #进入相关网站 #保存网站截屏图片 browser.find_element_by_id('kw').send_keys('ai发展前景 ',Keys.RETURN) # 输入框 browser.find_element_by_id('su').click() # 点击搜索按钮 try: sleep(1) # 等待<h3 class="c-title t t tts-title">标签出现 # element = WebDriverWait(browser, 10).until( # EC.presence_of_element_located((By.CSS_SELECTOR, "h3.c-title.t.tts-title")) # ) # 获取网页内容 html_content = browser.page_source # 使用BeautifulSoup解析网页内容 soup = BeautifulSoup(html_content, 'html.parser') # 查找所有的<h3>标签 h3_tags = soup.find_all('h3', class_='c-title t t tts-title') # 提取<a>链接和标题 for h3 in h3_tags: a_tag = h3.find('a') if a_tag: link = a_tag['href'] title = a_tag.get_text(strip=True) print(f'Link: {link}') print(f'Title: {title}') print('-' * 50) # 关闭WebDriver browser.quit() finally: # 关闭浏览器 browser.quit()运行完后,我们还要解析清晰获取每个链接中的正文,然后汇总起来,我们可以通过以下这个python代码获取:
点击查看全文
import requests # 百度的链接 baidu_link = 'http://www.baidu.com/link?url=zaOHenZSbFjbJHo18QmDvv4ATubM0W0iUPaYr0Ij69UkVOFgmrKMyGDXozf2D7Mp' # 发送请求并获取重定向后的URL response = requests.get(baidu_link, allow_redirects=True) # 获取重定向后的真实URL real_url ="https://r.jina.ai/" + response.url # 设置请求头 headers = { 'Accept': 'application/json' } try: # 发送GET请求 response = requests.get(real_url, headers=headers) # 检查请求是否成功(状态码200表示成功) response.raise_for_status() # 如果状态码不是200,会抛出异常 # 获取JSON数据 json_data = response.json() print('JSON Data:', json_data) except requests.exceptions.HTTPError as http_err: print(f'HTTP error occurred: {http_err}') except requests.exceptions.ConnectionError as conn_err: print(f'Connection error occurred: {conn_err}') except requests.exceptions.Timeout as timeout_err: print(f'Timeout error occurred: {timeout_err}') except requests.exceptions.RequestException as req_err: print(f'An error occurred: {req_err}')好了,最后我们汇总到一起
#!/usr/local/python3/bin/python3
# -*- coding: utf-8 -*-
"""Search Baidu with headless Chrome, then fetch each result's article
body via the r.jina.ai reader API and collect the texts in list_result."""
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests


def fetch_json_from_baidu_link(baidu_link):
    """Resolve a Baidu redirect link and fetch the target page as JSON.

    Parameters
    ----------
    baidu_link : str
        A 'http://www.baidu.com/link?url=...' redirect URL.

    Returns
    -------
    dict or None
        Parsed JSON from the r.jina.ai reader service, or None on any
        request error (errors are printed, not raised).
    """
    # Follow the redirect to find the real article URL.
    response = requests.get(baidu_link, allow_redirects=True, timeout=10)
    real_url = "https://r.jina.ai/" + response.url
    headers = {'Accept': 'application/json'}
    try:
        # Explicit timeout so the Timeout handler below can actually fire;
        # the original never set one and could hang indefinitely.
        response = requests.get(real_url, headers=headers, timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    # Timeout before ConnectionError: ConnectTimeout subclasses both.
    except requests.exceptions.Timeout as timeout_err:
        print(f'Timeout error occurred: {timeout_err}')
    except requests.exceptions.ConnectionError as conn_err:
        print(f'Connection error occurred: {conn_err}')
    except requests.exceptions.RequestException as req_err:
        print(f'An error occurred: {req_err}')
    return None


chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_argument("--headless")
chromeOptions.add_argument("--remote-debugging-port=9222")
chromeOptions.add_argument('--no-sandbox')
# Selenium 4: Service + options= replace the removed positional path and
# chrome_options= keyword.
browser = webdriver.Chrome(
    service=Service('/usr/bin/chromedriver'),
    options=chromeOptions,
)

list_result = []
try:
    browser.get("https://www.baidu.com/")
    # Keys.RETURN submits the query; the extra click on 'su' is dropped
    # because the page has already navigated by then.
    browser.find_element(By.ID, 'kw').send_keys('ai发展前景 ', Keys.RETURN)
    # Explicit wait for result titles instead of a fixed sleep(1).
    WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "h3.c-title"))
    )
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    for h3 in soup.find_all('h3', class_='c-title t t tts-title'):
        a_tag = h3.find('a')
        if not a_tag:
            continue
        res = fetch_json_from_baidu_link(a_tag['href'])
        # Guard against failure: fetch_json_from_baidu_link returns None
        # on any error, and the original crashed with a TypeError by
        # indexing res['code'] unconditionally.
        if res and res.get('code') == 200:
            list_result.append(res['data']['content'])
    print(list_result)
finally:
    # Quit exactly once; the original called quit() in both the try body
    # and the finally block.
    browser.quit()
网友回复
python如何实现torrent的服务端进行文件分发p2p下载?
如何在浏览器中录制摄像头和麦克风数据为mp4视频保存下载本地?
go如何编写一个类似docker的linux的虚拟容器?
python如何写一个bittorrent的种子下载客户端?
ai能通过看一个网页的交互过程视频自主模仿复制网页编写代码吗?
ai先写功能代码通过chrome mcp来进行测试功能最后ai美化页面这个流程能行吗?
vue在手机端上下拖拽元素的时候如何禁止父元素及body的滚动导致无法拖拽完成?
使用tailwindcss如何去掉响应式自适应?
有没有直接在浏览器中运行的离线linux系统?
nginx如何保留post或get数据进行url重定向?