Python 实现网络请求与爬虫

获取网页内容

import requests

# Fetch a web page and print its raw HTML.
# Always pass a timeout: requests.get() without one can block forever
# if the server accepts the connection but never responds.
response = requests.get('https://www.example.com', timeout=10)
print(response.text)

解析 HTML 页面

from bs4 import BeautifulSoup

# Parse the previously fetched page and print the text of every <h1> tag.
# NOTE(review): assumes `response` was produced by an earlier snippet.
soup = BeautifulSoup(response.text, 'html.parser')
for heading in soup.find_all('h1'):
    print(heading.text)

下载图片

import requests

# Download an image and save it to disk.
# timeout prevents an unresponsive server from hanging the script.
img_data = requests.get('http://example.com/image.jpg', timeout=10).content
# Open in binary mode ('wb'): image bytes must not go through text decoding.
with open('image.jpg', 'wb') as handler:
    handler.write(img_data)

发送 HTTP POST 请求

import requests

# Send form-encoded data with an HTTP POST and print the echoed response.
payload = {'key1': 'value1', 'key2': 'value2'}
# data= sends application/x-www-form-urlencoded; timeout avoids hanging forever.
response = requests.post('https://httpbin.org/post', data=payload, timeout=10)
print(response.text)

处理 JSON 响应

import requests

# Fetch a JSON API endpoint and decode the body into Python objects.
# timeout avoids an indefinite hang on an unresponsive server.
response = requests.get('https://api.example.com/data', timeout=10)
# .json() parses the response body; raises ValueError if it is not valid JSON.
data = response.json()
print(data)

设置超时时间

import requests

# Abort the request if the server does not respond within 5 seconds.
try:
    response = requests.get('https://www.example.com', timeout=5)
except requests.Timeout:
    print("The request timed out")

处理异常

import requests

# Layered error handling for an HTTP request.
try:
    # timeout added: without it a dead server would hang this "robust" example.
    response = requests.get('https://www.example.com', timeout=10)
    # Turn 4xx/5xx status codes into a requests.HTTPError exception.
    response.raise_for_status()
except requests.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
except Exception as err:
    # Broad catch is acceptable at this top-level boundary; the error is reported.
    print(f"Other error occurred: {err}")

使用会话保持连接

import requests

# A Session reuses the underlying TCP connection across requests (faster
# for repeated calls to the same host) and holds OS resources, so it must
# be closed — the with-block guarantees cleanup even if an exception occurs.
with requests.Session() as session:
    response = session.get('https://www.example.com', timeout=10)
    print(response.text)

获取响应头信息

import requests

# Inspect the HTTP response headers (a case-insensitive dict-like object).
# timeout avoids an indefinite hang on an unresponsive server.
response = requests.get('https://www.example.com', timeout=10)
print(response.headers)

设置自定义请求头

import requests

# Send a request with a custom User-Agent header (some sites reject the
# default python-requests UA). timeout avoids an indefinite hang.
headers = {'User-Agent': 'MyApp/1.0'}
response = requests.get('https://www.example.com', headers=headers, timeout=10)
print(response.text)

comment: