Python has a rich ecosystem of web-scraping libraries. Here are some of the most commonly used ones:
1. requests

import requests

response = requests.get('https://example.com')
print(response.text)
2. BeautifulSoup

Parsing the response fetched above:

from bs4 import BeautifulSoup

soup = BeautifulSoup(response.text, 'html.parser')
print(soup.title.text)
3. lxml

from lxml import etree

tree = etree.HTML(response.text)
title = tree.xpath('//title/text()')
print(title)
4. Selenium

from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://example.com')
print(driver.page_source)
driver.quit()
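Selenium opens a visible browser window by default; on servers it is common to run Chrome headless instead. A minimal sketch, assuming Chrome and a matching driver are installed:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')  # render pages without opening a window
driver = webdriver.Chrome(options=options)
driver.get('https://example.com')
print(driver.page_source)
driver.quit()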
5. Scrapy

Scrapy is a full-featured crawling framework driven from the command line; create a project and run a spider like so (a minimal spider definition is sketched below):

scrapy startproject myproject
scrapy crawl myspider
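The crawl command assumes a spider named myspider exists inside the project. A minimal sketch of one (start_urls and the extracted field are illustrative):

import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'  # the name used by `scrapy crawl myspider`
    start_urls = ['https://example.com']

    def parse(self, response):
        # extract the page title with Scrapy's built-in CSS selectors
        yield {'title': response.css('title::text').get()}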
6. PyQuery

from pyquery import PyQuery as pq

doc = pq(response.text)
print(doc('title').text())
7. aiohttp

import aiohttp
import asyncio

async def fetch(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()

html = asyncio.run(fetch('https://example.com'))
print(html)
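aiohttp's real payoff is fetching many pages concurrently; a minimal sketch with asyncio.gather (the URL list is illustrative):

import aiohttp
import asyncio

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def fetch_all(urls):
    # one shared session, all requests in flight concurrently
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, url) for url in urls))

pages = asyncio.run(fetch_all(['https://example.com', 'https://example.org']))
print(len(pages))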
8. XPath extraction with lxml

lxml's XPath is also handy for targeted extraction, for example pulling the text of elements with a given class attribute:

from lxml import etree

html = etree.HTML(response.text)
result = html.xpath('//div[@class="example"]/text()')
print(result)
9. Pyppeteer

import asyncio
from pyppeteer import launch

async def main():
    browser = await launch()
    page = await browser.newPage()
    await page.goto('https://example.com')
    print(await page.content())
    await browser.close()

asyncio.run(main())
10. httpx

import httpx
import asyncio

async def main():
    async with httpx.AsyncClient() as client:
        r = await client.get('https://example.com')
        print(r.text)

asyncio.run(main())
11. Twisted

from twisted.internet import reactor
from twisted.web.client import getPage

def print_response(response):
    print(response)
    reactor.stop()

# getPage is deprecated in modern Twisted; see the Agent sketch below
d = getPage(b'https://example.com')
d.addCallback(print_response)
reactor.run()
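Since getPage has long been deprecated and is absent from recent Twisted releases, here is a sketch of the same fetch with the current Agent API, assuming a modern Twisted version:

from twisted.internet import reactor
from twisted.web.client import Agent, readBody

def print_body(body):
    print(body.decode('utf-8'))
    reactor.stop()

agent = Agent(reactor)
d = agent.request(b'GET', b'https://example.com')
d.addCallback(readBody)    # collect the full response body as bytes
d.addCallback(print_body)
reactor.run()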
12. PhantomJS

PhantomJS is a scriptable headless browser driven by JavaScript files from the command line. Its development has been discontinued, so headless Chrome via Selenium or Pyppeteer is usually the better choice today:

phantomjs my_script.js
13. Proxies and request headers

To reduce the chance of being blocked, requests can route traffic through a proxy and send a browser-like User-Agent (the proxy addresses below are placeholders):

import requests

proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
response = requests.get('https://example.com', proxies=proxies, headers=headers)
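A common refinement is rotating the User-Agent between requests. A minimal sketch, assuming a hand-maintained pool of UA strings (the two listed here are only examples):

import random
import requests

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
]

session = requests.Session()
# pick a random User-Agent for this session
session.headers['User-Agent'] = random.choice(USER_AGENTS)
response = session.get('https://example.com')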
14. Crawlab

Crawlab is a web-based platform for managing and scheduling crawlers; the quickest way to try it is Docker, after which the UI is served on port 8080:

docker run -d -p 8080:8080 --name crawlab crawlabteam/crawlab
15. Splash

Splash is a JavaScript rendering service, also easiest to run via Docker:

docker run -p 8050:8050 scrapinghub/splash

It can then be integrated into Scrapy through scrapy-splash:

import scrapy
from scrapy_splash import SplashRequest

class MySpider(scrapy.Spider):
    name = 'myspider'  # Scrapy requires every spider to have a name

    def start_requests(self):
        yield SplashRequest(url='https://example.com', callback=self.parse)

    def parse(self, response):
        # response here contains the JavaScript-rendered HTML
        print(response.css('title::text').get())
16. pytesseract

For simple image captchas, the Tesseract OCR engine can be driven from Python:

from PIL import Image
import pytesseract

img = Image.open('captcha.png')
text = pytesseract.image_to_string(img)
print(text)
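Recognition accuracy often improves with a little preprocessing. A minimal sketch that grayscales and binarizes the image with Pillow first (the 128 threshold is an illustrative value to tune):

from PIL import Image
import pytesseract

img = Image.open('captcha.png').convert('L')      # grayscale
img = img.point(lambda p: 255 if p > 128 else 0)  # binarize with a fixed threshold
text = pytesseract.image_to_string(img)
print(text)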
17. Faker

Faker generates realistic fake data such as names, addresses, and emails:

from faker import Faker

fake = Faker()
print(fake.name())
print(fake.address())
print(fake.email())
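In a scraping context, Faker's user_agent provider pairs naturally with requests; a small sketch:

import requests
from faker import Faker

fake = Faker()
# send the request with a freshly generated User-Agent string
headers = {'User-Agent': fake.user_agent()}
response = requests.get('https://example.com', headers=headers)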
18. Pyppeteer in headless mode

The Pyppeteer example from earlier, with headless mode made explicit:

import asyncio
from pyppeteer import launch

async def main():
    browser = await launch(headless=True)  # run Chromium without a visible window
    page = await browser.newPage()
    await page.goto('https://example.com')
    print(await page.content())
    await browser.close()

asyncio.run(main())
The libraries and tools above cover static page scraping, dynamic page rendering, captcha recognition, proxy usage, and more. Pick a combination that fits the target site and your needs: for static pages, requests + BeautifulSoup is usually enough, while complex dynamic pages call for Selenium or Pyppeteer.
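As an illustration of the static-page combination just mentioned, a minimal requests + BeautifulSoup scraper that prints the title and all link targets:

import requests
from bs4 import BeautifulSoup

response = requests.get('https://example.com')
response.raise_for_status()  # fail fast on HTTP errors

soup = BeautifulSoup(response.text, 'html.parser')
print(soup.title.text)
for link in soup.find_all('a'):
    print(link.get('href'))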