# Amazon product-page scraper using requests + BeautifulSoup.
import requests
from bs4 import BeautifulSoup
import time
import random
import re
from urllib.parse import urljoin, urlparse
import json
class AmazonScraper:
    """Scrape product data from Amazon product pages.

    Keeps a persistent ``requests.Session`` with browser-like headers and
    inserts a randomized delay between requests to reduce the chance of
    being rate-limited.

    NOTE(review): Amazon's markup changes frequently -- the CSS selectors
    below are best-effort and may need updating; scraping may also violate
    the site's Terms of Service.
    """

    def __init__(self):
        self.session = requests.Session()
        self.set_headers()

    def set_headers(self):
        """Install browser-like default headers on the session."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        self.session.headers.update(self.headers)

    def random_delay(self):
        """Sleep 2-4 seconds to space out consecutive requests."""
        time.sleep(random.uniform(2, 4))

    def scrape_product_page(self, url):
        """Fetch one product page and return a dict of extracted fields.

        Args:
            url: Full product-page URL (e.g. ``https://www.amazon.com/dp/...``).

        Returns:
            dict with url, timestamp, title, price, availability, rating,
            review_count, description, images and specifications keys, or
            ``None`` if the HTTP request fails.
        """
        # Keep the try body minimal: only the network call can raise
        # RequestException; parsing happens after the fetch succeeds.
        try:
            self.random_delay()
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        return {
            'url': url,
            'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
            'title': self.extract_title(soup),
            'price': self.extract_price(soup),
            'availability': self.extract_availability(soup),
            'rating': self.extract_rating(soup),
            'review_count': self.extract_review_count(soup),
            'description': self.extract_description(soup),
            'images': self.extract_images(soup),
            'specifications': self.extract_specifications(soup),
        }

    def extract_title(self, soup):
        """Return the product title text, or None if no selector matches."""
        # Several layouts use different title markup; try each in turn.
        selectors = [
            '#productTitle',
            'h1.a-size-large',
            '.a-size-medium.a-spacing-none',
        ]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text().strip()
        return None

    def extract_price(self, soup):
        """Return the numeric price as a string (e.g. '1,299.99'), or None."""
        price_selectors = [
            '.a-price-whole',
            '.a-price .a-offscreen',
            '#priceblock_dealprice',
            '#priceblock_ourprice',
        ]
        for selector in price_selectors:
            element = soup.select_one(selector)
            if element:
                price_text = element.get_text().strip()
                # Strip currency symbols and surrounding text, keeping
                # digits, thousands separators and an optional decimal part.
                price = re.search(r'[\d,]+\.?\d*', price_text)
                if price:
                    return price.group()
        return None

    def extract_rating(self, soup):
        """Return the star rating as a string like '4.5', or None."""
        rating_element = soup.select_one('.a-icon-alt')
        if rating_element:
            # Text is typically of the form "4.5 out of 5 stars".
            rating_text = rating_element.get_text()
            match = re.search(r'(\d+\.\d+)', rating_text)
            if match:
                return match.group(1)
        return None

    def extract_review_count(self, soup):
        """Return the number of customer reviews as an int, or None."""
        review_element = soup.select_one('#acrCustomerReviewText')
        if review_element:
            # Text looks like "1,234 ratings"; drop commas before parsing.
            count_text = review_element.get_text()
            numbers = re.findall(r'\d+', count_text.replace(',', ''))
            if numbers:
                return int(numbers[0])
        return None

    def extract_availability(self, soup):
        """Return the availability message; defaults to 'Available'."""
        availability_selectors = [
            '#availability .a-color-success',
            '#availability .a-color-price',
            '#outOfStock',
        ]
        for selector in availability_selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text().strip()
        return "Available"

    def extract_description(self, soup):
        """Return up to 1000 chars of product description text, or None."""
        description_selectors = [
            '#productDescription',
            '#feature-bullets',
            '.a-plus-content',
        ]
        for selector in description_selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text().strip()[:1000]  # Limit length
        return None

    def extract_images(self, soup):
        """Return a list of absolute image URLs found on the page."""
        images = []
        image_elements = soup.select('#landingImage, .a-dynamic-image')
        for img in image_elements:
            # Lazy-loaded images keep the real URL in data-src.
            src = img.get('src') or img.get('data-src')
            # Fix: require the URL to *start* with http(s); the original
            # 'http' in src also matched "http" embedded anywhere, e.g.
            # inside a data: URI or a query parameter.
            if src and src.startswith('http'):
                images.append(src)
        return images

    def extract_specifications(self, soup):
        """Return {spec name: value} from the technical-details table."""
        specs = {}
        table = soup.select_one('#productDetails_techSpec_section_1')
        if table:
            for row in table.select('tr'):
                th = row.select_one('th')
                td = row.select_one('td')
                if th and td:
                    specs[th.get_text().strip()] = td.get_text().strip()
        return specs
# Usage example.
# Guarded so that importing this module does not fire a live network
# request as a side effect; the scrape only runs when executed directly.
if __name__ == "__main__":
    scraper = AmazonScraper()
    product_data = scraper.scrape_product_page('https://www.amazon.com/dp/B08N5WRWNW')
    print(json.dumps(product_data, indent=2))