from bs4 import BeautifulSoup
import requests

# Fetch a page and print the cell text of every row in its first HTML table.
url = "https://example.com/table"
response = requests.get(url)
response.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page

soup = BeautifulSoup(response.content, "html.parser")
table = soup.find("table")
if table is None:
    # The original called table.find_all(...) unconditionally, which raises an
    # opaque AttributeError on pages without a <table>.
    raise ValueError(f"No <table> element found at {url}")

for row in table.find_all("tr"):
    cells = row.find_all("td")
    print([cell.text.strip() for cell in cells])
# Interactive exploration with the Scrapy shell: run the first line in a
# terminal; the remaining lines are typed at the shell's Python prompt,
# where `response` is predefined for the fetched URL.
scrapy shell "https://example.com"
response.css('title::text').get() # Get the page title
response.css('a::attr(href)').getall() # Get all links
import scrapy


class ExampleSpider(scrapy.Spider):
    """Minimal spider that prints the <title> text of each start URL."""

    name = "example"
    start_urls = ["https://example.com"]

    def parse(self, response):
        """Extract the page title from the response and echo it to stdout."""
        page_title = response.css("title::text").get()
        print(f"Page Title: {page_title}")
import scrapy


class QuotesSpider(scrapy.Spider):
    """Spider yielding the text and author of every quote on the page."""

    name = "quotes"
    start_urls = ["http://quotes.toscrape.com"]

    def parse(self, response):
        """Yield one dict per div.quote element on the response page."""
        for quote_sel in response.css("div.quote"):
            item = {
                "text": quote_sel.css("span.text::text").get(),
                "author": quote_sel.css("span small.author::text").get(),
            }
            yield item

# Save data by running: scrapy crawl quotes -o quotes.json
from bs4 import BeautifulSoup
import requests
import os
from urllib.parse import urljoin

# Download every image referenced on a page into a local "images" directory.
url = "https://example.com"
response = requests.get(url)
response.raise_for_status()  # don't try to parse an HTTP error body

soup = BeautifulSoup(response.content, "html.parser")
# Resolve each src against the page URL: the original passed the raw attribute
# to requests.get, which fails for relative paths like "/static/logo.png".
images = [urljoin(url, img["src"]) for img in soup.find_all("img", src=True)]

os.makedirs("images", exist_ok=True)
for i, img_url in enumerate(images):
    img_data = requests.get(img_url).content
    # NOTE(review): extension is hard-coded to .jpg; the source may be
    # .png/.gif/.webp — derive from the URL or Content-Type if fidelity matters.
    with open(f"images/image_{i}.jpg", "wb") as img_file:
        img_file.write(img_data)
import scrapy


class PaginationSpider(scrapy.Spider):
    """Spider that walks quotes.toscrape.com page by page via the "next" link."""

    name = "pagination"
    start_urls = ["http://quotes.toscrape.com/page/1/"]

    def parse(self, response):
        """Yield quote items from this page, then follow pagination if present."""
        for quote_sel in response.css("div.quote"):
            yield {
                "text": quote_sel.css("span.text::text").get(),
                "author": quote_sel.css("span small.author::text").get(),
            }

        # response.follow resolves the (possibly relative) href for us.
        next_href = response.css("li.next a::attr(href)").get()
        if next_href:
            yield response.follow(next_href, self.parse)
from bs4 import BeautifulSoup
import requests

# Collect and print all named <meta> tags (name -> content) on the page.
url = "https://example.com"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

metadata = {}
# Only tags carrying both a name and a content attribute are considered;
# duplicate names keep the last occurrence, same as the original comprehension.
for tag in soup.find_all("meta", attrs={"name": True, "content": True}):
    metadata[tag["name"]] = tag["content"]

print("Metadata:", metadata)
import requests

# Fetch a JSON collection from an API and print each record's name/value pair.
url = "https://api.example.com/data"
response = requests.get(url)
# Surface HTTP failures explicitly; without this, .json() dies with an opaque
# JSONDecodeError when the server returns an HTML error page.
response.raise_for_status()

data = response.json()
# assumes the endpoint returns a list of objects with "name"/"value" keys — TODO confirm
for item in data:
    print(item["name"], item["value"])