# Beautiful Soup

***

### 1. Scraping Websites

```python
import requests
from bs4 import BeautifulSoup

# Fetch and parse the HTML of a website (a timeout avoids hanging forever)
url = 'https://example.com'
html = requests.get(url, timeout=10).text
soup = BeautifulSoup(html, 'html.parser')

# Find all links on the page
links = soup.find_all('a')

# Print the text of each link
for link in links:
    print(link.text)
```

### 2. Parsing XML

```python
from bs4 import BeautifulSoup

# Parse the XML of a document
# NOTE: the 'xml' parser requires the lxml package to be installed
xml = '<document><title>Example</title><body>Hello, world!</body></document>'
soup = BeautifulSoup(xml, 'xml')

# Find the title of the document (.string is the tag's text content)
title = soup.title.string

# Print the title
print(title)  # Output: Example
```

### 3. Cleaning Data

```python
from bs4 import BeautifulSoup

# Parse the HTML of a website
html = '<html><body><h1>Example</h1><script>alert("Hello, world!");</script></body></html>'
soup = BeautifulSoup(html, 'html.parser')

# Remove all script tags from the soup
# (decompose() deletes the tag and everything inside it from the tree)
for script in soup.find_all('script'):
    script.decompose()

# Print the cleaned HTML
print(soup)  # Output: <html><body><h1>Example</h1></body></html>
```

### 4. Extracting Attributes

```python
from bs4 import BeautifulSoup

# Parse the HTML of a website
html = '<a href="https://example.com">Example</a>'
soup = BeautifulSoup(html, 'html.parser')

# Find the link's href attribute (tags support dict-style attribute access;
# use soup.a.get('href') if the attribute might be missing)
href = soup.a['href']

# Print the href attribute
print(href)  # Output: https://example.com
```

### 5. Traversing the DOM

```python
from bs4 import BeautifulSoup

# Parse the HTML of a website
html = '<div><p>Example</p><ul><li>Item 1</li><li>Item 2</li></ul></div>'
soup = BeautifulSoup(html, 'html.parser')

# Find the first paragraph
paragraph = soup.find('p')

# Find the parent of the paragraph (the div)
parent = paragraph.parent

# Find the next sibling tag of the paragraph (the ul).
# find_next_sibling() skips whitespace text nodes, which .next_sibling
# would return if the markup contained newlines between tags.
sibling = paragraph.find_next_sibling()

# Print the parent and sibling
print(parent)  # Output: <div><p>Example</p><ul><li>Item 1</li><li>Item 2</li></ul></div>
print(sibling)  # Output: <ul><li>Item 1</li><li>Item 2</li></ul>
```

### 6. Searching with CSS Selectors

```python
from bs4 import BeautifulSoup

# Parse the HTML of a website
html = '<div class="example"><h1>Example</h1></div>'
soup = BeautifulSoup(html, 'html.parser')

# Find all elements with the class "example" (select() takes a CSS selector)
elements = soup.select('.example')

# Print the first element
print(elements[0])  # Output: <div class="example"><h1>Example</h1></div>
```

### 7. Scraping Tables

```python
from bs4 import BeautifulSoup

# Parse the HTML of a website
html = '<table><thead><tr><th>Name</th><th>Age</th></tr></thead><tbody><tr><td>John</td><td>30</td></tr></tbody></table>'
soup = BeautifulSoup(html, 'html.parser')

# Find the table
table = soup.find('table')

# Iterate over the rows and print the data.
# Skip rows with no <td> cells: the <thead> row contains only <th>
# header cells, so indexing cells[0] there would raise IndexError.
for row in table.find_all('tr'):
    cells = row.find_all('td')
    if cells:
        print(cells[0].text, cells[1].text)  # Output: John 30
```

### 8. Scraping Nested Data

```python
from bs4 import BeautifulSoup

# Parse the HTML of a website
html = '<div><ul><li>Item 1<ul><li>Subitem 1</li><li>Subitem 2</li></ul></li><li>Item 2</li></ul></div>'
soup = BeautifulSoup(html, 'html.parser')

# Find all list items (find_all descends into nested lists too)
items = soup.find_all('li')

# Iterate over the items and print their text.
# Note: .text of the outer "Item 1" li includes its nested subitems'
# text as well, so the output repeats the subitem strings:
# Item 1Subitem 1Subitem 2 / Subitem 1 / Subitem 2 / Item 2
for item in items:
    print(item.text)
```

### 9. Using Regular Expressions

```python
import re

from bs4 import BeautifulSoup

# Parse the HTML of a website
html = '<div><p>Example</p><p>Example 2</p></div>'
soup = BeautifulSoup(html, 'html.parser')

# Find all paragraphs whose text matches "Example".
# string= replaces the text= keyword, which is deprecated since bs4 4.4.
paragraphs = soup.find_all('p', string=re.compile('Example'))

# Print the paragraphs
for paragraph in paragraphs:
    print(paragraph.text)  # Output: Example\nExample 2
```

### 10. Handling Errors

```python
import requests
from bs4 import BeautifulSoup

# Fetch the HTML of a website, handling network failures explicitly.
# Catch requests.RequestException rather than using a bare except:,
# which would also swallow KeyboardInterrupt and SystemExit.
try:
    html = requests.get('https://example.com', timeout=10).text
except requests.RequestException:
    print("An error occurred while fetching the website.")
    raise SystemExit(1)

# Parse the HTML
soup = BeautifulSoup(html, 'html.parser')
```

### 11. Scraping Dynamic Content

```python
from bs4 import BeautifulSoup
from selenium import webdriver

# Create a Selenium WebDriver object
driver = webdriver.Chrome()
try:
    # Navigate to the website
    driver.get('https://example.com')

    # Wait up to 10 seconds for elements to appear when they are looked up
    driver.implicitly_wait(10)

    # Parse the rendered HTML of the page
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
finally:
    # quit() (not close()) shuts down the browser AND the driver process;
    # close() only closes the current window and can leak the process.
    driver.quit()
```

### 12. Parsing JSON

```python
import json
import re

from bs4 import BeautifulSoup

# Parse the HTML of a website
html = '<div><script>window.data = {"name": "John", "age": 30};</script></div>'
soup = BeautifulSoup(html, 'html.parser')

# Find the script tag with the JSON data
# (string= replaces the deprecated text= keyword)
script = soup.find('script', string=re.compile('window.data = '))

# Extract the JSON payload from the script tag.
# Strip the trailing ';' — json.loads rejects '{"name": ...};' otherwise.
payload = script.string.split('window.data = ')[-1].rstrip().rstrip(';')
data = json.loads(payload)

# Print the JSON data
print(data)  # Output: {'name': 'John', 'age': 30}
```

### 13. Scraping PDF Files

```python
import pdftotext

# Extract the text of every page from a PDF file.
# pdftotext.PDF objects are iterable over per-page strings; there is no
# get_text() method. Use a context manager so the file is always closed.
with open('example.pdf', 'rb') as f:
    pdf = pdftotext.PDF(f)
text = "\n\n".join(pdf)

# The result is plain text, not markup, so process it with string
# methods or the re module — Beautiful Soup expects HTML/XML input.
print(text)
```
