Imports
import bs4
from bs4 import BeautifulSoup, Tag, Comment
from bs4.exceptions import FeatureNotFound, ParserRejectedMarkup
from bs4.dammit import UnicodeDammit
Core Patterns
Parse markup with an explicit parser ✅ Current
from __future__ import annotations

from bs4 import BeautifulSoup

markup = "<html><body><p class='body strikeout'>Hello</p></body></html>"

# Naming the parser explicitly keeps the resulting tree the same in every environment.
soup = BeautifulSoup(markup, "html.parser")

paragraph = soup.find("p")
assert paragraph is not None
print(paragraph.name)  # "p"
print(paragraph.get_text())  # "Hello"
- Always pass a parser name as the second argument, e.g. BeautifulSoup(markup, "html.parser"). Choose "html.parser", "lxml", "html5lib", or "xml"/"lxml-xml" depending on your needs; different parsers can produce different trees for invalid documents.
Parse from a file handle (context manager) ✅ Current
from __future__ import annotations

from pathlib import Path

from bs4 import BeautifulSoup

path = Path("example.html")
path.write_text("<html><body><a href='/x'>Link</a></body></html>", encoding="utf-8")

# Parse while the handle is open; the constructor reads the markup, so the
# finished tree stays usable after the `with` block closes the file.
with path.open("r", encoding="utf-8") as fp:
    soup = BeautifulSoup(fp, "html.parser")

a = soup.find("a")
assert a is not None
print(a.get("href"))  # "/x"
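- Pass an open file handle directly to BeautifulSoup; it reads the handle's entire contents before parsing. A text-mode handle is decoded by Python using the encoding given to open(); a binary-mode ("rb") handle lets Beautiful Soup detect the encoding itself.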
Find elements and navigate relatives ✅ Current
from __future__ import annotations

from typing import Optional

from bs4 import BeautifulSoup, Tag

html_doc = """
<div id="root">
<h1>Title</h1>
<p>First</p>
<p>Second <span>inner</span></p>
</div>
"""
soup = BeautifulSoup(html_doc, "html.parser")

root: Optional[Tag] = soup.find(id="root")
assert root is not None
h1: Optional[Tag] = root.find("h1")
assert h1 is not None

# Navigate: find_next returns the first match that follows h1 in document
# order, which here is the first <p>, not the second.
first_p: Optional[Tag] = h1.find_next("p")
assert first_p is not None
print(first_p.get_text(strip=True))  # "First"

all_ps = root.find_all("p")
print([p.get_text(" ", strip=True) for p in all_ps])  # ["First", "Second inner"]
- Use
find, find_all, and the find_next* / find_previous* / sibling / parent variants for tree navigation.
Work with tag attributes (including multi-valued class) ✅ Current
from __future__ import annotations

from bs4 import BeautifulSoup, Tag

soup = BeautifulSoup("<p id='x' class='body strikeout'></p>", "html.parser")
paragraph = soup.find("p")
assert isinstance(paragraph, Tag)

# A Tag can be read like a dict: by index or with .get().
print(paragraph["id"])  # "x"
print(paragraph.get("id"))  # "x"

# HTML multi-valued attributes such as class come back as lists by default.
print(paragraph["class"])  # ["body", "strikeout"]

# get_attribute_list always returns a list, even for single-valued attributes.
print(paragraph.get_attribute_list("id"))  # ["x"]
print(paragraph.get_attribute_list("class"))  # ["body", "strikeout"]

# Attributes can be added and removed in place.
paragraph["data-role"] = "demo"
del paragraph["id"]
print(paragraph.attrs)  # {'class': ['body', 'strikeout'], 'data-role': 'demo'}
- In HTML mode,
class, rel, etc. are typically stored as list[str]. Use Tag.get_attribute_list(name) to normalize to a list.
Handle text nodes and comments safely ✅ Current
from __future__ import annotations

from bs4 import BeautifulSoup, Comment
from bs4.element import NavigableString


def is_comment(node):
    """Return True when *node* is a Comment."""
    return isinstance(node, Comment)


def is_plain_text(node):
    """Return True when *node* is a NavigableString that is not a Comment."""
    return isinstance(node, NavigableString) and not isinstance(node, Comment)


soup = BeautifulSoup("<p>Hello<!--secret--></p>", "html.parser")
paragraph = soup.find("p")
assert paragraph is not None

# A Comment is a special kind of text node, so it can be found via string=.
comment = paragraph.find(string=is_comment)
assert isinstance(comment, Comment)
print(comment)  # "secret"

# Text nodes are immutable: swap the node out rather than editing it.
text = paragraph.find(string=is_plain_text)
assert isinstance(text, NavigableString)
text.replace_with("Hi")
print(paragraph.get_text())  # "Hi"
- Treat
NavigableString as immutable; use replace_with(...) to change text.
Configuration
- Parser selection (
features):
"html.parser": built-in, decent baseline.
"lxml": fast (requires lxml).
"html5lib": most lenient (slow; requires html5lib).
"xml" / "lxml-xml": XML parsing mode (attribute handling differs from HTML).
parse_only: pass a SoupStrainer (not covered here) to parse only parts of a document for speed/memory; the html5lib parser does not support this and always parses the whole document.
from_encoding / exclude_encodings: hint or restrict encoding detection when input is bytes.
- Large text nodes with lxml: when using an lxml builder and documents may contain a single text node > 10,000,000 bytes, pass
huge_tree=True to BeautifulSoup(...) to avoid lxml security limits truncating the parse.
- Multi-valued attributes:
- Default (HTML):
class/rel etc. become lists.
- To disable list conversion:
BeautifulSoup(markup, "html.parser", multi_valued_attributes=None)
- In XML mode, multi-valued attributes are not enabled by default; you can opt in via
multi_valued_attributes={'*': 'class'}.
Pitfalls
Wrong: Not specifying a parser (inconsistent trees)
from bs4 import BeautifulSoup

broken_markup = "<p><b>badly nested</p></b>"
# No parser named: Beautiful Soup picks the best parser that happens to be
# installed, so the repaired tree can differ from machine to machine.
soup = BeautifulSoup(broken_markup)
print(soup.find("b"))
Right: Choose a parser explicitly
from bs4 import BeautifulSoup

broken_markup = "<p><b>badly nested</p></b>"
# Naming the parser pins how the invalid nesting gets repaired.
soup = BeautifulSoup(broken_markup, "html.parser")
print(soup.find("b"))
Wrong: Treating class as a string in HTML mode
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p class='body strikeout'></p>", "html.parser")
# "class" is multi-valued in HTML mode, so the value is already a list and has
# no .split() method; the next line fails.
tokens = soup.p["class"].split()  # type: ignore[attr-defined]
print(tokens)
Right: Use the list directly (or normalize with get_attribute_list)
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p class='body strikeout'></p>", "html.parser")
paragraph = soup.p

# The class value is already split into a list.
class_names = paragraph["class"]
print(class_names)  # ["body", "strikeout"]

# get_attribute_list normalizes any attribute lookup to a list.
id_values = paragraph.get_attribute_list("id")
print(id_values)  # []
Wrong: Assuming multi-valued attributes exist in XML mode
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p class='body strikeout'></p>", "xml")
# XML mode keeps "class" as one plain string by default, so [0] picks its
# first character rather than its first class name.
class_value = soup.p["class"]
first = class_value[0]
print(first)  # "b" (not "body")
Right: Opt in to multi-valued attributes when parsing XML
from bs4 import BeautifulSoup

# Opt in: treat "class" as a multi-valued attribute on every tag ("*").
soup = BeautifulSoup(
    "<p class='body strikeout'></p>",
    "xml",
    multi_valued_attributes={"*": "class"},
)
first = soup.p["class"][0]
print(first)  # "body"
Wrong: Editing a NavigableString “in place”
from bs4 import BeautifulSoup
from bs4.element import NavigableString

soup = BeautifulSoup("<p>Hello</p>", "html.parser")
node = soup.p.string
assert isinstance(node, NavigableString)
# Rebinding the local name only creates a new, detached string; the parse tree
# still holds the original node.
node = NavigableString("Hi")
print(soup.p.get_text())  # still "Hello"
Right: Replace the existing node with replace_with
from bs4 import BeautifulSoup
from bs4.element import NavigableString

soup = BeautifulSoup("<p>Hello</p>", "html.parser")
node = soup.p.string
assert isinstance(node, NavigableString)
# replace_with swaps the node inside the tree, so the change is visible from soup.
node.replace_with("Hi")
print(soup.p.get_text())  # "Hi"
Wrong: lxml builder truncation with huge text nodes (missing huge_tree=True)
from bs4 import BeautifulSoup

# One text node larger than 10,000,000 bytes: without huge_tree=True, lxml may
# stop early.
oversized_markup = "".join(["<root>", "x" * 11_000_000, "</root>"])
soup = BeautifulSoup(oversized_markup, "lxml")
print(soup.find("root") is not None)
Right: Enable huge tree support when needed
from bs4 import BeautifulSoup

oversized_markup = "".join(["<root>", "x" * 11_000_000, "</root>"])
# huge_tree=True is passed so the oversized text node can be parsed.
soup = BeautifulSoup(oversized_markup, "lxml", huge_tree=True)
print(soup.find("root") is not None)
References