Beautiful Soup parses HTML. CaptchaAI solves CAPTCHAs. Together with requests, they form the fastest scraping stack for CAPTCHA-protected pages — no browser required.
This approach works when the site serves HTML directly (server-side rendered). For JavaScript-heavy SPAs, use Selenium or Playwright instead.
Prerequisites
pip install beautifulsoup4 requests lxml
The workflow
- Fetch the page HTML with requests
- Parse with Beautiful Soup to extract CAPTCHA parameters
- Send parameters to CaptchaAI to solve
- Submit the form with the CAPTCHA token via requests
- Parse the result page with Beautiful Soup
Extracting reCAPTCHA sitekeys with Beautiful Soup
import requests
from bs4 import BeautifulSoup
def extract_recaptcha_sitekey(url):
    """Return the reCAPTCHA sitekey found in the HTML served at ``url``.

    The page is fetched with ``requests`` and parsed with Beautiful Soup.
    Three lookups are tried in order; the first one that succeeds wins:

    1. a ``<div class="g-recaptcha">`` with a non-empty ``data-sitekey``;
    2. the first element of any kind that has a ``data-sitekey`` attribute;
    3. a ``<script src=...>`` whose URL contains ``render=<40-char key>``.

    Returns ``None`` when none of the lookups finds a key.
    """
    import re

    page = requests.get(url, timeout=30)
    doc = BeautifulSoup(page.text, "lxml")

    # Lookup 1: the standard widget container.
    widget = doc.find("div", class_="g-recaptcha")
    if widget is not None:
        key = widget.get("data-sitekey")
        if key:
            return key

    # Lookup 2: any element that carries the attribute.
    tagged = doc.find(attrs={"data-sitekey": True})
    if tagged is not None:
        return tagged["data-sitekey"]

    # Lookup 3: the key passed as ``render=`` in a script URL.
    render_key = re.compile(r"render=([A-Za-z0-9_-]{40})")
    hits = (
        render_key.search(tag["src"])
        for tag in doc.find_all("script", src=True)
    )
    return next((hit.group(1) for hit in hits if hit), None)
# Example: fetch a login page and print whichever sitekey was found
# (prints "Sitekey: None" when extract_recaptcha_sitekey returns None).
sitekey = extract_recaptcha_sitekey("https://example.com/login")
print(f"Sitekey: {sitekey}")
Extracting Turnstile sitekeys
def extract_turnstile_sitekey(url):
    """Extract a Cloudflare Turnstile sitekey from the HTML served at ``url``.

    Lookups are tried in order and the first match is returned:

    1. a ``<div class="cf-turnstile">`` with a non-empty ``data-sitekey``;
    2. any element whose ``data-sitekey`` value starts with ``0x``;
    3. a ``sitekey: "..."`` assignment inside an inline ``<script>``.

    Args:
        url: Address of the page to fetch (30 s request timeout).

    Returns:
        The sitekey string, or ``None`` when no lookup matches.
    """
    import re

    resp = requests.get(url, timeout=30)
    soup = BeautifulSoup(resp.text, "lxml")

    # Method 1: Turnstile div
    turnstile_div = soup.find("div", class_="cf-turnstile")
    if turnstile_div and turnstile_div.get("data-sitekey"):
        return turnstile_div["data-sitekey"]

    # Method 2: Any element with Turnstile sitekey pattern.
    # Every candidate is inspected: a page may carry another widget's
    # data-sitekey ahead of the Turnstile one, and checking only the
    # first element would then miss the Turnstile key entirely.
    for element in soup.find_all(attrs={"data-sitekey": True}):
        sitekey = element["data-sitekey"]
        if sitekey.startswith("0x"):
            return sitekey

    # Method 3: In inline script (pattern compiled once, outside the loop)
    pattern = re.compile(r"sitekey\s*:\s*['\"]([0-9x][A-Za-z0-9_-]+)['\"]")
    for script in soup.find_all("script"):
        if script.string:
            match = pattern.search(script.string)
            if match:
                return match.group(1)
    return None
Extracting form fields
Always extract hidden form fields — they often contain CSRF tokens and other parameters the server expects:
def extract_form_data(soup, form_selector="form"):
    """Collect the pre-filled fields of a form as a ``name -> value`` dict.

    Only ``<input>`` elements of type hidden, text, email and password are
    collected. An input with no ``type`` attribute is treated as a text
    input (the HTML default), and type values are compared
    case-insensitively, so ``type="HIDDEN"`` is recognised too.

    Args:
        soup: Parsed document (or tag) to search.
        form_selector: CSS selector for the form; the first match is used.

    Returns:
        Dict of field names to values (``""`` when an input has no
        ``value``). Empty dict when no form matches. Inputs without a
        ``name`` are skipped. When a hidden and a visible input share a
        name, the visible input's value wins.
    """
    form = soup.select_one(form_selector)
    if not form:
        return {}

    def _input_type(inp):
        # A missing or empty type attribute means a text input in HTML.
        return (inp.get("type") or "text").strip().lower()

    inputs = form.find_all("input")
    data = {}
    # Two passes keep the original precedence: hidden inputs (CSRF tokens,
    # etc.) first, then visible inputs, which may overwrite a shared name.
    for wanted in ({"hidden"}, {"text", "email", "password"}):
        for inp in inputs:
            name = inp.get("name")
            if name and _input_type(inp) in wanted:
                data[name] = inp.get("value", "")
    return data
Complete reCAPTCHA scraping flow
import time
import requests
from bs4 import BeautifulSoup
# Placeholder: replace with your CaptchaAI account key.
API_KEY = "YOUR_API_KEY"


def solve_captcha(method, **params):
    """Submit a CAPTCHA task to CaptchaAI and wait for its solution.

    Args:
        method: Value for the API's ``method`` field
            (e.g. ``"userrecaptcha"`` or ``"base64"``).
        **params: Extra form fields sent with the submission
            (e.g. ``googlekey``, ``pageurl``, ``body``).

    Returns:
        The ``request`` field of the first poll response whose
        ``status`` is 1 (the solution).

    Raises:
        RuntimeError: The submission was rejected, or a poll returned an
            error other than the "not ready yet" marker.
        TimeoutError: No solution after 30 polls spaced 5 seconds apart.
    """
    submit = requests.post(
        "https://ocr.captchaai.com/in.php",
        data={"key": API_KEY, "method": method, "json": 1, **params},
        timeout=30,
    ).json()
    if submit.get("status") != 1:
        raise RuntimeError(f"Submit error: {submit.get('request')}")
    task_id = submit["request"]

    for _ in range(30):
        time.sleep(5)
        result = requests.get(
            "https://ocr.captchaai.com/res.php",
            params={"key": API_KEY, "action": "get", "id": task_id, "json": 1},
            timeout=30,
        ).json()
        if result.get("status") == 1:
            return result["request"]
        # Anything other than the pending marker is a terminal failure;
        # polling on would only end in a misleading timeout.
        if result.get("request") != "CAPCHA_NOT_READY":
            raise RuntimeError(f"Solve error: {result.get('request')}")
    raise TimeoutError("Solve timed out")
def scrape_protected_page(url, credentials=None):
    """Fetch a CAPTCHA-protected form page, solve it, and submit the form.

    The final URL after redirects (``resp.url``) is used both as the
    ``pageurl`` sent to the solver and as the base for resolving the
    form's ``action``.

    Args:
        url: Address of the page containing the form.
        credentials: Optional dict of extra form fields (e.g. username and
            password) merged over the fields extracted from the page.

    Returns:
        ``(result_soup, session)``: the parsed response to the form
        submission and the ``requests.Session`` that made the requests.

    Raises:
        ValueError: No element with a non-empty ``data-sitekey`` was found.
    """
    from urllib.parse import urljoin

    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    })

    # Step 1: Fetch the login page
    resp = session.get(url, timeout=30)
    soup = BeautifulSoup(resp.text, "lxml")
    page_url = resp.url  # where we actually landed after any redirects

    # Step 2: Extract sitekey
    sitekey_el = soup.find(attrs={"data-sitekey": True})
    sitekey = sitekey_el["data-sitekey"] if sitekey_el else None
    if not sitekey:
        raise ValueError("No CAPTCHA sitekey found")
    print(f"Sitekey: {sitekey}")

    # Step 3: Extract form fields (CSRF tokens, etc.)
    form_data = extract_form_data(soup)
    print(f"Form fields: {list(form_data.keys())}")

    # Step 4: Add credentials
    if credentials:
        form_data.update(credentials)

    # Step 5: Solve CAPTCHA
    token = solve_captcha("userrecaptcha", googlekey=sitekey, pageurl=page_url)
    form_data["g-recaptcha-response"] = token

    # Step 6: Submit the form.
    # urljoin returns absolute actions unchanged and resolves relative or
    # empty ones against the page, so no "starts with http" test is needed.
    form = soup.find("form")
    action_url = urljoin(page_url, form.get("action", "")) if form else page_url
    # NOTE: a form without a method attribute is submitted as POST here.
    method = (form.get("method", "POST") if form else "POST").upper()
    if method == "POST":
        result = session.post(action_url, data=form_data, timeout=30)
    else:
        result = session.get(action_url, params=form_data, timeout=30)

    # Step 7: Parse the result
    result_soup = BeautifulSoup(result.text, "lxml")
    return result_soup, session
# Usage: log in through the CAPTCHA-protected form. The call returns the
# parsed response to the form submission and the session that was used.
result_soup, session = scrape_protected_page(
"https://example.com/login",
credentials={"username": "user@example.com", "password": "pass123"},
)
# Now use the authenticated session to scrape protected content
dashboard = session.get("https://example.com/dashboard", timeout=30)
dashboard_soup = BeautifulSoup(dashboard.text, "lxml")
# NOTE(review): this raises AttributeError if the page has no <title> element.
print(dashboard_soup.title.string)
Scraping search results behind CAPTCHA
def scrape_search_results(search_url, query):
    """Run ``query`` against ``search_url`` and return the parsed hits.

    The search page is requested with ``q=<query>``. If the response
    contains an element with ``data-sitekey``, the CAPTCHA is solved and
    the page's form fields are re-posted to the same URL together with
    the token and the query.

    Returns a list of dicts with ``title``, ``url`` and ``snippet`` keys,
    one per result container that has both a title element and a link.
    """
    http = requests.Session()
    http.headers["User-Agent"] = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
    )

    page = http.get(search_url, params={"q": query}, timeout=30)
    doc = BeautifulSoup(page.text, "lxml")

    challenge = doc.find(attrs={"data-sitekey": True})
    if challenge is not None:
        # A challenge was served instead of results: solve it and resubmit.
        token = solve_captcha(
            "userrecaptcha",
            googlekey=challenge["data-sitekey"],
            pageurl=page.url,
        )
        payload = extract_form_data(doc)
        payload.update({"g-recaptcha-response": token, "q": query})
        page = http.post(page.url, data=payload, timeout=30)
        doc = BeautifulSoup(page.text, "lxml")

    hits = []
    for node in doc.select(".result, .search-result, .g"):
        heading = node.select_one("h3, .title")
        anchor = node.select_one("a")
        if heading is None or anchor is None:
            continue  # not a complete result entry
        blurb = node.select_one(".snippet, .description, .st")
        if blurb is None:
            snippet_text = ""
        else:
            snippet_text = blurb.get_text(strip=True)
        hits.append({
            "title": heading.get_text(strip=True),
            "url": anchor.get("href", ""),
            "snippet": snippet_text,
        })
    return hits
Image CAPTCHA extraction with Beautiful Soup
import base64
from urllib.parse import urljoin
def solve_image_captcha_bs4(url, captcha_img_selector="img.captcha"):
    """Extract, solve, and submit an image CAPTCHA.

    Args:
        url: Address of the page containing the CAPTCHA form.
        captcha_img_selector: CSS selector for the CAPTCHA ``<img>``.

    Returns:
        ``(result_soup, session)``: the parsed response to the form POST
        and the ``requests.Session`` that made the requests.

    Raises:
        ValueError: The image element is missing, has no ``src``, or its
            data URI carries no payload.
        requests.HTTPError: Downloading the image returned an error status.
    """
    session = requests.Session()
    resp = session.get(url, timeout=30)
    soup = BeautifulSoup(resp.text, "lxml")
    page_url = resp.url  # base for relative URLs, after any redirects

    # Find CAPTCHA image. An <img> without a src is treated as missing;
    # otherwise the page itself would be downloaded as the "image".
    img = soup.select_one(captcha_img_selector)
    img_src = img.get("src", "") if img else ""
    if not img_src:
        raise ValueError("CAPTCHA image not found")

    if img_src.startswith("data:image"):
        # Base64 inline image: the payload follows the first comma.
        _, _, img_base64 = img_src.partition(",")
        if not img_base64:
            raise ValueError("CAPTCHA image data URI has no payload")
    else:
        # URL — download it. Fail on an error status rather than sending
        # an error page's bytes to the solver.
        img_resp = session.get(urljoin(page_url, img_src), timeout=30)
        img_resp.raise_for_status()
        img_base64 = base64.b64encode(img_resp.content).decode()

    # Solve
    answer = solve_captcha("base64", body=img_base64)
    print(f"CAPTCHA answer: {answer}")

    # Submit form
    form_data = extract_form_data(soup)
    # Find the captcha input field name; if none matches, the form is
    # still posted without the answer.
    captcha_input = soup.select_one("input[name*='captcha'], input[name*='code']")
    if captcha_input:
        form_data[captcha_input["name"]] = answer

    form = soup.find("form")
    action = urljoin(page_url, form.get("action", "")) if form else page_url
    result = session.post(action, data=form_data, timeout=30)
    return BeautifulSoup(result.text, "lxml"), session
Production scraper class
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
class ProtectedScraper:
    """Scrape CAPTCHA-protected pages without a browser.

    Holds one ``requests.Session`` (so cookies persist between calls) and
    the CaptchaAI key used to solve any challenge found on a page.
    """

    def __init__(self, api_key):
        """Store the API key and create a session with browser-like headers."""
        self.api_key = api_key
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
        })

    def get(self, url):
        """Fetch and parse a page, solving CAPTCHAs automatically.

        If the page contains an element with ``data-sitekey``, the CAPTCHA
        is solved and the page's form is posted; the parsed response to
        that POST is returned instead of the challenge page.
        """
        resp = self.session.get(url, timeout=30)
        soup = BeautifulSoup(resp.text, "lxml")
        # Check for CAPTCHA
        sitekey_el = soup.find(attrs={"data-sitekey": True})
        if sitekey_el:
            soup = self._handle_captcha(soup, resp.url, sitekey_el)
        return soup

    def login(self, url, credentials):
        """Log in through a CAPTCHA-protected form.

        Args:
            url: Address of the login page.
            credentials: Dict of form fields merged over the hidden fields
                extracted from the page.

        Returns:
            The parsed response to the form POST.
        """
        resp = self.session.get(url, timeout=30)
        soup = BeautifulSoup(resp.text, "lxml")
        # Use the post-redirect URL, as get() does, so the solver's pageurl
        # and the resolved form action match the page actually served.
        page_url = resp.url
        form_data = self._extract_form(soup)
        form_data.update(credentials)
        sitekey_el = soup.find(attrs={"data-sitekey": True})
        if sitekey_el:
            token = self._solve(sitekey_el["data-sitekey"], page_url)
            form_data["g-recaptcha-response"] = token
        return self._submit_form(soup, page_url, form_data)

    def _handle_captcha(self, soup, url, sitekey_el):
        """Solve the CAPTCHA on ``soup`` and post its form with the token."""
        token = self._solve(sitekey_el["data-sitekey"], url)
        form_data = self._extract_form(soup)
        form_data["g-recaptcha-response"] = token
        return self._submit_form(soup, url, form_data)

    def _submit_form(self, soup, url, form_data):
        """POST ``form_data`` to the first form's action and parse the reply.

        The action is resolved relative to ``url``; when the page has no
        form, the data is posted to ``url`` itself.
        """
        form = soup.find("form")
        action = urljoin(url, form.get("action", "")) if form else url
        resp = self.session.post(action, data=form_data, timeout=30)
        return BeautifulSoup(resp.text, "lxml")

    def _extract_form(self, soup):
        """Return ``name -> value`` for every named hidden input inside a form."""
        data = {}
        for inp in soup.select("form input[type='hidden']"):
            if inp.get("name"):
                data[inp["name"]] = inp.get("value", "")
        return data

    def _solve(self, sitekey, url):
        """Submit a ``userrecaptcha`` task to CaptchaAI and wait for the token.

        Raises:
            RuntimeError: The submission was rejected, or a poll returned
                an error other than the "not ready yet" marker.
            TimeoutError: No solution after 30 polls spaced 5 seconds apart.
        """
        submit = requests.post("https://ocr.captchaai.com/in.php", data={
            "key": self.api_key, "method": "userrecaptcha",
            "googlekey": sitekey, "pageurl": url, "json": 1,
        }, timeout=30).json()
        if submit.get("status") != 1:
            raise RuntimeError(f"Error: {submit.get('request')}")
        task_id = submit["request"]

        for _ in range(30):
            time.sleep(5)
            result = requests.get("https://ocr.captchaai.com/res.php", params={
                "key": self.api_key, "action": "get", "id": task_id, "json": 1,
            }, timeout=30).json()
            if result.get("status") == 1:
                return result["request"]
            # Anything other than the pending marker is a terminal failure;
            # polling on would only end in a misleading timeout.
            if result.get("request") != "CAPCHA_NOT_READY":
                raise RuntimeError(f"Solve error: {result.get('request')}")
        raise TimeoutError("Solve timed out")
# Usage: one scraper instance keeps one session, so the login below
# carries over to the dashboard request.
scraper = ProtectedScraper("YOUR_API_KEY")

# Login and scrape
scraper.login(
    "https://example.com/login",
    {"email": "user@example.com", "password": "pass123"},
)

# Now scrape authenticated pages: print the cell texts of each table row.
soup = scraper.get("https://example.com/dashboard")
for row in soup.select("table tr"):
    cells = [cell.get_text(strip=True) for cell in row.select("td")]
    print(cells)
When to use Beautiful Soup vs browser automation
| Scenario | Use BS4 + requests | Use Selenium/Playwright |
|---|---|---|
| Server-rendered HTML | Yes | Overkill |
| JavaScript-rendered content | No | Yes |
| Complex multi-step form | Maybe | Preferred |
| High-volume scraping | Yes (faster) | Slower |
| Sites with JS fingerprinting | No | Yes |
| Simple login + scrape | Yes | Not needed |
Troubleshooting
| Symptom | Cause | Fix |
|---|---|---|
| Sitekey extraction returns None | CAPTCHA loaded via JavaScript | Switch to Selenium/Playwright |
| Form submission returns login page | Missing CSRF token | Extract all hidden inputs with extract_form_data() |
| 403 after form POST | Bot detection on headers | Add realistic User-Agent and Referer headers |
| Token rejected | Wrong pageurl parameter | Use the exact URL shown in the browser |
| Cookies lost between requests | Not using requests.Session() | Always use a session object |
Frequently asked questions
Can Beautiful Soup solve CAPTCHAs?
No — Beautiful Soup is an HTML parser. It extracts CAPTCHA parameters (sitekeys, image URLs). CaptchaAI does the actual solving. requests handles the HTTP communication.
When should I use a browser instead?
When the page requires JavaScript to render content, when the CAPTCHA is loaded dynamically, or when the site uses JavaScript-based fingerprinting.
Is this faster than Selenium?
Yes. requests + Beautiful Soup skips browser startup, JavaScript execution, and rendering, making it 5-10x faster per page.
Summary
Python Beautiful Soup + CaptchaAI provides the fastest scraping stack for CAPTCHA-protected pages that serve HTML directly. Parse sitekeys with BS4, solve with the API, and submit via requests.Session().
Related Articles
- Geetest Vs Cloudflare Turnstile Comparison
- Cloudflare Turnstile 403 After Token Fix
- Cloudflare Turnstile Widget Modes Explained
Full Working Code
Complete runnable examples for this article in Python, Node.js, PHP, Go, Java, C#, Ruby, Rust, Kotlin & Bash.
View on GitHub →
Discussions (0)
Join the conversation
Sign in to share your opinion.
Sign In
No comments yet.