Integrations

Scrapy Spider Middleware for CaptchaAI: Advanced Patterns

Scrapy's middleware system lets you intercept requests and responses. Here's how to build a downloader middleware that automatically solves CAPTCHAs with CaptchaAI.


Downloader Middleware

# middlewares.py
import re
import time
import logging
import requests as http_requests
from scrapy import signals
from scrapy.http import HtmlResponse

logger = logging.getLogger(__name__)


class CaptchaAIMiddleware:
    """Scrapy downloader middleware for automatic CAPTCHA solving.

    For every HTML response the body is scanned for a known CAPTCHA widget.
    When one is found, the sitekey is submitted to the CaptchaAI HTTP API and,
    once a token comes back, the original request is re-issued as a form POST
    carrying that token.

    NOTE: ``_solve`` uses synchronous ``requests`` calls and ``time.sleep``,
    so the calling thread is blocked for the whole solve.
    """

    # Ordered most-specific first: a Turnstile widget also carries a
    # ``data-sitekey`` attribute, so the generic pattern must come last or
    # every Turnstile page would be classified as reCAPTCHA.
    CAPTCHA_PATTERNS = [
        (r"cf-turnstile.*?data-sitekey=\"([^\"]+)\"", "turnstile"),
        (r'data-sitekey="([^"]+)"', "recaptcha"),
    ]

    # Form field in which each widget type submits its solved token.
    RESPONSE_FIELDS = {
        "recaptcha": "g-recaptcha-response",
        "turnstile": "cf-turnstile-response",
    }

    # Polling schedule (seconds / attempts) used by ``_solve``.
    INITIAL_WAIT = 10
    POLL_INTERVAL = 5
    MAX_POLLS = 24

    def __init__(self, api_key, max_retries=2):
        """Store the API key, the per-request retry cap and zeroed counters."""
        self.api_key = api_key
        self.max_retries = max_retries
        self.stats = {"detected": 0, "solved": 0, "failed": 0}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from crawler settings.

        Reads ``CAPTCHAAI_API_KEY`` (required) and ``CAPTCHAAI_MAX_RETRIES``
        (default 2), and hooks ``spider_closed`` to log the counters.

        Raises:
            ValueError: if ``CAPTCHAAI_API_KEY`` is missing or empty.
        """
        api_key = crawler.settings.get("CAPTCHAAI_API_KEY")
        if not api_key:
            raise ValueError("CAPTCHAAI_API_KEY setting is required")

        middleware = cls(
            api_key=api_key,
            max_retries=crawler.settings.getint("CAPTCHAAI_MAX_RETRIES", 2),
        )

        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def _detect(self, body):
        """Return ``(captcha_type, sitekey)`` for the first matching pattern, else None."""
        for pattern, captcha_type in self.CAPTCHA_PATTERNS:
            match = re.search(pattern, body)
            if match:
                return captcha_type, match.group(1)
        return None

    def process_response(self, request, response, spider):
        """Check response for CAPTCHA and solve if found.

        Returns the response unchanged when it is not HTML, contains no known
        widget, the retry cap stored in ``request.meta["captcha_retries"]``
        has been reached, or solving fails. Otherwise returns a new POST
        request carrying the token in its body and in
        ``meta["captcha_token"]``.
        """
        # Local import keeps this class self-contained.
        from urllib.parse import urlencode

        if not isinstance(response, HtmlResponse):
            return response

        detected = self._detect(response.text)
        if detected is None:
            return response

        captcha_type, sitekey = detected
        self.stats["detected"] += 1

        retries = request.meta.get("captcha_retries", 0)
        if retries >= self.max_retries:
            self.stats["failed"] += 1
            logger.error("Max CAPTCHA retries on %s", response.url)
            return response

        logger.info("CAPTCHA (%s) on %s, solving...", captcha_type, response.url)
        token = self._solve(captcha_type, sitekey, response.url)
        if not token:
            # Stop here: trying the remaining patterns on the same page would
            # double-count the detection and pay for a second solve.
            self.stats["failed"] += 1
            return response

        self.stats["solved"] += 1

        # Request.body is read-only, so the retry must be built with replace()
        # rather than by mutating a copy.
        field = self.RESPONSE_FIELDS.get(captcha_type, "g-recaptcha-response")
        new_request = request.replace(
            method="POST",
            body=urlencode({field: token}),
            dont_filter=True,  # same URL as before; bypass the dupefilter
        )
        new_request.meta["captcha_retries"] = retries + 1
        new_request.meta["captcha_token"] = token
        new_request.headers[b"Content-Type"] = b"application/x-www-form-urlencoded"
        return new_request

    def _solve(self, captcha_type, sitekey, pageurl):
        """Solve CAPTCHA via CaptchaAI.

        Submits the task to ``in.php``, waits ``INITIAL_WAIT`` seconds, then
        polls ``res.php`` up to ``MAX_POLLS`` times, ``POLL_INTERVAL`` seconds
        apart.

        Returns:
            The token string, or None for an unknown ``captcha_type``, an API
            error, a polling timeout, or any exception (which is logged).
        """
        method_map = {
            "recaptcha": {"method": "userrecaptcha", "googlekey": sitekey},
            "turnstile": {"method": "turnstile", "sitekey": sitekey},
        }

        params = method_map.get(captcha_type)
        if not params:
            return None

        try:
            resp = http_requests.post("https://ocr.captchaai.com/in.php", data={
                "key": self.api_key,
                "pageurl": pageurl,
                "json": 1,
                **params,
            }, timeout=30)
            result = resp.json()

            if result.get("status") != 1:
                logger.error("Submit error: %s", result.get("request"))
                return None

            task_id = result["request"]
            time.sleep(self.INITIAL_WAIT)

            for _ in range(self.MAX_POLLS):
                resp = http_requests.get("https://ocr.captchaai.com/res.php", params={
                    "key": self.api_key, "action": "get",
                    "id": task_id, "json": 1,
                }, timeout=15)
                data = resp.json()

                if data.get("status") == 1:
                    return data["request"]
                # Anything other than "not ready yet" is a terminal API error.
                if data.get("request") != "CAPCHA_NOT_READY":
                    logger.error("Solve error: %s", data.get("request"))
                    return None
                time.sleep(self.POLL_INTERVAL)

            logger.error("CAPTCHA solve timed out for %s", pageurl)

        except Exception as e:
            # Best effort: a solver failure must not abort the crawl.
            logger.error("Solve exception: %s", e)

        return None

    def spider_closed(self, spider):
        """Log CAPTCHA statistics on spider close."""
        logger.info(
            "CAPTCHA Stats — Detected: %s, Solved: %s, Failed: %s",
            self.stats["detected"],
            self.stats["solved"],
            self.stats["failed"],
        )

Scrapy Settings

# settings.py
import os

# CaptchaAI configuration
# Read the key from the environment so it stays out of version control;
# the placeholder is only used when the variable is unset.
CAPTCHAAI_API_KEY = os.environ.get("CAPTCHAAI_API_KEY", "YOUR_API_KEY")
CAPTCHAAI_MAX_RETRIES = 2

# Enable the middleware
# The priority must be below HttpCompressionMiddleware (590): process_response
# hooks run in decreasing order, so at 600 the middleware would receive
# still-compressed bodies and never see the CAPTCHA markup.
DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.CaptchaAIMiddleware": 585,
}

# Per-request download timeout in seconds (this is Scrapy's default).
# It does not limit how long the middleware spends solving a CAPTCHA.
DOWNLOAD_TIMEOUT = 180

# Rate limiting
DOWNLOAD_DELAY = 3
RANDOMIZE_DOWNLOAD_DELAY = True
CONCURRENT_REQUESTS = 4
CONCURRENT_REQUESTS_PER_DOMAIN = 2

Spider Example

# spiders/product_spider.py
import scrapy


class ProductSpider(scrapy.Spider):
    """Example spider for a paginated product listing.

    The callbacks contain no CAPTCHA logic of their own; they only parse the
    response they are handed.
    """

    name = "products"
    start_urls = ["https://example.com/products"]

    @staticmethod
    def _first_text(node, query):
        """Return the first match of *query* under *node*, stripped ("" if none)."""
        return node.css(query).get("").strip()

    def parse(self, response):
        """Yield one item per product card, then a request for the next page."""
        for card in response.css("div.product-card"):
            item = {
                "name": self._first_text(card, ".name::text"),
                "price": self._first_text(card, ".price::text"),
            }
            href = card.css("a::attr(href)").get("")
            item["url"] = response.urljoin(href)
            yield item

        # Pagination: stop when the listing has no "next" link.
        next_href = response.css("a.next-page::attr(href)").get()
        if not next_href:
            return
        yield scrapy.Request(response.urljoin(next_href))

    def parse_product(self, response):
        """Yield title, description and price from a product detail page.

        Logs when ``response.meta`` carries a ``captcha_token``. Nothing in
        ``parse`` schedules this callback; pass it as ``callback`` when
        requesting product pages.
        """
        if response.meta.get("captcha_token"):
            self.logger.info(f"Page accessed after CAPTCHA solve: {response.url}")

        details = {
            "title": self._first_text(response, "h1::text"),
            "description": self._first_text(response, ".description::text"),
            "price": self._first_text(response, ".price::text"),
        }
        yield details

Token-Passing Spider Middleware

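For spiders whose follow-up requests need the token, this spider middleware copies it from the solved response into each child request's meta as parent_captcha_token: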

class CaptchaTokenSpiderMiddleware:
    """Spider middleware that hands a solved CAPTCHA token on to child requests."""

    def process_spider_input(self, response, spider):
        """Emit a debug log line when the response meta holds a token.

        The response is not modified; always returns None.
        """
        if response.meta.get("captcha_token"):
            spider.logger.debug(f"CAPTCHA token available for {response.url}")
        return None

    def process_spider_output(self, response, result, spider):
        """Re-yield every callback result, tagging requests with the parent token.

        When the response meta has a ``captcha_token``, each yielded
        ``scrapy.Request`` gets it as ``meta["parent_captcha_token"]`` unless
        that key is already set. Items pass through untouched.
        """
        inherited = response.meta.get("captcha_token")

        for entry in result:
            is_request = isinstance(entry, scrapy.Request)
            if is_request and inherited:
                entry.meta.setdefault("parent_captcha_token", inherited)
            yield entry

FAQ

Does the middleware block Scrapy's event loop?

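Yes. The middleware calls CaptchaAI with the synchronous requests library and waits with time.sleep between polls, so Scrapy's Twisted reactor is blocked and no other requests are processed while a CAPTCHA is being solved. For high-concurrency spiders, consider using scrapy-playwright with async CAPTCHA solving instead.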

Can I use this middleware with Scrapy-Splash?

Yes. The middleware intercepts responses regardless of how they were rendered. It works with Splash, Playwright, and standard HTTP responses.

How do I test the middleware?

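Build fake responses in unit tests by constructing scrapy.http.HtmlResponse objects directly (passing url, body, and request) and feeding them to process_response. Mock the CaptchaAI API responses to test CAPTCHA detection and retry logic without making real API calls.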



Add CaptchaAI to Scrapy — get your API key.

Full Working Code

Complete runnable examples for this article in Python, Node.js, PHP, Go, Java, C#, Ruby, Rust, Kotlin & Bash.

View on GitHub →

Discussions (0)

No comments yet.