Real Estate Data Scraping with CAPTCHA Handling

Real estate platforms are heavily protected against automated data collection. CaptchaAI helps you maintain reliable access to property listings, pricing data, and market analytics.

CAPTCHA Protections on Real Estate Sites

Platform Type	Protection	CAPTCHA Type
MLS aggregators	Cloudflare Challenge	Full challenge + proxy
Zillow-type portals	reCAPTCHA v3	Invisible, behavioral
Realtor directories	reCAPTCHA v2	Checkbox or invisible
Property tax records	Image CAPTCHA	Text recognition
Auction sites	Cloudflare Turnstile	Widget challenge
Commercial listings	reCAPTCHA v2 Enterprise	Enhanced verification

Property Data Collector

import requests
import time
import re
import json
import csv
import os
from datetime import datetime

API_KEY = os.environ["CAPTCHAAI_API_KEY"]


def solve_captcha(params, *, poll_interval=5, max_polls=60, request_timeout=30):
    """Submit a CAPTCHA task to CaptchaAI and poll until it is solved.

    Args:
        params: Task parameters sent to ``in.php`` (for example ``method``,
            ``googlekey``/``sitekey`` and ``pageurl``). The dict is left
            untouched; the API key is added to a copy.
        poll_interval: Seconds to sleep before each ``res.php`` poll.
        max_polls: Number of polls to attempt before giving up.
        request_timeout: Timeout in seconds for each HTTP request.

    Returns:
        The solution string, i.e. everything after ``OK|`` in the
        ``res.php`` response.

    Raises:
        Exception: If the submit or poll response is neither ``OK|...`` nor
            ``CAPCHA_NOT_READY``; the message contains the raw response.
        TimeoutError: If the task is still not ready after ``max_polls``
            polls.
    """
    # Copy instead of writing into the caller's dict so the API key does not
    # leak into (possibly reused or logged) caller state.
    payload = {**params, "key": API_KEY}
    submit = requests.get(
        "https://ocr.captchaai.com/in.php",
        params=payload,
        timeout=request_timeout,
    )
    # Strip so a trailing newline cannot break the exact comparisons below.
    submit_body = submit.text.strip()
    if not submit_body.startswith("OK|"):
        raise Exception(f"Submit: {submit_body}")

    task_id = submit_body.split("|", 1)[1]
    for _ in range(max_polls):
        time.sleep(poll_interval)
        result = requests.get(
            "https://ocr.captchaai.com/res.php",
            params={"key": API_KEY, "action": "get", "id": task_id},
            timeout=request_timeout,
        )
        body = result.text.strip()
        # "CAPCHA_NOT_READY" (sic) is the literal status the API returns.
        if body == "CAPCHA_NOT_READY":
            continue
        if body.startswith("OK|"):
            return body.split("|", 1)[1]
        raise Exception(f"Solve: {body}")
    raise TimeoutError(
        f"CAPTCHA task {task_id} not solved after {max_polls} polls"
    )


class PropertyCollector:
    """Fetch listing pages through one HTTP session and extract property data.

    Pages are downloaded with a shared ``requests.Session``. When a page
    contains a Cloudflare Turnstile or reCAPTCHA widget, the challenge is
    handed to the module-level ``solve_captcha`` function and the returned
    token is posted back to the same URL.

    The class attributes below are tuning knobs; override them on a subclass
    or an instance to change the defaults.
    """

    # Seconds before an HTTP request made by fetch() is abandoned.
    REQUEST_TIMEOUT = 30
    # Seconds to pause between consecutive page fetches.
    REQUEST_DELAY = 3
    # "action" value sent with reCAPTCHA v3 tasks.
    RECAPTCHA_V3_ACTION = "search"
    # Upper bound on listings extracted from a single page.
    MAX_LISTINGS_PER_PAGE = 50

    _RECAPTCHA_SITEKEY_RE = re.compile(
        r'data-sitekey=["\']([A-Za-z0-9_-]+)["\']'
    )
    _TURNSTILE_SITEKEY_RE = re.compile(r'data-sitekey=["\']([^"\']+)')
    # Captures the value of the "render" query parameter of the reCAPTCHA
    # script URL, whether it is the first parameter or a later one.
    _RECAPTCHA_RENDER_RE = re.compile(
        r'recaptcha/api\.js\?(?:[^"\'\s>]*?&(?:amp;)?)?render=([A-Za-z0-9_-]+)'
    )

    _PRICE_RE = re.compile(r'\$\s*([\d,]+)')
    # re.S lets an address span several lines of markup.
    _ADDRESS_RE = re.compile(
        r'class="address"[^>]*>(.*?)</(?:div|span|p)', re.S
    )
    _BEDS_RE = re.compile(r'(\d+)\s*(?:bed|br|bedroom)', re.I)
    # Optional fraction so "2.5 bath" yields "2.5" rather than "5".
    _BATHS_RE = re.compile(r'(\d+(?:\.\d+)?)\s*(?:bath|ba|bathroom)', re.I)
    _SQFT_RE = re.compile(r'([\d,]+)\s*(?:sq\.?\s*ft|sqft)', re.I)

    def __init__(self):
        """Create the shared session with a desktop-browser User-Agent."""
        self.session = requests.Session()
        self.session.headers["User-Agent"] = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 Chrome/120.0.0.0"
        )

    @classmethod
    def _captcha_params(cls, html, url):
        """Identify the CAPTCHA on a page.

        Args:
            html: Page markup to inspect.
            url: URL the page was fetched from (sent as ``pageurl``).

        Returns:
            ``(solver_params, form_field)`` where ``solver_params`` is the
            dict to pass to ``solve_captcha`` and ``form_field`` is the name
            of the form field the token is posted under, or ``None`` when no
            supported CAPTCHA is found.
        """
        # Turnstile is checked first: its widget also uses a data-sitekey
        # attribute, so the reCAPTCHA pattern would otherwise claim it.
        if "cf-turnstile" in html:
            found = cls._TURNSTILE_SITEKEY_RE.search(html)
            if found:
                return (
                    {
                        "method": "turnstile",
                        "sitekey": found.group(1),
                        "pageurl": url,
                    },
                    "cf-turnstile-response",
                )

        # reCAPTCHA v3 is loaded as api.js?render=<sitekey>. The value
        # "explicit" selects v2 explicit rendering and is not a site key.
        for found in cls._RECAPTCHA_RENDER_RE.finditer(html):
            if found.group(1) != "explicit":
                return (
                    {
                        "method": "userrecaptcha",
                        "googlekey": found.group(1),
                        "pageurl": url,
                        "version": "v3",
                        "action": cls.RECAPTCHA_V3_ACTION,
                    },
                    "g-recaptcha-response",
                )

        found = cls._RECAPTCHA_SITEKEY_RE.search(html)
        if found:
            return (
                {
                    "method": "userrecaptcha",
                    "googlekey": found.group(1),
                    "pageurl": url,
                },
                "g-recaptcha-response",
            )
        return None

    def fetch(self, url):
        """Fetch page with automatic CAPTCHA handling.

        Args:
            url: Page to download.

        Returns:
            The text of the last response received: the original page when
            no CAPTCHA was detected, otherwise the response to the final
            token submission.

        Raises:
            Whatever ``solve_captcha`` or the HTTP session raises; nothing
            is caught here.
        """
        resp = self.session.get(url, timeout=self.REQUEST_TIMEOUT)

        # Each CAPTCHA kind is solved at most once per fetch, so a page that
        # keeps presenting the same challenge cannot burn unlimited solves.
        attempted = set()
        while True:
            challenge = self._captcha_params(resp.text, url)
            if challenge is None:
                break
            params, form_field = challenge
            if form_field in attempted:
                break
            attempted.add(form_field)

            token = solve_captcha(params)
            resp = self.session.post(
                url, data={form_field: token}, timeout=self.REQUEST_TIMEOUT
            )

        return resp.text

    def collect_listings(self, urls):
        """Collect property listings from multiple pages.

        A page that fails is reported on stdout and skipped, so one bad URL
        does not abort the whole run.

        Args:
            urls: Iterable of page URLs.

        Returns:
            A list with the listings of every page that could be processed.
        """
        listings = []
        for index, url in enumerate(urls):
            # Pause between requests, including after a failed one, but not
            # before the first or after the last.
            if index:
                time.sleep(self.REQUEST_DELAY)
            try:
                html = self.fetch(url)
                page_listings = self._parse_listings(html)
            except Exception as e:  # deliberate best-effort boundary
                print(f"  Error: {url} - {e}")
                continue
            listings.extend(page_listings)
            print(f"  {len(page_listings)} listings from {url}")
        return listings

    def _parse_listings(self, html):
        """Extract property data from HTML.

        Each field is found independently with a regular expression and the
        results are combined by position: the i-th price is paired with the
        i-th address, bed count, and so on. Fields that are missing from
        some listings on the page can therefore shift later pairings.

        Args:
            html: Page markup.

        Returns:
            A list of dicts with the keys ``price``, ``address``, ``beds``,
            ``baths``, ``sqft`` and ``collected_at``; values other than
            ``collected_at`` are strings or ``None``. At most
            ``MAX_LISTINGS_PER_PAGE`` entries are produced, and entries
            with neither a price nor an address are dropped.
        """
        # The numeric patterns accept commas, so surrounding punctuation
        # ("$450,000, 3 bed") can get glued on; trim it and drop empties.
        prices = [
            p for p in (raw.strip(",") for raw in self._PRICE_RE.findall(html))
            if p
        ]
        addresses = [raw.strip() for raw in self._ADDRESS_RE.findall(html)]
        beds = self._BEDS_RE.findall(html)
        baths = self._BATHS_RE.findall(html)
        sqft = [
            s for s in (raw.strip(",") for raw in self._SQFT_RE.findall(html))
            if s
        ]

        def pick(values, i):
            return values[i] if i < len(values) else None

        # Naive UTC timestamp, shared by every listing on this page.
        collected_at = datetime.utcnow().isoformat()

        listings = []
        count = min(
            max(len(prices), len(addresses)), self.MAX_LISTINGS_PER_PAGE
        )
        for i in range(count):
            price = pick(prices, i)
            address = pick(addresses, i)
            if not (price or address):
                continue
            listings.append({
                "price": price,
                "address": address,
                "beds": pick(beds, i),
                "baths": pick(baths, i),
                "sqft": pick(sqft, i),
                "collected_at": collected_at,
            })

        return listings

    def export_csv(self, listings, filename):
        """Write listings to a UTF-8 CSV file with a header row.

        Args:
            listings: Dicts as produced by ``_parse_listings``.
            filename: Destination path; an existing file is overwritten.

        Nothing is written when ``listings`` is empty.
        """
        if not listings:
            print("No listings to export")
            return

        keys = ["price", "address", "beds", "baths", "sqft", "collected_at"]
        with open(filename, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            writer.writeheader()
            writer.writerows(listings)
        print(f"Exported {len(listings)} listings to {filename}")


# Usage: collect three pages of Austin sale listings and save them as CSV.
search_urls = [
    "https://example-realty.com/search?city=austin&type=sale&page=1",
    "https://example-realty.com/search?city=austin&type=sale&page=2",
    "https://example-realty.com/search?city=austin&type=sale&page=3",
]

collector = PropertyCollector()

# Pages that fail are reported and skipped by collect_listings().
listings = collector.collect_listings(search_urls)

# Prints a notice instead of writing a file when nothing was collected.
collector.export_csv(listings, "austin_listings.csv")

Data Points to Collect

Field	Source	Use Case
Listing price	Property page	Market valuation
Address	Property page	Geo-analysis
Beds/Baths/Sqft	Property details	Comparable analysis
Days on market	Listing metadata	Market velocity
Price history	Price change log	Trend analysis
Property tax	Tax records	Investment analysis
HOA fees	Listing details	Cost analysis

Market Analysis Workflow

Daily Collection
    → Property listings (500-1000 per market)
    → Price changes (delta from previous day)
    → New listings vs delisted

Weekly Analysis
    → Median price trends
    → Inventory levels
    → Days-on-market averages
    → Price-per-sqft by neighborhood

Monthly Report
    → Market heat map
    → Competitive pricing analysis
    → Investment opportunity scoring

FAQ

Is scraping real estate data legal?

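Publicly visible listing data can often be collected lawfully, but the rules depend on your jurisdiction and on how you use the data. Avoid collecting personal information about sellers or agents, and always review and comply with the site's terms of service.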

How do I handle pagination?

Increment the page parameter in your URLs until a page returns no listings. Most real estate sites use ?page=N or &offset=N patterns.

Which CAPTCHA type is hardest on real estate sites?

Cloudflare Challenge on MLS aggregators is the most complex — it requires proxy parameters. reCAPTCHA v3 on major portals is common but solved reliably by CaptchaAI.

Full Working Code

Complete runnable examples for this article in Python, Node.js, PHP, Go, Java, C#, Ruby, Rust, Kotlin & Bash.

View on GitHub →