Tutorials

Build a Price Comparison Bot with Python and CaptchaAI

Build a bot that scrapes prices from multiple websites, handles CAPTCHAs automatically, and outputs a comparison report.


Project Structure

price_bot/
├── config.py        # API keys and store configs (optional here: main.py reads the key from CAPTCHAAI_API_KEY and defines PRODUCTS inline)
├── solver.py        # CaptchaAI integration
├── scraper.py       # Per-store scrapers
├── compare.py       # Price comparison logic
└── main.py          # Entry point

CAPTCHA Solver Module

# solver.py
import requests
import time


class CaptchaSolver:
    """Submit reCAPTCHA tasks to the CaptchaAI HTTP API and poll for the token.

    The polling schedule is configurable. The defaults reproduce the schedule
    this class originally hard-coded: wait 15 seconds after submitting, then
    poll up to 24 times with 5 seconds between polls.

    Args:
        api_key: CaptchaAI API key sent with every request.
        initial_wait: Seconds to sleep after submitting before the first poll.
        poll_interval: Seconds to sleep after each "not ready" poll.
        max_polls: Maximum number of result polls before giving up.
    """

    SUBMIT_URL = "https://ocr.captchaai.com/in.php"
    RESULT_URL = "https://ocr.captchaai.com/res.php"

    def __init__(self, api_key, *, initial_wait=15, poll_interval=5, max_polls=24):
        self.api_key = api_key
        self.initial_wait = initial_wait
        self.poll_interval = poll_interval
        self.max_polls = max_polls

    def solve_recaptcha(self, sitekey, pageurl):
        """Solve a reCAPTCHA and return the response token.

        Args:
            sitekey: The site's reCAPTCHA key (sent as ``googlekey``).
            pageurl: URL of the page that shows the CAPTCHA.

        Returns:
            The value of the ``request`` field of the first poll response
            whose ``status`` is 1.

        Raises:
            RuntimeError: If the submit call or a poll reports an error, or
                the response is missing the expected ``request`` field.
            TimeoutError: If no result is ready after ``max_polls`` polls.
        """
        resp = requests.post(self.SUBMIT_URL, data={
            "key": self.api_key,
            "method": "userrecaptcha",
            "googlekey": sitekey,
            "pageurl": pageurl,
            "json": 1,
        }, timeout=30)
        result = resp.json()
        if result.get("status") != 1:
            raise RuntimeError(
                result.get("request") or f"Unexpected response: {result!r}"
            )

        task_id = result["request"]
        time.sleep(self.initial_wait)

        for _ in range(self.max_polls):
            resp = requests.get(self.RESULT_URL, params={
                "key": self.api_key, "action": "get",
                "id": task_id, "json": 1,
            }, timeout=15)
            data = resp.json()
            if data.get("status") == 1:
                return data["request"]

            # "CAPCHA_NOT_READY" is compared exactly as written here;
            # NOTE(review): presumably this is the API's own spelling — confirm.
            reply = data.get("request")
            if reply != "CAPCHA_NOT_READY":
                # A malformed reply (no "request" field) becomes a descriptive
                # RuntimeError instead of a bare KeyError.
                raise RuntimeError(reply or f"Unexpected response: {data!r}")
            time.sleep(self.poll_interval)

        raise TimeoutError("Solve timeout")

Store Scraper

# scraper.py
import requests
import re
import time
from bs4 import BeautifulSoup


class StoreScraper:
    """Generic store scraper with CAPTCHA handling.

    Fetches a product page through a shared ``requests.Session``, asks the
    injected solver for a reCAPTCHA token when the page contains a
    ``data-sitekey`` attribute, and extracts a price from the resulting HTML.
    """

    # Accept both data-sitekey="..." and data-sitekey='...'; group 2 is the key.
    _SITEKEY_RE = re.compile(r'data-sitekey=(["\'])([^"\']+)\1')

    # CSS selectors tried in order; the first one yielding a number wins.
    _PRICE_SELECTORS = (
        ".price", ".product-price", "[data-price]",
        ".price-current", ".offer-price", "#priceblock_ourprice",
    )

    def __init__(self, solver):
        """Store the solver and create a session with a browser-like User-Agent.

        Args:
            solver: Object exposing ``solve_recaptcha(sitekey, pageurl)``.
        """
        self.solver = solver
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36",
        })

    def scrape_price(self, url):
        """Fetch page, handle CAPTCHA if present, extract price.

        Args:
            url: Product page URL.

        Returns:
            A dict with keys ``price`` (float or None), ``currency`` (str or
            None) and ``url``.
        """
        resp = self.session.get(url, timeout=15)

        # Check for CAPTCHA
        if "data-sitekey" in resp.text:
            retried = self._solve_and_retry(url, resp.text)
            # _solve_and_retry returns None when no sitekey can be extracted.
            # Keep the original response in that case instead of crashing on
            # ``None.text``; extraction then reports a missing price.
            if retried is not None:
                resp = retried

        return self._extract_price(resp.text, url)

    def _solve_and_retry(self, url, html):
        """Solve CAPTCHA and re-fetch.

        Args:
            url: Page URL, used both for solving and for the POST.
            html: HTML of the page containing the CAPTCHA widget.

        Returns:
            The response of POSTing the token to ``url``, or None when no
            quoted ``data-sitekey`` value is found in ``html``.
        """
        match = self._SITEKEY_RE.search(html)
        if not match:
            return None

        sitekey = match.group(2)
        token = self.solver.solve_recaptcha(sitekey, url)

        # Submit with token
        return self.session.post(url, data={
            "g-recaptcha-response": token,
        }, timeout=30)

    def _extract_price(self, html, url):
        """Extract price from HTML.

        Args:
            html: Page markup to parse.
            url: Source URL, echoed back in the result.

        Returns:
            A dict with ``price``, ``currency`` and ``url``; ``price`` and
            ``currency`` are None when no selector yields a number.
        """
        soup = BeautifulSoup(html, "html.parser")

        for selector in self._PRICE_SELECTORS:
            el = soup.select_one(selector)
            if not el:
                continue

            price_text = el.get_text(strip=True)
            # Thousands separators are stripped first, so the pattern only
            # needs digits and an optional decimal part.
            match = re.search(r"\d+\.?\d*", price_text.replace(",", ""))
            if match:
                return {
                    "price": float(match.group()),
                    "currency": self._detect_currency(price_text),
                    "url": url,
                }

        return {"price": None, "currency": None, "url": url}

    def _detect_currency(self, text):
        """Return the currency code for the first known symbol found in text.

        Symbols are checked in the order ``$``, ``€``, ``£``; text without any
        of them is reported as "USD".
        """
        for symbol, code in (("$", "USD"), ("€", "EUR"), ("£", "GBP")):
            if symbol in text:
                return code
        return "USD"

Price Comparison Engine

# compare.py
from datetime import datetime


def compare_prices(product_name, price_data):
    """Compare prices from multiple sources.

    Args:
        product_name: Label copied into the result.
        price_data: Iterable of dicts with at least ``price`` and ``url``;
            entries whose ``price`` is None are ignored.

    Returns:
        ``{"product", "error"}`` when no entry has a price. Otherwise a dict
        with the cheapest and most expensive price, the absolute and
        percentage savings between them, all priced entries sorted ascending,
        and a ``checked_at`` ISO timestamp (local time, no timezone).
    """
    valid = [p for p in price_data if p.get("price") is not None]

    if not valid:
        return {"product": product_name, "error": "No prices found"}

    sorted_prices = sorted(valid, key=lambda x: x["price"])
    best = sorted_prices[0]
    worst = sorted_prices[-1]

    # A highest price of 0 (e.g. free items) would divide by zero; there is
    # nothing to save relative to a zero price, so report 0.0 percent.
    if worst["price"]:
        savings_pct = round((1 - best["price"] / worst["price"]) * 100, 1)
    else:
        savings_pct = 0.0

    return {
        "product": product_name,
        "best_price": best["price"],
        "best_source": best["url"],
        "worst_price": worst["price"],
        "savings": round(worst["price"] - best["price"], 2),
        "savings_pct": savings_pct,
        "all_prices": sorted_prices,
        "checked_at": datetime.now().isoformat(),
    }


def format_report(comparisons):
    """Render comparison results as a plain-text report.

    Each entry is either an error dict (``product`` and ``error``), printed
    on a single line, or a full comparison dict, printed as a summary
    followed by every price and its URL and then a blank line.
    """
    rule = "=" * 60
    report = [rule, "Price Comparison Report", rule, ""]

    for entry in comparisons:
        if "error" not in entry:
            # Summary lines for a successful comparison.
            report.extend([
                f"Product: {entry['product']}",
                f"  Best:    ${entry['best_price']:.2f}",
                f"  Source:  {entry['best_source']}",
                f"  Savings: ${entry['savings']:.2f} ({entry['savings_pct']}%)",
            ])
            # One indented line per price.
            report.extend(
                f"    ${item['price']:.2f} — {item['url']}"
                for item in entry["all_prices"]
            )
            report.append("")
        else:
            report.append(f"{entry['product']}: {entry['error']}")

    return "\n".join(report)

Main Runner

# main.py
import os
import time
from solver import CaptchaSolver
from scraper import StoreScraper
from compare import compare_prices, format_report

# Products to compare. Each entry has a display "name" and a list of "urls";
# main() scrapes every URL and compares the resulting prices per product.
PRODUCTS = [
    {
        "name": "Wireless Headphones",
        "urls": [
            "https://store-a.example.com/headphones-xyz",
            "https://store-b.example.com/product/headphones-xyz",
            "https://store-c.example.com/electronics/headphones-xyz",
        ],
    },
]


def main():
    """Scrape every product URL, print a comparison report and save it.

    Reads the API key from the ``CAPTCHAAI_API_KEY`` environment variable
    (raises ``KeyError`` if it is unset), scrapes each URL in ``PRODUCTS``
    with a 3-second pause between requests, prints the formatted report and
    writes it to ``price_report.txt``.
    """
    api_key = os.environ["CAPTCHAAI_API_KEY"]
    solver = CaptchaSolver(api_key)
    scraper = StoreScraper(solver)

    comparisons = []

    for product in PRODUCTS:
        print(f"Checking prices for: {product['name']}")
        prices = []

        for url in product["urls"]:
            # Best effort per URL: one failing store must not abort the run.
            try:
                price = scraper.scrape_price(url)
            except Exception as e:
                print(f"  {url}: Error — {e}")
            else:
                prices.append(price)
                # The "price" key is always present but may be None, so a
                # .get() default never applies; test for None explicitly.
                amount = price.get("price")
                shown = f"${amount}" if amount is not None else "N/A"
                print(f"  {url}: {shown}")

            time.sleep(3)

        comparison = compare_prices(product["name"], prices)
        comparisons.append(comparison)

    report = format_report(comparisons)
    print(report)

    # Save report. The text contains non-ASCII characters (em dash), so fix
    # the encoding rather than rely on the platform default.
    with open("price_report.txt", "w", encoding="utf-8") as f:
        f.write(report)


if __name__ == "__main__":
    main()

FAQ

How often should I check prices?

Daily checks are sufficient for most products. For flash sales or competitive monitoring, check every 4–6 hours. Rate-limit your requests to avoid being blocked.

How do I handle different currencies?

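Convert to a base currency using a free exchange rate API before comparing. Store the original currency for reference. Note that the example code compares the raw numeric values and prints every amount with a "$" sign, so add the conversion step before calling compare_prices if your stores use different currencies.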

Can I run this as a scheduled job?

Yes. Use cron (Linux), Task Scheduler (Windows), or a cloud scheduler to run the bot daily.



Build your price bot — start with CaptchaAI.

Full Working Code

Complete runnable examples for this article in Python, Node.js, PHP, Go, Java, C#, Ruby, Rust, Kotlin & Bash.

View on GitHub →

Discussions (0)

No comments yet.