Integrations

ParseHub + CaptchaAI: No-Code Scraping with CAPTCHA Solving

ParseHub is a visual scraping tool that struggles with CAPTCHAs. Here's how to use CaptchaAI to pre-solve CAPTCHAs and feed authenticated sessions into ParseHub.


The Challenge

ParseHub's visual selector can't interact with CAPTCHAs:

| Issue | Impact |
|---|---|
| reCAPTCHA blocks scraping | ParseHub can't click checkboxes |
| Login CAPTCHAs | Can't authenticate |
| Rate-limit CAPTCHAs | Extraction stops mid-run |
| Cloudflare challenges | Page never loads data |

Solution: Pre-Authentication Script

import requests
import time
import json


def pre_authenticate(api_key, login_url, sitekey, credentials):
    """
    Solve CAPTCHA, authenticate, and export cookies for ParseHub.

    Run this script before starting your ParseHub extraction.

    Args:
        api_key: CaptchaAI API key, forwarded to ``solve_captcha``.
        login_url: URL that is first fetched with GET (to pick up initial
            cookies) and then receives the login POST.
        sitekey: reCAPTCHA site key, forwarded to ``solve_captcha``.
        credentials: Mapping of login form field names to values; it is
            merged into the POST body next to ``g-recaptcha-response``.

    Returns:
        dict mapping cookie name to cookie value for every cookie held by
        the session after the login attempt. If two cookies share a name,
        the one iterated last wins.

    Raises:
        Whatever ``solve_captcha`` or the ``requests`` calls raise; nothing
        is caught here.

    Note:
        A suspected login failure is only reported on stdout -- the cookies
        are still returned, so callers that need certainty must verify the
        session themselves.
    """
    # The context manager closes the session (and its pooled connections)
    # on every exit path, including when the CAPTCHA solve raises.
    with requests.Session() as session:
        # Set realistic headers
        session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36",
        })

        # Visit login page for initial cookies
        print("1. Loading login page...")
        session.get(login_url, timeout=15)

        # Solve CAPTCHA
        print("2. Solving CAPTCHA...")
        token = solve_captcha(api_key, sitekey, login_url)
        print(f"   Token received ({len(token)} chars)")

        # Submit login together with the solved token
        print("3. Submitting login form...")
        resp = session.post(login_url, data={
            **credentials,
            "g-recaptcha-response": token,
        }, timeout=30)

        # Heuristic only: a non-200 status or the word "error" anywhere in
        # the body is treated as a possible failure.
        if resp.status_code != 200 or "error" in resp.text.lower():
            print(f"   Login may have failed (status: {resp.status_code})")
        else:
            print("   Login successful")

        # Export cookies while the session is still open
        cookies = {c.name: c.value for c in session.cookies}

    print(f"4. Exported {len(cookies)} cookies")

    return cookies


def solve_captcha(api_key, sitekey, pageurl, *, initial_wait=15,
                  poll_interval=5, max_polls=24):
    """Solve reCAPTCHA via CaptchaAI.

    Submits the task to ``in.php``, waits, then polls ``res.php`` until the
    answer is ready.

    Args:
        api_key: CaptchaAI API key, sent as ``key``.
        sitekey: reCAPTCHA site key, sent as ``googlekey``.
        pageurl: URL of the page that hosts the CAPTCHA.
        initial_wait: Seconds to sleep after submitting, before the first
            poll.
        poll_interval: Seconds to sleep after each "not ready" poll.
        max_polls: Maximum number of polls before giving up. With the
            defaults the function sleeps for up to 15 + 24 * 5 = 135 s.

    Returns:
        The ``request`` field of the first poll response whose ``status``
        is 1 (the solved token).

    Raises:
        RuntimeError: The submit was rejected, a response lacked the
            expected ``request`` payload, or a poll returned anything other
            than ``CAPCHA_NOT_READY``.
        TimeoutError: No answer arrived within ``max_polls`` polls.
    """
    resp = requests.post("https://ocr.captchaai.com/in.php", data={
        "key": api_key,
        "method": "userrecaptcha",
        "googlekey": sitekey,
        "pageurl": pageurl,
        "json": 1,
    }, timeout=30)
    result = resp.json()

    if result.get("status") != 1:
        raise RuntimeError(f"Submit error: {result.get('request')}")

    # A success status without a task id cannot be polled; fail clearly
    # instead of surfacing a bare KeyError.
    task_id = result.get("request")
    if not task_id:
        raise RuntimeError("Submit error: response contained no task id")

    time.sleep(initial_wait)

    for _ in range(max_polls):
        resp = requests.get("https://ocr.captchaai.com/res.php", params={
            "key": api_key, "action": "get",
            "id": task_id, "json": 1,
        }, timeout=15)
        data = resp.json()
        answer = data.get("request")
        if data.get("status") == 1 and answer:
            return answer
        # Anything except the "still working" marker (spelled this way on
        # purpose) is treated as a terminal error, including a missing
        # payload.
        if answer != "CAPCHA_NOT_READY":
            raise RuntimeError(answer)
        time.sleep(poll_interval)

    raise TimeoutError("Solve timeout")


# Authenticate once up front so ParseHub can start from a logged-in session
login_settings = {
    "api_key": "YOUR_API_KEY",
    "login_url": "https://example.com/login",
    "sitekey": "6Le-wvkSAAAAAPBMRTvw0Q4Muexq9bi0DJwx_mJ-",
    "credentials": {"username": "user", "password": "pass"},
}
cookies = pre_authenticate(**login_settings)

# Persist the cookie name/value pairs as indented JSON for ParseHub
with open("parsehub_cookies.json", "w") as cookie_file:
    cookie_file.write(json.dumps(cookies, indent=2))

print(f"\nCookies saved to parsehub_cookies.json")
print("Import these cookies in ParseHub project settings")

Using the ParseHub API with Pre-Solved Cookies

import requests
import json


def run_parsehub_with_cookies(project_token, api_key, cookies,
                              start_url="https://example.com/dashboard"):
    """Start a ParseHub run with pre-authenticated cookies.

    Args:
        project_token: ParseHub project token, inserted into the run URL.
        api_key: ParseHub API key, sent as the ``api_key`` form field.
        cookies: Mapping of cookie name to value; serialized as a single
            ``name=value; name=value`` string.
        start_url: URL the run should start from. Defaults to the example
            dashboard URL so existing three-argument callers are unchanged.

    Returns:
        The decoded JSON body of the response when the HTTP status is 200,
        otherwise ``None`` (the status code is printed).
    """
    # Format cookies for ParseHub
    cookie_string = "; ".join(f"{k}={v}" for k, v in cookies.items())

    # NOTE(review): confirm against the ParseHub API reference that the run
    # endpoint honours a ``cookies`` form field -- not verifiable from here.
    resp = requests.post(
        f"https://www.parsehub.com/api/v2/projects/{project_token}/run",
        data={
            "api_key": api_key,
            "start_url": start_url,
            "cookies": cookie_string,
        },
        timeout=30,
    )

    # Guard clause: report the failure and bail out early
    if resp.status_code != 200:
        print(f"Failed to start run: {resp.status_code}")
        return None

    run_data = resp.json()
    print(f"ParseHub run started: {run_data.get('run_token')}")
    return run_data


def get_parsehub_results(run_token, api_key):
    """Fetch the data of a ParseHub run and return the decoded JSON body.

    The run is identified by ``run_token``; ``api_key`` and ``format=json``
    are sent as query parameters. The request times out after 60 seconds.
    """
    endpoint = f"https://www.parsehub.com/api/v2/runs/{run_token}/data"
    query = {"api_key": api_key, "format": "json"}
    response = requests.get(endpoint, params=query, timeout=60)
    return response.json()

Scheduled Workflow

import schedule
import time


def captcha_then_parsehub():
    """Log in behind the CAPTCHA, then start a ParseHub run with the cookies.

    Prints the start time (HH:MM) when ParseHub accepts the run; prints
    nothing extra and returns ``None`` either way.
    """
    # Step 1: obtain fresh session cookies via the CAPTCHA-protected login
    login_details = {"username": "user", "password": "pass"}
    fresh_cookies = pre_authenticate(
        api_key="YOUR_CAPTCHAAI_KEY",
        login_url="https://example.com/login",
        sitekey="SITEKEY",
        credentials=login_details,
    )

    # Step 2: hand those cookies to ParseHub and start the run
    run_info = run_parsehub_with_cookies(
        project_token="PARSEHUB_PROJECT_TOKEN",
        api_key="PARSEHUB_API_KEY",
        cookies=fresh_cookies,
    )

    # Nothing to report when the run could not be started
    if not run_info:
        return

    started_at = time.strftime('%H:%M')
    print(f"Extraction started at {started_at}")


# Register the workflow to fire every day at 06:00
daily_job = schedule.every().day.at("06:00")
daily_job.do(captcha_then_parsehub)

# Keep the process alive, checking once a minute for due jobs
while True:
    schedule.run_pending()
    time.sleep(60)

FAQ

Can ParseHub solve CAPTCHAs on its own?

ParseHub has limited CAPTCHA handling for simple image CAPTCHAs. For reCAPTCHA, Turnstile, or other modern CAPTCHAs, use CaptchaAI to pre-solve and pass cookies.

How long do pre-solved cookies last?

Session cookies typically last 30 minutes to 24 hours. Run the pre-authentication script before each ParseHub extraction for reliability.

Should I switch from ParseHub to a coded solution?

If most of your target sites use CAPTCHAs, a Python script with CaptchaAI is more reliable. Use ParseHub for simple sites that rarely show CAPTCHAs.



Handle CAPTCHAs in ParseHub — try CaptchaAI.

Full Working Code

Complete runnable examples for this article in Python, Node.js, PHP, Go, Java, C#, Ruby, Rust, Kotlin & Bash.

View on GitHub →

Discussions (0)

No comments yet.