CAPTCHA Handling for Salary and Compensation Data Collection

Salary databases, job boards, and government labor portals protect compensation data with Cloudflare Turnstile and reCAPTCHA. CAPTCHAs trigger when querying salary ranges by role, location, or industry — especially during bulk data collection across many job titles. Here's how to handle them.

CAPTCHA Patterns on Salary Portals

Source type	CAPTCHA	Trigger
Salary comparison sites	Cloudflare Turnstile	Repeated search queries
Job board salary filters	reCAPTCHA v2	Multiple salary lookups
Government labor statistics	Image CAPTCHA	Data download requests
Corporate salary pages	Cloudflare Challenge	Bulk page views
HR survey platforms	reCAPTCHA v3	Form submissions

Salary Data Collector (Python)

import requests
import time
import re
from dataclasses import dataclass

@dataclass
class SalaryRecord:
    """One salary entry parsed from a single result row of a portal page.

    Instances are built by ``SalaryCollector._parse_salary_data``; the
    per-field notes below describe how that parser fills them in.
    """

    # Job title text from the row's ".job-title"/".title" element.
    title: str
    # Location text from the row's ".location" element.
    location: str
    # Lower bound, parsed from ".min-salary"/".low" with non-numeric characters removed.
    min_salary: float
    # Upper bound, parsed from ".max-salary"/".high" the same way.
    max_salary: float
    # Median value, parsed from ".median"/".mid" the same way.
    median_salary: float
    # Count read from ".count"/".sample" with thousands separators removed.
    sample_size: int
    # Identifier of where the row came from; the collector passes the portal URL.
    source: str

class SalaryCollector:
    """Collect salary records from a portal, solving Cloudflare Turnstile on demand.

    Requests to the portal go through one ``requests.Session`` so cookies set
    after a solved challenge are reused by later queries. Turnstile tokens are
    obtained from the CaptchaAI ``in.php`` / ``res.php`` endpoints.
    """

    _SUBMIT_URL = "https://ocr.captchaai.com/in.php"
    _RESULT_URL = "https://ocr.captchaai.com/res.php"
    # Value of "request" while the solver is still working on a task.
    _NOT_READY = "CAPCHA_NOT_READY"
    # First numeric token in a string: "120,000", "95500.50" or ".5".
    _AMOUNT_RE = re.compile(r"\d[\d,]*(?:\.\d+)?|\.\d+")

    def __init__(self, api_key, timeout=30):
        """Create a collector.

        Args:
            api_key: CaptchaAI API key sent with every solver request.
            timeout: Seconds to wait on each HTTP request before giving up;
                without it a stalled connection would block forever.
        """
        self.api_key = api_key
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })

    def collect_salary_data(self, portal_url, job_title, location):
        """Search for salary data, solving CAPTCHAs as needed.

        Args:
            portal_url: Search endpoint of the salary portal.
            job_title: Value sent as the ``title`` query parameter.
            location: Value sent as the ``location`` query parameter.

        Returns:
            A list of ``SalaryRecord`` parsed from the response page.

        Raises:
            ValueError: The page looks like a challenge but has no sitekey.
            RuntimeError: The solver reported an error.
            TimeoutError: The solver did not finish in time.
        """
        query = {"title": job_title, "location": location}
        response = self.session.get(
            portal_url, params=query, timeout=self.timeout
        )

        if self._is_turnstile_challenge(response):
            response = self._solve_turnstile_and_retry(
                response, portal_url, params=query
            )

        return self._parse_salary_data(response.text, portal_url)

    def collect_bulk(self, portal_url, job_titles, locations):
        """Collect salary data for multiple job title + location combos.

        Failures for one combination are printed and skipped so the rest of
        the run continues. Returns the concatenated list of records.
        """
        results = []

        for title in job_titles:
            for location in locations:
                try:
                    results.extend(
                        self.collect_salary_data(portal_url, title, location)
                    )
                except Exception as e:
                    print(f"Failed for {title} in {location}: {e}")
                finally:
                    # Respectful delay between requests; also applied after a
                    # failure so errors do not turn into rapid-fire retries.
                    time.sleep(2)

        return results

    def _is_turnstile_challenge(self, response):
        """Return True when the response is a 403 or contains Turnstile markers."""
        return (
            response.status_code == 403 or
            "cf-turnstile" in response.text or
            "challenges.cloudflare.com" in response.text
        )

    def _solve_turnstile_and_retry(self, response, url, params=None):
        """Solve the Turnstile widget found in ``response`` and re-submit.

        Args:
            response: The challenge response; its HTML must contain the
                ``data-sitekey`` attribute.
            url: Page URL reported to the solver and target of the retry POST.
            params: Optional mapping of the original search fields. They are
                sent along with the token so the retry carries the same query.

        Returns:
            The portal's response to the POST that carries the token.

        Raises:
            ValueError: No sitekey in the page.
            RuntimeError: The solver rejected the task or failed to solve it.
            TimeoutError: No token after 60 polls, 3 seconds apart.
        """
        match = re.search(r'data-sitekey="(0x[^"]+)"', response.text)
        if not match:
            raise ValueError("Turnstile sitekey not found")

        submitted = requests.post(self._SUBMIT_URL, data={
            "key": self.api_key,
            "method": "turnstile",
            "sitekey": match.group(1),
            "pageurl": url,
            "json": 1
        }, timeout=self.timeout).json()
        # On failure "request" holds an error code, not a task id.
        if submitted.get("status") != 1:
            raise RuntimeError(
                f"CaptchaAI rejected the task: {submitted.get('request')}"
            )
        task_id = submitted["request"]

        for _ in range(60):
            time.sleep(3)
            data = requests.get(self._RESULT_URL, params={
                "key": self.api_key,
                "action": "get",
                "id": task_id,
                "json": 1
            }, timeout=self.timeout).json()

            if data.get("status") == 1:
                form = dict(params or {})
                form["cf-turnstile-response"] = data["request"]
                return self.session.post(url, data=form, timeout=self.timeout)

            # Anything other than "not ready" is a terminal solver error;
            # polling further would only waste the remaining attempts.
            if data.get("request") != self._NOT_READY:
                raise RuntimeError(
                    f"CaptchaAI failed to solve the task: {data.get('request')}"
                )

        raise TimeoutError("Turnstile solve timed out")

    def _parse_salary_data(self, html, source):
        """Parse result rows from ``html`` into ``SalaryRecord`` objects.

        Missing cells fall back to ""/0.0/0. A row whose sample size is not
        an integer is skipped. ``source`` is stored on every record.
        """
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")
        records = []

        for row in soup.select(".salary-row, .compensation-entry, tr[data-salary]"):
            count_text = self._node_text(row, ".count, .sample") or ""
            try:
                sample_size = int(count_text.replace(",", "").strip() or 0)
            except ValueError:
                continue

            records.append(SalaryRecord(
                title=(self._node_text(row, ".job-title, .title") or "").strip(),
                location=(self._node_text(row, ".location") or "").strip(),
                min_salary=self._parse_amount(
                    self._node_text(row, ".min-salary, .low")
                ),
                max_salary=self._parse_amount(
                    self._node_text(row, ".max-salary, .high")
                ),
                median_salary=self._parse_amount(
                    self._node_text(row, ".median, .mid")
                ),
                sample_size=sample_size,
                source=source
            ))

        return records

    @staticmethod
    def _node_text(row, selector):
        """Return the text of the first node matching ``selector``, or None."""
        node = row.select_one(selector)
        return node.text if node is not None else None

    def _parse_amount(self, text):
        """Return the first number found in ``text`` as a float.

        Thousands separators are dropped. Returns 0.0 when ``text`` is empty,
        None, or contains no digits.
        """
        if not text:
            return 0.0
        found = self._AMOUNT_RE.search(text)
        if not found:
            return 0.0
        return float(found.group().replace(",", ""))


# Usage: query every title/location pair on one portal, then print one
# summary line per collected record.
collector = SalaryCollector("YOUR_API_KEY")
data = collector.collect_bulk(
    "https://salary.example.com/search",
    job_titles=["Software Engineer", "Data Analyst", "Product Manager"],
    locations=["San Francisco", "New York", "Austin"],
)

for record in data:
    pay_band = f"${record.min_salary:,.0f}–${record.max_salary:,.0f}"
    median = f"${record.median_salary:,.0f}"
    print(f"{record.title} in {record.location}: {pay_band} (median: {median})")

Multi-Source Aggregation (JavaScript)

/**
 * Queries several salary portals for one role and combines their medians.
 * Cloudflare Turnstile challenges are solved through the CaptchaAI API.
 */
class SalaryAggregator {
  /**
   * @param {string} apiKey CaptchaAI API key sent with every solver request.
   */
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.sources = [];
  }

  /**
   * Register a portal to query.
   * @param {string} name Label reported in the results.
   * @param {string} searchUrl Search endpoint taking `title` and `location`.
   */
  addSource(name, searchUrl) {
    this.sources.push({ name, searchUrl });
  }

  /**
   * Query every registered source in turn and aggregate the outcome.
   * A failing source is recorded with its error message instead of aborting.
   * @returns {Promise<object|null>} Aggregate, or null when no source
   *   produced a usable median.
   */
  async collectForRole(jobTitle, location) {
    const results = [];

    for (const source of this.sources) {
      try {
        const data = await this.querySource(source, jobTitle, location);
        results.push({ source: source.name, ...data });
      } catch (error) {
        results.push({ source: source.name, error: error.message });
      }
    }

    return this.aggregateResults(results, jobTitle, location);
  }

  /**
   * Fetch one source; hand over to solveAndRetry when the response is a
   * 403 or contains the Turnstile widget.
   */
  async querySource(source, jobTitle, location) {
    const url = `${source.searchUrl}?title=${encodeURIComponent(jobTitle)}&location=${encodeURIComponent(location)}`;
    const response = await fetch(url);
    const html = await response.text();

    if (html.includes('cf-turnstile') || response.status === 403) {
      return this.solveAndRetry(source.searchUrl, html, jobTitle, location);
    }

    return this.parseSalaryData(html);
  }

  /**
   * Solve the Turnstile widget found in `html`, then POST the token together
   * with the original search fields and parse the resulting page.
   * @throws {Error} When the sitekey is missing, the solver reports an error,
   *   or no token arrives after 60 polls spaced 3 seconds apart.
   */
  async solveAndRetry(baseUrl, html, jobTitle, location) {
    const match = html.match(/data-sitekey="(0x[^"]+)"/);
    if (!match) throw new Error('Turnstile sitekey not found');

    const submitResp = await fetch('https://ocr.captchaai.com/in.php', {
      method: 'POST',
      body: new URLSearchParams({
        key: this.apiKey,
        method: 'turnstile',
        sitekey: match[1],
        pageurl: baseUrl,
        json: '1'
      })
    });
    const submitted = await submitResp.json();
    // On failure `request` holds an error code, not a task id.
    if (submitted.status !== 1) {
      throw new Error(`CaptchaAI rejected the task: ${submitted.request}`);
    }
    const taskId = submitted.request;

    // URLSearchParams escapes the key and id instead of raw interpolation.
    const pollQuery = new URLSearchParams({
      key: this.apiKey,
      action: 'get',
      id: String(taskId),
      json: '1'
    });

    for (let i = 0; i < 60; i++) {
      await new Promise(r => setTimeout(r, 3000));
      const result = await fetch(`https://ocr.captchaai.com/res.php?${pollQuery}`);
      const data = await result.json();

      if (data.status === 1) {
        const response = await fetch(baseUrl, {
          method: 'POST',
          body: new URLSearchParams({
            'cf-turnstile-response': data.request,
            title: jobTitle,
            location: location
          })
        });
        return this.parseSalaryData(await response.text());
      }

      // Anything other than "not ready" is a terminal solver error.
      if (data.request !== 'CAPCHA_NOT_READY') {
        throw new Error(`CaptchaAI failed to solve the task: ${data.request}`);
      }
    }
    throw new Error('Turnstile solve timed out');
  }

  /**
   * Extract the first min / max / median amounts from a result page.
   * Looks for elements whose class list contains the listed names and reads
   * their leading text; a value that is absent or non-numeric becomes null.
   * Regex-based so it needs no DOM; override it for portals with other markup.
   * @param {string} html
   * @returns {{min: number|null, max: number|null, median: number|null}}
   */
  parseSalaryData(html) {
    const readAmount = (classNames) => {
      // Match a whole class token, so "mid" does not hit "mid-level".
      const pattern = new RegExp(
        `class="(?:[^"]*\\s)?(?:${classNames.join('|')})(?:\\s[^"]*)?"[^>]*>\\s*([^<]+)`,
        'i'
      );
      const found = html.match(pattern);
      if (!found) return null;
      const value = parseFloat(found[1].replace(/[^\d.]/g, ''));
      return Number.isFinite(value) ? value : null;
    };

    return {
      min: readAmount(['min-salary', 'low']),
      max: readAmount(['max-salary', 'high']),
      median: readAmount(['median', 'mid'])
    };
  }

  /**
   * Average the medians of all sources that returned one.
   * @returns {object|null} null when no result has a truthy `median`.
   */
  aggregateResults(results, jobTitle, location) {
    const valid = results.filter(r => !r.error && r.median);
    if (valid.length === 0) return null;

    const medians = valid.map(r => r.median);
    return {
      jobTitle,
      location,
      avgMedian: medians.reduce((a, b) => a + b, 0) / medians.length,
      sources: valid.length,
      range: { min: Math.min(...medians), max: Math.max(...medians) }
    };
  }
}

// Usage
const aggregator = new SalaryAggregator('YOUR_API_KEY');
aggregator.addSource('SalaryDB', 'https://salarydb.example.com/search');
aggregator.addSource('PayScale', 'https://payscale.example.com/lookup');

const result = await aggregator.collectForRole('Software Engineer', 'San Francisco');
// collectForRole resolves to null when no source returned a usable median,
// so guard before reading fields off the result.
if (result) {
  console.log(`Median salary: $${result.avgMedian.toLocaleString()} (${result.sources} sources)`);
} else {
  console.log('No salary data available from any source');
}

Data Collection Strategy

Approach	Volume per day	CAPTCHA frequency	Best for
Sequential with delays	100–500 queries	Low	Small surveys
Proxy rotation	500–2,000 queries	Moderate	Regional analysis
Multi-session parallel	2,000–10,000 queries	High	Comprehensive datasets

Troubleshooting

Issue	Cause	Fix
Turnstile on every search	Session expired	Persist cf_clearance cookie
Salary data shows "Login required"	Portal requires authentication	Authenticate before searching
Empty results after CAPTCHA solve	POST parameters missing	Include all hidden form fields
Data inconsistent between runs	Portal shows different ranges	Use consistent query parameters

FAQ

How many salary queries can I make per day?

It depends on the portal's rate limits, not on CaptchaAI. CaptchaAI solves Turnstile with a 100% success rate. Space requests 2–5 seconds apart and rotate proxies for high-volume collection.

Should I use proxies for salary data collection?

Yes, especially for bulk collection across thousands of job titles. Residential proxies reduce CAPTCHA frequency significantly compared to datacenter IPs.

Can I collect real-time salary data?

Most salary portals update data monthly or quarterly, so real-time collection is unnecessary. Schedule weekly or monthly collection runs for comprehensive datasets.