Use Cases

CAPTCHA Scraping with Node.js: Complete Tutorial

Node.js excels at I/O-heavy scraping workloads. When target sites serve CAPTCHAs, CaptchaAI's API solves them while your script handles the HTTP requests. This tutorial covers the complete workflow using axios and cheerio.

Requirements

Requirement Details
Node.js 16+ With npm
axios npm install axios
cheerio npm install cheerio
CaptchaAI API key From captchaai.com

The CaptchaAI Solver Module

// captcha-solver.js
const axios = require("axios");

class CaptchaSolver {
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.baseUrl = "https://ocr.captchaai.com";
  }

  async _submit(params) {
    params.key = this.apiKey;
    const resp = await axios.get(`${this.baseUrl}/in.php`, { params });
    if (!resp.data.startsWith("OK|")) {
      throw new Error(`Submit error: ${resp.data}`);
    }
    return resp.data.split("|")[1];
  }

  async _poll(taskId, timeout = 300000) {
    const deadline = Date.now() + timeout;
    while (Date.now() < deadline) {
      await new Promise((r) => setTimeout(r, 5000));
      const resp = await axios.get(`${this.baseUrl}/res.php`, {
        params: { key: this.apiKey, action: "get", id: taskId },
      });
      if (resp.data === "CAPCHA_NOT_READY") continue;
      if (resp.data.startsWith("OK|")) return resp.data.split("|")[1];
      throw new Error(`Solve error: ${resp.data}`);
    }
    throw new Error("Solve timed out");
  }

  async solveRecaptchaV2(siteKey, pageUrl) {
    const taskId = await this._submit({
      method: "userrecaptcha",
      googlekey: siteKey,
      pageurl: pageUrl,
    });
    return this._poll(taskId);
  }

  async solveRecaptchaV3(siteKey, pageUrl, action = "verify") {
    const taskId = await this._submit({
      method: "userrecaptcha",
      googlekey: siteKey,
      pageurl: pageUrl,
      version: "v3",
      action,
    });
    return this._poll(taskId);
  }

  async solveTurnstile(siteKey, pageUrl) {
    const taskId = await this._submit({
      method: "turnstile",
      sitekey: siteKey,
      pageurl: pageUrl,
    });
    return this._poll(taskId);
  }
}

module.exports = CaptchaSolver;

Scraping a reCAPTCHA-Protected Page

const axios = require("axios");
const cheerio = require("cheerio");
const CaptchaSolver = require("./captcha-solver");

const solver = new CaptchaSolver("YOUR_API_KEY");

async function scrapeProtectedPage(url) {
  // Step 1: Load the page
  const { data: html } = await axios.get(url, {
    headers: {
      "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    },
  });

  const $ = cheerio.load(html);

  // Step 2: Extract site key
  const siteKey = $(".g-recaptcha").attr("data-sitekey");
  if (!siteKey) {
    console.log("No CAPTCHA found, page loaded directly");
    return html;
  }

  console.log("Site key found:", siteKey);

  // Step 3: Solve the CAPTCHA
  const token = await solver.solveRecaptchaV2(siteKey, url);
  console.log("Token received:", token.substring(0, 50));

  // Step 4: Submit with the token
  const result = await axios.post(
    url,
    new URLSearchParams({
      "g-recaptcha-response": token,
      q: "search query",
    }),
    {
      headers: {
        "Content-Type": "application/x-www-form-urlencoded",
        "User-Agent":
          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
      },
    }
  );

  return result.data;
}

Scraping Multiple Pages Concurrently

async function scrapePages(urls, siteKey, concurrency = 3) {
  const results = [];
  const queue = [...urls];

  const worker = async () => {
    while (queue.length > 0) {
      const url = queue.shift();
      try {
        const token = await solver.solveRecaptchaV2(siteKey, url);
        const { data } = await axios.post(
          url,
          new URLSearchParams({ "g-recaptcha-response": token }),
          {
            headers: {
              "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            },
          }
        );
        results.push({ url, data, success: true });
        console.log(`Scraped: ${url}`);
      } catch (err) {
        results.push({ url, error: err.message, success: false });
        console.error(`Failed: ${url} - ${err.message}`);
      }
    }
  };

  // Run workers concurrently
  const workers = Array(concurrency)
    .fill(null)
    .map(() => worker());
  await Promise.all(workers);

  return results;
}

// Usage
const urls = [
  "https://example.com/page/1",
  "https://example.com/page/2",
  "https://example.com/page/3",
];
const results = await scrapePages(urls, "6Le-wvkS...", 3);

Handling Cookies and Sessions

Use axios with cookie persistence for sites that require session cookies:

const { wrapper } = require("axios-cookiejar-support");
const { CookieJar } = require("tough-cookie");

const jar = new CookieJar();
const client = wrapper(
  axios.create({
    jar,
    headers: {
      "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    },
  })
);

async function scrapeWithSession(url, siteKey) {
  // Initial page load sets cookies
  await client.get(url);

  // Solve CAPTCHA
  const token = await solver.solveRecaptchaV2(siteKey, url);

  // Submit with maintained cookies
  const result = await client.post(
    url,
    new URLSearchParams({ "g-recaptcha-response": token })
  );

  return result.data;
}

Parsing Results with Cheerio

function parseResults(html) {
  const $ = cheerio.load(html);
  const items = [];

  $(".result-item").each((_, el) => {
    items.push({
      title: $(el).find(".title").text().trim(),
      url: $(el).find("a").attr("href"),
      description: $(el).find(".description").text().trim(),
    });
  });

  return items;
}

Troubleshooting

Issue Cause Fix
CAPTCHA_NOT_READY loops indefinitely Wrong site key or slow solve Verify site key; increase timeout
403 Forbidden on POST Missing cookies or headers Use session cookies; add Referer header
Cheerio can't find elements Dynamic content Use Puppeteer for JS-rendered sites
ECONNREFUSED Rate limited by target site Add delays; rotate proxies

FAQ

When should I use Puppeteer instead of axios?

Use axios + cheerio when the target site returns HTML with standard form submissions. Use Puppeteer when the site requires JavaScript execution, dynamic rendering, or complex user interactions.

Can I solve multiple CAPTCHAs at the same time?

Yes. Submit multiple CAPTCHA tasks to CaptchaAI concurrently and poll for each result. The concurrent scraping example above demonstrates this pattern.

How do I handle Cloudflare-protected sites?

If the site uses Turnstile, use solver.solveTurnstile(). For full Cloudflare challenge pages, use Cloudflare Challenge solving which returns cf_clearance cookies.

Full Working Code

Complete runnable examples for this article in Python, Node.js, PHP, Go, Java, C#, Ruby, Rust, Kotlin & Bash.

View on GitHub →

Discussions (0)

No comments yet.