Tutorials

MongoDB for CAPTCHA Solve History and Analytics

MongoDB's flexible schema and aggregation framework make it a strong fit for CAPTCHA solve tracking. Store every solve attempt with metadata, then query patterns across time, CAPTCHA types, and error rates.

Why MongoDB for CAPTCHA Data

CAPTCHA solve records have variable fields depending on the type — reCAPTCHA needs googlekey, hCaptcha needs sitekey, image CAPTCHAs need body. MongoDB's schemaless documents handle these naturally without schema migrations.

Document Schema

{
  "_id": "ObjectId",
  "captcha_id": "12345678",
  "type": "recaptcha_v2",
  "method": "userrecaptcha",
  "sitekey": "6Le-wvkSAAAAAPBMRTvw0Q4Muexq9bi0DJwx_mJ-",
  "pageurl": "https://example.com/form",
  "status": "solved",
  "solution": "03AGdBq26...",
  "error": null,
  "submitted_at": "2026-04-04T10:15:30.000Z",
  "solved_at": "2026-04-04T10:15:45.000Z",
  "elapsed_ms": 15000,
  "polls": 3,
  "proxy_used": true,
  "cost": 0.00299,
  "metadata": {
    "project": "price-monitor",
    "worker_id": "worker-3",
    "target_domain": "example.com"
  }
}

Python Implementation

Setup and Connection

import os
import time
from datetime import datetime, timezone
from pymongo import MongoClient, ASCENDING, DESCENDING
import requests

# Settings come from the environment. MONGO_URI falls back to a local
# instance; CAPTCHAAI_API_KEY has no fallback, so a missing key raises
# KeyError as soon as this line runs.
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://localhost:27017")
API_KEY = os.environ["CAPTCHAAI_API_KEY"]

# Module-level handles: one client, the "captcha_tracking" database, and the
# "solves" collection that holds one document per solve attempt.
client = MongoClient(MONGO_URI)
db = client["captcha_tracking"]
solves = db["solves"]

Create Indexes

def setup_indexes():
    """Create the indexes used by the analytics queries and retention policy.

    Creates a compound index on ``type`` + ``status``, single-field indexes on
    ``metadata.project`` and ``metadata.target_domain``, and a TTL index on
    ``submitted_at`` that removes documents 90 days after submission.
    """
    solves.create_index([("type", ASCENDING), ("status", ASCENDING)])
    solves.create_index([("metadata.project", ASCENDING)])
    solves.create_index([("metadata.target_domain", ASCENDING)])
    # The TTL index also serves the time-range filters and sorts on
    # submitted_at: a single-field index can be walked in either direction,
    # so a second, descending index on the same field would only add write
    # and storage overhead.
    solves.create_index(
        [("submitted_at", ASCENDING)],
        expireAfterSeconds=90 * 24 * 3600,  # Auto-delete after 90 days
        name="ttl_cleanup",
    )


setup_indexes()

Solve and Store

def solve_and_store(sitekey, pageurl, captcha_type="recaptcha_v2", metadata=None):
    """Solve a CAPTCHA through CaptchaAI and record the attempt in MongoDB.

    A document is inserted before the task is submitted and then updated as
    the attempt moves from ``submitted`` to ``polling`` and finally to
    ``solved``, ``error`` or ``timeout``, so failed attempts stay visible to
    the analytics queries.

    Args:
        sitekey: Site key of the CAPTCHA; sent to the API as ``googlekey``.
        pageurl: URL of the page that shows the CAPTCHA.
        captcha_type: Label stored in the document's ``type`` field. It does
            not change the API call, which always uses ``userrecaptcha``.
        metadata: Optional dict stored under ``metadata``; an empty dict is
            stored when omitted.

    Returns:
        The solution token, or ``None`` when the API reports an error, an
        HTTP request fails, or no result arrives within 60 polls spaced
        5 seconds apart.
    """
    # requests waits indefinitely unless a timeout is given; without one a
    # stalled connection would hang the caller and leave the document stuck
    # in "submitted" or "polling".
    request_timeout = 30

    record = {
        "type": captcha_type,
        "method": "userrecaptcha",
        "sitekey": sitekey,
        "pageurl": pageurl,
        "status": "submitted",
        "submitted_at": datetime.now(timezone.utc),
        "metadata": metadata or {},
    }
    doc_id = solves.insert_one(record).inserted_id

    def _update(fields):
        """Apply a $set with the given fields to this attempt's document."""
        solves.update_one({"_id": doc_id}, {"$set": fields})

    # Submit to CaptchaAI
    try:
        resp = requests.post(
            "https://ocr.captchaai.com/in.php",
            data={
                "key": API_KEY,
                "method": "userrecaptcha",
                "googlekey": sitekey,
                "pageurl": pageurl,
                "json": 1,
            },
            timeout=request_timeout,
        )
        data = resp.json()
    except (requests.RequestException, ValueError) as exc:
        # Store only the exception class name: the message text can embed
        # the request URL, and the polling URL carries the API key.
        _update({"status": "error", "error": type(exc).__name__})
        return None

    if data.get("status") != 1:
        _update({"status": "error", "error": data.get("request")})
        return None

    captcha_id = data["request"]
    _update({"captcha_id": captcha_id, "status": "polling"})

    # Poll for result
    polls = 0
    for polls in range(1, 61):
        time.sleep(5)
        try:
            poll_resp = requests.get(
                "https://ocr.captchaai.com/res.php",
                params={
                    "key": API_KEY,
                    "action": "get",
                    "id": captcha_id,
                    "json": 1,
                },
                timeout=request_timeout,
            ).json()
        except (requests.RequestException, ValueError) as exc:
            _update({
                "status": "error",
                "error": type(exc).__name__,
                "polls": polls,
            })
            return None

        if poll_resp.get("status") == 1:
            solved_at = datetime.now(timezone.utc)
            elapsed_ms = int(
                (solved_at - record["submitted_at"]).total_seconds() * 1000
            )
            _update({
                "status": "solved",
                "solution": poll_resp["request"],
                "solved_at": solved_at,
                "elapsed_ms": elapsed_ms,
                "polls": polls,
            })
            return poll_resp["request"]

        # Anything other than "not ready yet" is a terminal API error.
        if poll_resp.get("request") != "CAPCHA_NOT_READY":
            _update({
                "status": "error",
                "error": poll_resp.get("request"),
                "polls": polls,
            })
            return None

    _update({"status": "timeout", "polls": polls})
    return None

Analytics Queries

def get_success_rate(hours=24):
    """Return the percentage of recent attempts whose status is "solved".

    Considers documents whose ``submitted_at`` falls within the last
    ``hours`` hours. Returns 0 when the window contains no documents.
    """
    from datetime import timedelta

    window_start = datetime.now(timezone.utc) - timedelta(hours=hours)

    # One row per distinct status, each carrying its document count.
    status_counts = {}
    for row in solves.aggregate([
        {"$match": {"submitted_at": {"$gte": window_start}}},
        {"$group": {
            "_id": "$status",
            "count": {"$sum": 1}
        }}
    ]):
        status_counts[row["_id"]] = row["count"]

    attempts = sum(status_counts.values())
    if not attempts:
        return 0
    return status_counts.get("solved", 0) / attempts * 100


def get_avg_solve_time_by_type():
    """Return solve-time statistics per CAPTCHA type.

    Only documents with status "solved" are included. Each result row holds
    the type as ``_id`` plus the average, minimum and maximum ``elapsed_ms``
    and the number of solves, ordered by that count, largest first.
    """
    solved_only = {"$match": {"status": "solved"}}
    stats_per_type = {"$group": {
        "_id": "$type",
        "avg_time_ms": {"$avg": "$elapsed_ms"},
        "min_time_ms": {"$min": "$elapsed_ms"},
        "max_time_ms": {"$max": "$elapsed_ms"},
        "count": {"$sum": 1}
    }}
    most_solves_first = {"$sort": {"count": -1}}

    cursor = solves.aggregate([solved_only, stats_per_type, most_solves_first])
    return list(cursor)


def get_hourly_solve_volume(days=7):
    """Return per-hour attempt counts for the last ``days`` days.

    Each row is keyed by ``_id.date`` (``YYYY-MM-DD``) and ``_id.hour`` and
    carries ``total`` (all attempts) and ``solved`` (attempts with status
    "solved"). Rows are sorted by date, then hour, ascending.
    """
    from datetime import timedelta

    window_start = datetime.now(timezone.utc) - timedelta(days=days)

    hour_bucket = {
        "date": {"$dateToString": {"format": "%Y-%m-%d", "date": "$submitted_at"}},
        "hour": {"$hour": "$submitted_at"}
    }
    # Contributes 1 for solved documents and 0 for every other status.
    solved_flag = {"$cond": [{"$eq": ["$status", "solved"]}, 1, 0]}

    stages = [
        {"$match": {"submitted_at": {"$gte": window_start}}},
        {"$group": {
            "_id": hour_bucket,
            "total": {"$sum": 1},
            "solved": {"$sum": solved_flag}
        }},
        {"$sort": {"_id.date": 1, "_id.hour": 1}}
    ]
    return list(solves.aggregate(stages))


def get_error_breakdown(hours=24):
    """Return how often each error value occurred in the last ``hours`` hours.

    Only documents with status "error" are counted. Each row holds the
    stored ``error`` value as ``_id`` and its ``count``, most frequent first.
    """
    from datetime import timedelta

    window_start = datetime.now(timezone.utc) - timedelta(hours=hours)

    recent_errors = {"$match": {
        "submitted_at": {"$gte": window_start},
        "status": "error"
    }}
    count_per_error = {"$group": {"_id": "$error", "count": {"$sum": 1}}}
    most_frequent_first = {"$sort": {"count": -1}}

    cursor = solves.aggregate([recent_errors, count_per_error, most_frequent_first])
    return list(cursor)

JavaScript Implementation

const { MongoClient } = require("mongodb");
const axios = require("axios");

// Settings come from the environment. MONGO_URI falls back to a local
// instance; API_KEY is undefined when CAPTCHAAI_API_KEY is not set.
const MONGO_URI = process.env.MONGO_URI || "mongodb://localhost:27017";
const API_KEY = process.env.CAPTCHAAI_API_KEY;

// Assigned by connect(); both are undefined until it has resolved.
let db, solves;

/**
 * Connect to MongoDB, bind the module-level `db` and `solves` handles, and
 * create the indexes used by the analytics queries and retention policy.
 *
 * @returns {Promise<MongoClient>} The connected client, so the caller can
 *   close it when finished; an open client keeps the connection alive.
 */
async function connect() {
  const client = await MongoClient.connect(MONGO_URI);
  db = client.db("captcha_tracking");
  solves = db.collection("solves");

  await solves.createIndex({ type: 1, status: 1 });
  await solves.createIndex({ "metadata.project": 1 });
  await solves.createIndex({ "metadata.target_domain": 1 });
  // The TTL index also serves time-range filters and sorts on submitted_at:
  // a single-field index can be walked in either direction, so a separate
  // descending index on the same field would only add write overhead.
  await solves.createIndex(
    { submitted_at: 1 },
    { expireAfterSeconds: 90 * 24 * 3600 } // Auto-delete after 90 days
  );

  return client;
}

/**
 * Solve a CAPTCHA through CaptchaAI and record the attempt in MongoDB.
 *
 * A document is inserted before the task is submitted and then updated as
 * the attempt moves from "submitted" to "polling" and finally to "solved",
 * "error" or "timeout".
 *
 * @param {string} sitekey Site key of the CAPTCHA; sent as `googlekey`.
 * @param {string} pageurl URL of the page that shows the CAPTCHA.
 * @param {string} [type="recaptcha_v2"] Label stored in the `type` field; it
 *   does not change the API call, which always uses `userrecaptcha`.
 * @param {object} [metadata={}] Stored under `metadata`.
 * @returns {Promise<string|null>} The solution token, or null when the API
 *   reports an error, an HTTP request fails, or no result arrives within
 *   60 polls spaced 5 seconds apart.
 */
async function solveAndStore(sitekey, pageurl, type = "recaptcha_v2", metadata = {}) {
  // axios has no timeout by default; without one a stalled connection would
  // hang the caller and leave the document stuck in "submitted" or "polling".
  const REQUEST_TIMEOUT_MS = 30000;

  const submittedAt = new Date();
  const { insertedId } = await solves.insertOne({
    type, method: "userrecaptcha", sitekey, pageurl,
    status: "submitted", submitted_at: submittedAt, metadata,
  });

  const update = (fields) => solves.updateOne({ _id: insertedId }, { $set: fields });

  let submit;
  try {
    submit = await axios.post("https://ocr.captchaai.com/in.php", null, {
      params: { key: API_KEY, method: "userrecaptcha", googlekey: sitekey, pageurl, json: 1 },
      timeout: REQUEST_TIMEOUT_MS,
    });
  } catch (err) {
    // Store the short error code rather than the full error object, whose
    // config holds the request parameters, including the API key.
    await update({ status: "error", error: err.code || err.name });
    return null;
  }

  if (submit.data.status !== 1) {
    await update({ status: "error", error: submit.data.request });
    return null;
  }

  const captchaId = submit.data.request;
  await update({ captcha_id: captchaId, status: "polling" });

  let polls = 0;
  for (let i = 0; i < 60; i++) {
    await new Promise((r) => setTimeout(r, 5000));
    polls++;

    let poll;
    try {
      poll = await axios.get("https://ocr.captchaai.com/res.php", {
        params: { key: API_KEY, action: "get", id: captchaId, json: 1 },
        timeout: REQUEST_TIMEOUT_MS,
      });
    } catch (err) {
      await update({ status: "error", error: err.code || err.name, polls });
      return null;
    }

    if (poll.data.status === 1) {
      const solvedAt = new Date();
      await update({
        status: "solved", solution: poll.data.request,
        solved_at: solvedAt, elapsed_ms: solvedAt - submittedAt, polls,
      });
      return poll.data.request;
    }
    // Anything other than "not ready yet" is a terminal API error.
    if (poll.data.request !== "CAPCHA_NOT_READY") {
      await update({ status: "error", error: poll.data.request, polls });
      return null;
    }
  }

  await update({ status: "timeout", polls });
  return null;
}

/**
 * Percentage of attempts submitted in the last `hours` hours that ended with
 * status "solved".
 *
 * @param {number} [hours=24] Size of the look-back window in hours.
 * @returns {Promise<string|number>} A one-decimal string produced by
 *   toFixed(1) when the window contains attempts; the number 0 otherwise.
 */
async function getSuccessRate(hours = 24) {
  const windowStart = new Date(Date.now() - hours * 3600 * 1000);

  // One row per distinct status, each carrying its document count.
  const rows = await solves
    .aggregate([
      { $match: { submitted_at: { $gte: windowStart } } },
      { $group: { _id: "$status", count: { $sum: 1 } } },
    ])
    .toArray();

  let attempts = 0;
  for (const row of rows) {
    attempts += row.count;
  }
  if (!attempts) {
    return 0;
  }

  const solvedRow = rows.find((row) => row._id === "solved");
  const solvedCount = (solvedRow && solvedRow.count) || 0;
  return ((solvedCount / attempts) * 100).toFixed(1);
}

Data Retention

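| Strategy | TTL Index | Use Case |
| --- | --- | --- |
| 30-day retention | expireAfterSeconds: 2592000 | Development/testing |
| 90-day retention | expireAfterSeconds: 7776000 | Production analytics |
| Permanent (with archival) | No TTL index; move older records to cold storage (a capped collection discards its oldest documents, so it does not give permanent retention) | Compliance/audit |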

Troubleshooting

| Issue | Cause | Fix |
| --- | --- | --- |
| Slow aggregation queries | Missing indexes on submitted_at and type | Run setup_indexes() — see index section above |
| Documents growing large | Storing full solutions in every record | Store solution hashes or truncate after use |
| TTL not deleting old records | TTL monitor runs every 60 seconds; large backlogs take time | Wait for background cleanup; check index with db.solves.getIndexes() |
| Connection pool exhaustion | Too many concurrent solve operations | Set maxPoolSize in connection string |

FAQ

Should I store the full CAPTCHA solution token?

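For debugging, keep tokens for 24–48 hours and then remove the solution field (for example with a scheduled $unset); the TTL index in this guide deletes whole documents after 90 days, not individual fields. For long-term analytics, store only metadata (type, time, status, error) — tokens are useless after expiration anyway.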

How much storage does this use?

Each solve record is roughly 500 bytes to 2 KB depending on metadata. At 10,000 solves/day with 90-day retention (about 900,000 documents), expect roughly 0.5–2 GB before indexes. MongoDB handles this easily.

Can I use MongoDB Atlas (cloud)?

Yes. Atlas supports TTL indexes and aggregation pipelines. Use the connection string from your Atlas dashboard in MONGO_URI.

Next Steps

Track every CAPTCHA solve and spot issues before they impact your pipeline — get your CaptchaAI API key.

Related guides:

Full Working Code

Complete runnable examples for this article in Python, Node.js, PHP, Go, Java, C#, Ruby, Rust, Kotlin & Bash.

View on GitHub →

Discussions (0)

No comments yet.