Use Cases

News and Media Aggregation with CAPTCHA Handling

News outlets and media platforms use CAPTCHAs to protect content from automated aggregation. Media monitors, PR firms, and research organizations need to collect articles programmatically. CaptchaAI handles CAPTCHA challenges across news sources.


CAPTCHAs on News Platforms

Source Type CAPTCHA Trigger Content
Major news outlets Cloudflare Turnstile Bot detection Articles, headlines
Wire services (AP, Reuters) reCAPTCHA v2 Bulk access Breaking news
Paywalled publications reCAPTCHA v3 Access attempts Premium articles
Local news sites reCAPTCHA v2 Rate limiting Regional news
News aggregators Cloudflare Challenge Scraping detection Aggregated feeds
Press release sites Image CAPTCHA Download pages PR content

News Aggregator

import requests
import time
import re
from bs4 import BeautifulSoup
from datetime import datetime
import json

CAPTCHAAI_KEY = "YOUR_API_KEY"
CAPTCHAAI_URL = "https://ocr.captchaai.com"


def solve_captcha(method, sitekey, pageurl, **kwargs):
    data = {
        "key": CAPTCHAAI_KEY, "method": method,
        "googlekey": sitekey, "pageurl": pageurl, "json": 1,
    }
    data.update(kwargs)
    resp = requests.post(f"{CAPTCHAAI_URL}/in.php", data=data)
    task_id = resp.json()["request"]
    for _ in range(60):
        time.sleep(5)
        result = requests.get(f"{CAPTCHAAI_URL}/res.php", params={
            "key": CAPTCHAAI_KEY, "action": "get",
            "id": task_id, "json": 1,
        })
        r = result.json()
        if r["request"] != "CAPCHA_NOT_READY":
            return r["request"]
    raise TimeoutError("Timeout")


class NewsAggregator:
    def __init__(self, proxy=None):
        self.session = requests.Session()
        if proxy:
            self.session.proxies = {"http": proxy, "https": proxy}
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 Chrome/126.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
        })

    def collect_headlines(self, source_url, section=None):
        """Collect headlines from a news source."""
        url = f"{source_url}/{section}" if section else source_url
        resp = self.session.get(url, timeout=30)

        if self._has_captcha(resp.text):
            resp = self._solve_and_retry(resp.text, url)

        soup = BeautifulSoup(resp.text, "html.parser")
        articles = []

        for item in soup.select("article, .story, .headline-item, h2 a, h3 a"):
            link = item if item.name == "a" else item.select_one("a")
            if link:
                articles.append({
                    "title": link.get_text(strip=True),
                    "url": self._abs_url(source_url, link.get("href", "")),
                    "source": source_url,
                    "collected_at": datetime.now().isoformat(),
                })

        return articles

    def get_article(self, article_url):
        """Fetch full article content."""
        resp = self.session.get(article_url, timeout=30)

        if self._has_captcha(resp.text):
            resp = self._solve_and_retry(resp.text, article_url)

        soup = BeautifulSoup(resp.text, "html.parser")

        # Remove unwanted elements
        for tag in soup.select("script, style, nav, footer, .ad, .sidebar"):
            tag.decompose()

        content_el = soup.select_one(
            "article, .article-body, .story-body, .entry-content"
        )

        return {
            "url": article_url,
            "title": self._text(soup, "h1, .article-title"),
            "author": self._text(soup, ".author, .byline, [rel='author']"),
            "date": self._text(soup, "time, .publish-date, .article-date"),
            "content": content_el.get_text(separator="\n", strip=True) if content_el else "",
            "word_count": len(content_el.get_text().split()) if content_el else 0,
        }

    def aggregate_sources(self, sources, max_articles_per=20):
        """Aggregate headlines across multiple sources."""
        all_articles = []

        for source in sources:
            try:
                articles = self.collect_headlines(source["url"], source.get("section"))
                all_articles.extend(articles[:max_articles_per])
                print(f"{source['name']}: {len(articles)} headlines")
            except Exception as e:
                print(f"{source['name']}: Error - {e}")
            time.sleep(3)

        return all_articles

    def _has_captcha(self, html):
        return any(tag in html.lower() for tag in [
            'data-sitekey', 'g-recaptcha', 'cf-turnstile',
        ])

    def _solve_and_retry(self, html, url):
        match = re.search(r'data-sitekey="([^"]+)"', html)
        if not match:
            return self.session.get(url)
        sitekey = match.group(1)
        if 'cf-turnstile' in html:
            token = solve_captcha("turnstile", sitekey, url)
            return self.session.post(url, data={"cf-turnstile-response": token})
        token = solve_captcha("userrecaptcha", sitekey, url)
        return self.session.post(url, data={"g-recaptcha-response": token})

    def _text(self, soup, selector):
        el = soup.select_one(selector)
        return el.get_text(strip=True) if el else ""

    def _abs_url(self, base, href):
        if href.startswith("http"):
            return href
        return base.rstrip("/") + "/" + href.lstrip("/")


# Usage
aggregator = NewsAggregator(
    proxy="http://user:pass@residential.proxy.com:5000"
)

sources = [
    {"name": "Tech News A", "url": "https://technews-a.example.com", "section": "latest"},
    {"name": "Business B", "url": "https://business-b.example.com", "section": "tech"},
    {"name": "Industry C", "url": "https://industry-c.example.com"},
]

headlines = aggregator.aggregate_sources(sources)
print(f"Total: {len(headlines)} headlines collected")

Keyword-Based News Monitoring

class NewsMonitor:
    def __init__(self, keywords, sources, proxy=None):
        self.keywords = [kw.lower() for kw in keywords]
        self.aggregator = NewsAggregator(proxy=proxy)
        self.sources = sources
        self.seen_urls = set()

    def scan(self):
        """Scan for articles matching keywords."""
        headlines = self.aggregator.aggregate_sources(self.sources)
        matches = []

        for article in headlines:
            if article["url"] in self.seen_urls:
                continue

            title_lower = article["title"].lower()
            matched_kws = [kw for kw in self.keywords if kw in title_lower]

            if matched_kws:
                article["matched_keywords"] = matched_kws
                matches.append(article)
                self.seen_urls.add(article["url"])

        return matches

    def continuous_monitor(self, interval_min=30):
        """Run continuous monitoring with alerts."""
        while True:
            matches = self.scan()
            if matches:
                print(f"\n=== {len(matches)} new matches found ===")
                for m in matches:
                    print(f"  [{', '.join(m['matched_keywords'])}] {m['title']}")
                    print(f"    {m['url']}")
            else:
                print(f"No new matches at {datetime.now().strftime('%H:%M')}")

            time.sleep(interval_min * 60)


# Monitor for specific topics
monitor = NewsMonitor(
    keywords=["captcha", "bot detection", "web scraping", "automation"],
    sources=sources,
    proxy="http://user:pass@residential.proxy.com:5000",
)
matches = monitor.scan()

Troubleshooting

Issue Cause Fix
Cloudflare blocking all requests Aggressive bot detection Use residential proxy + realistic UA
Paywall instead of article Content behind subscription Detect paywall, skip or handle
CAPTCHA on every page IP flagged Rotate proxy, add 5+ sec delays
Article content empty JS-rendered content Use Selenium/Puppeteer for SPA sites
Duplicate articles Same story from multiple sources Deduplicate by title similarity

FAQ

Collecting headlines and metadata for research or monitoring is common practice. Reproducing full copyrighted articles without permission is not. Use snippets and link back to the original source.

How do I handle paywalled content?

Detect the paywall (look for paywall CSS classes or limited content length) and flag it. Only access content you're authorized to view.

Which proxy type works best for news sites?

Rotating residential proxies work best. Major news outlets use Cloudflare, which blocks datacenter IPs aggressively.



Aggregate news from any source — get your CaptchaAI key and automate content collection.

Full Working Code

Complete runnable examples for this article in Python, Node.js, PHP, Go, Java, C#, Ruby, Rust, Kotlin & Bash.

View on GitHub →

Discussions (0)

No comments yet.