Add remove_consent_popups flag and fix from_kwargs dict deserialization
Add CrawlerRunConfig.remove_consent_popups (bool, default False) that
targets GDPR/cookie consent popups from 70+ known CMP providers including
OneTrust, Cookiebot, TrustArc, Quantcast, Didomi, Usercentrics,
Sourcepoint, Google FundingChoices, and many more.
The JS strategy uses a 5-phase approach:
1. Click "Accept All" buttons (cleanest dismissal, sets cookies)
2. Try CMP JavaScript APIs (__tcfapi, Didomi, Cookiebot, Osano, Klaro)
3. Remove known CMP containers by selector (~120 selectors)
4. Handle iframe-based and shadow DOM CMPs
5. Restore body scroll and remove CMP body classes
Also fix from_kwargs() in CrawlerRunConfig and BrowserConfig to
auto-deserialize dict values using the existing from_serializable_dict()
infrastructure. Previously, strategy objects like markdown_generator
arriving as {"type": "DefaultMarkdownGenerator", "params": {...}} from
JSON APIs were passed through as raw dicts, causing crashes when the
crawler later called methods on them.
This commit is contained in:
@@ -734,6 +734,12 @@ class BrowserConfig:
|
||||
|
||||
@staticmethod
|
||||
def from_kwargs(kwargs: dict) -> "BrowserConfig":
|
||||
# Auto-deserialize any dict values that use the {"type": ..., "params": ...}
|
||||
# serialization format (e.g. from JSON API requests or dump()/load() roundtrips).
|
||||
kwargs = {
|
||||
k: from_serializable_dict(v) if isinstance(v, dict) and "type" in v else v
|
||||
for k, v in kwargs.items()
|
||||
}
|
||||
return BrowserConfig(
|
||||
browser_type=kwargs.get("browser_type", "chromium"),
|
||||
headless=kwargs.get("headless", True),
|
||||
@@ -1282,6 +1288,10 @@ class CrawlerRunConfig():
|
||||
Default: False.
|
||||
remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
|
||||
Default: False.
|
||||
remove_consent_popups (bool): If True, remove GDPR/cookie consent popups (IAB TCF/CMP)
|
||||
before extracting HTML. Targets known CMP providers like
|
||||
OneTrust, Cookiebot, TrustArc, Quantcast, Didomi, etc.
|
||||
Default: False.
|
||||
simulate_user (bool): If True, simulate user interactions (mouse moves, clicks) for anti-bot measures.
|
||||
Default: False.
|
||||
override_navigator (bool): If True, overrides navigator properties for more human-like behavior.
|
||||
@@ -1445,6 +1455,7 @@ class CrawlerRunConfig():
|
||||
max_scroll_steps: Optional[int] = None,
|
||||
process_iframes: bool = False,
|
||||
remove_overlay_elements: bool = False,
|
||||
remove_consent_popups: bool = False,
|
||||
simulate_user: bool = False,
|
||||
override_navigator: bool = False,
|
||||
magic: bool = False,
|
||||
@@ -1573,6 +1584,7 @@ class CrawlerRunConfig():
|
||||
self.max_scroll_steps = max_scroll_steps
|
||||
self.process_iframes = process_iframes
|
||||
self.remove_overlay_elements = remove_overlay_elements
|
||||
self.remove_consent_popups = remove_consent_popups
|
||||
self.simulate_user = simulate_user
|
||||
self.override_navigator = override_navigator
|
||||
self.magic = magic
|
||||
@@ -1798,6 +1810,13 @@ class CrawlerRunConfig():
|
||||
|
||||
@staticmethod
|
||||
def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
|
||||
# Auto-deserialize any dict values that use the {"type": ..., "params": ...}
|
||||
# serialization format (e.g. from JSON API requests or dump()/load() roundtrips).
|
||||
# This covers markdown_generator, extraction_strategy, content_filter, etc.
|
||||
kwargs = {
|
||||
k: from_serializable_dict(v) if isinstance(v, dict) and "type" in v else v
|
||||
for k, v in kwargs.items()
|
||||
}
|
||||
return CrawlerRunConfig(
|
||||
# Content Processing Parameters
|
||||
word_count_threshold=kwargs.get("word_count_threshold", 200),
|
||||
@@ -1854,6 +1873,7 @@ class CrawlerRunConfig():
|
||||
max_scroll_steps=kwargs.get("max_scroll_steps"),
|
||||
process_iframes=kwargs.get("process_iframes", False),
|
||||
remove_overlay_elements=kwargs.get("remove_overlay_elements", False),
|
||||
remove_consent_popups=kwargs.get("remove_consent_popups", False),
|
||||
simulate_user=kwargs.get("simulate_user", False),
|
||||
override_navigator=kwargs.get("override_navigator", False),
|
||||
magic=kwargs.get("magic", False),
|
||||
@@ -1978,6 +1998,7 @@ class CrawlerRunConfig():
|
||||
"max_scroll_steps": self.max_scroll_steps,
|
||||
"process_iframes": self.process_iframes,
|
||||
"remove_overlay_elements": self.remove_overlay_elements,
|
||||
"remove_consent_popups": self.remove_consent_popups,
|
||||
"simulate_user": self.simulate_user,
|
||||
"override_navigator": self.override_navigator,
|
||||
"magic": self.magic,
|
||||
|
||||
@@ -465,6 +465,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
config.wait_for or
|
||||
config.scan_full_page or
|
||||
config.remove_overlay_elements or
|
||||
config.remove_consent_popups or
|
||||
config.simulate_user or
|
||||
config.magic or
|
||||
config.process_iframes or
|
||||
@@ -972,6 +973,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if config.delay_before_return_html:
|
||||
await asyncio.sleep(config.delay_before_return_html)
|
||||
|
||||
# Handle CMP/consent popup removal (before generic overlay removal)
|
||||
if config.remove_consent_popups:
|
||||
await self.remove_consent_popups(page)
|
||||
|
||||
# Handle overlay removal
|
||||
if config.remove_overlay_elements:
|
||||
await self.remove_overlay_elements(page)
|
||||
@@ -1447,6 +1452,50 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
params={"error": str(e)},
|
||||
)
|
||||
|
||||
async def remove_consent_popups(self, page: Page) -> None:
    """
    Removes GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot,
    TrustArc, Quantcast, Didomi, Usercentrics, Sourcepoint, Klaro, Osano, Iubenda,
    Complianz, CookieYes, ConsentManager, LiveRamp/Fides, etc.).

    Strategy (implemented by the "remove_consent_popups" JS snippet):
        1. Try clicking "Accept All" buttons (cleanest dismissal, sets cookies)
        2. Try IAB TCF / CMP JavaScript APIs
        3. Remove known CMP containers by selector
        4. Handle iframe-based CMPs
        5. Restore body scroll

    Failures while evaluating the snippet are logged as warnings and are not
    raised to the caller. This covers both exceptions raised by the evaluate
    call itself and errors reported by the in-page wrapper.

    Args:
        page (Page): The Playwright page instance
    """
    remove_consent_js = load_js_script("remove_consent_popups")

    try:
        result = await self.adapter.evaluate(page,
            f"""
            (async () => {{
                try {{
                    const removeConsent = {remove_consent_js};
                    await removeConsent();
                    return {{ success: true }};
                }} catch (error) {{
                    return {{
                        success: false,
                        error: error.toString(),
                        stack: error.stack
                    }};
                }}
            }})()
        """
        )
        # The wrapper above catches in-page exceptions and reports them as a
        # result object instead of rejecting, so the failure has to be surfaced
        # here or it would be lost. The isinstance guard keeps this a no-op if
        # the adapter does not hand back the script's return value.
        if isinstance(result, dict) and not result.get("success", True):
            self.logger.warning(
                message="Failed to remove consent popups: {error}",
                tag="SCRAPE",
                params={"error": str(result.get("error", "unknown error"))},
            )
        await page.wait_for_timeout(500)  # Wait for any animations to complete
    except Exception as e:
        self.logger.warning(
            message="Failed to remove consent popups: {error}",
            tag="SCRAPE",
            params={"error": str(e)},
        )
|
||||
|
||||
async def export_pdf(self, page: Page) -> bytes:
|
||||
"""
|
||||
Exports the current page as a PDF.
|
||||
|
||||
710
crawl4ai/js_snippet/remove_consent_popups.js
Normal file
710
crawl4ai/js_snippet/remove_consent_popups.js
Normal file
@@ -0,0 +1,710 @@
|
||||
async () => {
|
||||
// Helper: check if element is rendered and visible.
// Returns false for null/undefined, for elements that produce no layout boxes,
// and for elements whose own computed style hides them.
const isVisible = (elem) => {
    if (!elem) return false;
    // `display` is not inherited, so a button inside a display:none banner still
    // reports its own display value. Such an element generates no boxes, which
    // getClientRects() exposes as an empty list.
    if (elem.getClientRects().length === 0) return false;
    const style = window.getComputedStyle(elem);
    return style.display !== "none" && style.visibility !== "hidden" && style.opacity !== "0";
};
|
||||
|
||||
// =========================================================================
|
||||
// Phase 1: Click "Accept All" buttons (fastest, cleanest dismissal)
|
||||
// =========================================================================
|
||||
|
||||
// CMP-specific accept button selectors (ordered by market share)
|
||||
const cmpAcceptSelectors = [
|
||||
// OneTrust / CookiePro
|
||||
'#onetrust-accept-btn-handler',
|
||||
'#accept-recommended-btn-handler',
|
||||
// Cookiebot (Usercentrics/Cybot)
|
||||
'#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll',
|
||||
'#CybotCookiebotDialogBodyButtonAccept',
|
||||
'#CybotCookiebotDialogBodyLevelButtonAccept',
|
||||
// Didomi
|
||||
'#didomi-notice-agree-button',
|
||||
'.didomi-button-highlight',
|
||||
// Quantcast Choice
|
||||
'.qc-cmp2-summary-buttons button[mode="primary"]',
|
||||
// Sourcepoint
|
||||
'.sp_choice_type_11',
|
||||
'.sp_choice_type_ACCEPT_ALL',
|
||||
// Google FundingChoices / FC
|
||||
'.fc-button.fc-cta-consent.fc-primary-button',
|
||||
'.fc-cta-consent',
|
||||
'.fc-confirm-choices',
|
||||
// TrustArc
|
||||
'#truste-consent-button',
|
||||
'.truste_popframe .pdynamicbutton .call',
|
||||
// ConsentManager.net
|
||||
'.cmpboxbtnyes',
|
||||
'#cmpwelcomebtnyes',
|
||||
'.cmpboxbtnyescustomchoices',
|
||||
// Osano
|
||||
'.osano-cm-accept-all',
|
||||
'.osano-cm-accept',
|
||||
// Iubenda
|
||||
'#iubenda-cs-accept-btn',
|
||||
'.iubenda-cs-accept-btn',
|
||||
// Complianz (WordPress)
|
||||
'.cmplz-btn.cmplz-accept',
|
||||
// LiveRamp / Fides (Ethyca)
|
||||
'.fides-accept-all-button',
|
||||
'#fides-banner .fides-accept-all-button',
|
||||
// CookieYes
|
||||
'.cky-btn-accept',
|
||||
'[data-cky-tag="accept-button"]',
|
||||
// Klaro
|
||||
'.klaro .cm-btn-accept-all',
|
||||
'.klaro .cm-btn-success',
|
||||
'.klaro .cookie-notice .cm-btn-success',
|
||||
// Termly
|
||||
'[data-tid="banner-accept"]',
|
||||
// CookieFirst
|
||||
'button[data-cookiefirst-action="accept"]',
|
||||
// CookieScript
|
||||
'#cookiescript_accept',
|
||||
// Borlabs Cookie (WordPress)
|
||||
'a[data-cookie-accept-all]',
|
||||
'.brlbs-btn-accept-all',
|
||||
// Civic Cookie Control
|
||||
'#ccc-recommended-settings',
|
||||
'#ccc-notify-accept',
|
||||
'.ccc-accept-button',
|
||||
// Cookie Information
|
||||
'.coi-banner__accept',
|
||||
// Evidon / Crownpeak
|
||||
'#_evidon-accept-button',
|
||||
// Axeptio
|
||||
'button#axeptio_btn_acceptAll',
|
||||
// HubSpot
|
||||
'#hs-eu-confirmation-button',
|
||||
// Ketch
|
||||
'#lanyard_root button[class*="confirmButton"]',
|
||||
// Moove GDPR (WordPress)
|
||||
'.moove-gdpr-infobar-allow-all',
|
||||
// TermsFeed
|
||||
'.cc-nb-okagree',
|
||||
// tarteaucitron.js
|
||||
'#tarteaucitronPersonalize2',
|
||||
'.tarteaucitronAllow',
|
||||
// CookieHub
|
||||
'.ch2-allow-all-btn',
|
||||
// Cookie Notice (WP plugin)
|
||||
'#cn-accept-cookie',
|
||||
// EU Cookie Compliance (Drupal)
|
||||
'.eu-cookie-compliance-banner .agree-button',
|
||||
'.eu-cookie-compliance-banner .accept-all',
|
||||
// WordPress GDPR Cookie Consent
|
||||
'#gdpr-cookie-consent-bar #cookie_action_accept',
|
||||
// Cookie Law Info / WebToffee
|
||||
'[data-cli_action="accept"]',
|
||||
// Shopify Native
|
||||
'#shopify-pc__banner__btn-accept',
|
||||
// Wix Native
|
||||
'[data-hook="ccsu-banner-accept"]',
|
||||
// Finsweet (Webflow)
|
||||
'[fs-consent-element="allow"]',
|
||||
'[fs-cc="banner"] [fs-cc="allow"]',
|
||||
// Pandectes (Shopify)
|
||||
'#pandectes-banner .cc-allow',
|
||||
// Clickio
|
||||
'#cl-consent [data-role="b_agree"]',
|
||||
// Snigel
|
||||
'.snigel-cmp-framework #accept-choices',
|
||||
// Cassie
|
||||
'.cassie-accept-all',
|
||||
// FastCMP
|
||||
'.fast-cmp-home-accept button',
|
||||
// Sibbo
|
||||
'#acceptAllMain',
|
||||
// PubTech
|
||||
'#pt-accept-all',
|
||||
// UniConsent
|
||||
'#unic-agree',
|
||||
// Ezoic
|
||||
'#ez-accept-all',
|
||||
// Transcend
|
||||
'#transcend-consent-manager .inner-container button',
|
||||
// Cloudflare Zaraz
|
||||
'#cf_consent-buttons__accept-all',
|
||||
// CookieConsent v2 (Insites/Osano OSS)
|
||||
'#s-all-bn',
|
||||
// CookieConsent v3 (orestbida)
|
||||
'.cm__btn[data-role="all"]',
|
||||
// Openli / Legal Monster
|
||||
'#lm-accept-all',
|
||||
// UK Cookie Consent (Catapult)
|
||||
'#catapultCookie',
|
||||
// Mediavine
|
||||
'[data-name="mediavine-gdpr-cmp"] [format="primary"]',
|
||||
// Consentmo (Shopify)
|
||||
'.isense-cc-allow',
|
||||
// AdOpt
|
||||
'#adopt-accept-all-button',
|
||||
// Truyo
|
||||
'button#acceptAllCookieButton',
|
||||
// KConsent
|
||||
'#kc-acceptAndHide',
|
||||
// Gravito
|
||||
'#modalConfirmBtn.gravitoCMP-button',
|
||||
// CT Ultimate GDPR (WordPress)
|
||||
'#ct-ultimate-gdpr-cookie-accept',
|
||||
// Hu-manity
|
||||
'[data-hu-action="cookies-notice-consent-choices-3"]',
|
||||
// GDPR Legal Cookie (Shopify)
|
||||
'.overlay_bc_banner *[data-cookie-accept-all]',
|
||||
// Bing / Microsoft
|
||||
'#bnp_btn_accept',
|
||||
// Privado
|
||||
'#cookie-consent-banner #accept-button',
|
||||
// Cookie Alert
|
||||
'button[data-controller="cookie-alert/extended/button/accept"]',
|
||||
// iWink / STARTER
|
||||
'body.cookies-request #cookie-bar .allow-cookies',
|
||||
// Real Cookie Banner (devowl.io)
|
||||
'.rcb-banner-cta-accept-all',
|
||||
];
|
||||
|
||||
// Generic accept button selectors (attribute-based)
|
||||
const genericAcceptSelectors = [
|
||||
'button[id*="accept" i]',
|
||||
'button[class*="accept-all" i]',
|
||||
'button[class*="acceptAll" i]',
|
||||
'a[id*="accept" i]',
|
||||
'button[id*="agree" i]',
|
||||
'button[class*="agree" i]',
|
||||
'button[class*="allow-all" i]',
|
||||
'button[class*="allowAll" i]',
|
||||
'button[data-action="accept"]',
|
||||
'button[data-action="accept-all"]',
|
||||
'button[data-gdpr="accept"]',
|
||||
'button[data-consent="accept"]',
|
||||
];
|
||||
|
||||
// Click the first visible element matched by the given selectors.
// Selectors are tried in order; resolves to true as soon as one element has
// been clicked, or false when no selector yields a visible element.
const clickButton = async (selectors) => {
    for (const selector of selectors) {
        try {
            // A selector can match several elements (e.g. a hidden copy ahead of
            // the rendered button), so look through every match rather than
            // giving up when the first one is not visible.
            const matches = Array.from(document.querySelectorAll(selector));
            const btn = matches.find((el) => isVisible(el));
            if (btn) {
                btn.click();
                // Short pause so the page can react to the click before continuing
                await new Promise(r => setTimeout(r, 300));
                return true;
            }
        } catch (e) { /* continue with the next selector */ }
    }
    return false;
};
|
||||
|
||||
let accepted = await clickButton(cmpAcceptSelectors);
|
||||
if (!accepted) accepted = await clickButton(genericAcceptSelectors);
|
||||
|
||||
// Text-content fallback: find buttons by visible text
|
||||
if (!accepted) {
|
||||
const acceptPatterns = [
|
||||
/^accept\s*(all)?(\s*cookies)?$/i,
|
||||
/^allow\s*(all)?(\s*cookies)?$/i,
|
||||
/^i\s*agree$/i,
|
||||
/^agree(\s*(and|&)\s*(close|continue))?$/i,
|
||||
/^got\s*it[!]?$/i,
|
||||
/^consent$/i,
|
||||
/^(accept|agree)\s*&?\s*close$/i,
|
||||
];
|
||||
|
||||
const candidates = document.querySelectorAll(
|
||||
'button, a[role="button"], [role="button"], input[type="submit"], input[type="button"]'
|
||||
);
|
||||
for (const btn of candidates) {
|
||||
const text = (btn.textContent || btn.value || '').trim();
|
||||
if (text.length > 0 && text.length < 40) {
|
||||
for (const pattern of acceptPatterns) {
|
||||
if (pattern.test(text) && isVisible(btn)) {
|
||||
btn.click();
|
||||
accepted = true;
|
||||
await new Promise(r => setTimeout(r, 300));
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (accepted) break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Shadow DOM: Usercentrics, Axeptio
|
||||
if (!accepted) {
|
||||
const shadowRoots = [
|
||||
{ id: 'usercentrics-root', btn: 'button[data-testid="uc-accept-all-button"]' },
|
||||
{ cls: 'axeptio_mount', btn: 'button#axeptio_btn_acceptAll' },
|
||||
];
|
||||
for (const cfg of shadowRoots) {
|
||||
try {
|
||||
const host = cfg.id
|
||||
? document.getElementById(cfg.id)
|
||||
: document.querySelector('.' + cfg.cls);
|
||||
if (host && host.shadowRoot) {
|
||||
const btn = host.shadowRoot.querySelector(cfg.btn);
|
||||
if (btn) {
|
||||
btn.click();
|
||||
accepted = true;
|
||||
await new Promise(r => setTimeout(r, 300));
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (e) { /* continue */ }
|
||||
}
|
||||
}
|
||||
|
||||
// Iframe-based CMPs (Sourcepoint, FastCMP, TrustArc, Privacy Manager)
|
||||
if (!accepted) {
|
||||
const iframeSelectors = [
|
||||
'iframe[id^="sp_message_iframe"]',
|
||||
'iframe#fast-cmp-iframe',
|
||||
'iframe[id*="consent" i]',
|
||||
'iframe[title*="consent" i]',
|
||||
'iframe[title*="cookie" i]',
|
||||
'iframe[title*="privacy" i]',
|
||||
'iframe[src*="privacymanager" i]',
|
||||
'iframe[src*="consent-tool" i]',
|
||||
];
|
||||
for (const sel of iframeSelectors) {
|
||||
try {
|
||||
const iframe = document.querySelector(sel);
|
||||
if (iframe && iframe.contentDocument) {
|
||||
const iframeDoc = iframe.contentDocument;
|
||||
const btns = iframeDoc.querySelectorAll(
|
||||
'button[title="Accept All" i], button[title="Accept" i], ' +
|
||||
'.sp_choice_type_11, button.message-button, ' +
|
||||
'button[class*="accept" i], button[class*="agree" i]'
|
||||
);
|
||||
for (const btn of btns) {
|
||||
if (btn.offsetParent !== null) {
|
||||
btn.click();
|
||||
accepted = true;
|
||||
await new Promise(r => setTimeout(r, 300));
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (accepted) break;
|
||||
}
|
||||
} catch (e) { /* cross-origin iframes will throw SecurityError, expected */ }
|
||||
}
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Phase 2: Try CMP JavaScript APIs
|
||||
// =========================================================================
|
||||
|
||||
// IAB TCF v2 API
|
||||
if (typeof window.__tcfapi === 'function') {
|
||||
try {
|
||||
window.__tcfapi('addEventListener', 2, () => {});
|
||||
} catch (e) { /* continue */ }
|
||||
}
|
||||
|
||||
// Didomi API
|
||||
if (typeof window.Didomi !== 'undefined') {
|
||||
try {
|
||||
window.Didomi.setUserAgreeToAll();
|
||||
} catch (e) { /* continue */ }
|
||||
}
|
||||
|
||||
// Cookiebot API
|
||||
if (typeof window.Cookiebot !== 'undefined') {
|
||||
try {
|
||||
window.Cookiebot.submitCustomConsent(true, true, true);
|
||||
} catch (e) { /* continue */ }
|
||||
}
|
||||
|
||||
// Osano API
|
||||
if (typeof window.Osano !== 'undefined') {
|
||||
try {
|
||||
window.Osano.cm.acceptAll();
|
||||
} catch (e) { /* continue */ }
|
||||
}
|
||||
|
||||
// Klaro API
|
||||
if (typeof window.klaro !== 'undefined') {
|
||||
try {
|
||||
window.klaro.getManager().acceptAll();
|
||||
} catch (e) { /* continue */ }
|
||||
}
|
||||
|
||||
// Wait for CMP animations/transitions
|
||||
await new Promise(r => setTimeout(r, 500));
|
||||
|
||||
// =========================================================================
|
||||
// Phase 3: Remove known CMP containers by selector
|
||||
// =========================================================================
|
||||
const cmpContainerSelectors = [
|
||||
// --- Tier 1: Enterprise CMPs ---
|
||||
|
||||
// OneTrust / CookiePro
|
||||
'#onetrust-consent-sdk',
|
||||
'#onetrust-banner-sdk',
|
||||
'.onetrust-pc-dark-filter',
|
||||
'#onetrust-pc-sdk',
|
||||
|
||||
// Cookiebot (Usercentrics/Cybot)
|
||||
'#CybotCookiebotDialog',
|
||||
'#CybotCookiebotDialogBodyUnderlay',
|
||||
'#dtcookie-container',
|
||||
'#cookiebanner',
|
||||
|
||||
// TrustArc
|
||||
'#truste-consent-track',
|
||||
'.truste_overlay',
|
||||
'.truste_box_overlay',
|
||||
'#truste-consent-content',
|
||||
'#consent_blackbar',
|
||||
'.trustarc-banner-container',
|
||||
|
||||
// Quantcast Choice
|
||||
'.qc-cmp2-container',
|
||||
'#qc-cmp2-main',
|
||||
'.qc-cmp-cleanslate',
|
||||
|
||||
// Didomi
|
||||
'#didomi-host',
|
||||
'#didomi-popup',
|
||||
'#didomi-notice',
|
||||
|
||||
// Usercentrics
|
||||
'#usercentrics-root',
|
||||
'#usercentrics-cmp-ui',
|
||||
|
||||
// Sourcepoint
|
||||
'div[id^="sp_message_container"]',
|
||||
'.sp_message_container',
|
||||
|
||||
// Google FundingChoices / FC
|
||||
'.fc-consent-root',
|
||||
'.fc-dialog-overlay',
|
||||
'.fc-dialog-container',
|
||||
|
||||
// --- Tier 2: Mid-Market CMPs ---
|
||||
|
||||
// Klaro
|
||||
'.klaro',
|
||||
|
||||
// Osano
|
||||
'.osano-cm-window',
|
||||
'.osano-cm-dialog',
|
||||
|
||||
// Iubenda
|
||||
'#iubenda-cs-banner',
|
||||
|
||||
// Complianz (WordPress)
|
||||
'.cmplz-cookiebanner',
|
||||
'#cmplz-cookiebanner-container',
|
||||
|
||||
// CookieYes
|
||||
'.cky-consent-container',
|
||||
'.cky-overlay',
|
||||
|
||||
// ConsentManager.net
|
||||
'.cmpbox',
|
||||
'#cmpbox',
|
||||
'#cmpbox2',
|
||||
'#cmpwrapper',
|
||||
|
||||
// LiveRamp / Fides (Ethyca)
|
||||
'.fides-overlay',
|
||||
'#fides-banner',
|
||||
'#fides-overlay',
|
||||
'#fides-overlay-wrapper',
|
||||
|
||||
// Termly
|
||||
'#termly-code-snippet-support',
|
||||
|
||||
// CookieFirst
|
||||
'#cookiefirst-root',
|
||||
'.cookiefirst-root',
|
||||
|
||||
// CookieScript
|
||||
'#cookiescript_injected',
|
||||
'.cookiescript_fsd_main',
|
||||
|
||||
// Borlabs Cookie (WordPress)
|
||||
'#BorlabsCookieBox',
|
||||
'._brlbs-bar-wrap',
|
||||
'._brlbs-box-wrap',
|
||||
|
||||
// Civic Cookie Control
|
||||
'#ccc',
|
||||
'#ccc-module',
|
||||
'#ccc-overlay',
|
||||
|
||||
// Cookie Information
|
||||
'#cookie-information-template-wrapper',
|
||||
'#coiOverlay',
|
||||
|
||||
// Evidon / Crownpeak
|
||||
'#_evidon_banner',
|
||||
'#_evidon-background',
|
||||
'#evidon-prefdiag-overlay',
|
||||
|
||||
// Axeptio
|
||||
'.axeptio_widget',
|
||||
'.axeptio_mount',
|
||||
|
||||
// HubSpot
|
||||
'#hs-eu-cookie-confirmation',
|
||||
|
||||
// Ketch
|
||||
'#lanyard_root',
|
||||
|
||||
// --- Tier 3: Regional / WordPress / Specialized CMPs ---
|
||||
|
||||
// tarteaucitron.js
|
||||
'#tarteaucitronRoot',
|
||||
'#tarteaucitronAlertBig',
|
||||
|
||||
// CookieHub
|
||||
'.ch2-container',
|
||||
'.ch2-dialog',
|
||||
|
||||
// Moove GDPR (WordPress)
|
||||
'#moove_gdpr_cookie_info_bar',
|
||||
'#moove_gdpr_cookie_modal',
|
||||
'.gdpr_cookie_settings_popup_overlay',
|
||||
|
||||
// TermsFeed
|
||||
'.termsfeed-com---nb',
|
||||
|
||||
// Cookie Notice (WP plugin)
|
||||
'#cookie-notice',
|
||||
|
||||
// Cookie Law Info / WebToffee
|
||||
'#cookie-law-info-bar',
|
||||
'#cookie-law-bg',
|
||||
'.cli-popupbar-overlay',
|
||||
|
||||
// EU Cookie Compliance (Drupal)
|
||||
'.eu-cookie-compliance-banner',
|
||||
|
||||
// WordPress GDPR Cookie Consent
|
||||
'#gdpr-cookie-consent-bar',
|
||||
|
||||
// Shopify Native
|
||||
'#shopify-pc__banner',
|
||||
|
||||
// Wix Native
|
||||
'[data-comp-type="cookie-banner-root-wix"]',
|
||||
'[data-hook="ccsu-banner-wrapper"]',
|
||||
|
||||
// Finsweet (Webflow)
|
||||
'[fs-consent-element="banner"]',
|
||||
'.fs-cc-components',
|
||||
|
||||
// Pandectes (Shopify)
|
||||
'#pandectes-banner',
|
||||
|
||||
// Clickio
|
||||
'#cl-consent',
|
||||
|
||||
// Snigel
|
||||
'.snigel-cmp-framework',
|
||||
|
||||
// Cassie
|
||||
'.cassie-cookie-module',
|
||||
'.cassie-pre-banner',
|
||||
|
||||
// FastCMP
|
||||
'#fast-cmp-root',
|
||||
|
||||
// Sibbo
|
||||
'sibbo-cmp-layout',
|
||||
|
||||
// PubTech
|
||||
'#pubtech-cmp',
|
||||
|
||||
// UniConsent
|
||||
'.unic',
|
||||
|
||||
// Ezoic
|
||||
'#ez-cookie-dialog-wrapper',
|
||||
|
||||
// Transcend
|
||||
'#transcend-consent-manager',
|
||||
|
||||
// Cloudflare Zaraz
|
||||
'.cf_modal_container',
|
||||
|
||||
// CookieConsent v2 (Insites/Osano OSS)
|
||||
'#cc--main',
|
||||
|
||||
// CookieConsent v3 (orestbida)
|
||||
'#cc-main',
|
||||
|
||||
// Openli / Legal Monster
|
||||
'.legalmonster-cleanslate',
|
||||
|
||||
// UK Cookie Consent (Catapult)
|
||||
'#catapult-cookie-bar',
|
||||
|
||||
// Sirdata
|
||||
'#sd-cmp',
|
||||
|
||||
// Mediavine
|
||||
'[data-name="mediavine-gdpr-cmp"]',
|
||||
|
||||
// Consentmo (Shopify)
|
||||
'.isense-cc-window',
|
||||
|
||||
// AdOpt
|
||||
'#cookie-banner',
|
||||
|
||||
// Truyo
|
||||
'#truyo-consent-module',
|
||||
|
||||
// KConsent
|
||||
'#kconsent',
|
||||
'.kc-overlay',
|
||||
|
||||
// Gravito
|
||||
'.gravitoCMP-background-overlay',
|
||||
|
||||
// CT Ultimate GDPR (WordPress)
|
||||
'#ct-ultimate-gdpr-cookie-popup',
|
||||
|
||||
// Hu-manity
|
||||
'#hu.hu-wrapper',
|
||||
|
||||
// GDPR Legal Cookie (Shopify)
|
||||
'.overlay_bc_banner',
|
||||
|
||||
// Piwik PRO
|
||||
'.PiwikPROConsentForm-container',
|
||||
|
||||
// Tealium
|
||||
'#__tealiumGDPRecModal',
|
||||
'#__tealiumImplicitmodal',
|
||||
'#consent-layer',
|
||||
|
||||
// PMC (Penske Media)
|
||||
'#pmc-pp-tou--notice',
|
||||
|
||||
// Privado
|
||||
'#cookie-consent-banner',
|
||||
|
||||
// Real Cookie Banner (devowl.io)
|
||||
'.rcb-banner',
|
||||
|
||||
// Bing / Microsoft
|
||||
'#bnp_container',
|
||||
'#bnp_cookie_banner',
|
||||
|
||||
// LinkedIn
|
||||
'.artdeco-global-alert[type="COOKIE_CONSENT"]',
|
||||
|
||||
// Privacy Manager
|
||||
'#gdpr-consent-tool-wrapper',
|
||||
|
||||
// --- Generic patterns (catch-all) ---
|
||||
'[class*="cookie-consent" i]',
|
||||
'[id*="cookie-consent" i]',
|
||||
'[class*="cookie-banner" i]',
|
||||
'[id*="cookie-banner" i]',
|
||||
'[class*="consent-banner" i]',
|
||||
'[id*="consent-banner" i]',
|
||||
'[class*="consent-popup" i]',
|
||||
'[id*="consent-popup" i]',
|
||||
'[class*="gdpr-banner" i]',
|
||||
'[id*="gdpr-banner" i]',
|
||||
'[class*="cookie-notice" i]',
|
||||
'[id*="cookie-notice" i]',
|
||||
'[class*="cookie-law" i]',
|
||||
'[id*="cookie-law" i]',
|
||||
'[class*="cookie-popup" i]',
|
||||
'[id*="cookie-popup" i]',
|
||||
'[class*="cookie-overlay" i]',
|
||||
'[id*="cookie-overlay" i]',
|
||||
'.cc-banner',
|
||||
'.cc-window',
|
||||
];
|
||||
|
||||
for (const selector of cmpContainerSelectors) {
|
||||
try {
|
||||
const elements = document.querySelectorAll(selector);
|
||||
elements.forEach(el => el.remove());
|
||||
} catch (e) { /* continue */ }
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Phase 4: Remove CMP iframes
|
||||
// =========================================================================
|
||||
const cmpIframeSelectors = [
|
||||
'iframe[id^="sp_message_iframe"]',
|
||||
'iframe#fast-cmp-iframe',
|
||||
'iframe[src*="consent" i]',
|
||||
'iframe[src*="cookie-cdn" i]',
|
||||
'iframe[src*="cookiebot" i]',
|
||||
'iframe[src*="trustarc" i]',
|
||||
'iframe[src*="consentmanager" i]',
|
||||
'iframe[src*="privacymanager" i]',
|
||||
'iframe[src*="cmp-consent-tool" i]',
|
||||
'iframe[title*="consent" i]',
|
||||
'iframe[title*="cookie" i]',
|
||||
'iframe[title*="gdpr" i]',
|
||||
'iframe[name="__tcfapiLocator"]',
|
||||
];
|
||||
|
||||
// Remove every iframe matched by cmpIframeSelectors. When the iframe sits in a
// small positioned / high z-index wrapper, the wrapper is removed instead so no
// empty overlay is left behind.
for (const selector of cmpIframeSelectors) {
    try {
        const iframes = document.querySelectorAll(selector);
        iframes.forEach(iframe => {
            // Also remove parent if it's a CMP wrapper (fixed/high-z)
            const parent = iframe.parentElement;
            // <body>/<html> must never be treated as a wrapper: they can have few
            // children and carry a position of their own, and removing either
            // would delete the whole page instead of just the popup.
            const isPageRoot = parent === document.body || parent === document.documentElement;
            if (parent && !isPageRoot && parent.children.length <= 2) {
                const style = window.getComputedStyle(parent);
                // parseInt yields NaN for "auto", which fails the comparison as intended
                if (style.position === 'fixed' || style.position === 'absolute' ||
                    parseInt(style.zIndex, 10) > 999) {
                    parent.remove();
                    return;
                }
            }
            iframe.remove();
        });
    } catch (e) { /* continue */ }
}
|
||||
|
||||
// =========================================================================
|
||||
// Phase 5: Restore body scroll and clean up CMP artifacts
|
||||
// =========================================================================
|
||||
|
||||
// Reset overflow on body and html
|
||||
document.body.style.overflow = '';
|
||||
document.body.style.overflowY = '';
|
||||
document.body.style.position = '';
|
||||
document.body.style.marginRight = '';
|
||||
document.body.style.paddingRight = '';
|
||||
document.documentElement.style.overflow = '';
|
||||
document.documentElement.style.overflowY = '';
|
||||
document.documentElement.style.position = '';
|
||||
|
||||
// Remove known CMP body classes
|
||||
const cmpBodyClasses = [
|
||||
'ot-overflow-hidden',
|
||||
'sp_message_open',
|
||||
'didomi-popup-open',
|
||||
'cmpbox-show',
|
||||
'cmplz-blocked',
|
||||
'qc-cmp2-no-scroll',
|
||||
'osano-cm-show',
|
||||
'cky-modal-open',
|
||||
'fides-overlay-modal-open',
|
||||
'cc-no-scroll',
|
||||
'gdpr-cookie-notice-center-loaded',
|
||||
'fc-consent-root-open',
|
||||
'cookie-notification-active',
|
||||
'consent-bar-push-large',
|
||||
'with-eu-cookie-guideline',
|
||||
'cookies-request',
|
||||
'eu-cookie-compliance-popup-open',
|
||||
'has-cookie-bar',
|
||||
];
|
||||
|
||||
for (const cls of cmpBodyClasses) {
|
||||
document.body.classList.remove(cls);
|
||||
document.documentElement.classList.remove(cls);
|
||||
}
|
||||
};
|
||||
@@ -159,6 +159,7 @@ Use these for controlling whether you read or write from a local content cache.
|
||||
| **`max_scroll_steps`** | `int or None` (None) | Maximum number of scroll steps during full page scan. If None, scrolls until entire page is loaded. |
|
||||
| **`process_iframes`** | `bool` (False) | Inlines iframe content for single-page extraction. |
|
||||
| **`remove_overlay_elements`** | `bool` (False) | Removes potential modals/popups blocking the main content. |
|
||||
| **`remove_consent_popups`** | `bool` (False) | Removes GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, TrustArc, Quantcast, Didomi, Sourcepoint, FundingChoices, etc.). Tries clicking "Accept All" first, then calls known CMP JavaScript APIs, and finally removes any remaining CMP containers and iframes from the DOM. |
|
||||
| **`simulate_user`** | `bool` (False) | Simulate user interactions (mouse movements) to avoid bot detection. |
|
||||
| **`override_navigator`** | `bool` (False) | Override `navigator` properties in JS for stealth. |
|
||||
| **`magic`** | `bool` (False) | Automatic handling of popups/consent banners. Experimental. |
|
||||
|
||||
@@ -1787,6 +1787,7 @@ run_cfg = CrawlerRunConfig(
|
||||
| **`scroll_delay`** | `float` (0.2) | Delay between scroll steps if `scan_full_page=True`. |
|
||||
| **`process_iframes`** | `bool` (False) | Inlines iframe content for single-page extraction. |
|
||||
| **`remove_overlay_elements`** | `bool` (False) | Removes potential modals/popups blocking the main content. |
|
||||
| **`remove_consent_popups`** | `bool` (False) | Removes GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, TrustArc, Quantcast, Didomi, Sourcepoint, FundingChoices, etc.). Tries clicking "Accept All" first, then calls known CMP JavaScript APIs, and finally removes any remaining CMP containers and iframes from the DOM. |
|
||||
| **`simulate_user`** | `bool` (False) | Simulate user interactions (mouse movements) to avoid bot detection. |
|
||||
| **`override_navigator`** | `bool` (False) | Override `navigator` properties in JS for stealth. |
|
||||
| **`magic`** | `bool` (False) | Automatic handling of popups/consent banners. Experimental. |
|
||||
@@ -3344,8 +3345,9 @@ Below are the key interaction-related parameters in `CrawlerRunConfig`. For a fu
|
||||
- **`wait_for`**: CSS (`"css:..."`) or JS (`"js:..."`) expression to wait for.
|
||||
- **`session_id`**: Reuse the same page across calls.
|
||||
- **`cache_mode`**: Whether to read/write from the cache or bypass.
|
||||
- **`remove_overlay_elements`**: Remove certain popups automatically.
|
||||
- **`simulate_user`, `override_navigator`, `magic`**: Anti-bot or “human-like” interactions.
|
||||
- **`remove_overlay_elements`**: Remove certain popups automatically.
|
||||
- **`remove_consent_popups`**: Remove GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, Didomi, etc.).
|
||||
- **`simulate_user`, `override_navigator`, `magic`**: Anti-bot or "human-like" interactions.
|
||||
## 8. Conclusion
|
||||
1. **Execute JavaScript** for scrolling, clicks, or form filling.
|
||||
2. **Wait** for CSS or custom JS conditions before capturing data.
|
||||
|
||||
@@ -153,8 +153,10 @@ Some sites embed content in `<iframe>` tags. If you want that inline:
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
# Merge iframe content into the final output
|
||||
process_iframes=True,
|
||||
remove_overlay_elements=True
|
||||
process_iframes=True,
|
||||
remove_overlay_elements=True,
|
||||
# Remove GDPR/cookie consent popups (OneTrust, Cookiebot, etc.)
|
||||
remove_consent_popups=True
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
@@ -326,8 +326,9 @@ Below are the key interaction-related parameters in `CrawlerRunConfig`. For a fu
|
||||
- **`wait_for`**: CSS (`"css:..."`) or JS (`"js:..."`) expression to wait for.
|
||||
- **`session_id`**: Reuse the same page across calls.
|
||||
- **`cache_mode`**: Whether to read/write from the cache or bypass.
|
||||
- **`remove_overlay_elements`**: Remove certain popups automatically.
|
||||
- **`simulate_user`, `override_navigator`, `magic`**: Anti-bot or “human-like” interactions.
|
||||
- **`remove_overlay_elements`**: Remove certain popups automatically.
|
||||
- **`remove_consent_popups`**: Remove GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, Didomi, etc.).
|
||||
- **`simulate_user`, `override_navigator`, `magic`**: Anti-bot or "human-like" interactions.
|
||||
|
||||
---
|
||||
|
||||
|
||||
Reference in New Issue
Block a user