Add remove_consent_popups flag and fix from_kwargs dict deserialization

Add CrawlerRunConfig.remove_consent_popups (bool, default False) that
targets GDPR/cookie consent popups from 70+ known CMP providers including
OneTrust, Cookiebot, TrustArc, Quantcast, Didomi, Usercentrics,
Sourcepoint, Google FundingChoices, and many more.

The JS strategy uses a 5-phase approach:
1. Click "Accept All" buttons (cleanest dismissal, sets cookies)
2. Try CMP JavaScript APIs (__tcfapi, Didomi, Cookiebot, Osano, Klaro)
3. Remove known CMP containers by selector (~120 selectors)
4. Handle iframe-based and shadow DOM CMPs
5. Restore body scroll and remove CMP body classes

Also fix from_kwargs() in CrawlerRunConfig and BrowserConfig to
auto-deserialize dict values using the existing from_serializable_dict()
infrastructure. Previously, strategy objects like markdown_generator
arriving as {"type": "DefaultMarkdownGenerator", "params": {...}} from
JSON APIs were passed through as raw dicts, causing crashes when the
crawler later called methods on them.
This commit is contained in:
unclecode
2026-02-11 12:46:47 +00:00
parent 44b8afb6dc
commit 3fc7730aaf
7 changed files with 792 additions and 6 deletions

View File

@@ -734,6 +734,12 @@ class BrowserConfig:
@staticmethod
def from_kwargs(kwargs: dict) -> "BrowserConfig":
# Auto-deserialize any dict values that use the {"type": ..., "params": ...}
# serialization format (e.g. from JSON API requests or dump()/load() roundtrips).
kwargs = {
k: from_serializable_dict(v) if isinstance(v, dict) and "type" in v else v
for k, v in kwargs.items()
}
return BrowserConfig(
browser_type=kwargs.get("browser_type", "chromium"),
headless=kwargs.get("headless", True),
@@ -1282,6 +1288,10 @@ class CrawlerRunConfig():
Default: False.
remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
Default: False.
remove_consent_popups (bool): If True, remove GDPR/cookie consent popups (IAB TCF/CMP)
before extracting HTML. Targets known CMP providers like
OneTrust, Cookiebot, TrustArc, Quantcast, Didomi, etc.
Default: False.
simulate_user (bool): If True, simulate user interactions (mouse moves, clicks) for anti-bot measures.
Default: False.
override_navigator (bool): If True, overrides navigator properties for more human-like behavior.
@@ -1445,6 +1455,7 @@ class CrawlerRunConfig():
max_scroll_steps: Optional[int] = None,
process_iframes: bool = False,
remove_overlay_elements: bool = False,
remove_consent_popups: bool = False,
simulate_user: bool = False,
override_navigator: bool = False,
magic: bool = False,
@@ -1573,6 +1584,7 @@ class CrawlerRunConfig():
self.max_scroll_steps = max_scroll_steps
self.process_iframes = process_iframes
self.remove_overlay_elements = remove_overlay_elements
self.remove_consent_popups = remove_consent_popups
self.simulate_user = simulate_user
self.override_navigator = override_navigator
self.magic = magic
@@ -1798,6 +1810,13 @@ class CrawlerRunConfig():
@staticmethod
def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
# Auto-deserialize any dict values that use the {"type": ..., "params": ...}
# serialization format (e.g. from JSON API requests or dump()/load() roundtrips).
# This covers markdown_generator, extraction_strategy, content_filter, etc.
kwargs = {
k: from_serializable_dict(v) if isinstance(v, dict) and "type" in v else v
for k, v in kwargs.items()
}
return CrawlerRunConfig(
# Content Processing Parameters
word_count_threshold=kwargs.get("word_count_threshold", 200),
@@ -1854,6 +1873,7 @@ class CrawlerRunConfig():
max_scroll_steps=kwargs.get("max_scroll_steps"),
process_iframes=kwargs.get("process_iframes", False),
remove_overlay_elements=kwargs.get("remove_overlay_elements", False),
remove_consent_popups=kwargs.get("remove_consent_popups", False),
simulate_user=kwargs.get("simulate_user", False),
override_navigator=kwargs.get("override_navigator", False),
magic=kwargs.get("magic", False),
@@ -1978,6 +1998,7 @@ class CrawlerRunConfig():
"max_scroll_steps": self.max_scroll_steps,
"process_iframes": self.process_iframes,
"remove_overlay_elements": self.remove_overlay_elements,
"remove_consent_popups": self.remove_consent_popups,
"simulate_user": self.simulate_user,
"override_navigator": self.override_navigator,
"magic": self.magic,

View File

@@ -465,6 +465,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
config.wait_for or
config.scan_full_page or
config.remove_overlay_elements or
config.remove_consent_popups or
config.simulate_user or
config.magic or
config.process_iframes or
@@ -972,6 +973,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if config.delay_before_return_html:
await asyncio.sleep(config.delay_before_return_html)
# Handle CMP/consent popup removal (before generic overlay removal)
if config.remove_consent_popups:
await self.remove_consent_popups(page)
# Handle overlay removal
if config.remove_overlay_elements:
await self.remove_overlay_elements(page)
@@ -1447,6 +1452,50 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
params={"error": str(e)},
)
async def remove_consent_popups(self, page: Page) -> None:
"""
Removes GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot,
TrustArc, Quantcast, Didomi, Usercentrics, Sourcepoint, Klaro, Osano, Iubenda,
Complianz, CookieYes, ConsentManager, LiveRamp/Fides, etc.).
Strategy:
1. Try clicking "Accept All" buttons (cleanest dismissal, sets cookies)
2. Try IAB TCF / CMP JavaScript APIs
3. Remove known CMP containers by selector
4. Handle iframe-based CMPs
5. Restore body scroll
Args:
page (Page): The Playwright page instance
"""
remove_consent_js = load_js_script("remove_consent_popups")
try:
await self.adapter.evaluate(page,
f"""
(async () => {{
try {{
const removeConsent = {remove_consent_js};
await removeConsent();
return {{ success: true }};
}} catch (error) {{
return {{
success: false,
error: error.toString(),
stack: error.stack
}};
}}
}})()
"""
)
await page.wait_for_timeout(500) # Wait for any animations to complete
except Exception as e:
self.logger.warning(
message="Failed to remove consent popups: {error}",
tag="SCRAPE",
params={"error": str(e)},
)
async def export_pdf(self, page: Page) -> bytes:
"""
Exports the current page as a PDF.

View File

@@ -0,0 +1,710 @@
async () => {
// Helper: check if element is visible
const isVisible = (elem) => {
if (!elem) return false;
const style = window.getComputedStyle(elem);
return style.display !== "none" && style.visibility !== "hidden" && style.opacity !== "0";
};
// =========================================================================
// Phase 1: Click "Accept All" buttons (fastest, cleanest dismissal)
// =========================================================================
// CMP-specific accept button selectors (ordered by market share)
const cmpAcceptSelectors = [
// OneTrust / CookiePro
'#onetrust-accept-btn-handler',
'#accept-recommended-btn-handler',
// Cookiebot (Usercentrics/Cybot)
'#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll',
'#CybotCookiebotDialogBodyButtonAccept',
'#CybotCookiebotDialogBodyLevelButtonAccept',
// Didomi
'#didomi-notice-agree-button',
'.didomi-button-highlight',
// Quantcast Choice
'.qc-cmp2-summary-buttons button[mode="primary"]',
// Sourcepoint
'.sp_choice_type_11',
'.sp_choice_type_ACCEPT_ALL',
// Google FundingChoices / FC
'.fc-button.fc-cta-consent.fc-primary-button',
'.fc-cta-consent',
'.fc-confirm-choices',
// TrustArc
'#truste-consent-button',
'.truste_popframe .pdynamicbutton .call',
// ConsentManager.net
'.cmpboxbtnyes',
'#cmpwelcomebtnyes',
'.cmpboxbtnyescustomchoices',
// Osano
'.osano-cm-accept-all',
'.osano-cm-accept',
// Iubenda
'#iubenda-cs-accept-btn',
'.iubenda-cs-accept-btn',
// Complianz (WordPress)
'.cmplz-btn.cmplz-accept',
// LiveRamp / Fides (Ethyca)
'.fides-accept-all-button',
'#fides-banner .fides-accept-all-button',
// CookieYes
'.cky-btn-accept',
'[data-cky-tag="accept-button"]',
// Klaro
'.klaro .cm-btn-accept-all',
'.klaro .cm-btn-success',
'.klaro .cookie-notice .cm-btn-success',
// Termly
'[data-tid="banner-accept"]',
// CookieFirst
'button[data-cookiefirst-action="accept"]',
// CookieScript
'#cookiescript_accept',
// Borlabs Cookie (WordPress)
'a[data-cookie-accept-all]',
'.brlbs-btn-accept-all',
// Civic Cookie Control
'#ccc-recommended-settings',
'#ccc-notify-accept',
'.ccc-accept-button',
// Cookie Information
'.coi-banner__accept',
// Evidon / Crownpeak
'#_evidon-accept-button',
// Axeptio
'button#axeptio_btn_acceptAll',
// HubSpot
'#hs-eu-confirmation-button',
// Ketch
'#lanyard_root button[class*="confirmButton"]',
// Moove GDPR (WordPress)
'.moove-gdpr-infobar-allow-all',
// TermsFeed
'.cc-nb-okagree',
// tarteaucitron.js
'#tarteaucitronPersonalize2',
'.tarteaucitronAllow',
// CookieHub
'.ch2-allow-all-btn',
// Cookie Notice (WP plugin)
'#cn-accept-cookie',
// EU Cookie Compliance (Drupal)
'.eu-cookie-compliance-banner .agree-button',
'.eu-cookie-compliance-banner .accept-all',
// WordPress GDPR Cookie Consent
'#gdpr-cookie-consent-bar #cookie_action_accept',
// Cookie Law Info / WebToffee
'[data-cli_action="accept"]',
// Shopify Native
'#shopify-pc__banner__btn-accept',
// Wix Native
'[data-hook="ccsu-banner-accept"]',
// Finsweet (Webflow)
'[fs-consent-element="allow"]',
'[fs-cc="banner"] [fs-cc="allow"]',
// Pandectes (Shopify)
'#pandectes-banner .cc-allow',
// Clickio
'#cl-consent [data-role="b_agree"]',
// Snigel
'.snigel-cmp-framework #accept-choices',
// Cassie
'.cassie-accept-all',
// FastCMP
'.fast-cmp-home-accept button',
// Sibbo
'#acceptAllMain',
// PubTech
'#pt-accept-all',
// UniConsent
'#unic-agree',
// Ezoic
'#ez-accept-all',
// Transcend
'#transcend-consent-manager .inner-container button',
// Cloudflare Zaraz
'#cf_consent-buttons__accept-all',
// CookieConsent v2 (Insites/Osano OSS)
'#s-all-bn',
// CookieConsent v3 (orestbida)
'.cm__btn[data-role="all"]',
// Openli / Legal Monster
'#lm-accept-all',
// UK Cookie Consent (Catapult)
'#catapultCookie',
// Mediavine
'[data-name="mediavine-gdpr-cmp"] [format="primary"]',
// Consentmo (Shopify)
'.isense-cc-allow',
// AdOpt
'#adopt-accept-all-button',
// Truyo
'button#acceptAllCookieButton',
// KConsent
'#kc-acceptAndHide',
// Gravito
'#modalConfirmBtn.gravitoCMP-button',
// CT Ultimate GDPR (WordPress)
'#ct-ultimate-gdpr-cookie-accept',
// Hu-manity
'[data-hu-action="cookies-notice-consent-choices-3"]',
// GDPR Legal Cookie (Shopify)
'.overlay_bc_banner *[data-cookie-accept-all]',
// Bing / Microsoft
'#bnp_btn_accept',
// Privado
'#cookie-consent-banner #accept-button',
// Cookie Alert
'button[data-controller="cookie-alert/extended/button/accept"]',
// iWink / STARTER
'body.cookies-request #cookie-bar .allow-cookies',
// Real Cookie Banner (devowl.io)
'.rcb-banner-cta-accept-all',
];
// Generic accept button selectors (attribute-based)
const genericAcceptSelectors = [
'button[id*="accept" i]',
'button[class*="accept-all" i]',
'button[class*="acceptAll" i]',
'a[id*="accept" i]',
'button[id*="agree" i]',
'button[class*="agree" i]',
'button[class*="allow-all" i]',
'button[class*="allowAll" i]',
'button[data-action="accept"]',
'button[data-action="accept-all"]',
'button[data-gdpr="accept"]',
'button[data-consent="accept"]',
];
// Try clicking a CMP-specific accept button
const clickButton = async (selectors) => {
for (const selector of selectors) {
try {
const btn = document.querySelector(selector);
if (btn && isVisible(btn)) {
btn.click();
await new Promise(r => setTimeout(r, 300));
return true;
}
} catch (e) { /* continue */ }
}
return false;
};
let accepted = await clickButton(cmpAcceptSelectors);
if (!accepted) accepted = await clickButton(genericAcceptSelectors);
// Text-content fallback: find buttons by visible text
if (!accepted) {
const acceptPatterns = [
/^accept\s*(all)?(\s*cookies)?$/i,
/^allow\s*(all)?(\s*cookies)?$/i,
/^i\s*agree$/i,
/^agree(\s*(and|&)\s*(close|continue))?$/i,
/^got\s*it[!]?$/i,
/^consent$/i,
/^(accept|agree)\s*&?\s*close$/i,
];
const candidates = document.querySelectorAll(
'button, a[role="button"], [role="button"], input[type="submit"], input[type="button"]'
);
for (const btn of candidates) {
const text = (btn.textContent || btn.value || '').trim();
if (text.length > 0 && text.length < 40) {
for (const pattern of acceptPatterns) {
if (pattern.test(text) && isVisible(btn)) {
btn.click();
accepted = true;
await new Promise(r => setTimeout(r, 300));
break;
}
}
if (accepted) break;
}
}
}
// Shadow DOM: Usercentrics, Axeptio
if (!accepted) {
const shadowRoots = [
{ id: 'usercentrics-root', btn: 'button[data-testid="uc-accept-all-button"]' },
{ cls: 'axeptio_mount', btn: 'button#axeptio_btn_acceptAll' },
];
for (const cfg of shadowRoots) {
try {
const host = cfg.id
? document.getElementById(cfg.id)
: document.querySelector('.' + cfg.cls);
if (host && host.shadowRoot) {
const btn = host.shadowRoot.querySelector(cfg.btn);
if (btn) {
btn.click();
accepted = true;
await new Promise(r => setTimeout(r, 300));
break;
}
}
} catch (e) { /* continue */ }
}
}
// Iframe-based CMPs (Sourcepoint, FastCMP, TrustArc, Privacy Manager)
if (!accepted) {
const iframeSelectors = [
'iframe[id^="sp_message_iframe"]',
'iframe#fast-cmp-iframe',
'iframe[id*="consent" i]',
'iframe[title*="consent" i]',
'iframe[title*="cookie" i]',
'iframe[title*="privacy" i]',
'iframe[src*="privacymanager" i]',
'iframe[src*="consent-tool" i]',
];
for (const sel of iframeSelectors) {
try {
const iframe = document.querySelector(sel);
if (iframe && iframe.contentDocument) {
const iframeDoc = iframe.contentDocument;
const btns = iframeDoc.querySelectorAll(
'button[title="Accept All" i], button[title="Accept" i], ' +
'.sp_choice_type_11, button.message-button, ' +
'button[class*="accept" i], button[class*="agree" i]'
);
for (const btn of btns) {
if (btn.offsetParent !== null) {
btn.click();
accepted = true;
await new Promise(r => setTimeout(r, 300));
break;
}
}
if (accepted) break;
}
} catch (e) { /* cross-origin iframes will throw SecurityError, expected */ }
}
}
// =========================================================================
// Phase 2: Try CMP JavaScript APIs
// =========================================================================
// IAB TCF v2 API
if (typeof window.__tcfapi === 'function') {
try {
window.__tcfapi('addEventListener', 2, () => {});
} catch (e) { /* continue */ }
}
// Didomi API
if (typeof window.Didomi !== 'undefined') {
try {
window.Didomi.setUserAgreeToAll();
} catch (e) { /* continue */ }
}
// Cookiebot API
if (typeof window.Cookiebot !== 'undefined') {
try {
window.Cookiebot.submitCustomConsent(true, true, true);
} catch (e) { /* continue */ }
}
// Osano API
if (typeof window.Osano !== 'undefined') {
try {
window.Osano.cm.acceptAll();
} catch (e) { /* continue */ }
}
// Klaro API
if (typeof window.klaro !== 'undefined') {
try {
window.klaro.getManager().acceptAll();
} catch (e) { /* continue */ }
}
// Wait for CMP animations/transitions
await new Promise(r => setTimeout(r, 500));
// =========================================================================
// Phase 3: Remove known CMP containers by selector
// =========================================================================
const cmpContainerSelectors = [
// --- Tier 1: Enterprise CMPs ---
// OneTrust / CookiePro
'#onetrust-consent-sdk',
'#onetrust-banner-sdk',
'.onetrust-pc-dark-filter',
'#onetrust-pc-sdk',
// Cookiebot (Usercentrics/Cybot)
'#CybotCookiebotDialog',
'#CybotCookiebotDialogBodyUnderlay',
'#dtcookie-container',
'#cookiebanner',
// TrustArc
'#truste-consent-track',
'.truste_overlay',
'.truste_box_overlay',
'#truste-consent-content',
'#consent_blackbar',
'.trustarc-banner-container',
// Quantcast Choice
'.qc-cmp2-container',
'#qc-cmp2-main',
'.qc-cmp-cleanslate',
// Didomi
'#didomi-host',
'#didomi-popup',
'#didomi-notice',
// Usercentrics
'#usercentrics-root',
'#usercentrics-cmp-ui',
// Sourcepoint
'div[id^="sp_message_container"]',
'.sp_message_container',
// Google FundingChoices / FC
'.fc-consent-root',
'.fc-dialog-overlay',
'.fc-dialog-container',
// --- Tier 2: Mid-Market CMPs ---
// Klaro
'.klaro',
// Osano
'.osano-cm-window',
'.osano-cm-dialog',
// Iubenda
'#iubenda-cs-banner',
// Complianz (WordPress)
'.cmplz-cookiebanner',
'#cmplz-cookiebanner-container',
// CookieYes
'.cky-consent-container',
'.cky-overlay',
// ConsentManager.net
'.cmpbox',
'#cmpbox',
'#cmpbox2',
'#cmpwrapper',
// LiveRamp / Fides (Ethyca)
'.fides-overlay',
'#fides-banner',
'#fides-overlay',
'#fides-overlay-wrapper',
// Termly
'#termly-code-snippet-support',
// CookieFirst
'#cookiefirst-root',
'.cookiefirst-root',
// CookieScript
'#cookiescript_injected',
'.cookiescript_fsd_main',
// Borlabs Cookie (WordPress)
'#BorlabsCookieBox',
'._brlbs-bar-wrap',
'._brlbs-box-wrap',
// Civic Cookie Control
'#ccc',
'#ccc-module',
'#ccc-overlay',
// Cookie Information
'#cookie-information-template-wrapper',
'#coiOverlay',
// Evidon / Crownpeak
'#_evidon_banner',
'#_evidon-background',
'#evidon-prefdiag-overlay',
// Axeptio
'.axeptio_widget',
'.axeptio_mount',
// HubSpot
'#hs-eu-cookie-confirmation',
// Ketch
'#lanyard_root',
// --- Tier 3: Regional / WordPress / Specialized CMPs ---
// tarteaucitron.js
'#tarteaucitronRoot',
'#tarteaucitronAlertBig',
// CookieHub
'.ch2-container',
'.ch2-dialog',
// Moove GDPR (WordPress)
'#moove_gdpr_cookie_info_bar',
'#moove_gdpr_cookie_modal',
'.gdpr_cookie_settings_popup_overlay',
// TermsFeed
'.termsfeed-com---nb',
// Cookie Notice (WP plugin)
'#cookie-notice',
// Cookie Law Info / WebToffee
'#cookie-law-info-bar',
'#cookie-law-bg',
'.cli-popupbar-overlay',
// EU Cookie Compliance (Drupal)
'.eu-cookie-compliance-banner',
// WordPress GDPR Cookie Consent
'#gdpr-cookie-consent-bar',
// Shopify Native
'#shopify-pc__banner',
// Wix Native
'[data-comp-type="cookie-banner-root-wix"]',
'[data-hook="ccsu-banner-wrapper"]',
// Finsweet (Webflow)
'[fs-consent-element="banner"]',
'.fs-cc-components',
// Pandectes (Shopify)
'#pandectes-banner',
// Clickio
'#cl-consent',
// Snigel
'.snigel-cmp-framework',
// Cassie
'.cassie-cookie-module',
'.cassie-pre-banner',
// FastCMP
'#fast-cmp-root',
// Sibbo
'sibbo-cmp-layout',
// PubTech
'#pubtech-cmp',
// UniConsent
'.unic',
// Ezoic
'#ez-cookie-dialog-wrapper',
// Transcend
'#transcend-consent-manager',
// Cloudflare Zaraz
'.cf_modal_container',
// CookieConsent v2 (Insites/Osano OSS)
'#cc--main',
// CookieConsent v3 (orestbida)
'#cc-main',
// Openli / Legal Monster
'.legalmonster-cleanslate',
// UK Cookie Consent (Catapult)
'#catapult-cookie-bar',
// Sirdata
'#sd-cmp',
// Mediavine
'[data-name="mediavine-gdpr-cmp"]',
// Consentmo (Shopify)
'.isense-cc-window',
// AdOpt
'#cookie-banner',
// Truyo
'#truyo-consent-module',
// KConsent
'#kconsent',
'.kc-overlay',
// Gravito
'.gravitoCMP-background-overlay',
// CT Ultimate GDPR (WordPress)
'#ct-ultimate-gdpr-cookie-popup',
// Hu-manity
'#hu.hu-wrapper',
// GDPR Legal Cookie (Shopify)
'.overlay_bc_banner',
// Piwik PRO
'.PiwikPROConsentForm-container',
// Tealium
'#__tealiumGDPRecModal',
'#__tealiumImplicitmodal',
'#consent-layer',
// PMC (Penske Media)
'#pmc-pp-tou--notice',
// Privado
'#cookie-consent-banner',
// Real Cookie Banner (devowl.io)
'.rcb-banner',
// Bing / Microsoft
'#bnp_container',
'#bnp_cookie_banner',
// LinkedIn
'.artdeco-global-alert[type="COOKIE_CONSENT"]',
// Privacy Manager
'#gdpr-consent-tool-wrapper',
// --- Generic patterns (catch-all) ---
'[class*="cookie-consent" i]',
'[id*="cookie-consent" i]',
'[class*="cookie-banner" i]',
'[id*="cookie-banner" i]',
'[class*="consent-banner" i]',
'[id*="consent-banner" i]',
'[class*="consent-popup" i]',
'[id*="consent-popup" i]',
'[class*="gdpr-banner" i]',
'[id*="gdpr-banner" i]',
'[class*="cookie-notice" i]',
'[id*="cookie-notice" i]',
'[class*="cookie-law" i]',
'[id*="cookie-law" i]',
'[class*="cookie-popup" i]',
'[id*="cookie-popup" i]',
'[class*="cookie-overlay" i]',
'[id*="cookie-overlay" i]',
'.cc-banner',
'.cc-window',
];
for (const selector of cmpContainerSelectors) {
try {
const elements = document.querySelectorAll(selector);
elements.forEach(el => el.remove());
} catch (e) { /* continue */ }
}
// =========================================================================
// Phase 4: Remove CMP iframes
// =========================================================================
const cmpIframeSelectors = [
'iframe[id^="sp_message_iframe"]',
'iframe#fast-cmp-iframe',
'iframe[src*="consent" i]',
'iframe[src*="cookie-cdn" i]',
'iframe[src*="cookiebot" i]',
'iframe[src*="trustarc" i]',
'iframe[src*="consentmanager" i]',
'iframe[src*="privacymanager" i]',
'iframe[src*="cmp-consent-tool" i]',
'iframe[title*="consent" i]',
'iframe[title*="cookie" i]',
'iframe[title*="gdpr" i]',
'iframe[name="__tcfapiLocator"]',
];
for (const selector of cmpIframeSelectors) {
try {
const iframes = document.querySelectorAll(selector);
iframes.forEach(iframe => {
// Also remove parent if it's a CMP wrapper (fixed/high-z)
const parent = iframe.parentElement;
if (parent && parent.children.length <= 2) {
const style = window.getComputedStyle(parent);
if (style.position === 'fixed' || style.position === 'absolute' ||
parseInt(style.zIndex) > 999) {
parent.remove();
return;
}
}
iframe.remove();
});
} catch (e) { /* continue */ }
}
// =========================================================================
// Phase 5: Restore body scroll and clean up CMP artifacts
// =========================================================================
// Reset overflow on body and html
document.body.style.overflow = '';
document.body.style.overflowY = '';
document.body.style.position = '';
document.body.style.marginRight = '';
document.body.style.paddingRight = '';
document.documentElement.style.overflow = '';
document.documentElement.style.overflowY = '';
document.documentElement.style.position = '';
// Remove known CMP body classes
const cmpBodyClasses = [
'ot-overflow-hidden',
'sp_message_open',
'didomi-popup-open',
'cmpbox-show',
'cmplz-blocked',
'qc-cmp2-no-scroll',
'osano-cm-show',
'cky-modal-open',
'fides-overlay-modal-open',
'cc-no-scroll',
'gdpr-cookie-notice-center-loaded',
'fc-consent-root-open',
'cookie-notification-active',
'consent-bar-push-large',
'with-eu-cookie-guideline',
'cookies-request',
'eu-cookie-compliance-popup-open',
'has-cookie-bar',
];
for (const cls of cmpBodyClasses) {
document.body.classList.remove(cls);
document.documentElement.classList.remove(cls);
}
};

View File

@@ -159,6 +159,7 @@ Use these for controlling whether you read or write from a local content cache.
| **`max_scroll_steps`** | `int or None` (None) | Maximum number of scroll steps during full page scan. If None, scrolls until entire page is loaded. |
| **`process_iframes`** | `bool` (False) | Inlines iframe content for single-page extraction. |
| **`remove_overlay_elements`** | `bool` (False) | Removes potential modals/popups blocking the main content. |
| **`remove_consent_popups`** | `bool` (False) | Removes GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, TrustArc, Quantcast, Didomi, Sourcepoint, FundingChoices, etc.). Tries clicking "Accept All" first, then falls back to DOM removal. |
| **`simulate_user`** | `bool` (False) | Simulate user interactions (mouse movements) to avoid bot detection. |
| **`override_navigator`** | `bool` (False) | Override `navigator` properties in JS for stealth. |
| **`magic`** | `bool` (False) | Automatic handling of popups/consent banners. Experimental. |

View File

@@ -1787,6 +1787,7 @@ run_cfg = CrawlerRunConfig(
| **`scroll_delay`** | `float` (0.2) | Delay between scroll steps if `scan_full_page=True`. |
| **`process_iframes`** | `bool` (False) | Inlines iframe content for single-page extraction. |
| **`remove_overlay_elements`** | `bool` (False) | Removes potential modals/popups blocking the main content. |
| **`remove_consent_popups`** | `bool` (False) | Removes GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, TrustArc, Quantcast, Didomi, Sourcepoint, FundingChoices, etc.). Tries clicking "Accept All" first, then falls back to DOM removal. |
| **`simulate_user`** | `bool` (False) | Simulate user interactions (mouse movements) to avoid bot detection. |
| **`override_navigator`** | `bool` (False) | Override `navigator` properties in JS for stealth. |
| **`magic`** | `bool` (False) | Automatic handling of popups/consent banners. Experimental. |
@@ -3344,8 +3345,9 @@ Below are the key interaction-related parameters in `CrawlerRunConfig`. For a fu
- **`wait_for`**: CSS (`"css:..."`) or JS (`"js:..."`) expression to wait for.
- **`session_id`**: Reuse the same page across calls.
- **`cache_mode`**: Whether to read/write from the cache or bypass.
- **`remove_overlay_elements`**: Remove certain popups automatically.
- **`simulate_user`, `override_navigator`, `magic`**: Anti-bot or “human-like” interactions.
- **`remove_overlay_elements`**: Remove certain popups automatically.
- **`remove_consent_popups`**: Remove GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, Didomi, etc.).
- **`simulate_user`, `override_navigator`, `magic`**: Anti-bot or "human-like" interactions.
## 8. Conclusion
1. **Execute JavaScript** for scrolling, clicks, or form filling.
2. **Wait** for CSS or custom JS conditions before capturing data.

View File

@@ -153,8 +153,10 @@ Some sites embed content in `<iframe>` tags. If you want that inline:
```python
config = CrawlerRunConfig(
# Merge iframe content into the final output
process_iframes=True,
remove_overlay_elements=True
process_iframes=True,
remove_overlay_elements=True,
# Remove GDPR/cookie consent popups (OneTrust, Cookiebot, etc.)
remove_consent_popups=True
)
```

View File

@@ -326,8 +326,9 @@ Below are the key interaction-related parameters in `CrawlerRunConfig`. For a fu
- **`wait_for`**: CSS (`"css:..."`) or JS (`"js:..."`) expression to wait for.
- **`session_id`**: Reuse the same page across calls.
- **`cache_mode`**: Whether to read/write from the cache or bypass.
- **`remove_overlay_elements`**: Remove certain popups automatically.
- **`simulate_user`, `override_navigator`, `magic`**: Anti-bot or “human-like” interactions.
- **`remove_overlay_elements`**: Remove certain popups automatically.
- **`remove_consent_popups`**: Remove GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, Didomi, etc.).
- **`simulate_user`, `override_navigator`, `magic`**: Anti-bot or "human-like" interactions.
---