From 3fc7730aaf705afae64a7cda63cfbadab258a80e Mon Sep 17 00:00:00 2001 From: unclecode Date: Wed, 11 Feb 2026 12:46:47 +0000 Subject: [PATCH] Add remove_consent_popups flag and fix from_kwargs dict deserialization Add CrawlerRunConfig.remove_consent_popups (bool, default False) that targets GDPR/cookie consent popups from 70+ known CMP providers including OneTrust, Cookiebot, TrustArc, Quantcast, Didomi, Usercentrics, Sourcepoint, Google FundingChoices, and many more. The JS strategy uses a 5-phase approach: 1. Click "Accept All" buttons (cleanest dismissal, sets cookies) 2. Try CMP JavaScript APIs (__tcfapi, Didomi, Cookiebot, Osano, Klaro) 3. Remove known CMP containers by selector (~120 selectors) 4. Handle iframe-based and shadow DOM CMPs 5. Restore body scroll and remove CMP body classes Also fix from_kwargs() in CrawlerRunConfig and BrowserConfig to auto-deserialize dict values using the existing from_serializable_dict() infrastructure. Previously, strategy objects like markdown_generator arriving as {"type": "DefaultMarkdownGenerator", "params": {...}} from JSON APIs were passed through as raw dicts, causing crashes when the crawler later called methods on them. --- crawl4ai/async_configs.py | 21 + crawl4ai/async_crawler_strategy.py | 49 ++ crawl4ai/js_snippet/remove_consent_popups.js | 710 +++++++++++++++++++ docs/md_v2/api/parameters.md | 1 + docs/md_v2/complete-sdk-reference.md | 6 +- docs/md_v2/core/content-selection.md | 6 +- docs/md_v2/core/page-interaction.md | 5 +- 7 files changed, 792 insertions(+), 6 deletions(-) create mode 100644 crawl4ai/js_snippet/remove_consent_popups.js diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 69ab389f..dbacaa75 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -734,6 +734,12 @@ class BrowserConfig: @staticmethod def from_kwargs(kwargs: dict) -> "BrowserConfig": + # Auto-deserialize any dict values that use the {"type": ..., "params": ...} + # serialization format (e.g. 
from JSON API requests or dump()/load() roundtrips). + kwargs = { + k: from_serializable_dict(v) if isinstance(v, dict) and "type" in v else v + for k, v in kwargs.items() + } return BrowserConfig( browser_type=kwargs.get("browser_type", "chromium"), headless=kwargs.get("headless", True), @@ -1282,6 +1288,10 @@ class CrawlerRunConfig(): Default: False. remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML. Default: False. + remove_consent_popups (bool): If True, remove GDPR/cookie consent popups (IAB TCF/CMP) + before extracting HTML. Targets known CMP providers like + OneTrust, Cookiebot, TrustArc, Quantcast, Didomi, etc. + Default: False. simulate_user (bool): If True, simulate user interactions (mouse moves, clicks) for anti-bot measures. Default: False. override_navigator (bool): If True, overrides navigator properties for more human-like behavior. @@ -1445,6 +1455,7 @@ class CrawlerRunConfig(): max_scroll_steps: Optional[int] = None, process_iframes: bool = False, remove_overlay_elements: bool = False, + remove_consent_popups: bool = False, simulate_user: bool = False, override_navigator: bool = False, magic: bool = False, @@ -1573,6 +1584,7 @@ class CrawlerRunConfig(): self.max_scroll_steps = max_scroll_steps self.process_iframes = process_iframes self.remove_overlay_elements = remove_overlay_elements + self.remove_consent_popups = remove_consent_popups self.simulate_user = simulate_user self.override_navigator = override_navigator self.magic = magic @@ -1798,6 +1810,13 @@ class CrawlerRunConfig(): @staticmethod def from_kwargs(kwargs: dict) -> "CrawlerRunConfig": + # Auto-deserialize any dict values that use the {"type": ..., "params": ...} + # serialization format (e.g. from JSON API requests or dump()/load() roundtrips). + # This covers markdown_generator, extraction_strategy, content_filter, etc. 
+ kwargs = { + k: from_serializable_dict(v) if isinstance(v, dict) and "type" in v else v + for k, v in kwargs.items() + } return CrawlerRunConfig( # Content Processing Parameters word_count_threshold=kwargs.get("word_count_threshold", 200), @@ -1854,6 +1873,7 @@ class CrawlerRunConfig(): max_scroll_steps=kwargs.get("max_scroll_steps"), process_iframes=kwargs.get("process_iframes", False), remove_overlay_elements=kwargs.get("remove_overlay_elements", False), + remove_consent_popups=kwargs.get("remove_consent_popups", False), simulate_user=kwargs.get("simulate_user", False), override_navigator=kwargs.get("override_navigator", False), magic=kwargs.get("magic", False), @@ -1978,6 +1998,7 @@ class CrawlerRunConfig(): "max_scroll_steps": self.max_scroll_steps, "process_iframes": self.process_iframes, "remove_overlay_elements": self.remove_overlay_elements, + "remove_consent_popups": self.remove_consent_popups, "simulate_user": self.simulate_user, "override_navigator": self.override_navigator, "magic": self.magic, diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 944bb2b7..9ac3f7fc 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -465,6 +465,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): config.wait_for or config.scan_full_page or config.remove_overlay_elements or + config.remove_consent_popups or config.simulate_user or config.magic or config.process_iframes or @@ -972,6 +973,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if config.delay_before_return_html: await asyncio.sleep(config.delay_before_return_html) + # Handle CMP/consent popup removal (before generic overlay removal) + if config.remove_consent_popups: + await self.remove_consent_popups(page) + # Handle overlay removal if config.remove_overlay_elements: await self.remove_overlay_elements(page) @@ -1447,6 +1452,50 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): params={"error": 
str(e)}, ) + async def remove_consent_popups(self, page: Page) -> None: + """ + Removes GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, + TrustArc, Quantcast, Didomi, Usercentrics, Sourcepoint, Klaro, Osano, Iubenda, + Complianz, CookieYes, ConsentManager, LiveRamp/Fides, etc.). + + Strategy: + 1. Try clicking "Accept All" buttons (cleanest dismissal, sets cookies) + 2. Try IAB TCF / CMP JavaScript APIs + 3. Remove known CMP containers by selector + 4. Remove CMP iframes (and their small fixed/absolute/high z-index wrappers) + 5. Restore body/html scroll styles and remove known CMP body classes + + Args: + page (Page): The Playwright page instance + """ + remove_consent_js = load_js_script("remove_consent_popups") + + try: + await self.adapter.evaluate(page, + f""" + (async () => {{ + try {{ + const removeConsent = {remove_consent_js}; + await removeConsent(); + return {{ success: true }}; + }} catch (error) {{ + return {{ + success: false, + error: error.toString(), + stack: error.stack + }}; + }} + }})() + """ + ) + await page.wait_for_timeout(500) # Wait for any animations to complete + except Exception as e: + self.logger.warning( + message="Failed to remove consent popups: {error}", + tag="SCRAPE", + params={"error": str(e)}, + ) + async def export_pdf(self, page: Page) -> bytes: """ Exports the current page as a PDF. 
diff --git a/crawl4ai/js_snippet/remove_consent_popups.js b/crawl4ai/js_snippet/remove_consent_popups.js new file mode 100644 index 00000000..9aac8d34 --- /dev/null +++ b/crawl4ai/js_snippet/remove_consent_popups.js @@ -0,0 +1,710 @@ +async () => { + // Helper: check if element is visible + const isVisible = (elem) => { + if (!elem) return false; + const style = window.getComputedStyle(elem); + return style.display !== "none" && style.visibility !== "hidden" && style.opacity !== "0"; + }; + + // ========================================================================= + // Phase 1: Click "Accept All" buttons (fastest, cleanest dismissal) + // ========================================================================= + + // CMP-specific accept button selectors (ordered by market share) + const cmpAcceptSelectors = [ + // OneTrust / CookiePro + '#onetrust-accept-btn-handler', + '#accept-recommended-btn-handler', + // Cookiebot (Usercentrics/Cybot) + '#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll', + '#CybotCookiebotDialogBodyButtonAccept', + '#CybotCookiebotDialogBodyLevelButtonAccept', + // Didomi + '#didomi-notice-agree-button', + '.didomi-button-highlight', + // Quantcast Choice + '.qc-cmp2-summary-buttons button[mode="primary"]', + // Sourcepoint + '.sp_choice_type_11', + '.sp_choice_type_ACCEPT_ALL', + // Google FundingChoices / FC + '.fc-button.fc-cta-consent.fc-primary-button', + '.fc-cta-consent', + '.fc-confirm-choices', + // TrustArc + '#truste-consent-button', + '.truste_popframe .pdynamicbutton .call', + // ConsentManager.net + '.cmpboxbtnyes', + '#cmpwelcomebtnyes', + '.cmpboxbtnyescustomchoices', + // Osano + '.osano-cm-accept-all', + '.osano-cm-accept', + // Iubenda + '#iubenda-cs-accept-btn', + '.iubenda-cs-accept-btn', + // Complianz (WordPress) + '.cmplz-btn.cmplz-accept', + // LiveRamp / Fides (Ethyca) + '.fides-accept-all-button', + '#fides-banner .fides-accept-all-button', + // CookieYes + '.cky-btn-accept', + 
'[data-cky-tag="accept-button"]', + // Klaro + '.klaro .cm-btn-accept-all', + '.klaro .cm-btn-success', + '.klaro .cookie-notice .cm-btn-success', + // Termly + '[data-tid="banner-accept"]', + // CookieFirst + 'button[data-cookiefirst-action="accept"]', + // CookieScript + '#cookiescript_accept', + // Borlabs Cookie (WordPress) + 'a[data-cookie-accept-all]', + '.brlbs-btn-accept-all', + // Civic Cookie Control + '#ccc-recommended-settings', + '#ccc-notify-accept', + '.ccc-accept-button', + // Cookie Information + '.coi-banner__accept', + // Evidon / Crownpeak + '#_evidon-accept-button', + // Axeptio + 'button#axeptio_btn_acceptAll', + // HubSpot + '#hs-eu-confirmation-button', + // Ketch + '#lanyard_root button[class*="confirmButton"]', + // Moove GDPR (WordPress) + '.moove-gdpr-infobar-allow-all', + // TermsFeed + '.cc-nb-okagree', + // tarteaucitron.js + '#tarteaucitronPersonalize2', + '.tarteaucitronAllow', + // CookieHub + '.ch2-allow-all-btn', + // Cookie Notice (WP plugin) + '#cn-accept-cookie', + // EU Cookie Compliance (Drupal) + '.eu-cookie-compliance-banner .agree-button', + '.eu-cookie-compliance-banner .accept-all', + // WordPress GDPR Cookie Consent + '#gdpr-cookie-consent-bar #cookie_action_accept', + // Cookie Law Info / WebToffee + '[data-cli_action="accept"]', + // Shopify Native + '#shopify-pc__banner__btn-accept', + // Wix Native + '[data-hook="ccsu-banner-accept"]', + // Finsweet (Webflow) + '[fs-consent-element="allow"]', + '[fs-cc="banner"] [fs-cc="allow"]', + // Pandectes (Shopify) + '#pandectes-banner .cc-allow', + // Clickio + '#cl-consent [data-role="b_agree"]', + // Snigel + '.snigel-cmp-framework #accept-choices', + // Cassie + '.cassie-accept-all', + // FastCMP + '.fast-cmp-home-accept button', + // Sibbo + '#acceptAllMain', + // PubTech + '#pt-accept-all', + // UniConsent + '#unic-agree', + // Ezoic + '#ez-accept-all', + // Transcend + '#transcend-consent-manager .inner-container button', + // Cloudflare Zaraz + 
'#cf_consent-buttons__accept-all', + // CookieConsent v2 (Insites/Osano OSS) + '#s-all-bn', + // CookieConsent v3 (orestbida) + '.cm__btn[data-role="all"]', + // Openli / Legal Monster + '#lm-accept-all', + // UK Cookie Consent (Catapult) + '#catapultCookie', + // Mediavine + '[data-name="mediavine-gdpr-cmp"] [format="primary"]', + // Consentmo (Shopify) + '.isense-cc-allow', + // AdOpt + '#adopt-accept-all-button', + // Truyo + 'button#acceptAllCookieButton', + // KConsent + '#kc-acceptAndHide', + // Gravito + '#modalConfirmBtn.gravitoCMP-button', + // CT Ultimate GDPR (WordPress) + '#ct-ultimate-gdpr-cookie-accept', + // Hu-manity + '[data-hu-action="cookies-notice-consent-choices-3"]', + // GDPR Legal Cookie (Shopify) + '.overlay_bc_banner *[data-cookie-accept-all]', + // Bing / Microsoft + '#bnp_btn_accept', + // Privado + '#cookie-consent-banner #accept-button', + // Cookie Alert + 'button[data-controller="cookie-alert/extended/button/accept"]', + // iWink / STARTER + 'body.cookies-request #cookie-bar .allow-cookies', + // Real Cookie Banner (devowl.io) + '.rcb-banner-cta-accept-all', + ]; + + // Generic accept button selectors (attribute-based) + const genericAcceptSelectors = [ + 'button[id*="accept" i]', + 'button[class*="accept-all" i]', + 'button[class*="acceptAll" i]', + 'a[id*="accept" i]', + 'button[id*="agree" i]', + 'button[class*="agree" i]', + 'button[class*="allow-all" i]', + 'button[class*="allowAll" i]', + 'button[data-action="accept"]', + 'button[data-action="accept-all"]', + 'button[data-gdpr="accept"]', + 'button[data-consent="accept"]', + ]; + + // Try clicking a CMP-specific accept button + const clickButton = async (selectors) => { + for (const selector of selectors) { + try { + const btn = document.querySelector(selector); + if (btn && isVisible(btn)) { + btn.click(); + await new Promise(r => setTimeout(r, 300)); + return true; + } + } catch (e) { /* continue */ } + } + return false; + }; + + let accepted = await 
clickButton(cmpAcceptSelectors); + if (!accepted) accepted = await clickButton(genericAcceptSelectors); + + // Text-content fallback: find buttons by visible text + if (!accepted) { + const acceptPatterns = [ + /^accept\s*(all)?(\s*cookies)?$/i, + /^allow\s*(all)?(\s*cookies)?$/i, + /^i\s*agree$/i, + /^agree(\s*(and|&)\s*(close|continue))?$/i, + /^got\s*it[!]?$/i, + /^consent$/i, + /^(accept|agree)\s*&?\s*close$/i, + ]; + + const candidates = document.querySelectorAll( + 'button, a[role="button"], [role="button"], input[type="submit"], input[type="button"]' + ); + for (const btn of candidates) { + const text = (btn.textContent || btn.value || '').trim(); + if (text.length > 0 && text.length < 40) { + for (const pattern of acceptPatterns) { + if (pattern.test(text) && isVisible(btn)) { + btn.click(); + accepted = true; + await new Promise(r => setTimeout(r, 300)); + break; + } + } + if (accepted) break; + } + } + } + + // Shadow DOM: Usercentrics, Axeptio + if (!accepted) { + const shadowRoots = [ + { id: 'usercentrics-root', btn: 'button[data-testid="uc-accept-all-button"]' }, + { cls: 'axeptio_mount', btn: 'button#axeptio_btn_acceptAll' }, + ]; + for (const cfg of shadowRoots) { + try { + const host = cfg.id + ? document.getElementById(cfg.id) + : document.querySelector('.' 
+ cfg.cls); + if (host && host.shadowRoot) { + const btn = host.shadowRoot.querySelector(cfg.btn); + if (btn) { + btn.click(); + accepted = true; + await new Promise(r => setTimeout(r, 300)); + break; + } + } + } catch (e) { /* continue */ } + } + } + + // Iframe-based CMPs (Sourcepoint, FastCMP, TrustArc, Privacy Manager) + if (!accepted) { + const iframeSelectors = [ + 'iframe[id^="sp_message_iframe"]', + 'iframe#fast-cmp-iframe', + 'iframe[id*="consent" i]', + 'iframe[title*="consent" i]', + 'iframe[title*="cookie" i]', + 'iframe[title*="privacy" i]', + 'iframe[src*="privacymanager" i]', + 'iframe[src*="consent-tool" i]', + ]; + for (const sel of iframeSelectors) { + try { + const iframe = document.querySelector(sel); + if (iframe && iframe.contentDocument) { + const iframeDoc = iframe.contentDocument; + const btns = iframeDoc.querySelectorAll( + 'button[title="Accept All" i], button[title="Accept" i], ' + + '.sp_choice_type_11, button.message-button, ' + + 'button[class*="accept" i], button[class*="agree" i]' + ); + for (const btn of btns) { + if (btn.offsetParent !== null) { + btn.click(); + accepted = true; + await new Promise(r => setTimeout(r, 300)); + break; + } + } + if (accepted) break; + } + } catch (e) { /* cross-origin iframes will throw SecurityError, expected */ } + } + } + + // ========================================================================= + // Phase 2: Try CMP JavaScript APIs + // ========================================================================= + + // IAB TCF v2 API + if (typeof window.__tcfapi === 'function') { + try { + window.__tcfapi('addEventListener', 2, () => {}); + } catch (e) { /* continue */ } + } + + // Didomi API + if (typeof window.Didomi !== 'undefined') { + try { + window.Didomi.setUserAgreeToAll(); + } catch (e) { /* continue */ } + } + + // Cookiebot API + if (typeof window.Cookiebot !== 'undefined') { + try { + window.Cookiebot.submitCustomConsent(true, true, true); + } catch (e) { /* continue */ } + } + + // 
Osano API + if (typeof window.Osano !== 'undefined') { + try { + window.Osano.cm.acceptAll(); + } catch (e) { /* continue */ } + } + + // Klaro API + if (typeof window.klaro !== 'undefined') { + try { + window.klaro.getManager().acceptAll(); + } catch (e) { /* continue */ } + } + + // Wait for CMP animations/transitions + await new Promise(r => setTimeout(r, 500)); + + // ========================================================================= + // Phase 3: Remove known CMP containers by selector + // ========================================================================= + const cmpContainerSelectors = [ + // --- Tier 1: Enterprise CMPs --- + + // OneTrust / CookiePro + '#onetrust-consent-sdk', + '#onetrust-banner-sdk', + '.onetrust-pc-dark-filter', + '#onetrust-pc-sdk', + + // Cookiebot (Usercentrics/Cybot) + '#CybotCookiebotDialog', + '#CybotCookiebotDialogBodyUnderlay', + '#dtcookie-container', + '#cookiebanner', + + // TrustArc + '#truste-consent-track', + '.truste_overlay', + '.truste_box_overlay', + '#truste-consent-content', + '#consent_blackbar', + '.trustarc-banner-container', + + // Quantcast Choice + '.qc-cmp2-container', + '#qc-cmp2-main', + '.qc-cmp-cleanslate', + + // Didomi + '#didomi-host', + '#didomi-popup', + '#didomi-notice', + + // Usercentrics + '#usercentrics-root', + '#usercentrics-cmp-ui', + + // Sourcepoint + 'div[id^="sp_message_container"]', + '.sp_message_container', + + // Google FundingChoices / FC + '.fc-consent-root', + '.fc-dialog-overlay', + '.fc-dialog-container', + + // --- Tier 2: Mid-Market CMPs --- + + // Klaro + '.klaro', + + // Osano + '.osano-cm-window', + '.osano-cm-dialog', + + // Iubenda + '#iubenda-cs-banner', + + // Complianz (WordPress) + '.cmplz-cookiebanner', + '#cmplz-cookiebanner-container', + + // CookieYes + '.cky-consent-container', + '.cky-overlay', + + // ConsentManager.net + '.cmpbox', + '#cmpbox', + '#cmpbox2', + '#cmpwrapper', + + // LiveRamp / Fides (Ethyca) + '.fides-overlay', + '#fides-banner', + 
'#fides-overlay', + '#fides-overlay-wrapper', + + // Termly + '#termly-code-snippet-support', + + // CookieFirst + '#cookiefirst-root', + '.cookiefirst-root', + + // CookieScript + '#cookiescript_injected', + '.cookiescript_fsd_main', + + // Borlabs Cookie (WordPress) + '#BorlabsCookieBox', + '._brlbs-bar-wrap', + '._brlbs-box-wrap', + + // Civic Cookie Control + '#ccc', + '#ccc-module', + '#ccc-overlay', + + // Cookie Information + '#cookie-information-template-wrapper', + '#coiOverlay', + + // Evidon / Crownpeak + '#_evidon_banner', + '#_evidon-background', + '#evidon-prefdiag-overlay', + + // Axeptio + '.axeptio_widget', + '.axeptio_mount', + + // HubSpot + '#hs-eu-cookie-confirmation', + + // Ketch + '#lanyard_root', + + // --- Tier 3: Regional / WordPress / Specialized CMPs --- + + // tarteaucitron.js + '#tarteaucitronRoot', + '#tarteaucitronAlertBig', + + // CookieHub + '.ch2-container', + '.ch2-dialog', + + // Moove GDPR (WordPress) + '#moove_gdpr_cookie_info_bar', + '#moove_gdpr_cookie_modal', + '.gdpr_cookie_settings_popup_overlay', + + // TermsFeed + '.termsfeed-com---nb', + + // Cookie Notice (WP plugin) + '#cookie-notice', + + // Cookie Law Info / WebToffee + '#cookie-law-info-bar', + '#cookie-law-bg', + '.cli-popupbar-overlay', + + // EU Cookie Compliance (Drupal) + '.eu-cookie-compliance-banner', + + // WordPress GDPR Cookie Consent + '#gdpr-cookie-consent-bar', + + // Shopify Native + '#shopify-pc__banner', + + // Wix Native + '[data-comp-type="cookie-banner-root-wix"]', + '[data-hook="ccsu-banner-wrapper"]', + + // Finsweet (Webflow) + '[fs-consent-element="banner"]', + '.fs-cc-components', + + // Pandectes (Shopify) + '#pandectes-banner', + + // Clickio + '#cl-consent', + + // Snigel + '.snigel-cmp-framework', + + // Cassie + '.cassie-cookie-module', + '.cassie-pre-banner', + + // FastCMP + '#fast-cmp-root', + + // Sibbo + 'sibbo-cmp-layout', + + // PubTech + '#pubtech-cmp', + + // UniConsent + '.unic', + + // Ezoic + '#ez-cookie-dialog-wrapper', + 
+ // Transcend + '#transcend-consent-manager', + + // Cloudflare Zaraz + '.cf_modal_container', + + // CookieConsent v2 (Insites/Osano OSS) + '#cc--main', + + // CookieConsent v3 (orestbida) + '#cc-main', + + // Openli / Legal Monster + '.legalmonster-cleanslate', + + // UK Cookie Consent (Catapult) + '#catapult-cookie-bar', + + // Sirdata + '#sd-cmp', + + // Mediavine + '[data-name="mediavine-gdpr-cmp"]', + + // Consentmo (Shopify) + '.isense-cc-window', + + // AdOpt + '#cookie-banner', + + // Truyo + '#truyo-consent-module', + + // KConsent + '#kconsent', + '.kc-overlay', + + // Gravito + '.gravitoCMP-background-overlay', + + // CT Ultimate GDPR (WordPress) + '#ct-ultimate-gdpr-cookie-popup', + + // Hu-manity + '#hu.hu-wrapper', + + // GDPR Legal Cookie (Shopify) + '.overlay_bc_banner', + + // Piwik PRO + '.PiwikPROConsentForm-container', + + // Tealium + '#__tealiumGDPRecModal', + '#__tealiumImplicitmodal', + '#consent-layer', + + // PMC (Penske Media) + '#pmc-pp-tou--notice', + + // Privado + '#cookie-consent-banner', + + // Real Cookie Banner (devowl.io) + '.rcb-banner', + + // Bing / Microsoft + '#bnp_container', + '#bnp_cookie_banner', + + // LinkedIn + '.artdeco-global-alert[type="COOKIE_CONSENT"]', + + // Privacy Manager + '#gdpr-consent-tool-wrapper', + + // --- Generic patterns (catch-all) --- + '[class*="cookie-consent" i]', + '[id*="cookie-consent" i]', + '[class*="cookie-banner" i]', + '[id*="cookie-banner" i]', + '[class*="consent-banner" i]', + '[id*="consent-banner" i]', + '[class*="consent-popup" i]', + '[id*="consent-popup" i]', + '[class*="gdpr-banner" i]', + '[id*="gdpr-banner" i]', + '[class*="cookie-notice" i]', + '[id*="cookie-notice" i]', + '[class*="cookie-law" i]', + '[id*="cookie-law" i]', + '[class*="cookie-popup" i]', + '[id*="cookie-popup" i]', + '[class*="cookie-overlay" i]', + '[id*="cookie-overlay" i]', + '.cc-banner', + '.cc-window', + ]; + + for (const selector of cmpContainerSelectors) { + try { + const elements = 
document.querySelectorAll(selector); + elements.forEach(el => { /* never delete <html>/<body>, which a catch-all selector can match via a state class */ if (el !== document.body && el !== document.documentElement) el.remove(); }); + } catch (e) { /* continue */ } + } + + // ========================================================================= + // Phase 4: Remove CMP iframes + // ========================================================================= + const cmpIframeSelectors = [ + 'iframe[id^="sp_message_iframe"]', + 'iframe#fast-cmp-iframe', + 'iframe[src*="consent" i]', + 'iframe[src*="cookie-cdn" i]', + 'iframe[src*="cookiebot" i]', + 'iframe[src*="trustarc" i]', + 'iframe[src*="consentmanager" i]', + 'iframe[src*="privacymanager" i]', + 'iframe[src*="cmp-consent-tool" i]', + 'iframe[title*="consent" i]', + 'iframe[title*="cookie" i]', + 'iframe[title*="gdpr" i]', + 'iframe[name="__tcfapiLocator"]', + ]; + + for (const selector of cmpIframeSelectors) { + try { + const iframes = document.querySelectorAll(selector); + iframes.forEach(iframe => { + // Also remove parent if it's a CMP wrapper (fixed/high-z), but never <body>/<html> (a scroll-locked body is position:fixed) + const parent = iframe.parentElement; + if (parent && parent !== document.body && parent !== document.documentElement && parent.children.length <= 2) { + const style = window.getComputedStyle(parent); + if (style.position === 'fixed' || style.position === 'absolute' || + parseInt(style.zIndex) > 999) { + parent.remove(); + return; + } + } + iframe.remove(); + }); + } catch (e) { /* continue */ } + } + + // ========================================================================= + // Phase 5: Restore body scroll and clean up CMP artifacts + // ========================================================================= + + // Reset overflow on body and html + document.body.style.overflow = ''; + document.body.style.overflowY = ''; + document.body.style.position = ''; + document.body.style.marginRight = ''; + document.body.style.paddingRight = ''; + document.documentElement.style.overflow = ''; + document.documentElement.style.overflowY = ''; + document.documentElement.style.position = ''; + + // Remove known CMP body classes + const cmpBodyClasses = [ + 'ot-overflow-hidden', 
+ 'sp_message_open', + 'didomi-popup-open', + 'cmpbox-show', + 'cmplz-blocked', + 'qc-cmp2-no-scroll', + 'osano-cm-show', + 'cky-modal-open', + 'fides-overlay-modal-open', + 'cc-no-scroll', + 'gdpr-cookie-notice-center-loaded', + 'fc-consent-root-open', + 'cookie-notification-active', + 'consent-bar-push-large', + 'with-eu-cookie-guideline', + 'cookies-request', + 'eu-cookie-compliance-popup-open', + 'has-cookie-bar', + ]; + + for (const cls of cmpBodyClasses) { + document.body.classList.remove(cls); + document.documentElement.classList.remove(cls); + } +}; diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index dc5699aa..06a1e751 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -159,6 +159,7 @@ Use these for controlling whether you read or write from a local content cache. | **`max_scroll_steps`** | `int or None` (None) | Maximum number of scroll steps during full page scan. If None, scrolls until entire page is loaded. | | **`process_iframes`** | `bool` (False) | Inlines iframe content for single-page extraction. | | **`remove_overlay_elements`** | `bool` (False) | Removes potential modals/popups blocking the main content. | +| **`remove_consent_popups`** | `bool` (False) | Removes GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, TrustArc, Quantcast, Didomi, Sourcepoint, FundingChoices, etc.). Tries clicking "Accept All" first, then falls back to DOM removal. | | **`simulate_user`** | `bool` (False) | Simulate user interactions (mouse movements) to avoid bot detection. | | **`override_navigator`** | `bool` (False) | Override `navigator` properties in JS for stealth. | | **`magic`** | `bool` (False) | Automatic handling of popups/consent banners. Experimental. 
| diff --git a/docs/md_v2/complete-sdk-reference.md b/docs/md_v2/complete-sdk-reference.md index 7e6abf5c..fa53218c 100644 --- a/docs/md_v2/complete-sdk-reference.md +++ b/docs/md_v2/complete-sdk-reference.md @@ -1787,6 +1787,7 @@ run_cfg = CrawlerRunConfig( | **`scroll_delay`** | `float` (0.2) | Delay between scroll steps if `scan_full_page=True`. | | **`process_iframes`** | `bool` (False) | Inlines iframe content for single-page extraction. | | **`remove_overlay_elements`** | `bool` (False) | Removes potential modals/popups blocking the main content. | +| **`remove_consent_popups`** | `bool` (False) | Removes GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, TrustArc, Quantcast, Didomi, Sourcepoint, FundingChoices, etc.). Tries clicking "Accept All" first, then falls back to DOM removal. | | **`simulate_user`** | `bool` (False) | Simulate user interactions (mouse movements) to avoid bot detection. | | **`override_navigator`** | `bool` (False) | Override `navigator` properties in JS for stealth. | | **`magic`** | `bool` (False) | Automatic handling of popups/consent banners. Experimental. | @@ -3344,8 +3345,9 @@ Below are the key interaction-related parameters in `CrawlerRunConfig`. For a fu - **`wait_for`**: CSS (`"css:..."`) or JS (`"js:..."`) expression to wait for. - **`session_id`**: Reuse the same page across calls. - **`cache_mode`**: Whether to read/write from the cache or bypass. -- **`remove_overlay_elements`**: Remove certain popups automatically. -- **`simulate_user`, `override_navigator`, `magic`**: Anti-bot or “human-like” interactions. +- **`remove_overlay_elements`**: Remove certain popups automatically. +- **`remove_consent_popups`**: Remove GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, Didomi, etc.). +- **`simulate_user`, `override_navigator`, `magic`**: Anti-bot or "human-like" interactions. ## 8. Conclusion 1. **Execute JavaScript** for scrolling, clicks, or form filling. 2. 
**Wait** for CSS or custom JS conditions before capturing data. diff --git a/docs/md_v2/core/content-selection.md b/docs/md_v2/core/content-selection.md index 85998433..4a7dc2a3 100644 --- a/docs/md_v2/core/content-selection.md +++ b/docs/md_v2/core/content-selection.md @@ -153,8 +153,10 @@ Some sites embed content in `