docs: Enhance session management example for dynamic content crawling with improved JavaScript handling and extraction schema. ref #226

2025-07-15 10:19:29 +02:00
parent 58024755c5
commit 2640dc73a5
1 changed files with 54 additions and 25 deletions
--- a/docs/md_v2/advanced/session-management.md
+++ b/docs/md_v2/advanced/session-management.md
@@ -49,46 +49,75 @@ from crawl4ai import JsonCssExtractionStrategy
 from crawl4ai.cache_context import CacheMode
 async def crawl_dynamic_content():
-    async with AsyncWebCrawler() as crawler:
+    url = "https://github.com/microsoft/TypeScript/commits/main"
-        session_id = "github_commits_session"
+    session_id = "wait_for_session"
-        url = "https://github.com/microsoft/TypeScript/commits/main"
+    all_commits = []
        all_commits = []
-        # Define extraction schema
+    js_next_page = """
-        schema = {
+    const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
-            "name": "Commit Extractor",
+    if (commits.length > 0) {
-            "baseSelector": "li.Box-sc-g0xbh4-0",
+        window.lastCommit = commits[0].textContent.trim();
-            "fields": [{
+    }
-                "name": "title", "selector": "h4.markdown-title", "type": "text"
+    const button = document.querySelector('a[data-testid="pagination-next-button"]');
-            }],
+    if (button) {button.click(); console.log('button clicked') }
-        }
+    """
        extraction_strategy = JsonCssExtractionStrategy(schema)
-        # JavaScript and wait configurations
+    wait_for = """() => {
-        js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();"""
+        const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
-        wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0"""
+        if (commits.length === 0) return false;
        const firstCommit = commits[0].textContent.trim();
        return firstCommit !== window.lastCommit;
    }"""
-        # Crawl multiple pages
+    schema = {
        "name": "Commit Extractor",
        "baseSelector": "li[data-testid='commit-row-item']",
        "fields": [
            {
                "name": "title",
                "selector": "h4 a",
                "type": "text",
                "transform": "strip",
            },
        ],
    }
    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
    browser_config = BrowserConfig(
        verbose=True,
        headless=False,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        for page in range(3):
-            config = CrawlerRunConfig(
+            crawler_config = CrawlerRunConfig(
                url=url,
                session_id=session_id,
                css_selector="li[data-testid='commit-row-item']",
                extraction_strategy=extraction_strategy,
                js_code=js_next_page if page > 0 else None,
                wait_for=wait_for if page > 0 else None,
                js_only=page > 0,
-                cache_mode=CacheMode.BYPASS
+                cache_mode=CacheMode.BYPASS,
                capture_console_messages=True,
            )
-            result = await crawler.arun(config=config)
+            result = await crawler.arun(url=url, config=crawler_config)
-            if result.success:
+            
            if result.console_messages:
                print(f"Page {page + 1} console messages:", result.console_messages)
            if result.extracted_content:
                # print(f"Page {page + 1} result:", result.extracted_content)
                commits = json.loads(result.extracted_content)
                all_commits.extend(commits)
                print(f"Page {page + 1}: Found {len(commits)} commits")
            else:
                print(f"Page {page + 1}: No content extracted")
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
        # Clean up session
        await crawler.crawler_strategy.kill_session(session_id)
        return all_commits
 ```
 ---