docs: Enhance session management example for dynamic content crawling with improved JavaScript handling and extraction schema. ref #226

Author: ntohidi
Date: 2025-07-15 10:19:29 +02:00
Parent: 58024755c5
Commit: 2640dc73a5


```python
import json

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.cache_context import CacheMode

async def crawl_dynamic_content():
    url = "https://github.com/microsoft/TypeScript/commits/main"
    session_id = "wait_for_session"
    all_commits = []

    # Record the first visible commit title, then click the "next page" button.
    js_next_page = """
    const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
    if (commits.length > 0) {
        window.lastCommit = commits[0].textContent.trim();
    }
    const button = document.querySelector('a[data-testid="pagination-next-button"]');
    if (button) { button.click(); console.log('button clicked'); }
    """

    # Wait until the first commit title differs from the recorded one,
    # i.e. the next page of commits has actually rendered.
    wait_for = """() => {
        const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
        if (commits.length === 0) return false;
        const firstCommit = commits[0].textContent.trim();
        return firstCommit !== window.lastCommit;
    }"""

    # Extraction schema: one object per commit row, with a stripped title field.
    schema = {
        "name": "Commit Extractor",
        "baseSelector": "li[data-testid='commit-row-item']",
        "fields": [
            {
                "name": "title",
                "selector": "h4 a",
                "type": "text",
                "transform": "strip",
            },
        ],
    }
    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

    browser_config = BrowserConfig(
        verbose=True,
        headless=False,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Crawl multiple pages within the same session
        for page in range(3):
            crawler_config = CrawlerRunConfig(
                session_id=session_id,
                css_selector="li[data-testid='commit-row-item']",
                extraction_strategy=extraction_strategy,
                # Only run the pagination JS and wait condition after page 1.
                js_code=js_next_page if page > 0 else None,
                wait_for=wait_for if page > 0 else None,
                js_only=page > 0,  # reuse the live page instead of re-navigating
                cache_mode=CacheMode.BYPASS,
                capture_console_messages=True,
            )

            result = await crawler.arun(url=url, config=crawler_config)

            if result.console_messages:
                print(f"Page {page + 1} console messages:", result.console_messages)

            if result.extracted_content:
                commits = json.loads(result.extracted_content)
                all_commits.extend(commits)
                print(f"Page {page + 1}: Found {len(commits)} commits")
            else:
                print(f"Page {page + 1}: No content extracted")

        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")

        # Clean up session
        await crawler.crawler_strategy.kill_session(session_id)
        return all_commits
```
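The combination of a fixed `session_id` with `js_only=True` is what keeps all three iterations on the same live browser page: only the first `arun` call navigates, and later calls just click "next" and wait. For quick local testing, a minimal runner such as the sketch below can drive the example; the `__main__` guard, the `asyncio.run` wrapper, and the title printout are illustrative additions, not part of the documented example.

```python
import asyncio

if __name__ == "__main__":
    # Run the crawl and print the extracted commit titles.
    commits = asyncio.run(crawl_dynamic_content())
    for commit in commits:
        print(commit.get("title"))
```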
---