docs: Enhance session management example for dynamic content crawling with improved JavaScript handling and extraction schema. ref #226
This commit is contained in:
@@ -49,46 +49,75 @@ from crawl4ai import JsonCssExtractionStrategy
|
|||||||
from crawl4ai.cache_context import CacheMode
|
from crawl4ai.cache_context import CacheMode
|
||||||
|
|
||||||
async def crawl_dynamic_content():
|
async def crawl_dynamic_content():
|
||||||
async with AsyncWebCrawler() as crawler:
|
url = "https://github.com/microsoft/TypeScript/commits/main"
|
||||||
session_id = "github_commits_session"
|
session_id = "wait_for_session"
|
||||||
url = "https://github.com/microsoft/TypeScript/commits/main"
|
all_commits = []
|
||||||
all_commits = []
|
|
||||||
|
|
||||||
# Define extraction schema
|
js_next_page = """
|
||||||
schema = {
|
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
|
||||||
"name": "Commit Extractor",
|
if (commits.length > 0) {
|
||||||
"baseSelector": "li.Box-sc-g0xbh4-0",
|
window.lastCommit = commits[0].textContent.trim();
|
||||||
"fields": [{
|
}
|
||||||
"name": "title", "selector": "h4.markdown-title", "type": "text"
|
const button = document.querySelector('a[data-testid="pagination-next-button"]');
|
||||||
}],
|
if (button) {button.click(); console.log('button clicked') }
|
||||||
}
|
"""
|
||||||
extraction_strategy = JsonCssExtractionStrategy(schema)
|
|
||||||
|
|
||||||
# JavaScript and wait configurations
|
wait_for = """() => {
|
||||||
js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();"""
|
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
|
||||||
wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0"""
|
if (commits.length === 0) return false;
|
||||||
|
const firstCommit = commits[0].textContent.trim();
|
||||||
|
return firstCommit !== window.lastCommit;
|
||||||
|
}"""
|
||||||
|
|
||||||
# Crawl multiple pages
|
schema = {
|
||||||
|
"name": "Commit Extractor",
|
||||||
|
"baseSelector": "li[data-testid='commit-row-item']",
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"name": "title",
|
||||||
|
"selector": "h4 a",
|
||||||
|
"type": "text",
|
||||||
|
"transform": "strip",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
|
||||||
|
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
verbose=True,
|
||||||
|
headless=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
for page in range(3):
|
for page in range(3):
|
||||||
config = CrawlerRunConfig(
|
crawler_config = CrawlerRunConfig(
|
||||||
url=url,
|
|
||||||
session_id=session_id,
|
session_id=session_id,
|
||||||
|
css_selector="li[data-testid='commit-row-item']",
|
||||||
extraction_strategy=extraction_strategy,
|
extraction_strategy=extraction_strategy,
|
||||||
js_code=js_next_page if page > 0 else None,
|
js_code=js_next_page if page > 0 else None,
|
||||||
wait_for=wait_for if page > 0 else None,
|
wait_for=wait_for if page > 0 else None,
|
||||||
js_only=page > 0,
|
js_only=page > 0,
|
||||||
cache_mode=CacheMode.BYPASS
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
capture_console_messages=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await crawler.arun(config=config)
|
result = await crawler.arun(url=url, config=crawler_config)
|
||||||
if result.success:
|
|
||||||
|
if result.console_messages:
|
||||||
|
print(f"Page {page + 1} console messages:", result.console_messages)
|
||||||
|
|
||||||
|
if result.extracted_content:
|
||||||
|
# print(f"Page {page + 1} result:", result.extracted_content)
|
||||||
commits = json.loads(result.extracted_content)
|
commits = json.loads(result.extracted_content)
|
||||||
all_commits.extend(commits)
|
all_commits.extend(commits)
|
||||||
print(f"Page {page + 1}: Found {len(commits)} commits")
|
print(f"Page {page + 1}: Found {len(commits)} commits")
|
||||||
|
else:
|
||||||
|
print(f"Page {page + 1}: No content extracted")
|
||||||
|
|
||||||
|
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
|
||||||
# Clean up session
|
# Clean up session
|
||||||
await crawler.crawler_strategy.kill_session(session_id)
|
await crawler.crawler_strategy.kill_session(session_id)
|
||||||
return all_commits
|
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|||||||
Reference in New Issue
Block a user