Add session_id feature to allow reusing browser pages across multiple crawls. Add support for view-source: protocol in URL handling. Fix browser config reference and string formatting issues. Update examples to demonstrate new session management features. BREAKING CHANGE: Browser page handling now persists when using session_id
30 lines
757 B
Python
import asyncio
|
|
from crawl4ai import (
|
|
AsyncWebCrawler,
|
|
BrowserConfig,
|
|
CrawlerRunConfig,
|
|
DefaultMarkdownGenerator,
|
|
PruningContentFilter,
|
|
CrawlResult
|
|
)
|
|
|
|
|
|
async def main():
    """Crawl a single page and print the first 500 chars of its raw markdown.

    Launches a visible (non-headless) browser with verbose logging, runs one
    crawl of https://www.helloworld.org, and prints a preview of the markdown
    produced by DefaultMarkdownGenerator with a PruningContentFilter.
    """
    browser_config = BrowserConfig(
        headless=False,  # show the browser window for demo purposes
        verbose=True,
    )
    # The crawler owns the browser lifecycle; the context manager guarantees
    # the browser is shut down even if the crawl raises.
    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler_config = CrawlerRunConfig(
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter()
            ),
        )
        result: CrawlResult = await crawler.arun(
            url="https://www.helloworld.org", config=crawler_config
        )
        # result.markdown is a MarkdownGenerationResult; raw_markdown is the
        # unfiltered markdown string — print only a short preview.
        print(result.markdown.raw_markdown[:500])
|
# Script entry point: run the async example to completion.
if __name__ == "__main__":
    asyncio.run(main())