diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 00000000..2d51a74b --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,35 @@ +name: Discord GitHub Notifications + +on: + issues: + types: [opened] + issue_comment: + types: [created] + pull_request: + types: [opened] + discussion: + types: [created] + +jobs: + notify-discord: + runs-on: ubuntu-latest + steps: + - name: Set webhook based on event type + id: set-webhook + run: | + if [ "${{ github.event_name }}" == "discussion" ]; then + echo "webhook=${{ secrets.DISCORD_DISCUSSIONS_WEBHOOK }}" >> $GITHUB_OUTPUT + else + echo "webhook=${{ secrets.DISCORD_WEBHOOK }}" >> $GITHUB_OUTPUT + fi + + - name: Discord Notification + uses: Ilshidur/action-discord@master + env: + DISCORD_WEBHOOK: ${{ steps.set-webhook.outputs.webhook }} + with: + args: | + ${{ github.event_name == 'issues' && format('📣 New issue created: **{0}** by {1} - {2}', github.event.issue.title, github.event.issue.user.login, github.event.issue.html_url) || + github.event_name == 'issue_comment' && format('💬 New comment on issue **{0}** by {1} - {2}', github.event.issue.title, github.event.comment.user.login, github.event.comment.html_url) || + github.event_name == 'pull_request' && format('🔄 New PR opened: **{0}** by {1} - {2}', github.event.pull_request.title, github.event.pull_request.user.login, github.event.pull_request.html_url) || + format('💬 New discussion started: **{0}** by {1} - {2}', github.event.discussion.title, github.event.discussion.user.login, github.event.discussion.html_url) }} diff --git a/deploy/docker/config.yml b/deploy/docker/config.yml index 6ad7bb7a..b7ef4885 100644 --- a/deploy/docker/config.yml +++ b/deploy/docker/config.yml @@ -68,4 +68,4 @@ observability: enabled: True endpoint: "/metrics" health_check: - endpoint: "/health" \ No newline at end of file + endpoint: "/health" diff --git a/docs/examples/quickstart_examples_set_1.py b/docs/examples/quickstart_examples_set_1.py new file mode 100644 index 00000000..76224746 --- /dev/null +++ b/docs/examples/quickstart_examples_set_1.py @@ -0,0 +1,412 @@ +import asyncio +import os +import json +import base64 +from pathlib import Path +from typing import List +from crawl4ai.proxy_strategy import ProxyConfig + +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult +from crawl4ai import RoundRobinProxyStrategy +from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy +from crawl4ai import LLMConfig +from crawl4ai import PruningContentFilter, BM25ContentFilter +from crawl4ai import DefaultMarkdownGenerator +from crawl4ai import BFSDeepCrawlStrategy, DomainFilter, FilterChain +from crawl4ai import BrowserConfig + +__cur_dir__ = Path(__file__).parent + +async def demo_basic_crawl(): + """Basic web crawling with markdown generation""" + print("\n=== 1. Basic Web Crawling ===") + async with AsyncWebCrawler(config = BrowserConfig( + viewport_height=800, + viewport_width=1200, + headless=True, + verbose=True, + )) as crawler: + results: List[CrawlResult] = await crawler.arun( + url="https://news.ycombinator.com/" + ) + + for i, result in enumerate(results): + print(f"Result {i + 1}:") + print(f"Success: {result.success}") + if result.success: + print(f"Markdown length: {len(result.markdown.raw_markdown)} chars") + print(f"First 100 chars: {result.markdown.raw_markdown[:100]}...") + else: + print("Failed to crawl the URL") + +async def demo_parallel_crawl(): + """Crawl multiple URLs in parallel""" + print("\n=== 2. 
Parallel Crawling ===") + + urls = [ + "https://news.ycombinator.com/", + "https://example.com/", + "https://httpbin.org/html", + ] + + async with AsyncWebCrawler() as crawler: + results: List[CrawlResult] = await crawler.arun_many( + urls=urls, + ) + + print(f"Crawled {len(results)} URLs in parallel:") + for i, result in enumerate(results): + print( + f" {i + 1}. {result.url} - {'Success' if result.success else 'Failed'}" + ) + +async def demo_fit_markdown(): + """Generate focused markdown with LLM content filter""" + print("\n=== 3. Fit Markdown with LLM Content Filter ===") + + async with AsyncWebCrawler() as crawler: + result: CrawlResult = await crawler.arun( + url = "https://en.wikipedia.org/wiki/Python_(programming_language)", + config=CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter() + ) + ), + ) + + # Print stats and save the fit markdown + print(f"Raw: {len(result.markdown.raw_markdown)} chars") + print(f"Fit: {len(result.markdown.fit_markdown)} chars") + +async def demo_llm_structured_extraction_no_schema(): + # Create a simple LLM extraction strategy (no schema required) + extraction_strategy = LLMExtractionStrategy( + llm_config=LLMConfig( + provider="groq/qwen-2.5-32b", + api_token="env:GROQ_API_KEY", + ), + instruction="This is news.ycombinator.com, extract all news, and for each, I want title, source url, number of comments.", + extract_type="schema", + schema="{title: string, url: string, comments: int}", + extra_args={ + "temperature": 0.0, + "max_tokens": 4096, + }, + verbose=True, + ) + + config = CrawlerRunConfig(extraction_strategy=extraction_strategy) + + async with AsyncWebCrawler() as crawler: + results: List[CrawlResult] = await crawler.arun( + "https://news.ycombinator.com/", config=config + ) + + for result in results: + print(f"URL: {result.url}") + print(f"Success: {result.success}") + if result.success: + data = json.loads(result.extracted_content) + print(json.dumps(data, indent=2)) + else: + print("Failed to extract structured data") + +async def demo_css_structured_extraction_no_schema(): + """Extract structured data using CSS selectors""" + print("\n=== 5. CSS-Based Structured Extraction ===") + # Sample HTML for schema generation (one-time cost) + sample_html = """ +
+    <div class="body-post clear">
+        <a class="story-link" href="...">
+            <div class="clear home-post-box cf">
+                <div class="home-img clear">
+                    <div class="img-ratio">
+                        <img alt="..." src="...">
+                    </div>
+                </div>
+                <div class="clear home-right">
+                    <h2 class="home-title">Malicious Python Packages on PyPI Downloaded 39,000+ Times, Steal Sensitive Data</h2>
+                    <div class="item-label">
+                        <span class="h-datetime"><i class="icon-font icon-calendar"></i>Apr 05, 2025</span>
+                        <span class="h-tags">Malware / Supply Chain Attack</span>
+                    </div>
+                    <div class="home-desc">Cybersecurity researchers have...</div>
+                </div>
+            </div>
+        </a>
+    </div>
+    """
+
+    # Check if a schema file already exists
+    schema_file_path = f"{__cur_dir__}/tmp/schema.json"
+    if os.path.exists(schema_file_path):
+        with open(schema_file_path, "r") as f:
+            schema = json.load(f)
+    else:
+        # Generate schema using LLM (one-time setup)
+        schema = JsonCssExtractionStrategy.generate_schema(
+            html=sample_html,
+            llm_config=LLMConfig(
+                provider="groq/qwen-2.5-32b",
+                api_token="env:GROQ_API_KEY",
+            ),
+            query="From https://thehackernews.com/, I have shared a sample of one news div with a title, date, and description. Please generate a schema for this news div.",
+        )
+
+        print(f"Generated schema: {json.dumps(schema, indent=2)}")
+        # Save the schema to a file and reuse it for future extractions, so the LLM is only called once
+        with open(f"{__cur_dir__}/tmp/schema.json", "w") as f:
+            json.dump(schema, f, indent=2)
+
+    # Create no-LLM extraction strategy with the generated schema
+    extraction_strategy = JsonCssExtractionStrategy(schema)
+    config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
+
+    # Use the fast CSS extraction (no LLM calls during extraction)
+    async with AsyncWebCrawler() as crawler:
+        results: List[CrawlResult] = await crawler.arun(
+            "https://thehackernews.com", config=config
+        )
+
+        for result in results:
+            print(f"URL: {result.url}")
+            print(f"Success: {result.success}")
+            if result.success:
+                data = json.loads(result.extracted_content)
+                print(json.dumps(data, indent=2))
+            else:
+                print("Failed to extract structured data")
+
+async def demo_deep_crawl():
+    """Deep crawling with BFS strategy"""
+    print("\n=== 6. Deep Crawling ===")
+
+    filter_chain = FilterChain([DomainFilter(allowed_domains=["crawl4ai.com"])])
+
+    deep_crawl_strategy = BFSDeepCrawlStrategy(
+        max_depth=1, max_pages=5, filter_chain=filter_chain
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        results: List[CrawlResult] = await crawler.arun(
+            url="https://docs.crawl4ai.com",
+            config=CrawlerRunConfig(deep_crawl_strategy=deep_crawl_strategy),
+        )
+
+        print(f"Deep crawl returned {len(results)} pages:")
+        for i, result in enumerate(results):
+            depth = result.metadata.get("depth", "unknown")
+            print(f"  {i + 1}. {result.url} (Depth: {depth})")
+
+async def demo_js_interaction():
+    """Execute JavaScript to load more content"""
+    print("\n=== 7. JavaScript Interaction ===")
JavaScript Interaction ===") + + # A simple page that needs JS to reveal content + async with AsyncWebCrawler(config=BrowserConfig(headless=False)) as crawler: + # Initial load + + news_schema = { + "name": "news", + "baseSelector": "tr.athing", + "fields": [ + { + "name": "title", + "selector": "span.titleline", + "type": "text", + } + ], + } + results: List[CrawlResult] = await crawler.arun( + url="https://news.ycombinator.com", + config=CrawlerRunConfig( + session_id="hn_session", # Keep session + extraction_strategy=JsonCssExtractionStrategy(schema=news_schema), + ), + ) + + news = [] + for result in results: + if result.success: + data = json.loads(result.extracted_content) + news.extend(data) + print(json.dumps(data, indent=2)) + else: + print("Failed to extract structured data") + + print(f"Initial items: {len(news)}") + + # Click "More" link + more_config = CrawlerRunConfig( + js_code="document.querySelector('a.morelink').click();", + js_only=True, # Continue in same page + session_id="hn_session", # Keep session + extraction_strategy=JsonCssExtractionStrategy( + schema=news_schema, + ), + ) + + result: List[CrawlResult] = await crawler.arun( + url="https://news.ycombinator.com", config=more_config + ) + + # Extract new items + for result in results: + if result.success: + data = json.loads(result.extracted_content) + news.extend(data) + print(json.dumps(data, indent=2)) + else: + print("Failed to extract structured data") + print(f"Total items: {len(news)}") + +async def demo_media_and_links(): + """Extract media and links from a page""" + print("\n=== 8. Media and Links Extraction ===") + + async with AsyncWebCrawler() as crawler: + result: List[CrawlResult] = await crawler.arun("https://en.wikipedia.org/wiki/Main_Page") + + for i, result in enumerate(result): + # Extract and save all images + images = result.media.get("images", []) + print(f"Found {len(images)} images") + + # Extract and save all links (internal and external) + internal_links = result.links.get("internal", []) + external_links = result.links.get("external", []) + print(f"Found {len(internal_links)} internal links") + print(f"Found {len(external_links)} external links") + + # Print some of the images and links + for image in images[:3]: + print(f"Image: {image['src']}") + for link in internal_links[:3]: + print(f"Internal link: {link['href']}") + for link in external_links[:3]: + print(f"External link: {link['href']}") + + # # Save everything to files + with open(f"{__cur_dir__}/tmp/images.json", "w") as f: + json.dump(images, f, indent=2) + + with open(f"{__cur_dir__}/tmp/links.json", "w") as f: + json.dump( + {"internal": internal_links, "external": external_links}, + f, + indent=2, + ) + +async def demo_screenshot_and_pdf(): + """Capture screenshot and PDF of a page""" + print("\n=== 9. 
Screenshot and PDF Capture ===") + + async with AsyncWebCrawler() as crawler: + result: List[CrawlResult] = await crawler.arun( + # url="https://example.com", + url="https://en.wikipedia.org/wiki/Giant_anteater", + config=CrawlerRunConfig(screenshot=True, pdf=True), + ) + + for i, result in enumerate(result): + # if result.screenshot_data: + if result.screenshot: + # Save screenshot + screenshot_path = f"{__cur_dir__}/tmp/example_screenshot.png" + with open(screenshot_path, "wb") as f: + f.write(base64.b64decode(result.screenshot)) + print(f"Screenshot saved to {screenshot_path}") + + # if result.pdf_data: + if result.pdf: + # Save PDF + pdf_path = f"{__cur_dir__}/tmp/example.pdf" + with open(pdf_path, "wb") as f: + f.write(result.pdf) + print(f"PDF saved to {pdf_path}") + +async def demo_proxy_rotation(): + """Proxy rotation for multiple requests""" + print("\n=== 10. Proxy Rotation ===") + + # Example proxies (replace with real ones) + proxies = [ + ProxyConfig(server="http://proxy1.example.com:8080"), + ProxyConfig(server="http://proxy2.example.com:8080"), + ] + + proxy_strategy = RoundRobinProxyStrategy(proxies) + + print(f"Using {len(proxies)} proxies in rotation") + print( + "Note: This example uses placeholder proxies - replace with real ones to test" + ) + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + proxy_rotation_strategy=proxy_strategy + ) + + # In a real scenario, these would be run and the proxies would rotate + print("In a real scenario, requests would rotate through the available proxies") + +async def demo_raw_html_and_file(): + """Process raw HTML and local files""" + print("\n=== 11. Raw HTML and Local Files ===") + + raw_html = """ + +

+    <html>
+        <body>
+            <h1>Sample Article</h1>
+            <p>This is sample content for testing Crawl4AI's raw HTML processing.</p>
+        </body>
+    </html>
+ + """ + + # Save to file + file_path = Path("docs/examples/tmp/sample.html").absolute() + with open(file_path, "w") as f: + f.write(raw_html) + + async with AsyncWebCrawler() as crawler: + # Crawl raw HTML + raw_result = await crawler.arun( + url="raw:" + raw_html, config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + print("Raw HTML processing:") + print(f" Markdown: {raw_result.markdown.raw_markdown[:50]}...") + + # Crawl local file + file_result = await crawler.arun( + url=f"file://{file_path}", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), + ) + print("\nLocal file processing:") + print(f" Markdown: {file_result.markdown.raw_markdown[:50]}...") + + # Clean up + os.remove(file_path) + print(f"Processed both raw HTML and local file ({file_path})") + +async def main(): + """Run all demo functions sequentially""" + print("=== Comprehensive Crawl4AI Demo ===") + print("Note: Some examples require API keys or other configurations") + + # Run all demos + await demo_basic_crawl() + await demo_parallel_crawl() + await demo_fit_markdown() + await demo_llm_structured_extraction_no_schema() + await demo_css_structured_extraction_no_schema() + await demo_deep_crawl() + await demo_js_interaction() + await demo_media_and_links() + await demo_screenshot_and_pdf() + # # await demo_proxy_rotation() + await demo_raw_html_and_file() + + # Clean up any temp files that may have been created + print("\n=== Demo Complete ===") + print("Check for any generated files (screenshots, PDFs) in the current directory") + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/examples/quickstart_examples_set_2.py b/docs/examples/quickstart_examples_set_2.py new file mode 100644 index 00000000..3adbfc0d --- /dev/null +++ b/docs/examples/quickstart_examples_set_2.py @@ -0,0 +1,562 @@ +import os, sys + +from crawl4ai.types import LLMConfig + +sys.path.append( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +) + +import asyncio +import time +import json +import re +from typing import Dict +from bs4 import BeautifulSoup +from pydantic import BaseModel, Field +from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai.extraction_strategy import ( + JsonCssExtractionStrategy, + LLMExtractionStrategy, +) + +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +print("Crawl4AI: Advanced Web Crawling and Data Extraction") +print("GitHub Repository: https://github.com/unclecode/crawl4ai") +print("Twitter: @unclecode") +print("Website: https://crawl4ai.com") + + +# Basic Example - Simple Crawl +async def simple_crawl(): + print("\n--- Basic Usage ---") + browser_config = BrowserConfig(headless=True) + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + print(result.markdown[:500]) + + +async def clean_content(): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + excluded_tags=["nav", "footer", "aside"], + remove_overlay_elements=True, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter( + threshold=0.48, threshold_type="fixed", min_word_threshold=0 + ), + options={"ignore_links": True}, + ), + ) + async with AsyncWebCrawler() as 
crawler: + result = await crawler.arun( + url="https://en.wikipedia.org/wiki/Apple", + config=crawler_config, + ) + full_markdown_length = len(result.markdown.raw_markdown) + fit_markdown_length = len(result.markdown.fit_markdown) + print(f"Full Markdown Length: {full_markdown_length}") + print(f"Fit Markdown Length: {fit_markdown_length}") + + +async def link_analysis(): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + exclude_external_links=True, + exclude_social_media_links=True, + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=crawler_config, + ) + print(f"Found {len(result.links['internal'])} internal links") + print(f"Found {len(result.links['external'])} external links") + + for link in result.links["internal"][:5]: + print(f"Href: {link['href']}\nText: {link['text']}\n") + + +# JavaScript Execution Example +async def simple_example_with_running_js_code(): + print("\n--- Executing JavaScript and Using CSS Selectors ---") + + browser_config = BrowserConfig(headless=True, java_script_enabled=True) + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + js_code="const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();", + # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }" + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + print(result.markdown[:500]) + + +# CSS Selector Example +async def simple_example_with_css_selector(): + print("\n--- Using CSS Selectors ---") + browser_config = BrowserConfig(headless=True) + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, css_selector=".wide-tease-item__description" + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + print(result.markdown[:500]) + + +async def media_handling(): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + for img in result.media["images"][:5]: + print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}") + + +async def custom_hook_workflow(verbose=True): + async with AsyncWebCrawler() as crawler: + # Set a 'before_goto' hook to run custom code just before navigation + crawler.crawler_strategy.set_hook( + "before_goto", + lambda page, context: print("[Hook] Preparing to navigate..."), + ) + + # Perform the crawl operation + result = await crawler.arun(url="https://crawl4ai.com") + print(result.markdown.raw_markdown[:500].replace("\n", " -- ")) + + +# Proxy Example +async def use_proxy(): + print("\n--- Using a Proxy ---") + browser_config = BrowserConfig( + headless=True, + proxy_config={ + "server": "http://proxy.example.com:8080", + "username": "username", + "password": "password", + }, + ) + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + if result.success: + print(result.markdown[:500]) + + 
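+
+# Optional sketch: a way to sanity-check that the proxy configuration shown in
+# use_proxy() is actually routing traffic. It only reuses APIs already imported in
+# this file; check_proxy_ip is a hypothetical helper (not called from main()), and
+# the proxy server, username, and password are placeholders exactly as above.
+# https://httpbin.org/ip echoes the caller's IP, so with a working proxy it should
+# print the proxy's address rather than the local one.
+async def check_proxy_ip():
+    browser_config = BrowserConfig(
+        headless=True,
+        proxy_config={
+            "server": "http://proxy.example.com:8080",
+            "username": "username",
+            "password": "password",
+        },
+    )
+    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(url="https://httpbin.org/ip", config=crawler_config)
+        if result.success:
+            # The page body contains the origin IP as seen by the target server
+            print(result.markdown)
+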
+# Screenshot Example +async def capture_and_save_screenshot(url: str, output_path: str): + browser_config = BrowserConfig(headless=True) + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url=url, config=crawler_config) + + if result.success and result.screenshot: + import base64 + + screenshot_data = base64.b64decode(result.screenshot) + with open(output_path, "wb") as f: + f.write(screenshot_data) + print(f"Screenshot saved successfully to {output_path}") + else: + print("Failed to capture screenshot") + + +# LLM Extraction Example +class OpenAIModelFee(BaseModel): + model_name: str = Field(..., description="Name of the OpenAI model.") + input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") + output_fee: str = Field( + ..., description="Fee for output token for the OpenAI model." + ) + + +async def extract_structured_data_using_llm( + provider: str, api_token: str = None, extra_headers: Dict[str, str] = None +): + print(f"\n--- Extracting Structured Data with {provider} ---") + + if api_token is None and provider != "ollama": + print(f"API token is required for {provider}. Skipping this example.") + return + + browser_config = BrowserConfig(headless=True) + + extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000} + if extra_headers: + extra_args["extra_headers"] = extra_headers + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + word_count_threshold=1, + page_timeout=80000, + extraction_strategy=LLMExtractionStrategy( + llm_config=LLMConfig(provider=provider,api_token=api_token), + schema=OpenAIModelFee.model_json_schema(), + extraction_type="schema", + instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
+ Do not miss any models in the entire content.""", + extra_args=extra_args, + ), + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://openai.com/api/pricing/", config=crawler_config + ) + print(result.extracted_content) + + +# CSS Extraction Example +async def extract_structured_data_using_css_extractor(): + print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") + schema = { + "name": "KidoCode Courses", + "baseSelector": "section.charge-methodology .framework-collection-item.w-dyn-item", + "fields": [ + { + "name": "section_title", + "selector": "h3.heading-50", + "type": "text", + }, + { + "name": "section_description", + "selector": ".charge-content", + "type": "text", + }, + { + "name": "course_name", + "selector": ".text-block-93", + "type": "text", + }, + { + "name": "course_description", + "selector": ".course-content-text", + "type": "text", + }, + { + "name": "course_icon", + "selector": ".image-92", + "type": "attribute", + "attribute": "src", + }, + ], + } + + browser_config = BrowserConfig(headless=True, java_script_enabled=True) + + js_click_tabs = """ + (async () => { + const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div"); + for(let tab of tabs) { + tab.scrollIntoView(); + tab.click(); + await new Promise(r => setTimeout(r, 500)); + } + })(); + """ + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=JsonCssExtractionStrategy(schema), + js_code=[js_click_tabs], + delay_before_return_html=1 + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.kidocode.com/degrees/technology", config=crawler_config + ) + + companies = json.loads(result.extracted_content) + print(f"Successfully extracted {len(companies)} companies") + print(json.dumps(companies[0], indent=2)) + + +# Dynamic Content Examples - Method 1 +async def crawl_dynamic_content_pages_method_1(): + print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") + first_commit = "" + + async def on_execution_started(page, **kwargs): + nonlocal first_commit + try: + while True: + await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4") + commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4") + commit = await commit.evaluate("(element) => element.textContent") + commit = re.sub(r"\s+", "", commit) + if commit and commit != first_commit: + first_commit = commit + break + await asyncio.sleep(0.5) + except Exception as e: + print(f"Warning: New content didn't appear after JavaScript execution: {e}") + + browser_config = BrowserConfig(headless=False, java_script_enabled=True) + + async with AsyncWebCrawler(config=browser_config) as crawler: + crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) + + url = "https://github.com/microsoft/TypeScript/commits/main" + session_id = "typescript_commits_session" + all_commits = [] + + js_next_page = """ + const button = document.querySelector('a[data-testid="pagination-next-button"]'); + if (button) button.click(); + """ + + for page in range(3): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + css_selector="li.Box-sc-g0xbh4-0", + js_code=js_next_page if page > 0 else None, + js_only=page > 0, + session_id=session_id, + ) + + result = await crawler.arun(url=url, config=crawler_config) + assert result.success, f"Failed to crawl page {page + 1}" + + soup = BeautifulSoup(result.cleaned_html, "html.parser") + 
commits = soup.select("li") + all_commits.extend(commits) + + print(f"Page {page + 1}: Found {len(commits)} commits") + + print(f"Successfully crawled {len(all_commits)} commits across 3 pages") + + +# Dynamic Content Examples - Method 2 +async def crawl_dynamic_content_pages_method_2(): + print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") + + browser_config = BrowserConfig(headless=False, java_script_enabled=True) + + js_next_page_and_wait = """ + (async () => { + const getCurrentCommit = () => { + const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); + return commits.length > 0 ? commits[0].textContent.trim() : null; + }; + + const initialCommit = getCurrentCommit(); + const button = document.querySelector('a[data-testid="pagination-next-button"]'); + if (button) button.click(); + + while (true) { + await new Promise(resolve => setTimeout(resolve, 100)); + const newCommit = getCurrentCommit(); + if (newCommit && newCommit !== initialCommit) { + break; + } + } + })(); + """ + + schema = { + "name": "Commit Extractor", + "baseSelector": "li.Box-sc-g0xbh4-0", + "fields": [ + { + "name": "title", + "selector": "h4.markdown-title", + "type": "text", + "transform": "strip", + }, + ], + } + + async with AsyncWebCrawler(config=browser_config) as crawler: + url = "https://github.com/microsoft/TypeScript/commits/main" + session_id = "typescript_commits_session" + all_commits = [] + + extraction_strategy = JsonCssExtractionStrategy(schema) + + for page in range(3): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + css_selector="li.Box-sc-g0xbh4-0", + extraction_strategy=extraction_strategy, + js_code=js_next_page_and_wait if page > 0 else None, + js_only=page > 0, + session_id=session_id, + ) + + result = await crawler.arun(url=url, config=crawler_config) + assert result.success, f"Failed to crawl page {page + 1}" + + commits = json.loads(result.extracted_content) + all_commits.extend(commits) + print(f"Page {page + 1}: Found {len(commits)} commits") + + print(f"Successfully crawled {len(all_commits)} commits across 3 pages") + + +async def cosine_similarity_extraction(): + from crawl4ai.extraction_strategy import CosineStrategy + crawl_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=CosineStrategy( + word_count_threshold=10, + max_dist=0.2, # Maximum distance between two words + linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single) + top_k=3, # Number of top keywords to extract + sim_threshold=0.3, # Similarity threshold for clustering + semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content semantically using embeddings + verbose=True, + ), + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156", + config=crawl_config, + ) + print(json.loads(result.extracted_content)[:5]) + + +# Browser Comparison +async def crawl_custom_browser_type(): + print("\n--- Browser Comparison ---") + + # Firefox + browser_config_firefox = BrowserConfig(browser_type="firefox", headless=True) + start = time.time() + async with AsyncWebCrawler(config=browser_config_firefox) as crawler: + result = await crawler.arun( + url="https://www.example.com", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), + ) + print("Firefox:", time.time() - start) + print(result.markdown[:500]) + + # 
WebKit + browser_config_webkit = BrowserConfig(browser_type="webkit", headless=True) + start = time.time() + async with AsyncWebCrawler(config=browser_config_webkit) as crawler: + result = await crawler.arun( + url="https://www.example.com", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), + ) + print("WebKit:", time.time() - start) + print(result.markdown[:500]) + + # Chromium (default) + browser_config_chromium = BrowserConfig(browser_type="chromium", headless=True) + start = time.time() + async with AsyncWebCrawler(config=browser_config_chromium) as crawler: + result = await crawler.arun( + url="https://www.example.com", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), + ) + print("Chromium:", time.time() - start) + print(result.markdown[:500]) + + +# Anti-Bot and User Simulation +async def crawl_with_user_simulation(): + browser_config = BrowserConfig( + headless=True, + user_agent_mode="random", + user_agent_generator_config={"device_type": "mobile", "os_type": "android"}, + ) + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + magic=True, + simulate_user=True, + override_navigator=True, + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config) + print(result.markdown) + + +async def ssl_certification(): + # Configure crawler to fetch SSL certificate + config = CrawlerRunConfig( + fetch_ssl_certificate=True, + cache_mode=CacheMode.BYPASS, # Bypass cache to always get fresh certificates + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + + if result.success and result.ssl_certificate: + cert = result.ssl_certificate + + tmp_dir = os.path.join(__location__, "tmp") + os.makedirs(tmp_dir, exist_ok=True) + + # 1. Access certificate properties directly + print("\nCertificate Information:") + print(f"Issuer: {cert.issuer.get('CN', '')}") + print(f"Valid until: {cert.valid_until}") + print(f"Fingerprint: {cert.fingerprint}") + + # 2. Export certificate in different formats + cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis + print("\nCertificate exported to:") + print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}") + + pem_data = cert.to_pem( + os.path.join(tmp_dir, "certificate.pem") + ) # For web servers + print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}") + + der_data = cert.to_der( + os.path.join(tmp_dir, "certificate.der") + ) # For Java apps + print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}") + + +# Main execution +async def main(): + # Basic examples + await simple_crawl() + await simple_example_with_running_js_code() + await simple_example_with_css_selector() + + # Advanced examples + await extract_structured_data_using_css_extractor() + await extract_structured_data_using_llm( + "openai/gpt-4o", os.getenv("OPENAI_API_KEY") + ) + await crawl_dynamic_content_pages_method_1() + await crawl_dynamic_content_pages_method_2() + + # Browser comparisons + await crawl_custom_browser_type() + + # Screenshot example + await capture_and_save_screenshot( + "https://www.example.com", + os.path.join(__location__, "tmp/example_screenshot.jpg") + ) + + +if __name__ == "__main__": + asyncio.run(main())