refactor(docs): update import statement in quickstart.py for improved clarity

2025-04-05 23:12:06 +08:00
parent 49d904ca0a
commit e1d9e2489c
5 changed files with 2 additions and 1082 deletions
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -1,6 +1,6 @@
 import os, sys

-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig

 sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -1,675 +0,0 @@
-import os, sys
-
-from crawl4ai import LLMConfig
-
-# append parent directory to system path
-sys.path.append(
-    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-)
-os.environ["FIRECRAWL_API_KEY"] = "fc-84b370ccfad44beabc686b38f1769692"
-
-import asyncio
-# import nest_asyncio
-# nest_asyncio.apply()
-
-import time
-import json
-import os
-import re
-from typing import Dict, List
-from bs4 import BeautifulSoup
-from pydantic import BaseModel, Field
-from crawl4ai import AsyncWebCrawler, CacheMode
-from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
-from crawl4ai.content_filter_strategy import PruningContentFilter
-from crawl4ai.extraction_strategy import (
-    JsonCssExtractionStrategy,
-    LLMExtractionStrategy,
-)
-
-__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
-
-print("Crawl4AI: Advanced Web Crawling and Data Extraction")
-print("GitHub Repository: https://github.com/unclecode/crawl4ai")
-print("Twitter: @unclecode")
-print("Website: https://crawl4ai.com")
-
-
-async def simple_crawl():
-    print("\n--- Basic Usage ---")
-    async with AsyncWebCrawler(verbose=True) as crawler:
-        result = await crawler.arun(
-            url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS
-        )
-        print(result.markdown[:500])  # Print first 500 characters
-
-
-async def simple_example_with_running_js_code():
-    print("\n--- Executing JavaScript and Using CSS Selectors ---")
-    # New code to handle the wait_for parameter
-    wait_for = """() => {
-        return Array.from(document.querySelectorAll('article.tease-card')).length > 10;
-    }"""
-
-    # wait_for can be also just a css selector
-    # wait_for = "article.tease-card:nth-child(10)"
-
-    async with AsyncWebCrawler(verbose=True) as crawler:
-        js_code = [
-            "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
-        ]
-        result = await crawler.arun(
-            url="https://www.nbcnews.com/business",
-            js_code=js_code,
-            # wait_for=wait_for,
-            cache_mode=CacheMode.BYPASS,
-        )
-        print(result.markdown[:500])  # Print first 500 characters
-
-
-async def simple_example_with_css_selector():
-    print("\n--- Using CSS Selectors ---")
-    async with AsyncWebCrawler(verbose=True) as crawler:
-        result = await crawler.arun(
-            url="https://www.nbcnews.com/business",
-            css_selector=".wide-tease-item__description",
-            cache_mode=CacheMode.BYPASS,
-        )
-        print(result.markdown[:500])  # Print first 500 characters
-
-
-async def use_proxy():
-    print("\n--- Using a Proxy ---")
-    print(
-        "Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example."
-    )
-    # Uncomment and modify the following lines to use a proxy
-    async with AsyncWebCrawler(
-        verbose=True, proxy="http://your-proxy-url:port"
-    ) as crawler:
-        result = await crawler.arun(
-            url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS
-        )
-        if result.success:
-            print(result.markdown[:500])  # Print first 500 characters
-
-
-async def capture_and_save_screenshot(url: str, output_path: str):
-    async with AsyncWebCrawler(verbose=True) as crawler:
-        result = await crawler.arun(
-            url=url, screenshot=True, cache_mode=CacheMode.BYPASS
-        )
-
-        if result.success and result.screenshot:
-            import base64
-
-            # Decode the base64 screenshot data
-            screenshot_data = base64.b64decode(result.screenshot)
-
-            # Save the screenshot as a JPEG file
-            with open(output_path, "wb") as f:
-                f.write(screenshot_data)
-
-            print(f"Screenshot saved successfully to {output_path}")
-        else:
-            print("Failed to capture screenshot")
-
-
-class OpenAIModelFee(BaseModel):
-    model_name: str = Field(..., description="Name of the OpenAI model.")
-    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
-    output_fee: str = Field(
-        ..., description="Fee for output token for the OpenAI model."
-    )
-
-
-async def extract_structured_data_using_llm(
-    provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
-):
-    print(f"\n--- Extracting Structured Data with {provider} ---")
-
-    if api_token is None and provider != "ollama":
-        print(f"API token is required for {provider}. Skipping this example.")
-        return
-
-    # extra_args = {}
-    extra_args = {
-        "temperature": 0,
-        "top_p": 0.9,
-        "max_tokens": 2000,
-        # any other supported parameters for litellm
-    }
-    if extra_headers:
-        extra_args["extra_headers"] = extra_headers
-
-    async with AsyncWebCrawler(verbose=True) as crawler:
-        result = await crawler.arun(
-            url="https://openai.com/api/pricing/",
-            word_count_threshold=1,
-            extraction_strategy=LLMExtractionStrategy(
-                llm_config=LLMConfig(provider=provider,api_token=api_token),
-                schema=OpenAIModelFee.model_json_schema(),
-                extraction_type="schema",
-                instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
-                Do not miss any models in the entire content. One extracted model JSON format should look like this: 
-                {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
-                extra_args=extra_args,
-            ),
-            cache_mode=CacheMode.BYPASS,
-        )
-        print(result.extracted_content)
-
-
-async def extract_structured_data_using_css_extractor():
-    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
-    schema = {
-        "name": "KidoCode Courses",
-        "baseSelector": "section.charge-methodology .w-tab-content > div",
-        "fields": [
-            {
-                "name": "section_title",
-                "selector": "h3.heading-50",
-                "type": "text",
-            },
-            {
-                "name": "section_description",
-                "selector": ".charge-content",
-                "type": "text",
-            },
-            {
-                "name": "course_name",
-                "selector": ".text-block-93",
-                "type": "text",
-            },
-            {
-                "name": "course_description",
-                "selector": ".course-content-text",
-                "type": "text",
-            },
-            {
-                "name": "course_icon",
-                "selector": ".image-92",
-                "type": "attribute",
-                "attribute": "src",
-            },
-        ],
-    }
-
-    async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
-        # Create the JavaScript that handles clicking multiple times
-        js_click_tabs = """
-        (async () => {
-            const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
-            
-            for(let tab of tabs) {
-                // scroll to the tab
-                tab.scrollIntoView();
-                tab.click();
-                // Wait for content to load and animations to complete
-                await new Promise(r => setTimeout(r, 500));
-            }
-        })();
-        """
-
-        result = await crawler.arun(
-            url="https://www.kidocode.com/degrees/technology",
-            extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
-            js_code=[js_click_tabs],
-            cache_mode=CacheMode.BYPASS,
-        )
-
-        companies = json.loads(result.extracted_content)
-        print(f"Successfully extracted {len(companies)} companies")
-        print(json.dumps(companies[0], indent=2))
-
-
-# Advanced Session-Based Crawling with Dynamic Content 🔄
-async def crawl_dynamic_content_pages_method_1():
-    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
-    first_commit = ""
-
-    async def on_execution_started(page):
-        nonlocal first_commit
-        try:
-            while True:
-                await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
-                commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
-                commit = await commit.evaluate("(element) => element.textContent")
-                commit = re.sub(r"\s+", "", commit)
-                if commit and commit != first_commit:
-                    first_commit = commit
-                    break
-                await asyncio.sleep(0.5)
-        except Exception as e:
-            print(f"Warning: New content didn't appear after JavaScript execution: {e}")
-
-    async with AsyncWebCrawler(verbose=True) as crawler:
-        crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
-
-        url = "https://github.com/microsoft/TypeScript/commits/main"
-        session_id = "typescript_commits_session"
-        all_commits = []
-
-        js_next_page = """
-        (() => {
-            const button = document.querySelector('a[data-testid="pagination-next-button"]');
-            if (button) button.click();
-        })();
-        """
-
-        for page in range(3):  # Crawl 3 pages
-            result = await crawler.arun(
-                url=url,
-                session_id=session_id,
-                css_selector="li.Box-sc-g0xbh4-0",
-                js=js_next_page if page > 0 else None,
-                cache_mode=CacheMode.BYPASS,
-                js_only=page > 0,
-                headless=False,
-            )
-
-            assert result.success, f"Failed to crawl page {page + 1}"
-
-            soup = BeautifulSoup(result.cleaned_html, "html.parser")
-            commits = soup.select("li")
-            all_commits.extend(commits)
-
-            print(f"Page {page + 1}: Found {len(commits)} commits")
-
-        await crawler.crawler_strategy.kill_session(session_id)
-        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
-
-
-async def crawl_dynamic_content_pages_method_2():
-    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
-
-    async with AsyncWebCrawler(verbose=True) as crawler:
-        url = "https://github.com/microsoft/TypeScript/commits/main"
-        session_id = "typescript_commits_session"
-        all_commits = []
-        last_commit = ""
-
-        js_next_page_and_wait = """
-        (async () => {
-            const getCurrentCommit = () => {
-                const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
-                return commits.length > 0 ? commits[0].textContent.trim() : null;
-            };
-
-            const initialCommit = getCurrentCommit();
-            const button = document.querySelector('a[data-testid="pagination-next-button"]');
-            if (button) button.click();
-
-            // Poll for changes
-            while (true) {
-                await new Promise(resolve => setTimeout(resolve, 100)); // Wait 100ms
-                const newCommit = getCurrentCommit();
-                if (newCommit && newCommit !== initialCommit) {
-                    break;
-                }
-            }
-        })();
-        """
-
-        schema = {
-            "name": "Commit Extractor",
-            "baseSelector": "li.Box-sc-g0xbh4-0",
-            "fields": [
-                {
-                    "name": "title",
-                    "selector": "h4.markdown-title",
-                    "type": "text",
-                    "transform": "strip",
-                },
-            ],
-        }
-        extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
-
-        for page in range(3):  # Crawl 3 pages
-            result = await crawler.arun(
-                url=url,
-                session_id=session_id,
-                css_selector="li.Box-sc-g0xbh4-0",
-                extraction_strategy=extraction_strategy,
-                js_code=js_next_page_and_wait if page > 0 else None,
-                js_only=page > 0,
-                cache_mode=CacheMode.BYPASS,
-                headless=False,
-            )
-
-            assert result.success, f"Failed to crawl page {page + 1}"
-
-            commits = json.loads(result.extracted_content)
-            all_commits.extend(commits)
-
-            print(f"Page {page + 1}: Found {len(commits)} commits")
-
-        await crawler.crawler_strategy.kill_session(session_id)
-        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
-
-
-async def crawl_dynamic_content_pages_method_3():
-    print(
-        "\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---"
-    )
-
-    async with AsyncWebCrawler(verbose=True) as crawler:
-        url = "https://github.com/microsoft/TypeScript/commits/main"
-        session_id = "typescript_commits_session"
-        all_commits = []
-
-        js_next_page = """
-        const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
-        if (commits.length > 0) {
-            window.firstCommit = commits[0].textContent.trim();
-        }
-        const button = document.querySelector('a[data-testid="pagination-next-button"]');
-        if (button) button.click();
-        """
-
-        wait_for = """() => {
-            const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
-            if (commits.length === 0) return false;
-            const firstCommit = commits[0].textContent.trim();
-            return firstCommit !== window.firstCommit;
-        }"""
-
-        schema = {
-            "name": "Commit Extractor",
-            "baseSelector": "li.Box-sc-g0xbh4-0",
-            "fields": [
-                {
-                    "name": "title",
-                    "selector": "h4.markdown-title",
-                    "type": "text",
-                    "transform": "strip",
-                },
-            ],
-        }
-        extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
-
-        for page in range(3):  # Crawl 3 pages
-            result = await crawler.arun(
-                url=url,
-                session_id=session_id,
-                css_selector="li.Box-sc-g0xbh4-0",
-                extraction_strategy=extraction_strategy,
-                js_code=js_next_page if page > 0 else None,
-                wait_for=wait_for if page > 0 else None,
-                js_only=page > 0,
-                cache_mode=CacheMode.BYPASS,
-                headless=False,
-            )
-
-            assert result.success, f"Failed to crawl page {page + 1}"
-
-            commits = json.loads(result.extracted_content)
-            all_commits.extend(commits)
-
-            print(f"Page {page + 1}: Found {len(commits)} commits")
-
-        await crawler.crawler_strategy.kill_session(session_id)
-        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
-
-
-async def crawl_custom_browser_type():
-    # Use Firefox
-    start = time.time()
-    async with AsyncWebCrawler(
-        browser_type="firefox", verbose=True, headless=True
-    ) as crawler:
-        result = await crawler.arun(
-            url="https://www.example.com", cache_mode=CacheMode.BYPASS
-        )
-        print(result.markdown[:500])
-        print("Time taken: ", time.time() - start)
-
-    # Use WebKit
-    start = time.time()
-    async with AsyncWebCrawler(
-        browser_type="webkit", verbose=True, headless=True
-    ) as crawler:
-        result = await crawler.arun(
-            url="https://www.example.com", cache_mode=CacheMode.BYPASS
-        )
-        print(result.markdown[:500])
-        print("Time taken: ", time.time() - start)
-
-    # Use Chromium (default)
-    start = time.time()
-    async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
-        result = await crawler.arun(
-            url="https://www.example.com", cache_mode=CacheMode.BYPASS
-        )
-        print(result.markdown[:500])
-        print("Time taken: ", time.time() - start)
-
-
-async def crawl_with_user_simultion():
-    async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
-        url = "YOUR-URL-HERE"
-        result = await crawler.arun(
-            url=url,
-            cache_mode=CacheMode.BYPASS,
-            magic=True,  # Automatically detects and removes overlays, popups, and other elements that block content
-            # simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction
-            # override_navigator = True # Overrides the navigator object to make it look like a real user
-        )
-
-        print(result.markdown)
-
-
-async def speed_comparison():
-    # print("\n--- Speed Comparison ---")
-    # print("Firecrawl (simulated):")
-    # print("Time taken: 7.02 seconds")
-    # print("Content length: 42074 characters")
-    # print("Images found: 49")
-    # print()
-    # Simulated Firecrawl performance
-    from firecrawl import FirecrawlApp
-
-    app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
-    start = time.time()
-    scrape_status = app.scrape_url(
-        "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
-    )
-    end = time.time()
-    print("Firecrawl:")
-    print(f"Time taken: {end - start:.2f} seconds")
-    print(f"Content length: {len(scrape_status['markdown'])} characters")
-    print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
-    print()
-
-    async with AsyncWebCrawler() as crawler:
-        # Crawl4AI simple crawl
-        start = time.time()
-        result = await crawler.arun(
-            url="https://www.nbcnews.com/business",
-            word_count_threshold=0,
-            cache_mode=CacheMode.BYPASS,
-            verbose=False,
-        )
-        end = time.time()
-        print("Crawl4AI (simple crawl):")
-        print(f"Time taken: {end - start:.2f} seconds")
-        print(f"Content length: {len(result.markdown)} characters")
-        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
-        print()
-
-        # Crawl4AI with advanced content filtering
-        start = time.time()
-        result = await crawler.arun(
-            url="https://www.nbcnews.com/business",
-            word_count_threshold=0,
-            markdown_generator=DefaultMarkdownGenerator(
-                content_filter=PruningContentFilter(
-                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
-                )
-                # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
-            ),
-            cache_mode=CacheMode.BYPASS,
-            verbose=False,
-        )
-        end = time.time()
-        print("Crawl4AI (Markdown Plus):")
-        print(f"Time taken: {end - start:.2f} seconds")
-        print(f"Content length: {len(result.markdown.raw_markdown)} characters")
-        print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
-        print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
-        print()
-
-        # Crawl4AI with JavaScript execution
-        start = time.time()
-        result = await crawler.arun(
-            url="https://www.nbcnews.com/business",
-            js_code=[
-                "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
-            ],
-            word_count_threshold=0,
-            cache_mode=CacheMode.BYPASS,
-            markdown_generator=DefaultMarkdownGenerator(
-                content_filter=PruningContentFilter(
-                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
-                )
-                # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
-            ),
-            verbose=False,
-        )
-        end = time.time()
-        print("Crawl4AI (with JavaScript execution):")
-        print(f"Time taken: {end - start:.2f} seconds")
-        print(f"Content length: {len(result.markdown.raw_markdown)} characters")
-        print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
-        print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
-
-    print("\nNote on Speed Comparison:")
-    print("The speed test conducted here may not reflect optimal conditions.")
-    print("When we call Firecrawl's API, we're seeing its best performance,")
-    print("while Crawl4AI's performance is limited by the local network speed.")
-    print("For a more accurate comparison, it's recommended to run these tests")
-    print("on servers with a stable and fast internet connection.")
-    print("Despite these limitations, Crawl4AI still demonstrates faster performance.")
-    print("If you run these tests in an environment with better network conditions,")
-    print("you may observe an even more significant speed advantage for Crawl4AI.")
-
-
-async def generate_knowledge_graph():
-    class Entity(BaseModel):
-        name: str
-        description: str
-
-    class Relationship(BaseModel):
-        entity1: Entity
-        entity2: Entity
-        description: str
-        relation_type: str
-
-    class KnowledgeGraph(BaseModel):
-        entities: List[Entity]
-        relationships: List[Relationship]
-
-    extraction_strategy = LLMExtractionStrategy(
-        llm_config=LLMConfig(provider="openai/gpt-4o-mini",  api_token=os.getenv("OPENAI_API_KEY")),  # In case of Ollama just pass "no-token"
-        schema=KnowledgeGraph.model_json_schema(),
-        extraction_type="schema",
-        instruction="""Extract entities and relationships from the given text.""",
-    )
-    async with AsyncWebCrawler() as crawler:
-        url = "https://paulgraham.com/love.html"
-        result = await crawler.arun(
-            url=url,
-            cache_mode=CacheMode.BYPASS,
-            extraction_strategy=extraction_strategy,
-            # magic=True
-        )
-        # print(result.extracted_content)
-        with open(os.path.join(__location__, "kb.json"), "w") as f:
-            f.write(result.extracted_content)
-
-
-async def fit_markdown_remove_overlay():
-    async with AsyncWebCrawler(
-        headless=True,  # Set to False to see what is happening
-        verbose=True,
-        user_agent_mode="random",
-        user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
-    ) as crawler:
-        result = await crawler.arun(
-            url="https://www.kidocode.com/degrees/technology",
-            cache_mode=CacheMode.BYPASS,
-            markdown_generator=DefaultMarkdownGenerator(
-                content_filter=PruningContentFilter(
-                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
-                ),
-                options={"ignore_links": True},
-            ),
-            # markdown_generator=DefaultMarkdownGenerator(
-            #     content_filter=BM25ContentFilter(user_query="", bm25_threshold=1.0),
-            #     options={
-            #         "ignore_links": True
-            #     }
-            # ),
-        )
-
-        if result.success:
-            print(len(result.markdown.raw_markdown))
-            print(len(result.markdown.markdown_with_citations))
-            print(len(result.markdown.fit_markdown))
-
-            # Save clean html
-            with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f:
-                f.write(result.cleaned_html)
-
-            with open(
-                os.path.join(__location__, "output/output_raw_markdown.md"), "w"
-            ) as f:
-                f.write(result.markdown.raw_markdown)
-
-            with open(
-                os.path.join(__location__, "output/output_markdown_with_citations.md"),
-                "w",
-            ) as f:
-                f.write(result.markdown.markdown_with_citations)
-
-            with open(
-                os.path.join(__location__, "output/output_fit_markdown.md"), "w"
-            ) as f:
-                f.write(result.markdown.fit_markdown)
-
-    print("Done")
-
-
-async def main():
-    # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
-
-    # await simple_crawl()
-    # await simple_example_with_running_js_code()
-    # await simple_example_with_css_selector()
-    # # await use_proxy()
-    # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
-    # await extract_structured_data_using_css_extractor()
-
-    # LLM extraction examples
-    # await extract_structured_data_using_llm()
-    # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
-    # await extract_structured_data_using_llm("ollama/llama3.2")
-
-    # You always can pass custom headers to the extraction strategy
-    # custom_headers = {
-    #     "Authorization": "Bearer your-custom-token",
-    #     "X-Custom-Header": "Some-Value"
-    # }
-    # await extract_structured_data_using_llm(extra_headers=custom_headers)
-
-    # await crawl_dynamic_content_pages_method_1()
-    # await crawl_dynamic_content_pages_method_2()
-    await crawl_dynamic_content_pages_method_3()
-
-    # await crawl_custom_browser_type()
-
-    # await speed_comparison()
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
--- a/docs/examples/quickstart_examples_set_1.py
+++ b/docs/examples/quickstart_examples_set_1.py
--- a/docs/examples/quickstart_examples_set_2.py
+++ b/docs/examples/quickstart_examples_set_2.py
@@ -1,6 +1,6 @@
 import os, sys

-from crawl4ai import LLMConfig
+from crawl4ai.types import LLMConfig

 sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
--- a/docs/examples/quickstart_sync.py
+++ b/docs/examples/quickstart_sync.py
@@ -1,405 +0,0 @@
-import os
-import time
-from crawl4ai import LLMConfig
-from crawl4ai.web_crawler import WebCrawler
-from crawl4ai.chunking_strategy import *
-from crawl4ai.extraction_strategy import *
-from crawl4ai.crawler_strategy import *
-from rich import print
-from rich.console import Console
-from functools import lru_cache
-
-console = Console()
-
-
-@lru_cache()
-def create_crawler():
-    crawler = WebCrawler(verbose=True)
-    crawler.warmup()
-    return crawler
-
-
-def print_result(result):
-    # Print each key in one line and just the first 10 characters of each one's value and three dots
-    console.print("\t[bold]Result:[/bold]")
-    for key, value in result.model_dump().items():
-        if isinstance(value, str) and value:
-            console.print(f"\t{key}: [green]{value[:20]}...[/green]")
-    if result.extracted_content:
-        items = json.loads(result.extracted_content)
-        print(f"\t[bold]{len(items)} blocks is extracted![/bold]")
-
-
-def cprint(message, press_any_key=False):
-    console.print(message)
-    if press_any_key:
-        console.print("Press any key to continue...", style="")
-        input()
-
-
-def basic_usage(crawler):
-    cprint(
-        "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
-    )
-    result = crawler.run(url="https://www.nbcnews.com/business", only_text=True)
-    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
-    print_result(result)
-
-
-def basic_usage_some_params(crawler):
-    cprint(
-        "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
-    )
-    result = crawler.run(
-        url="https://www.nbcnews.com/business", word_count_threshold=1, only_text=True
-    )
-    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
-    print_result(result)
-
-
-def screenshot_usage(crawler):
-    cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
-    result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
-    cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
-    # Save the screenshot to a file
-    with open("screenshot.png", "wb") as f:
-        f.write(base64.b64decode(result.screenshot))
-    cprint("Screenshot saved to 'screenshot.png'!")
-    print_result(result)
-
-
-def understanding_parameters(crawler):
-    cprint(
-        "\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]"
-    )
-    cprint(
-        "By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action."
-    )
-
-    # First crawl (reads from cache)
-    cprint("1️⃣ First crawl (caches the result):", True)
-    start_time = time.time()
-    result = crawler.run(url="https://www.nbcnews.com/business")
-    end_time = time.time()
-    cprint(
-        f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time} seconds and result (from cache):[/bold yellow]"
-    )
-    print_result(result)
-
-    # Force to crawl again
-    cprint("2️⃣ Second crawl (Force to crawl again):", True)
-    start_time = time.time()
-    result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
-    end_time = time.time()
-    cprint(
-        f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]"
-    )
-    print_result(result)
-
-
-def add_chunking_strategy(crawler):
-    # Adding a chunking strategy: RegexChunking
-    cprint(
-        "\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]",
-        True,
-    )
-    cprint(
-        "RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!"
-    )
-    result = crawler.run(
-        url="https://www.nbcnews.com/business",
-        chunking_strategy=RegexChunking(patterns=["\n\n"]),
-    )
-    cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]")
-    print_result(result)
-
-    # Adding another chunking strategy: NlpSentenceChunking
-    cprint(
-        "\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]",
-        True,
-    )
-    cprint(
-        "NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!"
-    )
-    result = crawler.run(
-        url="https://www.nbcnews.com/business", chunking_strategy=NlpSentenceChunking()
-    )
-    cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
-    print_result(result)
-
-
-def add_extraction_strategy(crawler):
-    # Adding an extraction strategy: CosineStrategy
-    cprint(
-        "\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]",
-        True,
-    )
-    cprint(
-        "CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!"
-    )
-    result = crawler.run(
-        url="https://www.nbcnews.com/business",
-        extraction_strategy=CosineStrategy(
-            word_count_threshold=10,
-            max_dist=0.2,
-            linkage_method="ward",
-            top_k=3,
-            sim_threshold=0.3,
-            verbose=True,
-        ),
-    )
-    cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
-    print_result(result)
-
-    # Using semantic_filter with CosineStrategy
-    cprint(
-        "You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!"
-    )
-    result = crawler.run(
-        url="https://www.nbcnews.com/business",
-        extraction_strategy=CosineStrategy(
-            semantic_filter="inflation rent prices",
-        ),
-    )
-    cprint(
-        "[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]"
-    )
-    print_result(result)
-
-
-def add_llm_extraction_strategy(crawler):
-    # Adding an LLM extraction strategy without instructions
-    cprint(
-        "\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]",
-        True,
-    )
-    cprint(
-        "LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!"
-    )
-    result = crawler.run(
-        url="https://www.nbcnews.com/business",
-        extraction_strategy=LLMExtractionStrategy(
-            llm_config =  LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
-        ),
-    )
-    cprint(
-        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]"
-    )
-    print_result(result)
-
-    # Adding an LLM extraction strategy with instructions
-    cprint(
-        "\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]",
-        True,
-    )
-    cprint(
-        "Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!"
-    )
-    result = crawler.run(
-        url="https://www.nbcnews.com/business",
-        extraction_strategy=LLMExtractionStrategy(
-            llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
-            instruction="I am interested in only financial news",
-        ),
-    )
-    cprint(
-        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]"
-    )
-    print_result(result)
-
-    result = crawler.run(
-        url="https://www.nbcnews.com/business",
-        extraction_strategy=LLMExtractionStrategy(
-            llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
-            instruction="Extract only content related to technology",
-        ),
-    )
-    cprint(
-        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]"
-    )
-    print_result(result)
-
-
-def targeted_extraction(crawler):
-    # Using a CSS selector to extract only H2 tags
-    cprint(
-        "\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]",
-        True,
-    )
-    result = crawler.run(url="https://www.nbcnews.com/business", css_selector="h2")
-    cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
-    print_result(result)
-
-
-def interactive_extraction(crawler):
-    # Passing JavaScript code to interact with the page
-    cprint(
-        "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]",
-        True,
-    )
-    cprint(
-        "In this example we try to click the 'Load More' button on the page using JavaScript code."
-    )
-    js_code = """
-    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
-    loadMoreButton && loadMoreButton.click();
-    """
-    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
-    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
-    result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
-    cprint(
-        "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
-    )
-    print_result(result)
-
-
-def multiple_scrip(crawler):
-    # Passing JavaScript code to interact with the page
-    cprint(
-        "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]",
-        True,
-    )
-    cprint(
-        "In this example we try to click the 'Load More' button on the page using JavaScript code."
-    )
-    js_code = [
-        """
-    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
-    loadMoreButton && loadMoreButton.click();
-    """
-    ] * 2
-    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
-    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
-    result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
-    cprint(
-        "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
-    )
-    print_result(result)
-
-
-def using_crawler_hooks(crawler):
-    # Example usage of the hooks for authentication and setting a cookie
-    def on_driver_created(driver):
-        print("[HOOK] on_driver_created")
-        # Example customization: maximize the window
-        driver.maximize_window()
-
-        # Example customization: logging in to a hypothetical website
-        driver.get("https://example.com/login")
-
-        from selenium.webdriver.support.ui import WebDriverWait
-        from selenium.webdriver.common.by import By
-        from selenium.webdriver.support import expected_conditions as EC
-
-        WebDriverWait(driver, 10).until(
-            EC.presence_of_element_located((By.NAME, "username"))
-        )
-        driver.find_element(By.NAME, "username").send_keys("testuser")
-        driver.find_element(By.NAME, "password").send_keys("password123")
-        driver.find_element(By.NAME, "login").click()
-        WebDriverWait(driver, 10).until(
-            EC.presence_of_element_located((By.ID, "welcome"))
-        )
-        # Add a custom cookie
-        driver.add_cookie({"name": "test_cookie", "value": "cookie_value"})
-        return driver
-
-    def before_get_url(driver):
-        print("[HOOK] before_get_url")
-        # Example customization: add a custom header
-        # Enable Network domain for sending headers
-        driver.execute_cdp_cmd("Network.enable", {})
-        # Add a custom header
-        driver.execute_cdp_cmd(
-            "Network.setExtraHTTPHeaders", {"headers": {"X-Test-Header": "test"}}
-        )
-        return driver
-
-    def after_get_url(driver):
-        print("[HOOK] after_get_url")
-        # Example customization: log the URL
-        print(driver.current_url)
-        return driver
-
-    def before_return_html(driver, html):
-        print("[HOOK] before_return_html")
-        # Example customization: log the HTML
-        print(len(html))
-        return driver
-
-    cprint(
-        "\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]",
-        True,
-    )
-
-    crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
-    crawler_strategy.set_hook("on_driver_created", on_driver_created)
-    crawler_strategy.set_hook("before_get_url", before_get_url)
-    crawler_strategy.set_hook("after_get_url", after_get_url)
-    crawler_strategy.set_hook("before_return_html", before_return_html)
-
-    crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
-    crawler.warmup()
-    result = crawler.run(url="https://example.com")
-
-    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
-    print_result(result=result)
-
-
-def using_crawler_hooks_dleay_example(crawler):
-    def delay(driver):
-        print("Delaying for 5 seconds...")
-        time.sleep(5)
-        print("Resuming...")
-
-    def create_crawler():
-        crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
-        crawler_strategy.set_hook("after_get_url", delay)
-        crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
-        crawler.warmup()
-        return crawler
-
-    cprint(
-        "\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]"
-    )
-    crawler = create_crawler()
-    result = crawler.run(url="https://google.com", bypass_cache=True)
-
-    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
-    print_result(result)
-
-
-def main():
-    cprint(
-        "🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]"
-    )
-    cprint(
-        "⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]"
-    )
-    cprint(
-        "If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files."
-    )
-
-    crawler = create_crawler()
-
-    crawler.always_by_pass_cache = True
-    basic_usage(crawler)
-    # basic_usage_some_params(crawler)
-    understanding_parameters(crawler)
-
-    crawler.always_by_pass_cache = True
-    screenshot_usage(crawler)
-    add_chunking_strategy(crawler)
-    add_extraction_strategy(crawler)
-    add_llm_extraction_strategy(crawler)
-    targeted_extraction(crawler)
-    interactive_extraction(crawler)
-    multiple_scrip(crawler)
-
-    cprint(
-        "\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]"
-    )
-
-
-if __name__ == "__main__":
-    main()