feat(proxy): add proxy rotation strategy

Implements a new proxy rotation system with the following changes: - Add ProxyRotationStrategy abstract base class - Add RoundRobinProxyStrategy concrete implementation - Integrate proxy rotation with AsyncWebCrawler - Add proxy_rotation_strategy parameter to CrawlerRunConfig - Add example script demonstrating proxy rotation usage - Remove deprecated synchronous WebCrawler code - Clean up rate limiting documentation BREAKING CHANGE: Removed synchronous WebCrawler support and related rate limiting configurations
2025-02-09 18:49:10 +08:00
parent b957ff2ecd
commit 19df96ed56
12 changed files with 257 additions and 162 deletions
--- a/docs/releases_review/v0.3.74.overview.py
+++ b/docs/releases_review/v0.3.74.overview.py
@@ -0,0 +1,276 @@
+import os, sys
+
+# append the parent directory to the sys.path
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+parent_parent_dir = os.path.dirname(parent_dir)
+sys.path.append(parent_parent_dir)
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+__data__ = os.path.join(__location__, "__data")
+import asyncio
+from pathlib import Path
+import aiohttp
+import json
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.content_filter_strategy import BM25ContentFilter
+
+
+# 1. File Download Processing Example
+async def download_example():
+    """Example of downloading files from Python.org"""
+    # downloads_path = os.path.join(os.getcwd(), "downloads")
+    downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
+    os.makedirs(downloads_path, exist_ok=True)
+
+    print(f"Downloads will be saved to: {downloads_path}")
+
+    async with AsyncWebCrawler(
+        accept_downloads=True, downloads_path=downloads_path, verbose=True
+    ) as crawler:
+        result = await crawler.arun(
+            url="https://www.python.org/downloads/",
+            js_code="""
+            // Find and click the first Windows installer link
+            const downloadLink = document.querySelector('a[href$=".exe"]');
+            if (downloadLink) {
+                console.log('Found download link:', downloadLink.href);
+                downloadLink.click();
+            } else {
+                console.log('No .exe download link found');
+            }
+            """,
+            delay_before_return_html=1,  # Wait 5 seconds to ensure download starts
+            cache_mode=CacheMode.BYPASS,
+        )
+
+        if result.downloaded_files:
+            print("\nDownload successful!")
+            print("Downloaded files:")
+            for file_path in result.downloaded_files:
+                print(f"- {file_path}")
+                print(f"  File size: {os.path.getsize(file_path) / (1024*1024):.2f} MB")
+        else:
+            print("\nNo files were downloaded")
+
+
+# 2. Local File and Raw HTML Processing Example
+async def local_and_raw_html_example():
+    """Example of processing local files and raw HTML"""
+    # Create a sample HTML file
+    sample_file = os.path.join(__data__, "sample.html")
+    with open(sample_file, "w") as f:
+        f.write(
+            """
+        <html><body>
+            <h1>Test Content</h1>
+            <p>This is a test paragraph.</p>
+        </body></html>
+        """
+        )
+
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        # Process local file
+        local_result = await crawler.arun(url=f"file://{os.path.abspath(sample_file)}")
+
+        # Process raw HTML
+        raw_html = """
+        <html><body>
+            <h1>Raw HTML Test</h1>
+            <p>This is a test of raw HTML processing.</p>
+        </body></html>
+        """
+        raw_result = await crawler.arun(url=f"raw:{raw_html}")
+
+        # Clean up
+        os.remove(sample_file)
+
+        print("Local file content:", local_result.markdown)
+        print("\nRaw HTML content:", raw_result.markdown)
+
+
+# 3. Enhanced Markdown Generation Example
+async def markdown_generation_example():
+    """Example of enhanced markdown generation with citations and LLM-friendly features"""
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        # Create a content filter (optional)
+        content_filter = BM25ContentFilter(
+            # user_query="History and cultivation",
+            bm25_threshold=1.0
+        )
+
+        result = await crawler.arun(
+            url="https://en.wikipedia.org/wiki/Apple",
+            css_selector="main div#bodyContent",
+            content_filter=content_filter,
+            cache_mode=CacheMode.BYPASS,
+        )
+
+        from crawl4ai.content_filter_strategy import BM25ContentFilter
+
+        result = await crawler.arun(
+            url="https://en.wikipedia.org/wiki/Apple",
+            css_selector="main div#bodyContent",
+            content_filter=BM25ContentFilter(),
+        )
+        print(result.markdown_v2.fit_markdown)
+
+        print("\nMarkdown Generation Results:")
+        print(f"1. Original markdown length: {len(result.markdown)}")
+        print("2. New markdown versions (markdown_v2):")
+        print(f"   - Raw markdown length: {len(result.markdown_v2.raw_markdown)}")
+        print(
+            f"   - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}"
+        )
+        print(
+            f"   - References section length: {len(result.markdown_v2.references_markdown)}"
+        )
+        if result.markdown_v2.fit_markdown:
+            print(
+                f"   - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}"
+            )
+
+        # Save examples to files
+        output_dir = os.path.join(__data__, "markdown_examples")
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Save different versions
+        with open(os.path.join(output_dir, "1_raw_markdown.md"), "w") as f:
+            f.write(result.markdown_v2.raw_markdown)
+
+        with open(os.path.join(output_dir, "2_citations_markdown.md"), "w") as f:
+            f.write(result.markdown_v2.markdown_with_citations)
+
+        with open(os.path.join(output_dir, "3_references.md"), "w") as f:
+            f.write(result.markdown_v2.references_markdown)
+
+        if result.markdown_v2.fit_markdown:
+            with open(os.path.join(output_dir, "4_filtered_markdown.md"), "w") as f:
+                f.write(result.markdown_v2.fit_markdown)
+
+        print(f"\nMarkdown examples saved to: {output_dir}")
+
+        # Show a sample of citations and references
+        print("\nSample of markdown with citations:")
+        print(result.markdown_v2.markdown_with_citations[:500] + "...\n")
+        print("Sample of references:")
+        print(
+            "\n".join(result.markdown_v2.references_markdown.split("\n")[:10]) + "..."
+        )
+
+
+# 4. Browser Management Example
+async def browser_management_example():
+    """Example of using enhanced browser management features"""
+    # Use the specified user directory path
+    user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile")
+    os.makedirs(user_data_dir, exist_ok=True)
+
+    print(f"Browser profile will be saved to: {user_data_dir}")
+
+    async with AsyncWebCrawler(
+        use_managed_browser=True,
+        user_data_dir=user_data_dir,
+        headless=False,
+        verbose=True,
+    ) as crawler:
+        result = await crawler.arun(
+            url="https://crawl4ai.com",
+            # session_id="persistent_session_1",
+            cache_mode=CacheMode.BYPASS,
+        )
+        # Use GitHub as an example - it's a good test for browser management
+        # because it requires proper browser handling
+        result = await crawler.arun(
+            url="https://github.com/trending",
+            # session_id="persistent_session_1",
+            cache_mode=CacheMode.BYPASS,
+        )
+
+        print("\nBrowser session result:", result.success)
+        if result.success:
+            print("Page title:", result.metadata.get("title", "No title found"))
+
+
+# 5. API Usage Example
+async def api_example():
+    """Example of using the new API endpoints"""
+    api_token = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
+    headers = {"Authorization": f"Bearer {api_token}"}
+    async with aiohttp.ClientSession() as session:
+        # Submit crawl job
+        crawl_request = {
+            "urls": ["https://news.ycombinator.com"],  # Hacker News as an example
+            "extraction_config": {
+                "type": "json_css",
+                "params": {
+                    "schema": {
+                        "name": "Hacker News Articles",
+                        "baseSelector": ".athing",
+                        "fields": [
+                            {"name": "title", "selector": ".title a", "type": "text"},
+                            {"name": "score", "selector": ".score", "type": "text"},
+                            {
+                                "name": "url",
+                                "selector": ".title a",
+                                "type": "attribute",
+                                "attribute": "href",
+                            },
+                        ],
+                    }
+                },
+            },
+            "crawler_params": {
+                "headless": True,
+                # "use_managed_browser": True
+            },
+            "cache_mode": "bypass",
+            # "screenshot": True,
+            # "magic": True
+        }
+
+        async with session.post(
+            "http://localhost:11235/crawl", json=crawl_request, headers=headers
+        ) as response:
+            task_data = await response.json()
+            task_id = task_data["task_id"]
+
+            # Check task status
+            while True:
+                async with session.get(
+                    f"http://localhost:11235/task/{task_id}", headers=headers
+                ) as status_response:
+                    result = await status_response.json()
+                    print(f"Task status: {result['status']}")
+
+                    if result["status"] == "completed":
+                        print("Task completed!")
+                        print("Results:")
+                        news = json.loads(result["results"][0]["extracted_content"])
+                        print(json.dumps(news[:4], indent=2))
+                        break
+                    else:
+                        await asyncio.sleep(1)
+
+
+# Main execution
+async def main():
+    # print("Running Crawl4AI feature examples...")
+
+    # print("\n1. Running Download Example:")
+    # await download_example()
+
+    # print("\n2. Running Markdown Generation Example:")
+    # await markdown_generation_example()
+
+    # # print("\n3. Running Local and Raw HTML Example:")
+    # await local_and_raw_html_example()
+
+    # # print("\n4. Running Browser Management Example:")
+    await browser_management_example()
+
+    # print("\n5. Running API Example:")
+    await api_example()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())