Merge branch '2025-MAY-2' into next-MAY

This commit is contained in:
ntohidi
2025-07-08 11:46:13 +02:00
28 changed files with 448 additions and 154 deletions

View File

@@ -66,29 +66,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c
```python
import os, asyncio
from base64 import b64decode
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
async def main():
run_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
screenshot=True,
pdf=True
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
cache_mode=CacheMode.BYPASS,
pdf=True,
screenshot=True
config=run_config
)
if result.success:
# Save screenshot
print(f"Screenshot data present: {result.screenshot is not None}")
print(f"PDF data present: {result.pdf is not None}")
if result.screenshot:
print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes")
with open("wikipedia_screenshot.png", "wb") as f:
f.write(b64decode(result.screenshot))
# Save PDF
else:
print("[WARN] Screenshot data is None.")
if result.pdf:
print(f"[OK] PDF captured, size: {len(result.pdf)} bytes")
with open("wikipedia_page.pdf", "wb") as f:
f.write(result.pdf)
print("[OK] PDF & screenshot captured.")
else:
print("[WARN] PDF data is None.")
else:
print("[ERROR]", result.error_message)

View File

@@ -25,44 +25,70 @@ Use an authenticated proxy with `BrowserConfig`:
```python
from crawl4ai.async_configs import BrowserConfig
proxy_config = {
"server": "http://proxy.example.com:8080",
"username": "user",
"password": "pass"
}
browser_config = BrowserConfig(proxy_config=proxy_config)
browser_config = BrowserConfig(proxy="http://[username]:[password]@[host]:[port]")
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com")
```
Here's the corrected documentation:
## Rotating Proxies
Example using a proxy rotation service dynamically:
```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
async def get_next_proxy():
# Your proxy rotation logic here
return {"server": "http://next.proxy.com:8080"}
import re
from crawl4ai import (
AsyncWebCrawler,
BrowserConfig,
CrawlerRunConfig,
CacheMode,
RoundRobinProxyStrategy,
)
import asyncio
from crawl4ai import ProxyConfig
async def main():
browser_config = BrowserConfig()
run_config = CrawlerRunConfig()
async with AsyncWebCrawler(config=browser_config) as crawler:
# For each URL, create a new run config with different proxy
for url in urls:
proxy = await get_next_proxy()
# Clone the config and update proxy - this creates a new browser context
current_config = run_config.clone(proxy_config=proxy)
result = await crawler.arun(url=url, config=current_config)
# Load proxies and create rotation strategy
proxies = ProxyConfig.from_env()
#eg: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
if not proxies:
print("No proxies found in environment. Set PROXIES env variable!")
return
proxy_strategy = RoundRobinProxyStrategy(proxies)
# Create configs
browser_config = BrowserConfig(headless=True, verbose=False)
run_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
proxy_rotation_strategy=proxy_strategy
)
async with AsyncWebCrawler(config=browser_config) as crawler:
urls = ["https://httpbin.org/ip"] * (len(proxies) * 2) # Test each proxy twice
print("\n📈 Initializing crawler with proxy rotation...")
async with AsyncWebCrawler(config=browser_config) as crawler:
print("\n🚀 Starting batch crawl with proxy rotation...")
results = await crawler.arun_many(
urls=urls,
config=run_config
)
for result in results:
if result.success:
ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
current_proxy = run_config.proxy_config if run_config.proxy_config else None
if current_proxy and ip_match:
print(f"URL {result.url}")
print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
verified = ip_match.group(0) == current_proxy.ip
if verified:
print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
else:
print("❌ Proxy failed or IP mismatch!")
print("---")
asyncio.run(main())
if __name__ == "__main__":
import asyncio
asyncio.run(main())
```

View File

@@ -273,7 +273,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig, LLMContentFilter, DefaultMarkdownGenerator
from crawl4ai import JsonCssExtractionStrategy
async def main():
@@ -298,7 +298,7 @@ async def main():
# 3) Example LLM content filtering
gemini_config = LLMConfig(
provider="gemini/gemini-1.5-pro"
provider="gemini/gemini-1.5-pro",
api_token = "env:GEMINI_API_TOKEN"
)
@@ -322,8 +322,9 @@ async def main():
)
md_generator = DefaultMarkdownGenerator(
content_filter=filter,
options={"ignore_links": True}
content_filter=filter,
options={"ignore_links": True}
)
# 4) Crawler run config: skip cache, use extraction
run_conf = CrawlerRunConfig(

View File

@@ -17,6 +17,9 @@
- [Configuration Reference](#configuration-reference)
- [Best Practices & Tips](#best-practices--tips)
## Installation
The Crawl4AI CLI will be installed automatically when you install the library.
## Basic Usage
The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library:

View File

@@ -8,11 +8,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`,
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import CrawlerRunConfig
async def crawl_web():
config = CrawlerRunConfig(bypass_cache=True)
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://en.wikipedia.org/wiki/apple",
@@ -33,13 +33,13 @@ To crawl a local HTML file, prefix the file path with `file://`.
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import CrawlerRunConfig
async def crawl_local_file():
local_file_path = "/path/to/apple.html" # Replace with your file path
file_url = f"file://{local_file_path}"
config = CrawlerRunConfig(bypass_cache=True)
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url=file_url, config=config)
@@ -93,7 +93,7 @@ import os
import sys
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import CrawlerRunConfig
async def main():
@@ -104,7 +104,7 @@ async def main():
async with AsyncWebCrawler() as crawler:
# Step 1: Crawl the Web URL
print("\n=== Step 1: Crawling the Wikipedia URL ===")
web_config = CrawlerRunConfig(bypass_cache=True)
web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
result = await crawler.arun(url=wikipedia_url, config=web_config)
if not result.success:
@@ -119,7 +119,7 @@ async def main():
# Step 2: Crawl from the Local HTML File
print("=== Step 2: Crawling from the Local HTML File ===")
file_url = f"file://{html_file_path.resolve()}"
file_config = CrawlerRunConfig(bypass_cache=True)
file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
local_result = await crawler.arun(url=file_url, config=file_config)
if not local_result.success:
@@ -135,7 +135,7 @@ async def main():
with open(html_file_path, 'r', encoding='utf-8') as f:
raw_html_content = f.read()
raw_html_url = f"raw:{raw_html_content}"
raw_config = CrawlerRunConfig(bypass_cache=True)
raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
raw_result = await crawler.arun(url=raw_html_url, config=raw_config)
if not raw_result.success:

View File

@@ -201,6 +201,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator)
- **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query.
- **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more.
- **`use_stemming`** *(default `True`)*: If enabled, variations of words match (e.g., “learn,” “learning,” “learnt”).
- **`language (str)`**: Language for stemming (default: 'english').
**No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results.
@@ -233,7 +234,7 @@ prune_filter = PruningContentFilter(
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import LLMContentFilter
async def main():
@@ -255,9 +256,12 @@ async def main():
chunk_token_threshold=4096, # Adjust based on your needs
verbose=True
)
md_generator = DefaultMarkdownGenerator(
content_filter=filter,
options={"ignore_links": True}
)
config = CrawlerRunConfig(
content_filter=filter
markdown_generator=md_generator,
)
async with AsyncWebCrawler() as crawler: