feat(docs): update examples and documentation to replace bypass_cache with cache_mode for improved clarity
This commit is contained in:
24
README.md
24
README.md
@@ -11,21 +11,19 @@
|
|||||||
|
|
||||||
Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐
|
Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐
|
||||||
|
|
||||||
## 🌟 Meet the Crawl4AI Assistant: Your Copilot for Crawling
|
## New in 0.3.74 ✨
|
||||||
|
|
||||||
Use the [Crawl4AI GPT Assistant](https://tinyurl.com/crawl4ai-gpt) as your AI-powered copilot! With this assistant, you can:
|
- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster, often completing in under 100 milliseconds (excluding web fetch time)!
|
||||||
|
- 📥 **Download Mastery:** Control downloads, specify folders, and track files within the `CrawlResult` object.
|
||||||
|
- 🔎 **Relevance Filtering:** Extract the most important content with the new `RelevanceContentFilter` and BM25 algorithm. Control filtering with the `fit_markdown` flag.
|
||||||
|
- 🗂️ **Local & Raw HTML:** Crawl local files (`file://`) and raw HTML strings (`raw:`) directly.
|
||||||
|
- 🤖 **Browser Boss:** Manage browser sessions with persistent contexts, process monitoring, and tf-playwright-stealth integration. Configure using `use_managed_browser`, `user_data_dir`, and `use_persistent_context` parameters.
|
||||||
|
- ☁️ **API & Cache Boost:** CORS support, static file serving, and a new filesystem-based cache for blazing-fast performance. Fine-tune caching with the `CacheMode` enum (ENABLED, DISABLED, READ_ONLY, WRITE_ONLY, BYPASS) and the `always_bypass_cache` parameter.
|
||||||
|
- 🔒 **API Security:** Protect your API server with token-based authentication using the `CRAWL4AI_API_TOKEN` environment variable.
|
||||||
|
- 🔄 **Synchronous & Direct Crawling:** Get immediate results with `/crawl_sync` or bypass the task queue with `/crawl_direct`.
|
||||||
|
- 🛠️ **Database Migration:** A new `crawl4ai-migrate` command ensures smooth upgrades and data integrity between versions.
|
||||||
|
- 🐛 **Squashed Bugs:** Fixed browser context issues in Docker, memory leaks, enhanced error handling, and improved HTML parsing.
|
||||||
|
|
||||||
- 🧑💻 Generate code for complex crawling and extraction tasks
|
|
||||||
- 💡 Get tailored support and examples
|
|
||||||
- 📘 Learn Crawl4AI faster with step-by-step guidance
|
|
||||||
|
|
||||||
## New in 0.3.73 ✨
|
|
||||||
|
|
||||||
- 🐳 Docker Ready: Full API server with seamless deployment & scaling
|
|
||||||
- 🎯 Browser Takeover: Use your own browser with cookies & history intact (CDP support)
|
|
||||||
- 📝 Mockdown+: Enhanced tag preservation & content extraction
|
|
||||||
- ⚡️ Parallel Power: Supercharged multi-URL crawling performance
|
|
||||||
- 🌟 And many more exciting updates...
|
|
||||||
|
|
||||||
## Try it Now!
|
## Try it Now!
|
||||||
|
|
||||||
|
|||||||
@@ -10,14 +10,14 @@ import asyncio
|
|||||||
from .models import CrawlResult
|
from .models import CrawlResult
|
||||||
from .async_database import async_db_manager
|
from .async_database import async_db_manager
|
||||||
from .chunking_strategy import *
|
from .chunking_strategy import *
|
||||||
|
from .content_filter_strategy import *
|
||||||
from .extraction_strategy import *
|
from .extraction_strategy import *
|
||||||
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
|
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
|
||||||
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
|
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
|
||||||
from .content_scrapping_strategy import WebScrapingStrategy
|
from .content_scrapping_strategy import WebScrapingStrategy
|
||||||
from .config import (
|
from .config import (
|
||||||
MIN_WORD_THRESHOLD,
|
MIN_WORD_THRESHOLD,
|
||||||
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||||
SHOW_DEPRECATION_WARNINGS # New import
|
|
||||||
)
|
)
|
||||||
from .utils import (
|
from .utils import (
|
||||||
sanitize_input_encode,
|
sanitize_input_encode,
|
||||||
@@ -49,7 +49,7 @@ class AsyncWebCrawler:
|
|||||||
)
|
)
|
||||||
|
|
||||||
To disable deprecation warnings:
|
To disable deprecation warnings:
|
||||||
Set SHOW_DEPRECATION_WARNINGS = False in config.py
|
Pass warning=False to suppress the warning.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -85,11 +85,11 @@ class AsyncWebCrawler:
|
|||||||
|
|
||||||
# Handle deprecated parameter
|
# Handle deprecated parameter
|
||||||
if always_by_pass_cache is not None:
|
if always_by_pass_cache is not None:
|
||||||
if SHOW_DEPRECATION_WARNINGS:
|
if kwargs.get("warning", True):
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"'always_by_pass_cache' is deprecated and will be removed in version X.X.X. "
|
"'always_by_pass_cache' is deprecated and will be removed in version X.X.X. "
|
||||||
"Use 'always_bypass_cache' instead. "
|
"Use 'always_bypass_cache' instead. "
|
||||||
"Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.",
|
"Pass warning=False to suppress this warning.",
|
||||||
DeprecationWarning,
|
DeprecationWarning,
|
||||||
stacklevel=2
|
stacklevel=2
|
||||||
)
|
)
|
||||||
@@ -126,6 +126,7 @@ class AsyncWebCrawler:
|
|||||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||||
extraction_strategy: ExtractionStrategy = None,
|
extraction_strategy: ExtractionStrategy = None,
|
||||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||||
|
content_filter: RelevantContentFilter = None,
|
||||||
cache_mode: Optional[CacheMode] = None,
|
cache_mode: Optional[CacheMode] = None,
|
||||||
# Deprecated parameters
|
# Deprecated parameters
|
||||||
bypass_cache: bool = False,
|
bypass_cache: bool = False,
|
||||||
@@ -172,7 +173,7 @@ class AsyncWebCrawler:
|
|||||||
try:
|
try:
|
||||||
# Handle deprecated parameters
|
# Handle deprecated parameters
|
||||||
if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
|
if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
|
||||||
if SHOW_DEPRECATION_WARNINGS:
|
if kwargs.get("warning", True):
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"Cache control boolean flags are deprecated and will be removed in version X.X.X. "
|
"Cache control boolean flags are deprecated and will be removed in version X.X.X. "
|
||||||
"Use 'cache_mode' parameter instead. Examples:\n"
|
"Use 'cache_mode' parameter instead. Examples:\n"
|
||||||
@@ -180,7 +181,7 @@ class AsyncWebCrawler:
|
|||||||
"- For disable_cache=True, use cache_mode=CacheMode.DISABLED\n"
|
"- For disable_cache=True, use cache_mode=CacheMode.DISABLED\n"
|
||||||
"- For no_cache_read=True, use cache_mode=CacheMode.WRITE_ONLY\n"
|
"- For no_cache_read=True, use cache_mode=CacheMode.WRITE_ONLY\n"
|
||||||
"- For no_cache_write=True, use cache_mode=CacheMode.READ_ONLY\n"
|
"- For no_cache_write=True, use cache_mode=CacheMode.READ_ONLY\n"
|
||||||
"Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.",
|
"Pass warning=False to suppress this warning.",
|
||||||
DeprecationWarning,
|
DeprecationWarning,
|
||||||
stacklevel=2
|
stacklevel=2
|
||||||
)
|
)
|
||||||
@@ -257,6 +258,7 @@ class AsyncWebCrawler:
|
|||||||
word_count_threshold=word_count_threshold,
|
word_count_threshold=word_count_threshold,
|
||||||
extraction_strategy=extraction_strategy,
|
extraction_strategy=extraction_strategy,
|
||||||
chunking_strategy=chunking_strategy,
|
chunking_strategy=chunking_strategy,
|
||||||
|
content_filter=content_filter,
|
||||||
css_selector=css_selector,
|
css_selector=css_selector,
|
||||||
screenshot=screenshot_data,
|
screenshot=screenshot_data,
|
||||||
verbose=verbose,
|
verbose=verbose,
|
||||||
@@ -308,6 +310,7 @@ class AsyncWebCrawler:
|
|||||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||||
extraction_strategy: ExtractionStrategy = None,
|
extraction_strategy: ExtractionStrategy = None,
|
||||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||||
|
content_filter: RelevantContentFilter = None,
|
||||||
cache_mode: Optional[CacheMode] = None,
|
cache_mode: Optional[CacheMode] = None,
|
||||||
# Deprecated parameters
|
# Deprecated parameters
|
||||||
bypass_cache: bool = False,
|
bypass_cache: bool = False,
|
||||||
@@ -335,14 +338,15 @@ class AsyncWebCrawler:
|
|||||||
Returns:
|
Returns:
|
||||||
List[CrawlResult]: Results for each URL
|
List[CrawlResult]: Results for each URL
|
||||||
"""
|
"""
|
||||||
if bypass_cache and SHOW_DEPRECATION_WARNINGS:
|
if bypass_cache:
|
||||||
warnings.warn(
|
if kwargs.get("warning", True):
|
||||||
"'bypass_cache' is deprecated and will be removed in version X.X.X. "
|
warnings.warn(
|
||||||
"Use 'cache_mode=CacheMode.BYPASS' instead. "
|
"'bypass_cache' is deprecated and will be removed in version X.X.X. "
|
||||||
"Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.",
|
"Use 'cache_mode=CacheMode.BYPASS' instead. "
|
||||||
DeprecationWarning,
|
"Pass warning=False to suppress this warning.",
|
||||||
stacklevel=2
|
DeprecationWarning,
|
||||||
)
|
stacklevel=2
|
||||||
|
)
|
||||||
if cache_mode is None:
|
if cache_mode is None:
|
||||||
cache_mode = CacheMode.BYPASS
|
cache_mode = CacheMode.BYPASS
|
||||||
|
|
||||||
@@ -356,6 +360,7 @@ class AsyncWebCrawler:
|
|||||||
word_count_threshold=word_count_threshold,
|
word_count_threshold=word_count_threshold,
|
||||||
extraction_strategy=extraction_strategy,
|
extraction_strategy=extraction_strategy,
|
||||||
chunking_strategy=chunking_strategy,
|
chunking_strategy=chunking_strategy,
|
||||||
|
content_filter=content_filter,
|
||||||
cache_mode=cache_mode,
|
cache_mode=cache_mode,
|
||||||
css_selector=css_selector,
|
css_selector=css_selector,
|
||||||
screenshot=screenshot,
|
screenshot=screenshot,
|
||||||
@@ -377,6 +382,7 @@ class AsyncWebCrawler:
|
|||||||
word_count_threshold: int,
|
word_count_threshold: int,
|
||||||
extraction_strategy: ExtractionStrategy,
|
extraction_strategy: ExtractionStrategy,
|
||||||
chunking_strategy: ChunkingStrategy,
|
chunking_strategy: ChunkingStrategy,
|
||||||
|
content_filter: RelevantContentFilter,
|
||||||
css_selector: str,
|
css_selector: str,
|
||||||
screenshot: str,
|
screenshot: str,
|
||||||
verbose: bool,
|
verbose: bool,
|
||||||
@@ -397,6 +403,7 @@ class AsyncWebCrawler:
|
|||||||
image_description_min_word_threshold=kwargs.get(
|
image_description_min_word_threshold=kwargs.get(
|
||||||
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||||
),
|
),
|
||||||
|
content_filter = content_filter,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -532,14 +532,13 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
|
|
||||||
fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content."
|
fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content."
|
||||||
fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content."
|
fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content."
|
||||||
if kwargs.get('fit_markdown', False):
|
if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False):
|
||||||
# cleaner = ContentCleaningStrategy()
|
content_filter = kwargs.get('content_filter', None)
|
||||||
# fit_html = cleaner.clean(cleaned_html)
|
if not content_filter:
|
||||||
# fit_markdown = h.handle(fit_html)
|
content_filter = BM25ContentFilter(
|
||||||
content_filter = BM25ContentFilter(
|
user_query= kwargs.get('fit_markdown_user_query', None),
|
||||||
user_query= kwargs.get('fit_markdown_user_query', None),
|
bm25_threshold= kwargs.get('fit_markdown_bm25_threshold', 1.0)
|
||||||
bm25_threshold= kwargs.get('fit_markdown_bm25_threshold', 1.0)
|
)
|
||||||
)
|
|
||||||
fit_html = content_filter.filter_content(html)
|
fit_html = content_filter.filter_content(html)
|
||||||
fit_html = '\n'.join('<div>{}</div>'.format(s) for s in fit_html)
|
fit_html = '\n'.join('<div>{}</div>'.format(s) for s in fit_html)
|
||||||
fit_markdown = h.handle(fit_html)
|
fit_markdown = h.handle(fit_html)
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ async def content_filtering_example():
|
|||||||
|
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="https://openai.com/blog",
|
url="https://openai.com/blog",
|
||||||
extraction_strategy=content_filter
|
content_filter=content_filter
|
||||||
)
|
)
|
||||||
|
|
||||||
print(f"Filtered content: {result.extracted_content}")
|
print(f"Filtered content: {result.extracted_content}")
|
||||||
|
|||||||
84
docs/md_v2/advanced/managed_browser.md
Normal file
84
docs/md_v2/advanced/managed_browser.md
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
# Content Filtering in Crawl4AI
|
||||||
|
|
||||||
|
This guide explains how to use content filtering strategies in Crawl4AI to extract the most relevant information from crawled web pages. You'll learn how to use the built-in `BM25ContentFilter` and how to create your own custom content filtering strategies.
|
||||||
|
|
||||||
|
## Relevance Content Filter
|
||||||
|
|
||||||
|
The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks.
|
||||||
|
|
||||||
|
## BM25 Algorithm
|
||||||
|
|
||||||
|
The `BM25ContentFilter` uses the BM25 algorithm, a ranking function used in information retrieval to estimate the relevance of documents to a given search query. In Crawl4AI, this algorithm helps to identify and extract text chunks that are most relevant to the page's metadata or a user-specified query.
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
To use the `BM25ContentFilter`, initialize it and then pass it as the `extraction_strategy` parameter to the `arun` method of the crawler.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
from crawl4ai.content_filter_strategy import BM25ContentFilter
|
||||||
|
|
||||||
|
async def filter_content(url, query=None):
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
content_filter = BM25ContentFilter(user_query=query)
|
||||||
|
result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering
|
||||||
|
if result.success:
|
||||||
|
print(f"Filtered Content (JSON):\n{result.extracted_content}")
|
||||||
|
print(f"\nFiltered Markdown:\n{result.fit_markdown}") # New field in CrawlResult object
|
||||||
|
print(f"\nFiltered HTML:\n{result.fit_html}") # New field in CrawlResult object. Note that raw HTML may have tags re-organized due to internal parsing.
|
||||||
|
else:
|
||||||
|
print("Error:", result.error_message)
|
||||||
|
|
||||||
|
# Example usage:
|
||||||
|
asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health")) # with query
|
||||||
|
asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple")) # without query, metadata will be used as the query.
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
### Parameters
|
||||||
|
|
||||||
|
- **`user_query`**: (Optional) A string representing the search query. If not provided, the filter extracts relevant metadata (title, description, keywords) from the page and uses that as the query.
|
||||||
|
- **`bm25_threshold`**: (Optional, default 1.0) A float value that controls the threshold for relevance. Higher values result in stricter filtering, returning only the most relevant text chunks. Lower values result in more lenient filtering.
|
||||||
|
|
||||||
|
|
||||||
|
## Fit Markdown Flag
|
||||||
|
|
||||||
|
Setting the `fit_markdown` flag to `True` in the `arun` method activates the BM25 content filtering during the crawl. The `fit_markdown` parameter instructs the scraper to extract and clean the HTML, primarily to prepare for a Large Language Model that cannot process large amounts of data. Setting this flag not only improves the quality of the extracted content but also adds the filtered content to two new attributes in the returned `CrawlResult` object: `fit_markdown` and `fit_html`.
|
||||||
|
|
||||||
|
|
||||||
|
## Custom Content Filtering Strategies
|
||||||
|
|
||||||
|
You can create your own custom filtering strategies by inheriting from the `RelevantContentFilter` class and implementing the `filter_content` method. This allows you to tailor the filtering logic to your specific needs.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai.content_filter_strategy import RelevantContentFilter
|
||||||
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
class MyCustomFilter(RelevantContentFilter):
|
||||||
|
def filter_content(self, html: str) -> List[str]:
|
||||||
|
soup = BeautifulSoup(html, 'lxml')
|
||||||
|
# Implement custom filtering logic here
|
||||||
|
# Example: extract all paragraphs within divs with class "article-body"
|
||||||
|
filtered_paragraphs = []
|
||||||
|
for tag in soup.select("div.article-body p"):
|
||||||
|
if isinstance(tag, Tag):
|
||||||
|
filtered_paragraphs.append(str(tag)) # Add the cleaned HTML element.
|
||||||
|
return filtered_paragraphs
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
async def custom_filter_demo(url: str):
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
custom_filter = MyCustomFilter()
|
||||||
|
result = await crawler.arun(url, extraction_strategy=custom_filter)
|
||||||
|
if result.success:
|
||||||
|
print(result.extracted_content)
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
This example demonstrates extracting paragraphs from a specific div class. You can customize this logic to implement different filtering strategies, use regular expressions, analyze text density, or apply other relevant techniques.
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
Content filtering strategies provide a powerful way to refine the output of your crawls. By using `BM25ContentFilter` or creating custom strategies, you can focus on the most pertinent information and improve the efficiency of your data processing pipeline.
|
||||||
@@ -30,7 +30,7 @@ Let's start with a basic example of session-based crawling:
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler
|
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||||
|
|
||||||
async def basic_session_crawl():
|
async def basic_session_crawl():
|
||||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||||
@@ -43,7 +43,7 @@ async def basic_session_crawl():
|
|||||||
session_id=session_id,
|
session_id=session_id,
|
||||||
js_code="document.querySelector('.load-more-button').click();" if page > 0 else None,
|
js_code="document.querySelector('.load-more-button').click();" if page > 0 else None,
|
||||||
css_selector=".content-item",
|
css_selector=".content-item",
|
||||||
bypass_cache=True
|
cache_mode=CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
|
|
||||||
print(f"Page {page + 1}: Found {result.extracted_content.count('.content-item')} items")
|
print(f"Page {page + 1}: Found {result.extracted_content.count('.content-item')} items")
|
||||||
@@ -102,7 +102,7 @@ async def advanced_session_crawl_with_hooks():
|
|||||||
session_id=session_id,
|
session_id=session_id,
|
||||||
css_selector="li.commit-item",
|
css_selector="li.commit-item",
|
||||||
js_code=js_next_page if page > 0 else None,
|
js_code=js_next_page if page > 0 else None,
|
||||||
bypass_cache=True,
|
cache_mode=CacheMode.BYPASS,
|
||||||
js_only=page > 0
|
js_only=page > 0
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -174,7 +174,7 @@ async def integrated_js_and_wait_crawl():
|
|||||||
extraction_strategy=extraction_strategy,
|
extraction_strategy=extraction_strategy,
|
||||||
js_code=js_next_page_and_wait if page > 0 else None,
|
js_code=js_next_page_and_wait if page > 0 else None,
|
||||||
js_only=page > 0,
|
js_only=page > 0,
|
||||||
bypass_cache=True
|
cache_mode=CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
|
|
||||||
commits = json.loads(result.extracted_content)
|
commits = json.loads(result.extracted_content)
|
||||||
@@ -241,7 +241,7 @@ async def wait_for_parameter_crawl():
|
|||||||
js_code=js_next_page if page > 0 else None,
|
js_code=js_next_page if page > 0 else None,
|
||||||
wait_for=wait_for if page > 0 else None,
|
wait_for=wait_for if page > 0 else None,
|
||||||
js_only=page > 0,
|
js_only=page > 0,
|
||||||
bypass_cache=True
|
cache_mode=CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
|
|
||||||
commits = json.loads(result.extracted_content)
|
commits = json.loads(result.extracted_content)
|
||||||
|
|||||||
@@ -75,7 +75,7 @@ async def crawl_dynamic_content():
|
|||||||
js_code=js_next_page if page > 0 else None,
|
js_code=js_next_page if page > 0 else None,
|
||||||
wait_for=wait_for if page > 0 else None,
|
wait_for=wait_for if page > 0 else None,
|
||||||
js_only=page > 0,
|
js_only=page > 0,
|
||||||
bypass_cache=True
|
cache_mode=CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
|
|
||||||
if result.success:
|
if result.success:
|
||||||
|
|||||||
@@ -8,11 +8,26 @@ The following parameters can be passed to the `arun()` method. They are organize
|
|||||||
await crawler.arun(
|
await crawler.arun(
|
||||||
url="https://example.com", # Required: URL to crawl
|
url="https://example.com", # Required: URL to crawl
|
||||||
verbose=True, # Enable detailed logging
|
verbose=True, # Enable detailed logging
|
||||||
bypass_cache=False, # Skip cache for this request
|
cache_mode=CacheMode.ENABLED, # Control cache behavior
|
||||||
warmup=True # Whether to run warmup check
|
warmup=True # Whether to run warmup check
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Cache Control
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import CacheMode
|
||||||
|
|
||||||
|
await crawler.arun(
|
||||||
|
cache_mode=CacheMode.ENABLED, # Normal caching (read/write)
|
||||||
|
# Other cache modes:
|
||||||
|
# cache_mode=CacheMode.DISABLED # No caching at all
|
||||||
|
# cache_mode=CacheMode.READ_ONLY # Only read from cache
|
||||||
|
# cache_mode=CacheMode.WRITE_ONLY # Only write to cache
|
||||||
|
# cache_mode=CacheMode.BYPASS # Skip cache for this operation
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
## Content Processing Parameters
|
## Content Processing Parameters
|
||||||
|
|
||||||
### Text Processing
|
### Text Processing
|
||||||
@@ -162,14 +177,13 @@ await crawler.arun(
|
|||||||
|
|
||||||
## Parameter Interactions and Notes
|
## Parameter Interactions and Notes
|
||||||
|
|
||||||
1. **Magic Mode Combinations**
|
1. **Cache and Performance Setup**
|
||||||
```python
|
```python
|
||||||
# Full anti-detection setup
|
# Optimal caching for repeated crawls
|
||||||
await crawler.arun(
|
await crawler.arun(
|
||||||
magic=True,
|
cache_mode=CacheMode.ENABLED,
|
||||||
headless=False,
|
word_count_threshold=10,
|
||||||
simulate_user=True,
|
process_iframes=False
|
||||||
override_navigator=True
|
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -179,7 +193,8 @@ await crawler.arun(
|
|||||||
await crawler.arun(
|
await crawler.arun(
|
||||||
js_code="window.scrollTo(0, document.body.scrollHeight);",
|
js_code="window.scrollTo(0, document.body.scrollHeight);",
|
||||||
wait_for="css:.lazy-content",
|
wait_for="css:.lazy-content",
|
||||||
delay_before_return_html=2.0
|
delay_before_return_html=2.0,
|
||||||
|
cache_mode=CacheMode.WRITE_ONLY # Cache results after dynamic load
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -192,7 +207,8 @@ await crawler.arun(
|
|||||||
extraction_strategy=my_strategy,
|
extraction_strategy=my_strategy,
|
||||||
chunking_strategy=my_chunking,
|
chunking_strategy=my_chunking,
|
||||||
process_iframes=True,
|
process_iframes=True,
|
||||||
remove_overlay_elements=True
|
remove_overlay_elements=True,
|
||||||
|
cache_mode=CacheMode.ENABLED
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -201,7 +217,7 @@ await crawler.arun(
|
|||||||
1. **Performance Optimization**
|
1. **Performance Optimization**
|
||||||
```python
|
```python
|
||||||
await crawler.arun(
|
await crawler.arun(
|
||||||
bypass_cache=False, # Use cache when possible
|
cache_mode=CacheMode.ENABLED, # Use full caching
|
||||||
word_count_threshold=10, # Filter out noise
|
word_count_threshold=10, # Filter out noise
|
||||||
process_iframes=False # Skip iframes if not needed
|
process_iframes=False # Skip iframes if not needed
|
||||||
)
|
)
|
||||||
@@ -212,7 +228,8 @@ await crawler.arun(
|
|||||||
await crawler.arun(
|
await crawler.arun(
|
||||||
magic=True, # Enable anti-detection
|
magic=True, # Enable anti-detection
|
||||||
delay_before_return_html=1.0, # Wait for dynamic content
|
delay_before_return_html=1.0, # Wait for dynamic content
|
||||||
page_timeout=60000 # Longer timeout for slow pages
|
page_timeout=60000, # Longer timeout for slow pages
|
||||||
|
cache_mode=CacheMode.WRITE_ONLY # Cache results after successful crawl
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -221,6 +238,7 @@ await crawler.arun(
|
|||||||
await crawler.arun(
|
await crawler.arun(
|
||||||
remove_overlay_elements=True, # Remove popups
|
remove_overlay_elements=True, # Remove popups
|
||||||
excluded_tags=['nav', 'aside'],# Remove unnecessary elements
|
excluded_tags=['nav', 'aside'],# Remove unnecessary elements
|
||||||
keep_data_attributes=False # Remove data attributes
|
keep_data_attributes=False, # Remove data attributes
|
||||||
|
cache_mode=CacheMode.ENABLED # Use cache for faster processing
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
@@ -20,6 +20,7 @@ class CrawlResult(BaseModel):
|
|||||||
fit_html: Optional[str] = None # Most relevant HTML content
|
fit_html: Optional[str] = None # Most relevant HTML content
|
||||||
markdown: Optional[str] = None # HTML converted to markdown
|
markdown: Optional[str] = None # HTML converted to markdown
|
||||||
fit_markdown: Optional[str] = None # Most relevant markdown content
|
fit_markdown: Optional[str] = None # Most relevant markdown content
|
||||||
|
downloaded_files: Optional[List[str]] = None # Downloaded files
|
||||||
|
|
||||||
# Extracted Data
|
# Extracted Data
|
||||||
extracted_content: Optional[str] = None # Content from extraction strategy
|
extracted_content: Optional[str] = None # Content from extraction strategy
|
||||||
|
|||||||
@@ -32,4 +32,5 @@
|
|||||||
| async_webcrawler.py | warmup | `kwargs.get("warmup", True)` | AsyncWebCrawler | Initialize crawler with warmup request |
|
| async_webcrawler.py | warmup | `kwargs.get("warmup", True)` | AsyncWebCrawler | Initialize crawler with warmup request |
|
||||||
| async_webcrawler.py | session_id | `kwargs.get("session_id", None)` | AsyncWebCrawler | Session identifier for browser reuse |
|
| async_webcrawler.py | session_id | `kwargs.get("session_id", None)` | AsyncWebCrawler | Session identifier for browser reuse |
|
||||||
| async_webcrawler.py | only_text | `kwargs.get("only_text", False)` | AsyncWebCrawler | Extract only text content |
|
| async_webcrawler.py | only_text | `kwargs.get("only_text", False)` | AsyncWebCrawler | Extract only text content |
|
||||||
| async_webcrawler.py | bypass_cache | `kwargs.get("bypass_cache", False)` | AsyncWebCrawler | Skip cache and force fresh crawl |
|
| async_webcrawler.py | bypass_cache | `kwargs.get("bypass_cache", False)` | AsyncWebCrawler | Skip cache and force fresh crawl |
|
||||||
|
| async_webcrawler.py | cache_mode | `kwargs.get("cache_mode", CacheMode.ENABLE)` | AsyncWebCrawler | Cache handling mode for request |
|
||||||
84
docs/md_v2/basic/content_filtering.md
Normal file
84
docs/md_v2/basic/content_filtering.md
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
# Content Filtering in Crawl4AI
|
||||||
|
|
||||||
|
This guide explains how to use content filtering strategies in Crawl4AI to extract the most relevant information from crawled web pages. You'll learn how to use the built-in `BM25ContentFilter` and how to create your own custom content filtering strategies.
|
||||||
|
|
||||||
|
## Relevance Content Filter
|
||||||
|
|
||||||
|
The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks.
|
||||||
|
|
||||||
|
## BM25 Algorithm
|
||||||
|
|
||||||
|
The `BM25ContentFilter` uses the BM25 algorithm, a ranking function used in information retrieval to estimate the relevance of documents to a given search query. In Crawl4AI, this algorithm helps to identify and extract text chunks that are most relevant to the page's metadata or a user-specified query.
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
To use the `BM25ContentFilter`, initialize it and then pass it as the `extraction_strategy` parameter to the `arun` method of the crawler.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
from crawl4ai.content_filter_strategy import BM25ContentFilter
|
||||||
|
|
||||||
|
async def filter_content(url, query=None):
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
content_filter = BM25ContentFilter(user_query=query)
|
||||||
|
result = await crawler.arun(url=url, content_filter=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering
|
||||||
|
if result.success:
|
||||||
|
print(f"Filtered Content (JSON):\n{result.extracted_content}")
|
||||||
|
print(f"\nFiltered Markdown:\n{result.fit_markdown}") # New field in CrawlResult object
|
||||||
|
print(f"\nFiltered HTML:\n{result.fit_html}") # New field in CrawlResult object. Note that raw HTML may have tags re-organized due to internal parsing.
|
||||||
|
else:
|
||||||
|
print("Error:", result.error_message)
|
||||||
|
|
||||||
|
# Example usage:
|
||||||
|
asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health")) # with query
|
||||||
|
asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple")) # without query, metadata will be used as the query.
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
### Parameters
|
||||||
|
|
||||||
|
- **`user_query`**: (Optional) A string representing the search query. If not provided, the filter extracts relevant metadata (title, description, keywords) from the page and uses that as the query.
|
||||||
|
- **`bm25_threshold`**: (Optional, default 1.0) A float value that controls the threshold for relevance. Higher values result in stricter filtering, returning only the most relevant text chunks. Lower values result in more lenient filtering.
|
||||||
|
|
||||||
|
|
||||||
|
## Fit Markdown Flag
|
||||||
|
|
||||||
|
Setting the `fit_markdown` flag to `True` in the `arun` method activates the BM25 content filtering during the crawl. The `fit_markdown` parameter instructs the scraper to extract and clean the HTML, primarily to prepare for a Large Language Model that cannot process large amounts of data. Setting this flag not only improves the quality of the extracted content but also adds the filtered content to two new attributes in the returned `CrawlResult` object: `fit_markdown` and `fit_html`.
|
||||||
|
|
||||||
|
|
||||||
|
## Custom Content Filtering Strategies
|
||||||
|
|
||||||
|
You can create your own custom filtering strategies by inheriting from the `RelevantContentFilter` class and implementing the `filter_content` method. This allows you to tailor the filtering logic to your specific needs.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai.content_filter_strategy import RelevantContentFilter
|
||||||
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
class MyCustomFilter(RelevantContentFilter):
|
||||||
|
def filter_content(self, html: str) -> List[str]:
|
||||||
|
soup = BeautifulSoup(html, 'lxml')
|
||||||
|
# Implement custom filtering logic here
|
||||||
|
# Example: extract all paragraphs within divs with class "article-body"
|
||||||
|
filtered_paragraphs = []
|
||||||
|
for tag in soup.select("div.article-body p"):
|
||||||
|
if isinstance(tag, Tag):
|
||||||
|
filtered_paragraphs.append(str(tag)) # Add the cleaned HTML element.
|
||||||
|
return filtered_paragraphs
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
async def custom_filter_demo(url: str):
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
custom_filter = MyCustomFilter()
|
||||||
|
result = await crawler.arun(url, content_filter=custom_filter)
|
||||||
|
if result.success:
|
||||||
|
print(result.extracted_content)
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
This example demonstrates extracting paragraphs from a specific div class. You can customize this logic to implement different filtering strategies, use regular expressions, analyze text density, or apply other relevant techniques.
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
Content filtering strategies provide a powerful way to refine the output of your crawls. By using `BM25ContentFilter` or creating custom strategies, you can focus on the most pertinent information and improve the efficiency of your data processing pipeline.
|
||||||
148
docs/md_v2/basic/file-download.md
Normal file
148
docs/md_v2/basic/file-download.md
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
# Download Handling in Crawl4AI
|
||||||
|
|
||||||
|
This guide explains how to use Crawl4AI to handle file downloads during crawling. You'll learn how to trigger downloads, specify download locations, and access downloaded files.
|
||||||
|
|
||||||
|
## Enabling Downloads
|
||||||
|
|
||||||
|
By default, Crawl4AI does not download files. To enable downloads, set the `accept_downloads` parameter to `True` in either the `AsyncWebCrawler` constructor or the `arun` method.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
async with AsyncWebCrawler(accept_downloads=True) as crawler: # Globally enable downloads
|
||||||
|
# ... your crawling logic ...
|
||||||
|
|
||||||
|
asyncio.run(main())
|
||||||
|
```
|
||||||
|
|
||||||
|
Or, enable it for a specific crawl:
|
||||||
|
|
||||||
|
```python
|
||||||
|
async def main():
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="...", accept_downloads=True)
|
||||||
|
# ...
|
||||||
|
```
|
||||||
|
|
||||||
|
## Specifying Download Location
|
||||||
|
|
||||||
|
You can specify the download directory using the `downloads_path` parameter. If not provided, Crawl4AI creates a "downloads" directory inside the `.crawl4ai` folder in your home directory.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# ... inside your crawl function:
|
||||||
|
|
||||||
|
downloads_path = os.path.join(os.getcwd(), "my_downloads") # Custom download path
|
||||||
|
os.makedirs(downloads_path, exist_ok=True)
|
||||||
|
|
||||||
|
result = await crawler.arun(url="...", downloads_path=downloads_path, accept_downloads=True)
|
||||||
|
|
||||||
|
# ...
|
||||||
|
```
|
||||||
|
|
||||||
|
If you are setting it globally, provide the path to the AsyncWebCrawler:
|
||||||
|
```python
|
||||||
|
async def crawl_with_downloads(url: str, download_path: str):
|
||||||
|
async with AsyncWebCrawler(
|
||||||
|
accept_downloads=True,
|
||||||
|
downloads_path=download_path, # or set it on arun
|
||||||
|
verbose=True
|
||||||
|
) as crawler:
|
||||||
|
        result = await crawler.arun(url=url)  # downloads are already enabled via the constructor above.
|
||||||
|
# ...
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Triggering Downloads
|
||||||
|
|
||||||
|
Downloads are typically triggered by user interactions on a web page (e.g., clicking a download button). You can simulate these actions with the `js_code` parameter, injecting JavaScript code to be executed within the browser context. The `wait_for` parameter may also be necessary to allow sufficient time for downloads to initiate before the crawler proceeds.
|
||||||
|
|
||||||
|
```python
|
||||||
|
result = await crawler.arun(
|
||||||
|
url="https://www.python.org/downloads/",
|
||||||
|
js_code="""
|
||||||
|
// Find and click the first Windows installer link
|
||||||
|
const downloadLink = document.querySelector('a[href$=".exe"]');
|
||||||
|
if (downloadLink) {
|
||||||
|
downloadLink.click();
|
||||||
|
}
|
||||||
|
""",
|
||||||
|
wait_for=5 # Wait for 5 seconds for the download to start
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Accessing Downloaded Files
|
||||||
|
|
||||||
|
Downloaded file paths are stored in the `downloaded_files` attribute of the returned `CrawlResult` object. This is a list of strings, with each string representing the absolute path to a downloaded file.
|
||||||
|
|
||||||
|
```python
|
||||||
|
if result.downloaded_files:
|
||||||
|
print("Downloaded files:")
|
||||||
|
for file_path in result.downloaded_files:
|
||||||
|
print(f"- {file_path}")
|
||||||
|
# Perform operations with downloaded files, e.g., check file size
|
||||||
|
file_size = os.path.getsize(file_path)
|
||||||
|
print(f"- File size: {file_size} bytes")
|
||||||
|
else:
|
||||||
|
print("No files downloaded.")
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Example: Downloading Multiple Files
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
|
||||||
|
async def download_multiple_files(url: str, download_path: str):
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(
|
||||||
|
accept_downloads=True,
|
||||||
|
downloads_path=download_path,
|
||||||
|
verbose=True
|
||||||
|
) as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
url=url,
|
||||||
|
js_code="""
|
||||||
|
// Trigger multiple downloads (example)
|
||||||
|
const downloadLinks = document.querySelectorAll('a[download]'); // Or a more specific selector
|
||||||
|
for (const link of downloadLinks) {
|
||||||
|
link.click();
|
||||||
|
await new Promise(r => setTimeout(r, 2000)); // Add a small delay between clicks if needed
|
||||||
|
}
|
||||||
|
""",
|
||||||
|
wait_for=10 # Adjust the timeout to match the expected time for all downloads to start
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.downloaded_files:
|
||||||
|
print("Downloaded files:")
|
||||||
|
for file in result.downloaded_files:
|
||||||
|
print(f"- {file}")
|
||||||
|
else:
|
||||||
|
print("No files downloaded.")
|
||||||
|
|
||||||
|
|
||||||
|
# Example usage
|
||||||
|
download_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
|
||||||
|
os.makedirs(download_path, exist_ok=True) # Create directory if it doesn't exist
|
||||||
|
|
||||||
|
|
||||||
|
asyncio.run(download_multiple_files("https://www.python.org/downloads/windows/", download_path))
|
||||||
|
```
|
||||||
|
|
||||||
|
## Important Considerations
|
||||||
|
|
||||||
|
- **Browser Context:** Downloads are managed within the browser context. Ensure your `js_code` correctly targets the download triggers on the specific web page.
|
||||||
|
- **Waiting:** Use `wait_for` to give downloads time to start if they might not begin immediately after the triggering action.
|
||||||
|
- **Error Handling:** Implement proper error handling to gracefully manage failed downloads or incorrect file paths.
|
||||||
|
- **Security:** Downloaded files should be scanned for potential security threats before use.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
This guide provides a foundation for handling downloads with Crawl4AI. You can adapt these techniques to manage downloads in various scenarios and integrate them into more complex crawling workflows.
|
||||||
@@ -8,7 +8,7 @@ First, let's import the necessary modules and create an instance of `AsyncWebCra
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler
|
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||||
@@ -42,7 +42,7 @@ async def capture_and_save_screenshot(url: str, output_path: str):
|
|||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url=url,
|
url=url,
|
||||||
screenshot=True,
|
screenshot=True,
|
||||||
bypass_cache=True
|
cache_mode=CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
|
|
||||||
if result.success and result.screenshot:
|
if result.success and result.screenshot:
|
||||||
@@ -62,15 +62,15 @@ Crawl4AI supports multiple browser engines. Here's how to use different browsers
|
|||||||
```python
|
```python
|
||||||
# Use Firefox
|
# Use Firefox
|
||||||
async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless=True) as crawler:
|
async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless=True) as crawler:
|
||||||
result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
|
result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
# Use WebKit
|
# Use WebKit
|
||||||
async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless=True) as crawler:
|
async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless=True) as crawler:
|
||||||
result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
|
result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
# Use Chromium (default)
|
# Use Chromium (default)
|
||||||
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
|
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
|
||||||
result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
|
result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS)
|
||||||
```
|
```
|
||||||
|
|
||||||
### User Simulation 🎭
|
### User Simulation 🎭
|
||||||
@@ -81,7 +81,7 @@ Simulate real user behavior to avoid detection:
|
|||||||
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
|
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="YOUR-URL-HERE",
|
url="YOUR-URL-HERE",
|
||||||
bypass_cache=True,
|
cache_mode=CacheMode.BYPASS,
|
||||||
simulate_user=True, # Causes random mouse movements and clicks
|
simulate_user=True, # Causes random mouse movements and clicks
|
||||||
override_navigator=True # Makes the browser appear more like a real user
|
override_navigator=True # Makes the browser appear more like a real user
|
||||||
)
|
)
|
||||||
@@ -99,7 +99,7 @@ async def main():
|
|||||||
print(f"First crawl result: {result1.markdown[:100]}...")
|
print(f"First crawl result: {result1.markdown[:100]}...")
|
||||||
|
|
||||||
# Force to crawl again
|
# Force to crawl again
|
||||||
result2 = await crawler.arun(url="https://www.nbcnews.com/business", bypass_cache=True)
|
result2 = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS)
|
||||||
print(f"Second crawl result: {result2.markdown[:100]}...")
|
print(f"Second crawl result: {result2.markdown[:100]}...")
|
||||||
|
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
@@ -189,7 +189,7 @@ extraction_strategy = LLMExtractionStrategy(
|
|||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="https://paulgraham.com/love.html",
|
url="https://paulgraham.com/love.html",
|
||||||
bypass_cache=True,
|
cache_mode=CacheMode.BYPASS,
|
||||||
extraction_strategy=extraction_strategy
|
extraction_strategy=extraction_strategy
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
@@ -239,7 +239,7 @@ async def crawl_dynamic_content():
|
|||||||
js_code=js_next_page if page > 0 else None,
|
js_code=js_next_page if page > 0 else None,
|
||||||
wait_for=wait_for if page > 0 else None,
|
wait_for=wait_for if page > 0 else None,
|
||||||
js_only=page > 0,
|
js_only=page > 0,
|
||||||
bypass_cache=True,
|
cache_mode=CacheMode.BYPASS,
|
||||||
headless=False,
|
headless=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -254,7 +254,7 @@ Remove overlay elements and fit content appropriately:
|
|||||||
async with AsyncWebCrawler(headless=False) as crawler:
|
async with AsyncWebCrawler(headless=False) as crawler:
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="your-url-here",
|
url="your-url-here",
|
||||||
bypass_cache=True,
|
cache_mode=CacheMode.BYPASS,
|
||||||
word_count_threshold=10,
|
word_count_threshold=10,
|
||||||
remove_overlay_elements=True,
|
remove_overlay_elements=True,
|
||||||
screenshot=True
|
screenshot=True
|
||||||
@@ -282,7 +282,7 @@ async with AsyncWebCrawler() as crawler:
|
|||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="https://www.nbcnews.com/business",
|
url="https://www.nbcnews.com/business",
|
||||||
word_count_threshold=0,
|
word_count_threshold=0,
|
||||||
bypass_cache=True,
|
cache_mode=CacheMode.BYPASS,
|
||||||
verbose=False,
|
verbose=False,
|
||||||
)
|
)
|
||||||
end = time.time()
|
end = time.time()
|
||||||
|
|||||||
@@ -12,7 +12,9 @@ from crawl4ai import AsyncWebCrawler
|
|||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
result = await crawler.arun(url="https://example.com")
|
result = await crawler.arun(
|
||||||
|
url="https://example.com"
|
||||||
|
)
|
||||||
print(result.markdown) # Print clean markdown content
|
print(result.markdown) # Print clean markdown content
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@@ -24,7 +26,7 @@ if __name__ == "__main__":
|
|||||||
The `arun()` method returns a `CrawlResult` object with several useful properties. Here's a quick overview (see [CrawlResult](../api/crawl-result.md) for complete details):
|
The `arun()` method returns a `CrawlResult` object with several useful properties. Here's a quick overview (see [CrawlResult](../api/crawl-result.md) for complete details):
|
||||||
|
|
||||||
```python
|
```python
|
||||||
result = await crawler.arun(url="https://example.com")
|
result = await crawler.arun(url="https://example.com", fit_markdown=True)
|
||||||
|
|
||||||
# Different content formats
|
# Different content formats
|
||||||
print(result.html) # Raw HTML
|
print(result.html) # Raw HTML
|
||||||
@@ -81,7 +83,7 @@ Here's a more comprehensive example showing common usage patterns:
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler
|
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||||
@@ -97,7 +99,7 @@ async def main():
|
|||||||
remove_overlay_elements=True,
|
remove_overlay_elements=True,
|
||||||
|
|
||||||
# Cache control
|
# Cache control
|
||||||
bypass_cache=False # Use cache if available
|
            cache_mode=CacheMode.ENABLED  # Use cache if available
|
||||||
)
|
)
|
||||||
|
|
||||||
if result.success:
|
if result.success:
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove
|
|||||||
extraction_type="schema",
|
extraction_type="schema",
|
||||||
instruction="Extract model names and fees for input and output tokens from the page."
|
instruction="Extract model names and fees for input and output tokens from the page."
|
||||||
),
|
),
|
||||||
bypass_cache=True
|
cache_mode=CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
print(result.extracted_content)
|
print(result.extracted_content)
|
||||||
```
|
```
|
||||||
@@ -98,7 +98,7 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove
|
|||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="https://example.com/some-article",
|
url="https://example.com/some-article",
|
||||||
extraction_strategy=extraction_strategy,
|
extraction_strategy=extraction_strategy,
|
||||||
bypass_cache=True
|
cache_mode=CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
print(result.extracted_content)
|
print(result.extracted_content)
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove
|
|||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url=url,
|
url=url,
|
||||||
extraction_strategy=extraction_strategy,
|
extraction_strategy=extraction_strategy,
|
||||||
bypass_cache=True
|
cache_mode=CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
print(result.extracted_content)
|
print(result.extracted_content)
|
||||||
```
|
```
|
||||||
@@ -103,7 +103,7 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove
|
|||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url=url,
|
url=url,
|
||||||
extraction_strategy=extraction_strategy,
|
extraction_strategy=extraction_strategy,
|
||||||
bypass_cache=True
|
cache_mode=CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
print(result.extracted_content)
|
print(result.extracted_content)
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ Here's a condensed outline of the **Installation and Setup** video content:
|
|||||||
- Walk through a simple test script to confirm the setup:
|
- Walk through a simple test script to confirm the setup:
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler
|
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||||
@@ -1093,7 +1093,7 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove
|
|||||||
extraction_type="schema",
|
extraction_type="schema",
|
||||||
instruction="Extract model names and fees for input and output tokens from the page."
|
instruction="Extract model names and fees for input and output tokens from the page."
|
||||||
),
|
),
|
||||||
bypass_cache=True
|
cache_mode=CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
print(result.extracted_content)
|
print(result.extracted_content)
|
||||||
```
|
```
|
||||||
@@ -1139,7 +1139,7 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove
|
|||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="https://example.com/some-article",
|
url="https://example.com/some-article",
|
||||||
extraction_strategy=extraction_strategy,
|
extraction_strategy=extraction_strategy,
|
||||||
bypass_cache=True
|
cache_mode=CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
print(result.extracted_content)
|
print(result.extracted_content)
|
||||||
```
|
```
|
||||||
@@ -1248,7 +1248,7 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove
|
|||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url=url,
|
url=url,
|
||||||
extraction_strategy=extraction_strategy,
|
extraction_strategy=extraction_strategy,
|
||||||
bypass_cache=True
|
cache_mode=CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
print(result.extracted_content)
|
print(result.extracted_content)
|
||||||
```
|
```
|
||||||
@@ -1296,7 +1296,7 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove
|
|||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url=url,
|
url=url,
|
||||||
extraction_strategy=extraction_strategy,
|
extraction_strategy=extraction_strategy,
|
||||||
bypass_cache=True
|
cache_mode=CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
print(result.extracted_content)
|
print(result.extracted_content)
|
||||||
```
|
```
|
||||||
|
|||||||
Reference in New Issue
Block a user