Enhance Crawl4AI with new features and documentation

- Fix crawler text mode for improved performance; cover missing `srcset` and `data_srcset` attributes in image tags.
  - Introduced Managed Browsers for enhanced crawling experience.
  - Updated documentation for clearer navigation on configuration.
  - Changed 'text_only' to 'text_mode' in configuration and methods.
  - Improved performance and relevance in content filtering strategies.
This commit is contained in:
UncleCode
2024-12-19 21:02:29 +08:00
parent 393bb911c0
commit 849765712f
23 changed files with 1825 additions and 1721 deletions

View File

@@ -5,15 +5,15 @@
### **File: `crawl4ai/async_crawler_strategy.py`**
#### **New Parameters and Attributes Added**
- **`text_only` (boolean)**: Enables text-only mode, disables images, JavaScript, and GPU-related features for faster, minimal rendering.
- **`text_mode` (boolean)**: Enables text-only mode, disables images, JavaScript, and GPU-related features for faster, minimal rendering.
- **`light_mode` (boolean)**: Optimizes the browser by disabling unnecessary background processes and features for efficiency.
- **`viewport_width` and `viewport_height`**: Dynamically adjusts based on `text_only` mode (default values: 800x600 for `text_only`, 1920x1080 otherwise).
- **`extra_args`**: Adds browser-specific flags for `text_only` mode.
- **`viewport_width` and `viewport_height`**: Dynamically adjusts based on `text_mode` mode (default values: 800x600 for `text_mode`, 1920x1080 otherwise).
- **`extra_args`**: Adds browser-specific flags for `text_mode` mode.
- **`adjust_viewport_to_content`**: Dynamically adjusts the viewport to the content size for accurate rendering.
#### **Browser Context Adjustments**
- Added **`viewport` adjustments**: Dynamically computed based on `text_only` or custom configuration.
- Enhanced support for `light_mode` and `text_only` by adding specific browser arguments to reduce resource consumption.
- Added **`viewport` adjustments**: Dynamically computed based on `text_mode` or custom configuration.
- Enhanced support for `light_mode` and `text_mode` by adding specific browser arguments to reduce resource consumption.
#### **Dynamic Content Handling**
- **Full Page Scan Feature**:

View File

@@ -61,7 +61,7 @@ class BrowserConfig:
user_agent as-is. Default: None.
user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
Default: None.
text_only (bool): If True, disables images and other rich content for potentially faster load times.
text_mode (bool): If True, disables images and other rich content for potentially faster load times.
Default: False.
light_mode (bool): Disables certain background features for performance gains. Default: False.
extra_args (list): Additional command-line arguments passed to the browser.
@@ -78,8 +78,8 @@ class BrowserConfig:
chrome_channel: str = "chrome",
proxy: str = None,
proxy_config: dict = None,
viewport_width: int = 1920,
viewport_height: int = 1080,
viewport_width: int = 800,
viewport_height: int = 600,
accept_downloads: bool = False,
downloads_path: str = None,
storage_state=None,
@@ -95,7 +95,7 @@ class BrowserConfig:
),
user_agent_mode: str = None,
user_agent_generator_config: dict = None,
text_only: bool = False,
text_mode: bool = False,
light_mode: bool = False,
extra_args: list = None,
):
@@ -126,7 +126,7 @@ class BrowserConfig:
self.user_agent = user_agent
self.user_agent_mode = user_agent_mode
self.user_agent_generator_config = user_agent_generator_config
self.text_only = text_only
self.text_mode = text_mode
self.light_mode = light_mode
self.extra_args = extra_args if extra_args is not None else []
self.sleep_on_close = sleep_on_close
@@ -171,7 +171,7 @@ class BrowserConfig:
),
user_agent_mode=kwargs.get("user_agent_mode"),
user_agent_generator_config=kwargs.get("user_agent_generator_config"),
text_only=kwargs.get("text_only", False),
text_mode=kwargs.get("text_mode", False),
light_mode=kwargs.get("light_mode", False),
extra_args=kwargs.get("extra_args", []),
)
@@ -366,7 +366,11 @@ class CrawlerRunConfig:
# Debugging and Logging Parameters
verbose: bool = True,
log_console: bool = False,
url: str = None,
):
self.url = url
# Content Processing Parameters
self.word_count_threshold = word_count_threshold
self.extraction_strategy = extraction_strategy
@@ -510,6 +514,8 @@ class CrawlerRunConfig:
# Debugging and Logging Parameters
verbose=kwargs.get("verbose", True),
log_console=kwargs.get("log_console", False),
url=kwargs.get("url"),
)

File diff suppressed because it is too large Load Diff

View File

@@ -197,12 +197,15 @@ class WebScrapingStrategy(ContentScrapingStrategy):
# Constants for checks
classes_to_check = frozenset(['button', 'icon', 'logo'])
tags_to_check = frozenset(['button', 'input'])
image_formats = frozenset(['jpg', 'jpeg', 'png', 'webp', 'avif', 'gif'])
# Pre-fetch commonly used attributes
style = img.get('style', '')
alt = img.get('alt', '')
src = img.get('src', '')
data_src = img.get('data-src', '')
srcset = img.get('srcset', '')
data_srcset = img.get('data-srcset', '')
width = img.get('width')
height = img.get('height')
parent = img.parent
@@ -228,14 +231,36 @@ class WebScrapingStrategy(ContentScrapingStrategy):
score += 1
score += index/total_images < 0.5
image_format = ''
if "data:image/" in src:
image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0]
else:
image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0]
# image_format = ''
# if "data:image/" in src:
# image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0]
# else:
# image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0]
if image_format in ('jpg', 'png', 'webp', 'avif'):
# if image_format in ('jpg', 'png', 'webp', 'avif'):
# score += 1
# Check for image format in all possible sources
def has_image_format(url):
return any(fmt in url.lower() for fmt in image_formats)
# Score for having proper image sources
if any(has_image_format(url) for url in [src, data_src, srcset, data_srcset]):
score += 1
if srcset or data_srcset:
score += 1
if img.find_parent('picture'):
score += 1
# Detect format from any available source
detected_format = None
for url in [src, data_src, srcset, data_srcset]:
if url:
format_matches = [fmt for fmt in image_formats if fmt in url.lower()]
if format_matches:
detected_format = format_matches[0]
break
if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD):
return None
@@ -254,7 +279,8 @@ class WebScrapingStrategy(ContentScrapingStrategy):
'desc': self.find_closest_parent_with_useful_text(img, **kwargs),
'score': score,
'type': 'image',
'group_id': group_id # Group ID for this set of variants
'group_id': group_id, # Group ID for this set of variants
'format': detected_format,
}
# Inline function for adding variants
@@ -287,7 +313,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
return image_variants if image_variants else None
def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:
media = {'images': [], 'videos': [], 'audios': []}
internal_links_dict = {}

View File

@@ -57,6 +57,11 @@ class NoExtractionStrategy(ExtractionStrategy):
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]
#######################################################
# Strategies using LLM-based extraction for text data #
#######################################################
class LLMExtractionStrategy(ExtractionStrategy):
def __init__(self,
provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None,
@@ -234,6 +239,11 @@ class LLMExtractionStrategy(ExtractionStrategy):
return extracted_content
#######################################################
# Strategies using clustering for text data extraction #
#######################################################
class CosineStrategy(ExtractionStrategy):
def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs):
"""
@@ -514,6 +524,11 @@ class CosineStrategy(ExtractionStrategy):
return self.extract(url, self.DEL.join(sections), **kwargs)
#######################################################
# Strategies based on the extraction of specific types #
#######################################################
class TopicExtractionStrategy(ExtractionStrategy):
def __init__(self, num_keywords: int = 3, **kwargs):
"""
@@ -637,7 +652,222 @@ class ContentSummarizationStrategy(ExtractionStrategy):
summaries.sort(key=lambda x: x[0])
return [summary for _, summary in summaries]
class JsonCssExtractionStrategy(ExtractionStrategy):
#######################################################
# New extraction strategies for JSON-based extraction #
#######################################################
class JsonElementExtractionStrategy(ExtractionStrategy):
DEL = '\n'
def __init__(self, schema: Dict[str, Any], **kwargs):
super().__init__(**kwargs)
self.schema = schema
self.verbose = kwargs.get('verbose', False)
def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
parsed_html = self._parse_html(html_content)
base_elements = self._get_base_elements(parsed_html, self.schema['baseSelector'])
results = []
for element in base_elements:
# Extract base element attributes
item = {}
if 'baseFields' in self.schema:
for field in self.schema['baseFields']:
value = self._extract_single_field(element, field)
if value is not None:
item[field['name']] = value
# Extract child fields
field_data = self._extract_item(element, self.schema['fields'])
item.update(field_data)
if item:
results.append(item)
return results
@abstractmethod
def _parse_html(self, html_content: str):
"""Parse HTML content into appropriate format"""
pass
@abstractmethod
def _get_base_elements(self, parsed_html, selector: str):
"""Get all base elements using the selector"""
pass
@abstractmethod
def _get_elements(self, element, selector: str):
"""Get child elements using the selector"""
pass
def _extract_field(self, element, field):
try:
if field['type'] == 'nested':
nested_elements = self._get_elements(element, field['selector'])
nested_element = nested_elements[0] if nested_elements else None
return self._extract_item(nested_element, field['fields']) if nested_element else {}
if field['type'] == 'list':
elements = self._get_elements(element, field['selector'])
return [self._extract_list_item(el, field['fields']) for el in elements]
if field['type'] == 'nested_list':
elements = self._get_elements(element, field['selector'])
return [self._extract_item(el, field['fields']) for el in elements]
return self._extract_single_field(element, field)
except Exception as e:
if self.verbose:
print(f"Error extracting field {field['name']}: {str(e)}")
return field.get('default')
def _extract_single_field(self, element, field):
if 'selector' in field:
selected = self._get_elements(element, field['selector'])
if not selected:
return field.get('default')
selected = selected[0]
else:
selected = element
value = None
if field['type'] == 'text':
value = self._get_element_text(selected)
elif field['type'] == 'attribute':
value = self._get_element_attribute(selected, field['attribute'])
elif field['type'] == 'html':
value = self._get_element_html(selected)
elif field['type'] == 'regex':
text = self._get_element_text(selected)
match = re.search(field['pattern'], text)
value = match.group(1) if match else None
if 'transform' in field:
value = self._apply_transform(value, field['transform'])
return value if value is not None else field.get('default')
def _extract_list_item(self, element, fields):
item = {}
for field in fields:
value = self._extract_single_field(element, field)
if value is not None:
item[field['name']] = value
return item
def _extract_item(self, element, fields):
item = {}
for field in fields:
if field['type'] == 'computed':
value = self._compute_field(item, field)
else:
value = self._extract_field(element, field)
if value is not None:
item[field['name']] = value
return item
def _apply_transform(self, value, transform):
if transform == 'lowercase':
return value.lower()
elif transform == 'uppercase':
return value.upper()
elif transform == 'strip':
return value.strip()
return value
def _compute_field(self, item, field):
try:
if 'expression' in field:
return eval(field['expression'], {}, item)
elif 'function' in field:
return field['function'](item)
except Exception as e:
if self.verbose:
print(f"Error computing field {field['name']}: {str(e)}")
return field.get('default')
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
combined_html = self.DEL.join(sections)
return self.extract(url, combined_html, **kwargs)
@abstractmethod
def _get_element_text(self, element) -> str:
"""Get text content from element"""
pass
@abstractmethod
def _get_element_html(self, element) -> str:
"""Get HTML content from element"""
pass
@abstractmethod
def _get_element_attribute(self, element, attribute: str):
"""Get attribute value from element"""
pass
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
def _parse_html(self, html_content: str):
return BeautifulSoup(html_content, 'html.parser')
def _get_base_elements(self, parsed_html, selector: str):
return parsed_html.select(selector)
def _get_elements(self, element, selector: str):
selected = element.select_one(selector)
return [selected] if selected else []
def _get_element_text(self, element) -> str:
return element.get_text(strip=True)
def _get_element_html(self, element) -> str:
return str(element)
def _get_element_attribute(self, element, attribute: str):
return element.get(attribute)
class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
def _parse_html(self, html_content: str):
return html.fromstring(html_content)
def _get_base_elements(self, parsed_html, selector: str):
return parsed_html.xpath(selector)
def _css_to_xpath(self, css_selector: str) -> str:
"""Convert CSS selector to XPath if needed"""
if '/' in css_selector: # Already an XPath
return css_selector
return self._basic_css_to_xpath(css_selector)
def _basic_css_to_xpath(self, css_selector: str) -> str:
"""Basic CSS to XPath conversion for common cases"""
if ' > ' in css_selector:
parts = css_selector.split(' > ')
return '//' + '/'.join(parts)
if ' ' in css_selector:
parts = css_selector.split(' ')
return '//' + '//'.join(parts)
return '//' + css_selector
def _get_elements(self, element, selector: str):
xpath = self._css_to_xpath(selector)
if not xpath.startswith('.'):
xpath = '.' + xpath
return element.xpath(xpath)
def _get_element_text(self, element) -> str:
return ''.join(element.xpath('.//text()')).strip()
def _get_element_html(self, element) -> str:
return etree.tostring(element, encoding='unicode')
def _get_element_attribute(self, element, attribute: str):
return element.get(attribute)
class _JsonCssExtractionStrategy(ExtractionStrategy):
def __init__(self, schema: Dict[str, Any], **kwargs):
super().__init__(**kwargs)
self.schema = schema
@@ -648,14 +878,22 @@ class JsonCssExtractionStrategy(ExtractionStrategy):
results = []
for element in base_elements:
item = self._extract_item(element, self.schema['fields'])
if item:
results.append(item)
# Extract base element attributes first
item = {}
if 'baseFields' in self.schema:
for field in self.schema['baseFields']:
value = self._extract_single_field(element, field)
if value is not None:
item[field['name']] = value
# Then extract child fields
field_data = self._extract_item(element, self.schema['fields'])
item.update(field_data)
results.append(item)
return results
def _extract_field(self, element, field):
try:
if field['type'] == 'nested':
@@ -743,7 +981,7 @@ class JsonCssExtractionStrategy(ExtractionStrategy):
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
combined_html = self.DEL.join(sections)
return self.extract(url, combined_html, **kwargs)
class JsonXPathExtractionStrategy(ExtractionStrategy):
class _JsonXPathExtractionStrategy(ExtractionStrategy):
def __init__(self, schema: Dict[str, Any], **kwargs):
super().__init__(**kwargs)
self.schema = schema
@@ -755,9 +993,19 @@ class JsonXPathExtractionStrategy(ExtractionStrategy):
results = []
for element in base_elements:
item = self._extract_item(element, self.schema['fields'])
if item:
results.append(item)
# Extract base element attributes first
item = {}
if 'baseFields' in self.schema:
for field in self.schema['baseFields']:
value = self._extract_single_field(element, field)
if value is not None:
item[field['name']] = value
# Then extract child fields
field_data = self._extract_item(element, self.schema['fields'])
item.update(field_data)
results.append(item)
return results

View File

@@ -2,80 +2,12 @@
Crawl4AI provides powerful content processing capabilities that help you extract clean, relevant content from web pages. This guide covers content cleaning, media handling, link analysis, and metadata extraction.
## Content Cleaning
### Understanding Clean Content
When crawling web pages, you often encounter a lot of noise - advertisements, navigation menus, footers, popups, and other irrelevant content. Crawl4AI automatically cleans this noise using several approaches:
1. **Basic Cleaning**: Removes unwanted HTML elements and attributes
2. **Content Relevance**: Identifies and preserves meaningful content blocks
3. **Layout Analysis**: Understands page structure to identify main content areas
```python
result = await crawler.arun(
url="https://example.com",
word_count_threshold=10, # Remove blocks with fewer words
excluded_tags=['form', 'nav'], # Remove specific HTML tags
remove_overlay_elements=True # Remove popups/modals
)
# Get clean content
print(result.cleaned_html) # Cleaned HTML
print(result.markdown) # Clean markdown version
```
### Fit Markdown: Smart Content Extraction
One of Crawl4AI's most powerful features is `fit_markdown`. This feature uses advanced heuristics to identify and extract the main content from a webpage while excluding irrelevant elements.
#### How Fit Markdown Works
- Analyzes content density and distribution
- Identifies content patterns and structures
- Removes boilerplate content (headers, footers, sidebars)
- Preserves the most relevant content blocks
- Maintains content hierarchy and formatting
#### Perfect For:
- Blog posts and articles
- News content
- Documentation pages
- Any page with a clear main content area
#### Not Recommended For:
- E-commerce product listings
- Search results pages
- Social media feeds
- Pages with multiple equal-weight content sections
```python
result = await crawler.arun(url="https://example.com")
# Get the most relevant content
main_content = result.fit_markdown
# Compare with regular markdown
all_content = result.markdown
print(f"Fit Markdown Length: {len(main_content)}")
print(f"Regular Markdown Length: {len(all_content)}")
```
#### Example Use Case
```python
async def extract_article_content(url: str) -> str:
"""Extract main article content from a blog or news site."""
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url=url)
# fit_markdown will focus on the article content,
# excluding navigation, ads, and other distractions
return result.fit_markdown
```
## Media Processing
Crawl4AI provides comprehensive media extraction and analysis capabilities. It automatically detects and processes various types of media elements while maintaining their context and relevance.
### Image Processing
The library handles various image scenarios, including:
- Regular images
- Lazy-loaded images
@@ -84,7 +16,10 @@ The library handles various image scenarios, including:
- Image metadata and context
```python
result = await crawler.arun(url="https://example.com")
from crawl4ai.async_configs import CrawlerRunConfig
config = CrawlerRunConfig()
result = await crawler.arun(url="https://example.com", config=config)
for image in result.media["images"]:
# Each image includes rich metadata
@@ -96,20 +31,27 @@ for image in result.media["images"]:
```
### Handling Lazy-Loaded Content
Crawl4aai already handles lazy loading for media elements. You can also customize the wait time for lazy-loaded content:
Crawl4AI already handles lazy loading for media elements. You can customize the wait time for lazy-loaded content with `CrawlerRunConfig`:
```python
result = await crawler.arun(
url="https://example.com",
config = CrawlerRunConfig(
wait_for="css:img[data-src]", # Wait for lazy images
delay_before_return_html=2.0 # Additional wait time
)
result = await crawler.arun(url="https://example.com", config=config)
```
### Video and Audio Content
The library extracts video and audio elements with their metadata:
```python
from crawl4ai.async_configs import CrawlerRunConfig
config = CrawlerRunConfig()
result = await crawler.arun(url="https://example.com", config=config)
# Process videos
for video in result.media["videos"]:
print(f"Video source: {video['src']}")
@@ -129,6 +71,7 @@ for audio in result.media["audios"]:
Crawl4AI provides sophisticated link analysis capabilities, helping you understand the relationship between pages and identify important navigation patterns.
### Link Classification
The library automatically categorizes links into:
- Internal links (same domain)
- External links (different domains)
@@ -137,7 +80,10 @@ The library automatically categorizes links into:
- Content links
```python
result = await crawler.arun(url="https://example.com")
from crawl4ai.async_configs import CrawlerRunConfig
config = CrawlerRunConfig()
result = await crawler.arun(url="https://example.com", config=config)
# Analyze internal links
for link in result.links["internal"]:
@@ -154,18 +100,19 @@ for link in result.links["external"]:
```
### Smart Link Filtering
Control which links are included in the results:
Control which links are included in the results with `CrawlerRunConfig`:
```python
result = await crawler.arun(
url="https://example.com",
config = CrawlerRunConfig(
exclude_external_links=True, # Remove external links
exclude_social_media_links=True, # Remove social media links
exclude_social_media_domains=[ # Custom social media domains
exclude_social_media_domains=[ # Custom social media domains
"facebook.com", "twitter.com", "instagram.com"
],
exclude_domains=["ads.example.com"] # Exclude specific domains
)
result = await crawler.arun(url="https://example.com", config=config)
```
## Metadata Extraction
@@ -173,7 +120,10 @@ result = await crawler.arun(
Crawl4AI automatically extracts and processes page metadata, providing valuable information about the content:
```python
result = await crawler.arun(url="https://example.com")
from crawl4ai.async_configs import CrawlerRunConfig
config = CrawlerRunConfig()
result = await crawler.arun(url="https://example.com", config=config)
metadata = result.metadata
print(f"Title: {metadata['title']}")
@@ -184,40 +134,3 @@ print(f"Published Date: {metadata['published_date']}")
print(f"Modified Date: {metadata['modified_date']}")
print(f"Language: {metadata['language']}")
```
## Best Practices
1. **Use Fit Markdown for Articles**
```python
# Perfect for blog posts, news articles, documentation
content = result.fit_markdown
```
2. **Handle Media Appropriately**
```python
# Filter by relevance score
relevant_images = [
img for img in result.media["images"]
if img['score'] > 5
]
```
3. **Combine Link Analysis with Content**
```python
# Get content links with context
content_links = [
link for link in result.links["internal"]
if link['type'] == 'content'
]
```
4. **Clean Content with Purpose**
```python
# Customize cleaning based on your needs
result = await crawler.arun(
url=url,
word_count_threshold=20, # Adjust based on content type
keep_data_attributes=False, # Remove data attributes
process_iframes=True # Include iframe content
)
```

View File

@@ -1,114 +1,121 @@
# Hooks & Auth for AsyncWebCrawler
Crawl4AI's AsyncWebCrawler allows you to customize the behavior of the web crawler using hooks. Hooks are asynchronous functions that are called at specific points in the crawling process, allowing you to modify the crawler's behavior or perform additional actions. This example demonstrates how to use various hooks to customize the asynchronous crawling process.
Crawl4AI's `AsyncWebCrawler` allows you to customize the behavior of the web crawler using hooks. Hooks are asynchronous functions called at specific points in the crawling process, allowing you to modify the crawler's behavior or perform additional actions. This updated documentation demonstrates how to use hooks, including the new `on_page_context_created` hook, and ensures compatibility with `BrowserConfig` and `CrawlerRunConfig`.
## Example: Using Crawler Hooks with AsyncWebCrawler
Let's see how we can customize the AsyncWebCrawler using hooks! In this example, we'll:
In this example, we'll:
1. Configure the browser when it's created.
2. Add custom headers before navigating to the URL.
3. Log the current URL after navigation.
4. Perform actions after JavaScript execution.
5. Log the length of the HTML before returning it.
1. Configure the browser and set up authentication when it's created.
2. Apply custom routing and initial actions when the page context is created.
3. Add custom headers before navigating to the URL.
4. Log the current URL after navigation.
5. Perform actions after JavaScript execution.
6. Log the length of the HTML before returning it.
### Hook Definitions
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from playwright.async_api import Page, Browser, BrowserContext
async def on_browser_created(browser: Browser):
def log_routing(route):
# Example: block loading images
if route.request.resource_type == "image":
print(f"[HOOK] Blocking image request: {route.request.url}")
asyncio.create_task(route.abort())
else:
asyncio.create_task(route.continue_())
async def on_browser_created(browser: Browser, **kwargs):
print("[HOOK] on_browser_created")
# Example customization: set browser viewport size
context = await browser.new_context(viewport={'width': 1920, 'height': 1080})
# Example: Set browser viewport size and log in
context = await browser.new_context(viewport={"width": 1920, "height": 1080})
page = await context.new_page()
# Example customization: logging in to a hypothetical website
await page.goto('https://example.com/login')
await page.fill('input[name="username"]', 'testuser')
await page.fill('input[name="password"]', 'password123')
await page.click('button[type="submit"]')
await page.wait_for_selector('#welcome')
# Add a custom cookie
await context.add_cookies([{'name': 'test_cookie', 'value': 'cookie_value', 'url': 'https://example.com'}])
await page.goto("https://example.com/login")
await page.fill("input[name='username']", "testuser")
await page.fill("input[name='password']", "password123")
await page.click("button[type='submit']")
await page.wait_for_selector("#welcome")
await context.add_cookies([{"name": "auth_token", "value": "abc123", "url": "https://example.com"}])
await page.close()
await context.close()
async def before_goto(page: Page):
print("[HOOK] before_goto")
# Example customization: add custom headers
await page.set_extra_http_headers({'X-Test-Header': 'test'})
async def on_page_context_created(context: BrowserContext, page: Page, **kwargs):
print("[HOOK] on_page_context_created")
await context.route("**", log_routing)
async def after_goto(page: Page):
async def before_goto(page: Page, context: BrowserContext, **kwargs):
print("[HOOK] before_goto")
await page.set_extra_http_headers({"X-Test-Header": "test"})
async def after_goto(page: Page, context: BrowserContext, **kwargs):
print("[HOOK] after_goto")
# Example customization: log the URL
print(f"Current URL: {page.url}")
async def on_execution_started(page: Page):
async def on_execution_started(page: Page, context: BrowserContext, **kwargs):
print("[HOOK] on_execution_started")
# Example customization: perform actions after JS execution
await page.evaluate("console.log('Custom JS executed')")
async def before_return_html(page: Page, html: str):
async def before_return_html(page: Page, context: BrowserContext, html: str, **kwargs):
print("[HOOK] before_return_html")
# Example customization: log the HTML length
print(f"HTML length: {len(html)}")
return page
```
### Using the Hooks with the AsyncWebCrawler
### Using the Hooks with AsyncWebCrawler
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
async def main():
print("\n🔗 Using Crawler Hooks: Let's see how we can customize the AsyncWebCrawler using hooks!")
initial_cookies = [
{"name": "sessionId", "value": "abc123", "domain": ".example.com"},
{"name": "userId", "value": "12345", "domain": ".example.com"}
]
crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True, cookies=initial_cookies)
crawler_strategy.set_hook('on_browser_created', on_browser_created)
crawler_strategy.set_hook('before_goto', before_goto)
crawler_strategy.set_hook('after_goto', after_goto)
crawler_strategy.set_hook('on_execution_started', on_execution_started)
crawler_strategy.set_hook('before_return_html', before_return_html)
async with AsyncWebCrawler(verbose=True, crawler_strategy=crawler_strategy) as crawler:
result = await crawler.arun(
url="https://example.com",
js_code="window.scrollTo(0, document.body.scrollHeight);",
wait_for="footer"
)
print("\n🔗 Using Crawler Hooks: Customize AsyncWebCrawler with hooks!")
print("📦 Crawler Hooks result:")
# Configure browser and crawler settings
browser_config = BrowserConfig(
headless=True,
viewport_width=1920,
viewport_height=1080
)
crawler_run_config = CrawlerRunConfig(
js_code="window.scrollTo(0, document.body.scrollHeight);",
wait_for="footer"
)
# Initialize crawler
async with AsyncWebCrawler(browser_config=browser_config) as crawler:
crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
crawler.crawler_strategy.set_hook("before_goto", before_goto)
crawler.crawler_strategy.set_hook("after_goto", after_goto)
crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
crawler.crawler_strategy.set_hook("before_return_html", before_return_html)
# Run the crawler
result = await crawler.arun(url="https://example.com", config=crawler_run_config)
print("\n📦 Crawler Hooks Result:")
print(result)
asyncio.run(main())
```
### Explanation
### Explanation of Hooks
- `on_browser_created`: This hook is called when the Playwright browser is created. It sets up the browser context, logs in to a website, and adds a custom cookie.
- `before_goto`: This hook is called right before Playwright navigates to the URL. It adds custom HTTP headers.
- `after_goto`: This hook is called after Playwright navigates to the URL. It logs the current URL.
- `on_execution_started`: This hook is called after any custom JavaScript is executed. It performs additional JavaScript actions.
- `before_return_html`: This hook is called before returning the HTML content. It logs the length of the HTML content.
- **`on_browser_created`**: Called when the browser is created. Use this to configure the browser or handle authentication (e.g., logging in and setting cookies).
- **`on_page_context_created`**: Called when a new page context is created. Use this to apply routing, block resources, or inject custom logic before navigating to the URL.
- **`before_goto`**: Called before navigating to the URL. Use this to add custom headers or perform other pre-navigation actions.
- **`after_goto`**: Called after navigation. Use this to verify content or log the URL.
- **`on_execution_started`**: Called after executing custom JavaScript. Use this to perform additional actions.
- **`before_return_html`**: Called before returning the HTML content. Use this to log details or preprocess the content.
### Additional Ideas
### Additional Customizations
- **Handling authentication**: Use the `on_browser_created` hook to handle login processes or set authentication tokens.
- **Dynamic header modification**: Modify headers based on the target URL or other conditions in the `before_goto` hook.
- **Content verification**: Use the `after_goto` hook to verify that the expected content is present on the page.
- **Custom JavaScript injection**: Inject and execute custom JavaScript using the `on_execution_started` hook.
- **Content preprocessing**: Modify or analyze the HTML content in the `before_return_html` hook before it's returned.
- **Resource Management**: Use `on_page_context_created` to block or modify requests (e.g., block images, fonts, or third-party scripts).
- **Dynamic Headers**: Use `before_goto` to add or modify headers dynamically based on the URL.
- **Authentication**: Use `on_browser_created` to handle login processes and set authentication cookies or tokens.
- **Content Analysis**: Use `before_return_html` to analyze or modify the extracted HTML content.
These hooks provide powerful customization options for tailoring the crawling process to your needs.
By using these hooks, you can customize the behavior of the AsyncWebCrawler to suit your specific needs, including handling authentication, modifying requests, and preprocessing content.

View File

@@ -0,0 +1,156 @@
### Preserve Your Identity with Crawl4AI
Crawl4AI empowers you to navigate and interact with the web using your authentic digital identity, ensuring that you are recognized as a human and not mistaken for a bot. This document introduces Managed Browsers, the recommended approach for preserving your rights to access the web, and Magic Mode, a simplified solution for specific scenarios.
---
### Managed Browsers: Your Digital Identity Solution
**Managed Browsers** enable developers to create and use persistent browser profiles. These profiles store local storage, cookies, and other session-related data, allowing you to interact with websites as a recognized user. By leveraging your unique identity, Managed Browsers ensure that your experience reflects your rights as a human browsing the web.
#### Why Use Managed Browsers?
1. **Authentic Browsing Experience**: Managed Browsers retain session data and browser fingerprints, mirroring genuine user behavior.
2. **Effortless Configuration**: Once you interact with the site using the browser (e.g., solving a CAPTCHA), the session data is saved and reused, providing seamless access.
3. **Empowered Data Access**: By using your identity, Managed Browsers empower users to access data they can view on their own screens without artificial restrictions.
#### Steps to Use Managed Browsers
1. **Setup the Browser Configuration**:
```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
browser_config = BrowserConfig(
headless=False, # Set to False for initial setup to view browser actions
verbose=True,
user_agent_mode="random",
use_managed_browser=True, # Enables persistent browser sessions
browser_type="chromium",
user_data_dir="/path/to/user_profile_data" # Path to save session data
)
```
2. **Perform an Initial Run**:
- Run the crawler with `headless=False`.
- Manually interact with the site (e.g., solve CAPTCHA or log in).
- The browser session saves cookies, local storage, and other required data.
3. **Subsequent Runs**:
- Switch to `headless=True` for automation.
- The session data is reused, allowing seamless crawling.
#### Example: Extracting Data Using Managed Browsers
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
async def main():
# Define schema for structured data extraction
schema = {
"name": "Example Data",
"baseSelector": "div.example",
"fields": [
{"name": "title", "selector": "h1", "type": "text"},
{"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
]
}
# Configure crawler
browser_config = BrowserConfig(
headless=True, # Automate subsequent runs
verbose=True,
use_managed_browser=True,
user_data_dir="/path/to/user_profile_data"
)
crawl_config = CrawlerRunConfig(
extraction_strategy=JsonCssExtractionStrategy(schema),
wait_for="css:div.example" # Wait for the targeted element to load
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://example.com",
config=crawl_config
)
if result.success:
print("Extracted Data:", result.extracted_content)
if __name__ == "__main__":
asyncio.run(main())
```
### Benefits of Managed Browsers Over Other Methods
Managed Browsers eliminate the need for manual detection workarounds by enabling developers to work directly with their identity and user profile data. This approach ensures maximum compatibility with websites and simplifies the crawling process while preserving your right to access data freely.
---
### Magic Mode: Simplified Automation
While Managed Browsers are the preferred approach, **Magic Mode** provides an alternative for scenarios where persistent user profiles are unnecessary or infeasible. Magic Mode automates user-like behavior and simplifies configuration.
#### What Magic Mode Does:
- Simulates human browsing by randomizing interaction patterns and timing.
- Masks browser automation signals.
- Handles cookie popups and modals.
- Modifies navigator properties for enhanced compatibility.
#### Using Magic Mode
```python
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com",
magic=True # Enables all automation features
)
```
Magic Mode is particularly useful for:
- Quick prototyping when a Managed Browser setup is not available.
- Basic sites requiring minimal interaction or configuration.
#### Example: Combining Magic Mode with Additional Options
```python
async def crawl_with_magic_mode(url: str):
async with AsyncWebCrawler(headless=True) as crawler:
result = await crawler.arun(
url=url,
magic=True,
remove_overlay_elements=True, # Remove popups/modals
page_timeout=60000 # Increased timeout for complex pages
)
return result.markdown if result.success else None
```
### Magic Mode vs. Managed Browsers
While Magic Mode simplifies many tasks, it cannot match the reliability and authenticity of Managed Browsers. By using your identity and persistent profiles, Managed Browsers render Magic Mode largely unnecessary. However, Magic Mode remains a viable fallback for specific situations where user identity is not a factor.
---
### Key Comparison: Managed Browsers vs. Magic Mode
| Feature | **Managed Browsers** | **Magic Mode** |
|-------------------------|------------------------------------------|-------------------------------------|
| **Session Persistence** | Retains cookies and local storage. | No session retention. |
| **Human Interaction** | Uses real user profiles and data. | Simulates human-like patterns. |
| **Complex Sites** | Best suited for heavily configured sites.| Works well with simpler challenges.|
| **Setup Complexity** | Requires initial manual interaction. | Fully automated, one-line setup. |
#### Recommendation:
- Use **Managed Browsers** for reliable, session-based crawling and data extraction.
- Use **Magic Mode** for quick prototyping or when persistent profiles are not required.
---
### Conclusion
- **Use Managed Browsers** to preserve your digital identity and ensure reliable, identity-based crawling with persistent sessions. This approach works seamlessly for even the most complex websites.
- **Leverage Magic Mode** for quick automation or in scenarios where persistent user profiles are not needed.
By combining these approaches, Crawl4AI provides unparalleled flexibility and capability for your crawling needs.

View File

@@ -1,136 +1,188 @@
# Content Filtering in Crawl4AI
# Creating Browser Instances, Contexts, and Pages
This guide explains how to use content filtering strategies in Crawl4AI to extract the most relevant information from crawled web pages. You'll learn how to use the built-in `BM25ContentFilter` and how to create your own custom content filtering strategies.
## 1 Introduction
## Relevance Content Filter
### Overview of Browser Management in Crawl4AI
Crawl4AI's browser management system is designed to provide developers with advanced tools for handling complex web crawling tasks. By managing browser instances, contexts, and pages, Crawl4AI ensures optimal performance, anti-bot measures, and session persistence for high-volume, dynamic web crawling.
The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `PruningContentFilter` or `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks.
### Key Objectives
- **Anti-Bot Handling**:
- Implements stealth techniques to evade detection mechanisms used by modern websites.
- Simulates human-like behavior, such as mouse movements, scrolling, and key presses.
- Supports integration with third-party services to bypass CAPTCHA challenges.
- **Persistent Sessions**:
- Retains session data (cookies, local storage) for workflows requiring user authentication.
- Allows seamless continuation of tasks across multiple runs without re-authentication.
- **Scalable Crawling**:
- Optimized resource utilization for handling thousands of URLs concurrently.
- Flexible configuration options to tailor crawling behavior to specific requirements.
---
## Pruning Content Filter
## 2 Browser Creation Methods
The `PruningContentFilter` is a tree-shaking algorithm that analyzes the HTML DOM structure and removes less relevant nodes based on various metrics like text density, link density, and tag importance. It evaluates each node using a composite scoring system and "prunes" nodes that fall below a certain threshold.
### Standard Browser Creation
Standard browser creation initializes a browser instance with default or minimal configurations. It is suitable for tasks that do not require session persistence or heavy customization.
### Usage
#### Features and Limitations
- **Features**:
- Quick and straightforward setup for small-scale tasks.
- Supports headless and headful modes.
- **Limitations**:
- Lacks advanced customization options like session reuse.
- May struggle with sites employing strict anti-bot measures.
#### Example Usage
```python
from crawl4ai import AsyncWebCrawler
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai import AsyncWebCrawler, BrowserConfig
async def filter_content(url):
async with AsyncWebCrawler() as crawler:
content_filter = PruningContentFilter(
min_word_threshold=5,
threshold_type='dynamic',
threshold=0.45
)
result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True)
if result.success:
print(f"Cleaned Markdown:\n{result.fit_markdown}")
browser_config = BrowserConfig(browser_type="chromium", headless=True)
async with AsyncWebCrawler(browser_config=browser_config) as crawler:
result = await crawler.arun("https://crawl4ai.com")
print(result.markdown)
```
### Parameters
### Persistent Contexts
Persistent contexts create browser sessions with stored data, enabling workflows that require maintaining login states or other session-specific information.
- **`min_word_threshold`**: (Optional) Minimum number of words a node must contain to be considered relevant. Nodes with fewer words are automatically pruned.
- **`threshold_type`**: (Optional, default 'fixed') Controls how pruning thresholds are calculated:
- `'fixed'`: Uses a constant threshold value for all nodes
- `'dynamic'`: Adjusts threshold based on node characteristics like tag importance and text/link ratios
- **`threshold`**: (Optional, default 0.48) Base threshold value for node pruning:
- For fixed threshold: Nodes scoring below this value are removed
- For dynamic threshold: This value is adjusted based on node properties
### How It Works
The pruning algorithm evaluates each node using multiple metrics:
- Text density: Ratio of actual text to overall node content
- Link density: Proportion of text within links
- Tag importance: Weight based on HTML tag type (e.g., article, p, div)
- Content quality: Metrics like text length and structural importance
Nodes scoring below the threshold are removed, effectively "shaking" less relevant content from the DOM tree. This results in a cleaner document containing only the most relevant content blocks.
The algorithm is particularly effective for:
- Removing boilerplate content
- Eliminating navigation menus and sidebars
- Preserving main article content
- Maintaining document structure while removing noise
## BM25 Algorithm
The `BM25ContentFilter` uses the BM25 algorithm, a ranking function used in information retrieval to estimate the relevance of documents to a given search query. In Crawl4AI, this algorithm helps to identify and extract text chunks that are most relevant to the page's metadata or a user-specified query.
### Usage
To use the `BM25ContentFilter`, initialize it and then pass it as the `extraction_strategy` parameter to the `arun` method of the crawler.
#### Benefits of Using `user_data_dir`
- **Session Persistence**:
- Stores cookies, local storage, and cache between crawling sessions.
- Reduces overhead for repetitive logins or multi-step workflows.
- **Enhanced Performance**:
- Leverages pre-loaded resources for faster page loading.
- **Flexibility**:
- Adapts to complex workflows requiring user-specific configurations.
#### Example: Setting Up Persistent Contexts
```python
from crawl4ai import AsyncWebCrawler
from crawl4ai.content_filter_strategy import BM25ContentFilter
async def filter_content(url, query=None):
async with AsyncWebCrawler() as crawler:
content_filter = BM25ContentFilter(user_query=query)
result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering
if result.success:
print(f"Filtered Content (JSON):\n{result.extracted_content}")
print(f"\nFiltered Markdown:\n{result.fit_markdown}") # New field in CrawlResult object
print(f"\nFiltered HTML:\n{result.fit_html}") # New field in CrawlResult object. Note that raw HTML may have tags re-organized due to internal parsing.
else:
print("Error:", result.error_message)
# Example usage:
asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health")) # with query
asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple")) # without query, metadata will be used as the query.
config = BrowserConfig(user_data_dir="/path/to/user/data")
async with AsyncWebCrawler(browser_config=config) as crawler:
result = await crawler.arun("https://crawl4ai.com")
print(result.markdown)
```
### Parameters
### Managed Browser
The `ManagedBrowser` class offers a high-level abstraction for managing browser instances, emphasizing resource management, debugging capabilities, and anti-bot measures.
- **`user_query`**: (Optional) A string representing the search query. If not provided, the filter extracts relevant metadata (title, description, keywords) from the page and uses that as the query.
- **`bm25_threshold`**: (Optional, default 1.0) A float value that controls the threshold for relevance. Higher values result in stricter filtering, returning only the most relevant text chunks. Lower values result in more lenient filtering.
#### How It Works
- **Browser Process Management**:
- Automates initialization and cleanup of browser processes.
- Optimizes resource usage by pooling and reusing browser instances.
- **Debugging Support**:
- Integrates with debugging tools like Chrome Developer Tools for real-time inspection.
- **Anti-Bot Measures**:
- Implements stealth plugins to mimic real user behavior and bypass bot detection.
#### Features
- **Customizable Configurations**:
- Supports advanced options such as viewport resizing, proxy settings, and header manipulation.
- **Debugging and Logging**:
- Logs detailed browser interactions for debugging and performance analysis.
- **Scalability**:
- Handles multiple browser instances concurrently, scaling dynamically based on workload.
## Fit Markdown Flag
Setting the `fit_markdown` flag to `True` in the `arun` method activates the BM25 content filtering during the crawl. The `fit_markdown` parameter instructs the scraper to extract and clean the HTML, primarily to prepare for a Large Language Model that cannot process large amounts of data. Setting this flag not only improves the quality of the extracted content but also adds the filtered content to two new attributes in the returned `CrawlResult` object: `fit_markdown` and `fit_html`.
## Custom Content Filtering Strategies
You can create your own custom filtering strategies by inheriting from the `RelevantContentFilter` class and implementing the `filter_content` method. This allows you to tailor the filtering logic to your specific needs.
#### Example: Using `ManagedBrowser`
```python
from crawl4ai.content_filter_strategy import RelevantContentFilter
from bs4 import BeautifulSoup, Tag
from typing import List
class MyCustomFilter(RelevantContentFilter):
def filter_content(self, html: str) -> List[str]:
soup = BeautifulSoup(html, 'lxml')
# Implement custom filtering logic here
# Example: extract all paragraphs within divs with class "article-body"
filtered_paragraphs = []
for tag in soup.select("div.article-body p"):
if isinstance(tag, Tag):
filtered_paragraphs.append(str(tag)) # Add the cleaned HTML element.
return filtered_paragraphs
async def custom_filter_demo(url: str):
async with AsyncWebCrawler() as crawler:
custom_filter = MyCustomFilter()
result = await crawler.arun(url, extraction_strategy=custom_filter)
if result.success:
print(result.extracted_content)
from crawl4ai import AsyncWebCrawler, BrowserConfig
config = BrowserConfig(headless=False, debug_port=9222)
async with AsyncWebCrawler(browser_config=config) as crawler:
result = await crawler.arun("https://crawl4ai.com")
print(result.markdown)
```
This example demonstrates extracting paragraphs from a specific div class. You can customize this logic to implement different filtering strategies, use regular expressions, analyze text density, or apply other relevant techniques.
---
## Conclusion
## 3 Context and Page Management
Content filtering strategies provide a powerful way to refine the output of your crawls. By using `BM25ContentFilter` or creating custom strategies, you can focus on the most pertinent information and improve the efficiency of your data processing pipeline.
### Creating and Configuring Browser Contexts
Browser contexts act as isolated environments within a single browser instance, enabling independent browsing sessions with their own cookies, cache, and storage.
#### Customizations
- **Headers and Cookies**:
- Define custom headers to mimic specific devices or browsers.
- Set cookies for authenticated sessions.
- **Session Reuse**:
- Retain and reuse session data across multiple requests.
- Example: Preserve login states for authenticated crawls.
#### Example: Context Initialization
```python
from crawl4ai import CrawlerRunConfig
config = CrawlerRunConfig(headers={"User-Agent": "Crawl4AI/1.0"})
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://crawl4ai.com", config=config)
print(result.markdown)
```
### Creating Pages
Pages represent individual tabs or views within a browser context. They are responsible for rendering content, executing JavaScript, and handling user interactions.
#### Key Features
- **IFrame Handling**:
- Extract content from embedded iframes.
- Navigate and interact with nested content.
- **Viewport Customization**:
- Adjust viewport size to match target device dimensions.
- **Lazy Loading**:
- Ensure dynamic elements are fully loaded before extraction.
#### Example: Page Initialization
```python
config = CrawlerRunConfig(viewport_width=1920, viewport_height=1080)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://crawl4ai.com", config=config)
print(result.markdown)
```
---
## 4 Advanced Features and Best Practices
### Debugging and Logging
Remote debugging provides a powerful way to troubleshoot complex crawling workflows.
#### Example: Enabling Remote Debugging
```python
config = BrowserConfig(debug_port=9222)
async with AsyncWebCrawler(browser_config=config) as crawler:
result = await crawler.arun("https://crawl4ai.com")
```
### Anti-Bot Techniques
- **Human Behavior Simulation**:
- Mimic real user actions, such as scrolling, clicking, and typing.
- Example: Use JavaScript to simulate interactions.
- **Captcha Handling**:
- Integrate with third-party services like 2Captcha or AntiCaptcha for automated solving.
#### Example: Simulating User Actions
```python
js_code = """
(async () => {
document.querySelector('input[name="search"]').value = 'test';
document.querySelector('button[type="submit"]').click();
})();
"""
config = CrawlerRunConfig(js_code=[js_code])
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://crawl4ai.com", config=config)
```
### Optimizations for Performance and Scalability
- **Persistent Contexts**:
- Reuse browser contexts to minimize resource consumption.
- **Concurrent Crawls**:
- Use `arun_many` with a controlled semaphore count for efficient batch processing.
#### Example: Scaling Crawls
```python
urls = ["https://example1.com", "https://example2.com"]
config = CrawlerRunConfig(semaphore_count=10)
async with AsyncWebCrawler() as crawler:
results = await crawler.arun_many(urls, config=config)
for result in results:
print(result.url, result.markdown)
```

View File

@@ -4,59 +4,67 @@ Configure proxy settings and enhance security features in Crawl4AI for reliable
## Basic Proxy Setup
Simple proxy configuration:
Simple proxy configuration with `BrowserConfig`:
```python
from crawl4ai.async_configs import BrowserConfig
# Using proxy URL
async with AsyncWebCrawler(
proxy="http://proxy.example.com:8080"
) as crawler:
browser_config = BrowserConfig(proxy="http://proxy.example.com:8080")
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com")
# Using SOCKS proxy
async with AsyncWebCrawler(
proxy="socks5://proxy.example.com:1080"
) as crawler:
browser_config = BrowserConfig(proxy="socks5://proxy.example.com:1080")
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com")
```
## Authenticated Proxy
Use proxy with authentication:
Use an authenticated proxy with `BrowserConfig`:
```python
from crawl4ai.async_configs import BrowserConfig
proxy_config = {
"server": "http://proxy.example.com:8080",
"username": "user",
"password": "pass"
}
async with AsyncWebCrawler(proxy_config=proxy_config) as crawler:
browser_config = BrowserConfig(proxy_config=proxy_config)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com")
```
## Rotating Proxies
Example using a proxy rotation service:
Example using a proxy rotation service and updating `BrowserConfig` dynamically:
```python
from crawl4ai.async_configs import BrowserConfig
async def get_next_proxy():
# Your proxy rotation logic here
return {"server": "http://next.proxy.com:8080"}
async with AsyncWebCrawler() as crawler:
browser_config = BrowserConfig()
async with AsyncWebCrawler(config=browser_config) as crawler:
# Update proxy for each request
for url in urls:
proxy = await get_next_proxy()
crawler.update_proxy(proxy)
result = await crawler.arun(url=url)
browser_config.proxy_config = proxy
result = await crawler.arun(url=url, config=browser_config)
```
## Custom Headers
Add security-related headers:
Add security-related headers via `BrowserConfig`:
```python
from crawl4ai.async_configs import BrowserConfig
headers = {
"X-Forwarded-For": "203.0.113.195",
"Accept-Language": "en-US,en;q=0.9",
@@ -64,21 +72,24 @@ headers = {
"Pragma": "no-cache"
}
async with AsyncWebCrawler(headers=headers) as crawler:
browser_config = BrowserConfig(headers=headers)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com")
```
## Combining with Magic Mode
For maximum protection, combine proxy with Magic Mode:
For maximum protection, combine proxy with Magic Mode via `CrawlerRunConfig` and `BrowserConfig`:
```python
async with AsyncWebCrawler(
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
browser_config = BrowserConfig(
proxy="http://proxy.example.com:8080",
headers={"Accept-Language": "en-US"}
) as crawler:
result = await crawler.arun(
url="https://example.com",
magic=True # Enable all anti-detection features
)
```
)
crawler_config = CrawlerRunConfig(magic=True) # Enable all anti-detection features
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com", config=crawler_config)
```

View File

@@ -1,44 +1,53 @@
# Session-Based Crawling for Dynamic Content
### Session-Based Crawling for Dynamic Content
In modern web applications, content is often loaded dynamically without changing the URL. Examples include "Load More" buttons, infinite scrolling, or paginated content that updates via JavaScript. To effectively crawl such websites, Crawl4AI provides powerful session-based crawling capabilities.
In modern web applications, content is often loaded dynamically without changing the URL. Examples include "Load More" buttons, infinite scrolling, or paginated content that updates via JavaScript. Crawl4AI provides session-based crawling capabilities to handle such scenarios effectively.
This guide will explore advanced techniques for crawling dynamic content using Crawl4AI's session management features.
This guide explores advanced techniques for crawling dynamic content using Crawl4AI's session management features.
---
## Understanding Session-Based Crawling
Session-based crawling allows you to maintain a persistent browser session across multiple requests. This is crucial when:
Session-based crawling allows you to reuse a persistent browser session across multiple actions. This means the same browser tab (or page object) is used throughout, enabling:
1. The content changes dynamically without URL changes
2. You need to interact with the page (e.g., clicking buttons) between requests
3. The site requires authentication or maintains state across pages
1. **Efficient handling of dynamic content** without reloading the page.
2. **JavaScript actions before and after crawling** (e.g., clicking buttons or scrolling).
3. **State maintenance** for authenticated sessions or multi-step workflows.
4. **Faster sequential crawling**, as it avoids reopening tabs or reallocating resources.
Crawl4AI's `AsyncWebCrawler` class supports session-based crawling through the `session_id` parameter and related methods.
**Note:** Session-based crawling is ideal for sequential operations, not parallel tasks.
---
## Basic Concepts
Before diving into examples, let's review some key concepts:
Before diving into examples, here are some key concepts:
- **Session ID**: A unique identifier for a browsing session. Use the same `session_id` across multiple `arun` calls to maintain state.
- **JavaScript Execution**: Use the `js_code` parameter to execute JavaScript on the page, such as clicking a "Load More" button.
- **CSS Selectors**: Use these to target specific elements for extraction or interaction.
- **Extraction Strategy**: Define how to extract structured data from the page.
- **Wait Conditions**: Specify conditions to wait for before considering the page loaded.
- **Session ID**: A unique identifier for a browsing session. Use the same `session_id` across multiple requests to maintain state.
- **BrowserConfig & CrawlerRunConfig**: These configuration objects control browser settings and crawling behavior.
- **JavaScript Execution**: Use `js_code` to perform actions like clicking buttons.
- **CSS Selectors**: Target specific elements for interaction or data extraction.
- **Extraction Strategy**: Define rules to extract structured data.
- **Wait Conditions**: Specify conditions to wait for before proceeding.
---
## Example 1: Basic Session-Based Crawling
Let's start with a basic example of session-based crawling:
A simple example using session-based crawling:
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.cache_context import CacheMode
async def basic_session_crawl():
async with AsyncWebCrawler(verbose=True) as crawler:
session_id = "my_session"
async with AsyncWebCrawler() as crawler:
session_id = "dynamic_content_session"
url = "https://example.com/dynamic-content"
for page in range(3):
result = await crawler.arun(
config = CrawlerRunConfig(
url=url,
session_id=session_id,
js_code="document.querySelector('.load-more-button').click();" if page > 0 else None,
@@ -46,6 +55,7 @@ async def basic_session_crawl():
cache_mode=CacheMode.BYPASS
)
result = await crawler.arun(config=config)
print(f"Page {page + 1}: Found {result.extracted_content.count('.content-item')} items")
await crawler.crawler_strategy.kill_session(session_id)
@@ -53,17 +63,16 @@ async def basic_session_crawl():
asyncio.run(basic_session_crawl())
```
This example demonstrates:
1. Using a consistent `session_id` across multiple `arun` calls
2. Executing JavaScript to load more content after the first page
3. Using a CSS selector to extract specific content
4. Properly closing the session after crawling
This example shows:
1. Reusing the same `session_id` across multiple requests.
2. Executing JavaScript to load more content dynamically.
3. Properly closing the session to free resources.
---
## Advanced Technique 1: Custom Execution Hooks
Crawl4AI allows you to set custom hooks that execute at different stages of the crawling process. This is particularly useful for handling complex loading scenarios.
Here's an example that waits for new content to appear before proceeding:
Use custom hooks to handle complex scenarios, such as waiting for content to load dynamically:
```python
async def advanced_session_crawl_with_hooks():
@@ -75,202 +84,96 @@ async def advanced_session_crawl_with_hooks():
while True:
await page.wait_for_selector("li.commit-item h4")
commit = await page.query_selector("li.commit-item h4")
commit = await commit.evaluate("(element) => element.textContent")
commit = commit.strip()
commit = await commit.evaluate("(element) => element.textContent").strip()
if commit and commit != first_commit:
first_commit = commit
break
await asyncio.sleep(0.5)
except Exception as e:
print(f"Warning: New content didn't appear after JavaScript execution: {e}")
print(f"Warning: New content didn't appear: {e}")
async with AsyncWebCrawler(verbose=True) as crawler:
async with AsyncWebCrawler() as crawler:
session_id = "commit_session"
url = "https://github.com/example/repo/commits/main"
crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
url = "https://github.com/example/repo/commits/main"
session_id = "commit_session"
all_commits = []
js_next_page = """
const button = document.querySelector('a.pagination-next');
if (button) button.click();
"""
js_next_page = """document.querySelector('a.pagination-next').click();"""
for page in range(3):
result = await crawler.arun(
config = CrawlerRunConfig(
url=url,
session_id=session_id,
css_selector="li.commit-item",
js_code=js_next_page if page > 0 else None,
cache_mode=CacheMode.BYPASS,
js_only=page > 0
css_selector="li.commit-item",
js_only=page > 0,
cache_mode=CacheMode.BYPASS
)
commits = result.extracted_content.select("li.commit-item")
all_commits.extend(commits)
print(f"Page {page + 1}: Found {len(commits)} commits")
result = await crawler.arun(config=config)
print(f"Page {page + 1}: Found {len(result.extracted_content)} commits")
await crawler.crawler_strategy.kill_session(session_id)
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
asyncio.run(advanced_session_crawl_with_hooks())
```
This technique uses a custom `on_execution_started` hook to ensure new content has loaded before proceeding to the next step.
This technique ensures new content loads before the next action.
---
## Advanced Technique 2: Integrated JavaScript Execution and Waiting
Instead of using separate hooks, you can integrate the waiting logic directly into your JavaScript execution. This approach can be more concise and easier to manage for some scenarios.
Here's an example:
Combine JavaScript execution and waiting logic for concise handling of dynamic content:
```python
async def integrated_js_and_wait_crawl():
async with AsyncWebCrawler(verbose=True) as crawler:
url = "https://github.com/example/repo/commits/main"
async with AsyncWebCrawler() as crawler:
session_id = "integrated_session"
all_commits = []
url = "https://github.com/example/repo/commits/main"
js_next_page_and_wait = """
(async () => {
const getCurrentCommit = () => {
const commits = document.querySelectorAll('li.commit-item h4');
return commits.length > 0 ? commits[0].textContent.trim() : null;
};
const getCurrentCommit = () => document.querySelector('li.commit-item h4').textContent.trim();
const initialCommit = getCurrentCommit();
const button = document.querySelector('a.pagination-next');
if (button) button.click();
while (true) {
document.querySelector('a.pagination-next').click();
while (getCurrentCommit() === initialCommit) {
await new Promise(resolve => setTimeout(resolve, 100));
const newCommit = getCurrentCommit();
if (newCommit && newCommit !== initialCommit) {
break;
}
}
})();
"""
schema = {
"name": "Commit Extractor",
"baseSelector": "li.commit-item",
"fields": [
{
"name": "title",
"selector": "h4.commit-title",
"type": "text",
"transform": "strip",
},
],
}
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
for page in range(3):
result = await crawler.arun(
config = CrawlerRunConfig(
url=url,
session_id=session_id,
css_selector="li.commit-item",
extraction_strategy=extraction_strategy,
js_code=js_next_page_and_wait if page > 0 else None,
css_selector="li.commit-item",
js_only=page > 0,
cache_mode=CacheMode.BYPASS
)
commits = json.loads(result.extracted_content)
all_commits.extend(commits)
print(f"Page {page + 1}: Found {len(commits)} commits")
result = await crawler.arun(config=config)
print(f"Page {page + 1}: Found {len(result.extracted_content)} commits")
await crawler.crawler_strategy.kill_session(session_id)
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
asyncio.run(integrated_js_and_wait_crawl())
```
This approach combines the JavaScript for clicking the "next" button and waiting for new content to load into a single script.
## Advanced Technique 3: Using the `wait_for` Parameter
Crawl4AI provides a `wait_for` parameter that allows you to specify a condition to wait for before considering the page fully loaded. This can be particularly useful for dynamic content.
Here's an example:
```python
async def wait_for_parameter_crawl():
async with AsyncWebCrawler(verbose=True) as crawler:
url = "https://github.com/example/repo/commits/main"
session_id = "wait_for_session"
all_commits = []
js_next_page = """
const commits = document.querySelectorAll('li.commit-item h4');
if (commits.length > 0) {
window.lastCommit = commits[0].textContent.trim();
}
const button = document.querySelector('a.pagination-next');
if (button) button.click();
"""
wait_for = """() => {
const commits = document.querySelectorAll('li.commit-item h4');
if (commits.length === 0) return false;
const firstCommit = commits[0].textContent.trim();
return firstCommit !== window.lastCommit;
}"""
schema = {
"name": "Commit Extractor",
"baseSelector": "li.commit-item",
"fields": [
{
"name": "title",
"selector": "h4.commit-title",
"type": "text",
"transform": "strip",
},
],
}
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
for page in range(3):
result = await crawler.arun(
url=url,
session_id=session_id,
css_selector="li.commit-item",
extraction_strategy=extraction_strategy,
js_code=js_next_page if page > 0 else None,
wait_for=wait_for if page > 0 else None,
js_only=page > 0,
cache_mode=CacheMode.BYPASS
)
commits = json.loads(result.extracted_content)
all_commits.extend(commits)
print(f"Page {page + 1}: Found {len(commits)} commits")
await crawler.crawler_strategy.kill_session(session_id)
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
asyncio.run(wait_for_parameter_crawl())
```
This technique separates the JavaScript execution (clicking the "next" button) from the waiting condition, providing more flexibility and clarity in some scenarios.
---
## Best Practices for Session-Based Crawling
1. **Use Unique Session IDs**: Ensure each crawling session has a unique `session_id` to prevent conflicts.
2. **Close Sessions**: Always close sessions using `kill_session` when you're done to free up resources.
3. **Handle Errors**: Implement proper error handling to deal with unexpected situations during crawling.
4. **Respect Website Terms**: Ensure your crawling adheres to the website's terms of service and robots.txt file.
5. **Implement Delays**: Add appropriate delays between requests to avoid overwhelming the target server.
6. **Use Extraction Strategies**: Leverage `JsonCssExtractionStrategy` or other extraction strategies for structured data extraction.
7. **Optimize JavaScript**: Keep your JavaScript execution concise and efficient to improve crawling speed.
8. **Monitor Performance**: Keep an eye on memory usage and crawling speed, especially for long-running sessions.
1. **Unique Session IDs**: Assign descriptive and unique `session_id` values.
2. **Close Sessions**: Always clean up sessions with `kill_session` after use.
3. **Error Handling**: Anticipate and handle errors gracefully.
4. **Respect Websites**: Follow terms of service and robots.txt.
5. **Delays**: Add delays to avoid overwhelming servers.
6. **Optimize JavaScript**: Keep scripts concise for better performance.
7. **Monitor Resources**: Track memory and CPU usage for long sessions.
---
## Conclusion
Session-based crawling with Crawl4AI provides powerful capabilities for handling dynamic content and complex web applications. By leveraging session management, JavaScript execution, and waiting strategies, you can effectively crawl and extract data from a wide range of modern websites.
Remember to use these techniques responsibly and in compliance with website policies and ethical web scraping practices.
For more advanced usage and API details, refer to the Crawl4AI API documentation.
Session-based crawling in Crawl4AI is a robust solution for handling dynamic content and multi-step workflows. By combining session management, JavaScript execution, and structured extraction strategies, you can effectively navigate and extract data from modern web applications. Always adhere to ethical web scraping practices and respect website policies.

View File

@@ -1,74 +1,70 @@
# Session Management
### Session Management
Session management in Crawl4AI allows you to maintain state across multiple requests and handle complex multi-page crawling tasks, particularly useful for dynamic websites.
Session management in Crawl4AI is a powerful feature that allows you to maintain state across multiple requests, making it particularly suitable for handling complex multi-step crawling tasks. It enables you to reuse the same browser tab (or page object) across sequential actions and crawls, which is beneficial for:
## Basic Session Usage
- **Performing JavaScript actions before and after crawling.**
- **Executing multiple sequential crawls faster** without needing to reopen tabs or allocate memory repeatedly.
Use `session_id` to maintain state between requests:
**Note:** This feature is designed for sequential workflows and is not suitable for parallel operations.
---
#### Basic Session Usage
Use `BrowserConfig` and `CrawlerRunConfig` to maintain state with a `session_id`:
```python
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
async with AsyncWebCrawler() as crawler:
session_id = "my_session"
# Define configurations
config1 = CrawlerRunConfig(url="https://example.com/page1", session_id=session_id)
config2 = CrawlerRunConfig(url="https://example.com/page2", session_id=session_id)
# First request
result1 = await crawler.arun(
url="https://example.com/page1",
session_id=session_id
)
# Subsequent request using same session
result2 = await crawler.arun(
url="https://example.com/page2",
session_id=session_id
)
result1 = await crawler.arun(config=config1)
# Subsequent request using the same session
result2 = await crawler.arun(config=config2)
# Clean up when done
await crawler.crawler_strategy.kill_session(session_id)
```
## Dynamic Content with Sessions
---
Here's a real-world example of crawling GitHub commits across multiple pages:
#### Dynamic Content with Sessions
Here's an example of crawling GitHub commits across multiple pages while preserving session state:
```python
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.cache_context import CacheMode
async def crawl_dynamic_content():
async with AsyncWebCrawler(verbose=True) as crawler:
async with AsyncWebCrawler() as crawler:
session_id = "github_commits_session"
url = "https://github.com/microsoft/TypeScript/commits/main"
session_id = "typescript_commits_session"
all_commits = []
# Define navigation JavaScript
js_next_page = """
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
"""
# Define wait condition
wait_for = """() => {
const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
if (commits.length === 0) return false;
const firstCommit = commits[0].textContent.trim();
return firstCommit !== window.firstCommit;
}"""
# Define extraction schema
schema = {
"name": "Commit Extractor",
"baseSelector": "li.Box-sc-g0xbh4-0",
"fields": [
{
"name": "title",
"selector": "h4.markdown-title",
"type": "text",
"transform": "strip",
},
],
"fields": [{"name": "title", "selector": "h4.markdown-title", "type": "text"}],
}
extraction_strategy = JsonCssExtractionStrategy(schema)
# JavaScript and wait configurations
js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();"""
wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0"""
# Crawl multiple pages
for page in range(3):
result = await crawler.arun(
config = CrawlerRunConfig(
url=url,
session_id=session_id,
extraction_strategy=extraction_strategy,
@@ -78,6 +74,7 @@ async def crawl_dynamic_content():
cache_mode=CacheMode.BYPASS
)
result = await crawler.arun(config=config)
if result.success:
commits = json.loads(result.extracted_content)
all_commits.extend(commits)
@@ -88,46 +85,53 @@ async def crawl_dynamic_content():
return all_commits
```
## Session Best Practices
---
1. **Session Naming**:
```python
# Use descriptive session IDs
session_id = "login_flow_session"
session_id = "product_catalog_session"
```
#### Session Best Practices
1. **Descriptive Session IDs**:
Use meaningful names for session IDs to organize workflows:
```python
session_id = "login_flow_session"
session_id = "product_catalog_session"
```
2. **Resource Management**:
```python
try:
# Your crawling code
pass
finally:
# Always clean up sessions
await crawler.crawler_strategy.kill_session(session_id)
```
Always ensure sessions are cleaned up to free resources:
```python
try:
# Your crawling code here
pass
finally:
await crawler.crawler_strategy.kill_session(session_id)
```
3. **State Management**:
```python
# First page: login
result = await crawler.arun(
url="https://example.com/login",
session_id=session_id,
js_code="document.querySelector('form').submit();"
)
3. **State Maintenance**:
Reuse the session for subsequent actions within the same workflow:
```python
# Step 1: Login
login_config = CrawlerRunConfig(
url="https://example.com/login",
session_id=session_id,
js_code="document.querySelector('form').submit();"
)
await crawler.arun(config=login_config)
# Second page: verify login success
result = await crawler.arun(
url="https://example.com/dashboard",
session_id=session_id,
wait_for="css:.user-profile" # Wait for authenticated content
)
```
# Step 2: Verify login success
dashboard_config = CrawlerRunConfig(
url="https://example.com/dashboard",
session_id=session_id,
wait_for="css:.user-profile" # Wait for authenticated content
)
result = await crawler.arun(config=dashboard_config)
```
## Common Use Cases
---
1. **Authentication Flows**
2. **Pagination Handling**
3. **Form Submissions**
4. **Multi-step Processes**
5. **Dynamic Content Navigation**
#### Common Use Cases for Sessions
1. **Authentication Flows**: Login and interact with secured pages.
2. **Pagination Handling**: Navigate through multiple pages.
3. **Form Submissions**: Fill forms, submit, and process results.
4. **Multi-step Processes**: Complete workflows that span multiple actions.
5. **Dynamic Content Navigation**: Handle JavaScript-rendered or event-triggered content.

View File

@@ -45,13 +45,15 @@ if __name__ == "__main__":
### New Code (Recommended)
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode # Import CacheMode
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import CrawlerRunConfig
async def use_proxy():
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) # Use CacheMode in CrawlerRunConfig
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
cache_mode=CacheMode.BYPASS # New way
config=config # Pass the configuration object
)
print(len(result.markdown))
@@ -64,12 +66,12 @@ if __name__ == "__main__":
## Common Migration Patterns
Old Flag | New Mode
---------|----------
`bypass_cache=True` | `cache_mode=CacheMode.BYPASS`
`disable_cache=True` | `cache_mode=CacheMode.DISABLED`
`no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY`
`no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY`
| Old Flag | New Mode |
|-----------------------|---------------------------------|
| `bypass_cache=True` | `cache_mode=CacheMode.BYPASS` |
| `disable_cache=True` | `cache_mode=CacheMode.DISABLED`|
| `no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY` |
| `no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` |
## Suppressing Deprecation Warnings
If you need time to migrate, you can temporarily suppress deprecation warnings:

View File

@@ -1,68 +1,58 @@
# Content Selection
### Content Selection
Crawl4AI provides multiple ways to select and filter specific content from webpages. Learn how to precisely target the content you need.
## CSS Selectors
#### CSS Selectors
The simplest way to extract specific content:
Extract specific content using a `CrawlerRunConfig` with CSS selectors:
```python
# Extract specific content using CSS selector
result = await crawler.arun(
url="https://example.com",
css_selector=".main-article" # Target main article content
)
from crawl4ai.async_configs import CrawlerRunConfig
# Multiple selectors
result = await crawler.arun(
url="https://example.com",
css_selector="article h1, article .content" # Target heading and content
)
config = CrawlerRunConfig(css_selector=".main-article") # Target main article content
result = await crawler.arun(url="https://crawl4ai.com", config=config)
config = CrawlerRunConfig(css_selector="article h1, article .content") # Target heading and content
result = await crawler.arun(url="https://crawl4ai.com", config=config)
```
## Content Filtering
#### Content Filtering
Control what content is included or excluded:
Control content inclusion or exclusion with `CrawlerRunConfig`:
```python
result = await crawler.arun(
url="https://example.com",
# Content thresholds
config = CrawlerRunConfig(
word_count_threshold=10, # Minimum words per block
# Tag exclusions
excluded_tags=['form', 'header', 'footer', 'nav'],
# Link filtering
excluded_tags=['form', 'header', 'footer', 'nav'], # Excluded tags
exclude_external_links=True, # Remove external links
exclude_social_media_links=True, # Remove social media links
# Media filtering
exclude_external_images=True # Remove external images
)
result = await crawler.arun(url="https://crawl4ai.com", config=config)
```
## Iframe Content
#### Iframe Content
Process content inside iframes:
Process iframe content by enabling specific options in `CrawlerRunConfig`:
```python
result = await crawler.arun(
url="https://example.com",
process_iframes=True, # Extract iframe content
config = CrawlerRunConfig(
process_iframes=True, # Extract iframe content
remove_overlay_elements=True # Remove popups/modals that might block iframes
)
result = await crawler.arun(url="https://crawl4ai.com", config=config)
```
## Structured Content Selection
#### Structured Content Selection Using LLMs
### Using LLMs for Smart Selection
Use LLMs to intelligently extract specific types of content:
Leverage LLMs for intelligent content extraction:
```python
from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel
from typing import List
class ArticleContent(BaseModel):
title: str
@@ -70,28 +60,27 @@ class ArticleContent(BaseModel):
conclusion: str
strategy = LLMExtractionStrategy(
provider="ollama/nemotron", # Works with any supported LLM
provider="ollama/nemotron",
schema=ArticleContent.schema(),
instruction="Extract the main article title, key points, and conclusion"
)
result = await crawler.arun(
url="https://example.com",
extraction_strategy=strategy
)
config = CrawlerRunConfig(extraction_strategy=strategy)
result = await crawler.arun(url="https://crawl4ai.com", config=config)
article = json.loads(result.extracted_content)
```
### Pattern-Based Selection
#### Pattern-Based Selection
For repeated content patterns (like product listings, news feeds):
Extract content matching repetitive patterns:
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
schema = {
"name": "News Articles",
"baseSelector": "article.news-item", # Repeated element
"baseSelector": "article.news-item",
"fields": [
{"name": "headline", "selector": "h2", "type": "text"},
{"name": "summary", "selector": ".summary", "type": "text"},
@@ -108,51 +97,19 @@ schema = {
}
strategy = JsonCssExtractionStrategy(schema)
result = await crawler.arun(
url="https://example.com",
extraction_strategy=strategy
)
config = CrawlerRunConfig(extraction_strategy=strategy)
result = await crawler.arun(url="https://crawl4ai.com", config=config)
articles = json.loads(result.extracted_content)
```
## Domain-Based Filtering
#### Comprehensive Example
Control content based on domains:
Combine different selection methods using `CrawlerRunConfig`:
```python
result = await crawler.arun(
url="https://example.com",
exclude_domains=["ads.com", "tracker.com"],
exclude_social_media_domains=["facebook.com", "twitter.com"], # Custom social media domains to exclude
exclude_social_media_links=True
)
```
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
## Media Selection
Select specific types of media:
```python
result = await crawler.arun(url="https://example.com")
# Access different media types
images = result.media["images"] # List of image details
videos = result.media["videos"] # List of video details
audios = result.media["audios"] # List of audio details
# Image with metadata
for image in images:
print(f"URL: {image['src']}")
print(f"Alt text: {image['alt']}")
print(f"Description: {image['desc']}")
print(f"Relevance score: {image['score']}")
```
## Comprehensive Example
Here's how to combine different selection methods:
```python
async def extract_article_content(url: str):
# Define structured extraction
article_schema = {
@@ -163,37 +120,16 @@ async def extract_article_content(url: str):
{"name": "content", "selector": ".content", "type": "text"}
]
}
# Define LLM extraction
class ArticleAnalysis(BaseModel):
key_points: List[str]
sentiment: str
category: str
# Define configuration
config = CrawlerRunConfig(
extraction_strategy=JsonCssExtractionStrategy(article_schema),
word_count_threshold=10,
excluded_tags=['nav', 'footer'],
exclude_external_links=True
)
async with AsyncWebCrawler() as crawler:
# Get structured content
pattern_result = await crawler.arun(
url=url,
extraction_strategy=JsonCssExtractionStrategy(article_schema),
word_count_threshold=10,
excluded_tags=['nav', 'footer'],
exclude_external_links=True
)
# Get semantic analysis
analysis_result = await crawler.arun(
url=url,
extraction_strategy=LLMExtractionStrategy(
provider="ollama/nemotron",
schema=ArticleAnalysis.schema(),
instruction="Analyze the article content"
)
)
# Combine results
return {
"article": json.loads(pattern_result.extracted_content),
"analysis": json.loads(analysis_result.extracted_content),
"media": pattern_result.media
}
```
result = await crawler.arun(url=url, config=config)
return json.loads(result.extracted_content)
```

View File

@@ -1,136 +1,83 @@
# Content Filtering in Crawl4AI
This guide explains how to use content filtering strategies in Crawl4AI to extract the most relevant information from crawled web pages. You'll learn how to use the built-in `BM25ContentFilter` and how to create your own custom content filtering strategies.
This guide explains how to use content filtering strategies in Crawl4AI to extract the most relevant information from crawled web pages. You'll learn how to use the built-in `BM25ContentFilter` and how to create your own custom content filtering strategies.
## Relevance Content Filter
The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `PruningContentFilter` or `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks.
The `RelevanceContentFilter` is an abstract class providing a common interface for content filtering strategies. Specific algorithms, like `PruningContentFilter` or `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks.
## Pruning Content Filter
The `PruningContentFilter` is a tree-shaking algorithm that analyzes the HTML DOM structure and removes less relevant nodes based on various metrics like text density, link density, and tag importance. It evaluates each node using a composite scoring system and "prunes" nodes that fall below a certain threshold.
The `PruningContentFilter` removes less relevant nodes based on metrics like text density, link density, and tag importance. Nodes that fall below a defined threshold are pruned, leaving only high-value content.
### Usage
```python
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.content_filter_strategy import PruningContentFilter
async def filter_content(url):
async with AsyncWebCrawler() as crawler:
content_filter = PruningContentFilter(
min_word_threshold=5,
threshold_type='dynamic',
threshold=0.45
)
result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True)
if result.success:
print(f"Cleaned Markdown:\n{result.fit_markdown}")
config = CrawlerRunConfig(
content_filter=PruningContentFilter(
min_word_threshold=5,
threshold_type='dynamic',
threshold=0.45
),
fit_markdown=True # Activates markdown fitting
)
result = await crawler.arun(url="https://example.com", config=config)
if result.success:
print(f"Cleaned Markdown:\n{result.fit_markdown}")
```
### Parameters
- **`min_word_threshold`**: (Optional) Minimum number of words a node must contain to be considered relevant. Nodes with fewer words are automatically pruned.
- **`threshold_type`**: (Optional, default 'fixed') Controls how pruning thresholds are calculated:
- `'fixed'`: Uses a constant threshold value for all nodes
- `'dynamic'`: Adjusts threshold based on node characteristics like tag importance and text/link ratios
- **`threshold`**: (Optional, default 0.48) Base threshold value for node pruning:
- For fixed threshold: Nodes scoring below this value are removed
- For dynamic threshold: This value is adjusted based on node properties
- `'fixed'`: Uses a constant threshold value for all nodes.
- `'dynamic'`: Adjusts thresholds based on node properties (e.g., tag importance, text/link ratios).
- **`threshold`**: (Optional, default 0.48) Base threshold for pruning:
- Fixed: Nodes scoring below this value are removed.
- Dynamic: This value adjusts based on node characteristics.
### How It Works
The pruning algorithm evaluates each node using multiple metrics:
- Text density: Ratio of actual text to overall node content
- Link density: Proportion of text within links
- Tag importance: Weight based on HTML tag type (e.g., article, p, div)
- Content quality: Metrics like text length and structural importance
Nodes scoring below the threshold are removed, effectively "shaking" less relevant content from the DOM tree. This results in a cleaner document containing only the most relevant content blocks.
The algorithm is particularly effective for:
- Removing boilerplate content
- Eliminating navigation menus and sidebars
- Preserving main article content
- Maintaining document structure while removing noise
The algorithm evaluates each node using:
- **Text density**: Ratio of text to overall content.
- **Link density**: Proportion of text within links.
- **Tag importance**: Weights based on HTML tag type (e.g., `<article>`, `<p>`, `<div>`).
- **Content quality**: Metrics like text length and structural importance.
## BM25 Algorithm
The `BM25ContentFilter` uses the BM25 algorithm, a ranking function used in information retrieval to estimate the relevance of documents to a given search query. In Crawl4AI, this algorithm helps to identify and extract text chunks that are most relevant to the page's metadata or a user-specified query.
The `BM25ContentFilter` uses the BM25 algorithm to rank and extract text chunks based on relevance to a search query or page metadata.
### Usage
To use the `BM25ContentFilter`, initialize it and then pass it as the `extraction_strategy` parameter to the `arun` method of the crawler.
```python
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.content_filter_strategy import BM25ContentFilter
async def filter_content(url, query=None):
async with AsyncWebCrawler() as crawler:
content_filter = BM25ContentFilter(user_query=query)
result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering
if result.success:
print(f"Filtered Content (JSON):\n{result.extracted_content}")
print(f"\nFiltered Markdown:\n{result.fit_markdown}") # New field in CrawlResult object
print(f"\nFiltered HTML:\n{result.fit_html}") # New field in CrawlResult object. Note that raw HTML may have tags re-organized due to internal parsing.
else:
print("Error:", result.error_message)
config = CrawlerRunConfig(
content_filter=BM25ContentFilter(user_query="fruit nutrition health"),
fit_markdown=True # Activates markdown fitting
)
# Example usage:
asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health")) # with query
asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple")) # without query, metadata will be used as the query.
result = await crawler.arun(url="https://example.com", config=config)
if result.success:
print(f"Filtered Content:\n{result.extracted_content}")
print(f"\nFiltered Markdown:\n{result.fit_markdown}")
print(f"\nFiltered HTML:\n{result.fit_html}")
else:
print("Error:", result.error_message)
```
### Parameters
- **`user_query`**: (Optional) A string representing the search query. If not provided, the filter extracts relevant metadata (title, description, keywords) from the page and uses that as the query.
- **`bm25_threshold`**: (Optional, default 1.0) A float value that controls the threshold for relevance. Higher values result in stricter filtering, returning only the most relevant text chunks. Lower values result in more lenient filtering.
- **`user_query`**: (Optional) A string representing the search query. If not provided, the filter extracts metadata (title, description, keywords) and uses it as the query.
- **`bm25_threshold`**: (Optional, default 1.0) Threshold controlling relevance:
- Higher values return stricter, more relevant results.
- Lower values include more lenient filtering.
## Fit Markdown Flag
Setting the `fit_markdown` flag to `True` in the `arun` method activates the BM25 content filtering during the crawl. The `fit_markdown` parameter instructs the scraper to extract and clean the HTML, primarily to prepare for a Large Language Model that cannot process large amounts of data. Setting this flag not only improves the quality of the extracted content but also adds the filtered content to two new attributes in the returned `CrawlResult` object: `fit_markdown` and `fit_html`.
## Custom Content Filtering Strategies
You can create your own custom filtering strategies by inheriting from the `RelevantContentFilter` class and implementing the `filter_content` method. This allows you to tailor the filtering logic to your specific needs.
```python
from crawl4ai.content_filter_strategy import RelevantContentFilter
from bs4 import BeautifulSoup, Tag
from typing import List
class MyCustomFilter(RelevantContentFilter):
def filter_content(self, html: str) -> List[str]:
soup = BeautifulSoup(html, 'lxml')
# Implement custom filtering logic here
# Example: extract all paragraphs within divs with class "article-body"
filtered_paragraphs = []
for tag in soup.select("div.article-body p"):
if isinstance(tag, Tag):
filtered_paragraphs.append(str(tag)) # Add the cleaned HTML element.
return filtered_paragraphs
async def custom_filter_demo(url: str):
async with AsyncWebCrawler() as crawler:
custom_filter = MyCustomFilter()
result = await crawler.arun(url, extraction_strategy=custom_filter)
if result.success:
print(result.extracted_content)
```
This example demonstrates extracting paragraphs from a specific div class. You can customize this logic to implement different filtering strategies, use regular expressions, analyze text density, or apply other relevant techniques.
## Conclusion
Content filtering strategies provide a powerful way to refine the output of your crawls. By using `BM25ContentFilter` or creating custom strategies, you can focus on the most pertinent information and improve the efficiency of your data processing pipeline.

View File

@@ -1,124 +1,109 @@
# Download Handling in Crawl4AI
This guide explains how to use Crawl4AI to handle file downloads during crawling. You'll learn how to trigger downloads, specify download locations, and access downloaded files.
This guide explains how to use Crawl4AI to handle file downloads during crawling. You'll learn how to trigger downloads, specify download locations, and access downloaded files.
## Enabling Downloads
By default, Crawl4AI does not download files. To enable downloads, set the `accept_downloads` parameter to `True` in either the `AsyncWebCrawler` constructor or the `arun` method.
To enable downloads, set the `accept_downloads` parameter in the `BrowserConfig` object and pass it to the crawler.
```python
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, AsyncWebCrawler
async def main():
async with AsyncWebCrawler(accept_downloads=True) as crawler: # Globally enable downloads
config = BrowserConfig(accept_downloads=True) # Enable downloads globally
async with AsyncWebCrawler(config=config) as crawler:
# ... your crawling logic ...
asyncio.run(main())
```
Or, enable it for a specific crawl:
Or, enable it for a specific crawl by using `CrawlerRunConfig`:
```python
from crawl4ai.async_configs import CrawlerRunConfig
async def main():
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="...", accept_downloads=True)
config = CrawlerRunConfig(accept_downloads=True)
result = await crawler.arun(url="https://example.com", config=config)
# ...
```
## Specifying Download Location
You can specify the download directory using the `downloads_path` parameter. If not provided, Crawl4AI creates a "downloads" directory inside the `.crawl4ai` folder in your home directory.
Specify the download directory using the `downloads_path` attribute in the `BrowserConfig` object. If not provided, Crawl4AI defaults to creating a "downloads" directory inside the `.crawl4ai` folder in your home directory.
```python
from crawl4ai.async_configs import BrowserConfig
import os
from pathlib import Path
# ... inside your crawl function:
downloads_path = os.path.join(os.getcwd(), "my_downloads") # Custom download path
os.makedirs(downloads_path, exist_ok=True)
result = await crawler.arun(url="...", downloads_path=downloads_path, accept_downloads=True)
config = BrowserConfig(accept_downloads=True, downloads_path=downloads_path)
# ...
```
If you are setting it globally, provide the path to the AsyncWebCrawler:
```python
async def crawl_with_downloads(url: str, download_path: str):
async with AsyncWebCrawler(
accept_downloads=True,
downloads_path=download_path, # or set it on arun
verbose=True
) as crawler:
result = await crawler.arun(url=url) # you still need to enable downloads per call.
async def main():
async with AsyncWebCrawler(config=config) as crawler:
result = await crawler.arun(url="https://example.com")
# ...
```
## Triggering Downloads
Downloads are typically triggered by user interactions on a web page (e.g., clicking a download button). You can simulate these actions with the `js_code` parameter, injecting JavaScript code to be executed within the browser context. The `wait_for` parameter might also be crucial to allowing sufficient time for downloads to initiate before the crawler proceeds.
Downloads are typically triggered by user interactions on a web page, such as clicking a download button. Use `js_code` in `CrawlerRunConfig` to simulate these actions and `wait_for` to allow sufficient time for downloads to start.
```python
result = await crawler.arun(
url="https://www.python.org/downloads/",
from crawl4ai.async_configs import CrawlerRunConfig
config = CrawlerRunConfig(
js_code="""
// Find and click the first Windows installer link
const downloadLink = document.querySelector('a[href$=".exe"]');
if (downloadLink) {
downloadLink.click();
}
""",
wait_for=5 # Wait for 5 seconds for the download to start
wait_for=5 # Wait 5 seconds for the download to start
)
result = await crawler.arun(url="https://www.python.org/downloads/", config=config)
```
## Accessing Downloaded Files
Downloaded file paths are stored in the `downloaded_files` attribute of the returned `CrawlResult` object. This is a list of strings, with each string representing the absolute path to a downloaded file.
The `downloaded_files` attribute of the `CrawlResult` object contains paths to downloaded files.
```python
if result.downloaded_files:
print("Downloaded files:")
for file_path in result.downloaded_files:
print(f"- {file_path}")
# Perform operations with downloaded files, e.g., check file size
file_size = os.path.getsize(file_path)
print(f"- File size: {file_size} bytes")
else:
print("No files downloaded.")
```
## Example: Downloading Multiple Files
## Example: Downloading Multiple Files
```python
import asyncio
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import os
from pathlib import Path
from crawl4ai import AsyncWebCrawler
async def download_multiple_files(url: str, download_path: str):
async with AsyncWebCrawler(
accept_downloads=True,
downloads_path=download_path,
verbose=True
) as crawler:
result = await crawler.arun(
url=url,
config = BrowserConfig(accept_downloads=True, downloads_path=download_path)
async with AsyncWebCrawler(config=config) as crawler:
run_config = CrawlerRunConfig(
js_code="""
// Trigger multiple downloads (example)
const downloadLinks = document.querySelectorAll('a[download]'); // Or a more specific selector
for (const link of downloadLinks) {
link.click();
await new Promise(r => setTimeout(r, 2000)); // Add a small delay between clicks if needed
}
const downloadLinks = document.querySelectorAll('a[download]');
for (const link of downloadLinks) {
link.click();
await new Promise(r => setTimeout(r, 2000)); // Delay between clicks
}
""",
wait_for=10 # Adjust the timeout to match the expected time for all downloads to start
wait_for=10 # Wait for all downloads to start
)
result = await crawler.arun(url=url, config=run_config)
if result.downloaded_files:
print("Downloaded files:")
@@ -126,23 +111,19 @@ async def download_multiple_files(url: str, download_path: str):
print(f"- {file}")
else:
print("No files downloaded.")
# Example usage
# Usage
download_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
os.makedirs(download_path, exist_ok=True) # Create directory if it doesn't exist
os.makedirs(download_path, exist_ok=True)
asyncio.run(download_multiple_files("https://www.python.org/downloads/windows/", download_path))
```
## Important Considerations
- **Browser Context:** Downloads are managed within the browser context. Ensure your `js_code` correctly targets the download triggers on the specific web page.
- **Waiting:** Use `wait_for` to manage the timing of the crawl process if immediate download might not occur.
- **Error Handling:** Implement proper error handling to gracefully manage failed downloads or incorrect file paths.
- **Security:** Downloaded files should be scanned for potential security threats before use.
- **Browser Context:** Downloads are managed within the browser context. Ensure `js_code` correctly targets the download triggers on the webpage.
- **Timing:** Use `wait_for` in `CrawlerRunConfig` to manage download timing.
- **Error Handling:** Handle errors to manage failed downloads or incorrect paths gracefully.
- **Security:** Scan downloaded files for potential security threats before use.
This guide provides a foundation for handling downloads with Crawl4AI. You can adapt these techniques to manage downloads in various scenarios and integrate them into more complex crawling workflows.
This revised guide ensures consistency with the `Crawl4AI` codebase by using `BrowserConfig` and `CrawlerRunConfig` for all download-related configurations. Let me know if further adjustments are needed!

View File

@@ -1,6 +1,6 @@
# Output Formats
Crawl4AI provides multiple output formats to suit different needs, from raw HTML to structured data using LLM or pattern-based extraction.
Crawl4AI provides multiple output formats to suit different needs, ranging from raw HTML to structured data using LLM or pattern-based extraction, and versatile markdown outputs.
## Basic Formats
@@ -8,18 +8,20 @@ Crawl4AI provides multiple output formats to suit different needs, from raw HTML
result = await crawler.arun(url="https://example.com")
# Access different formats
raw_html = result.html # Original HTML
clean_html = result.cleaned_html # Sanitized HTML
markdown = result.markdown # Standard markdown
fit_md = result.fit_markdown # Most relevant content in markdown
raw_html = result.html # Original HTML
clean_html = result.cleaned_html # Sanitized HTML
markdown_v2 = result.markdown_v2 # Detailed markdown generation results
fit_md = result.markdown_v2.fit_markdown # Most relevant content in markdown
```
> **Note**: The `markdown_v2` property will soon be replaced by `markdown`. It is recommended to start transitioning to using `markdown` for new implementations.
## Raw HTML
Original, unmodified HTML from the webpage. Useful when you need to:
- Preserve the exact page structure
- Process HTML with your own tools
- Debug page issues
- Preserve the exact page structure.
- Process HTML with your own tools.
- Debug page issues.
```python
result = await crawler.arun(url="https://example.com")
@@ -29,167 +31,72 @@ print(result.html) # Complete HTML including headers, scripts, etc.
## Cleaned HTML
Sanitized HTML with unnecessary elements removed. Automatically:
- Removes scripts and styles
- Cleans up formatting
- Preserves semantic structure
- Removes scripts and styles.
- Cleans up formatting.
- Preserves semantic structure.
```python
result = await crawler.arun(
url="https://example.com",
config = CrawlerRunConfig(
excluded_tags=['form', 'header', 'footer'], # Additional tags to remove
keep_data_attributes=False # Remove data-* attributes
)
result = await crawler.arun(url="https://example.com", config=config)
print(result.cleaned_html)
```
## Standard Markdown
HTML converted to clean markdown format. Great for:
- Content analysis
- Documentation
- Readability
HTML converted to clean markdown format. This output is useful for:
- Content analysis.
- Documentation.
- Readability.
```python
result = await crawler.arun(
url="https://example.com",
include_links_on_markdown=True # Include links in markdown
config = CrawlerRunConfig(
markdown_generator=DefaultMarkdownGenerator(
options={"include_links": True} # Include links in markdown
)
)
print(result.markdown)
result = await crawler.arun(url="https://example.com", config=config)
print(result.markdown_v2.raw_markdown) # Standard markdown with links
```
## Fit Markdown
Most relevant content extracted and converted to markdown. Ideal for:
- Article extraction
- Main content focus
- Removing boilerplate
Extract and convert only the most relevant content into markdown format. Best suited for:
- Article extraction.
- Focusing on the main content.
- Removing boilerplate.
To generate `fit_markdown`, use a content filter like `PruningContentFilter`:
```python
result = await crawler.arun(url="https://example.com")
print(result.fit_markdown) # Only the main content
from crawl4ai.content_filter_strategy import PruningContentFilter
config = CrawlerRunConfig(
content_filter=PruningContentFilter(
threshold=0.7,
threshold_type="dynamic",
min_word_threshold=100
)
)
result = await crawler.arun(url="https://example.com", config=config)
print(result.markdown_v2.fit_markdown) # Extracted main content in markdown
```
## Structured Data Extraction
## Markdown with Citations
Crawl4AI offers two powerful approaches for structured data extraction:
### 1. LLM-Based Extraction
Use any LLM (OpenAI, HuggingFace, Ollama, etc.) to extract structured data with high accuracy:
Generate markdown that includes citations for links. This format is ideal for:
- Creating structured documentation.
- Including references for extracted content.
```python
from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy
class KnowledgeGraph(BaseModel):
entities: List[dict]
relationships: List[dict]
strategy = LLMExtractionStrategy(
provider="ollama/nemotron", # or "huggingface/...", "ollama/..."
api_token="your-token", # not needed for Ollama
schema=KnowledgeGraph.schema(),
instruction="Extract entities and relationships from the content"
config = CrawlerRunConfig(
markdown_generator=DefaultMarkdownGenerator(
options={"citations": True} # Enable citations
)
)
result = await crawler.arun(
url="https://example.com",
extraction_strategy=strategy
)
knowledge_graph = json.loads(result.extracted_content)
result = await crawler.arun(url="https://example.com", config=config)
print(result.markdown_v2.markdown_with_citations)
print(result.markdown_v2.references_markdown) # Citations section
```
### 2. Pattern-Based Extraction
For pages with repetitive patterns (e.g., product listings, article feeds), use JsonCssExtractionStrategy:
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
schema = {
"name": "Product Listing",
"baseSelector": ".product-card", # Repeated element
"fields": [
{"name": "title", "selector": "h2", "type": "text"},
{"name": "price", "selector": ".price", "type": "text"},
{"name": "description", "selector": ".desc", "type": "text"}
]
}
strategy = JsonCssExtractionStrategy(schema)
result = await crawler.arun(
url="https://example.com",
extraction_strategy=strategy
)
products = json.loads(result.extracted_content)
```
## Content Customization
### HTML to Text Options
Configure markdown conversion:
```python
result = await crawler.arun(
url="https://example.com",
html2text={
"escape_dot": False,
"body_width": 0,
"protect_links": True,
"unicode_snob": True
}
)
```
### Content Filters
Control what content is included:
```python
result = await crawler.arun(
url="https://example.com",
word_count_threshold=10, # Minimum words per block
exclude_external_links=True, # Remove external links
exclude_external_images=True, # Remove external images
excluded_tags=['form', 'nav'] # Remove specific HTML tags
)
```
## Comprehensive Example
Here's how to use multiple output formats together:
```python
async def crawl_content(url: str):
async with AsyncWebCrawler() as crawler:
# Extract main content with fit markdown
result = await crawler.arun(
url=url,
word_count_threshold=10,
exclude_external_links=True
)
# Get structured data using LLM
llm_result = await crawler.arun(
url=url,
extraction_strategy=LLMExtractionStrategy(
provider="ollama/nemotron",
schema=YourSchema.schema(),
instruction="Extract key information"
)
)
# Get repeated patterns (if any)
pattern_result = await crawler.arun(
url=url,
extraction_strategy=JsonCssExtractionStrategy(your_schema)
)
return {
"main_content": result.fit_markdown,
"structured_data": json.loads(llm_result.extracted_content),
"pattern_data": json.loads(pattern_result.extracted_content),
"media": result.media
}
```

View File

@@ -7,11 +7,13 @@ Crawl4AI provides powerful features for interacting with dynamic webpages, handl
### Basic Execution
```python
from crawl4ai.async_configs import CrawlerRunConfig
# Single JavaScript command
result = await crawler.arun(
url="https://example.com",
config = CrawlerRunConfig(
js_code="window.scrollTo(0, document.body.scrollHeight);"
)
result = await crawler.arun(url="https://example.com", config=config)
# Multiple commands
js_commands = [
@@ -19,10 +21,8 @@ js_commands = [
"document.querySelector('.load-more').click();",
"document.querySelector('#consent-button').click();"
]
result = await crawler.arun(
url="https://example.com",
js_code=js_commands
)
config = CrawlerRunConfig(js_code=js_commands)
result = await crawler.arun(url="https://example.com", config=config)
```
## Wait Conditions
@@ -32,10 +32,8 @@ result = await crawler.arun(
Wait for elements to appear:
```python
result = await crawler.arun(
url="https://example.com",
wait_for="css:.dynamic-content" # Wait for element with class 'dynamic-content'
)
config = CrawlerRunConfig(wait_for="css:.dynamic-content") # Wait for element with class 'dynamic-content'
result = await crawler.arun(url="https://example.com", config=config)
```
### JavaScript-Based Waiting
@@ -48,10 +46,8 @@ wait_condition = """() => {
return document.querySelectorAll('.item').length > 10;
}"""
result = await crawler.arun(
url="https://example.com",
wait_for=f"js:{wait_condition}"
)
config = CrawlerRunConfig(wait_for=f"js:{wait_condition}")
result = await crawler.arun(url="https://example.com", config=config)
# Wait for dynamic content to load
wait_for_content = """() => {
@@ -59,10 +55,8 @@ wait_for_content = """() => {
return content && content.innerText.length > 100;
}"""
result = await crawler.arun(
url="https://example.com",
wait_for=f"js:{wait_for_content}"
)
config = CrawlerRunConfig(wait_for=f"js:{wait_for_content}")
result = await crawler.arun(url="https://example.com", config=config)
```
## Handling Dynamic Content
@@ -72,18 +66,14 @@ result = await crawler.arun(
Handle infinite scroll or load more buttons:
```python
# Scroll and wait pattern
result = await crawler.arun(
url="https://example.com",
config = CrawlerRunConfig(
js_code=[
# Scroll to bottom
"window.scrollTo(0, document.body.scrollHeight);",
# Click load more if exists
"const loadMore = document.querySelector('.load-more'); if(loadMore) loadMore.click();"
"window.scrollTo(0, document.body.scrollHeight);", # Scroll to bottom
"const loadMore = document.querySelector('.load-more'); if(loadMore) loadMore.click();" # Click load more
],
# Wait for new content
wait_for="js:() => document.querySelectorAll('.item').length > previousCount"
wait_for="js:() => document.querySelectorAll('.item').length > previousCount" # Wait for new content
)
result = await crawler.arun(url="https://example.com", config=config)
```
### Form Interaction
@@ -92,17 +82,15 @@ Handle forms and inputs:
```python
js_form_interaction = """
// Fill form fields
document.querySelector('#search').value = 'search term';
// Submit form
document.querySelector('form').submit();
document.querySelector('#search').value = 'search term'; // Fill form fields
document.querySelector('form').submit(); // Submit form
"""
result = await crawler.arun(
url="https://example.com",
config = CrawlerRunConfig(
js_code=js_form_interaction,
wait_for="css:.results" # Wait for results to load
)
result = await crawler.arun(url="https://example.com", config=config)
```
## Timing Control
@@ -112,11 +100,11 @@ result = await crawler.arun(
Control timing of interactions:
```python
result = await crawler.arun(
url="https://example.com",
config = CrawlerRunConfig(
page_timeout=60000, # Page load timeout (ms)
delay_before_return_html=2.0, # Wait before capturing content
delay_before_return_html=2.0 # Wait before capturing content
)
result = await crawler.arun(url="https://example.com", config=config)
```
## Complex Interactions Example
@@ -124,43 +112,37 @@ result = await crawler.arun(
Here's an example of handling a dynamic page with multiple interactions:
```python
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
async def crawl_dynamic_content():
async with AsyncWebCrawler() as crawler:
# Initial page load
result = await crawler.arun(
url="https://example.com",
# Handle cookie consent
js_code="document.querySelector('.cookie-accept')?.click();",
config = CrawlerRunConfig(
js_code="document.querySelector('.cookie-accept')?.click();", # Handle cookie consent
wait_for="css:.main-content"
)
result = await crawler.arun(url="https://example.com", config=config)
# Load more content
session_id = "dynamic_session" # Keep session for multiple interactions
for page in range(3): # Load 3 pages of content
result = await crawler.arun(
url="https://example.com",
config = CrawlerRunConfig(
session_id=session_id,
js_code=[
# Scroll to bottom
"window.scrollTo(0, document.body.scrollHeight);",
# Store current item count
"window.previousCount = document.querySelectorAll('.item').length;",
# Click load more
"document.querySelector('.load-more')?.click();"
"window.scrollTo(0, document.body.scrollHeight);", # Scroll to bottom
"window.previousCount = document.querySelectorAll('.item').length;", # Store item count
"document.querySelector('.load-more')?.click();" # Click load more
],
# Wait for new items
wait_for="""() => {
const currentCount = document.querySelectorAll('.item').length;
return currentCount > window.previousCount;
}""",
# Only execute JS without reloading page
js_only=True if page > 0 else False
js_only=(page > 0) # Execute JS without reloading page for subsequent interactions
)
# Process content after each load
result = await crawler.arun(url="https://example.com", config=config)
print(f"Page {page + 1} items:", len(result.cleaned_html))
# Clean up session
await crawler.crawler_strategy.kill_session(session_id)
```
@@ -171,6 +153,7 @@ Combine page interaction with structured extraction:
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
from crawl4ai.async_configs import CrawlerRunConfig
# Pattern-based extraction after interaction
schema = {
@@ -182,20 +165,19 @@ schema = {
]
}
result = await crawler.arun(
url="https://example.com",
config = CrawlerRunConfig(
js_code="window.scrollTo(0, document.body.scrollHeight);",
wait_for="css:.item:nth-child(10)", # Wait for 10 items
extraction_strategy=JsonCssExtractionStrategy(schema)
)
result = await crawler.arun(url="https://example.com", config=config)
# Or use LLM to analyze dynamic content
class ContentAnalysis(BaseModel):
topics: List[str]
summary: str
result = await crawler.arun(
url="https://example.com",
config = CrawlerRunConfig(
js_code="document.querySelector('.show-more').click();",
wait_for="css:.full-content",
extraction_strategy=LLMExtractionStrategy(
@@ -204,4 +186,5 @@ result = await crawler.arun(
instruction="Analyze the full content"
)
)
```
result = await crawler.arun(url="https://example.com", config=config)
```

View File

@@ -2,31 +2,19 @@
This guide will walk you through using the Crawl4AI library to crawl web pages, local HTML files, and raw HTML strings. We'll demonstrate these capabilities using a Wikipedia page as an example.
## Table of Contents
- [Prefix-Based Input Handling in Crawl4AI](#prefix-based-input-handling-in-crawl4ai)
- [Table of Contents](#table-of-contents)
- [Crawling a Web URL](#crawling-a-web-url)
- [Crawling a Local HTML File](#crawling-a-local-html-file)
- [Crawling Raw HTML Content](#crawling-raw-html-content)
- [Complete Example](#complete-example)
- [**How It Works**](#how-it-works)
- [**Running the Example**](#running-the-example)
- [Conclusion](#conclusion)
## Crawling a Web URL
---
### Crawling a Web URL
To crawl a live web page, provide the URL starting with `http://` or `https://`.
To crawl a live web page, provide the URL starting with `http://` or `https://`, using a `CrawlerRunConfig` object:
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
async def crawl_web():
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(url="https://en.wikipedia.org/wiki/apple", bypass_cache=True)
config = CrawlerRunConfig(bypass_cache=True)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://en.wikipedia.org/wiki/apple", config=config)
if result.success:
print("Markdown Content:")
print(result.markdown)
@@ -36,20 +24,22 @@ async def crawl_web():
asyncio.run(crawl_web())
```
### Crawling a Local HTML File
## Crawling a Local HTML File
To crawl a local HTML file, prefix the file path with `file://`.
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
async def crawl_local_file():
local_file_path = "/path/to/apple.html" # Replace with your file path
file_url = f"file://{local_file_path}"
config = CrawlerRunConfig(bypass_cache=True)
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(url=file_url, bypass_cache=True)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url=file_url, config=config)
if result.success:
print("Markdown Content from Local File:")
print(result.markdown)
@@ -59,20 +49,22 @@ async def crawl_local_file():
asyncio.run(crawl_local_file())
```
### Crawling Raw HTML Content
## Crawling Raw HTML Content
To crawl raw HTML content, prefix the HTML string with `raw:`.
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
async def crawl_raw_html():
raw_html = "<html><body><h1>Hello, World!</h1></body></html>"
raw_html_url = f"raw:{raw_html}"
config = CrawlerRunConfig(bypass_cache=True)
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(url=raw_html_url, bypass_cache=True)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url=raw_html_url, config=config)
if result.success:
print("Markdown Content from Raw HTML:")
print(result.markdown)
@@ -84,152 +76,83 @@ asyncio.run(crawl_raw_html())
---
## Complete Example
# Complete Example
Below is a comprehensive script that:
1. **Crawls the Wikipedia page for "Apple".**
2. **Saves the HTML content to a local file (`apple.html`).**
3. **Crawls the local HTML file and verifies the markdown length matches the original crawl.**
4. **Crawls the raw HTML content from the saved file and verifies consistency.**
1. Crawls the Wikipedia page for "Apple."
2. Saves the HTML content to a local file (`apple.html`).
3. Crawls the local HTML file and verifies the markdown length matches the original crawl.
4. Crawls the raw HTML content from the saved file and verifies consistency.
```python
import os
import sys
import asyncio
from pathlib import Path
# Adjust the parent directory to include the crawl4ai module
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
async def main():
# Define the URL to crawl
wikipedia_url = "https://en.wikipedia.org/wiki/apple"
# Define the path to save the HTML file
# Save the file in the same directory as the script
script_dir = Path(__file__).parent
html_file_path = script_dir / "apple.html"
async with AsyncWebCrawler(verbose=True) as crawler:
async with AsyncWebCrawler() as crawler:
# Step 1: Crawl the Web URL
print("\n=== Step 1: Crawling the Wikipedia URL ===")
# Crawl the Wikipedia URL
result = await crawler.arun(url=wikipedia_url, bypass_cache=True)
# Check if crawling was successful
web_config = CrawlerRunConfig(bypass_cache=True)
result = await crawler.arun(url=wikipedia_url, config=web_config)
if not result.success:
print(f"Failed to crawl {wikipedia_url}: {result.error_message}")
return
# Save the HTML content to a local file
with open(html_file_path, 'w', encoding='utf-8') as f:
f.write(result.html)
print(f"Saved HTML content to {html_file_path}")
# Store the length of the generated markdown
web_crawl_length = len(result.markdown)
print(f"Length of markdown from web crawl: {web_crawl_length}\n")
# Step 2: Crawl from the Local HTML File
print("=== Step 2: Crawling from the Local HTML File ===")
# Construct the file URL with 'file://' prefix
file_url = f"file://{html_file_path.resolve()}"
# Crawl the local HTML file
local_result = await crawler.arun(url=file_url, bypass_cache=True)
# Check if crawling was successful
file_config = CrawlerRunConfig(bypass_cache=True)
local_result = await crawler.arun(url=file_url, config=file_config)
if not local_result.success:
print(f"Failed to crawl local file {file_url}: {local_result.error_message}")
return
# Store the length of the generated markdown from local file
local_crawl_length = len(local_result.markdown)
print(f"Length of markdown from local file crawl: {local_crawl_length}")
# Compare the lengths
assert web_crawl_length == local_crawl_length, (
f"Markdown length mismatch: Web crawl ({web_crawl_length}) != Local file crawl ({local_crawl_length})"
)
print("✅ Markdown length matches between web crawl and local file crawl.\n")
assert web_crawl_length == local_crawl_length, "Markdown length mismatch"
print("✅ Markdown length matches between web and local file crawl.\n")
# Step 3: Crawl Using Raw HTML Content
print("=== Step 3: Crawling Using Raw HTML Content ===")
# Read the HTML content from the saved file
with open(html_file_path, 'r', encoding='utf-8') as f:
raw_html_content = f.read()
# Prefix the raw HTML content with 'raw:'
raw_html_url = f"raw:{raw_html_content}"
# Crawl using the raw HTML content
raw_result = await crawler.arun(url=raw_html_url, bypass_cache=True)
# Check if crawling was successful
raw_config = CrawlerRunConfig(bypass_cache=True)
raw_result = await crawler.arun(url=raw_html_url, config=raw_config)
if not raw_result.success:
print(f"Failed to crawl raw HTML content: {raw_result.error_message}")
return
# Store the length of the generated markdown from raw HTML
raw_crawl_length = len(raw_result.markdown)
print(f"Length of markdown from raw HTML crawl: {raw_crawl_length}")
# Compare the lengths
assert web_crawl_length == raw_crawl_length, (
f"Markdown length mismatch: Web crawl ({web_crawl_length}) != Raw HTML crawl ({raw_crawl_length})"
)
print("✅ Markdown length matches between web crawl and raw HTML crawl.\n")
assert web_crawl_length == raw_crawl_length, "Markdown length mismatch"
print("✅ Markdown length matches between web and raw HTML crawl.\n")
print("All tests passed successfully!")
# Clean up by removing the saved HTML file
if html_file_path.exists():
os.remove(html_file_path)
print(f"Removed the saved HTML file: {html_file_path}")
# Run the main function
if __name__ == "__main__":
asyncio.run(main())
```
### **How It Works**
1. **Step 1: Crawl the Web URL**
- Crawls `https://en.wikipedia.org/wiki/apple`.
- Saves the HTML content to `apple.html`.
- Records the length of the generated markdown.
2. **Step 2: Crawl from the Local HTML File**
- Uses the `file://` prefix to crawl `apple.html`.
- Ensures the markdown length matches the original web crawl.
3. **Step 3: Crawl Using Raw HTML Content**
- Reads the HTML from `apple.html`.
- Prefixes it with `raw:` and crawls.
- Verifies the markdown length matches the previous results.
4. **Cleanup**
- Deletes the `apple.html` file after testing.
### **Running the Example**
1. **Save the Script:**
- Save the above code as `test_crawl4ai.py` in your project directory.
2. **Execute the Script:**
- Run the script using:
```bash
python test_crawl4ai.py
```
3. **Observe the Output:**
- The script will print logs detailing each step.
- Assertions ensure consistency across different crawling methods.
- Upon success, it confirms that all markdown lengths match.
---
## Conclusion
With the new prefix-based input handling in **Crawl4AI**, you can effortlessly crawl web URLs, local HTML files, and raw HTML strings using a unified `url` parameter. This enhancement simplifies the API usage and provides greater flexibility for diverse crawling scenarios.
# Conclusion
With the unified `url` parameter and prefix-based handling in **Crawl4AI**, you can seamlessly handle web URLs, local HTML files, and raw HTML content. Use `CrawlerRunConfig` for flexible and consistent configuration in all scenarios.

View File

@@ -1,49 +1,66 @@
# Quick Start Guide 🚀
Welcome to the Crawl4AI Quickstart Guide! In this tutorial, we'll walk you through the basic usage of Crawl4AI with a friendly and humorous tone. We'll cover everything from basic usage to advanced features like chunking and extraction strategies, all with the power of asynchronous programming. Let's dive in! 🌟
Welcome to the Crawl4AI Quickstart Guide! In this tutorial, we'll walk you through the basic usage of Crawl4AI, covering everything from initial setup to advanced features like chunking and extraction strategies, using asynchronous programming. Let's dive in! 🌟
---
## Getting Started 🛠️
First, let's import the necessary modules and create an instance of `AsyncWebCrawler`. We'll use an async context manager, which handles the setup and teardown of the crawler for us.
Set up your environment with `BrowserConfig` and create an `AsyncWebCrawler` instance.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig
async def main():
async with AsyncWebCrawler(verbose=True) as crawler:
# We'll add our crawling code here
browser_config = BrowserConfig(verbose=True)
async with AsyncWebCrawler(config=browser_config) as crawler:
# Add your crawling logic here
pass
if __name__ == "__main__":
asyncio.run(main())
```
---
### Basic Usage
Simply provide a URL and let Crawl4AI do the magic!
Provide a URL and let Crawl4AI do the work!
```python
from crawl4ai.async_configs import CrawlerRunConfig
async def main():
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(url="https://www.nbcnews.com/business")
browser_config = BrowserConfig(verbose=True)
crawl_config = CrawlerRunConfig(url="https://www.nbcnews.com/business")
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(config=crawl_config)
print(f"Basic crawl result: {result.markdown[:500]}") # Print first 500 characters
asyncio.run(main())
if __name__ == "__main__":
asyncio.run(main())
```
---
### Taking Screenshots 📸
Capture screenshots of web pages easily:
Capture and save webpage screenshots with `CrawlerRunConfig`:
```python
from crawl4ai.async_configs import CacheMode
async def capture_and_save_screenshot(url: str, output_path: str):
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
url=url,
screenshot=True,
cache_mode=CacheMode.BYPASS
)
browser_config = BrowserConfig(verbose=True)
crawl_config = CrawlerRunConfig(
url=url,
screenshot=True,
cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(config=crawl_config)
if result.success and result.screenshot:
import base64
@@ -55,243 +72,101 @@ async def capture_and_save_screenshot(url: str, output_path: str):
print("Failed to capture screenshot")
```
---
### Browser Selection 🌐
Crawl4AI supports multiple browser engines. Here's how to use different browsers:
Choose from multiple browser engines using `BrowserConfig`:
```python
from crawl4ai.async_configs import BrowserConfig
# Use Firefox
async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless=True) as crawler:
result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS)
firefox_config = BrowserConfig(browser_type="firefox", verbose=True, headless=True)
async with AsyncWebCrawler(config=firefox_config) as crawler:
result = await crawler.arun(config=CrawlerRunConfig(url="https://www.example.com"))
# Use WebKit
async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless=True) as crawler:
result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS)
webkit_config = BrowserConfig(browser_type="webkit", verbose=True, headless=True)
async with AsyncWebCrawler(config=webkit_config) as crawler:
result = await crawler.arun(config=CrawlerRunConfig(url="https://www.example.com"))
# Use Chromium (default)
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS)
chromium_config = BrowserConfig(verbose=True, headless=True)
async with AsyncWebCrawler(config=chromium_config) as crawler:
result = await crawler.arun(config=CrawlerRunConfig(url="https://www.example.com"))
```
---
### User Simulation 🎭
Simulate real user behavior to avoid detection:
Simulate real user behavior to bypass detection:
```python
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
result = await crawler.arun(
url="YOUR-URL-HERE",
cache_mode=CacheMode.BYPASS,
simulate_user=True, # Causes random mouse movements and clicks
override_navigator=True # Makes the browser appear more like a real user
)
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
browser_config = BrowserConfig(verbose=True, headless=True)
crawl_config = CrawlerRunConfig(
url="YOUR-URL-HERE",
cache_mode=CacheMode.BYPASS,
simulate_user=True, # Random mouse movements and clicks
override_navigator=True # Makes the browser appear like a real user
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(config=crawl_config)
```
---
### Understanding Parameters 🧠
By default, Crawl4AI caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.
Explore caching and forcing fresh crawls:
```python
async def main():
async with AsyncWebCrawler(verbose=True) as crawler:
# First crawl (caches the result)
result1 = await crawler.arun(url="https://www.nbcnews.com/business")
browser_config = BrowserConfig(verbose=True)
async with AsyncWebCrawler(config=browser_config) as crawler:
# First crawl (uses cache)
result1 = await crawler.arun(config=CrawlerRunConfig(url="https://www.nbcnews.com/business"))
print(f"First crawl result: {result1.markdown[:100]}...")
# Force to crawl again
result2 = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS)
# Force fresh crawl
result2 = await crawler.arun(
config=CrawlerRunConfig(url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS)
)
print(f"Second crawl result: {result2.markdown[:100]}...")
asyncio.run(main())
if __name__ == "__main__":
asyncio.run(main())
```
---
### Adding a Chunking Strategy 🧩
Let's add a chunking strategy: `RegexChunking`! This strategy splits the text based on a given regex pattern.
Split content into chunks using `RegexChunking`:
```python
from crawl4ai.chunking_strategy import RegexChunking
async def main():
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
chunking_strategy=RegexChunking(patterns=["\n\n"])
)
browser_config = BrowserConfig(verbose=True)
crawl_config = CrawlerRunConfig(
url="https://www.nbcnews.com/business",
chunking_strategy=RegexChunking(patterns=["\n\n"])
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(config=crawl_config)
print(f"RegexChunking result: {result.extracted_content[:200]}...")
asyncio.run(main())
if __name__ == "__main__":
asyncio.run(main())
```
### Using LLMExtractionStrategy with Different Providers 🤖
---
Crawl4AI supports multiple LLM providers for extraction:
### Advanced Features and Configurations
```python
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field
class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
# OpenAI
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
# Hugging Face
await extract_structured_data_using_llm(
"huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct",
os.getenv("HUGGINGFACE_API_KEY")
)
# Ollama
await extract_structured_data_using_llm("ollama/llama3.2")
# With custom headers
custom_headers = {
"Authorization": "Bearer your-custom-token",
"X-Custom-Header": "Some-Value"
}
await extract_structured_data_using_llm(extra_headers=custom_headers)
```
### Knowledge Graph Generation 🕸️
Generate knowledge graphs from web content:
```python
from pydantic import BaseModel
from typing import List
class Entity(BaseModel):
name: str
description: str
class Relationship(BaseModel):
entity1: Entity
entity2: Entity
description: str
relation_type: str
class KnowledgeGraph(BaseModel):
entities: List[Entity]
relationships: List[Relationship]
extraction_strategy = LLMExtractionStrategy(
provider='openai/gpt-4o-mini',
api_token=os.getenv('OPENAI_API_KEY'),
schema=KnowledgeGraph.model_json_schema(),
extraction_type="schema",
instruction="Extract entities and relationships from the given text."
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://paulgraham.com/love.html",
cache_mode=CacheMode.BYPASS,
extraction_strategy=extraction_strategy
)
```
### Advanced Session-Based Crawling with Dynamic Content 🔄
For modern web applications with dynamic content loading, here's how to handle pagination and content updates:
```python
async def crawl_dynamic_content():
async with AsyncWebCrawler(verbose=True) as crawler:
url = "https://github.com/microsoft/TypeScript/commits/main"
session_id = "typescript_commits_session"
js_next_page = """
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
"""
wait_for = """() => {
const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
if (commits.length === 0) return false;
const firstCommit = commits[0].textContent.trim();
return firstCommit !== window.firstCommit;
}"""
schema = {
"name": "Commit Extractor",
"baseSelector": "li.Box-sc-g0xbh4-0",
"fields": [
{
"name": "title",
"selector": "h4.markdown-title",
"type": "text",
"transform": "strip",
},
],
}
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
for page in range(3): # Crawl 3 pages
result = await crawler.arun(
url=url,
session_id=session_id,
css_selector="li.Box-sc-g0xbh4-0",
extraction_strategy=extraction_strategy,
js_code=js_next_page if page > 0 else None,
wait_for=wait_for if page > 0 else None,
js_only=page > 0,
cache_mode=CacheMode.BYPASS,
headless=False,
)
await crawler.crawler_strategy.kill_session(session_id)
```
### Handling Overlays and Fitting Content 📏
Remove overlay elements and fit content appropriately:
```python
async with AsyncWebCrawler(headless=False) as crawler:
result = await crawler.arun(
url="your-url-here",
cache_mode=CacheMode.BYPASS,
word_count_threshold=10,
remove_overlay_elements=True,
screenshot=True
)
```
## Performance Comparison 🏎️
Crawl4AI offers impressive performance compared to other solutions:
```python
# Firecrawl comparison
from firecrawl import FirecrawlApp
app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
start = time.time()
scrape_status = app.scrape_url(
'https://www.nbcnews.com/business',
params={'formats': ['markdown', 'html']}
)
end = time.time()
# Crawl4AI comparison
async with AsyncWebCrawler() as crawler:
start = time.time()
result = await crawler.arun(
url="https://www.nbcnews.com/business",
word_count_threshold=0,
cache_mode=CacheMode.BYPASS,
verbose=False,
)
end = time.time()
```
Note: Performance comparisons should be conducted in environments with stable and fast internet connections for accurate results.
## Congratulations! 🎉
You've made it through the updated Crawl4AI Quickstart Guide! Now you're equipped with even more powerful features to crawl the web asynchronously like a pro! 🕸️
Happy crawling! 🚀
For advanced examples (LLM strategies, knowledge graphs, pagination handling), ensure all code aligns with the `BrowserConfig` and `CrawlerRunConfig` pattern shown above.

View File

@@ -4,16 +4,21 @@ This guide covers the basics of web crawling with Crawl4AI. You'll learn how to
## Basic Usage
Here's the simplest way to crawl a webpage:
Set up a simple crawl using `BrowserConfig` and `CrawlerRunConfig`:
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
async def main():
async with AsyncWebCrawler() as crawler:
browser_config = BrowserConfig() # Default browser configuration
run_config = CrawlerRunConfig() # Default crawl run configuration
async with AsyncWebCrawler(browser_config=browser_config) as crawler:
result = await crawler.arun(
url="https://example.com"
url="https://example.com",
config=run_config
)
print(result.markdown) # Print clean markdown content
@@ -26,7 +31,10 @@ if __name__ == "__main__":
The `arun()` method returns a `CrawlResult` object with several useful properties. Here's a quick overview (see [CrawlResult](../api/crawl-result.md) for complete details):
```python
result = await crawler.arun(url="https://example.com", fit_markdown=True)
result = await crawler.arun(
url="https://example.com",
config=CrawlerRunConfig(fit_markdown=True)
)
# Different content formats
print(result.html) # Raw HTML
@@ -45,16 +53,20 @@ print(result.links) # Dictionary of internal and external links
## Adding Basic Options
Customize your crawl with these common options:
Customize your crawl using `CrawlerRunConfig`:
```python
result = await crawler.arun(
url="https://example.com",
run_config = CrawlerRunConfig(
word_count_threshold=10, # Minimum words per content block
exclude_external_links=True, # Remove external links
remove_overlay_elements=True, # Remove popups/modals
process_iframes=True # Process iframe content
)
result = await crawler.arun(
url="https://example.com",
config=run_config
)
```
## Handling Errors
@@ -62,7 +74,9 @@ result = await crawler.arun(
Always check if the crawl was successful:
```python
result = await crawler.arun(url="https://example.com")
run_config = CrawlerRunConfig()
result = await crawler.arun(url="https://example.com", config=run_config)
if not result.success:
print(f"Crawl failed: {result.error_message}")
print(f"Status code: {result.status_code}")
@@ -70,36 +84,45 @@ if not result.success:
## Logging and Debugging
Enable verbose mode for detailed logging:
Enable verbose logging in `BrowserConfig`:
```python
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(url="https://example.com")
browser_config = BrowserConfig(verbose=True)
async with AsyncWebCrawler(browser_config=browser_config) as crawler:
run_config = CrawlerRunConfig()
result = await crawler.arun(url="https://example.com", config=run_config)
```
## Complete Example
Here's a more comprehensive example showing common usage patterns:
Here's a more comprehensive example demonstrating common usage patterns:
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
async def main():
async with AsyncWebCrawler(verbose=True) as crawler:
browser_config = BrowserConfig(verbose=True)
run_config = CrawlerRunConfig(
# Content filtering
word_count_threshold=10,
excluded_tags=['form', 'header'],
exclude_external_links=True,
# Content processing
process_iframes=True,
remove_overlay_elements=True,
# Cache control
cache_mode=CacheMode.ENABLED # Use cache if available
)
async with AsyncWebCrawler(browser_config=browser_config) as crawler:
result = await crawler.arun(
url="https://example.com",
# Content filtering
word_count_threshold=10,
excluded_tags=['form', 'header'],
exclude_external_links=True,
# Content processing
process_iframes=True,
remove_overlay_elements=True,
# Cache control
cache_mode=CacheMode.ENABLE # Use cache if available
config=run_config
)
if result.success:

View File

@@ -37,11 +37,11 @@ Heres how to turn it on:
```python
crawler = AsyncPlaywrightCrawlerStrategy(
text_only=True # Set this to True to enable text-only crawling
text_mode=True # Set this to True to enable text-only crawling
)
```
When `text_only=True`, the crawler automatically:
When `text_mode=True`, the crawler automatically:
- Disables GPU processing.
- Blocks image and JavaScript resources.
- Reduces the viewport size to 800x600 (you can override this with `viewport_width` and `viewport_height`).

View File

@@ -32,7 +32,7 @@ async def test_browser_config_object():
async def test_browser_performance_config():
"""Test browser configurations focused on performance"""
browser_config = BrowserConfig(
text_only=True,
text_mode=True,
light_mode=True,
extra_args=['--disable-gpu', '--disable-software-rasterizer'],
ignore_https_errors=True,