Release prep (#749)
* fix: Update export of URLPatternFilter * chore: Add dependency for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependency list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependencies * feat: Make PyPDF2 a conditional dependency * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
This commit is contained in:
@@ -24,6 +24,14 @@ We would like to thank the following people for their contributions to Crawl4AI:
|
||||
- [NanmiCoder](https://github.com/NanmiCoder) - fix: crawler strategy exception handling and fixes [#271](https://github.com/unclecode/crawl4ai/pull/271)
|
||||
- [paulokuong](https://github.com/paulokuong) - fix: RAWL4_AI_BASE_DIRECTORY should be Path object instead of string [#298](https://github.com/unclecode/crawl4ai/pull/298)
|
||||
|
||||
#### Feb-Alpha-1
|
||||
- [sufianuddin](https://github.com/sufianuddin) - fix: [Documentation for JsonCssExtractionStrategy](https://github.com/unclecode/crawl4ai/issues/651)
|
||||
- [tautikAg](https://github.com/tautikAg) - fix: [Markdown output has incorrect spacing](https://github.com/unclecode/crawl4ai/issues/599)
|
||||
- [cardit1](https://github.com/cardit1) - fix: ['AsyncPlaywrightCrawlerStrategy' object has no attribute 'downloads_path'](https://github.com/unclecode/crawl4ai/issues/585)
|
||||
- [dmurat](https://github.com/dmurat) - fix: [ Incorrect rendering of inline code inside of links ](https://github.com/unclecode/crawl4ai/issues/583)
|
||||
- [Sparshsing](https://github.com/Sparshsing) - fix: [Relative Urls in the webpage not extracted properly ](https://github.com/unclecode/crawl4ai/issues/570)
|
||||
|
||||
|
||||
|
||||
## Other Contributors
|
||||
|
||||
@@ -31,6 +39,11 @@ We would like to thank the following people for their contributions to Crawl4AI:
|
||||
- [Shiv Kumar](https://github.com/shivkumar0757)
|
||||
- [QIN2DIM](https://github.com/QIN2DIM)
|
||||
|
||||
#### Typo fixes
|
||||
- [ssoydan](https://github.com/ssoydan)
|
||||
- [Darshan](https://github.com/Darshan2104)
|
||||
- [tuhinmallick](https://github.com/tuhinmallick)
|
||||
|
||||
## Acknowledgements
|
||||
|
||||
We also want to thank all the users who have reported bugs, suggested features, or helped in any other way to make Crawl4AI better.
|
||||
|
||||
@@ -318,9 +318,8 @@ async def main():
|
||||
url="https://docs.micronaut.io/4.7.6/guide/",
|
||||
config=run_config
|
||||
)
|
||||
print(len(result.markdown))
|
||||
print(len(result.fit_markdown))
|
||||
print(len(result.markdown_v2.fit_markdown))
|
||||
print(len(result.markdown.raw_markdown))
|
||||
print(len(result.markdown.fit_markdown))
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
|
||||
@@ -7,7 +7,7 @@ from contextlib import asynccontextmanager
|
||||
import logging
|
||||
import json # Added for serialization/deserialization
|
||||
from .utils import ensure_content_dirs, generate_content_hash
|
||||
from .models import CrawlResult, MarkdownGenerationResult
|
||||
from .models import CrawlResult, MarkdownGenerationResult, StringCompatibleMarkdown
|
||||
import aiofiles
|
||||
from .utils import VersionManager
|
||||
from .async_logger import AsyncLogger
|
||||
@@ -336,12 +336,17 @@ class AsyncDatabaseManager:
|
||||
except json.JSONDecodeError:
|
||||
# Very UGLY, never mention it to me please
|
||||
if field == "markdown" and isinstance(row_dict[field], str):
|
||||
row_dict[field] = row_dict[field]
|
||||
row_dict[field] = MarkdownGenerationResult(
|
||||
raw_markdown=row_dict[field] or "",
|
||||
markdown_with_citations="",
|
||||
references_markdown="",
|
||||
fit_markdown="",
|
||||
fit_html="",
|
||||
)
|
||||
else:
|
||||
row_dict[field] = {}
|
||||
|
||||
if isinstance(row_dict["markdown"], Dict):
|
||||
row_dict["markdown_v2"] = row_dict["markdown"]
|
||||
if row_dict["markdown"].get("raw_markdown"):
|
||||
row_dict["markdown"] = row_dict["markdown"]["raw_markdown"]
|
||||
|
||||
@@ -358,7 +363,7 @@ class AsyncDatabaseManager:
|
||||
# Remove any fields not in CrawlResult model
|
||||
valid_fields = CrawlResult.__annotations__.keys()
|
||||
filtered_dict = {k: v for k, v in row_dict.items() if k in valid_fields}
|
||||
|
||||
filtered_dict["markdown"] = row_dict["markdown"]
|
||||
return CrawlResult(**filtered_dict)
|
||||
|
||||
try:
|
||||
@@ -384,14 +389,14 @@ class AsyncDatabaseManager:
|
||||
}
|
||||
|
||||
try:
|
||||
if isinstance(result.markdown, MarkdownGenerationResult):
|
||||
if isinstance(result.markdown, StringCompatibleMarkdown):
|
||||
content_map["markdown"] = (
|
||||
result.markdown.model_dump_json(),
|
||||
result.markdown,
|
||||
"markdown",
|
||||
)
|
||||
elif hasattr(result, "markdown_v2"):
|
||||
elif isinstance(result.markdown, MarkdownGenerationResult):
|
||||
content_map["markdown"] = (
|
||||
result.markdown_v2.model_dump_json(),
|
||||
result.markdown.model_dump_json(),
|
||||
"markdown",
|
||||
)
|
||||
elif isinstance(result.markdown, str):
|
||||
|
||||
@@ -362,7 +362,7 @@ class AsyncWebCrawler:
|
||||
self.logger.info(
|
||||
message="Switch proxy: {proxy}",
|
||||
tag="PROXY",
|
||||
params={"proxy": next_proxy.get("server")},
|
||||
params={"proxy": next_proxy.server},
|
||||
)
|
||||
config.proxy_config = next_proxy
|
||||
# config = config.clone(proxy_config=next_proxy)
|
||||
@@ -581,8 +581,6 @@ class AsyncWebCrawler:
|
||||
# html2text_options=kwargs.get('html2text', {})
|
||||
)
|
||||
)
|
||||
markdown_v2 = markdown_result
|
||||
markdown = sanitize_input_encode(markdown_result.raw_markdown)
|
||||
|
||||
# Log processing completion
|
||||
self.logger.info(
|
||||
@@ -611,11 +609,11 @@ class AsyncWebCrawler:
|
||||
content_format = "markdown"
|
||||
|
||||
content = {
|
||||
"markdown": markdown,
|
||||
"markdown": markdown_result.raw_markdown,
|
||||
"html": html,
|
||||
"cleaned_html": cleaned_html,
|
||||
"fit_markdown": markdown_result.raw_markdown,
|
||||
}.get(content_format, markdown)
|
||||
"fit_markdown": markdown_result.fit_markdown,
|
||||
}.get(content_format, markdown_result.raw_markdown)
|
||||
|
||||
# Use IdentityChunking for HTML input, otherwise use provided chunking strategy
|
||||
chunking = (
|
||||
@@ -649,10 +647,7 @@ class AsyncWebCrawler:
|
||||
url=url,
|
||||
html=html,
|
||||
cleaned_html=cleaned_html,
|
||||
markdown_v2=markdown_v2,
|
||||
markdown=markdown,
|
||||
fit_markdown=markdown_result.fit_markdown,
|
||||
fit_html=markdown_result.fit_html,
|
||||
markdown=markdown_result,
|
||||
media=media,
|
||||
links=links,
|
||||
metadata=metadata,
|
||||
|
||||
@@ -633,12 +633,12 @@ class BrowserManager:
|
||||
# Check if there is value for crawlerRunConfig.proxy_config set add that to context
|
||||
if crawlerRunConfig.proxy_config:
|
||||
proxy_settings = {
|
||||
"server": crawlerRunConfig.proxy_config.get("server"),
|
||||
"server": crawlerRunConfig.proxy_config.server,
|
||||
}
|
||||
if crawlerRunConfig.proxy_config.get("username"):
|
||||
if crawlerRunConfig.proxy_config.username:
|
||||
proxy_settings.update({
|
||||
"username": crawlerRunConfig.proxy_config.get("username"),
|
||||
"password": crawlerRunConfig.proxy_config.get("password"),
|
||||
"username": crawlerRunConfig.proxy_config.username,
|
||||
"password": crawlerRunConfig.proxy_config.password,
|
||||
})
|
||||
context_settings["proxy"] = proxy_settings
|
||||
|
||||
|
||||
@@ -19,6 +19,8 @@ from crawl4ai import (
|
||||
from litellm import completion
|
||||
from pathlib import Path
|
||||
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
|
||||
def get_global_config() -> dict:
|
||||
config_dir = Path.home() / ".crawl4ai"
|
||||
config_file = config_dir / "global.yml"
|
||||
@@ -288,7 +290,7 @@ For more documentation visit: https://github.com/unclecode/crawl4ai
|
||||
@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
|
||||
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
|
||||
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
|
||||
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "markdown-v2", "md", "md-fit"]), default="all")
|
||||
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
|
||||
@click.option("--bypass-cache", is_flag=True, default = True, help="Bypass cache when crawling")
|
||||
@click.option("--question", "-q", help="Ask a question about the crawled content")
|
||||
@click.option("--verbose", "-v", is_flag=True)
|
||||
@@ -351,9 +353,8 @@ def cli(url: str, example: bool, browser_config: str, crawler_config: str, filte
|
||||
raise click.ClickException("LLM provider and API token are required for LLM extraction")
|
||||
|
||||
crawler_cfg.extraction_strategy = LLMExtractionStrategy(
|
||||
provider=extract_conf["provider"],
|
||||
llmConfig=LlmConfig(provider=extract_conf["provider"], api_token=extract_conf["api_token"]),
|
||||
instruction=extract_conf["instruction"],
|
||||
api_token=extract_conf.get("api_token", extract_conf.get("api_key")),
|
||||
schema=schema_data,
|
||||
**extract_conf.get("params", {})
|
||||
)
|
||||
@@ -383,7 +384,7 @@ def cli(url: str, example: bool, browser_config: str, crawler_config: str, filte
|
||||
# Handle question
|
||||
if question:
|
||||
provider, token = setup_llm_config()
|
||||
markdown = result.markdown_v2.raw_markdown
|
||||
markdown = result.markdown.raw_markdown
|
||||
anyio.run(stream_llm_response, url, markdown, question, provider, token)
|
||||
return
|
||||
|
||||
@@ -393,9 +394,9 @@ def cli(url: str, example: bool, browser_config: str, crawler_config: str, filte
|
||||
elif output == "json":
|
||||
click.echo(json.dumps(json.loads(result.extracted_content), indent=2))
|
||||
elif output in ["markdown", "md"]:
|
||||
click.echo(result.markdown_v2.raw_markdown)
|
||||
click.echo(result.markdown.raw_markdown)
|
||||
elif output in ["markdown-fit", "md-fit"]:
|
||||
click.echo(result.markdown_v2.fit_markdown)
|
||||
click.echo(result.markdown.fit_markdown)
|
||||
|
||||
except Exception as e:
|
||||
raise click.ClickException(str(e))
|
||||
|
||||
2
crawl4ai/configs/__init__.py
Normal file
2
crawl4ai/configs/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
from .proxy_config import ProxyConfig
|
||||
__all__ = ["ProxyConfig"]
|
||||
113
crawl4ai/configs/proxy_config.py
Normal file
113
crawl4ai/configs/proxy_config.py
Normal file
@@ -0,0 +1,113 @@
|
||||
import os
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
|
||||
class ProxyConfig:
    """Configuration for a single proxy server.

    Instances can be built directly, from a ``ip:port[:username:password]``
    string, from a dict, or in bulk from an environment variable.
    """

    def __init__(
        self,
        server: str,
        username: Optional[str] = None,
        password: Optional[str] = None,
        ip: Optional[str] = None,
    ):
        """Configuration class for a single proxy.

        Args:
            server: Proxy server URL (e.g., "http://127.0.0.1:8080")
            username: Optional username for proxy authentication
            password: Optional password for proxy authentication
            ip: Optional IP address for verification purposes
        """
        self.server = server
        self.username = username
        self.password = password

        # Extract IP from server if not explicitly provided
        self.ip = ip or self._extract_ip_from_server()

    def _extract_ip_from_server(self) -> Optional[str]:
        """Extract the host/IP portion from the server URL.

        Returns:
            The host part of ``self.server`` (the text before the first ':'
            after any scheme), or None if the server string cannot be parsed.
        """
        try:
            # Simple extraction assuming [scheme://]host[:port] format
            authority = self.server.split("://", 1)[-1]
            return authority.split(":")[0]
        except Exception:
            return None

    @staticmethod
    def from_string(proxy_str: str) -> "ProxyConfig":
        """Create a ProxyConfig from a string in the format 'ip:port:username:password'.

        The two-part form 'ip:port' is also accepted. Surrounding whitespace
        is ignored so comma-separated lists with spaces parse cleanly.

        Raises:
            ValueError: If the string is not in a recognized format.
        """
        parts = proxy_str.strip().split(":")
        if len(parts) == 4:  # ip:port:username:password
            ip, port, username, password = parts
            return ProxyConfig(
                server=f"http://{ip}:{port}",
                username=username,
                password=password,
                ip=ip,
            )
        elif len(parts) == 2:  # ip:port only
            ip, port = parts
            return ProxyConfig(
                server=f"http://{ip}:{port}",
                ip=ip,
            )
        else:
            raise ValueError(f"Invalid proxy string format: {proxy_str}")

    @staticmethod
    def from_dict(proxy_dict: Dict) -> "ProxyConfig":
        """Create a ProxyConfig from a dictionary."""
        return ProxyConfig(
            server=proxy_dict.get("server"),
            username=proxy_dict.get("username"),
            password=proxy_dict.get("password"),
            ip=proxy_dict.get("ip"),
        )

    @staticmethod
    def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]:
        """Load proxies from environment variable.

        Args:
            env_var: Name of environment variable containing comma-separated proxy strings

        Returns:
            List of ProxyConfig objects. Malformed entries are reported and
            skipped individually, so one bad entry does not discard the rest.
        """
        proxies = []
        for proxy in os.getenv(env_var, "").split(","):
            proxy = proxy.strip()
            if not proxy:
                continue
            try:
                proxies.append(ProxyConfig.from_string(proxy))
            except Exception as e:
                # Best-effort loading: report and continue with remaining entries.
                print(f"Error loading proxies from environment: {e}")
        return proxies

    def to_dict(self) -> Dict:
        """Convert to dictionary representation."""
        return {
            "server": self.server,
            "username": self.username,
            "password": self.password,
            "ip": self.ip,
        }

    def clone(self, **kwargs) -> "ProxyConfig":
        """Create a copy of this configuration with updated values.

        Args:
            **kwargs: Key-value pairs of configuration options to update

        Returns:
            ProxyConfig: A new instance with the specified updates
        """
        config_dict = self.to_dict()
        config_dict.update(kwargs)
        return ProxyConfig.from_dict(config_dict)
|
||||
@@ -8,6 +8,7 @@ from .filters import (
|
||||
ContentTypeFilter,
|
||||
DomainFilter,
|
||||
URLFilter,
|
||||
URLPatternFilter,
|
||||
FilterStats,
|
||||
ContentRelevanceFilter,
|
||||
SEOFilter
|
||||
@@ -32,6 +33,7 @@ __all__ = [
|
||||
"ContentTypeFilter",
|
||||
"DomainFilter",
|
||||
"URLFilter",
|
||||
"URLPatternFilter",
|
||||
"FilterStats",
|
||||
"ContentRelevanceFilter",
|
||||
"SEOFilter",
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from re import U
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
from pydantic import BaseModel, HttpUrl, PrivateAttr
|
||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass
|
||||
@@ -86,6 +86,9 @@ class MarkdownGenerationResult(BaseModel):
|
||||
fit_markdown: Optional[str] = None
|
||||
fit_html: Optional[str] = None
|
||||
|
||||
def __str__(self):
|
||||
return self.raw_markdown
|
||||
|
||||
@dataclass
|
||||
class TraversalStats:
|
||||
"""Statistics for the traversal process"""
|
||||
@@ -105,7 +108,6 @@ class DispatchResult(BaseModel):
|
||||
end_time: Union[datetime, float]
|
||||
error_message: str = ""
|
||||
|
||||
|
||||
class CrawlResult(BaseModel):
|
||||
url: str
|
||||
html: str
|
||||
@@ -117,10 +119,7 @@ class CrawlResult(BaseModel):
|
||||
js_execution_result: Optional[Dict[str, Any]] = None
|
||||
screenshot: Optional[str] = None
|
||||
pdf: Optional[bytes] = None
|
||||
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
|
||||
markdown_v2: Optional[MarkdownGenerationResult] = None
|
||||
fit_markdown: Optional[str] = None
|
||||
fit_html: Optional[str] = None
|
||||
_markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None)
|
||||
extracted_content: Optional[str] = None
|
||||
metadata: Optional[dict] = None
|
||||
error_message: Optional[str] = None
|
||||
@@ -134,6 +133,118 @@ class CrawlResult(BaseModel):
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
# NOTE: The StringCompatibleMarkdown class, custom __init__ method, property getters/setters,
|
||||
# and model_dump override all exist to support a smooth transition from markdown as a string
|
||||
# to markdown as a MarkdownGenerationResult object, while maintaining backward compatibility.
|
||||
#
|
||||
# This allows code that expects markdown to be a string to continue working, while also
|
||||
# providing access to the full MarkdownGenerationResult object's properties.
|
||||
#
|
||||
# The markdown_v2 property is deprecated and raises an error directing users to use markdown.
|
||||
#
|
||||
# When backward compatibility is no longer needed in future versions, this entire mechanism
|
||||
# can be simplified to a standard field with no custom accessors or serialization logic.
|
||||
|
||||
def __init__(self, **data):
|
||||
markdown_result = data.pop('markdown', None)
|
||||
super().__init__(**data)
|
||||
if markdown_result is not None:
|
||||
self._markdown = markdown_result
|
||||
|
||||
@property
|
||||
def markdown(self):
|
||||
"""
|
||||
Property that returns a StringCompatibleMarkdown object that behaves like
|
||||
a string but also provides access to MarkdownGenerationResult attributes.
|
||||
|
||||
This approach allows backward compatibility with code that expects 'markdown'
|
||||
to be a string, while providing access to the full MarkdownGenerationResult.
|
||||
"""
|
||||
if self._markdown is None:
|
||||
return None
|
||||
return StringCompatibleMarkdown(self._markdown)
|
||||
|
||||
@markdown.setter
|
||||
def markdown(self, value):
|
||||
"""
|
||||
Setter for the markdown property.
|
||||
"""
|
||||
self._markdown = value
|
||||
|
||||
@property
|
||||
def markdown_v2(self):
|
||||
"""
|
||||
Deprecated property that raises an AttributeError when accessed.
|
||||
|
||||
This property exists to inform users that 'markdown_v2' has been
|
||||
deprecated and they should use 'markdown' instead.
|
||||
"""
|
||||
raise AttributeError(
|
||||
"The 'markdown_v2' attribute is deprecated and has been removed. "
|
||||
"""Please use 'markdown' instead, which now returns a MarkdownGenerationResult, with
|
||||
following properties:
|
||||
- raw_markdown: The raw markdown string
|
||||
- markdown_with_citations: The markdown string with citations
|
||||
- references_markdown: The markdown string with references
|
||||
- fit_markdown: The markdown string with fit text
|
||||
"""
|
||||
)
|
||||
|
||||
@property
|
||||
def fit_markdown(self):
|
||||
"""
|
||||
Deprecated property that raises an AttributeError when accessed.
|
||||
"""
|
||||
raise AttributeError(
|
||||
"The 'fit_markdown' attribute is deprecated and has been removed. "
|
||||
"Please use 'markdown.fit_markdown' instead."
|
||||
)
|
||||
|
||||
@property
|
||||
def fit_html(self):
|
||||
"""
|
||||
Deprecated property that raises an AttributeError when accessed.
|
||||
"""
|
||||
raise AttributeError(
|
||||
"The 'fit_html' attribute is deprecated and has been removed. "
|
||||
"Please use 'markdown.fit_html' instead."
|
||||
)
|
||||
|
||||
def model_dump(self, *args, **kwargs):
|
||||
"""
|
||||
Override model_dump to include the _markdown private attribute in serialization.
|
||||
|
||||
This override is necessary because:
|
||||
1. PrivateAttr fields are excluded from serialization by default
|
||||
2. We need to maintain backward compatibility by including the 'markdown' field
|
||||
in the serialized output
|
||||
3. We're transitioning from 'markdown_v2' to enhancing 'markdown' to hold
|
||||
the same type of data
|
||||
|
||||
Future developers: This method ensures that the markdown content is properly
|
||||
serialized despite being stored in a private attribute. If the serialization
|
||||
requirements change, this is where you would update the logic.
|
||||
"""
|
||||
result = super().model_dump(*args, **kwargs)
|
||||
if self._markdown is not None:
|
||||
result["markdown"] = self._markdown.model_dump()
|
||||
return result
|
||||
|
||||
class StringCompatibleMarkdown(str):
    """A string subclass that also provides access to MarkdownGenerationResult attributes.

    The instance's string value is the wrapped result's ``raw_markdown``;
    attributes not found on ``str`` are delegated to the wrapped result.
    """

    def __new__(cls, markdown_result):
        # The string content of the instance is the raw markdown text.
        return super().__new__(cls, markdown_result.raw_markdown)

    def __init__(self, markdown_result):
        self._markdown_result = markdown_result

    def __getattr__(self, name):
        # Guard: if `_markdown_result` itself is missing (e.g. during
        # copy/pickle, which can bypass __init__), looking it up here would
        # re-enter __getattr__ forever. Raise AttributeError instead.
        if name == "_markdown_result":
            raise AttributeError(name)
        return getattr(self._markdown_result, name)
|
||||
|
||||
# END of backward compatibility code for markdown/markdown_v2.
|
||||
# When removing this code in the future, make sure to:
|
||||
# 1. Replace the private attribute and property with a standard field
|
||||
# 2. Update any serialization logic that might depend on the current behavior
|
||||
|
||||
class AsyncCrawlResponse(BaseModel):
|
||||
html: str
|
||||
|
||||
@@ -2,6 +2,8 @@ from typing import List, Dict, Optional
|
||||
from abc import ABC, abstractmethod
|
||||
from itertools import cycle
|
||||
|
||||
from crawl4ai.configs import ProxyConfig
|
||||
|
||||
class ProxyRotationStrategy(ABC):
|
||||
"""Base abstract class for proxy rotation strategies"""
|
||||
|
||||
@@ -15,28 +17,27 @@ class ProxyRotationStrategy(ABC):
|
||||
"""Add proxy configurations to the strategy"""
|
||||
pass
|
||||
|
||||
class RoundRobinProxyStrategy(ProxyRotationStrategy):
|
||||
"""Simple round-robin proxy rotation strategy"""
|
||||
class RoundRobinProxyStrategy:
|
||||
"""Simple round-robin proxy rotation strategy using ProxyConfig objects"""
|
||||
|
||||
def __init__(self, proxies: List[Dict] = None):
|
||||
def __init__(self, proxies: List[ProxyConfig] = None):
|
||||
"""
|
||||
Initialize with optional list of proxy configurations
|
||||
|
||||
Args:
|
||||
proxies: List of proxy config dictionaries, each containing at least
|
||||
'server' key with proxy URL
|
||||
proxies: List of ProxyConfig objects
|
||||
"""
|
||||
self._proxies = []
|
||||
self._proxy_cycle = None
|
||||
if proxies:
|
||||
self.add_proxies(proxies)
|
||||
|
||||
def add_proxies(self, proxies: List[Dict]):
|
||||
def add_proxies(self, proxies: List[ProxyConfig]):
|
||||
"""Add new proxies to the rotation pool"""
|
||||
self._proxies.extend(proxies)
|
||||
self._proxy_cycle = cycle(self._proxies)
|
||||
|
||||
async def get_next_proxy(self) -> Optional[Dict]:
|
||||
async def get_next_proxy(self) -> Optional[ProxyConfig]:
|
||||
"""Get next proxy in round-robin fashion"""
|
||||
if not self._proxy_cycle:
|
||||
return None
|
||||
|
||||
@@ -60,7 +60,7 @@ async def handle_llm_qa(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=result.error_message
|
||||
)
|
||||
content = result.markdown_v2.fit_markdown
|
||||
content = result.markdown.fit_markdown
|
||||
|
||||
# Create prompt and get LLM response
|
||||
prompt = f"""Use the following content as context to answer the question.
|
||||
@@ -189,9 +189,9 @@ async def handle_markdown_request(
|
||||
detail=result.error_message
|
||||
)
|
||||
|
||||
return (result.markdown_v2.raw_markdown
|
||||
return (result.markdown.raw_markdown
|
||||
if filter_type == FilterType.RAW
|
||||
else result.markdown_v2.fit_markdown)
|
||||
else result.markdown.fit_markdown)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Markdown error: {str(e)}", exc_info=True)
|
||||
|
||||
@@ -52,7 +52,7 @@ async def crawl_sequential(urls: List[str]):
|
||||
)
|
||||
if result.success:
|
||||
print(f"Successfully crawled {url}")
|
||||
print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
|
||||
print(f"Content length: {len(result.markdown.raw_markdown)}")
|
||||
finally:
|
||||
await crawler.close()
|
||||
|
||||
@@ -101,7 +101,7 @@ async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
|
||||
print(f"Error crawling {url}: {str(result)}")
|
||||
elif result.success:
|
||||
print(f"Successfully crawled {url}")
|
||||
print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
|
||||
print(f"Content length: {len(result.markdown.raw_markdown)}")
|
||||
finally:
|
||||
await crawler.close()
|
||||
|
||||
|
||||
404
docs/examples/deepcrawl.py
Normal file
404
docs/examples/deepcrawl.py
Normal file
@@ -0,0 +1,404 @@
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
|
||||
from crawl4ai.deep_crawling.filters import (
|
||||
FilterChain,
|
||||
URLPatternFilter,
|
||||
DomainFilter,
|
||||
ContentTypeFilter,
|
||||
ContentRelevanceFilter,
|
||||
SEOFilter,
|
||||
)
|
||||
from crawl4ai.deep_crawling.scorers import (
|
||||
KeywordRelevanceScorer,
|
||||
)
|
||||
|
||||
|
||||
# 1️⃣ Basic Deep Crawl Setup
|
||||
async def basic_deep_crawl():
    """
    PART 1: Basic Deep Crawl setup - Demonstrates a simple two-level deep crawl.

    This function shows:
    - How to set up BFSDeepCrawlStrategy (Breadth-First Search)
    - Setting depth and domain parameters
    - Processing the results to show the hierarchy
    """
    print("\n===== BASIC DEEP CRAWL SETUP =====")

    # Breadth-first crawl: the seed page is depth 0, plus two more levels.
    # include_external=False keeps the crawl on the starting domain only.
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2, include_external=False),
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True,  # Show progress during crawling
    )

    async with AsyncWebCrawler() as crawler:
        started = time.perf_counter()
        crawl_results = await crawler.arun(url="https://docs.crawl4ai.com", config=run_config)

        # Bucket crawled URLs by their depth to visualize the crawl tree.
        depth_buckets = {}
        for page in crawl_results:
            level = page.metadata.get("depth", 0)
            depth_buckets.setdefault(level, []).append(page.url)

        print(f"✅ Crawled {len(crawl_results)} pages total")

        # Walk the buckets in depth order, sampling at most 3 URLs per level.
        for level, urls in sorted(depth_buckets.items()):
            print(f"\nDepth {level}: {len(urls)} pages")
            for url in urls[:3]:
                print(f"  → {url}")
            if len(urls) > 3:
                print(f"  ... and {len(urls) - 3} more")

        print(
            f"\n✅ Performance: {len(crawl_results)} pages in {time.perf_counter() - started:.2f} seconds"
        )
|
||||
|
||||
|
||||
# 2️⃣ Stream vs. Non-Stream Execution
|
||||
async def stream_vs_nonstream():
    """
    PART 2: Demonstrates the difference between stream and non-stream execution.

    Non-stream: Waits for all results before processing
    Stream: Processes results as they become available
    """
    print("\n===== STREAM VS. NON-STREAM EXECUTION =====")

    # Common configuration for both examples
    base_config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False),
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True,
    )

    async with AsyncWebCrawler() as crawler:
        # NON-STREAMING MODE
        print("\n📊 NON-STREAMING MODE:")
        print("  In this mode, all results are collected before being returned.")

        # clone() yields an independent copy, so base_config stays reusable
        non_stream_config = base_config.clone()
        non_stream_config.stream = False

        start_time = time.perf_counter()
        results = await crawler.arun(
            url="https://docs.crawl4ai.com", config=non_stream_config
        )

        print(f"  ✅ Received all {len(results)} results at once")
        print(f"  ✅ Total duration: {time.perf_counter() - start_time:.2f} seconds")

        # STREAMING MODE
        print("\n📊 STREAMING MODE:")
        print("  In this mode, results are processed as they become available.")

        stream_config = base_config.clone()
        stream_config.stream = True

        start_time = time.perf_counter()
        result_count = 0
        first_result_time = None  # time-to-first-result; set on first iteration

        # With stream=True, arun returns an async iterator of results,
        # so pages can be handled as soon as each one completes.
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=stream_config
        ):
            result_count += 1
            if result_count == 1:
                first_result_time = time.perf_counter() - start_time
                print(
                    f"  ✅ First result received after {first_result_time:.2f} seconds: {result.url}"
                )
            elif result_count % 5 == 0:  # Show every 5th result for brevity
                print(f"  → Result #{result_count}: {result.url}")

        print(f"  ✅ Total: {result_count} results")
        print(f"  ✅ First result: {first_result_time:.2f} seconds")
        print(f"  ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
        print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
|
||||
|
||||
|
||||
# 3️⃣ Introduce Filters & Scorers
|
||||
async def filters_and_scorers():
|
||||
"""
|
||||
PART 3: Demonstrates the use of filters and scorers for more targeted crawling.
|
||||
|
||||
This function progressively adds:
|
||||
1. A single URL pattern filter
|
||||
2. Multiple filters in a chain
|
||||
3. Scorers for prioritizing pages
|
||||
"""
|
||||
print("\n===== FILTERS AND SCORERS =====")
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# SINGLE FILTER EXAMPLE
|
||||
print("\n📊 EXAMPLE 1: SINGLE URL PATTERN FILTER")
|
||||
print(" Only crawl pages containing 'core' in the URL")
|
||||
|
||||
# Create a filter that only allows URLs with 'guide' in them
|
||||
url_filter = URLPatternFilter(patterns=["*core*"])
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=1,
|
||||
include_external=False,
|
||||
filter_chain=FilterChain([url_filter]), # Single filter
|
||||
),
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
|
||||
|
||||
print(f" ✅ Crawled {len(results)} pages matching '*core*'")
|
||||
for result in results[:3]: # Show first 3 results
|
||||
print(f" → {result.url}")
|
||||
if len(results) > 3:
|
||||
print(f" ... and {len(results) - 3} more")
|
||||
|
||||
# MULTIPLE FILTERS EXAMPLE
|
||||
print("\n📊 EXAMPLE 2: MULTIPLE FILTERS IN A CHAIN")
|
||||
print(" Only crawl pages that:")
|
||||
print(" 1. Contain '2024' in the URL")
|
||||
print(" 2. Are from 'techcrunch.com'")
|
||||
print(" 3. Are of text/html or application/javascript content type")
|
||||
|
||||
# Create a chain of filters
|
||||
filter_chain = FilterChain(
|
||||
[
|
||||
URLPatternFilter(patterns=["*2024*"]),
|
||||
DomainFilter(
|
||||
allowed_domains=["techcrunch.com"],
|
||||
blocked_domains=["guce.techcrunch.com", "oidc.techcrunch.com"],
|
||||
),
|
||||
ContentTypeFilter(
|
||||
allowed_types=["text/html", "application/javascript"]
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=1, include_external=False, filter_chain=filter_chain
|
||||
),
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
results = await crawler.arun(url="https://techcrunch.com", config=config)
|
||||
|
||||
print(f" ✅ Crawled {len(results)} pages after applying all filters")
|
||||
for result in results[:3]:
|
||||
print(f" → {result.url}")
|
||||
if len(results) > 3:
|
||||
print(f" ... and {len(results) - 3} more")
|
||||
|
||||
# SCORERS EXAMPLE
|
||||
print("\n📊 EXAMPLE 3: USING A KEYWORD RELEVANCE SCORER")
|
||||
print(
|
||||
"Score pages based on relevance to keywords: 'crawl', 'example', 'async', 'configuration','javascript','css'"
|
||||
)
|
||||
|
||||
# Create a keyword relevance scorer
|
||||
keyword_scorer = KeywordRelevanceScorer(
|
||||
keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=0.3
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BestFirstCrawlingStrategy( # Note: Changed to BestFirst
|
||||
max_depth=1, include_external=False, url_scorer=keyword_scorer
|
||||
),
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
verbose=True,
|
||||
stream=True,
|
||||
)
|
||||
|
||||
results = []
|
||||
async for result in await crawler.arun(
|
||||
url="https://docs.crawl4ai.com", config=config
|
||||
):
|
||||
results.append(result)
|
||||
score = result.metadata.get("score")
|
||||
print(f" → Score: {score:.2f} | {result.url}")
|
||||
|
||||
print(f" ✅ Crawler prioritized {len(results)} pages by relevance score")
|
||||
print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
|
||||
|
||||
|
||||
# 4️⃣ Wrap-Up and Key Takeaways
|
||||
async def wrap_up():
|
||||
"""
|
||||
PART 4: Wrap-Up and Key Takeaways
|
||||
|
||||
Summarize the key concepts learned in this tutorial.
|
||||
"""
|
||||
print("\n===== COMPLETE CRAWLER EXAMPLE =====")
|
||||
print("Combining filters, scorers, and streaming for an optimized crawl")
|
||||
|
||||
# Create a sophisticated filter chain
|
||||
filter_chain = FilterChain(
|
||||
[
|
||||
DomainFilter(
|
||||
allowed_domains=["docs.crawl4ai.com"],
|
||||
blocked_domains=["old.docs.crawl4ai.com"],
|
||||
),
|
||||
URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
|
||||
ContentTypeFilter(allowed_types=["text/html"]),
|
||||
]
|
||||
)
|
||||
|
||||
# Create a composite scorer that combines multiple scoring strategies
|
||||
keyword_scorer = KeywordRelevanceScorer(
|
||||
keywords=["crawl", "example", "async", "configuration"], weight=0.7
|
||||
)
|
||||
# Set up the configuration
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BestFirstCrawlingStrategy(
|
||||
max_depth=1,
|
||||
include_external=False,
|
||||
filter_chain=filter_chain,
|
||||
url_scorer=keyword_scorer,
|
||||
),
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
stream=True,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Execute the crawl
|
||||
results = []
|
||||
start_time = time.perf_counter()
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async for result in await crawler.arun(
|
||||
url="https://docs.crawl4ai.com", config=config
|
||||
):
|
||||
results.append(result)
|
||||
score = result.metadata.get("score", 0)
|
||||
depth = result.metadata.get("depth", 0)
|
||||
print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
|
||||
|
||||
duration = time.perf_counter() - start_time
|
||||
|
||||
# Summarize the results
|
||||
print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
|
||||
print(
|
||||
f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
|
||||
)
|
||||
|
||||
# Group by depth
|
||||
depth_counts = {}
|
||||
for result in results:
|
||||
depth = result.metadata.get("depth", 0)
|
||||
depth_counts[depth] = depth_counts.get(depth, 0) + 1
|
||||
|
||||
print("\n📊 Pages crawled by depth:")
|
||||
for depth, count in sorted(depth_counts.items()):
|
||||
print(f" Depth {depth}: {count} pages")
|
||||
|
||||
|
||||
# 5️⃣ Advanced Filters
|
||||
async def advanced_filters():
|
||||
"""
|
||||
PART 5: Demonstrates advanced filtering techniques for specialized crawling.
|
||||
|
||||
This function covers:
|
||||
- SEO filters
|
||||
- Text relevancy filtering
|
||||
- Combining advanced filters
|
||||
"""
|
||||
print("\n===== ADVANCED FILTERS =====")
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# SEO FILTER EXAMPLE
|
||||
print("\n📊 EXAMPLE 1: SEO FILTERS")
|
||||
print(
|
||||
"Quantitative SEO quality assessment filter based searching keywords in the head section"
|
||||
)
|
||||
|
||||
seo_filter = SEOFilter(
|
||||
threshold=0.5, keywords=["dynamic", "interaction", "javascript"]
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=1, filter_chain=FilterChain([seo_filter])
|
||||
),
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
verbose=True,
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
)
|
||||
|
||||
results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
|
||||
|
||||
print(f" ✅ Found {len(results)} pages with relevant keywords")
|
||||
for result in results:
|
||||
print(f" → {result.url}")
|
||||
|
||||
# ADVANCED TEXT RELEVANCY FILTER
|
||||
print("\n📊 EXAMPLE 2: ADVANCED TEXT RELEVANCY FILTER")
|
||||
|
||||
# More sophisticated content relevance filter
|
||||
relevance_filter = ContentRelevanceFilter(
|
||||
query="Interact with the web using your authentic digital identity",
|
||||
threshold=0.7,
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=1, filter_chain=FilterChain([relevance_filter])
|
||||
),
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
verbose=True,
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
)
|
||||
|
||||
results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
|
||||
|
||||
print(f" ✅ Found {len(results)} pages")
|
||||
for result in results:
|
||||
relevance_score = result.metadata.get("relevance_score", 0)
|
||||
print(f" → Score: {relevance_score:.2f} | {result.url}")
|
||||
|
||||
|
||||
# Main function to run the entire tutorial
|
||||
async def run_tutorial():
|
||||
"""
|
||||
Executes all tutorial sections in sequence.
|
||||
"""
|
||||
print("\n🚀 CRAWL4AI DEEP CRAWLING TUTORIAL 🚀")
|
||||
print("======================================")
|
||||
print("This tutorial will walk you through deep crawling techniques,")
|
||||
print("from basic to advanced, using the Crawl4AI library.")
|
||||
|
||||
# Define sections - uncomment to run specific parts during development
|
||||
tutorial_sections = [
|
||||
basic_deep_crawl,
|
||||
stream_vs_nonstream,
|
||||
filters_and_scorers,
|
||||
wrap_up,
|
||||
advanced_filters,
|
||||
]
|
||||
|
||||
for section in tutorial_sections:
|
||||
await section()
|
||||
|
||||
print("\n🎉 TUTORIAL COMPLETE! 🎉")
|
||||
print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
|
||||
print("For more information, check out https://docs.crawl4ai.com")
|
||||
|
||||
|
||||
# Execute the tutorial when run directly
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tutorial())
|
||||
@@ -39,9 +39,9 @@ async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str
|
||||
if result.success:
|
||||
print(f"\n=== {name} Results ===")
|
||||
print(f"Extracted Content: {result.extracted_content}")
|
||||
print(f"Raw Markdown Length: {len(result.markdown_v2.raw_markdown)}")
|
||||
print(f"Raw Markdown Length: {len(result.markdown.raw_markdown)}")
|
||||
print(
|
||||
f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}"
|
||||
f"Citations Markdown Length: {len(result.markdown.markdown_with_citations)}"
|
||||
)
|
||||
else:
|
||||
print(f"Error in {name}: Crawl failed")
|
||||
|
||||
@@ -25,7 +25,7 @@ async def main():
|
||||
# url="https://www.helloworld.org", config=crawler_config
|
||||
url="https://www.kidocode.com", config=crawler_config
|
||||
)
|
||||
print(result.markdown_v2.raw_markdown[:500])
|
||||
print(result.markdown.raw_markdown[:500])
|
||||
# print(result.model_dump())
|
||||
|
||||
|
||||
|
||||
@@ -80,7 +80,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": null,
|
||||
"id": "003376f3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -114,7 +114,7 @@
|
||||
" url=\"https://www.nbcnews.com/business\",\n",
|
||||
" bypass_cache=True # By default this is False, meaning the cache will be used\n",
|
||||
" )\n",
|
||||
" print(result.markdown[:500]) # Print the first 500 characters\n",
|
||||
" print(result.markdown.raw_markdown[:500]) # Print the first 500 characters\n",
|
||||
" \n",
|
||||
"asyncio.run(simple_crawl())"
|
||||
]
|
||||
@@ -129,7 +129,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": null,
|
||||
"id": "5bb8c1e4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -177,7 +177,7 @@
|
||||
" # wait_for=wait_for,\n",
|
||||
" bypass_cache=True,\n",
|
||||
" )\n",
|
||||
" print(result.markdown[:500]) # Print first 500 characters\n",
|
||||
" print(result.markdown.raw_markdown[:500]) # Print first 500 characters\n",
|
||||
"\n",
|
||||
"asyncio.run(crawl_dynamic_content())"
|
||||
]
|
||||
@@ -206,11 +206,11 @@
|
||||
" word_count_threshold=10,\n",
|
||||
" bypass_cache=True\n",
|
||||
" )\n",
|
||||
" full_markdown_length = len(result.markdown)\n",
|
||||
" fit_markdown_length = len(result.fit_markdown)\n",
|
||||
" full_markdown_length = len(result.markdown.raw_markdown)\n",
|
||||
" fit_markdown_length = len(result.markdown.fit_markdown)\n",
|
||||
" print(f\"Full Markdown Length: {full_markdown_length}\")\n",
|
||||
" print(f\"Fit Markdown Length: {fit_markdown_length}\")\n",
|
||||
" print(result.fit_markdown[:1000])\n",
|
||||
" print(result.markdown.fit_markdown[:1000])\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"asyncio.run(clean_content())"
|
||||
@@ -342,7 +342,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"execution_count": null,
|
||||
"id": "bc4d2fc8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -387,7 +387,7 @@
|
||||
" url=\"https://crawl4ai.com\",\n",
|
||||
" bypass_cache=True\n",
|
||||
" )\n",
|
||||
" print(result.markdown[:500]) # Display the first 500 characters\n",
|
||||
" print(result.markdown.raw_markdown[:500]) # Display the first 500 characters\n",
|
||||
"\n",
|
||||
"asyncio.run(custom_hook_workflow())"
|
||||
]
|
||||
@@ -465,7 +465,7 @@
|
||||
" bypass_cache=True\n",
|
||||
" )\n",
|
||||
" print(f\"Page {page_number} Content:\")\n",
|
||||
" print(result.markdown[:500]) # Print first 500 characters\n",
|
||||
" print(result.markdown.raw_markdown[:500]) # Print first 500 characters\n",
|
||||
"\n",
|
||||
"# asyncio.run(multi_page_session_crawl())"
|
||||
]
|
||||
|
||||
@@ -59,8 +59,8 @@ async def clean_content():
|
||||
url="https://en.wikipedia.org/wiki/Apple",
|
||||
config=crawler_config,
|
||||
)
|
||||
full_markdown_length = len(result.markdown_v2.raw_markdown)
|
||||
fit_markdown_length = len(result.markdown_v2.fit_markdown)
|
||||
full_markdown_length = len(result.markdown.raw_markdown)
|
||||
fit_markdown_length = len(result.markdown.fit_markdown)
|
||||
print(f"Full Markdown Length: {full_markdown_length}")
|
||||
print(f"Fit Markdown Length: {fit_markdown_length}")
|
||||
|
||||
@@ -139,7 +139,7 @@ async def custom_hook_workflow(verbose=True):
|
||||
|
||||
# Perform the crawl operation
|
||||
result = await crawler.arun(url="https://crawl4ai.com")
|
||||
print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- "))
|
||||
print(result.markdown.raw_markdown[:500].replace("\n", " -- "))
|
||||
|
||||
|
||||
# Proxy Example
|
||||
@@ -584,9 +584,9 @@ async def speed_comparison():
|
||||
end = time.time()
|
||||
print("Crawl4AI (Markdown Plus):")
|
||||
print(f"Time taken: {end - start:.2f} seconds")
|
||||
print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters")
|
||||
print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
|
||||
print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
|
||||
print(f"Content length: {len(result.markdown.raw_markdown)} characters")
|
||||
print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
|
||||
print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
|
||||
print()
|
||||
|
||||
|
||||
|
||||
@@ -514,9 +514,9 @@ async def speed_comparison():
|
||||
end = time.time()
|
||||
print("Crawl4AI (Markdown Plus):")
|
||||
print(f"Time taken: {end - start:.2f} seconds")
|
||||
print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters")
|
||||
print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
|
||||
print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
|
||||
print(f"Content length: {len(result.markdown.raw_markdown)} characters")
|
||||
print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
|
||||
print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
|
||||
print()
|
||||
|
||||
# Crawl4AI with JavaScript execution
|
||||
@@ -539,9 +539,9 @@ async def speed_comparison():
|
||||
end = time.time()
|
||||
print("Crawl4AI (with JavaScript execution):")
|
||||
print(f"Time taken: {end - start:.2f} seconds")
|
||||
print(f"Content length: {len(result.markdown)} characters")
|
||||
print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
|
||||
print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
|
||||
print(f"Content length: {len(result.markdown.raw_markdown)} characters")
|
||||
print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
|
||||
print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
|
||||
|
||||
print("\nNote on Speed Comparison:")
|
||||
print("The speed test conducted here may not reflect optimal conditions.")
|
||||
@@ -613,9 +613,9 @@ async def fit_markdown_remove_overlay():
|
||||
)
|
||||
|
||||
if result.success:
|
||||
print(len(result.markdown_v2.raw_markdown))
|
||||
print(len(result.markdown_v2.markdown_with_citations))
|
||||
print(len(result.markdown_v2.fit_markdown))
|
||||
print(len(result.markdown.raw_markdown))
|
||||
print(len(result.markdown.markdown_with_citations))
|
||||
print(len(result.markdown.fit_markdown))
|
||||
|
||||
# Save clean html
|
||||
with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f:
|
||||
@@ -624,18 +624,18 @@ async def fit_markdown_remove_overlay():
|
||||
with open(
|
||||
os.path.join(__location__, "output/output_raw_markdown.md"), "w"
|
||||
) as f:
|
||||
f.write(result.markdown_v2.raw_markdown)
|
||||
f.write(result.markdown.raw_markdown)
|
||||
|
||||
with open(
|
||||
os.path.join(__location__, "output/output_markdown_with_citations.md"),
|
||||
"w",
|
||||
) as f:
|
||||
f.write(result.markdown_v2.markdown_with_citations)
|
||||
f.write(result.markdown.markdown_with_citations)
|
||||
|
||||
with open(
|
||||
os.path.join(__location__, "output/output_fit_markdown.md"), "w"
|
||||
) as f:
|
||||
f.write(result.markdown_v2.fit_markdown)
|
||||
f.write(result.markdown.fit_markdown)
|
||||
|
||||
print("Done")
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ async def little_hello_web():
|
||||
result : CrawlResult = await crawler.arun(
|
||||
url="https://www.helloworld.org"
|
||||
)
|
||||
print(result.markdown_v2.raw_markdown[:500])
|
||||
print(result.markdown.raw_markdown[:500])
|
||||
|
||||
async def hello_web():
|
||||
browser_config = BrowserConfig(headless=True, verbose=True)
|
||||
@@ -42,7 +42,7 @@ async def hello_web():
|
||||
result : CrawlResult = await crawler.arun(
|
||||
url="https://www.helloworld.org", config=crawler_config
|
||||
)
|
||||
print(result.markdown_v2.fit_markdown[:500])
|
||||
print(result.markdown.fit_markdown[:500])
|
||||
|
||||
# Naive Approach Using Large Language Models
|
||||
async def extract_using_llm():
|
||||
|
||||
460
docs/examples/tutorial_v0.5.py
Normal file
460
docs/examples/tutorial_v0.5.py
Normal file
@@ -0,0 +1,460 @@
|
||||
import asyncio
|
||||
import time
|
||||
import re
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig, MemoryAdaptiveDispatcher, HTTPCrawlerConfig
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
from crawl4ai.deep_crawling import (
|
||||
BestFirstCrawlingStrategy,
|
||||
FilterChain,
|
||||
URLPatternFilter,
|
||||
DomainFilter,
|
||||
ContentTypeFilter,
|
||||
)
|
||||
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
|
||||
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
|
||||
from crawl4ai.configs import ProxyConfig
|
||||
from crawl4ai import RoundRobinProxyStrategy
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
|
||||
from pprint import pprint
|
||||
|
||||
|
||||
# 1️⃣ Deep Crawling with Best-First Strategy
|
||||
async def deep_crawl():
|
||||
"""
|
||||
PART 1: Deep Crawling with Best-First Strategy
|
||||
|
||||
This function demonstrates:
|
||||
- Using the BestFirstCrawlingStrategy
|
||||
- Creating filter chains to narrow down crawl targets
|
||||
- Using a scorer to prioritize certain URLs
|
||||
- Respecting robots.txt rules
|
||||
"""
|
||||
print("\n===== DEEP CRAWLING =====")
|
||||
print("This example shows how to implement deep crawling with filters, scorers, and robots.txt compliance.")
|
||||
|
||||
# Create a filter chain to filter urls based on patterns, domains and content type
|
||||
filter_chain = FilterChain(
|
||||
[
|
||||
DomainFilter(
|
||||
allowed_domains=["docs.crawl4ai.com"],
|
||||
blocked_domains=["old.docs.crawl4ai.com"],
|
||||
),
|
||||
URLPatternFilter(patterns=["*core*", "*advanced*"],),
|
||||
ContentTypeFilter(allowed_types=["text/html"]),
|
||||
]
|
||||
)
|
||||
|
||||
# Create a keyword scorer that prioritises the pages with certain keywords first
|
||||
keyword_scorer = KeywordRelevanceScorer(
|
||||
keywords=["crawl", "example", "async", "configuration"], weight=0.7
|
||||
)
|
||||
|
||||
# Set up the configuration with robots.txt compliance enabled
|
||||
deep_crawl_config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BestFirstCrawlingStrategy(
|
||||
max_depth=2,
|
||||
include_external=False,
|
||||
filter_chain=filter_chain,
|
||||
url_scorer=keyword_scorer,
|
||||
),
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
stream=True,
|
||||
verbose=True,
|
||||
check_robots_txt=True, # Enable robots.txt compliance
|
||||
)
|
||||
|
||||
# Execute the crawl
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
print("\n📊 Starting deep crawl with Best-First strategy...")
|
||||
print(" - Filtering by domain, URL patterns, and content type")
|
||||
print(" - Scoring pages based on keyword relevance")
|
||||
print(" - Respecting robots.txt rules")
|
||||
|
||||
start_time = time.perf_counter()
|
||||
results = []
|
||||
|
||||
async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=deep_crawl_config):
|
||||
# Print each result as it comes in
|
||||
depth = result.metadata.get("depth", 0)
|
||||
score = result.metadata.get("score", 0)
|
||||
print(f"Crawled: {result.url} (Depth: {depth}), score: {score:.2f}")
|
||||
results.append(result)
|
||||
|
||||
duration = time.perf_counter() - start_time
|
||||
|
||||
# Print summary statistics
|
||||
print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
|
||||
|
||||
# Group by depth
|
||||
if results:
|
||||
depth_counts = {}
|
||||
for result in results:
|
||||
depth = result.metadata.get("depth", 0)
|
||||
depth_counts[depth] = depth_counts.get(depth, 0) + 1
|
||||
|
||||
print("\n📊 Pages crawled by depth:")
|
||||
for depth, count in sorted(depth_counts.items()):
|
||||
print(f" Depth {depth}: {count} pages")
|
||||
|
||||
|
||||
# 2️⃣ Memory-Adaptive Dispatcher
|
||||
async def memory_adaptive_dispatcher():
|
||||
"""
|
||||
PART 2: Memory-Adaptive Dispatcher
|
||||
|
||||
This function demonstrates:
|
||||
- Using MemoryAdaptiveDispatcher to manage system memory
|
||||
- Batch and streaming modes with multiple URLs
|
||||
"""
|
||||
print("\n===== MEMORY-ADAPTIVE DISPATCHER =====")
|
||||
print("This example shows how to use the memory-adaptive dispatcher for resource management.")
|
||||
|
||||
# Configure the dispatcher (optional, defaults are used if not provided)
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=80.0, # Pause if memory usage exceeds 80%
|
||||
check_interval=0.5, # Check memory every 0.5 seconds
|
||||
)
|
||||
|
||||
# Test URLs
|
||||
urls = [
|
||||
"https://docs.crawl4ai.com",
|
||||
"https://github.com/unclecode/crawl4ai"
|
||||
]
|
||||
|
||||
async def batch_mode():
|
||||
print("\n📊 BATCH MODE:")
|
||||
print(" In this mode, all results are collected before being returned.")
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
start_time = time.perf_counter()
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=CrawlerRunConfig(stream=False), # Batch mode
|
||||
dispatcher=dispatcher,
|
||||
)
|
||||
|
||||
print(f" ✅ Received all {len(results)} results after {time.perf_counter() - start_time:.2f} seconds")
|
||||
for result in results:
|
||||
print(f" → {result.url} with status code: {result.status_code}")
|
||||
|
||||
async def stream_mode():
|
||||
print("\n📊 STREAMING MODE:")
|
||||
print(" In this mode, results are processed as they become available.")
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
start_time = time.perf_counter()
|
||||
count = 0
|
||||
first_result_time = None
|
||||
|
||||
async for result in await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=CrawlerRunConfig(stream=True), # Stream mode
|
||||
dispatcher=dispatcher,
|
||||
):
|
||||
count += 1
|
||||
current_time = time.perf_counter() - start_time
|
||||
|
||||
if count == 1:
|
||||
first_result_time = current_time
|
||||
print(f" ✅ First result after {first_result_time:.2f} seconds: {result.url}")
|
||||
else:
|
||||
print(f" → Result #{count} after {current_time:.2f} seconds: {result.url}")
|
||||
|
||||
print(f" ✅ Total: {count} results")
|
||||
print(f" ✅ First result: {first_result_time:.2f} seconds")
|
||||
print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
|
||||
|
||||
# Run both examples
|
||||
await batch_mode()
|
||||
await stream_mode()
|
||||
|
||||
print("\n🔍 Key Takeaway: The memory-adaptive dispatcher prevents OOM errors")
|
||||
print(" and manages concurrency based on system resources.")
|
||||
|
||||
|
||||
# 3️⃣ HTTP Crawler Strategy
|
||||
async def http_crawler_strategy():
|
||||
"""
|
||||
PART 3: HTTP Crawler Strategy
|
||||
|
||||
This function demonstrates:
|
||||
- Using the lightweight HTTP-only crawler
|
||||
- Setting custom headers and configurations
|
||||
"""
|
||||
print("\n===== HTTP CRAWLER STRATEGY =====")
|
||||
print("This example shows how to use the fast, lightweight HTTP-only crawler.")
|
||||
|
||||
# Use the HTTP crawler strategy
|
||||
http_config = HTTPCrawlerConfig(
|
||||
method="GET",
|
||||
headers={"User-Agent": "MyCustomBot/1.0"},
|
||||
follow_redirects=True,
|
||||
verify_ssl=True
|
||||
)
|
||||
|
||||
print("\n📊 Initializing HTTP crawler strategy...")
|
||||
print(" - Using custom User-Agent: MyCustomBot/1.0")
|
||||
print(" - Following redirects: Enabled")
|
||||
print(" - Verifying SSL: Enabled")
|
||||
|
||||
# Create crawler with HTTP strategy
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=AsyncHTTPCrawlerStrategy(browser_config=http_config)
|
||||
) as crawler:
|
||||
start_time = time.perf_counter()
|
||||
result = await crawler.arun("https://example.com")
|
||||
duration = time.perf_counter() - start_time
|
||||
|
||||
print(f"\n✅ Crawled in {duration:.2f} seconds")
|
||||
print(f"✅ Status code: {result.status_code}")
|
||||
print(f"✅ Content length: {len(result.html)} bytes")
|
||||
|
||||
# Check if there was a redirect
|
||||
if result.redirected_url and result.redirected_url != result.url:
|
||||
print(f"ℹ️ Redirected from {result.url} to {result.redirected_url}")
|
||||
|
||||
print("\n🔍 Key Takeaway: HTTP crawler is faster and more memory-efficient")
|
||||
print(" than browser-based crawling for simple pages.")
|
||||
|
||||
|
||||
# 4️⃣ Proxy Rotation
|
||||
async def proxy_rotation():
|
||||
"""
|
||||
PART 4: Proxy Rotation
|
||||
|
||||
This function demonstrates:
|
||||
- Setting up a proxy rotation strategy
|
||||
- Using multiple proxies in a round-robin fashion
|
||||
"""
|
||||
print("\n===== PROXY ROTATION =====")
|
||||
print("This example shows how to implement proxy rotation for distributed crawling.")
|
||||
|
||||
# Load proxies and create rotation strategy
|
||||
proxies = ProxyConfig.from_env()
|
||||
#eg: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
|
||||
if not proxies:
|
||||
print("No proxies found in environment. Set PROXIES env variable!")
|
||||
return
|
||||
|
||||
proxy_strategy = RoundRobinProxyStrategy(proxies)
|
||||
|
||||
# Create configs
|
||||
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||
run_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
proxy_rotation_strategy=proxy_strategy
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
urls = ["https://httpbin.org/ip"] * (len(proxies) * 2) # Test each proxy twice
|
||||
|
||||
print("\n📈 Initializing crawler with proxy rotation...")
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
print("\n🚀 Starting batch crawl with proxy rotation...")
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=run_config
|
||||
)
|
||||
for result in results:
|
||||
if result.success:
|
||||
ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
|
||||
current_proxy = run_config.proxy_config if run_config.proxy_config else None
|
||||
|
||||
if current_proxy and ip_match:
|
||||
print(f"URL {result.url}")
|
||||
print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
|
||||
verified = ip_match.group(0) == current_proxy.ip
|
||||
if verified:
|
||||
print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
|
||||
else:
|
||||
print("❌ Proxy failed or IP mismatch!")
|
||||
print("---")
|
||||
else:
|
||||
print(f"❌ Crawl via proxy failed!: {result.error_message}")
|
||||
|
||||
|
||||
# 5️⃣ LLM Content Filter (requires API key)
|
||||
async def llm_content_filter():
|
||||
"""
|
||||
PART 5: LLM Content Filter
|
||||
|
||||
This function demonstrates:
|
||||
- Configuring LLM providers via LlmConfig
|
||||
- Using LLM to generate focused markdown
|
||||
- LlmConfig for configuration
|
||||
|
||||
Note: Requires a valid API key for the chosen LLM provider
|
||||
"""
|
||||
print("\n===== LLM CONTENT FILTER =====")
|
||||
print("This example shows how to use LLM to generate focused markdown content.")
|
||||
print("Note: This example requires an API key. Set it in environment variables.")
|
||||
|
||||
# Create LLM configuration
|
||||
# Replace with your actual API key or set as environment variable
|
||||
llm_config = LlmConfig(
|
||||
provider="gemini/gemini-1.5-pro",
|
||||
api_token="env:GEMINI_API_KEY" # Will read from GEMINI_API_KEY environment variable
|
||||
)
|
||||
|
||||
print("\n📊 Setting up LLM content filter...")
|
||||
print(f" - Provider: {llm_config.provider}")
|
||||
print(" - API token: Using environment variable")
|
||||
print(" - Instruction: Extract key concepts and summaries")
|
||||
|
||||
# Create markdown generator with LLM filter
|
||||
markdown_generator = DefaultMarkdownGenerator(
|
||||
content_filter=LLMContentFilter(
|
||||
llmConfig=llm_config,
|
||||
instruction="Extract key concepts and summaries"
|
||||
)
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(markdown_generator=markdown_generator)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://docs.crawl4ai.com", config=config)
|
||||
pprint(result.markdown.fit_markdown)
|
||||
print("\n✅ Generated focused markdown:")
|
||||
|
||||
|
||||
|
||||
# 6️⃣ PDF Processing
|
||||
async def pdf_processing():
|
||||
"""
|
||||
PART 6: PDF Processing
|
||||
|
||||
This function demonstrates:
|
||||
- Using PDFCrawlerStrategy and PDFContentScrapingStrategy
|
||||
- Extracting text and metadata from PDFs
|
||||
"""
|
||||
print("\n===== PDF PROCESSING =====")
|
||||
print("This example shows how to extract text and metadata from PDF files.")
|
||||
|
||||
# Sample PDF URL
|
||||
pdf_url = "https://arxiv.org/pdf/2310.06825.pdf"
|
||||
|
||||
print("\n📊 Initializing PDF crawler...")
|
||||
print(f" - Target PDF: {pdf_url}")
|
||||
print(" - Using PDFCrawlerStrategy and PDFContentScrapingStrategy")
|
||||
|
||||
# Create crawler with PDF strategy
|
||||
async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
|
||||
print("\n🚀 Starting PDF processing...")
|
||||
|
||||
start_time = time.perf_counter()
|
||||
result = await crawler.arun(
|
||||
pdf_url,
|
||||
config=CrawlerRunConfig(scraping_strategy=PDFContentScrapingStrategy())
|
||||
)
|
||||
duration = time.perf_counter() - start_time
|
||||
|
||||
print(f"\n✅ Processed PDF in {duration:.2f} seconds")
|
||||
|
||||
# Show metadata
|
||||
print("\n📄 PDF Metadata:")
|
||||
if result.metadata:
|
||||
for key, value in result.metadata.items():
|
||||
if key not in ["html", "text", "markdown"] and value:
|
||||
print(f" - {key}: {value}")
|
||||
else:
|
||||
print(" No metadata available")
|
||||
|
||||
# Show sample of content
|
||||
if result.markdown:
|
||||
print("\n📝 PDF Content Sample:")
|
||||
content_sample = result.markdown[:500] + "..." if len(result.markdown) > 500 else result.markdown
|
||||
print(f"---\n{content_sample}\n---")
|
||||
else:
|
||||
print("\n⚠️ No content extracted")
|
||||
|
||||
print("\n🔍 Key Takeaway: Crawl4AI can now process PDF files")
|
||||
print(" to extract both text content and metadata.")
|
||||
|
||||
|
||||
# 7️⃣ LLM Schema Generation (requires API key)
async def llm_schema_generation():
    """
    PART 7: LLM Schema Generation

    This function demonstrates:
    - Configuring LLM providers via LlmConfig
    - Using LLM to generate extraction schemas
    - JsonCssExtractionStrategy

    Note: Requires a valid API key for the chosen LLM provider
    """
    print("\n===== LLM SCHEMA GENERATION =====")
    print("This example shows how to use LLM to automatically generate extraction schemas.")
    print("Note: This example requires an API key. Set it in environment variables.")

    # Sample HTML
    sample_html = """
    <div class="product">
        <h2 class="title">Awesome Gaming Laptop</h2>
        <div class="price">$1,299.99</div>
        <div class="specs">
            <ul>
                <li>16GB RAM</li>
                <li>512GB SSD</li>
                <li>RTX 3080</li>
            </ul>
        </div>
        <div class="rating">4.7/5</div>
    </div>
    """
    print("\n📊 Setting up LlmConfig...")
    # Create LLM configuration (token is read from the GEMINI_API_KEY env var)
    gemini_config = LlmConfig(
        provider="gemini/gemini-1.5-pro",
        api_token="env:GEMINI_API_KEY",
    )
    print("\n🚀 Generating schema for product extraction...")
    print(" This would use the LLM to analyze HTML and create an extraction schema")
    generated_schema = JsonCssExtractionStrategy.generate_schema(
        html=sample_html,
        llmConfig=gemini_config,
        query="Extract product name and price",
    )
    print("\n✅ Generated Schema:")
    pprint(generated_schema)
# Run all sections
async def run_tutorial():
    """Run every tutorial section in order, reporting (but not propagating) failures."""
    print("\n🚀 CRAWL4AI v0.5.0 TUTORIAL 🚀")
    print("===============================")
    print("This tutorial demonstrates the key features of Crawl4AI v0.5.0")
    print("Including deep crawling, memory-adaptive dispatching, advanced filtering,")
    print("and more powerful extraction capabilities.")

    # Sections to run, in presentation order
    demos = (
        deep_crawl,                  # 1. Deep Crawling with Best-First Strategy
        memory_adaptive_dispatcher,  # 2. Memory-Adaptive Dispatcher
        http_crawler_strategy,       # 3. HTTP Crawler Strategy
        proxy_rotation,              # 4. Proxy Rotation
        llm_content_filter,          # 5. LLM Content Filter
        pdf_processing,              # 6. PDF Processing
        llm_schema_generation,       # 7. Schema Generation using LLM
    )

    # One failing section must not abort the rest of the tutorial.
    for demo in demos:
        try:
            await demo()
        except Exception as err:
            print(f"⚠️ Error in {demo.__name__}: {err}")

    print("\n🎉 TUTORIAL COMPLETE! 🎉")
    print("You've now explored the key features of Crawl4AI v0.5.0")
    print("For more information, visit https://docs.crawl4ai.com")


# Run the tutorial
if __name__ == "__main__":
    asyncio.run(run_tutorial())
@@ -200,7 +200,7 @@ Each `arun()` returns a **`CrawlResult`** containing:
|
||||
- `url`: Final URL (if redirected).
|
||||
- `html`: Original HTML.
|
||||
- `cleaned_html`: Sanitized HTML.
|
||||
- `markdown_v2` (or future `markdown`): Markdown outputs (raw, fit, etc.).
|
||||
- `markdown_v2`: Deprecated. Instead just use regular `markdown`
|
||||
- `extracted_content`: If an extraction strategy was used (JSON for CSS/LLM strategies).
|
||||
- `screenshot`, `pdf`: If screenshots/PDF requested.
|
||||
- `media`, `links`: Information about discovered images/links.
|
||||
|
||||
@@ -16,9 +16,6 @@ class CrawlResult(BaseModel):
|
||||
screenshot: Optional[str] = None
|
||||
pdf : Optional[bytes] = None
|
||||
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
|
||||
markdown_v2: Optional[MarkdownGenerationResult] = None
|
||||
fit_markdown: Optional[str] = None
|
||||
fit_html: Optional[str] = None
|
||||
extracted_content: Optional[str] = None
|
||||
metadata: Optional[dict] = None
|
||||
error_message: Optional[str] = None
|
||||
@@ -116,8 +113,8 @@ print(result.cleaned_html[:500]) # Show a snippet
|
||||
**When**: This is **only** present if your `markdown_generator` or `content_filter` produces it.
|
||||
**Usage**:
|
||||
```python
|
||||
if result.fit_html:
|
||||
print("High-value HTML content:", result.fit_html[:300])
|
||||
if result.markdown.fit_html:
|
||||
print("High-value HTML content:", result.markdown.fit_html[:300])
|
||||
```
|
||||
|
||||
---
|
||||
@@ -132,8 +129,6 @@ Crawl4AI can convert HTML→Markdown, optionally including:
|
||||
- **Links as citations** (with a references section)
|
||||
- **Fit** markdown if a **content filter** is used (like Pruning or BM25)
|
||||
|
||||
### 3.2 **`markdown_v2`** *(Optional[MarkdownGenerationResult])*
|
||||
**What**: The **structured** object holding multiple markdown variants. Soon to be consolidated into `markdown`.
|
||||
|
||||
**`MarkdownGenerationResult`** includes:
|
||||
- **`raw_markdown`** *(str)*: The full HTML→Markdown conversion.
|
||||
@@ -144,8 +139,8 @@ Crawl4AI can convert HTML→Markdown, optionally including:
|
||||
|
||||
**Usage**:
|
||||
```python
|
||||
if result.markdown_v2:
|
||||
md_res = result.markdown_v2
|
||||
if result.markdown:
|
||||
md_res = result.markdown
|
||||
print("Raw MD:", md_res.raw_markdown[:300])
|
||||
print("Citations MD:", md_res.markdown_with_citations[:300])
|
||||
print("References:", md_res.references_markdown)
|
||||
@@ -153,26 +148,15 @@ if result.markdown_v2:
|
||||
print("Pruned text:", md_res.fit_markdown[:300])
|
||||
```
|
||||
|
||||
### 3.3 **`markdown`** *(Optional[Union[str, MarkdownGenerationResult]])*
|
||||
**What**: In future versions, `markdown` will fully replace `markdown_v2`. Right now, it might be a `str` or a `MarkdownGenerationResult`.
|
||||
### 3.2 **`markdown`** *(Optional[Union[str, MarkdownGenerationResult]])*
|
||||
**What**: Holds the `MarkdownGenerationResult`.
|
||||
**Usage**:
|
||||
```python
|
||||
# Soon, you might see:
|
||||
if isinstance(result.markdown, MarkdownGenerationResult):
|
||||
print(result.markdown.raw_markdown[:200])
|
||||
else:
|
||||
print(result.markdown)
|
||||
print(result.markdown.fit_markdown)
|
||||
print(result.markdown.fit_html)
|
||||
```
|
||||
|
||||
### 3.4 **`fit_markdown`** *(Optional[str])*
|
||||
**What**: A direct reference to the final filtered markdown (legacy approach).
|
||||
**When**: This is set if a filter or content strategy explicitly writes there. Usually overshadowed by `markdown_v2.fit_markdown`.
|
||||
**Usage**:
|
||||
```python
|
||||
print(result.fit_markdown) # Legacy field, prefer result.markdown_v2.fit_markdown
|
||||
```
|
||||
|
||||
**Important**: “Fit” content (in `fit_markdown`/`fit_html`) only exists if you used a **filter** (like **PruningContentFilter** or **BM25ContentFilter**) within a `MarkdownGenerationStrategy`.
|
||||
**Important**: “Fit” content (in `fit_markdown`/`fit_html`) exists in result.markdown, only if you used a **filter** (like **PruningContentFilter** or **BM25ContentFilter**) within a `MarkdownGenerationStrategy`.
|
||||
|
||||
---
|
||||
|
||||
@@ -304,13 +288,11 @@ async def handle_result(result: CrawlResult):
|
||||
print("Cleaned HTML size:", len(result.cleaned_html or ""))
|
||||
|
||||
# Markdown output
|
||||
if result.markdown_v2:
|
||||
print("Raw Markdown:", result.markdown_v2.raw_markdown[:300])
|
||||
print("Citations Markdown:", result.markdown_v2.markdown_with_citations[:300])
|
||||
if result.markdown_v2.fit_markdown:
|
||||
print("Fit Markdown:", result.markdown_v2.fit_markdown[:200])
|
||||
else:
|
||||
print("Raw Markdown (legacy):", result.markdown[:200] if result.markdown else "N/A")
|
||||
if result.markdown:
|
||||
print("Raw Markdown:", result.markdown.raw_markdown[:300])
|
||||
print("Citations Markdown:", result.markdown.markdown_with_citations[:300])
|
||||
if result.markdown.fit_markdown:
|
||||
print("Fit Markdown:", result.markdown.fit_markdown[:200])
|
||||
|
||||
# Media & Links
|
||||
if "images" in result.media:
|
||||
@@ -333,12 +315,12 @@ async def handle_result(result: CrawlResult):
|
||||
|
||||
## 8. Key Points & Future
|
||||
|
||||
1. **`markdown_v2` vs `markdown`**
|
||||
- Right now, `markdown_v2` is the more robust container (`MarkdownGenerationResult`), providing **raw_markdown**, **markdown_with_citations**, references, plus possible **fit_markdown**.
|
||||
- In future versions, everything will unify under **`markdown`**. If you rely on advanced features (citations, fit content), check `markdown_v2`.
|
||||
1. **Deprecated legacy properties of CrawlResult**
|
||||
- `markdown_v2` - Deprecated in v0.5. Just use `markdown`. It holds the `MarkdownGenerationResult` now!
|
||||
- `fit_markdown` and `fit_html` - Deprecated in v0.5. They can now be accessed via `MarkdownGenerationResult` in `result.markdown`. eg: `result.markdown.fit_markdown` and `result.markdown.fit_html`
|
||||
|
||||
2. **Fit Content**
|
||||
- **`fit_markdown`** and **`fit_html`** appear only if you used a content filter (like **PruningContentFilter** or **BM25ContentFilter**) inside your **MarkdownGenerationStrategy** or set them directly.
|
||||
- **`fit_markdown`** and **`fit_html`** appear in MarkdownGenerationResult, only if you used a content filter (like **PruningContentFilter** or **BM25ContentFilter**) inside your **MarkdownGenerationStrategy** or set them directly.
|
||||
- If no filter is used, they remain `None`.
|
||||
|
||||
3. **References & Citations**
|
||||
|
||||
@@ -2,79 +2,151 @@
|
||||
|
||||
**Release Theme: Power, Flexibility, and Scalability**
|
||||
|
||||
Crawl4AI v0.5.0 is a major release focused on significantly enhancing the library's power, flexibility, and scalability. Key improvements include a new **deep crawling** system, a **memory-adaptive dispatcher** for handling large-scale crawls, **multiple crawling strategies** (including a fast HTTP-only crawler), **Docker** deployment options, and a powerful **command-line interface (CLI)**. This release also includes numerous bug fixes, performance optimizations, and documentation updates.
|
||||
Crawl4AI v0.5.0 is a major release focused on significantly enhancing the
|
||||
library's power, flexibility, and scalability. Key improvements include a new
|
||||
**deep crawling** system, a **memory-adaptive dispatcher** for handling
|
||||
large-scale crawls, **multiple crawling strategies** (including a fast HTTP-only
|
||||
crawler), **Docker** deployment options, and a powerful **command-line interface
|
||||
(CLI)**. This release also includes numerous bug fixes, performance
|
||||
optimizations, and documentation updates.
|
||||
|
||||
**Important Note:** This release contains several **breaking changes**. Please review the "Breaking Changes" section carefully and update your code accordingly.
|
||||
**Important Note:** This release contains several **breaking changes**. Please
|
||||
review the "Breaking Changes" section carefully and update your code
|
||||
accordingly.
|
||||
|
||||
## Key Features
|
||||
|
||||
### 1. Deep Crawling
|
||||
|
||||
Crawl4AI now supports deep crawling, allowing you to explore websites beyond the initial URLs. This is controlled by the `deep_crawl_strategy` parameter in `CrawlerRunConfig`. Several strategies are available:
|
||||
Crawl4AI now supports deep crawling, allowing you to explore websites beyond the
|
||||
initial URLs. This is controlled by the `deep_crawl_strategy` parameter in
|
||||
`CrawlerRunConfig`. Several strategies are available:
|
||||
|
||||
* **`BFSDeepCrawlStrategy` (Breadth-First Search):** Explores the website level by level. (Default)
|
||||
* **`DFSDeepCrawlStrategy` (Depth-First Search):** Explores each branch as deeply as possible before backtracking.
|
||||
* **`BestFirstCrawlingStrategy`:** Uses a scoring function to prioritize which URLs to crawl next.
|
||||
- **`BFSDeepCrawlStrategy` (Breadth-First Search):** Explores the website level
|
||||
by level. (Default)
|
||||
- **`DFSDeepCrawlStrategy` (Depth-First Search):** Explores each branch as
|
||||
deeply as possible before backtracking.
|
||||
- **`BestFirstCrawlingStrategy`:** Uses a scoring function to prioritize which
|
||||
URLs to crawl next.
|
||||
|
||||
```python
|
||||
import time
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeepCrawlStrategy
|
||||
from crawl4ai.deep_crawling import DomainFilter, ContentTypeFilter, FilterChain
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
from crawl4ai.deep_crawling import DomainFilter, ContentTypeFilter, FilterChain, URLPatternFilter, KeywordRelevanceScorer, BestFirstCrawlingStrategy
|
||||
import asyncio
|
||||
|
||||
# Configure a deep crawl with BFS, limiting to a specific domain and content type.
|
||||
# Create a filter chain to filter urls based on patterns, domains and content type
|
||||
filter_chain = FilterChain(
|
||||
filters=[
|
||||
DomainFilter(allowed_domains=["example.com"]),
|
||||
ContentTypeFilter(allowed_types=["text/html"])
|
||||
[
|
||||
DomainFilter(
|
||||
allowed_domains=["docs.crawl4ai.com"],
|
||||
blocked_domains=["old.docs.crawl4ai.com"],
|
||||
),
|
||||
URLPatternFilter(patterns=["*core*", "*advanced*"],),
|
||||
ContentTypeFilter(allowed_types=["text/html"]),
|
||||
]
|
||||
)
|
||||
deep_crawl_config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=5, filter_chain=filter_chain),
|
||||
stream=True # Process results as they arrive
|
||||
|
||||
# Create a keyword scorer that prioritises the pages with certain keywords first
|
||||
keyword_scorer = KeywordRelevanceScorer(
|
||||
keywords=["crawl", "example", "async", "configuration"], weight=0.7
|
||||
)
|
||||
|
||||
# Set up the configuration
|
||||
deep_crawl_config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BestFirstCrawlingStrategy(
|
||||
max_depth=2,
|
||||
include_external=False,
|
||||
filter_chain=filter_chain,
|
||||
url_scorer=keyword_scorer,
|
||||
),
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
stream=True,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async for result in await crawler.arun(url="https://example.com", config=deep_crawl_config):
|
||||
print(f"Crawled: {result.url} (Depth: {result.metadata['depth']})")
|
||||
start_time = time.perf_counter()
|
||||
results = []
|
||||
async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=deep_crawl_config):
|
||||
print(f"Crawled: {result.url} (Depth: {result.metadata['depth']}), score: {result.metadata['score']:.2f}")
|
||||
results.append(result)
|
||||
duration = time.perf_counter() - start_time
|
||||
print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Breaking Change:** The `max_depth` parameter is now part of `CrawlerRunConfig` and controls the *depth* of the crawl, not the number of concurrent crawls. The `arun()` and `arun_many()` methods are now decorated to handle deep crawling strategies. Imports for deep crawling strategies have changed. See the [Deep Crawling documentation](../deep_crawling/README.md) for more details.
|
||||
**Breaking Change:** The `max_depth` parameter is now part of `CrawlerRunConfig`
|
||||
and controls the _depth_ of the crawl, not the number of concurrent crawls. The
|
||||
`arun()` and `arun_many()` methods are now decorated to handle deep crawling
|
||||
strategies. Imports for deep crawling strategies have changed. See the
|
||||
[Deep Crawling documentation](../../core/deep-crawling.md) for more details.
|
||||
|
||||
### 2. Memory-Adaptive Dispatcher
|
||||
|
||||
The new `MemoryAdaptiveDispatcher` dynamically adjusts concurrency based on available system memory and includes built-in rate limiting. This prevents out-of-memory errors and avoids overwhelming target websites.
|
||||
The new `MemoryAdaptiveDispatcher` dynamically adjusts concurrency based on
|
||||
available system memory and includes built-in rate limiting. This prevents
|
||||
out-of-memory errors and avoids overwhelming target websites.
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, MemoryAdaptiveDispatcher
|
||||
import asyncio
|
||||
|
||||
# Configure the dispatcher (optional, defaults are used if not provided)
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=80.0, # Pause if memory usage exceeds 80%
|
||||
check_interval=0.5 # Check memory every 0.5 seconds
|
||||
check_interval=0.5, # Check memory every 0.5 seconds
|
||||
)
|
||||
|
||||
async def batch_mode():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=["https://example.com/1", "https://example.com/2"],
|
||||
urls=["https://docs.crawl4ai.com", "https://github.com/unclecode/crawl4ai"],
|
||||
config=CrawlerRunConfig(stream=False), # Batch mode
|
||||
dispatcher=dispatcher
|
||||
dispatcher=dispatcher,
|
||||
)
|
||||
for result in results:
|
||||
print(f"Crawled: {result.url} with status code: {result.status_code}")
|
||||
|
||||
async def stream_mode():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# OR, for streaming:
|
||||
async for result in await crawler.arun_many(urls, config=CrawlerRunConfig(stream=True), dispatcher=dispatcher):
|
||||
# ...
|
||||
async for result in await crawler.arun_many(
|
||||
urls=["https://docs.crawl4ai.com", "https://github.com/unclecode/crawl4ai"],
|
||||
config=CrawlerRunConfig(stream=True),
|
||||
dispatcher=dispatcher,
|
||||
):
|
||||
print(f"Crawled: {result.url} with status code: {result.status_code}")
|
||||
|
||||
print("Dispatcher in batch mode:")
|
||||
asyncio.run(batch_mode())
|
||||
print("-" * 50)
|
||||
print("Dispatcher in stream mode:")
|
||||
asyncio.run(stream_mode())
|
||||
```
|
||||
|
||||
**Breaking Change:** `AsyncWebCrawler.arun_many()` now uses `MemoryAdaptiveDispatcher` by default. Existing code that relied on unbounded concurrency may require adjustments.
|
||||
**Breaking Change:** `AsyncWebCrawler.arun_many()` now uses
|
||||
`MemoryAdaptiveDispatcher` by default. Existing code that relied on unbounded
|
||||
concurrency may require adjustments.
|
||||
|
||||
### 3. Multiple Crawling Strategies (Playwright and HTTP)
|
||||
|
||||
Crawl4AI now offers two crawling strategies:
|
||||
|
||||
* **`AsyncPlaywrightCrawlerStrategy` (Default):** Uses Playwright for browser-based crawling, supporting JavaScript rendering and complex interactions.
|
||||
* **`AsyncHTTPCrawlerStrategy`:** A lightweight, fast, and memory-efficient HTTP-only crawler. Ideal for simple scraping tasks where browser rendering is unnecessary.
|
||||
- **`AsyncPlaywrightCrawlerStrategy` (Default):** Uses Playwright for
|
||||
browser-based crawling, supporting JavaScript rendering and complex
|
||||
interactions.
|
||||
- **`AsyncHTTPCrawlerStrategy`:** A lightweight, fast, and memory-efficient
|
||||
HTTP-only crawler. Ideal for simple scraping tasks where browser rendering is
|
||||
unnecessary.
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, HTTPCrawlerConfig
|
||||
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
|
||||
import asyncio
|
||||
|
||||
# Use the HTTP crawler strategy
|
||||
http_crawler_config = HTTPCrawlerConfig(
|
||||
@@ -84,15 +156,20 @@ http_crawler_config = HTTPCrawlerConfig(
|
||||
verify_ssl=True
|
||||
)
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler(crawler_strategy=AsyncHTTPCrawlerStrategy(browser_config=http_crawler_config)) as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
print(f"Status code: {result.status_code}")
|
||||
print(f"Content length: {len(result.html)}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### 4. Docker Deployment
|
||||
|
||||
Crawl4AI can now be easily deployed as a Docker container, providing a consistent and isolated environment. The Docker image includes a FastAPI server with both streaming and non-streaming endpoints.
|
||||
Crawl4AI can now be easily deployed as a Docker container, providing a
|
||||
consistent and isolated environment. The Docker image includes a FastAPI server
|
||||
with both streaming and non-streaming endpoints.
|
||||
|
||||
```bash
|
||||
# Build the image (from the project root)
|
||||
@@ -104,26 +181,28 @@ docker run -d -p 8000:8000 --name crawl4ai crawl4ai
|
||||
|
||||
**API Endpoints:**
|
||||
|
||||
* `/crawl` (POST): Non-streaming crawl.
|
||||
* `/crawl/stream` (POST): Streaming crawl (NDJSON).
|
||||
* `/health` (GET): Health check.
|
||||
* `/schema` (GET): Returns configuration schemas.
|
||||
* `/md/{url}` (GET): Returns markdown content of the URL.
|
||||
* `/llm/{url}` (GET): Returns LLM extracted content.
|
||||
* `/token` (POST): Get JWT token
|
||||
- `/crawl` (POST): Non-streaming crawl.
|
||||
- `/crawl/stream` (POST): Streaming crawl (NDJSON).
|
||||
- `/health` (GET): Health check.
|
||||
- `/schema` (GET): Returns configuration schemas.
|
||||
- `/md/{url}` (GET): Returns markdown content of the URL.
|
||||
- `/llm/{url}` (GET): Returns LLM extracted content.
|
||||
- `/token` (POST): Get JWT token
|
||||
|
||||
**Breaking Changes:**
|
||||
|
||||
* Docker deployment now requires a `.llm.env` file for API keys.
|
||||
* Docker deployment now requires Redis and a new `config.yml` structure.
|
||||
* Server startup now uses `supervisord` instead of direct process management.
|
||||
* Docker server now requires authentication by default (JWT tokens).
|
||||
- Docker deployment now requires a `.llm.env` file for API keys.
|
||||
- Docker deployment now requires Redis and a new `config.yml` structure.
|
||||
- Server startup now uses `supervisord` instead of direct process management.
|
||||
- Docker server now requires authentication by default (JWT tokens).
|
||||
|
||||
See the [Docker deployment documentation](../deploy/docker/README.md) for detailed instructions.
|
||||
See the [Docker deployment documentation](../../core/docker-deployment.md) for
|
||||
detailed instructions.
|
||||
|
||||
### 5. Command-Line Interface (CLI)
|
||||
|
||||
A new CLI (`crwl`) provides convenient access to Crawl4AI's functionality from the terminal.
|
||||
A new CLI (`crwl`) provides convenient access to Crawl4AI's functionality from
|
||||
the terminal.
|
||||
|
||||
```bash
|
||||
# Basic crawl
|
||||
@@ -149,14 +228,21 @@ See the [CLI documentation](../docs/md_v2/core/cli.md) for more details.
|
||||
|
||||
### 6. LXML Scraping Mode
|
||||
|
||||
Added `LXMLWebScrapingStrategy` for faster HTML parsing using the `lxml` library. This can significantly improve scraping performance, especially for large or complex pages. Set `scraping_strategy=LXMLWebScrapingStrategy()` in your `CrawlerRunConfig`.
|
||||
Added `LXMLWebScrapingStrategy` for faster HTML parsing using the `lxml`
|
||||
library. This can significantly improve scraping performance, especially for
|
||||
large or complex pages. Set `scraping_strategy=LXMLWebScrapingStrategy()` in
|
||||
your `CrawlerRunConfig`.
|
||||
|
||||
**Breaking Change:** The `ScrapingMode` enum has been replaced with a strategy pattern. Use `WebScrapingStrategy` (default) or `LXMLWebScrapingStrategy`.
|
||||
**Breaking Change:** The `ScrapingMode` enum has been replaced with a strategy
|
||||
pattern. Use `WebScrapingStrategy` (default) or `LXMLWebScrapingStrategy`.
|
||||
|
||||
### 7. Proxy Rotation
|
||||
Added `ProxyRotationStrategy` abstract base class with `RoundRobinProxyStrategy` concrete implementation.
|
||||
|
||||
Added `ProxyRotationStrategy` abstract base class with `RoundRobinProxyStrategy`
|
||||
concrete implementation.
|
||||
|
||||
```python
|
||||
import re
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
@@ -164,8 +250,12 @@ from crawl4ai import (
|
||||
CacheMode,
|
||||
RoundRobinProxyStrategy,
|
||||
)
|
||||
import asyncio
|
||||
from crawl4ai.configs import ProxyConfig
|
||||
async def main():
|
||||
# Load proxies and create rotation strategy
|
||||
proxies = load_proxies_from_env()
|
||||
proxies = ProxyConfig.from_env()
|
||||
# e.g.: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
|
||||
if not proxies:
|
||||
print("No proxies found in environment. Set PROXIES env variable!")
|
||||
return
|
||||
@@ -178,38 +268,76 @@ from crawl4ai import (
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
proxy_rotation_strategy=proxy_strategy
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
urls = ["https://httpbin.org/ip"] * (len(proxies) * 2) # Test each proxy twice
|
||||
|
||||
print("\n📈 Initializing crawler with proxy rotation...")
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
print("\n🚀 Starting batch crawl with proxy rotation...")
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=run_config
|
||||
)
|
||||
for result in results:
|
||||
if result.success:
|
||||
ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
|
||||
current_proxy = run_config.proxy_config if run_config.proxy_config else None
|
||||
|
||||
if current_proxy and ip_match:
|
||||
print(f"URL {result.url}")
|
||||
print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
|
||||
verified = ip_match.group(0) == current_proxy.ip
|
||||
if verified:
|
||||
print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
|
||||
else:
|
||||
print("❌ Proxy failed or IP mismatch!")
|
||||
print("---")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## Other Changes and Improvements
|
||||
|
||||
* **Added: `LLMContentFilter` for intelligent markdown generation.** This new filter uses an LLM to create more focused and relevant markdown output.
|
||||
- **Added: `LLMContentFilter` for intelligent markdown generation.** This new
|
||||
filter uses an LLM to create more focused and relevant markdown output.
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
import asyncio
|
||||
|
||||
llm_config = LlmConfig(provider="openai/gpt-4o", api_token="YOUR_API_KEY")
|
||||
llm_config = LlmConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
||||
|
||||
markdown_generator = DefaultMarkdownGenerator(
|
||||
content_filter=LLMContentFilter(llmConfig=llm_config, instruction="Extract key concepts and summaries")
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(markdown_generator=markdown_generator)
|
||||
async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://example.com/article", config=config)
|
||||
print(result.markdown) # Output will be filtered and formatted by the LLM
|
||||
result = await crawler.arun("https://docs.crawl4ai.com", config=config)
|
||||
print(result.markdown.fit_markdown)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
* **Added: URL redirection tracking.** The crawler now automatically follows HTTP redirects (301, 302, 307, 308) and records the final URL in the `redirected_url` field of the `CrawlResult` object. No code changes are required to enable this; it's automatic.
|
||||
- **Added: URL redirection tracking.** The crawler now automatically follows
|
||||
HTTP redirects (301, 302, 307, 308) and records the final URL in the
|
||||
`redirected_url` field of the `CrawlResult` object. No code changes are
|
||||
required to enable this; it's automatic.
|
||||
|
||||
* **Added: LLM-powered schema generation utility.** A new `generate_schema` method has been added to `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. This greatly simplifies creating extraction schemas.
|
||||
- **Added: LLM-powered schema generation utility.** A new `generate_schema`
|
||||
method has been added to `JsonCssExtractionStrategy` and
|
||||
`JsonXPathExtractionStrategy`. This greatly simplifies creating extraction
|
||||
schemas.
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
|
||||
llm_config = LlmConfig(provider="openai/gpt-4o", api_token="YOUR_API_KEY")
|
||||
llm_config = LlmConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
||||
|
||||
schema = JsonCssExtractionStrategy.generate_schema(
|
||||
html="<div class='product'><h2>Product Name</h2><span class='price'>$99</span></div>",
|
||||
@@ -217,43 +345,60 @@ from crawl4ai import (
|
||||
query="Extract product name and price"
|
||||
)
|
||||
print(schema)
|
||||
# Expected Output (may vary slightly due to LLM):
|
||||
# {
|
||||
# "name": "ProductExtractor",
|
||||
# "baseSelector": "div.product",
|
||||
# "fields": [
|
||||
# {"name": "name", "selector": "h2", "type": "text"},
|
||||
# {"name": "price", "selector": ".price", "type": "text"}
|
||||
# ]
|
||||
# }
|
||||
```
|
||||
|
||||
* **Added: robots.txt compliance support.** The crawler can now respect `robots.txt` rules. Enable this by setting `check_robots_txt=True` in `CrawlerRunConfig`.
|
||||
Expected Output (may vary slightly due to LLM)
|
||||
```JSON
|
||||
{
|
||||
"name": "ProductExtractor",
|
||||
"baseSelector": "div.product",
|
||||
"fields": [
|
||||
{"name": "name", "selector": "h2", "type": "text"},
|
||||
{"name": "price", "selector": ".price", "type": "text"}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
- **Added: robots.txt compliance support.** The crawler can now respect
|
||||
`robots.txt` rules. Enable this by setting `check_robots_txt=True` in
|
||||
`CrawlerRunConfig`.
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(check_robots_txt=True)
|
||||
```
|
||||
|
||||
* **Added: PDF processing capabilities.** Crawl4AI can now extract text, images, and metadata from PDF files (both local and remote). This uses a new `PDFCrawlerStrategy` and `PDFContentScrapingStrategy`.
|
||||
- **Added: PDF processing capabilities.** Crawl4AI can now extract text, images,
|
||||
and metadata from PDF files (both local and remote). This uses a new
|
||||
`PDFCrawlerStrategy` and `PDFContentScrapingStrategy`.
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
|
||||
import asyncio
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://example.com/document.pdf",
|
||||
"https://arxiv.org/pdf/2310.06825.pdf",
|
||||
config=CrawlerRunConfig(
|
||||
scraping_strategy=PDFContentScrapingStrategy()
|
||||
)
|
||||
)
|
||||
print(result.markdown) # Access extracted text
|
||||
print(result.metadata) # Access PDF metadata (title, author, etc.)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
* **Added: Support for frozenset serialization.** Improves configuration serialization, especially for sets of allowed/blocked domains. No code changes required.
|
||||
- **Added: Support for frozenset serialization.** Improves configuration
|
||||
serialization, especially for sets of allowed/blocked domains. No code changes
|
||||
required.
|
||||
|
||||
* **Added: New `LlmConfig` parameter.** This new parameter can be passed for extraction, filtering, and schema generation tasks. It simplifies passing provider strings, API tokens, and base URLs across all sections where LLM configuration is necessary. It also enables reuse and allows for quick experimentation between different LLM configurations.
|
||||
- **Added: New `LlmConfig` parameter.** This new parameter can be passed for
|
||||
extraction, filtering, and schema generation tasks. It simplifies passing
|
||||
provider strings, API tokens, and base URLs across all sections where LLM
|
||||
configuration is necessary. It also enables reuse and allows for quick
|
||||
experimentation between different LLM configurations.
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
@@ -271,61 +416,109 @@ from crawl4ai import (
|
||||
config=CrawlerRunConfig(extraction_strategy=strategy)
|
||||
)
|
||||
```
|
||||
**Breaking Change:** Removed old parameters like `provider`, `api_token`, `base_url`, and `api_base` from `LLMExtractionStrategy` and `LLMContentFilter`. Users should migrate to using the `LlmConfig` object.
|
||||
**Breaking Change:** Removed old parameters like `provider`, `api_token`,
|
||||
`base_url`, and `api_base` from `LLMExtractionStrategy` and
|
||||
`LLMContentFilter`. Users should migrate to using the `LlmConfig` object.
|
||||
|
||||
* **Changed:** Improved browser context management and added shared data support (**Breaking Change:** `BrowserContext` API updated). Browser contexts are now managed more efficiently, reducing resource usage. A new `shared_data` dictionary is available in the `BrowserContext` to allow passing data between different stages of the crawling process. **Breaking Change:** The `BrowserContext` API has changed, and the old `get_context` method is deprecated.
|
||||
- **Changed: Improved browser context management and added shared data support.
|
||||
(Breaking Change:** `BrowserContext` API updated). Browser contexts are now
|
||||
managed more efficiently, reducing resource usage. A new `shared_data`
|
||||
dictionary is available in the `BrowserContext` to allow passing data between
|
||||
different stages of the crawling process. **Breaking Change:** The
|
||||
`BrowserContext` API has changed, and the old `get_context` method is
|
||||
deprecated.
|
||||
|
||||
* **Changed:** Renamed `final_url` to `redirected_url` in `CrawledURL`. This improves consistency and clarity. Update any code referencing the old field name.
|
||||
- **Changed:** Renamed `final_url` to `redirected_url` in `CrawledURL`. This
|
||||
improves consistency and clarity. Update any code referencing the old field
|
||||
name.
|
||||
|
||||
* **Changed:** Improved type hints and removed unused files. This is an internal improvement and should not require code changes.
|
||||
- **Changed:** Improved type hints and removed unused files. This is an internal
|
||||
improvement and should not require code changes.
|
||||
|
||||
* **Changed:** Reorganized deep crawling functionality into dedicated module. (**Breaking Change:** Import paths for `DeepCrawlStrategy` and related classes have changed). This improves code organization. Update imports to use the new `crawl4ai.deep_crawling` module.
|
||||
- **Changed:** Reorganized deep crawling functionality into dedicated module.
|
||||
(**Breaking Change:** Import paths for `DeepCrawlStrategy` and related classes
|
||||
have changed). This improves code organization. Update imports to use the new
|
||||
`crawl4ai.deep_crawling` module.
|
||||
|
||||
* **Changed:** Improved HTML handling and cleanup codebase. (**Breaking Change:** Removed `ssl_certificate.json` file). This removes an unused file. If you were relying on this file for custom certificate validation, you'll need to implement an alternative approach.
|
||||
- **Changed:** Improved HTML handling and cleanup codebase. (**Breaking
|
||||
Change:** Removed `ssl_certificate.json` file). This removes an unused file.
|
||||
If you were relying on this file for custom certificate validation, you'll
|
||||
need to implement an alternative approach.
|
||||
|
||||
* **Changed:** Enhanced serialization and config handling. (**Breaking Change:** `FastFilterChain` has been replaced with `FilterChain`). This change simplifies config and improves the serialization.
|
||||
- **Changed:** Enhanced serialization and config handling. (**Breaking Change:**
|
||||
`FastFilterChain` has been replaced with `FilterChain`). This change
|
||||
simplifies config and improves the serialization.
|
||||
|
||||
* **Added:** Modified the license to Apache 2.0 *with a required attribution clause*. See the `LICENSE` file for details. All users must now clearly attribute the Crawl4AI project when using, distributing, or creating derivative works.
|
||||
- **Added:** Modified the license to Apache 2.0 _with a required attribution
|
||||
clause_. See the `LICENSE` file for details. All users must now clearly
|
||||
attribute the Crawl4AI project when using, distributing, or creating
|
||||
derivative works.
|
||||
|
||||
* **Fixed:** Prevent memory leaks by ensuring proper closure of Playwright pages. No code changes required.
|
||||
- **Fixed:** Prevent memory leaks by ensuring proper closure of Playwright
|
||||
pages. No code changes required.
|
||||
|
||||
* **Fixed:** Make model fields optional with default values (**Breaking Change:** Code relying on all fields being present may need adjustment). Fields in data models (like `CrawledURL`) are now optional, with default values (usually `None`). Update code to handle potential `None` values.
|
||||
- **Fixed:** Make model fields optional with default values (**Breaking
|
||||
Change:** Code relying on all fields being present may need adjustment).
|
||||
Fields in data models (like `CrawledURL`) are now optional, with default
|
||||
values (usually `None`). Update code to handle potential `None` values.
|
||||
|
||||
* **Fixed:** Adjust memory threshold and fix dispatcher initialization. This is an internal bug fix; no code changes are required.
|
||||
- **Fixed:** Adjust memory threshold and fix dispatcher initialization. This is
|
||||
an internal bug fix; no code changes are required.
|
||||
|
||||
* **Fixed:** Ensure proper exit after running doctor command. No code changes are required.
|
||||
* **Fixed:** JsonCss selector and crawler improvements.
|
||||
* **Fixed:** Not working long page screenshot (#403)
|
||||
* **Documentation:** Updated documentation URLs to the new domain.
|
||||
* **Documentation:** Added SERP API project example.
|
||||
* **Documentation:** Added clarifying comments for CSS selector behavior.
|
||||
* **Documentation:** Add Code of Conduct for the project (#410)
|
||||
- **Fixed:** Ensure proper exit after running doctor command. No code changes
|
||||
are required.
|
||||
- **Fixed:** JsonCss selector and crawler improvements.
|
||||
- **Fixed:** Not working long page screenshot (#403)
|
||||
- **Documentation:** Updated documentation URLs to the new domain.
|
||||
- **Documentation:** Added SERP API project example.
|
||||
- **Documentation:** Added clarifying comments for CSS selector behavior.
|
||||
- **Documentation:** Add Code of Conduct for the project (#410)
|
||||
|
||||
## Breaking Changes Summary
|
||||
|
||||
* **Dispatcher:** The `MemoryAdaptiveDispatcher` is now the default for `arun_many()`, changing concurrency behavior. The return type of `arun_many` depends on the `stream` parameter.
|
||||
* **Deep Crawling:** `max_depth` is now part of `CrawlerRunConfig` and controls crawl depth. Import paths for deep crawling strategies have changed.
|
||||
* **Browser Context:** The `BrowserContext` API has been updated.
|
||||
* **Models:** Many fields in data models are now optional, with default values.
|
||||
* **Scraping Mode:** `ScrapingMode` enum replaced by strategy pattern (`WebScrapingStrategy`, `LXMLWebScrapingStrategy`).
|
||||
* **Content Filter:** Removed `content_filter` parameter from `CrawlerRunConfig`. Use extraction strategies or markdown generators with filters instead.
|
||||
* **Removed:** Synchronous `WebCrawler`, CLI, and docs management functionality.
|
||||
* **Docker:** Significant changes to Docker deployment, including new requirements and configuration.
|
||||
* **File Removed**: Removed ssl_certificate.json file which might affect existing certificate validations
|
||||
* **Renamed**: final_url to redirected_url for consistency
|
||||
* **Config**: FastFilterChain has been replaced with FilterChain
|
||||
* **Deep-Crawl**: DeepCrawlStrategy.arun now returns Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||
* **Proxy**: Removed synchronous WebCrawler support and related rate limiting configurations
|
||||
- **Dispatcher:** The `MemoryAdaptiveDispatcher` is now the default for
|
||||
`arun_many()`, changing concurrency behavior. The return type of `arun_many`
|
||||
depends on the `stream` parameter.
|
||||
- **Deep Crawling:** `max_depth` is now part of `CrawlerRunConfig` and controls
|
||||
crawl depth. Import paths for deep crawling strategies have changed.
|
||||
- **Browser Context:** The `BrowserContext` API has been updated.
|
||||
- **Models:** Many fields in data models are now optional, with default values.
|
||||
- **Scraping Mode:** `ScrapingMode` enum replaced by strategy pattern
|
||||
(`WebScrapingStrategy`, `LXMLWebScrapingStrategy`).
|
||||
- **Content Filter:** Removed `content_filter` parameter from
|
||||
`CrawlerRunConfig`. Use extraction strategies or markdown generators with
|
||||
filters instead.
|
||||
- **Removed:** Synchronous `WebCrawler`, CLI, and docs management functionality.
|
||||
- **Docker:** Significant changes to Docker deployment, including new
|
||||
requirements and configuration.
|
||||
- **File Removed**: Removed ssl_certificate.json file which might affect
|
||||
existing certificate validations
|
||||
- **Renamed**: final_url to redirected_url for consistency
|
||||
- **Config**: FastFilterChain has been replaced with FilterChain
|
||||
- **Deep-Crawl**: DeepCrawlStrategy.arun now returns Union[CrawlResultT,
|
||||
List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||
- **Proxy**: Removed synchronous WebCrawler support and related rate limiting
|
||||
configurations
|
||||
|
||||
## Migration Guide
|
||||
|
||||
1. **Update Imports:** Adjust imports for `DeepCrawlStrategy`, `BreadthFirstSearchStrategy`, and related classes due to the new `deep_crawling` module structure.
|
||||
2. **`CrawlerRunConfig`:** Move `max_depth` to `CrawlerRunConfig`. If using `content_filter`, migrate to an extraction strategy or a markdown generator with a filter.
|
||||
3. **`arun_many()`:** Adapt code to the new `MemoryAdaptiveDispatcher` behavior and the return type.
|
||||
1. **Update Imports:** Adjust imports for `DeepCrawlStrategy`,
|
||||
`BreadthFirstSearchStrategy`, and related classes due to the new
|
||||
`deep_crawling` module structure.
|
||||
2. **`CrawlerRunConfig`:** Move `max_depth` to `CrawlerRunConfig`. If using
|
||||
`content_filter`, migrate to an extraction strategy or a markdown generator
|
||||
with a filter.
|
||||
3. **`arun_many()`:** Adapt code to the new `MemoryAdaptiveDispatcher` behavior
|
||||
and the return type.
|
||||
4. **`BrowserContext`:** Update code using the `BrowserContext` API.
|
||||
5. **Models:** Handle potential `None` values for optional fields in data models.
|
||||
6. **Scraping:** Replace `ScrapingMode` enum with `WebScrapingStrategy` or `LXMLWebScrapingStrategy`.
|
||||
7. **Docker:** Review the updated Docker documentation and adjust your deployment accordingly.
|
||||
8. **CLI:** Migrate to the new `crwl` command and update any scripts using the old CLI.
|
||||
9. **Proxy:**: Removed synchronous WebCrawler support and related rate limiting configurations.
|
||||
5. **Models:** Handle potential `None` values for optional fields in data
|
||||
models.
|
||||
6. **Scraping:** Replace `ScrapingMode` enum with `WebScrapingStrategy` or
|
||||
`LXMLWebScrapingStrategy`.
|
||||
7. **Docker:** Review the updated Docker documentation and adjust your
|
||||
deployment accordingly.
|
||||
8. **CLI:** Migrate to the new `crwl` command and update any scripts using the
|
||||
old CLI.
|
||||
9. **Proxy:**: Removed synchronous WebCrawler support and related rate limiting
|
||||
configurations.
|
||||
10. **Config:**: Replace FastFilterChain to FilterChain
|
||||
|
||||
@@ -27,7 +27,6 @@ class CrawlResult(BaseModel):
|
||||
screenshot: Optional[str] = None
|
||||
pdf : Optional[bytes] = None
|
||||
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
|
||||
markdown_v2: Optional[MarkdownGenerationResult] = None
|
||||
extracted_content: Optional[str] = None
|
||||
metadata: Optional[dict] = None
|
||||
error_message: Optional[str] = None
|
||||
@@ -52,8 +51,7 @@ class CrawlResult(BaseModel):
|
||||
| **downloaded_files (`Optional[List[str]]`)** | If `accept_downloads=True` in `BrowserConfig`, this lists the filepaths of saved downloads. |
|
||||
| **screenshot (`Optional[str]`)** | Screenshot of the page (base64-encoded) if `screenshot=True`. |
|
||||
| **pdf (`Optional[bytes]`)** | PDF of the page if `pdf=True`. |
|
||||
| **markdown (`Optional[str or MarkdownGenerationResult]`)** | For now, `markdown_v2` holds a `MarkdownGenerationResult`. Over time, this will be consolidated into `markdown`. The generator can provide raw markdown, citations, references, and optionally `fit_markdown`. |
|
||||
| **markdown_v2 (`Optional[MarkdownGenerationResult]`)** | Legacy field for detailed markdown output. This will be replaced by `markdown` soon. |
|
||||
| **markdown (`Optional[str or MarkdownGenerationResult]`)** | It holds a `MarkdownGenerationResult`. The generator can provide raw markdown, citations, references, and optionally `fit_markdown`. |
|
||||
| **extracted_content (`Optional[str]`)** | The output of a structured extraction (CSS/LLM-based) stored as JSON string or other text. |
|
||||
| **metadata (`Optional[dict]`)** | Additional info about the crawl or extracted data. |
|
||||
| **error_message (`Optional[str]`)** | If `success=False`, contains a short description of what went wrong. |
|
||||
@@ -90,10 +88,10 @@ print(result.cleaned_html) # Freed of forms, header, footer, data-* attributes
|
||||
|
||||
## 3. Markdown Generation
|
||||
|
||||
### 3.1 `markdown_v2` (Legacy) vs `markdown`
|
||||
### 3.1 `markdown`
|
||||
|
||||
- **`markdown_v2`**: The current location for detailed markdown output, returning a **`MarkdownGenerationResult`** object.
|
||||
- **`markdown`**: Eventually, we’re merging these fields. For now, you might see `result.markdown_v2` used widely in code examples.
|
||||
- **`markdown`**: The current location for detailed markdown output, returning a **`MarkdownGenerationResult`** object.
|
||||
- **`markdown_v2`**: Deprecated since v0.5.
|
||||
|
||||
**`MarkdownGenerationResult`** Fields:
|
||||
|
||||
@@ -118,7 +116,7 @@ config = CrawlerRunConfig(
|
||||
)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
|
||||
md_res = result.markdown_v2 # or eventually 'result.markdown'
|
||||
md_res = result.markdown # or eventually 'result.markdown'
|
||||
print(md_res.raw_markdown[:500])
|
||||
print(md_res.markdown_with_citations)
|
||||
print(md_res.references_markdown)
|
||||
@@ -224,15 +222,17 @@ Check any field:
|
||||
if result.success:
|
||||
print(result.status_code, result.response_headers)
|
||||
print("Links found:", len(result.links.get("internal", [])))
|
||||
if result.markdown_v2:
|
||||
print("Markdown snippet:", result.markdown_v2.raw_markdown[:200])
|
||||
if result.markdown:
|
||||
print("Markdown snippet:", result.markdown.raw_markdown[:200])
|
||||
if result.extracted_content:
|
||||
print("Structured JSON:", result.extracted_content)
|
||||
else:
|
||||
print("Error:", result.error_message)
|
||||
```
|
||||
|
||||
**Remember**: Use `result.markdown_v2` for now. It will eventually become `result.markdown`.
|
||||
**Deprecation**: Since v0.5, `result.markdown_v2`, `result.fit_html`, and `result.fit_markdown` are deprecated. Use `result.markdown` instead! It holds a `MarkdownGenerationResult`, which includes `fit_html` and `fit_markdown` as its properties.
|
||||
|
||||
|
||||
---
|
||||
|
||||
|
||||
436
docs/md_v2/core/deep-crawling.md
Normal file
436
docs/md_v2/core/deep-crawling.md
Normal file
@@ -0,0 +1,436 @@
|
||||
# Deep Crawling
|
||||
|
||||
One of Crawl4AI's most powerful features is its ability to perform **configurable deep crawling** that can explore websites beyond a single page. With fine-tuned control over crawl depth, domain boundaries, and content filtering, Crawl4AI gives you the tools to extract precisely the content you need.
|
||||
|
||||
In this tutorial, you'll learn:
|
||||
|
||||
1. How to set up a **Basic Deep Crawler** with BFS strategy
|
||||
2. Understanding the difference between **streamed and non-streamed** output
|
||||
3. Implementing **filters and scorers** to target specific content
|
||||
4. Creating **advanced filtering chains** for sophisticated crawls
|
||||
5. Using **BestFirstCrawling** for intelligent exploration prioritization
|
||||
|
||||
> **Prerequisites**
|
||||
> - You’ve completed or read [AsyncWebCrawler Basics](../core/simple-crawling.md) to understand how to run a simple crawl.
|
||||
> - You know how to configure `CrawlerRunConfig`.
|
||||
|
||||
---
|
||||
|
||||
## 1. Quick Example
|
||||
|
||||
Here's a minimal code snippet that implements a basic deep crawl using the **BFSDeepCrawlStrategy**:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
|
||||
async def main():
|
||||
# Configure a 2-level deep crawl
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=2,
|
||||
include_external=False
|
||||
),
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
verbose=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun("https://example.com", config=config)
|
||||
|
||||
print(f"Crawled {len(results)} pages in total")
|
||||
|
||||
# Access individual results
|
||||
for result in results[:3]: # Show first 3 results
|
||||
print(f"URL: {result.url}")
|
||||
print(f"Depth: {result.metadata.get('depth', 0)}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**What's happening?**
|
||||
- `BFSDeepCrawlStrategy(max_depth=2, include_external=False)` instructs Crawl4AI to:
|
||||
- Crawl the starting page (depth 0) plus 2 more levels
|
||||
- Stay within the same domain (don't follow external links)
|
||||
- Each result contains metadata like the crawl depth
|
||||
- Results are returned as a list after all crawling is complete
|
||||
|
||||
---
|
||||
|
||||
## 2. Understanding Deep Crawling Strategy Options
|
||||
|
||||
### 2.1 BFSDeepCrawlStrategy (Breadth-First Search)
|
||||
|
||||
The **BFSDeepCrawlStrategy** uses a breadth-first approach, exploring all links at one depth before moving deeper:
|
||||
|
||||
```python
|
||||
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
|
||||
|
||||
# Basic configuration
|
||||
strategy = BFSDeepCrawlStrategy(
|
||||
max_depth=2, # Crawl initial page + 2 levels deep
|
||||
include_external=False, # Stay within the same domain
|
||||
)
|
||||
```
|
||||
|
||||
**Key parameters:**
|
||||
- **`max_depth`**: Number of levels to crawl beyond the starting page
|
||||
- **`include_external`**: Whether to follow links to other domains
|
||||
|
||||
### 2.2 DFSDeepCrawlStrategy (Depth-First Search)
|
||||
|
||||
The **DFSDeepCrawlStrategy** uses a depth-first approach, exploring as far down a branch as possible before backtracking.
|
||||
|
||||
```python
|
||||
from crawl4ai.deep_crawling import DFSDeepCrawlStrategy
|
||||
|
||||
# Basic configuration
|
||||
strategy = DFSDeepCrawlStrategy(
|
||||
max_depth=2, # Crawl initial page + 2 levels deep
|
||||
include_external=False, # Stay within the same domain
|
||||
)
|
||||
```
|
||||
|
||||
**Key parameters:**
|
||||
- **`max_depth`**: Number of levels to crawl beyond the starting page
|
||||
- **`include_external`**: Whether to follow links to other domains
|
||||
|
||||
### 2.3 BestFirstCrawlingStrategy (⭐️ - Recommended Deep crawl strategy)
|
||||
|
||||
For more intelligent crawling, use **BestFirstCrawlingStrategy** with scorers to prioritize the most relevant pages:
|
||||
|
||||
```python
|
||||
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
|
||||
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
|
||||
|
||||
# Create a scorer
|
||||
scorer = KeywordRelevanceScorer(
|
||||
keywords=["crawl", "example", "async", "configuration"],
|
||||
weight=0.7
|
||||
)
|
||||
|
||||
# Configure the strategy
|
||||
strategy = BestFirstCrawlingStrategy(
|
||||
max_depth=2,
|
||||
include_external=False,
|
||||
url_scorer=scorer
|
||||
)
|
||||
```
|
||||
|
||||
This crawling approach:
|
||||
- Evaluates each discovered URL based on scorer criteria
|
||||
- Visits higher-scoring pages first
|
||||
- Helps focus crawl resources on the most relevant content
|
||||
|
||||
---
|
||||
|
||||
## 3. Streaming vs. Non-Streaming Results
|
||||
|
||||
Crawl4AI can return results in two modes:
|
||||
|
||||
### 3.1 Non-Streaming Mode (Default)
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1),
|
||||
stream=False # Default behavior
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Wait for ALL results to be collected before returning
|
||||
results = await crawler.arun("https://example.com", config=config)
|
||||
|
||||
for result in results:
|
||||
process_result(result)
|
||||
```
|
||||
|
||||
**When to use non-streaming mode:**
|
||||
- You need the complete dataset before processing
|
||||
- You're performing batch operations on all results together
|
||||
- Crawl time isn't a critical factor
|
||||
|
||||
### 3.2 Streaming Mode
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1),
|
||||
stream=True # Enable streaming
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Returns an async iterator
|
||||
async for result in await crawler.arun("https://example.com", config=config):
|
||||
# Process each result as it becomes available
|
||||
process_result(result)
|
||||
```
|
||||
|
||||
**Benefits of streaming mode:**
|
||||
- Process results immediately as they're discovered
|
||||
- Start working with early results while crawling continues
|
||||
- Better for real-time applications or progressive display
|
||||
- Reduces memory pressure when handling many pages
|
||||
|
||||
---
|
||||
|
||||
## 4. Filtering Content with Filter Chains
|
||||
|
||||
Filters help you narrow down which pages to crawl. Combine multiple filters using **FilterChain** for powerful targeting.
|
||||
|
||||
### 4.1 Basic URL Pattern Filter
|
||||
|
||||
```python
|
||||
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter
|
||||
|
||||
# Only follow URLs containing "blog" or "docs"
|
||||
url_filter = URLPatternFilter(patterns=["*blog*", "*docs*"])
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=1,
|
||||
filter_chain=FilterChain([url_filter])
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### 4.2 Combining Multiple Filters
|
||||
|
||||
```python
|
||||
from crawl4ai.deep_crawling.filters import (
|
||||
FilterChain,
|
||||
URLPatternFilter,
|
||||
DomainFilter,
|
||||
ContentTypeFilter
|
||||
)
|
||||
|
||||
# Create a chain of filters
|
||||
filter_chain = FilterChain([
|
||||
# Only follow URLs with specific patterns
|
||||
URLPatternFilter(patterns=["*guide*", "*tutorial*"]),
|
||||
|
||||
# Only crawl specific domains
|
||||
DomainFilter(
|
||||
allowed_domains=["docs.example.com"],
|
||||
blocked_domains=["old.docs.example.com"]
|
||||
),
|
||||
|
||||
# Only include specific content types
|
||||
ContentTypeFilter(allowed_types=["text/html"])
|
||||
])
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=2,
|
||||
filter_chain=filter_chain
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### 4.3 Available Filter Types
|
||||
|
||||
Crawl4AI includes several specialized filters:
|
||||
|
||||
- **`URLPatternFilter`**: Matches URL patterns using wildcard syntax
|
||||
- **`DomainFilter`**: Controls which domains to include or exclude
|
||||
- **`ContentTypeFilter`**: Filters based on HTTP Content-Type
|
||||
- **`ContentRelevanceFilter`**: Uses similarity to a text query
|
||||
- **`SEOFilter`**: Evaluates SEO elements (meta tags, headers, etc.)
|
||||
|
||||
---
|
||||
|
||||
## 5. Using Scorers for Prioritized Crawling
|
||||
|
||||
Scorers assign priority values to discovered URLs, helping the crawler focus on the most relevant content first.
|
||||
|
||||
### 5.1 KeywordRelevanceScorer
|
||||
|
||||
```python
|
||||
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
|
||||
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
|
||||
|
||||
# Create a keyword relevance scorer
|
||||
keyword_scorer = KeywordRelevanceScorer(
|
||||
keywords=["crawl", "example", "async", "configuration"],
|
||||
weight=0.7 # Importance of this scorer (0.0 to 1.0)
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BestFirstCrawlingStrategy(
|
||||
max_depth=2,
|
||||
url_scorer=keyword_scorer
|
||||
),
|
||||
stream=True # Recommended with BestFirstCrawling
|
||||
)
|
||||
|
||||
# Results will come in order of relevance score
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async for result in await crawler.arun("https://example.com", config=config):
|
||||
score = result.metadata.get("score", 0)
|
||||
print(f"Score: {score:.2f} | {result.url}")
|
||||
```
|
||||
|
||||
**How scorers work:**
|
||||
- Evaluate each discovered URL before crawling
|
||||
- Calculate relevance based on various signals
|
||||
- Help the crawler make intelligent choices about traversal order
|
||||
|
||||
---
|
||||
|
||||
## 6. Advanced Filtering Techniques
|
||||
|
||||
### 6.1 SEO Filter for Quality Assessment
|
||||
|
||||
The **SEOFilter** helps you identify pages with strong SEO characteristics:
|
||||
|
||||
```python
|
||||
from crawl4ai.deep_crawling.filters import FilterChain, SEOFilter
|
||||
|
||||
# Create an SEO filter that looks for specific keywords in page metadata
|
||||
seo_filter = SEOFilter(
|
||||
threshold=0.5, # Minimum score (0.0 to 1.0)
|
||||
keywords=["tutorial", "guide", "documentation"]
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=1,
|
||||
filter_chain=FilterChain([seo_filter])
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### 6.2 Content Relevance Filter
|
||||
|
||||
The **ContentRelevanceFilter** analyzes the actual content of pages:
|
||||
|
||||
```python
|
||||
from crawl4ai.deep_crawling.filters import FilterChain, ContentRelevanceFilter
|
||||
|
||||
# Create a content relevance filter
|
||||
relevance_filter = ContentRelevanceFilter(
|
||||
query="Web crawling and data extraction with Python",
|
||||
threshold=0.7 # Minimum similarity score (0.0 to 1.0)
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=1,
|
||||
filter_chain=FilterChain([relevance_filter])
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
This filter:
|
||||
- Measures semantic similarity between query and page content
|
||||
- It's a BM25-based relevance filter using head section content
|
||||
|
||||
---
|
||||
|
||||
## 7. Building a Complete Advanced Crawler
|
||||
|
||||
This example combines multiple techniques for a sophisticated crawl:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
|
||||
from crawl4ai.deep_crawling.filters import (
|
||||
FilterChain,
|
||||
DomainFilter,
|
||||
URLPatternFilter,
|
||||
ContentTypeFilter
|
||||
)
|
||||
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
|
||||
|
||||
async def run_advanced_crawler():
|
||||
# Create a sophisticated filter chain
|
||||
filter_chain = FilterChain([
|
||||
# Domain boundaries
|
||||
DomainFilter(
|
||||
allowed_domains=["docs.example.com"],
|
||||
blocked_domains=["old.docs.example.com"]
|
||||
),
|
||||
|
||||
# URL patterns to include
|
||||
URLPatternFilter(patterns=["*guide*", "*tutorial*", "*blog*"]),
|
||||
|
||||
# Content type filtering
|
||||
ContentTypeFilter(allowed_types=["text/html"])
|
||||
])
|
||||
|
||||
# Create a relevance scorer
|
||||
keyword_scorer = KeywordRelevanceScorer(
|
||||
keywords=["crawl", "example", "async", "configuration"],
|
||||
weight=0.7
|
||||
)
|
||||
|
||||
# Set up the configuration
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BestFirstCrawlingStrategy(
|
||||
max_depth=2,
|
||||
include_external=False,
|
||||
filter_chain=filter_chain,
|
||||
url_scorer=keyword_scorer
|
||||
),
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
stream=True,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
# Execute the crawl
|
||||
results = []
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async for result in await crawler.arun("https://docs.example.com", config=config):
|
||||
results.append(result)
|
||||
score = result.metadata.get("score", 0)
|
||||
depth = result.metadata.get("depth", 0)
|
||||
print(f"Depth: {depth} | Score: {score:.2f} | {result.url}")
|
||||
|
||||
# Analyze the results
|
||||
print(f"Crawled {len(results)} high-value pages")
|
||||
print(f"Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}")
|
||||
|
||||
# Group by depth
|
||||
depth_counts = {}
|
||||
for result in results:
|
||||
depth = result.metadata.get("depth", 0)
|
||||
depth_counts[depth] = depth_counts.get(depth, 0) + 1
|
||||
|
||||
print("Pages crawled by depth:")
|
||||
for depth, count in sorted(depth_counts.items()):
|
||||
print(f" Depth {depth}: {count} pages")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_advanced_crawler())
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
|
||||
## 8. Common Pitfalls & Tips
|
||||
|
||||
1. **Set realistic depth limits.** Be cautious with `max_depth` values > 3, which can exponentially increase crawl size.
|
||||
|
||||
2. **Don't neglect the scoring component.** BestFirstCrawling works best with well-tuned scorers. Experiment with keyword weights for optimal prioritization.
|
||||
|
||||
3. **Be a good web citizen.** Respect `robots.txt` (note: robots.txt checking is disabled by default).
|
||||
|
||||
|
||||
4. **Handle page errors gracefully.** Not all pages will be accessible. Check `result.success` and `result.error_message` when processing results.
|
||||
|
||||
---
|
||||
|
||||
## 9. Summary & Next Steps
|
||||
|
||||
In this **Deep Crawling with Crawl4AI** tutorial, you learned to:
|
||||
|
||||
- Configure **BFSDeepCrawlStrategy** and **BestFirstCrawlingStrategy**
|
||||
- Process results in streaming or non-streaming mode
|
||||
- Apply filters to target specific content
|
||||
- Use scorers to prioritize the most relevant pages
|
||||
- Build a complete advanced crawler with combined techniques
|
||||
|
||||
With these tools, you can efficiently extract structured data from websites at scale, focusing precisely on the content you need for your specific use case.
|
||||
@@ -10,11 +10,10 @@
|
||||
|
||||
In **`CrawlerRunConfig`**, you can specify a **`content_filter`** to shape how content is pruned or ranked before final markdown generation. A filter’s logic is applied **before** or **during** the HTML→Markdown process, producing:
|
||||
|
||||
- **`result.markdown_v2.raw_markdown`** (unfiltered)
|
||||
- **`result.markdown_v2.fit_markdown`** (filtered or “fit” version)
|
||||
- **`result.markdown_v2.fit_html`** (the corresponding HTML snippet that produced `fit_markdown`)
|
||||
- **`result.markdown.raw_markdown`** (unfiltered)
|
||||
- **`result.markdown.fit_markdown`** (filtered or “fit” version)
|
||||
- **`result.markdown.fit_html`** (the corresponding HTML snippet that produced `fit_markdown`)
|
||||
|
||||
> **Note**: `markdown_v2` is deprecated since v0.5 — the result is stored in `result.markdown`.
|
||||
|
||||
### 1.2 Common Filters
|
||||
|
||||
@@ -62,8 +61,8 @@ async def main():
|
||||
|
||||
if result.success:
|
||||
# 'fit_markdown' is your pruned content, focusing on "denser" text
|
||||
print("Raw Markdown length:", len(result.markdown_v2.raw_markdown))
|
||||
print("Fit Markdown length:", len(result.markdown_v2.fit_markdown))
|
||||
print("Raw Markdown length:", len(result.markdown.raw_markdown))
|
||||
print("Fit Markdown length:", len(result.markdown.fit_markdown))
|
||||
else:
|
||||
print("Error:", result.error_message)
|
||||
|
||||
@@ -123,7 +122,7 @@ async def main():
|
||||
)
|
||||
if result.success:
|
||||
print("Fit Markdown (BM25 query-based):")
|
||||
print(result.markdown_v2.fit_markdown)
|
||||
print(result.markdown.fit_markdown)
|
||||
else:
|
||||
print("Error:", result.error_message)
|
||||
|
||||
@@ -144,11 +143,11 @@ if __name__ == "__main__":
|
||||
|
||||
## 4. Accessing the “Fit” Output
|
||||
|
||||
After the crawl, your “fit” content is found in **`result.markdown_v2.fit_markdown`**. In future versions, it will be **`result.markdown.fit_markdown`**. Meanwhile:
|
||||
After the crawl, your “fit” content is found in **`result.markdown.fit_markdown`**.
|
||||
|
||||
```python
|
||||
fit_md = result.markdown_v2.fit_markdown
|
||||
fit_html = result.markdown_v2.fit_html
|
||||
fit_md = result.markdown.fit_markdown
|
||||
fit_html = result.markdown.fit_html
|
||||
```
|
||||
|
||||
If the content filter is **BM25**, you might see additional logic or references in `fit_markdown` that highlight relevant segments. If it’s **Pruning**, the text is typically well-cleaned but not necessarily matched to a query.
|
||||
@@ -167,7 +166,6 @@ prune_filter = PruningContentFilter(
|
||||
)
|
||||
md_generator = DefaultMarkdownGenerator(content_filter=prune_filter)
|
||||
config = CrawlerRunConfig(markdown_generator=md_generator)
|
||||
# => result.markdown_v2.fit_markdown
|
||||
```
|
||||
|
||||
### 5.2 BM25
|
||||
@@ -179,7 +177,6 @@ bm25_filter = BM25ContentFilter(
|
||||
)
|
||||
md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
|
||||
config = CrawlerRunConfig(markdown_generator=md_generator)
|
||||
# => result.markdown_v2.fit_markdown
|
||||
```
|
||||
|
||||
---
|
||||
@@ -203,7 +200,7 @@ Thus, **multi-level** filtering occurs:
|
||||
|
||||
1. The crawler’s `excluded_tags` are removed from the HTML first.
|
||||
2. The content filter (Pruning, BM25, or custom) prunes or ranks the remaining text blocks.
|
||||
3. The final “fit” content is generated in `result.markdown_v2.fit_markdown`.
|
||||
3. The final “fit” content is generated in `result.markdown.fit_markdown`.
|
||||
|
||||
---
|
||||
|
||||
@@ -241,7 +238,7 @@ class MyCustomFilter(RelevantContentFilter):
|
||||
- **PruningContentFilter**: Great if you just want the “meatiest” text without a user query.
|
||||
- **BM25ContentFilter**: Perfect for query-based extraction or searching.
|
||||
- Combine with **`excluded_tags`, `exclude_external_links`, `word_count_threshold`** to refine your final “fit” text.
|
||||
- Fit markdown ends up in **`result.markdown_v2.fit_markdown`**; eventually **`result.markdown.fit_markdown`** in future versions.
|
||||
- Fit markdown ends up in **`result.markdown.fit_markdown`**.
|
||||
|
||||
With these tools, you can **zero in** on the text that truly matters, ignoring spammy or boilerplate content, and produce a concise, relevant “fit markdown” for your AI or data pipelines. Happy pruning and searching!
|
||||
|
||||
|
||||
@@ -204,7 +204,7 @@ async def main():
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://example.com", config=config)
|
||||
print(result.fit_markdown) # Filtered markdown content
|
||||
print(result.markdown.fit_markdown) # Filtered markdown content
|
||||
```
|
||||
|
||||
**Key Features:**
|
||||
@@ -249,14 +249,11 @@ filter = LLMContentFilter(
|
||||
|
||||
## 5. Using Fit Markdown
|
||||
|
||||
When a content filter is active, the library produces two forms of markdown inside `result.markdown_v2` or (if using the simplified field) `result.markdown`:
|
||||
When a content filter is active, the library produces two forms of markdown inside `result.markdown`:
|
||||
|
||||
1. **`raw_markdown`**: The full unfiltered markdown.
|
||||
2. **`fit_markdown`**: A “fit” version where the filter has removed or trimmed noisy segments.
|
||||
|
||||
**Note**:
|
||||
> In earlier examples, you may see references to `result.markdown_v2`. Depending on your library version, you might access `result.markdown`, `result.markdown_v2`, or an object named `MarkdownGenerationResult`. The idea is the same: you’ll have a raw version and a filtered (“fit”) version if a filter is used.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
@@ -276,7 +273,7 @@ async def main():
|
||||
print("Raw markdown:\n", result.markdown)
|
||||
|
||||
# If a filter is used, we also have .fit_markdown:
|
||||
md_object = result.markdown_v2 # or your equivalent
|
||||
md_object = result.markdown # or your equivalent
|
||||
print("Filtered markdown:\n", md_object.fit_markdown)
|
||||
else:
|
||||
print("Crawl failed:", result.error_message)
|
||||
@@ -300,7 +297,7 @@ If your library stores detailed markdown output in an object like `MarkdownGener
|
||||
**Example**:
|
||||
|
||||
```python
|
||||
md_obj = result.markdown_v2 # your library’s naming may vary
|
||||
md_obj = result.markdown # your library’s naming may vary
|
||||
print("RAW:\n", md_obj.raw_markdown)
|
||||
print("CITED:\n", md_obj.markdown_with_citations)
|
||||
print("REFERENCES:\n", md_obj.references_markdown)
|
||||
|
||||
@@ -296,7 +296,7 @@ async def quick_parallel_example():
|
||||
# Stream results as they complete
|
||||
async for result in await crawler.arun_many(urls, config=run_conf):
|
||||
if result.success:
|
||||
print(f"[OK] {result.url}, length: {len(result.markdown_v2.raw_markdown)}")
|
||||
print(f"[OK] {result.url}, length: {len(result.markdown.raw_markdown)}")
|
||||
else:
|
||||
print(f"[ERROR] {result.url} => {result.error_message}")
|
||||
|
||||
@@ -305,7 +305,7 @@ async def quick_parallel_example():
|
||||
results = await crawler.arun_many(urls, config=run_conf)
|
||||
for res in results:
|
||||
if res.success:
|
||||
print(f"[OK] {res.url}, length: {len(res.markdown_v2.raw_markdown)}")
|
||||
print(f"[OK] {res.url}, length: {len(res.markdown.raw_markdown)}")
|
||||
else:
|
||||
print(f"[ERROR] {res.url} => {res.error_message}")
|
||||
|
||||
|
||||
@@ -39,8 +39,8 @@ result = await crawler.arun(
|
||||
# Different content formats
|
||||
print(result.html) # Raw HTML
|
||||
print(result.cleaned_html) # Cleaned HTML
|
||||
print(result.markdown) # Markdown version
|
||||
print(result.fit_markdown) # Most relevant content in markdown
|
||||
print(result.markdown.raw_markdown) # Raw markdown from cleaned html
|
||||
print(result.markdown.fit_markdown) # Most relevant content in markdown
|
||||
|
||||
# Check success status
|
||||
print(result.success) # True if crawl succeeded
|
||||
|
||||
@@ -9,7 +9,7 @@ nav:
|
||||
- Home: 'index.md'
|
||||
- Setup & Installation:
|
||||
- "Installation": "core/installation.md"
|
||||
- "Docker Deployment": "core/docker-deploymeny.md"
|
||||
- "Docker Deployment": "core/docker-deployment.md"
|
||||
- "Quick Start": "core/quickstart.md"
|
||||
- "Blog & Changelog":
|
||||
- "Blog Home": "blog/index.md"
|
||||
@@ -17,6 +17,7 @@ nav:
|
||||
- Core:
|
||||
- "Command Line Interface": "core/cli.md"
|
||||
- "Simple Crawling": "core/simple-crawling.md"
|
||||
- "Deep Crawling": "core/deep-crawling.md"
|
||||
- "Crawler Result": "core/crawler-result.md"
|
||||
- "Browser, Crawler & LLM Config": "core/browser-crawler-config.md"
|
||||
- "Markdown Generation": "core/markdown-generation.md"
|
||||
|
||||
@@ -62,6 +62,7 @@ transformer = ["transformers", "tokenizers"]
|
||||
cosine = ["torch", "transformers", "nltk"]
|
||||
sync = ["selenium"]
|
||||
all = [
|
||||
"PyPDF2",
|
||||
"torch",
|
||||
"nltk",
|
||||
"scikit-learn",
|
||||
|
||||
@@ -21,3 +21,4 @@ psutil>=6.1.1
|
||||
nltk>=3.9.1
|
||||
rich>=13.9.4
|
||||
cssselect>=1.2.0
|
||||
faust-cchardet>=2.1.19
|
||||
@@ -47,8 +47,8 @@ async def main():
|
||||
result = await crawler.arun(url=url, config=crawler_config)
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Raw HTML length: {len(result.html)}")
|
||||
if hasattr(result, 'markdown_v2'):
|
||||
print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
|
||||
if hasattr(result, 'markdown'):
|
||||
print(f"Markdown length: {len(result.markdown.raw_markdown)}")
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
|
||||
|
||||
@@ -70,7 +70,7 @@ async def main():
|
||||
config=crawl_config
|
||||
)
|
||||
if result.success:
|
||||
print("First crawl - Raw Markdown Length:", len(result.markdown_v2.raw_markdown))
|
||||
print("First crawl - Raw Markdown Length:", len(result.markdown.raw_markdown))
|
||||
|
||||
finally:
|
||||
# Always ensure we close the crawler
|
||||
|
||||
@@ -19,7 +19,7 @@ async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
print(f"Success: {result.success}")
|
||||
print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
|
||||
print(f"Markdown length: {len(result.markdown.raw_markdown)}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user