Compare commits
14 Commits
unclecode-
...
vr0.5.0.po
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bdd9db579a | ||
|
|
1107fa1d62 | ||
|
|
f78c46446b | ||
|
|
1b72880007 | ||
|
|
29f7915b79 | ||
|
|
2327db6fdc | ||
|
|
3a234ec950 | ||
|
|
9e89d27fcd | ||
|
|
b3ec7ce960 | ||
|
|
baee4949d3 | ||
|
|
9c58e4ce2e | ||
|
|
df6a6d5f4f | ||
|
|
e896c08f9c | ||
|
|
56bc3c6e45 |
@@ -420,7 +420,7 @@ if __name__ == "__main__":
|
|||||||
```python
|
```python
|
||||||
import os
|
import os
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
@@ -436,7 +436,7 @@ async def main():
|
|||||||
extraction_strategy=LLMExtractionStrategy(
|
extraction_strategy=LLMExtractionStrategy(
|
||||||
# Here you can use any provider that Litellm library supports, for instance: ollama/qwen2
|
# Here you can use any provider that Litellm library supports, for instance: ollama/qwen2
|
||||||
# provider="ollama/qwen2", api_token="no-token",
|
# provider="ollama/qwen2", api_token="no-token",
|
||||||
llmConfig = LlmConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
|
llm_config = LLMConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
|
||||||
schema=OpenAIModelFee.schema(),
|
schema=OpenAIModelFee.schema(),
|
||||||
extraction_type="schema",
|
extraction_type="schema",
|
||||||
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
||||||
|
|||||||
@@ -2,7 +2,8 @@
|
|||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig
|
||||||
|
|
||||||
from .content_scraping_strategy import (
|
from .content_scraping_strategy import (
|
||||||
ContentScrapingStrategy,
|
ContentScrapingStrategy,
|
||||||
WebScrapingStrategy,
|
WebScrapingStrategy,
|
||||||
@@ -47,8 +48,9 @@ from .deep_crawling import (
|
|||||||
DeepCrawlStrategy,
|
DeepCrawlStrategy,
|
||||||
BFSDeepCrawlStrategy,
|
BFSDeepCrawlStrategy,
|
||||||
FilterChain,
|
FilterChain,
|
||||||
ContentTypeFilter,
|
URLPatternFilter,
|
||||||
DomainFilter,
|
DomainFilter,
|
||||||
|
ContentTypeFilter,
|
||||||
URLFilter,
|
URLFilter,
|
||||||
FilterStats,
|
FilterStats,
|
||||||
SEOFilter,
|
SEOFilter,
|
||||||
@@ -68,11 +70,13 @@ __all__ = [
|
|||||||
"AsyncLogger",
|
"AsyncLogger",
|
||||||
"AsyncWebCrawler",
|
"AsyncWebCrawler",
|
||||||
"BrowserProfiler",
|
"BrowserProfiler",
|
||||||
|
"LLMConfig",
|
||||||
"DeepCrawlStrategy",
|
"DeepCrawlStrategy",
|
||||||
"BFSDeepCrawlStrategy",
|
"BFSDeepCrawlStrategy",
|
||||||
"BestFirstCrawlingStrategy",
|
"BestFirstCrawlingStrategy",
|
||||||
"DFSDeepCrawlStrategy",
|
"DFSDeepCrawlStrategy",
|
||||||
"FilterChain",
|
"FilterChain",
|
||||||
|
"URLPatternFilter",
|
||||||
"ContentTypeFilter",
|
"ContentTypeFilter",
|
||||||
"DomainFilter",
|
"DomainFilter",
|
||||||
"FilterStats",
|
"FilterStats",
|
||||||
|
|||||||
@@ -1,2 +1,2 @@
|
|||||||
# crawl4ai/_version.py
|
# crawl4ai/_version.py
|
||||||
__version__ = "0.5.0"
|
__version__ = "0.5.0.post6"
|
||||||
|
|||||||
@@ -13,13 +13,15 @@ from .config import (
|
|||||||
from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator
|
from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator
|
||||||
from .extraction_strategy import ExtractionStrategy
|
from .extraction_strategy import ExtractionStrategy
|
||||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||||
|
|
||||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
|
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
|
||||||
from .deep_crawling import DeepCrawlStrategy
|
from .deep_crawling import DeepCrawlStrategy
|
||||||
from typing import Union, List
|
|
||||||
from .cache_context import CacheMode
|
from .cache_context import CacheMode
|
||||||
from .proxy_strategy import ProxyRotationStrategy
|
from .proxy_strategy import ProxyRotationStrategy
|
||||||
|
|
||||||
|
from typing import Union, List
|
||||||
import inspect
|
import inspect
|
||||||
from typing import Any, Dict, Optional
|
from typing import Any, Dict, Optional
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
@@ -1042,7 +1044,7 @@ class CrawlerRunConfig():
|
|||||||
return CrawlerRunConfig.from_kwargs(config_dict)
|
return CrawlerRunConfig.from_kwargs(config_dict)
|
||||||
|
|
||||||
|
|
||||||
class LlmConfig:
|
class LLMConfig:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
provider: str = DEFAULT_PROVIDER,
|
provider: str = DEFAULT_PROVIDER,
|
||||||
@@ -1063,8 +1065,8 @@ class LlmConfig:
|
|||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_kwargs(kwargs: dict) -> "LlmConfig":
|
def from_kwargs(kwargs: dict) -> "LLMConfig":
|
||||||
return LlmConfig(
|
return LLMConfig(
|
||||||
provider=kwargs.get("provider", DEFAULT_PROVIDER),
|
provider=kwargs.get("provider", DEFAULT_PROVIDER),
|
||||||
api_token=kwargs.get("api_token"),
|
api_token=kwargs.get("api_token"),
|
||||||
base_url=kwargs.get("base_url"),
|
base_url=kwargs.get("base_url"),
|
||||||
@@ -1084,8 +1086,8 @@ class LlmConfig:
|
|||||||
**kwargs: Key-value pairs of configuration options to update
|
**kwargs: Key-value pairs of configuration options to update
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
LLMConfig: A new instance with the specified updates
|
LLMConfig: A new instance with the specified updates
|
||||||
"""
|
"""
|
||||||
config_dict = self.to_dict()
|
config_dict = self.to_dict()
|
||||||
config_dict.update(kwargs)
|
config_dict.update(kwargs)
|
||||||
return LlmConfig.from_kwargs(config_dict)
|
return LLMConfig.from_kwargs(config_dict)
|
||||||
|
|||||||
@@ -4,10 +4,10 @@ import aiosqlite
|
|||||||
import asyncio
|
import asyncio
|
||||||
from typing import Optional, Dict
|
from typing import Optional, Dict
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
import logging
|
|
||||||
import json # Added for serialization/deserialization
|
import json # Added for serialization/deserialization
|
||||||
from .utils import ensure_content_dirs, generate_content_hash
|
from .utils import ensure_content_dirs, generate_content_hash
|
||||||
from .models import CrawlResult, MarkdownGenerationResult, StringCompatibleMarkdown
|
from .models import CrawlResult, MarkdownGenerationResult, StringCompatibleMarkdown
|
||||||
|
# , StringCompatibleMarkdown
|
||||||
import aiofiles
|
import aiofiles
|
||||||
from .utils import VersionManager
|
from .utils import VersionManager
|
||||||
from .async_logger import AsyncLogger
|
from .async_logger import AsyncLogger
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ from rich.live import Live
|
|||||||
from rich.table import Table
|
from rich.table import Table
|
||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
from rich import box
|
from rich import box
|
||||||
from datetime import timedelta
|
from datetime import timedelta, datetime
|
||||||
from collections.abc import AsyncGenerator
|
from collections.abc import AsyncGenerator
|
||||||
import time
|
import time
|
||||||
import psutil
|
import psutil
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import sys
|
|||||||
import time
|
import time
|
||||||
from colorama import Fore
|
from colorama import Fore
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, List
|
from typing import Optional, List, Generic, TypeVar
|
||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
@@ -23,7 +23,7 @@ from .async_crawler_strategy import (
|
|||||||
AsyncPlaywrightCrawlerStrategy,
|
AsyncPlaywrightCrawlerStrategy,
|
||||||
AsyncCrawlResponse,
|
AsyncCrawlResponse,
|
||||||
)
|
)
|
||||||
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
|
from .cache_context import CacheMode, CacheContext
|
||||||
from .markdown_generation_strategy import (
|
from .markdown_generation_strategy import (
|
||||||
DefaultMarkdownGenerator,
|
DefaultMarkdownGenerator,
|
||||||
MarkdownGenerationStrategy,
|
MarkdownGenerationStrategy,
|
||||||
@@ -44,17 +44,46 @@ from .utils import (
|
|||||||
RobotsParser,
|
RobotsParser,
|
||||||
)
|
)
|
||||||
|
|
||||||
from typing import Union, AsyncGenerator, TypeVar
|
from typing import Union, AsyncGenerator
|
||||||
|
|
||||||
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
|
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
|
||||||
RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
# RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||||
|
|
||||||
DeepCrawlSingleReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
class CrawlResultContainer(Generic[CrawlResultT]):
|
||||||
DeepCrawlManyReturn = Union[
|
def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
|
||||||
List[List[CrawlResultT]],
|
# Normalize to a list
|
||||||
AsyncGenerator[CrawlResultT, None],
|
if isinstance(results, list):
|
||||||
|
self._results = results
|
||||||
|
else:
|
||||||
|
self._results = [results]
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
return iter(self._results)
|
||||||
|
|
||||||
|
def __getitem__(self, index):
|
||||||
|
return self._results[index]
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self._results)
|
||||||
|
|
||||||
|
def __getattr__(self, attr):
|
||||||
|
# Delegate attribute access to the first element.
|
||||||
|
if self._results:
|
||||||
|
return getattr(self._results[0], attr)
|
||||||
|
raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'")
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return f"{self.__class__.__name__}({self._results!r})"
|
||||||
|
|
||||||
|
# Redefine the union type. Now synchronous calls always return a container,
|
||||||
|
# while stream mode is handled with an AsyncGenerator.
|
||||||
|
RunManyReturn = Union[
|
||||||
|
CrawlResultContainer[CrawlResultT],
|
||||||
|
AsyncGenerator[CrawlResultT, None]
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class AsyncWebCrawler:
|
class AsyncWebCrawler:
|
||||||
"""
|
"""
|
||||||
Asynchronous web crawler with flexible caching capabilities.
|
Asynchronous web crawler with flexible caching capabilities.
|
||||||
@@ -223,23 +252,6 @@ class AsyncWebCrawler:
|
|||||||
self,
|
self,
|
||||||
url: str,
|
url: str,
|
||||||
config: CrawlerRunConfig = None,
|
config: CrawlerRunConfig = None,
|
||||||
# Legacy parameters maintained for backwards compatibility
|
|
||||||
# word_count_threshold=MIN_WORD_THRESHOLD,
|
|
||||||
# extraction_strategy: ExtractionStrategy = None,
|
|
||||||
# chunking_strategy: ChunkingStrategy = RegexChunking(),
|
|
||||||
# content_filter: RelevantContentFilter = None,
|
|
||||||
# cache_mode: Optional[CacheMode] = None,
|
|
||||||
# Deprecated cache parameters
|
|
||||||
# bypass_cache: bool = False,
|
|
||||||
# disable_cache: bool = False,
|
|
||||||
# no_cache_read: bool = False,
|
|
||||||
# no_cache_write: bool = False,
|
|
||||||
# Other legacy parameters
|
|
||||||
# css_selector: str = None,
|
|
||||||
# screenshot: bool = False,
|
|
||||||
# pdf: bool = False,
|
|
||||||
# user_agent: str = None,
|
|
||||||
# verbose=True,
|
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> RunManyReturn:
|
) -> RunManyReturn:
|
||||||
"""
|
"""
|
||||||
@@ -270,47 +282,13 @@ class AsyncWebCrawler:
|
|||||||
Returns:
|
Returns:
|
||||||
CrawlResult: The result of crawling and processing
|
CrawlResult: The result of crawling and processing
|
||||||
"""
|
"""
|
||||||
crawler_config = config or CrawlerRunConfig()
|
config = config or CrawlerRunConfig()
|
||||||
if not isinstance(url, str) or not url:
|
if not isinstance(url, str) or not url:
|
||||||
raise ValueError("Invalid URL, make sure the URL is a non-empty string")
|
raise ValueError("Invalid URL, make sure the URL is a non-empty string")
|
||||||
|
|
||||||
async with self._lock or self.nullcontext():
|
async with self._lock or self.nullcontext():
|
||||||
try:
|
try:
|
||||||
self.logger.verbose = crawler_config.verbose
|
self.logger.verbose = config.verbose
|
||||||
# Handle configuration
|
|
||||||
if crawler_config is not None:
|
|
||||||
config = crawler_config
|
|
||||||
else:
|
|
||||||
# Merge all parameters into a single kwargs dict for config creation
|
|
||||||
# config_kwargs = {
|
|
||||||
# "word_count_threshold": word_count_threshold,
|
|
||||||
# "extraction_strategy": extraction_strategy,
|
|
||||||
# "chunking_strategy": chunking_strategy,
|
|
||||||
# "content_filter": content_filter,
|
|
||||||
# "cache_mode": cache_mode,
|
|
||||||
# "bypass_cache": bypass_cache,
|
|
||||||
# "disable_cache": disable_cache,
|
|
||||||
# "no_cache_read": no_cache_read,
|
|
||||||
# "no_cache_write": no_cache_write,
|
|
||||||
# "css_selector": css_selector,
|
|
||||||
# "screenshot": screenshot,
|
|
||||||
# "pdf": pdf,
|
|
||||||
# "verbose": verbose,
|
|
||||||
# **kwargs,
|
|
||||||
# }
|
|
||||||
# config = CrawlerRunConfig.from_kwargs(config_kwargs)
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Handle deprecated cache parameters
|
|
||||||
# if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
|
|
||||||
# # Convert legacy parameters if cache_mode not provided
|
|
||||||
# if config.cache_mode is None:
|
|
||||||
# config.cache_mode = _legacy_to_cache_mode(
|
|
||||||
# disable_cache=disable_cache,
|
|
||||||
# bypass_cache=bypass_cache,
|
|
||||||
# no_cache_read=no_cache_read,
|
|
||||||
# no_cache_write=no_cache_write,
|
|
||||||
# )
|
|
||||||
|
|
||||||
# Default to ENABLED if no cache mode specified
|
# Default to ENABLED if no cache mode specified
|
||||||
if config.cache_mode is None:
|
if config.cache_mode is None:
|
||||||
@@ -457,7 +435,7 @@ class AsyncWebCrawler:
|
|||||||
if cache_context.should_write() and not bool(cached_result):
|
if cache_context.should_write() and not bool(cached_result):
|
||||||
await async_db_manager.acache_url(crawl_result)
|
await async_db_manager.acache_url(crawl_result)
|
||||||
|
|
||||||
return crawl_result
|
return CrawlResultContainer(crawl_result)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.logger.success(
|
self.logger.success(
|
||||||
@@ -474,7 +452,7 @@ class AsyncWebCrawler:
|
|||||||
cached_result.success = bool(html)
|
cached_result.success = bool(html)
|
||||||
cached_result.session_id = getattr(config, "session_id", None)
|
cached_result.session_id = getattr(config, "session_id", None)
|
||||||
cached_result.redirected_url = cached_result.redirected_url or url
|
cached_result.redirected_url = cached_result.redirected_url or url
|
||||||
return cached_result
|
return CrawlResultContainer(cached_result)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_context = get_error_context(sys.exc_info())
|
error_context = get_error_context(sys.exc_info())
|
||||||
@@ -492,8 +470,10 @@ class AsyncWebCrawler:
|
|||||||
tag="ERROR",
|
tag="ERROR",
|
||||||
)
|
)
|
||||||
|
|
||||||
return CrawlResult(
|
return CrawlResultContainer(
|
||||||
url=url, html="", success=False, error_message=error_message
|
CrawlResult(
|
||||||
|
url=url, html="", success=False, error_message=error_message
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
async def aprocess_html(
|
async def aprocess_html(
|
||||||
@@ -669,17 +649,17 @@ class AsyncWebCrawler:
|
|||||||
config: Optional[CrawlerRunConfig] = None,
|
config: Optional[CrawlerRunConfig] = None,
|
||||||
dispatcher: Optional[BaseDispatcher] = None,
|
dispatcher: Optional[BaseDispatcher] = None,
|
||||||
# Legacy parameters maintained for backwards compatibility
|
# Legacy parameters maintained for backwards compatibility
|
||||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
# word_count_threshold=MIN_WORD_THRESHOLD,
|
||||||
extraction_strategy: ExtractionStrategy = None,
|
# extraction_strategy: ExtractionStrategy = None,
|
||||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
# chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||||
content_filter: RelevantContentFilter = None,
|
# content_filter: RelevantContentFilter = None,
|
||||||
cache_mode: Optional[CacheMode] = None,
|
# cache_mode: Optional[CacheMode] = None,
|
||||||
bypass_cache: bool = False,
|
# bypass_cache: bool = False,
|
||||||
css_selector: str = None,
|
# css_selector: str = None,
|
||||||
screenshot: bool = False,
|
# screenshot: bool = False,
|
||||||
pdf: bool = False,
|
# pdf: bool = False,
|
||||||
user_agent: str = None,
|
# user_agent: str = None,
|
||||||
verbose=True,
|
# verbose=True,
|
||||||
**kwargs
|
**kwargs
|
||||||
) -> RunManyReturn:
|
) -> RunManyReturn:
|
||||||
"""
|
"""
|
||||||
@@ -712,20 +692,21 @@ class AsyncWebCrawler:
|
|||||||
):
|
):
|
||||||
print(f"Processed {result.url}: {len(result.markdown)} chars")
|
print(f"Processed {result.url}: {len(result.markdown)} chars")
|
||||||
"""
|
"""
|
||||||
if config is None:
|
config = config or CrawlerRunConfig()
|
||||||
config = CrawlerRunConfig(
|
# if config is None:
|
||||||
word_count_threshold=word_count_threshold,
|
# config = CrawlerRunConfig(
|
||||||
extraction_strategy=extraction_strategy,
|
# word_count_threshold=word_count_threshold,
|
||||||
chunking_strategy=chunking_strategy,
|
# extraction_strategy=extraction_strategy,
|
||||||
content_filter=content_filter,
|
# chunking_strategy=chunking_strategy,
|
||||||
cache_mode=cache_mode,
|
# content_filter=content_filter,
|
||||||
bypass_cache=bypass_cache,
|
# cache_mode=cache_mode,
|
||||||
css_selector=css_selector,
|
# bypass_cache=bypass_cache,
|
||||||
screenshot=screenshot,
|
# css_selector=css_selector,
|
||||||
pdf=pdf,
|
# screenshot=screenshot,
|
||||||
verbose=verbose,
|
# pdf=pdf,
|
||||||
**kwargs,
|
# verbose=verbose,
|
||||||
)
|
# **kwargs,
|
||||||
|
# )
|
||||||
|
|
||||||
if dispatcher is None:
|
if dispatcher is None:
|
||||||
dispatcher = MemoryAdaptiveDispatcher(
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
|||||||
@@ -1,9 +1,7 @@
|
|||||||
import click
|
import click
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import datetime
|
|
||||||
import sys
|
|
||||||
import shutil
|
|
||||||
import humanize
|
import humanize
|
||||||
from typing import Dict, Any, Optional, List
|
from typing import Dict, Any, Optional, List
|
||||||
import json
|
import json
|
||||||
@@ -13,7 +11,6 @@ from rich.console import Console
|
|||||||
from rich.table import Table
|
from rich.table import Table
|
||||||
from rich.panel import Panel
|
from rich.panel import Panel
|
||||||
from rich.prompt import Prompt, Confirm
|
from rich.prompt import Prompt, Confirm
|
||||||
from rich.style import Style
|
|
||||||
|
|
||||||
from crawl4ai import (
|
from crawl4ai import (
|
||||||
CacheMode,
|
CacheMode,
|
||||||
@@ -26,12 +23,13 @@ from crawl4ai import (
|
|||||||
JsonXPathExtractionStrategy,
|
JsonXPathExtractionStrategy,
|
||||||
BM25ContentFilter,
|
BM25ContentFilter,
|
||||||
PruningContentFilter,
|
PruningContentFilter,
|
||||||
BrowserProfiler
|
BrowserProfiler,
|
||||||
|
DefaultMarkdownGenerator,
|
||||||
|
LLMConfig
|
||||||
)
|
)
|
||||||
from litellm import completion
|
from litellm import completion
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from crawl4ai.async_configs import LlmConfig
|
|
||||||
|
|
||||||
# Initialize rich console
|
# Initialize rich console
|
||||||
console = Console()
|
console = Console()
|
||||||
@@ -617,17 +615,28 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
|
|||||||
crawler_cfg = crawler_cfg.clone(**crawler)
|
crawler_cfg = crawler_cfg.clone(**crawler)
|
||||||
|
|
||||||
# Handle content filter config
|
# Handle content filter config
|
||||||
if filter_config:
|
if filter_config or output in ["markdown-fit", "md-fit"]:
|
||||||
filter_conf = load_config_file(filter_config)
|
if filter_config:
|
||||||
|
filter_conf = load_config_file(filter_config)
|
||||||
|
elif not filter_config and output in ["markdown-fit", "md-fit"]:
|
||||||
|
filter_conf = {
|
||||||
|
"type": "pruning",
|
||||||
|
"query": "",
|
||||||
|
"threshold": 0.48
|
||||||
|
}
|
||||||
if filter_conf["type"] == "bm25":
|
if filter_conf["type"] == "bm25":
|
||||||
crawler_cfg.content_filter = BM25ContentFilter(
|
crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
|
||||||
user_query=filter_conf.get("query"),
|
content_filter = BM25ContentFilter(
|
||||||
bm25_threshold=filter_conf.get("threshold", 1.0)
|
user_query=filter_conf.get("query"),
|
||||||
|
bm25_threshold=filter_conf.get("threshold", 1.0)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
elif filter_conf["type"] == "pruning":
|
elif filter_conf["type"] == "pruning":
|
||||||
crawler_cfg.content_filter = PruningContentFilter(
|
crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
|
||||||
user_query=filter_conf.get("query"),
|
content_filter = PruningContentFilter(
|
||||||
threshold=filter_conf.get("threshold", 0.48)
|
user_query=filter_conf.get("query"),
|
||||||
|
threshold=filter_conf.get("threshold", 0.48)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Handle extraction strategy
|
# Handle extraction strategy
|
||||||
@@ -647,7 +656,7 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
|
|||||||
raise click.ClickException("LLM provider and API token are required for LLM extraction")
|
raise click.ClickException("LLM provider and API token are required for LLM extraction")
|
||||||
|
|
||||||
crawler_cfg.extraction_strategy = LLMExtractionStrategy(
|
crawler_cfg.extraction_strategy = LLMExtractionStrategy(
|
||||||
llmConfig=LlmConfig(provider=extract_conf["provider"], api_token=extract_conf["api_token"]),
|
llm_config=LLMConfig(provider=extract_conf["provider"], api_token=extract_conf["api_token"]),
|
||||||
instruction=extract_conf["instruction"],
|
instruction=extract_conf["instruction"],
|
||||||
schema=schema_data,
|
schema=schema_data,
|
||||||
**extract_conf.get("params", {})
|
**extract_conf.get("params", {})
|
||||||
@@ -712,7 +721,7 @@ def profiles_cmd():
|
|||||||
# Run interactive profile manager
|
# Run interactive profile manager
|
||||||
anyio.run(manage_profiles)
|
anyio.run(manage_profiles)
|
||||||
|
|
||||||
@cli.command()
|
@cli.command(name="")
|
||||||
@click.argument("url", required=False)
|
@click.argument("url", required=False)
|
||||||
@click.option("--example", is_flag=True, help="Show usage examples")
|
@click.option("--example", is_flag=True, help="Show usage examples")
|
||||||
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
|
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
|
||||||
@@ -772,5 +781,11 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
|
|||||||
profile=profile
|
profile=profile
|
||||||
)
|
)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
def main():
|
||||||
|
import sys
|
||||||
|
if len(sys.argv) < 2 or sys.argv[1] not in cli.commands:
|
||||||
|
sys.argv.insert(1, "crawl")
|
||||||
cli()
|
cli()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -16,13 +16,13 @@ from .utils import (
|
|||||||
extract_xml_data,
|
extract_xml_data,
|
||||||
merge_chunks,
|
merge_chunks,
|
||||||
)
|
)
|
||||||
|
from .types import LLMConfig
|
||||||
|
from .config import DEFAULT_PROVIDER, OVERLAP_RATE, WORD_TOKEN_RATE
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
import math
|
import math
|
||||||
from snowballstemmer import stemmer
|
from snowballstemmer import stemmer
|
||||||
from .config import DEFAULT_PROVIDER, OVERLAP_RATE, WORD_TOKEN_RATE, PROVIDER_MODELS
|
|
||||||
from .models import TokenUsage
|
from .models import TokenUsage
|
||||||
from .prompts import PROMPT_FILTER_CONTENT
|
from .prompts import PROMPT_FILTER_CONTENT
|
||||||
import os
|
|
||||||
import json
|
import json
|
||||||
import hashlib
|
import hashlib
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -770,37 +770,56 @@ class PruningContentFilter(RelevantContentFilter):
|
|||||||
|
|
||||||
|
|
||||||
class LLMContentFilter(RelevantContentFilter):
|
class LLMContentFilter(RelevantContentFilter):
|
||||||
"""Content filtering using LLMs to generate relevant markdown."""
|
"""Content filtering using LLMs to generate relevant markdown.
|
||||||
|
|
||||||
|
How it works:
|
||||||
|
1. Extracts page metadata with fallbacks.
|
||||||
|
2. Extracts text chunks from the body element.
|
||||||
|
3. Applies LLMs to generate markdown for each chunk.
|
||||||
|
4. Filters out chunks below the threshold.
|
||||||
|
5. Sorts chunks by score in descending order.
|
||||||
|
6. Returns the top N chunks.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
llm_config (LLMConfig): LLM configuration object.
|
||||||
|
instruction (str): Instruction for LLM markdown generation
|
||||||
|
chunk_token_threshold (int): Chunk token threshold for splitting (default: 1e9).
|
||||||
|
overlap_rate (float): Overlap rate for chunking (default: 0.5).
|
||||||
|
word_token_rate (float): Word token rate for chunking (default: 0.2).
|
||||||
|
verbose (bool): Enable verbose logging (default: False).
|
||||||
|
logger (AsyncLogger): Custom logger for LLM operations (optional).
|
||||||
|
"""
|
||||||
_UNWANTED_PROPS = {
|
_UNWANTED_PROPS = {
|
||||||
'provider' : 'Instead, use llmConfig=LlmConfig(provider="...")',
|
'provider' : 'Instead, use llm_config=LLMConfig(provider="...")',
|
||||||
'api_token' : 'Instead, use llmConfig=LlMConfig(api_token="...")',
|
'api_token' : 'Instead, use llm_config=LLMConfig(api_token="...")',
|
||||||
'base_url' : 'Instead, use llmConfig=LlmConfig(base_url="...")',
|
'base_url' : 'Instead, use llm_config=LLMConfig(base_url="...")',
|
||||||
'api_base' : 'Instead, use llmConfig=LlmConfig(base_url="...")',
|
'api_base' : 'Instead, use llm_config=LLMConfig(base_url="...")',
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
provider: str = DEFAULT_PROVIDER,
|
llm_config: "LLMConfig" = None,
|
||||||
api_token: Optional[str] = None,
|
|
||||||
llmConfig: "LlmConfig" = None,
|
|
||||||
instruction: str = None,
|
instruction: str = None,
|
||||||
chunk_token_threshold: int = int(1e9),
|
chunk_token_threshold: int = int(1e9),
|
||||||
overlap_rate: float = OVERLAP_RATE,
|
overlap_rate: float = OVERLAP_RATE,
|
||||||
word_token_rate: float = WORD_TOKEN_RATE,
|
word_token_rate: float = WORD_TOKEN_RATE,
|
||||||
base_url: Optional[str] = None,
|
|
||||||
api_base: Optional[str] = None,
|
|
||||||
extra_args: Dict = None,
|
|
||||||
# char_token_rate: float = WORD_TOKEN_RATE * 5,
|
# char_token_rate: float = WORD_TOKEN_RATE * 5,
|
||||||
# chunk_mode: str = "char",
|
# chunk_mode: str = "char",
|
||||||
verbose: bool = False,
|
verbose: bool = False,
|
||||||
logger: Optional[AsyncLogger] = None,
|
logger: Optional[AsyncLogger] = None,
|
||||||
ignore_cache: bool = True,
|
ignore_cache: bool = True,
|
||||||
|
# Deprecated properties
|
||||||
|
provider: str = DEFAULT_PROVIDER,
|
||||||
|
api_token: Optional[str] = None,
|
||||||
|
base_url: Optional[str] = None,
|
||||||
|
api_base: Optional[str] = None,
|
||||||
|
extra_args: Dict = None,
|
||||||
):
|
):
|
||||||
super().__init__(None)
|
super().__init__(None)
|
||||||
self.provider = provider
|
self.provider = provider
|
||||||
self.api_token = api_token
|
self.api_token = api_token
|
||||||
self.base_url = base_url or api_base
|
self.base_url = base_url or api_base
|
||||||
self.llmConfig = llmConfig
|
self.llm_config = llm_config
|
||||||
self.instruction = instruction
|
self.instruction = instruction
|
||||||
self.chunk_token_threshold = chunk_token_threshold
|
self.chunk_token_threshold = chunk_token_threshold
|
||||||
self.overlap_rate = overlap_rate
|
self.overlap_rate = overlap_rate
|
||||||
@@ -872,7 +891,7 @@ class LLMContentFilter(RelevantContentFilter):
|
|||||||
self.logger.info(
|
self.logger.info(
|
||||||
"Starting LLM markdown content filtering process",
|
"Starting LLM markdown content filtering process",
|
||||||
tag="LLM",
|
tag="LLM",
|
||||||
params={"provider": self.llmConfig.provider},
|
params={"provider": self.llm_config.provider},
|
||||||
colors={"provider": Fore.CYAN},
|
colors={"provider": Fore.CYAN},
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -959,10 +978,10 @@ class LLMContentFilter(RelevantContentFilter):
|
|||||||
|
|
||||||
future = executor.submit(
|
future = executor.submit(
|
||||||
_proceed_with_chunk,
|
_proceed_with_chunk,
|
||||||
self.llmConfig.provider,
|
self.llm_config.provider,
|
||||||
prompt,
|
prompt,
|
||||||
self.llmConfig.api_token,
|
self.llm_config.api_token,
|
||||||
self.llmConfig.base_url,
|
self.llm_config.base_url,
|
||||||
self.extra_args,
|
self.extra_args,
|
||||||
)
|
)
|
||||||
futures.append((i, future))
|
futures.append((i, future))
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ from .filters import FilterChain
|
|||||||
from .scorers import URLScorer
|
from .scorers import URLScorer
|
||||||
from . import DeepCrawlStrategy
|
from . import DeepCrawlStrategy
|
||||||
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
|
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
|
||||||
|
from ..utils import normalize_url_for_deep_crawl, efficient_normalize_url_for_deep_crawl
|
||||||
from math import inf as infinity
|
from math import inf as infinity
|
||||||
|
|
||||||
class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||||
@@ -99,14 +100,17 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
|||||||
# First collect all valid links
|
# First collect all valid links
|
||||||
for link in links:
|
for link in links:
|
||||||
url = link.get("href")
|
url = link.get("href")
|
||||||
if url in visited:
|
# Strip URL fragments to avoid duplicate crawling
|
||||||
|
# base_url = url.split('#')[0] if url else url
|
||||||
|
base_url = normalize_url_for_deep_crawl(url, source_url)
|
||||||
|
if base_url in visited:
|
||||||
continue
|
continue
|
||||||
if not await self.can_process_url(url, next_depth):
|
if not await self.can_process_url(url, next_depth):
|
||||||
self.stats.urls_skipped += 1
|
self.stats.urls_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Score the URL if a scorer is provided
|
# Score the URL if a scorer is provided
|
||||||
score = self.url_scorer.score(url) if self.url_scorer else 0
|
score = self.url_scorer.score(base_url) if self.url_scorer else 0
|
||||||
|
|
||||||
# Skip URLs with scores below the threshold
|
# Skip URLs with scores below the threshold
|
||||||
if score < self.score_threshold:
|
if score < self.score_threshold:
|
||||||
@@ -114,7 +118,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
|||||||
self.stats.urls_skipped += 1
|
self.stats.urls_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
valid_links.append((url, score))
|
valid_links.append((base_url, score))
|
||||||
|
|
||||||
# If we have more valid links than capacity, sort by score and take the top ones
|
# If we have more valid links than capacity, sort by score and take the top ones
|
||||||
if len(valid_links) > remaining_capacity:
|
if len(valid_links) > remaining_capacity:
|
||||||
|
|||||||
@@ -428,6 +428,11 @@ class DomainFilter(URLFilter):
|
|||||||
return {domains.lower()}
|
return {domains.lower()}
|
||||||
return {d.lower() for d in domains}
|
return {d.lower() for d in domains}
|
||||||
|
|
||||||
|
@staticmethod
def _is_subdomain(domain: str, parent_domain: str) -> bool:
    """Return True if ``domain`` equals ``parent_domain`` or is a subdomain of it.

    Host names are compared case-insensitively and a trailing root dot
    (``"example.com."``) is ignored, so equivalent spellings of the same
    host cannot slip past a blocked/allowed entry.

    Args:
        domain: Host name to test.
        parent_domain: Host name that ``domain`` must equal or sit beneath.

    Returns:
        True on an exact match or when ``domain`` ends with
        ``"." + parent_domain``; False otherwise.
    """
    domain = domain.lower().rstrip(".")
    parent_domain = parent_domain.lower().rstrip(".")
    # The leading dot in the suffix stops "badexample.com" matching "example.com".
    return domain == parent_domain or domain.endswith(f".{parent_domain}")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@lru_cache(maxsize=10000)
|
@lru_cache(maxsize=10000)
|
||||||
def _extract_domain(url: str) -> str:
|
def _extract_domain(url: str) -> str:
|
||||||
@@ -444,20 +449,26 @@ class DomainFilter(URLFilter):
|
|||||||
|
|
||||||
domain = self._extract_domain(url)
|
domain = self._extract_domain(url)
|
||||||
|
|
||||||
# Early return for blocked domains
|
# Check for blocked domains, including subdomains
|
||||||
if domain in self._blocked_domains:
|
for blocked in self._blocked_domains:
|
||||||
self._update_stats(False)
|
if self._is_subdomain(domain, blocked):
|
||||||
return False
|
self._update_stats(False)
|
||||||
|
return False
|
||||||
|
|
||||||
# If no allowed domains specified, accept all non-blocked
|
# If no allowed domains specified, accept all non-blocked
|
||||||
if self._allowed_domains is None:
|
if self._allowed_domains is None:
|
||||||
self._update_stats(True)
|
self._update_stats(True)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# Final allowed domains check
|
# Check if domain matches any allowed domain (including subdomains)
|
||||||
result = domain in self._allowed_domains
|
for allowed in self._allowed_domains:
|
||||||
self._update_stats(result)
|
if self._is_subdomain(domain, allowed):
|
||||||
return result
|
self._update_stats(True)
|
||||||
|
return True
|
||||||
|
|
||||||
|
# No matches found
|
||||||
|
self._update_stats(False)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
class ContentRelevanceFilter(URLFilter):
|
class ContentRelevanceFilter(URLFilter):
|
||||||
|
|||||||
@@ -4,12 +4,10 @@ from typing import Any, List, Dict, Optional
|
|||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
import os
|
|
||||||
|
|
||||||
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH
|
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH
|
||||||
from .config import (
|
from .config import (
|
||||||
DEFAULT_PROVIDER, PROVIDER_MODELS,
|
DEFAULT_PROVIDER, CHUNK_TOKEN_THRESHOLD,
|
||||||
CHUNK_TOKEN_THRESHOLD,
|
|
||||||
OVERLAP_RATE,
|
OVERLAP_RATE,
|
||||||
WORD_TOKEN_RATE,
|
WORD_TOKEN_RATE,
|
||||||
)
|
)
|
||||||
@@ -22,9 +20,7 @@ from .utils import (
|
|||||||
extract_xml_data,
|
extract_xml_data,
|
||||||
split_and_parse_json_objects,
|
split_and_parse_json_objects,
|
||||||
sanitize_input_encode,
|
sanitize_input_encode,
|
||||||
chunk_documents,
|
|
||||||
merge_chunks,
|
merge_chunks,
|
||||||
advanced_split,
|
|
||||||
)
|
)
|
||||||
from .models import * # noqa: F403
|
from .models import * # noqa: F403
|
||||||
|
|
||||||
@@ -38,8 +34,9 @@ from .model_loader import (
|
|||||||
calculate_batch_size
|
calculate_batch_size
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from .types import LLMConfig
|
||||||
|
|
||||||
from functools import partial
|
from functools import partial
|
||||||
import math
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import re
|
import re
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
@@ -481,8 +478,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
A strategy that uses an LLM to extract meaningful content from the HTML.
|
A strategy that uses an LLM to extract meaningful content from the HTML.
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
|
llm_config: The LLM configuration object.
|
||||||
api_token: The API token for the provider.
|
|
||||||
instruction: The instruction to use for the LLM model.
|
instruction: The instruction to use for the LLM model.
|
||||||
schema: Pydantic model schema for structured data.
|
schema: Pydantic model schema for structured data.
|
||||||
extraction_type: "block" or "schema".
|
extraction_type: "block" or "schema".
|
||||||
@@ -490,27 +486,20 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
overlap_rate: Overlap between chunks.
|
overlap_rate: Overlap between chunks.
|
||||||
word_token_rate: Word to token conversion rate.
|
word_token_rate: Word to token conversion rate.
|
||||||
apply_chunking: Whether to apply chunking.
|
apply_chunking: Whether to apply chunking.
|
||||||
base_url: The base URL for the API request.
|
|
||||||
api_base: The base URL for the API request.
|
|
||||||
extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
|
|
||||||
verbose: Whether to print verbose output.
|
verbose: Whether to print verbose output.
|
||||||
usages: List of individual token usages.
|
usages: List of individual token usages.
|
||||||
total_usage: Accumulated token usage.
|
total_usage: Accumulated token usage.
|
||||||
"""
|
"""
|
||||||
# Replacement hints for the deprecated arguments (provider, api_token,
# base_url, api_base); every hint points at the llm_config=LLMConfig(...) form.
_UNWANTED_PROPS = {
    'provider' : 'Instead, use llm_config=LLMConfig(provider="...")',
    'api_token' : 'Instead, use llm_config=LLMConfig(api_token="...")',
    'base_url' : 'Instead, use llm_config=LLMConfig(base_url="...")',
    'api_base' : 'Instead, use llm_config=LLMConfig(base_url="...")',
}
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
llmConfig: 'LLMConfig' = None,
|
llm_config: 'LLMConfig' = None,
|
||||||
instruction: str = None,
|
instruction: str = None,
|
||||||
provider: str = DEFAULT_PROVIDER,
|
|
||||||
api_token: Optional[str] = None,
|
|
||||||
base_url: str = None,
|
|
||||||
api_base: str = None,
|
|
||||||
schema: Dict = None,
|
schema: Dict = None,
|
||||||
extraction_type="block",
|
extraction_type="block",
|
||||||
chunk_token_threshold=CHUNK_TOKEN_THRESHOLD,
|
chunk_token_threshold=CHUNK_TOKEN_THRESHOLD,
|
||||||
@@ -519,15 +508,18 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
apply_chunking=True,
|
apply_chunking=True,
|
||||||
input_format: str = "markdown",
|
input_format: str = "markdown",
|
||||||
verbose=False,
|
verbose=False,
|
||||||
|
# Deprecated arguments
|
||||||
|
provider: str = DEFAULT_PROVIDER,
|
||||||
|
api_token: Optional[str] = None,
|
||||||
|
base_url: str = None,
|
||||||
|
api_base: str = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initialize the strategy with clustering parameters.
|
Initialize the strategy with clustering parameters.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
llmConfig: The LLM configuration object.
|
llm_config: The LLM configuration object.
|
||||||
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
|
|
||||||
api_token: The API token for the provider.
|
|
||||||
instruction: The instruction to use for the LLM model.
|
instruction: The instruction to use for the LLM model.
|
||||||
schema: Pydantic model schema for structured data.
|
schema: Pydantic model schema for structured data.
|
||||||
extraction_type: "block" or "schema".
|
extraction_type: "block" or "schema".
|
||||||
@@ -535,20 +527,19 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
overlap_rate: Overlap between chunks.
|
overlap_rate: Overlap between chunks.
|
||||||
word_token_rate: Word to token conversion rate.
|
word_token_rate: Word to token conversion rate.
|
||||||
apply_chunking: Whether to apply chunking.
|
apply_chunking: Whether to apply chunking.
|
||||||
base_url: The base URL for the API request.
|
|
||||||
api_base: The base URL for the API request.
|
|
||||||
extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
|
|
||||||
verbose: Whether to print verbose output.
|
verbose: Whether to print verbose output.
|
||||||
usages: List of individual token usages.
|
usages: List of individual token usages.
|
||||||
total_usage: Accumulated token usage.
|
total_usage: Accumulated token usage.
|
||||||
|
|
||||||
|
# Deprecated arguments, will be removed very soon
|
||||||
|
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
|
||||||
|
api_token: The API token for the provider.
|
||||||
|
base_url: The base URL for the API request.
|
||||||
|
api_base: The base URL for the API request.
|
||||||
|
extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
|
||||||
"""
|
"""
|
||||||
super().__init__( input_format=input_format, **kwargs)
|
super().__init__( input_format=input_format, **kwargs)
|
||||||
self.llmConfig = llmConfig
|
self.llm_config = llm_config
|
||||||
self.provider = provider
|
|
||||||
self.api_token = api_token
|
|
||||||
self.base_url = base_url
|
|
||||||
self.api_base = api_base
|
|
||||||
self.instruction = instruction
|
self.instruction = instruction
|
||||||
self.extract_type = extraction_type
|
self.extract_type = extraction_type
|
||||||
self.schema = schema
|
self.schema = schema
|
||||||
@@ -565,6 +556,11 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
self.usages = [] # Store individual usages
|
self.usages = [] # Store individual usages
|
||||||
self.total_usage = TokenUsage() # Accumulated usage
|
self.total_usage = TokenUsage() # Accumulated usage
|
||||||
|
|
||||||
|
self.provider = provider
|
||||||
|
self.api_token = api_token
|
||||||
|
self.base_url = base_url
|
||||||
|
self.api_base = api_base
|
||||||
|
|
||||||
|
|
||||||
def __setattr__(self, name, value):
|
def __setattr__(self, name, value):
|
||||||
"""Handle attribute setting."""
|
"""Handle attribute setting."""
|
||||||
@@ -618,10 +614,10 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
)
|
)
|
||||||
|
|
||||||
response = perform_completion_with_backoff(
|
response = perform_completion_with_backoff(
|
||||||
self.llmConfig.provider,
|
self.llm_config.provider,
|
||||||
prompt_with_variables,
|
prompt_with_variables,
|
||||||
self.llmConfig.api_token,
|
self.llm_config.api_token,
|
||||||
base_url=self.llmConfig.base_url,
|
base_url=self.llm_config.base_url,
|
||||||
extra_args=self.extra_args,
|
extra_args=self.extra_args,
|
||||||
) # , json_response=self.extract_type == "schema")
|
) # , json_response=self.extract_type == "schema")
|
||||||
# Track usage
|
# Track usage
|
||||||
@@ -701,7 +697,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
overlap=int(self.chunk_token_threshold * self.overlap_rate),
|
overlap=int(self.chunk_token_threshold * self.overlap_rate),
|
||||||
)
|
)
|
||||||
extracted_content = []
|
extracted_content = []
|
||||||
if self.llmConfig.provider.startswith("groq/"):
|
if self.llm_config.provider.startswith("groq/"):
|
||||||
# Sequential processing with a delay
|
# Sequential processing with a delay
|
||||||
for ix, section in enumerate(merged_sections):
|
for ix, section in enumerate(merged_sections):
|
||||||
extract_func = partial(self.extract, url)
|
extract_func = partial(self.extract, url)
|
||||||
@@ -1043,8 +1039,8 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
# Replacement hints for the legacy provider/api_token parameters; both hints
# point at the llm_config=LLMConfig(...) form.
_GENERATE_SCHEMA_UNWANTED_PROPS = {
    'provider': 'Instead, use llm_config=LLMConfig(provider="...")',
    'api_token': 'Instead, use llm_config=LLMConfig(api_token="...")',
}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -1053,7 +1049,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
|||||||
schema_type: str = "CSS", # or XPATH
|
schema_type: str = "CSS", # or XPATH
|
||||||
query: str = None,
|
query: str = None,
|
||||||
target_json_example: str = None,
|
target_json_example: str = None,
|
||||||
llmConfig: 'LLMConfig' = None,
|
llm_config: 'LLMConfig' = None,
|
||||||
provider: str = None,
|
provider: str = None,
|
||||||
api_token: str = None,
|
api_token: str = None,
|
||||||
**kwargs
|
**kwargs
|
||||||
@@ -1066,9 +1062,9 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
|||||||
query (str, optional): Natural language description of what data to extract
|
query (str, optional): Natural language description of what data to extract
|
||||||
provider (str): Legacy Parameter. LLM provider to use
|
provider (str): Legacy Parameter. LLM provider to use
|
||||||
api_token (str): Legacy Parameter. API token for LLM provider
|
api_token (str): Legacy Parameter. API token for LLM provider
|
||||||
llmConfig (LlmConfig): LLM configuration object
|
llm_config (LLMConfig): LLM configuration object
|
||||||
prompt (str, optional): Custom prompt template to use
|
prompt (str, optional): Custom prompt template to use
|
||||||
**kwargs: Additional args passed to perform_completion_with_backoff
|
**kwargs: Additional args passed to LLM processor
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict: Generated schema following the JsonElementExtractionStrategy format
|
dict: Generated schema following the JsonElementExtractionStrategy format
|
||||||
@@ -1130,11 +1126,12 @@ In this scenario, use your best judgment to generate the schema. Try to maximize
|
|||||||
try:
|
try:
|
||||||
# Call LLM with backoff handling
|
# Call LLM with backoff handling
|
||||||
response = perform_completion_with_backoff(
|
response = perform_completion_with_backoff(
|
||||||
provider=llmConfig.provider,
|
provider=llm_config.provider,
|
||||||
prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
|
prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
|
||||||
json_response = True,
|
json_response = True,
|
||||||
api_token=llmConfig.api_token,
|
api_token=llm_config.api_token,
|
||||||
**kwargs
|
base_url=llm_config.base_url,
|
||||||
|
extra_args=kwargs
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract and return schema
|
# Extract and return schema
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from tabnanny import verbose
|
|
||||||
from typing import Optional, Dict, Any, Tuple
|
from typing import Optional, Dict, Any, Tuple
|
||||||
from .models import MarkdownGenerationResult
|
from .models import MarkdownGenerationResult
|
||||||
from .html2text import CustomHTML2Text
|
from .html2text import CustomHTML2Text
|
||||||
|
# from .types import RelevantContentFilter
|
||||||
from .content_filter_strategy import RelevantContentFilter
|
from .content_filter_strategy import RelevantContentFilter
|
||||||
import re
|
import re
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|||||||
@@ -37,13 +37,33 @@ class CrawlStatus(Enum):
|
|||||||
FAILED = "FAILED"
|
FAILED = "FAILED"
|
||||||
|
|
||||||
|
|
||||||
|
# @dataclass
|
||||||
|
# class CrawlStats:
|
||||||
|
# task_id: str
|
||||||
|
# url: str
|
||||||
|
# status: CrawlStatus
|
||||||
|
# start_time: Optional[datetime] = None
|
||||||
|
# end_time: Optional[datetime] = None
|
||||||
|
# memory_usage: float = 0.0
|
||||||
|
# peak_memory: float = 0.0
|
||||||
|
# error_message: str = ""
|
||||||
|
|
||||||
|
# @property
|
||||||
|
# def duration(self) -> str:
|
||||||
|
# if not self.start_time:
|
||||||
|
# return "0:00"
|
||||||
|
# end = self.end_time or datetime.now()
|
||||||
|
# duration = end - self.start_time
|
||||||
|
# return str(timedelta(seconds=int(duration.total_seconds())))
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class CrawlStats:
|
class CrawlStats:
|
||||||
task_id: str
|
task_id: str
|
||||||
url: str
|
url: str
|
||||||
status: CrawlStatus
|
status: CrawlStatus
|
||||||
start_time: Optional[datetime] = None
|
start_time: Optional[Union[datetime, float]] = None
|
||||||
end_time: Optional[datetime] = None
|
end_time: Optional[Union[datetime, float]] = None
|
||||||
memory_usage: float = 0.0
|
memory_usage: float = 0.0
|
||||||
peak_memory: float = 0.0
|
peak_memory: float = 0.0
|
||||||
error_message: str = ""
|
error_message: str = ""
|
||||||
@@ -52,10 +72,20 @@ class CrawlStats:
|
|||||||
def duration(self) -> str:
    """Return the elapsed time between ``start_time`` and ``end_time`` as text.

    ``start_time`` and ``end_time`` may each be a ``datetime`` or a numeric
    epoch timestamp (``int`` or ``float``); numeric values are converted
    with ``datetime.fromtimestamp``. When ``end_time`` is unset the current
    time is used.

    Returns:
        ``"0:00"`` when ``start_time`` is unset, otherwise
        ``str(timedelta)`` of the elapsed time truncated to whole seconds
        (for example ``"0:01:15"``).
    """
    if not self.start_time:
        return "0:00"

    def _as_datetime(value):
        # Integer timestamps (e.g. int(time.time())) are as valid as float
        # ones; left unconverted, the subtraction below raises TypeError.
        if isinstance(value, (int, float)):
            return datetime.fromtimestamp(value)
        return value

    start = _as_datetime(self.start_time)
    # Fall back to "now" for a task that is still running.
    end = _as_datetime(self.end_time or datetime.now())

    elapsed = end - start
    return str(timedelta(seconds=int(elapsed.total_seconds())))
|
||||||
|
|
||||||
class DisplayMode(Enum):
|
class DisplayMode(Enum):
|
||||||
DETAILED = "DETAILED"
|
DETAILED = "DETAILED"
|
||||||
@@ -149,7 +179,11 @@ class CrawlResult(BaseModel):
|
|||||||
markdown_result = data.pop('markdown', None)
|
markdown_result = data.pop('markdown', None)
|
||||||
super().__init__(**data)
|
super().__init__(**data)
|
||||||
if markdown_result is not None:
|
if markdown_result is not None:
|
||||||
self._markdown = markdown_result
|
self._markdown = (
|
||||||
|
MarkdownGenerationResult(**markdown_result)
|
||||||
|
if isinstance(markdown_result, dict)
|
||||||
|
else markdown_result
|
||||||
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def markdown(self):
|
def markdown(self):
|
||||||
|
|||||||
@@ -1,14 +1,181 @@
|
|||||||
from typing import TYPE_CHECKING, Union
|
from typing import TYPE_CHECKING, Union
|
||||||
|
|
||||||
AsyncWebCrawler = Union['AsyncWebCrawlerType'] # Note the string literal
|
# Logger types
|
||||||
CrawlerRunConfig = Union['CrawlerRunConfigType']
|
AsyncLoggerBase = Union['AsyncLoggerBaseType']
|
||||||
|
AsyncLogger = Union['AsyncLoggerType']
|
||||||
|
|
||||||
|
# Crawler core types
|
||||||
|
AsyncWebCrawler = Union['AsyncWebCrawlerType']
|
||||||
|
CacheMode = Union['CacheModeType']
|
||||||
CrawlResult = Union['CrawlResultType']
|
CrawlResult = Union['CrawlResultType']
|
||||||
|
CrawlerHub = Union['CrawlerHubType']
|
||||||
|
BrowserProfiler = Union['BrowserProfilerType']
|
||||||
|
|
||||||
|
# Configuration types
|
||||||
|
BrowserConfig = Union['BrowserConfigType']
|
||||||
|
CrawlerRunConfig = Union['CrawlerRunConfigType']
|
||||||
|
HTTPCrawlerConfig = Union['HTTPCrawlerConfigType']
|
||||||
|
LLMConfig = Union['LLMConfigType']
|
||||||
|
|
||||||
|
# Content scraping types
|
||||||
|
ContentScrapingStrategy = Union['ContentScrapingStrategyType']
|
||||||
|
WebScrapingStrategy = Union['WebScrapingStrategyType']
|
||||||
|
LXMLWebScrapingStrategy = Union['LXMLWebScrapingStrategyType']
|
||||||
|
|
||||||
|
# Proxy types
|
||||||
|
ProxyRotationStrategy = Union['ProxyRotationStrategyType']
|
||||||
|
RoundRobinProxyStrategy = Union['RoundRobinProxyStrategyType']
|
||||||
|
|
||||||
|
# Extraction types
|
||||||
|
ExtractionStrategy = Union['ExtractionStrategyType']
|
||||||
|
LLMExtractionStrategy = Union['LLMExtractionStrategyType']
|
||||||
|
CosineStrategy = Union['CosineStrategyType']
|
||||||
|
JsonCssExtractionStrategy = Union['JsonCssExtractionStrategyType']
|
||||||
|
JsonXPathExtractionStrategy = Union['JsonXPathExtractionStrategyType']
|
||||||
|
|
||||||
|
# Chunking types
|
||||||
|
ChunkingStrategy = Union['ChunkingStrategyType']
|
||||||
|
RegexChunking = Union['RegexChunkingType']
|
||||||
|
|
||||||
|
# Markdown generation types
|
||||||
|
DefaultMarkdownGenerator = Union['DefaultMarkdownGeneratorType']
|
||||||
|
MarkdownGenerationResult = Union['MarkdownGenerationResultType']
|
||||||
|
|
||||||
|
# Content filter types
|
||||||
|
RelevantContentFilter = Union['RelevantContentFilterType']
|
||||||
|
PruningContentFilter = Union['PruningContentFilterType']
|
||||||
|
BM25ContentFilter = Union['BM25ContentFilterType']
|
||||||
|
LLMContentFilter = Union['LLMContentFilterType']
|
||||||
|
|
||||||
|
# Dispatcher types
|
||||||
|
BaseDispatcher = Union['BaseDispatcherType']
|
||||||
|
MemoryAdaptiveDispatcher = Union['MemoryAdaptiveDispatcherType']
|
||||||
|
SemaphoreDispatcher = Union['SemaphoreDispatcherType']
|
||||||
|
RateLimiter = Union['RateLimiterType']
|
||||||
|
CrawlerMonitor = Union['CrawlerMonitorType']
|
||||||
|
DisplayMode = Union['DisplayModeType']
|
||||||
RunManyReturn = Union['RunManyReturnType']
|
RunManyReturn = Union['RunManyReturnType']
|
||||||
|
|
||||||
|
# Docker client
|
||||||
|
Crawl4aiDockerClient = Union['Crawl4aiDockerClientType']
|
||||||
|
|
||||||
|
# Deep crawling types
|
||||||
|
DeepCrawlStrategy = Union['DeepCrawlStrategyType']
|
||||||
|
BFSDeepCrawlStrategy = Union['BFSDeepCrawlStrategyType']
|
||||||
|
FilterChain = Union['FilterChainType']
|
||||||
|
ContentTypeFilter = Union['ContentTypeFilterType']
|
||||||
|
DomainFilter = Union['DomainFilterType']
|
||||||
|
URLFilter = Union['URLFilterType']
|
||||||
|
FilterStats = Union['FilterStatsType']
|
||||||
|
SEOFilter = Union['SEOFilterType']
|
||||||
|
KeywordRelevanceScorer = Union['KeywordRelevanceScorerType']
|
||||||
|
URLScorer = Union['URLScorerType']
|
||||||
|
CompositeScorer = Union['CompositeScorerType']
|
||||||
|
DomainAuthorityScorer = Union['DomainAuthorityScorerType']
|
||||||
|
FreshnessScorer = Union['FreshnessScorerType']
|
||||||
|
PathDepthScorer = Union['PathDepthScorerType']
|
||||||
|
BestFirstCrawlingStrategy = Union['BestFirstCrawlingStrategyType']
|
||||||
|
DFSDeepCrawlStrategy = Union['DFSDeepCrawlStrategyType']
|
||||||
|
DeepCrawlDecorator = Union['DeepCrawlDecoratorType']
|
||||||
|
|
||||||
|
# Only import types during type checking to avoid circular imports
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from . import (
|
# Logger imports
|
||||||
|
from .async_logger import (
|
||||||
|
AsyncLoggerBase as AsyncLoggerBaseType,
|
||||||
|
AsyncLogger as AsyncLoggerType,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Crawler core imports
|
||||||
|
from .async_webcrawler import (
|
||||||
AsyncWebCrawler as AsyncWebCrawlerType,
|
AsyncWebCrawler as AsyncWebCrawlerType,
|
||||||
|
CacheMode as CacheModeType,
|
||||||
|
)
|
||||||
|
from .models import CrawlResult as CrawlResultType
|
||||||
|
from .hub import CrawlerHub as CrawlerHubType
|
||||||
|
from .browser_profiler import BrowserProfiler as BrowserProfilerType
|
||||||
|
|
||||||
|
# Configuration imports
|
||||||
|
from .async_configs import (
|
||||||
|
BrowserConfig as BrowserConfigType,
|
||||||
CrawlerRunConfig as CrawlerRunConfigType,
|
CrawlerRunConfig as CrawlerRunConfigType,
|
||||||
CrawlResult as CrawlResultType,
|
HTTPCrawlerConfig as HTTPCrawlerConfigType,
|
||||||
|
LLMConfig as LLMConfigType,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Content scraping imports
|
||||||
|
from .content_scraping_strategy import (
|
||||||
|
ContentScrapingStrategy as ContentScrapingStrategyType,
|
||||||
|
WebScrapingStrategy as WebScrapingStrategyType,
|
||||||
|
LXMLWebScrapingStrategy as LXMLWebScrapingStrategyType,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Proxy imports
|
||||||
|
from .proxy_strategy import (
|
||||||
|
ProxyRotationStrategy as ProxyRotationStrategyType,
|
||||||
|
RoundRobinProxyStrategy as RoundRobinProxyStrategyType,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Extraction imports
|
||||||
|
from .extraction_strategy import (
|
||||||
|
ExtractionStrategy as ExtractionStrategyType,
|
||||||
|
LLMExtractionStrategy as LLMExtractionStrategyType,
|
||||||
|
CosineStrategy as CosineStrategyType,
|
||||||
|
JsonCssExtractionStrategy as JsonCssExtractionStrategyType,
|
||||||
|
JsonXPathExtractionStrategy as JsonXPathExtractionStrategyType,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Chunking imports
|
||||||
|
from .chunking_strategy import (
|
||||||
|
ChunkingStrategy as ChunkingStrategyType,
|
||||||
|
RegexChunking as RegexChunkingType,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Markdown generation imports
|
||||||
|
from .markdown_generation_strategy import (
|
||||||
|
DefaultMarkdownGenerator as DefaultMarkdownGeneratorType,
|
||||||
|
)
|
||||||
|
from .models import MarkdownGenerationResult as MarkdownGenerationResultType
|
||||||
|
|
||||||
|
# Content filter imports
|
||||||
|
from .content_filter_strategy import (
|
||||||
|
RelevantContentFilter as RelevantContentFilterType,
|
||||||
|
PruningContentFilter as PruningContentFilterType,
|
||||||
|
BM25ContentFilter as BM25ContentFilterType,
|
||||||
|
LLMContentFilter as LLMContentFilterType,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Dispatcher imports
|
||||||
|
from .async_dispatcher import (
|
||||||
|
BaseDispatcher as BaseDispatcherType,
|
||||||
|
MemoryAdaptiveDispatcher as MemoryAdaptiveDispatcherType,
|
||||||
|
SemaphoreDispatcher as SemaphoreDispatcherType,
|
||||||
|
RateLimiter as RateLimiterType,
|
||||||
|
CrawlerMonitor as CrawlerMonitorType,
|
||||||
|
DisplayMode as DisplayModeType,
|
||||||
RunManyReturn as RunManyReturnType,
|
RunManyReturn as RunManyReturnType,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Docker client
|
||||||
|
from .docker_client import Crawl4aiDockerClient as Crawl4aiDockerClientType
|
||||||
|
|
||||||
|
# Deep crawling imports
|
||||||
|
from .deep_crawling import (
|
||||||
|
DeepCrawlStrategy as DeepCrawlStrategyType,
|
||||||
|
BFSDeepCrawlStrategy as BFSDeepCrawlStrategyType,
|
||||||
|
FilterChain as FilterChainType,
|
||||||
|
ContentTypeFilter as ContentTypeFilterType,
|
||||||
|
DomainFilter as DomainFilterType,
|
||||||
|
URLFilter as URLFilterType,
|
||||||
|
FilterStats as FilterStatsType,
|
||||||
|
SEOFilter as SEOFilterType,
|
||||||
|
KeywordRelevanceScorer as KeywordRelevanceScorerType,
|
||||||
|
URLScorer as URLScorerType,
|
||||||
|
CompositeScorer as CompositeScorerType,
|
||||||
|
DomainAuthorityScorer as DomainAuthorityScorerType,
|
||||||
|
FreshnessScorer as FreshnessScorerType,
|
||||||
|
PathDepthScorer as PathDepthScorerType,
|
||||||
|
BestFirstCrawlingStrategy as BestFirstCrawlingStrategyType,
|
||||||
|
DFSDeepCrawlStrategy as DFSDeepCrawlStrategyType,
|
||||||
|
DeepCrawlDecorator as DeepCrawlDecoratorType,
|
||||||
|
)
|
||||||
@@ -1,5 +1,4 @@
|
|||||||
import time
|
import time
|
||||||
from urllib.parse import urlparse
|
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
|
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
|
||||||
import json
|
import json
|
||||||
@@ -33,6 +32,8 @@ import hashlib
|
|||||||
|
|
||||||
from urllib.robotparser import RobotFileParser
|
from urllib.robotparser import RobotFileParser
|
||||||
import aiohttp
|
import aiohttp
|
||||||
|
from urllib.parse import urlparse, urlunparse
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
from packaging import version
|
from packaging import version
|
||||||
from . import __version__
|
from . import __version__
|
||||||
@@ -1962,6 +1963,82 @@ def normalize_url(href, base_url):
|
|||||||
return normalized
|
return normalized
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_url_for_deep_crawl(href, base_url):
    """Resolve ``href`` against ``base_url`` and return a canonical URL string.

    Normalisations applied:
      * relative references are resolved with ``urljoin``;
      * the network location (host) is lower-cased;
      * the fragment is removed;
      * trailing slashes are stripped from the path (an empty path becomes ``/``);
      * common tracking parameters are dropped and the remaining query
        parameters are re-encoded in sorted key order.

    Args:
        href: Link target as found in the page; may be relative.
        base_url: URL of the page the link was found on.

    Returns:
        The normalised absolute URL, or ``None`` when ``href`` is empty or None.
    """
    from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode

    # Handle None or empty values
    if not href:
        return None

    # Tracking parameters (example - customize as needed)
    tracking_params = ('utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid')

    # Resolve relative URLs, then split into components for normalisation
    parsed = urlparse(urljoin(base_url, href.strip()))

    query = parsed.query
    if query:
        # keep_blank_values=True: a parameter such as "?q=" is part of the URL
        # and must not be silently dropped.
        params = parse_qs(query, keep_blank_values=True)
        for param in tracking_params:
            params.pop(param, None)
        # Sort by key so "?a=1&b=2" and "?b=2&a=1" normalise to the same string.
        query = urlencode(sorted(params.items()), doseq=True) if params else ''

    return urlunparse((
        parsed.scheme,
        parsed.netloc.lower(),            # host names are case-insensitive
        parsed.path.rstrip('/') or '/',   # normalize trailing slash
        parsed.params,
        query,
        '',                               # fragment removed entirely
    ))
|
||||||
|
|
||||||
|
@lru_cache(maxsize=10000)
def efficient_normalize_url_for_deep_crawl(href, base_url):
    """Lightweight, cached URL normalisation.

    Resolves ``href`` against ``base_url`` and applies only the cheapest
    normalisations: the network location (host) is lower-cased, the fragment
    is removed, and an empty path on a URL that has a host becomes ``/`` so
    that ``https://host`` and ``https://host/`` are not treated as two pages.
    The query string is left untouched. Results are memoised with
    ``lru_cache``, so both arguments must be hashable.

    Args:
        href: Link target as found in the page; may be relative.
        base_url: URL of the page the link was found on.

    Returns:
        The normalised absolute URL, or ``None`` when ``href`` is empty or None.
    """
    from urllib.parse import urljoin, urlparse, urlunparse

    if not href:
        return None

    # Resolve relative URLs, then split into components
    parsed = urlparse(urljoin(base_url, href.strip()))

    path = parsed.path
    if not path and parsed.netloc:
        # With an authority present, an empty path is equivalent to "/".
        path = '/'

    return urlunparse((
        parsed.scheme,
        parsed.netloc.lower(),
        path,
        parsed.params,
        parsed.query,
        '',  # Remove fragment
    ))
|
||||||
|
|
||||||
|
|
||||||
def normalize_url_tmp(href, base_url):
|
def normalize_url_tmp(href, base_url):
|
||||||
"""Normalize URLs to ensure consistent format"""
|
"""Normalize URLs to ensure consistent format"""
|
||||||
# Extract protocol and domain from base URL
|
# Extract protocol and domain from base URL
|
||||||
|
|||||||
@@ -595,8 +595,8 @@ curl http://localhost:8000/health
|
|||||||
## Complete Examples
|
## Complete Examples
|
||||||
|
|
||||||
Check out the `examples` folder in our repository for full working examples! Here are two to get you started:
|
Check out the `examples` folder in our repository for full working examples! Here are two to get you started:
|
||||||
[Using Client SDK](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_sdk_example.py)
|
[Using Client SDK](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_sdk.py)
|
||||||
[Using REST API](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_rest_api_example.py)
|
[Using REST API](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_rest_api.py)
|
||||||
|
|
||||||
## Server Configuration
|
## Server Configuration
|
||||||
|
|
||||||
|
|||||||
@@ -18,7 +18,8 @@ from crawl4ai import (
|
|||||||
CacheMode,
|
CacheMode,
|
||||||
BrowserConfig,
|
BrowserConfig,
|
||||||
MemoryAdaptiveDispatcher,
|
MemoryAdaptiveDispatcher,
|
||||||
RateLimiter
|
RateLimiter,
|
||||||
|
LLMConfig
|
||||||
)
|
)
|
||||||
from crawl4ai.utils import perform_completion_with_backoff
|
from crawl4ai.utils import perform_completion_with_backoff
|
||||||
from crawl4ai.content_filter_strategy import (
|
from crawl4ai.content_filter_strategy import (
|
||||||
@@ -103,8 +104,10 @@ async def process_llm_extraction(
|
|||||||
else:
|
else:
|
||||||
api_key = os.environ.get(config["llm"].get("api_key_env", None), "")
|
api_key = os.environ.get(config["llm"].get("api_key_env", None), "")
|
||||||
llm_strategy = LLMExtractionStrategy(
|
llm_strategy = LLMExtractionStrategy(
|
||||||
provider=config["llm"]["provider"],
|
llm_config=LLMConfig(
|
||||||
api_token=api_key,
|
provider=config["llm"]["provider"],
|
||||||
|
api_token=api_key
|
||||||
|
),
|
||||||
instruction=instruction,
|
instruction=instruction,
|
||||||
schema=json.loads(schema) if schema else None,
|
schema=json.loads(schema) if schema else None,
|
||||||
)
|
)
|
||||||
@@ -164,8 +167,10 @@ async def handle_markdown_request(
|
|||||||
FilterType.FIT: PruningContentFilter(),
|
FilterType.FIT: PruningContentFilter(),
|
||||||
FilterType.BM25: BM25ContentFilter(user_query=query or ""),
|
FilterType.BM25: BM25ContentFilter(user_query=query or ""),
|
||||||
FilterType.LLM: LLMContentFilter(
|
FilterType.LLM: LLMContentFilter(
|
||||||
provider=config["llm"]["provider"],
|
llm_config=LLMConfig(
|
||||||
api_token=os.environ.get(config["llm"].get("api_key_env", None), ""),
|
provider=config["llm"]["provider"],
|
||||||
|
api_token=os.environ.get(config["llm"].get("api_key_env", None), ""),
|
||||||
|
),
|
||||||
instruction=query or "Extract main content"
|
instruction=query or "Extract main content"
|
||||||
)
|
)
|
||||||
}[filter_type]
|
}[filter_type]
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ from pydantic.main import BaseModel
|
|||||||
import base64
|
import base64
|
||||||
|
|
||||||
instance = JWT()
|
instance = JWT()
|
||||||
security = HTTPBearer()
|
security = HTTPBearer(auto_error=False)
|
||||||
SECRET_KEY = os.environ.get("SECRET_KEY", "mysecret")
|
SECRET_KEY = os.environ.get("SECRET_KEY", "mysecret")
|
||||||
ACCESS_TOKEN_EXPIRE_MINUTES = 60
|
ACCESS_TOKEN_EXPIRE_MINUTES = 60
|
||||||
|
|
||||||
@@ -30,6 +30,9 @@ def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -
|
|||||||
|
|
||||||
def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
|
def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
|
||||||
"""Verify the JWT token from the Authorization header."""
|
"""Verify the JWT token from the Authorization header."""
|
||||||
|
|
||||||
|
if credentials is None:
|
||||||
|
return None
|
||||||
token = credentials.credentials
|
token = credentials.credentials
|
||||||
verifying_key = get_jwk_from_secret(SECRET_KEY)
|
verifying_key = get_jwk_from_secret(SECRET_KEY)
|
||||||
try:
|
try:
|
||||||
@@ -38,9 +41,15 @@ def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security))
|
|||||||
except Exception:
|
except Exception:
|
||||||
raise HTTPException(status_code=401, detail="Invalid or expired token")
|
raise HTTPException(status_code=401, detail="Invalid or expired token")
|
||||||
|
|
||||||
|
|
||||||
def get_token_dependency(config: Dict):
|
def get_token_dependency(config: Dict):
|
||||||
"""Return the token dependency if JWT is enabled, else None."""
|
"""Return the token dependency if JWT is enabled, else a function that returns None."""
|
||||||
return verify_token if config.get("security", {}).get("jwt_enabled", False) else None
|
|
||||||
|
if config.get("security", {}).get("jwt_enabled", False):
|
||||||
|
return verify_token
|
||||||
|
else:
|
||||||
|
return lambda: None
|
||||||
|
|
||||||
|
|
||||||
class TokenRequest(BaseModel):
|
class TokenRequest(BaseModel):
|
||||||
email: EmailStr
|
email: EmailStr
|
||||||
@@ -3,7 +3,7 @@ app:
|
|||||||
title: "Crawl4AI API"
|
title: "Crawl4AI API"
|
||||||
version: "1.0.0"
|
version: "1.0.0"
|
||||||
host: "0.0.0.0"
|
host: "0.0.0.0"
|
||||||
port: 8000
|
port: 8020
|
||||||
reload: True
|
reload: True
|
||||||
timeout_keep_alive: 300
|
timeout_keep_alive: 300
|
||||||
|
|
||||||
|
|||||||
@@ -65,7 +65,6 @@ async def basic_deep_crawl():
|
|||||||
f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
|
f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# 2️⃣ Stream vs. Non-Stream Execution
|
# 2️⃣ Stream vs. Non-Stream Execution
|
||||||
async def stream_vs_nonstream():
|
async def stream_vs_nonstream():
|
||||||
"""
|
"""
|
||||||
@@ -127,7 +126,6 @@ async def stream_vs_nonstream():
|
|||||||
print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
|
print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
|
||||||
print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
|
print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
|
||||||
|
|
||||||
|
|
||||||
# 3️⃣ Introduce Filters & Scorers
|
# 3️⃣ Introduce Filters & Scorers
|
||||||
async def filters_and_scorers():
|
async def filters_and_scorers():
|
||||||
"""
|
"""
|
||||||
@@ -236,82 +234,10 @@ async def filters_and_scorers():
|
|||||||
print(f" ✅ Crawler prioritized {len(results)} pages by relevance score")
|
print(f" ✅ Crawler prioritized {len(results)} pages by relevance score")
|
||||||
print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
|
print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
|
||||||
|
|
||||||
|
# 4️⃣ Advanced Filters
|
||||||
# 4️⃣ Wrap-Up and Key Takeaways
|
|
||||||
async def wrap_up():
|
|
||||||
"""
|
|
||||||
PART 4: Wrap-Up and Key Takeaways
|
|
||||||
|
|
||||||
Summarize the key concepts learned in this tutorial.
|
|
||||||
"""
|
|
||||||
print("\n===== COMPLETE CRAWLER EXAMPLE =====")
|
|
||||||
print("Combining filters, scorers, and streaming for an optimized crawl")
|
|
||||||
|
|
||||||
# Create a sophisticated filter chain
|
|
||||||
filter_chain = FilterChain(
|
|
||||||
[
|
|
||||||
DomainFilter(
|
|
||||||
allowed_domains=["docs.crawl4ai.com"],
|
|
||||||
blocked_domains=["old.docs.crawl4ai.com"],
|
|
||||||
),
|
|
||||||
URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
|
|
||||||
ContentTypeFilter(allowed_types=["text/html"]),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create a composite scorer that combines multiple scoring strategies
|
|
||||||
keyword_scorer = KeywordRelevanceScorer(
|
|
||||||
keywords=["crawl", "example", "async", "configuration"], weight=0.7
|
|
||||||
)
|
|
||||||
# Set up the configuration
|
|
||||||
config = CrawlerRunConfig(
|
|
||||||
deep_crawl_strategy=BestFirstCrawlingStrategy(
|
|
||||||
max_depth=1,
|
|
||||||
include_external=False,
|
|
||||||
filter_chain=filter_chain,
|
|
||||||
url_scorer=keyword_scorer,
|
|
||||||
),
|
|
||||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
|
||||||
stream=True,
|
|
||||||
verbose=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Execute the crawl
|
|
||||||
results = []
|
|
||||||
start_time = time.perf_counter()
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
async for result in await crawler.arun(
|
|
||||||
url="https://docs.crawl4ai.com", config=config
|
|
||||||
):
|
|
||||||
results.append(result)
|
|
||||||
score = result.metadata.get("score", 0)
|
|
||||||
depth = result.metadata.get("depth", 0)
|
|
||||||
print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
|
|
||||||
|
|
||||||
duration = time.perf_counter() - start_time
|
|
||||||
|
|
||||||
# Summarize the results
|
|
||||||
print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
|
|
||||||
print(
|
|
||||||
f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Group by depth
|
|
||||||
depth_counts = {}
|
|
||||||
for result in results:
|
|
||||||
depth = result.metadata.get("depth", 0)
|
|
||||||
depth_counts[depth] = depth_counts.get(depth, 0) + 1
|
|
||||||
|
|
||||||
print("\n📊 Pages crawled by depth:")
|
|
||||||
for depth, count in sorted(depth_counts.items()):
|
|
||||||
print(f" Depth {depth}: {count} pages")
|
|
||||||
|
|
||||||
|
|
||||||
# 5️⃣ Advanced Filters
|
|
||||||
async def advanced_filters():
|
async def advanced_filters():
|
||||||
"""
|
"""
|
||||||
PART 5: Demonstrates advanced filtering techniques for specialized crawling.
|
PART 4: Demonstrates advanced filtering techniques for specialized crawling.
|
||||||
|
|
||||||
This function covers:
|
This function covers:
|
||||||
- SEO filters
|
- SEO filters
|
||||||
@@ -371,11 +297,10 @@ async def advanced_filters():
|
|||||||
relevance_score = result.metadata.get("relevance_score", 0)
|
relevance_score = result.metadata.get("relevance_score", 0)
|
||||||
print(f" → Score: {relevance_score:.2f} | {result.url}")
|
print(f" → Score: {relevance_score:.2f} | {result.url}")
|
||||||
|
|
||||||
|
# 5️⃣ Max Pages and Score Thresholds
|
||||||
# Main function to run the entire tutorial
|
|
||||||
async def max_pages_and_thresholds():
|
async def max_pages_and_thresholds():
|
||||||
"""
|
"""
|
||||||
PART 6: Demonstrates using max_pages and score_threshold parameters with different strategies.
|
PART 5: Demonstrates using max_pages and score_threshold parameters with different strategies.
|
||||||
|
|
||||||
This function shows:
|
This function shows:
|
||||||
- How to limit the number of pages crawled
|
- How to limit the number of pages crawled
|
||||||
@@ -471,6 +396,77 @@ async def max_pages_and_thresholds():
|
|||||||
print(f" ✅ Average score: {avg_score:.2f}")
|
print(f" ✅ Average score: {avg_score:.2f}")
|
||||||
print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
|
print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
|
||||||
|
|
||||||
|
# 6️⃣ Wrap-Up and Key Takeaways
|
||||||
|
async def wrap_up():
    """
    PART 6: Wrap-Up and Key Takeaways

    Runs one streaming best-first deep crawl of https://docs.crawl4ai.com
    that combines a filter chain with a keyword scorer, printing each result
    as it arrives, then prints a summary (page count, elapsed time, average
    score, and number of pages per depth).
    """
    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
    print("Combining filters, scorers, and streaming for an optimized crawl")

    # Create a sophisticated filter chain
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )

    # Score URLs by keyword relevance (a single scorer; nothing is composed here)
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )

    # Set up the configuration
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=1,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
    )

    # Execute the crawl, consuming results as they are streamed
    results = []
    start_time = time.perf_counter()

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")

    duration = time.perf_counter() - start_time

    # Summarize the results
    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
    # Guard against an empty crawl: dividing by len(results) would raise
    # ZeroDivisionError when no page was returned.
    total_score = sum(r.metadata.get('score', 0) for r in results)
    average_score = total_score / len(results) if results else 0.0
    print(
        f"✅ Average score: {average_score:.2f}"
    )

    # Group by depth
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1

    print("\n📊 Pages crawled by depth:")
    for depth, count in sorted(depth_counts.items()):
        print(f" Depth {depth}: {count} pages")
|
||||||
|
|
||||||
|
|
||||||
async def run_tutorial():
|
async def run_tutorial():
|
||||||
"""
|
"""
|
||||||
Executes all tutorial sections in sequence.
|
Executes all tutorial sections in sequence.
|
||||||
@@ -482,12 +478,12 @@ async def run_tutorial():
|
|||||||
|
|
||||||
# Define sections - uncomment to run specific parts during development
|
# Define sections - uncomment to run specific parts during development
|
||||||
tutorial_sections = [
|
tutorial_sections = [
|
||||||
# basic_deep_crawl,
|
basic_deep_crawl,
|
||||||
# stream_vs_nonstream,
|
stream_vs_nonstream,
|
||||||
# filters_and_scorers,
|
filters_and_scorers,
|
||||||
max_pages_and_thresholds, # Added new section
|
max_pages_and_thresholds,
|
||||||
wrap_up,
|
|
||||||
advanced_filters,
|
advanced_filters,
|
||||||
|
wrap_up,
|
||||||
]
|
]
|
||||||
|
|
||||||
for section in tutorial_sections:
|
for section in tutorial_sections:
|
||||||
@@ -497,7 +493,6 @@ async def run_tutorial():
|
|||||||
print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
|
print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
|
||||||
print("For more information, check out https://docs.crawl4ai.com")
|
print("For more information, check out https://docs.crawl4ai.com")
|
||||||
|
|
||||||
|
|
||||||
# Execute the tutorial when run directly
|
# Execute the tutorial when run directly
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(run_tutorial())
|
asyncio.run(run_tutorial())
|
||||||
@@ -11,7 +11,7 @@ import asyncio
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
from crawl4ai.extraction_strategy import (
|
from crawl4ai.extraction_strategy import (
|
||||||
LLMExtractionStrategy,
|
LLMExtractionStrategy,
|
||||||
JsonCssExtractionStrategy,
|
JsonCssExtractionStrategy,
|
||||||
@@ -61,19 +61,19 @@ async def main():
|
|||||||
|
|
||||||
# 1. LLM Extraction with different input formats
|
# 1. LLM Extraction with different input formats
|
||||||
markdown_strategy = LLMExtractionStrategy(
|
markdown_strategy = LLMExtractionStrategy(
|
||||||
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
|
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
|
||||||
instruction="Extract product information including name, price, and description",
|
instruction="Extract product information including name, price, and description",
|
||||||
)
|
)
|
||||||
|
|
||||||
html_strategy = LLMExtractionStrategy(
|
html_strategy = LLMExtractionStrategy(
|
||||||
input_format="html",
|
input_format="html",
|
||||||
llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
|
llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
|
||||||
instruction="Extract product information from HTML including structured data",
|
instruction="Extract product information from HTML including structured data",
|
||||||
)
|
)
|
||||||
|
|
||||||
fit_markdown_strategy = LLMExtractionStrategy(
|
fit_markdown_strategy = LLMExtractionStrategy(
|
||||||
input_format="fit_markdown",
|
input_format="fit_markdown",
|
||||||
llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
|
llm_config=LLMConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
|
||||||
instruction="Extract product information from cleaned markdown",
|
instruction="Extract product information from cleaned markdown",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
|
from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
|
||||||
import asyncio
|
import asyncio
|
||||||
import os
|
import os
|
||||||
@@ -23,7 +23,7 @@ async def main():
|
|||||||
word_count_threshold=1,
|
word_count_threshold=1,
|
||||||
extraction_strategy=LLMExtractionStrategy(
|
extraction_strategy=LLMExtractionStrategy(
|
||||||
# provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
|
# provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
|
||||||
llmConfig=LlmConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
|
llm_config=LLMConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
|
||||||
schema=OpenAIModelFee.model_json_schema(),
|
schema=OpenAIModelFee.model_json_schema(),
|
||||||
extraction_type="schema",
|
extraction_type="schema",
|
||||||
instruction="From the crawled content, extract all mentioned model names along with their "
|
instruction="From the crawled content, extract all mentioned model names along with their "
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import os
|
import os
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||||
|
|
||||||
async def test_llm_filter():
|
async def test_llm_filter():
|
||||||
@@ -23,7 +23,7 @@ async def test_llm_filter():
|
|||||||
|
|
||||||
# Initialize LLM filter with focused instruction
|
# Initialize LLM filter with focused instruction
|
||||||
filter = LLMContentFilter(
|
filter = LLMContentFilter(
|
||||||
llmConfig=LlmConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
|
llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
|
||||||
instruction="""
|
instruction="""
|
||||||
Focus on extracting the core educational content about Python classes.
|
Focus on extracting the core educational content about Python classes.
|
||||||
Include:
|
Include:
|
||||||
@@ -43,7 +43,7 @@ async def test_llm_filter():
|
|||||||
)
|
)
|
||||||
|
|
||||||
filter = LLMContentFilter(
|
filter = LLMContentFilter(
|
||||||
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
||||||
chunk_token_threshold=2 ** 12 * 2, # 2048 * 2
|
chunk_token_threshold=2 ** 12 * 2, # 4096 * 2 = 8192
|
||||||
ignore_cache = True,
|
ignore_cache = True,
|
||||||
instruction="""
|
instruction="""
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import os, sys
|
import os, sys
|
||||||
|
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
|
|
||||||
sys.path.append(
|
sys.path.append(
|
||||||
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
@@ -211,7 +211,7 @@ async def extract_structured_data_using_llm(
|
|||||||
word_count_threshold=1,
|
word_count_threshold=1,
|
||||||
page_timeout=80000,
|
page_timeout=80000,
|
||||||
extraction_strategy=LLMExtractionStrategy(
|
extraction_strategy=LLMExtractionStrategy(
|
||||||
llmConfig=LlmConfig(provider=provider,api_token=api_token),
|
llm_config=LLMConfig(provider=provider,api_token=api_token),
|
||||||
schema=OpenAIModelFee.model_json_schema(),
|
schema=OpenAIModelFee.model_json_schema(),
|
||||||
extraction_type="schema",
|
extraction_type="schema",
|
||||||
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import os, sys
|
import os, sys
|
||||||
|
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
|
|
||||||
# append parent directory to system path
|
# append parent directory to system path
|
||||||
sys.path.append(
|
sys.path.append(
|
||||||
@@ -147,7 +147,7 @@ async def extract_structured_data_using_llm(
|
|||||||
url="https://openai.com/api/pricing/",
|
url="https://openai.com/api/pricing/",
|
||||||
word_count_threshold=1,
|
word_count_threshold=1,
|
||||||
extraction_strategy=LLMExtractionStrategy(
|
extraction_strategy=LLMExtractionStrategy(
|
||||||
llmConfig=LlmConfig(provider=provider,api_token=api_token),
|
llm_config=LLMConfig(provider=provider,api_token=api_token),
|
||||||
schema=OpenAIModelFee.model_json_schema(),
|
schema=OpenAIModelFee.model_json_schema(),
|
||||||
extraction_type="schema",
|
extraction_type="schema",
|
||||||
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
||||||
@@ -570,7 +570,7 @@ async def generate_knowledge_graph():
|
|||||||
relationships: List[Relationship]
|
relationships: List[Relationship]
|
||||||
|
|
||||||
extraction_strategy = LLMExtractionStrategy(
|
extraction_strategy = LLMExtractionStrategy(
|
||||||
llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token"
|
llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token"
|
||||||
schema=KnowledgeGraph.model_json_schema(),
|
schema=KnowledgeGraph.model_json_schema(),
|
||||||
extraction_type="schema",
|
extraction_type="schema",
|
||||||
instruction="""Extract entities and relationships from the given text.""",
|
instruction="""Extract entities and relationships from the given text.""",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
from crawl4ai.web_crawler import WebCrawler
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
from crawl4ai.chunking_strategy import *
|
from crawl4ai.chunking_strategy import *
|
||||||
from crawl4ai.extraction_strategy import *
|
from crawl4ai.extraction_strategy import *
|
||||||
@@ -179,7 +179,7 @@ def add_llm_extraction_strategy(crawler):
|
|||||||
result = crawler.run(
|
result = crawler.run(
|
||||||
url="https://www.nbcnews.com/business",
|
url="https://www.nbcnews.com/business",
|
||||||
extraction_strategy=LLMExtractionStrategy(
|
extraction_strategy=LLMExtractionStrategy(
|
||||||
llmConfig = LlmConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
|
llm_config = LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
cprint(
|
cprint(
|
||||||
@@ -198,7 +198,7 @@ def add_llm_extraction_strategy(crawler):
|
|||||||
result = crawler.run(
|
result = crawler.run(
|
||||||
url="https://www.nbcnews.com/business",
|
url="https://www.nbcnews.com/business",
|
||||||
extraction_strategy=LLMExtractionStrategy(
|
extraction_strategy=LLMExtractionStrategy(
|
||||||
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
|
llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
|
||||||
instruction="I am interested in only financial news",
|
instruction="I am interested in only financial news",
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
@@ -210,7 +210,7 @@ def add_llm_extraction_strategy(crawler):
|
|||||||
result = crawler.run(
|
result = crawler.run(
|
||||||
url="https://www.nbcnews.com/business",
|
url="https://www.nbcnews.com/business",
|
||||||
extraction_strategy=LLMExtractionStrategy(
|
extraction_strategy=LLMExtractionStrategy(
|
||||||
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
|
llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
|
||||||
instruction="Extract only content related to technology",
|
instruction="Extract only content related to technology",
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from crawl4ai.configs import ProxyConfig
|
|||||||
from crawl4ai import RoundRobinProxyStrategy
|
from crawl4ai import RoundRobinProxyStrategy
|
||||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||||
from crawl4ai import DefaultMarkdownGenerator
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||||
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
|
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
@@ -284,9 +284,9 @@ async def llm_content_filter():
|
|||||||
PART 5: LLM Content Filter
|
PART 5: LLM Content Filter
|
||||||
|
|
||||||
This function demonstrates:
|
This function demonstrates:
|
||||||
- Configuring LLM providers via LlmConfig
|
- Configuring LLM providers via LLMConfig
|
||||||
- Using LLM to generate focused markdown
|
- Using LLM to generate focused markdown
|
||||||
- LlmConfig for configuration
|
- LLMConfig for configuration
|
||||||
|
|
||||||
Note: Requires a valid API key for the chosen LLM provider
|
Note: Requires a valid API key for the chosen LLM provider
|
||||||
"""
|
"""
|
||||||
@@ -296,7 +296,7 @@ async def llm_content_filter():
|
|||||||
|
|
||||||
# Create LLM configuration
|
# Create LLM configuration
|
||||||
# Replace with your actual API key or set as environment variable
|
# Replace with your actual API key or set as environment variable
|
||||||
llm_config = LlmConfig(
|
llm_config = LLMConfig(
|
||||||
provider="gemini/gemini-1.5-pro",
|
provider="gemini/gemini-1.5-pro",
|
||||||
api_token="env:GEMINI_API_KEY" # Will read from GEMINI_API_KEY environment variable
|
api_token="env:GEMINI_API_KEY" # Will read from GEMINI_API_KEY environment variable
|
||||||
)
|
)
|
||||||
@@ -309,7 +309,7 @@ async def llm_content_filter():
|
|||||||
# Create markdown generator with LLM filter
|
# Create markdown generator with LLM filter
|
||||||
markdown_generator = DefaultMarkdownGenerator(
|
markdown_generator = DefaultMarkdownGenerator(
|
||||||
content_filter=LLMContentFilter(
|
content_filter=LLMContentFilter(
|
||||||
llmConfig=llm_config,
|
llm_config=llm_config,
|
||||||
instruction="Extract key concepts and summaries"
|
instruction="Extract key concepts and summaries"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@@ -381,7 +381,7 @@ async def llm_schema_generation():
|
|||||||
PART 7: LLM Schema Generation
|
PART 7: LLM Schema Generation
|
||||||
|
|
||||||
This function demonstrates:
|
This function demonstrates:
|
||||||
- Configuring LLM providers via LlmConfig
|
- Configuring LLM providers via LLMConfig
|
||||||
- Using LLM to generate extraction schemas
|
- Using LLM to generate extraction schemas
|
||||||
- JsonCssExtractionStrategy
|
- JsonCssExtractionStrategy
|
||||||
|
|
||||||
@@ -406,9 +406,9 @@ async def llm_schema_generation():
|
|||||||
<div class="rating">4.7/5</div>
|
<div class="rating">4.7/5</div>
|
||||||
</div>
|
</div>
|
||||||
"""
|
"""
|
||||||
print("\n📊 Setting up LlmConfig...")
|
print("\n📊 Setting up LLMConfig...")
|
||||||
# Create LLM configuration
|
# Create LLM configuration
|
||||||
llm_config = LlmConfig(
|
llm_config = LLMConfig(
|
||||||
provider="gemini/gemini-1.5-pro",
|
provider="gemini/gemini-1.5-pro",
|
||||||
api_token="env:GEMINI_API_KEY"
|
api_token="env:GEMINI_API_KEY"
|
||||||
)
|
)
|
||||||
@@ -416,7 +416,7 @@ async def llm_schema_generation():
|
|||||||
print(" This would use the LLM to analyze HTML and create an extraction schema")
|
print(" This would use the LLM to analyze HTML and create an extraction schema")
|
||||||
schema = JsonCssExtractionStrategy.generate_schema(
|
schema = JsonCssExtractionStrategy.generate_schema(
|
||||||
html=sample_html,
|
html=sample_html,
|
||||||
llmConfig = llm_config,
|
llm_config = llm_config,
|
||||||
query="Extract product name and price"
|
query="Extract product name and price"
|
||||||
)
|
)
|
||||||
print("\n✅ Generated Schema:")
|
print("\n✅ Generated Schema:")
|
||||||
|
|||||||
@@ -245,8 +245,8 @@ run_config = CrawlerRunConfig(
|
|||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
# 3. **LlmConfig** - Setting up LLM providers
|
# 3. **LLMConfig** - Setting up LLM providers
|
||||||
LlmConfig is useful to pass LLM provider config to strategies and functions that rely on LLMs to do extraction, filtering, schema generation etc. Currently it can be used in the following -
|
LLMConfig is useful for passing LLM provider configuration to strategies and functions that rely on LLMs for extraction, filtering, schema generation, etc. It can currently be used with the following:
|
||||||
|
|
||||||
1. LLMExtractionStrategy
|
1. LLMExtractionStrategy
|
||||||
2. LLMContentFilter
|
2. LLMContentFilter
|
||||||
@@ -262,7 +262,7 @@ LlmConfig is useful to pass LLM provider config to strategies and functions that
|
|||||||
|
|
||||||
## 3.2 Example Usage
|
## 3.2 Example Usage
|
||||||
```python
|
```python
|
||||||
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
|
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
|
||||||
```
|
```
|
||||||
|
|
||||||
## 4. Putting It All Together
|
## 4. Putting It All Together
|
||||||
@@ -270,7 +270,7 @@ llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI
|
|||||||
- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.
|
- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.
|
||||||
- **Use** `CrawlerRunConfig` for each crawl’s **context**: how to filter content, handle caching, wait for dynamic elements, or run JS.
|
- **Use** `CrawlerRunConfig` for each crawl’s **context**: how to filter content, handle caching, wait for dynamic elements, or run JS.
|
||||||
- **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`).
|
- **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`).
|
||||||
- **Use** `LlmConfig` for LLM provider configurations that can be used across all extraction, filtering, schema generation tasks. Can be used in - `LLMExtractionStrategy`, `LLMContentFilter`, `JsonCssExtractionStrategy.generate_schema` & `JsonXPathExtractionStrategy.generate_schema`
|
- **Use** `LLMConfig` for LLM provider configurations that can be used across all extraction, filtering, schema generation tasks. Can be used in - `LLMExtractionStrategy`, `LLMContentFilter`, `JsonCssExtractionStrategy.generate_schema` & `JsonXPathExtractionStrategy.generate_schema`
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Create a modified copy with the clone() method
|
# Create a modified copy with the clone() method
|
||||||
|
|||||||
@@ -131,7 +131,7 @@ OverlappingWindowChunking(
|
|||||||
```python
|
```python
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
|
|
||||||
# Define schema
|
# Define schema
|
||||||
class Article(BaseModel):
|
class Article(BaseModel):
|
||||||
@@ -141,7 +141,7 @@ class Article(BaseModel):
|
|||||||
|
|
||||||
# Create strategy
|
# Create strategy
|
||||||
strategy = LLMExtractionStrategy(
|
strategy = LLMExtractionStrategy(
|
||||||
llmConfig = LlmConfig(provider="ollama/llama2"),
|
llm_config = LLMConfig(provider="ollama/llama2"),
|
||||||
schema=Article.schema(),
|
schema=Article.schema(),
|
||||||
instruction="Extract article details"
|
instruction="Extract article details"
|
||||||
)
|
)
|
||||||
@@ -198,7 +198,7 @@ result = await crawler.arun(
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.chunking_strategy import OverlappingWindowChunking
|
from crawl4ai.chunking_strategy import OverlappingWindowChunking
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
|
|
||||||
# Create chunking strategy
|
# Create chunking strategy
|
||||||
chunker = OverlappingWindowChunking(
|
chunker = OverlappingWindowChunking(
|
||||||
@@ -208,7 +208,7 @@ chunker = OverlappingWindowChunking(
|
|||||||
|
|
||||||
# Use with extraction strategy
|
# Use with extraction strategy
|
||||||
strategy = LLMExtractionStrategy(
|
strategy = LLMExtractionStrategy(
|
||||||
llmConfig = LlmConfig(provider="ollama/llama2"),
|
llm_config = LLMConfig(provider="ollama/llama2"),
|
||||||
chunking_strategy=chunker
|
chunking_strategy=chunker
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ My dear friends and crawlers, there you go, this is the release of Crawl4AI v0.5
|
|||||||
* **Multiple Crawler Strategies:** Choose between the full-featured Playwright browser-based crawler or a new, *much* faster HTTP-only crawler for simpler tasks.
|
* **Multiple Crawler Strategies:** Choose between the full-featured Playwright browser-based crawler or a new, *much* faster HTTP-only crawler for simpler tasks.
|
||||||
* **Docker Deployment:** Deploy Crawl4AI as a scalable, self-contained service with built-in API endpoints and optional JWT authentication.
|
* **Docker Deployment:** Deploy Crawl4AI as a scalable, self-contained service with built-in API endpoints and optional JWT authentication.
|
||||||
* **Command-Line Interface (CLI):** Interact with Crawl4AI directly from your terminal. Crawl, configure, and extract data with simple commands.
|
* **Command-Line Interface (CLI):** Interact with Crawl4AI directly from your terminal. Crawl, configure, and extract data with simple commands.
|
||||||
* **LLM Configuration (`LlmConfig`):** A new, unified way to configure LLM providers (OpenAI, Anthropic, Ollama, etc.) for extraction, filtering, and schema generation. Simplifies API key management and switching between models.
|
* **LLM Configuration (`LLMConfig`):** A new, unified way to configure LLM providers (OpenAI, Anthropic, Ollama, etc.) for extraction, filtering, and schema generation. Simplifies API key management and switching between models.
|
||||||
|
|
||||||
**Minor Updates & Improvements:**
|
**Minor Updates & Improvements:**
|
||||||
|
|
||||||
@@ -47,7 +47,7 @@ This release includes several breaking changes to improve the library's structur
|
|||||||
* **Config**: FastFilterChain has been replaced with FilterChain
|
* **Config**: FastFilterChain has been replaced with FilterChain
|
||||||
* **Deep-Crawl**: DeepCrawlStrategy.arun now returns Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
* **Deep-Crawl**: DeepCrawlStrategy.arun now returns Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||||
* **Proxy**: Removed synchronous WebCrawler support and related rate limiting configurations
|
* **Proxy**: Removed synchronous WebCrawler support and related rate limiting configurations
|
||||||
* **LLM Parameters:** Use the new `LlmConfig` object instead of passing `provider`, `api_token`, `base_url`, and `api_base` directly to `LLMExtractionStrategy` and `LLMContentFilter`.
|
* **LLM Parameters:** Use the new `LLMConfig` object instead of passing `provider`, `api_token`, `base_url`, and `api_base` directly to `LLMExtractionStrategy` and `LLMContentFilter`.
|
||||||
|
|
||||||
**In short:** Update imports, adjust `arun_many()` usage, check for optional fields, and review the Docker deployment guide.
|
**In short:** Update imports, adjust `arun_many()` usage, check for optional fields, and review the Docker deployment guide.
|
||||||
|
|
||||||
|
|||||||
@@ -305,13 +305,13 @@ asyncio.run(main())
|
|||||||
```python
|
```python
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
|
||||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
llm_config = LlmConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
||||||
|
|
||||||
markdown_generator = DefaultMarkdownGenerator(
|
markdown_generator = DefaultMarkdownGenerator(
|
||||||
content_filter=LLMContentFilter(llmConfig=llm_config, instruction="Extract key concepts and summaries")
|
content_filter=LLMContentFilter(llm_config=llm_config, instruction="Extract key concepts and summaries")
|
||||||
)
|
)
|
||||||
|
|
||||||
config = CrawlerRunConfig(markdown_generator=markdown_generator)
|
config = CrawlerRunConfig(markdown_generator=markdown_generator)
|
||||||
@@ -335,13 +335,13 @@ asyncio.run(main())
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
|
|
||||||
llm_config = LlmConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
||||||
|
|
||||||
schema = JsonCssExtractionStrategy.generate_schema(
|
schema = JsonCssExtractionStrategy.generate_schema(
|
||||||
html="<div class='product'><h2>Product Name</h2><span class='price'>$99</span></div>",
|
html="<div class='product'><h2>Product Name</h2><span class='price'>$99</span></div>",
|
||||||
llmConfig = llm_config,
|
llm_config = llm_config,
|
||||||
query="Extract product name and price"
|
query="Extract product name and price"
|
||||||
)
|
)
|
||||||
print(schema)
|
print(schema)
|
||||||
@@ -394,20 +394,20 @@ print(schema)
|
|||||||
serialization, especially for sets of allowed/blocked domains. No code changes
|
serialization, especially for sets of allowed/blocked domains. No code changes
|
||||||
required.
|
required.
|
||||||
|
|
||||||
- **Added: New `LlmConfig` parameter.** This new parameter can be passed for
|
- **Added: New `LLMConfig` parameter.** This new parameter can be passed for
|
||||||
extraction, filtering, and schema generation tasks. It simplifies passing
|
extraction, filtering, and schema generation tasks. It simplifies passing
|
||||||
provider strings, API tokens, and base URLs across all sections where LLM
|
provider strings, API tokens, and base URLs across all sections where LLM
|
||||||
configuration is necessary. It also enables reuse and allows for quick
|
configuration is necessary. It also enables reuse and allows for quick
|
||||||
experimentation between different LLM configurations.
|
experimentation between different LLM configurations.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
|
|
||||||
# Example of using LlmConfig with LLMExtractionStrategy
|
# Example of using LLMConfig with LLMExtractionStrategy
|
||||||
llm_config = LlmConfig(provider="openai/gpt-4o", api_token="YOUR_API_KEY")
|
llm_config = LLMConfig(provider="openai/gpt-4o", api_token="YOUR_API_KEY")
|
||||||
strategy = LLMExtractionStrategy(llmConfig=llm_config, schema=...)
|
strategy = LLMExtractionStrategy(llm_config=llm_config, schema=...)
|
||||||
|
|
||||||
# Example usage within a crawler
|
# Example usage within a crawler
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
@@ -418,7 +418,7 @@ print(schema)
|
|||||||
```
|
```
|
||||||
**Breaking Change:** Removed old parameters like `provider`, `api_token`,
|
**Breaking Change:** Removed old parameters like `provider`, `api_token`,
|
||||||
`base_url`, and `api_base` from `LLMExtractionStrategy` and
|
`base_url`, and `api_base` from `LLMExtractionStrategy` and
|
||||||
`LLMContentFilter`. Users should migrate to using the `LlmConfig` object.
|
`LLMContentFilter`. Users should migrate to using the `LLMConfig` object.
|
||||||
|
|
||||||
- **Changed: Improved browser context management and added shared data support.**
|
- **Changed: Improved browser context management and added shared data support.**
|
||||||
(**Breaking Change:** `BrowserContext` API updated). Browser contexts are now
|
(**Breaking Change:** `BrowserContext` API updated). Browser contexts are now
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ Crawl4AI’s flexibility stems from two key classes:
|
|||||||
|
|
||||||
1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent).
|
1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent).
|
||||||
2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.).
|
2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.).
|
||||||
3. **`LlmConfig`** - Dictates **how** LLM providers are configured. (model, api token, base url, temperature etc.)
|
3. **`LLMConfig`** - Dictates **how** LLM providers are configured. (model, api token, base url, temperature etc.)
|
||||||
|
|
||||||
In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md).
|
In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md).
|
||||||
|
|
||||||
@@ -239,7 +239,7 @@ The `clone()` method:
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
## 3. LlmConfig Essentials
|
## 3. LLMConfig Essentials
|
||||||
|
|
||||||
### Key fields to note
|
### Key fields to note
|
||||||
|
|
||||||
@@ -256,16 +256,16 @@ The `clone()` method:
|
|||||||
- If your provider has a custom endpoint
|
- If your provider has a custom endpoint
|
||||||
|
|
||||||
```python
|
```python
|
||||||
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
|
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
|
||||||
```
|
```
|
||||||
|
|
||||||
## 4. Putting It All Together
|
## 4. Putting It All Together
|
||||||
|
|
||||||
In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LlmConfig` depending on each call’s needs:
|
In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` depending on each call’s needs:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
@@ -289,14 +289,14 @@ async def main():
|
|||||||
|
|
||||||
# 3) Example LLM content filtering
|
# 3) Example LLM content filtering
|
||||||
|
|
||||||
gemini_config = LlmConfig(
|
gemini_config = LLMConfig(
|
||||||
provider="gemini/gemini-1.5-pro",
|
provider="gemini/gemini-1.5-pro",
|
||||||
api_token = "env:GEMINI_API_TOKEN"
|
api_token = "env:GEMINI_API_TOKEN"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Initialize LLM filter with specific instruction
|
# Initialize LLM filter with specific instruction
|
||||||
filter = LLMContentFilter(
|
filter = LLMContentFilter(
|
||||||
llmConfig=gemini_config, # or your preferred provider
|
llm_config=gemini_config, # or your preferred provider
|
||||||
instruction="""
|
instruction="""
|
||||||
Focus on extracting the core educational content.
|
Focus on extracting the core educational content.
|
||||||
Include:
|
Include:
|
||||||
@@ -343,7 +343,7 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
For a **detailed list** of available parameters (including advanced ones), see:
|
For a **detailed list** of available parameters (including advanced ones), see:
|
||||||
|
|
||||||
- [BrowserConfig, CrawlerRunConfig & LlmConfig Reference](../api/parameters.md)
|
- [BrowserConfig, CrawlerRunConfig & LLMConfig Reference](../api/parameters.md)
|
||||||
|
|
||||||
You can explore topics like:
|
You can explore topics like:
|
||||||
|
|
||||||
@@ -356,7 +356,7 @@ You can explore topics like:
|
|||||||
|
|
||||||
## 6. Conclusion
|
## 6. Conclusion
|
||||||
|
|
||||||
**BrowserConfig**, **CrawlerRunConfig** and **LlmConfig** give you straightforward ways to define:
|
**BrowserConfig**, **CrawlerRunConfig** and **LLMConfig** give you straightforward ways to define:
|
||||||
|
|
||||||
- **Which** browser to launch, how it should run, and any proxy or user agent needs.
|
- **Which** browser to launch, how it should run, and any proxy or user agent needs.
|
||||||
- **How** each crawl should behave—caching, timeouts, JavaScript code, extraction strategies, etc.
|
- **How** each crawl should behave—caching, timeouts, JavaScript code, extraction strategies, etc.
|
||||||
|
|||||||
@@ -211,7 +211,7 @@ if __name__ == "__main__":
|
|||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LlmConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||||
|
|
||||||
class ArticleData(BaseModel):
|
class ArticleData(BaseModel):
|
||||||
@@ -220,7 +220,7 @@ class ArticleData(BaseModel):
|
|||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
llm_strategy = LLMExtractionStrategy(
|
llm_strategy = LLMExtractionStrategy(
|
||||||
llmConfig = LlmConfig(provider="openai/gpt-4",api_token="sk-YOUR_API_KEY")
|
llm_config = LLMConfig(provider="openai/gpt-4",api_token="sk-YOUR_API_KEY"),
|
||||||
schema=ArticleData.schema(),
|
schema=ArticleData.schema(),
|
||||||
extraction_type="schema",
|
extraction_type="schema",
|
||||||
instruction="Extract 'headline' and a short 'summary' from the content."
|
instruction="Extract 'headline' and a short 'summary' from the content."
|
||||||
|
|||||||
@@ -175,13 +175,13 @@ prune_filter = PruningContentFilter(
|
|||||||
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
|
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LlmConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
|
||||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# Initialize LLM filter with specific instruction
|
# Initialize LLM filter with specific instruction
|
||||||
filter = LLMContentFilter(
|
filter = LLMContentFilter(
|
||||||
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-api-token"), #or use environment variable
|
llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-api-token"), #or use environment variable
|
||||||
instruction="""
|
instruction="""
|
||||||
Focus on extracting the core educational content.
|
Focus on extracting the core educational content.
|
||||||
Include:
|
Include:
|
||||||
|
|||||||
@@ -128,7 +128,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
|
|
||||||
# Generate a schema (one-time cost)
|
# Generate a schema (one-time cost)
|
||||||
html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>"
|
html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>"
|
||||||
@@ -136,13 +136,13 @@ html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</
|
|||||||
# Using OpenAI (requires API token)
|
# Using OpenAI (requires API token)
|
||||||
schema = JsonCssExtractionStrategy.generate_schema(
|
schema = JsonCssExtractionStrategy.generate_schema(
|
||||||
html,
|
html,
|
||||||
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-openai-token") # Required for OpenAI
|
llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-openai-token") # Required for OpenAI
|
||||||
)
|
)
|
||||||
|
|
||||||
# Or using Ollama (open source, no token needed)
|
# Or using Ollama (open source, no token needed)
|
||||||
schema = JsonCssExtractionStrategy.generate_schema(
|
schema = JsonCssExtractionStrategy.generate_schema(
|
||||||
html,
|
html,
|
||||||
llmConfig = LlmConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
|
llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
|
||||||
)
|
)
|
||||||
|
|
||||||
# Use the schema for fast, repeated extractions
|
# Use the schema for fast, repeated extractions
|
||||||
@@ -211,7 +211,7 @@ import os
|
|||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LlmConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||||
|
|
||||||
class OpenAIModelFee(BaseModel):
|
class OpenAIModelFee(BaseModel):
|
||||||
@@ -241,7 +241,7 @@ async def extract_structured_data_using_llm(
|
|||||||
word_count_threshold=1,
|
word_count_threshold=1,
|
||||||
page_timeout=80000,
|
page_timeout=80000,
|
||||||
extraction_strategy=LLMExtractionStrategy(
|
extraction_strategy=LLMExtractionStrategy(
|
||||||
llmConfig = LlmConfig(provider=provider,api_token=api_token),
|
llm_config = LLMConfig(provider=provider,api_token=api_token),
|
||||||
schema=OpenAIModelFee.model_json_schema(),
|
schema=OpenAIModelFee.model_json_schema(),
|
||||||
extraction_type="schema",
|
extraction_type="schema",
|
||||||
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
||||||
|
|||||||
@@ -71,7 +71,7 @@ Below is an overview of important LLM extraction parameters. All are typically s
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
extraction_strategy = LLMExtractionStrategy(
|
extraction_strategy = LLMExtractionStrategy(
|
||||||
llmConfig = LlmConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"),
|
llm_config = LLMConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"),
|
||||||
schema=MyModel.model_json_schema(),
|
schema=MyModel.model_json_schema(),
|
||||||
extraction_type="schema",
|
extraction_type="schema",
|
||||||
instruction="Extract a list of items from the text with 'name' and 'price' fields.",
|
instruction="Extract a list of items from the text with 'name' and 'price' fields.",
|
||||||
@@ -96,7 +96,7 @@ import asyncio
|
|||||||
import json
|
import json
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from typing import List
|
from typing import List
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||||
|
|
||||||
class Product(BaseModel):
|
class Product(BaseModel):
|
||||||
@@ -106,7 +106,7 @@ class Product(BaseModel):
|
|||||||
async def main():
|
async def main():
|
||||||
# 1. Define the LLM extraction strategy
|
# 1. Define the LLM extraction strategy
|
||||||
llm_strategy = LLMExtractionStrategy(
|
llm_strategy = LLMExtractionStrategy(
|
||||||
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
|
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
|
||||||
schema=Product.schema_json(), # Or use model_json_schema()
|
schema=Product.schema_json(), # Or use model_json_schema()
|
||||||
extraction_type="schema",
|
extraction_type="schema",
|
||||||
instruction="Extract all product objects with 'name' and 'price' from the content.",
|
instruction="Extract all product objects with 'name' and 'price' from the content.",
|
||||||
|
|||||||
@@ -415,7 +415,7 @@ The schema generator is available as a static method on both `JsonCssExtractionS
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
|
|
||||||
# Sample HTML with product information
|
# Sample HTML with product information
|
||||||
html = """
|
html = """
|
||||||
@@ -435,14 +435,14 @@ html = """
|
|||||||
css_schema = JsonCssExtractionStrategy.generate_schema(
|
css_schema = JsonCssExtractionStrategy.generate_schema(
|
||||||
html,
|
html,
|
||||||
schema_type="css",
|
schema_type="css",
|
||||||
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-openai-token")
|
llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-openai-token")
|
||||||
)
|
)
|
||||||
|
|
||||||
# Option 2: Using Ollama (open source, no token needed)
|
# Option 2: Using Ollama (open source, no token needed)
|
||||||
xpath_schema = JsonXPathExtractionStrategy.generate_schema(
|
xpath_schema = JsonXPathExtractionStrategy.generate_schema(
|
||||||
html,
|
html,
|
||||||
schema_type="xpath",
|
schema_type="xpath",
|
||||||
llmConfig = LlmConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
|
llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
|
||||||
)
|
)
|
||||||
|
|
||||||
# Use the generated schema for fast, repeated extractions
|
# Use the generated schema for fast, repeated extractions
|
||||||
|
|||||||
78
docs/snippets/deep_crawl/intro.py
Normal file
78
docs/snippets/deep_crawl/intro.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
import asyncio
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from crawl4ai import (
|
||||||
|
AsyncWebCrawler,
|
||||||
|
CrawlerRunConfig,
|
||||||
|
BFSDeepCrawlStrategy,
|
||||||
|
CrawlResult,
|
||||||
|
FilterChain,
|
||||||
|
DomainFilter,
|
||||||
|
URLPatternFilter,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Import necessary classes from crawl4ai library:
|
||||||
|
# - AsyncWebCrawler: The main class for web crawling.
|
||||||
|
# - CrawlerRunConfig: Configuration class for crawler behavior.
|
||||||
|
# - BFSDeepCrawlStrategy: Breadth-First Search deep crawling strategy.
|
||||||
|
# - CrawlResult: Data model for individual crawl results.
|
||||||
|
# - FilterChain: Used to chain multiple URL filters.
|
||||||
|
# - URLPatternFilter: Filter URLs based on patterns.
|
||||||
|
# FilterChain and URLPatternFilter can also be imported from crawl4ai.deep_crawling.filters,
|
||||||
|
# but for simplicity and consistency this example imports them directly from crawl4ai, which re-exports them in __init__.py.
|
||||||
|
|
||||||
|
async def basic_deep_crawl():
|
||||||
|
"""
|
||||||
|
Performs a basic deep crawl starting from a seed URL, demonstrating:
|
||||||
|
- Breadth-First Search (BFS) deep crawling strategy.
|
||||||
|
- Filtering URLs based on URL patterns.
|
||||||
|
- Accessing crawl results and metadata.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# 1. Define URL Filters:
|
||||||
|
# Create a URLPatternFilter to include only URLs containing "text".
|
||||||
|
# This filter will be used to restrict crawling to URLs that are likely to contain textual content.
|
||||||
|
url_filter = URLPatternFilter(
|
||||||
|
patterns=[
|
||||||
|
"*text*", # Include URLs that contain "text" in their path or URL
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create a DomainFilter to allow only URLs from the "groq.com" domain and block URLs from the "example.com" domain.
|
||||||
|
# This filter will be used to restrict crawling to URLs within the "groq.com" domain.
|
||||||
|
domain_filter = DomainFilter(
|
||||||
|
allowed_domains=["groq.com"],
|
||||||
|
blocked_domains=["example.com"],
|
||||||
|
)
|
||||||
|
|
||||||
|
# 2. Configure CrawlerRunConfig for Deep Crawling:
|
||||||
|
# Configure CrawlerRunConfig to use BFSDeepCrawlStrategy for deep crawling.
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||||
|
max_depth=2, # Set the maximum depth of crawling to 2 levels from the start URL
|
||||||
|
max_pages=10, # Limit the total number of pages to crawl to 10, to prevent excessive crawling
|
||||||
|
include_external=False, # Set to False to only crawl URLs within the same domain as the start URL
|
||||||
|
filter_chain=FilterChain(filters=[url_filter, domain_filter]), # Apply the URLPatternFilter and DomainFilter to filter URLs during deep crawl
|
||||||
|
),
|
||||||
|
verbose=True, # Enable verbose logging to see detailed output during crawling
|
||||||
|
)
|
||||||
|
|
||||||
|
# 3. Initialize and Run AsyncWebCrawler:
|
||||||
|
# Use AsyncWebCrawler as a context manager for automatic start and close.
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
results: List[CrawlResult] = await crawler.arun(
|
||||||
|
# url="https://docs.crawl4ai.com", # Uncomment to use crawl4ai documentation as start URL
|
||||||
|
url="https://console.groq.com/docs", # Set the start URL for deep crawling to Groq documentation
|
||||||
|
config=config, # Pass the configured CrawlerRunConfig to arun method
|
||||||
|
)
|
||||||
|
|
||||||
|
# 4. Process and Print Crawl Results:
|
||||||
|
# Iterate through the list of CrawlResult objects returned by the deep crawl.
|
||||||
|
for result in results:
|
||||||
|
# Print the URL and its crawl depth from the metadata for each crawled URL.
|
||||||
|
print(f"URL: {result.url}, Depth: {result.metadata.get('depth', 0)}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import asyncio
|
||||||
|
asyncio.run(basic_deep_crawl())
|
||||||
@@ -78,7 +78,7 @@ crawl4ai-download-models = "crawl4ai.model_loader:main"
|
|||||||
crawl4ai-migrate = "crawl4ai.migrations:main"
|
crawl4ai-migrate = "crawl4ai.migrations:main"
|
||||||
crawl4ai-setup = "crawl4ai.install:post_install"
|
crawl4ai-setup = "crawl4ai.install:post_install"
|
||||||
crawl4ai-doctor = "crawl4ai.install:doctor"
|
crawl4ai-doctor = "crawl4ai.install:doctor"
|
||||||
crwl = "crawl4ai.cli:cli"
|
crwl = "crawl4ai.cli:main"
|
||||||
|
|
||||||
[tool.setuptools]
|
[tool.setuptools]
|
||||||
packages = {find = {where = ["."], include = ["crawl4ai*"]}}
|
packages = {find = {where = ["."], include = ["crawl4ai*"]}}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import os
|
import os
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||||
|
|
||||||
async def test_llm_filter():
|
async def test_llm_filter():
|
||||||
@@ -23,7 +23,7 @@ async def test_llm_filter():
|
|||||||
|
|
||||||
# Initialize LLM filter with focused instruction
|
# Initialize LLM filter with focused instruction
|
||||||
filter = LLMContentFilter(
|
filter = LLMContentFilter(
|
||||||
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
||||||
instruction="""
|
instruction="""
|
||||||
Focus on extracting the core educational content about Python classes.
|
Focus on extracting the core educational content about Python classes.
|
||||||
Include:
|
Include:
|
||||||
@@ -43,7 +43,7 @@ async def test_llm_filter():
|
|||||||
)
|
)
|
||||||
|
|
||||||
filter = LLMContentFilter(
|
filter = LLMContentFilter(
|
||||||
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
llm_config = LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
||||||
chunk_token_threshold=2 ** 12 * 2, # 4096 * 2 = 8192
|
chunk_token_threshold=2 ** 12 * 2, # 4096 * 2 = 8192
|
||||||
instruction="""
|
instruction="""
|
||||||
Extract the main educational content while preserving its original wording and substance completely. Your task is to:
|
Extract the main educational content while preserving its original wording and substance completely. Your task is to:
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ import json
|
|||||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
sys.path.append(parent_dir)
|
sys.path.append(parent_dir)
|
||||||
|
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||||
from crawl4ai.chunking_strategy import RegexChunking
|
from crawl4ai.chunking_strategy import RegexChunking
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||||
@@ -49,7 +49,7 @@ async def test_llm_extraction_strategy():
|
|||||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||||
url = "https://www.nbcnews.com/business"
|
url = "https://www.nbcnews.com/business"
|
||||||
extraction_strategy = LLMExtractionStrategy(
|
extraction_strategy = LLMExtractionStrategy(
|
||||||
llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
|
llm_config=LLMConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
|
||||||
instruction="Extract only content related to technology",
|
instruction="Extract only content related to technology",
|
||||||
)
|
)
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ from crawl4ai import (
|
|||||||
BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator,
|
BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator,
|
||||||
PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode
|
PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode
|
||||||
)
|
)
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||||
|
|
||||||
class Crawl4AiTester:
|
class Crawl4AiTester:
|
||||||
@@ -143,7 +143,7 @@ async def test_with_client():
|
|||||||
cache_mode=CacheMode.BYPASS,
|
cache_mode=CacheMode.BYPASS,
|
||||||
markdown_generator=DefaultMarkdownGenerator(
|
markdown_generator=DefaultMarkdownGenerator(
|
||||||
content_filter=LLMContentFilter(
|
content_filter=LLMContentFilter(
|
||||||
llmConfig=LlmConfig(provider="openai/gpt-40"),
|
llm_config=LLMConfig(provider="openai/gpt-40"),
|
||||||
instruction="Extract key technical concepts"
|
instruction="Extract key technical concepts"
|
||||||
)
|
)
|
||||||
),
|
),
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import inspect
|
|||||||
from typing import Any, Dict
|
from typing import Any, Dict
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
|
|
||||||
def to_serializable_dict(obj: Any) -> Dict:
|
def to_serializable_dict(obj: Any) -> Dict:
|
||||||
"""
|
"""
|
||||||
@@ -224,7 +224,7 @@ if __name__ == "__main__":
|
|||||||
config3 = CrawlerRunConfig(
|
config3 = CrawlerRunConfig(
|
||||||
markdown_generator=DefaultMarkdownGenerator(
|
markdown_generator=DefaultMarkdownGenerator(
|
||||||
content_filter=LLMContentFilter(
|
content_filter=LLMContentFilter(
|
||||||
llmConfig = LlmConfig(provider="openai/gpt-4"),
|
llm_config = LLMConfig(provider="openai/gpt-4"),
|
||||||
instruction="Extract key technical concepts",
|
instruction="Extract key technical concepts",
|
||||||
chunk_token_threshold=2000,
|
chunk_token_threshold=2000,
|
||||||
overlap_rate=0.1
|
overlap_rate=0.1
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import unittest, os
|
import unittest, os
|
||||||
from crawl4ai.async_configs import LlmConfig
|
from crawl4ai.types import LLMConfig
|
||||||
from crawl4ai.web_crawler import WebCrawler
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
from crawl4ai.chunking_strategy import (
|
from crawl4ai.chunking_strategy import (
|
||||||
RegexChunking,
|
RegexChunking,
|
||||||
@@ -43,7 +43,7 @@ class TestWebCrawler(unittest.TestCase):
|
|||||||
word_count_threshold=5,
|
word_count_threshold=5,
|
||||||
chunking_strategy=FixedLengthWordChunking(chunk_size=100),
|
chunking_strategy=FixedLengthWordChunking(chunk_size=100),
|
||||||
extraction_strategy=LLMExtractionStrategy(
|
extraction_strategy=LLMExtractionStrategy(
|
||||||
llmConfig=LlmConfig(provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY"))
|
llm_config=LLMConfig(provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY"))
|
||||||
),
|
),
|
||||||
bypass_cache=True,
|
bypass_cache=True,
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user