Merge branch 'next' into 2025-MAY-2
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -265,3 +265,6 @@ tests/**/benchmark_reports
|
|||||||
|
|
||||||
docs/**/data
|
docs/**/data
|
||||||
.codecat/
|
.codecat/
|
||||||
|
|
||||||
|
docs/apps/linkdin/debug*/
|
||||||
|
docs/apps/linkdin/samples/insights/*
|
||||||
@@ -66,6 +66,11 @@ from .deep_crawling import (
|
|||||||
DeepCrawlDecorator,
|
DeepCrawlDecorator,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from .utils import (
|
||||||
|
start_colab_display_server,
|
||||||
|
setup_colab_environment
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"AsyncLoggerBase",
|
"AsyncLoggerBase",
|
||||||
"AsyncLogger",
|
"AsyncLogger",
|
||||||
@@ -124,7 +129,9 @@ __all__ = [
|
|||||||
"Crawl4aiDockerClient",
|
"Crawl4aiDockerClient",
|
||||||
"ProxyRotationStrategy",
|
"ProxyRotationStrategy",
|
||||||
"RoundRobinProxyStrategy",
|
"RoundRobinProxyStrategy",
|
||||||
"ProxyConfig"
|
"ProxyConfig",
|
||||||
|
"start_colab_display_server",
|
||||||
|
"setup_colab_environment",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -764,6 +764,9 @@ class CrawlerRunConfig():
|
|||||||
Default: 60000 (60 seconds).
|
Default: 60000 (60 seconds).
|
||||||
wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
|
wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
|
||||||
Default: None.
|
Default: None.
|
||||||
|
wait_for_timeout (int or None): Specific timeout in ms for the wait_for condition.
|
||||||
|
If None, uses page_timeout instead.
|
||||||
|
Default: None.
|
||||||
wait_for_images (bool): If True, wait for images to load before extracting content.
|
wait_for_images (bool): If True, wait for images to load before extracting content.
|
||||||
Default: False.
|
Default: False.
|
||||||
delay_before_return_html (float): Delay in seconds before retrieving final HTML.
|
delay_before_return_html (float): Delay in seconds before retrieving final HTML.
|
||||||
@@ -904,6 +907,7 @@ class CrawlerRunConfig():
|
|||||||
wait_until: str = "domcontentloaded",
|
wait_until: str = "domcontentloaded",
|
||||||
page_timeout: int = PAGE_TIMEOUT,
|
page_timeout: int = PAGE_TIMEOUT,
|
||||||
wait_for: str = None,
|
wait_for: str = None,
|
||||||
|
wait_for_timeout: int = None,
|
||||||
wait_for_images: bool = False,
|
wait_for_images: bool = False,
|
||||||
delay_before_return_html: float = 0.1,
|
delay_before_return_html: float = 0.1,
|
||||||
mean_delay: float = 0.1,
|
mean_delay: float = 0.1,
|
||||||
@@ -1000,6 +1004,7 @@ class CrawlerRunConfig():
|
|||||||
self.wait_until = wait_until
|
self.wait_until = wait_until
|
||||||
self.page_timeout = page_timeout
|
self.page_timeout = page_timeout
|
||||||
self.wait_for = wait_for
|
self.wait_for = wait_for
|
||||||
|
self.wait_for_timeout = wait_for_timeout
|
||||||
self.wait_for_images = wait_for_images
|
self.wait_for_images = wait_for_images
|
||||||
self.delay_before_return_html = delay_before_return_html
|
self.delay_before_return_html = delay_before_return_html
|
||||||
self.mean_delay = mean_delay
|
self.mean_delay = mean_delay
|
||||||
@@ -1141,6 +1146,7 @@ class CrawlerRunConfig():
|
|||||||
wait_until=kwargs.get("wait_until", "domcontentloaded"),
|
wait_until=kwargs.get("wait_until", "domcontentloaded"),
|
||||||
page_timeout=kwargs.get("page_timeout", 60000),
|
page_timeout=kwargs.get("page_timeout", 60000),
|
||||||
wait_for=kwargs.get("wait_for"),
|
wait_for=kwargs.get("wait_for"),
|
||||||
|
wait_for_timeout=kwargs.get("wait_for_timeout"),
|
||||||
wait_for_images=kwargs.get("wait_for_images", False),
|
wait_for_images=kwargs.get("wait_for_images", False),
|
||||||
delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
|
delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
|
||||||
mean_delay=kwargs.get("mean_delay", 0.1),
|
mean_delay=kwargs.get("mean_delay", 0.1),
|
||||||
@@ -1250,6 +1256,7 @@ class CrawlerRunConfig():
|
|||||||
"wait_until": self.wait_until,
|
"wait_until": self.wait_until,
|
||||||
"page_timeout": self.page_timeout,
|
"page_timeout": self.page_timeout,
|
||||||
"wait_for": self.wait_for,
|
"wait_for": self.wait_for,
|
||||||
|
"wait_for_timeout": self.wait_for_timeout,
|
||||||
"wait_for_images": self.wait_for_images,
|
"wait_for_images": self.wait_for_images,
|
||||||
"delay_before_return_html": self.delay_before_return_html,
|
"delay_before_return_html": self.delay_before_return_html,
|
||||||
"mean_delay": self.mean_delay,
|
"mean_delay": self.mean_delay,
|
||||||
@@ -1329,7 +1336,7 @@ class LLMConfig:
|
|||||||
provider: str = DEFAULT_PROVIDER,
|
provider: str = DEFAULT_PROVIDER,
|
||||||
api_token: Optional[str] = None,
|
api_token: Optional[str] = None,
|
||||||
base_url: Optional[str] = None,
|
base_url: Optional[str] = None,
|
||||||
temprature: Optional[float] = None,
|
temperature: Optional[float] = None,
|
||||||
max_tokens: Optional[int] = None,
|
max_tokens: Optional[int] = None,
|
||||||
top_p: Optional[float] = None,
|
top_p: Optional[float] = None,
|
||||||
frequency_penalty: Optional[float] = None,
|
frequency_penalty: Optional[float] = None,
|
||||||
@@ -1357,7 +1364,7 @@ class LLMConfig:
|
|||||||
self.provider = DEFAULT_PROVIDER
|
self.provider = DEFAULT_PROVIDER
|
||||||
self.api_token = os.getenv(DEFAULT_PROVIDER_API_KEY)
|
self.api_token = os.getenv(DEFAULT_PROVIDER_API_KEY)
|
||||||
self.base_url = base_url
|
self.base_url = base_url
|
||||||
self.temprature = temprature
|
self.temperature = temperature
|
||||||
self.max_tokens = max_tokens
|
self.max_tokens = max_tokens
|
||||||
self.top_p = top_p
|
self.top_p = top_p
|
||||||
self.frequency_penalty = frequency_penalty
|
self.frequency_penalty = frequency_penalty
|
||||||
@@ -1371,7 +1378,7 @@ class LLMConfig:
|
|||||||
provider=kwargs.get("provider", DEFAULT_PROVIDER),
|
provider=kwargs.get("provider", DEFAULT_PROVIDER),
|
||||||
api_token=kwargs.get("api_token"),
|
api_token=kwargs.get("api_token"),
|
||||||
base_url=kwargs.get("base_url"),
|
base_url=kwargs.get("base_url"),
|
||||||
temprature=kwargs.get("temprature"),
|
temperature=kwargs.get("temperature"),
|
||||||
max_tokens=kwargs.get("max_tokens"),
|
max_tokens=kwargs.get("max_tokens"),
|
||||||
top_p=kwargs.get("top_p"),
|
top_p=kwargs.get("top_p"),
|
||||||
frequency_penalty=kwargs.get("frequency_penalty"),
|
frequency_penalty=kwargs.get("frequency_penalty"),
|
||||||
@@ -1385,7 +1392,7 @@ class LLMConfig:
|
|||||||
"provider": self.provider,
|
"provider": self.provider,
|
||||||
"api_token": self.api_token,
|
"api_token": self.api_token,
|
||||||
"base_url": self.base_url,
|
"base_url": self.base_url,
|
||||||
"temprature": self.temprature,
|
"temperature": self.temperature,
|
||||||
"max_tokens": self.max_tokens,
|
"max_tokens": self.max_tokens,
|
||||||
"top_p": self.top_p,
|
"top_p": self.top_p,
|
||||||
"frequency_penalty": self.frequency_penalty,
|
"frequency_penalty": self.frequency_penalty,
|
||||||
|
|||||||
@@ -971,8 +971,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
|
|
||||||
if config.wait_for:
|
if config.wait_for:
|
||||||
try:
|
try:
|
||||||
|
# Use wait_for_timeout if specified, otherwise fall back to page_timeout
|
||||||
|
timeout = config.wait_for_timeout if config.wait_for_timeout is not None else config.page_timeout
|
||||||
await self.smart_wait(
|
await self.smart_wait(
|
||||||
page, config.wait_for, timeout=config.page_timeout
|
page, config.wait_for, timeout=timeout
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
||||||
@@ -1097,7 +1099,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
|
|
||||||
finally:
|
finally:
|
||||||
# If no session_id is given we should close the page
|
# If no session_id is given we should close the page
|
||||||
if not config.session_id:
|
all_contexts = page.context.browser.contexts
|
||||||
|
total_pages = sum(len(context.pages) for context in all_contexts)
|
||||||
|
if config.session_id:
|
||||||
|
pass
|
||||||
|
elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless):
|
||||||
|
pass
|
||||||
|
else:
|
||||||
# Detach listeners before closing to prevent potential errors during close
|
# Detach listeners before closing to prevent potential errors during close
|
||||||
if config.capture_network_requests:
|
if config.capture_network_requests:
|
||||||
page.remove_listener("request", handle_request_capture)
|
page.remove_listener("request", handle_request_capture)
|
||||||
@@ -1107,6 +1115,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
page.remove_listener("console", handle_console_capture)
|
page.remove_listener("console", handle_console_capture)
|
||||||
page.remove_listener("pageerror", handle_pageerror_capture)
|
page.remove_listener("pageerror", handle_pageerror_capture)
|
||||||
|
|
||||||
|
# Close the page
|
||||||
await page.close()
|
await page.close()
|
||||||
|
|
||||||
async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):
|
async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):
|
||||||
|
|||||||
@@ -255,6 +255,13 @@ class ManagedBrowser:
|
|||||||
preexec_fn=os.setpgrp # Start in a new process group
|
preexec_fn=os.setpgrp # Start in a new process group
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# If verbose is True print args used to run the process
|
||||||
|
if self.logger and self.browser_config.verbose:
|
||||||
|
self.logger.debug(
|
||||||
|
f"Starting browser with args: {' '.join(args)}",
|
||||||
|
tag="BROWSER"
|
||||||
|
)
|
||||||
|
|
||||||
# We'll monitor for a short time to make sure it starts properly, but won't keep monitoring
|
# We'll monitor for a short time to make sure it starts properly, but won't keep monitoring
|
||||||
await asyncio.sleep(0.5) # Give browser time to start
|
await asyncio.sleep(0.5) # Give browser time to start
|
||||||
await self._initial_startup_check()
|
await self._initial_startup_check()
|
||||||
@@ -511,6 +518,56 @@ class ManagedBrowser:
|
|||||||
return profiler.delete_profile(profile_name_or_path)
|
return profiler.delete_profile(profile_name_or_path)
|
||||||
|
|
||||||
|
|
||||||
|
async def clone_runtime_state(
|
||||||
|
src: BrowserContext,
|
||||||
|
dst: BrowserContext,
|
||||||
|
crawlerRunConfig: CrawlerRunConfig | None = None,
|
||||||
|
browserConfig: BrowserConfig | None = None,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Bring everything that *can* be changed at runtime from `src` → `dst`.
|
||||||
|
|
||||||
|
1. Cookies
|
||||||
|
2. localStorage (and sessionStorage, same API)
|
||||||
|
3. Extra headers, permissions, geolocation if supplied in configs
|
||||||
|
"""
|
||||||
|
|
||||||
|
# ── 1. cookies ────────────────────────────────────────────────────────────
|
||||||
|
cookies = await src.cookies()
|
||||||
|
if cookies:
|
||||||
|
await dst.add_cookies(cookies)
|
||||||
|
|
||||||
|
# ── 2. localStorage / sessionStorage ──────────────────────────────────────
|
||||||
|
state = await src.storage_state()
|
||||||
|
for origin in state.get("origins", []):
|
||||||
|
url = origin["origin"]
|
||||||
|
kvs = origin.get("localStorage", [])
|
||||||
|
if not kvs:
|
||||||
|
continue
|
||||||
|
|
||||||
|
page = dst.pages[0] if dst.pages else await dst.new_page()
|
||||||
|
await page.goto(url, wait_until="domcontentloaded")
|
||||||
|
for k, v in kvs:
|
||||||
|
await page.evaluate("(k,v)=>localStorage.setItem(k,v)", k, v)
|
||||||
|
|
||||||
|
# ── 3. runtime-mutable extras from configs ────────────────────────────────
|
||||||
|
# headers
|
||||||
|
if browserConfig and browserConfig.headers:
|
||||||
|
await dst.set_extra_http_headers(browserConfig.headers)
|
||||||
|
|
||||||
|
# geolocation
|
||||||
|
if crawlerRunConfig and crawlerRunConfig.geolocation:
|
||||||
|
await dst.grant_permissions(["geolocation"])
|
||||||
|
await dst.set_geolocation(
|
||||||
|
{
|
||||||
|
"latitude": crawlerRunConfig.geolocation.latitude,
|
||||||
|
"longitude": crawlerRunConfig.geolocation.longitude,
|
||||||
|
"accuracy": crawlerRunConfig.geolocation.accuracy,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
return dst
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class BrowserManager:
|
class BrowserManager:
|
||||||
@@ -960,11 +1017,17 @@ class BrowserManager:
|
|||||||
|
|
||||||
# If using a managed browser, just grab the shared default_context
|
# If using a managed browser, just grab the shared default_context
|
||||||
if self.config.use_managed_browser:
|
if self.config.use_managed_browser:
|
||||||
context = self.default_context
|
if self.config.storage_state:
|
||||||
pages = context.pages
|
context = await self.create_browser_context(crawlerRunConfig)
|
||||||
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
|
ctx = self.default_context # default context, one window only
|
||||||
if not page:
|
ctx = await clone_runtime_state(context, ctx, crawlerRunConfig, self.config)
|
||||||
page = context.pages[0] # await context.new_page()
|
page = await ctx.new_page()
|
||||||
|
else:
|
||||||
|
context = self.default_context
|
||||||
|
pages = context.pages
|
||||||
|
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
|
||||||
|
if not page:
|
||||||
|
page = context.pages[0] # await context.new_page()
|
||||||
else:
|
else:
|
||||||
# Otherwise, check if we have an existing context for this config
|
# Otherwise, check if we have an existing context for this config
|
||||||
config_signature = self._make_config_signature(crawlerRunConfig)
|
config_signature = self._make_config_signature(crawlerRunConfig)
|
||||||
|
|||||||
@@ -218,8 +218,18 @@ class BrowserProfiler:
|
|||||||
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
|
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
from playwright.async_api import async_playwright
|
||||||
|
|
||||||
# Start the browser
|
# Start the browser
|
||||||
await managed_browser.start()
|
# await managed_browser.start()
|
||||||
|
# 1. ── Start the browser ─────────────────────────────────────────
|
||||||
|
cdp_url = await managed_browser.start()
|
||||||
|
|
||||||
|
# 2. ── Attach Playwright to that running Chrome ──────────────────
|
||||||
|
pw = await async_playwright().start()
|
||||||
|
browser = await pw.chromium.connect_over_cdp(cdp_url)
|
||||||
|
# Grab the existing default context (there is always one)
|
||||||
|
context = browser.contexts[0]
|
||||||
|
|
||||||
# Check if browser started successfully
|
# Check if browser started successfully
|
||||||
browser_process = managed_browser.browser_process
|
browser_process = managed_browser.browser_process
|
||||||
@@ -244,6 +254,18 @@ class BrowserProfiler:
|
|||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# 3. ── Persist storage state *before* we kill Chrome ─────────────
|
||||||
|
state_file = os.path.join(profile_path, "storage_state.json")
|
||||||
|
try:
|
||||||
|
await context.storage_state(path=state_file)
|
||||||
|
self.logger.info(f"[PROFILE].i storage_state saved → {state_file}", tag="PROFILE")
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"[PROFILE].w failed to save storage_state: {e}", tag="PROFILE")
|
||||||
|
|
||||||
|
# 4. ── Close everything cleanly ──────────────────────────────────
|
||||||
|
await browser.close()
|
||||||
|
await pw.stop()
|
||||||
|
|
||||||
# If the browser is still running and the user pressed 'q', terminate it
|
# If the browser is still running and the user pressed 'q', terminate it
|
||||||
if browser_process.poll() is None and user_done_event.is_set():
|
if browser_process.poll() is None and user_done_event.is_set():
|
||||||
self.logger.info("Terminating browser process...", tag="PROFILE")
|
self.logger.info("Terminating browser process...", tag="PROFILE")
|
||||||
|
|||||||
@@ -541,7 +541,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
api_token: The API token for the provider.
|
api_token: The API token for the provider.
|
||||||
base_url: The base URL for the API request.
|
base_url: The base URL for the API request.
|
||||||
api_base: The base URL for the API request.
|
api_base: The base URL for the API request.
|
||||||
extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
|
extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
|
||||||
"""
|
"""
|
||||||
super().__init__( input_format=input_format, **kwargs)
|
super().__init__( input_format=input_format, **kwargs)
|
||||||
self.llm_config = llm_config
|
self.llm_config = llm_config
|
||||||
@@ -1168,7 +1168,11 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
elif not query and not target_json_example:
|
elif not query and not target_json_example:
|
||||||
user_message["content"] += """IMPORTANT: Since we neither have a query nor an example, it is crucial to rely solely on the HTML content provided. Leverage your expertise to determine the schema based on the repetitive patterns observed in the content."""
|
user_message["content"] += """IMPORTANT: Since we neither have a query nor an example, it is crucial to rely solely on the HTML content provided. Leverage your expertise to determine the schema based on the repetitive patterns observed in the content."""
|
||||||
|
|
||||||
user_message["content"] += """IMPORTANT: Ensure your schema remains reliable by avoiding selectors that appear to generate dynamically and are not dependable. You want a reliable schema, as it consistently returns the same data even after many page reloads.
|
user_message["content"] += """IMPORTANT:
|
||||||
|
0/ Ensure your schema remains reliable by avoiding selectors that appear to generate dynamically and are not dependable. You want a reliable schema, as it consistently returns the same data even after many page reloads.
|
||||||
|
1/ DO NOT USE use base64 kind of classes, they are temporary and not reliable.
|
||||||
|
2/ Every selector must refer to only one unique element. You should ensure your selector points to a single element and is unique to the place that contains the information. You have to use available techniques based on CSS or XPATH requested schema to make sure your selector is unique and also not fragile, meaning if we reload the page now or in the future, the selector should remain reliable.
|
||||||
|
3/ Do not use Regex as much as possible.
|
||||||
|
|
||||||
Analyze the HTML and generate a JSON schema that follows the specified format. Only output valid JSON schema, nothing else.
|
Analyze the HTML and generate a JSON schema that follows the specified format. Only output valid JSON schema, nothing else.
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ class PDFCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
||||||
# Just pass through with empty HTML - scraper will handle actual processing
|
# Just pass through with empty HTML - scraper will handle actual processing
|
||||||
return AsyncCrawlResponse(
|
return AsyncCrawlResponse(
|
||||||
html="", # Scraper will handle the real work
|
html="Scraper will handle the real work", # Scraper will handle the real work
|
||||||
response_headers={"Content-Type": "application/pdf"},
|
response_headers={"Content-Type": "application/pdf"},
|
||||||
status_code=200
|
status_code=200
|
||||||
)
|
)
|
||||||
@@ -66,6 +66,7 @@ class PDFContentScrapingStrategy(ContentScrapingStrategy):
|
|||||||
image_save_dir=image_save_dir,
|
image_save_dir=image_save_dir,
|
||||||
batch_size=batch_size
|
batch_size=batch_size
|
||||||
)
|
)
|
||||||
|
self._temp_files = [] # Track temp files for cleanup
|
||||||
|
|
||||||
def scrap(self, url: str, html: str, **params) -> ScrapingResult:
|
def scrap(self, url: str, html: str, **params) -> ScrapingResult:
|
||||||
"""
|
"""
|
||||||
@@ -124,7 +125,13 @@ class PDFContentScrapingStrategy(ContentScrapingStrategy):
|
|||||||
finally:
|
finally:
|
||||||
# Cleanup temp file if downloaded
|
# Cleanup temp file if downloaded
|
||||||
if url.startswith(("http://", "https://")):
|
if url.startswith(("http://", "https://")):
|
||||||
Path(pdf_path).unlink(missing_ok=True)
|
try:
|
||||||
|
Path(pdf_path).unlink(missing_ok=True)
|
||||||
|
if pdf_path in self._temp_files:
|
||||||
|
self._temp_files.remove(pdf_path)
|
||||||
|
except Exception as e:
|
||||||
|
if self.logger:
|
||||||
|
self.logger.warning(f"Failed to cleanup temp file {pdf_path}: {e}")
|
||||||
|
|
||||||
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||||
# For simple cases, you can use the sync version
|
# For simple cases, you can use the sync version
|
||||||
@@ -138,22 +145,45 @@ class PDFContentScrapingStrategy(ContentScrapingStrategy):
|
|||||||
|
|
||||||
# Create temp file with .pdf extension
|
# Create temp file with .pdf extension
|
||||||
temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
|
temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
|
||||||
|
self._temp_files.append(temp_file.name)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Download PDF with streaming
|
if self.logger:
|
||||||
response = requests.get(url, stream=True)
|
self.logger.info(f"Downloading PDF from {url}...")
|
||||||
|
|
||||||
|
# Download PDF with streaming and timeout
|
||||||
|
# Connection timeout: 10s, Read timeout: 300s (5 minutes for large PDFs)
|
||||||
|
response = requests.get(url, stream=True, timeout=(20, 60 * 10))
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
||||||
|
# Get file size if available
|
||||||
|
total_size = int(response.headers.get('content-length', 0))
|
||||||
|
downloaded = 0
|
||||||
|
|
||||||
# Write to temp file
|
# Write to temp file
|
||||||
with open(temp_file.name, 'wb') as f:
|
with open(temp_file.name, 'wb') as f:
|
||||||
for chunk in response.iter_content(chunk_size=8192):
|
for chunk in response.iter_content(chunk_size=8192):
|
||||||
f.write(chunk)
|
f.write(chunk)
|
||||||
|
downloaded += len(chunk)
|
||||||
|
if self.logger and total_size > 0:
|
||||||
|
progress = (downloaded / total_size) * 100
|
||||||
|
if progress % 10 < 0.1: # Log every 10%
|
||||||
|
self.logger.debug(f"PDF download progress: {progress:.0f}%")
|
||||||
|
|
||||||
|
if self.logger:
|
||||||
|
self.logger.info(f"PDF downloaded successfully: {temp_file.name}")
|
||||||
|
|
||||||
return temp_file.name
|
return temp_file.name
|
||||||
|
|
||||||
|
except requests.exceptions.Timeout as e:
|
||||||
|
# Clean up temp file if download fails
|
||||||
|
Path(temp_file.name).unlink(missing_ok=True)
|
||||||
|
self._temp_files.remove(temp_file.name)
|
||||||
|
raise RuntimeError(f"Timeout downloading PDF from {url}: {str(e)}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Clean up temp file if download fails
|
# Clean up temp file if download fails
|
||||||
Path(temp_file.name).unlink(missing_ok=True)
|
Path(temp_file.name).unlink(missing_ok=True)
|
||||||
|
self._temp_files.remove(temp_file.name)
|
||||||
raise RuntimeError(f"Failed to download PDF from {url}: {str(e)}")
|
raise RuntimeError(f"Failed to download PDF from {url}: {str(e)}")
|
||||||
|
|
||||||
elif url.startswith("file://"):
|
elif url.startswith("file://"):
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import html
|
|||||||
import lxml
|
import lxml
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
import subprocess
|
||||||
import platform
|
import platform
|
||||||
from .prompts import PROMPT_EXTRACT_BLOCKS
|
from .prompts import PROMPT_EXTRACT_BLOCKS
|
||||||
from array import array
|
from array import array
|
||||||
@@ -158,13 +159,20 @@ def merge_chunks(
|
|||||||
word_token_ratio: float = 1.0,
|
word_token_ratio: float = 1.0,
|
||||||
splitter: Callable = None
|
splitter: Callable = None
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
"""Merges documents into chunks of specified token size.
|
"""
|
||||||
|
Merges a sequence of documents into chunks based on a target token count, with optional overlap.
|
||||||
|
|
||||||
|
Each document is split into tokens using the provided splitter function (defaults to str.split). Tokens are distributed into chunks aiming for the specified target size, with optional overlapping tokens between consecutive chunks. Returns a list of non-empty merged chunks as strings.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
docs: Input documents
|
docs: Sequence of input document strings to be merged.
|
||||||
target_size: Desired token count per chunk
|
target_size: Target number of tokens per chunk.
|
||||||
overlap: Number of tokens to overlap between chunks
|
overlap: Number of tokens to overlap between consecutive chunks.
|
||||||
word_token_ratio: Multiplier for word->token conversion
|
word_token_ratio: Multiplier to estimate token count from word count.
|
||||||
|
splitter: Callable used to split each document into tokens.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of merged document chunks as strings, each not exceeding the target token size.
|
||||||
"""
|
"""
|
||||||
# Pre-tokenize all docs and store token counts
|
# Pre-tokenize all docs and store token counts
|
||||||
splitter = splitter or str.split
|
splitter = splitter or str.split
|
||||||
@@ -173,7 +181,7 @@ def merge_chunks(
|
|||||||
total_tokens = 0
|
total_tokens = 0
|
||||||
|
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
tokens = doc.split()
|
tokens = splitter(doc)
|
||||||
count = int(len(tokens) * word_token_ratio)
|
count = int(len(tokens) * word_token_ratio)
|
||||||
if count: # Skip empty docs
|
if count: # Skip empty docs
|
||||||
token_counts.append(count)
|
token_counts.append(count)
|
||||||
@@ -1132,6 +1140,23 @@ def get_content_of_website_optimized(
|
|||||||
css_selector: str = None,
|
css_selector: str = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Extracts and cleans content from website HTML, optimizing for useful media and contextual information.
|
||||||
|
|
||||||
|
Parses the provided HTML to extract internal and external links, filters and scores images for usefulness, gathers contextual descriptions for media, removes unwanted or low-value elements, and converts the cleaned HTML to Markdown. Also extracts metadata and returns all structured content in a dictionary.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: The URL of the website being processed.
|
||||||
|
html: The raw HTML content to extract from.
|
||||||
|
word_count_threshold: Minimum word count for elements to be retained.
|
||||||
|
css_selector: Optional CSS selector to restrict extraction to specific elements.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A dictionary containing Markdown content, cleaned HTML, extraction success status, media and link lists, and metadata.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
InvalidCSSSelectorError: If a provided CSS selector does not match any elements.
|
||||||
|
"""
|
||||||
if not html:
|
if not html:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -1174,6 +1199,20 @@ def get_content_of_website_optimized(
|
|||||||
|
|
||||||
def process_image(img, url, index, total_images):
|
def process_image(img, url, index, total_images):
|
||||||
# Check if an image has valid display and inside undesired html elements
|
# Check if an image has valid display and inside undesired html elements
|
||||||
|
"""
|
||||||
|
Processes an HTML image element to determine its relevance and extract metadata.
|
||||||
|
|
||||||
|
Evaluates an image's visibility, context, and usefulness based on its attributes and parent elements. If the image passes validation and exceeds a usefulness score threshold, returns a dictionary with its source, alt text, contextual description, score, and type. Otherwise, returns None.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
img: The BeautifulSoup image tag to process.
|
||||||
|
url: The base URL of the page containing the image.
|
||||||
|
index: The index of the image in the list of images on the page.
|
||||||
|
total_images: The total number of images on the page.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A dictionary with image metadata if the image is considered useful, or None otherwise.
|
||||||
|
"""
|
||||||
def is_valid_image(img, parent, parent_classes):
|
def is_valid_image(img, parent, parent_classes):
|
||||||
style = img.get("style", "")
|
style = img.get("style", "")
|
||||||
src = img.get("src", "")
|
src = img.get("src", "")
|
||||||
@@ -1195,6 +1234,20 @@ def get_content_of_website_optimized(
|
|||||||
# Score an image for it's usefulness
|
# Score an image for it's usefulness
|
||||||
def score_image_for_usefulness(img, base_url, index, images_count):
|
def score_image_for_usefulness(img, base_url, index, images_count):
|
||||||
# Function to parse image height/width value and units
|
# Function to parse image height/width value and units
|
||||||
|
"""
|
||||||
|
Scores an HTML image element for usefulness based on size, format, attributes, and position.
|
||||||
|
|
||||||
|
The function evaluates an image's dimensions, file format, alt text, and its position among all images on the page to assign a usefulness score. Higher scores indicate images that are likely more relevant or informative for content extraction or summarization.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
img: The HTML image element to score.
|
||||||
|
base_url: The base URL used to resolve relative image sources.
|
||||||
|
index: The position of the image in the list of images on the page (zero-based).
|
||||||
|
images_count: The total number of images on the page.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
An integer usefulness score for the image.
|
||||||
|
"""
|
||||||
def parse_dimension(dimension):
|
def parse_dimension(dimension):
|
||||||
if dimension:
|
if dimension:
|
||||||
match = re.match(r"(\d+)(\D*)", dimension)
|
match = re.match(r"(\d+)(\D*)", dimension)
|
||||||
@@ -1209,6 +1262,16 @@ def get_content_of_website_optimized(
|
|||||||
# Fetch image file metadata to extract size and extension
|
# Fetch image file metadata to extract size and extension
|
||||||
def fetch_image_file_size(img, base_url):
|
def fetch_image_file_size(img, base_url):
|
||||||
# If src is relative path construct full URL, if not it may be CDN URL
|
# If src is relative path construct full URL, if not it may be CDN URL
|
||||||
|
"""
|
||||||
|
Fetches the file size of an image by sending a HEAD request to its URL.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
img: A BeautifulSoup tag representing the image element.
|
||||||
|
base_url: The base URL to resolve relative image sources.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The value of the "Content-Length" header as a string if available, otherwise None.
|
||||||
|
"""
|
||||||
img_url = urljoin(base_url, img.get("src"))
|
img_url = urljoin(base_url, img.get("src"))
|
||||||
try:
|
try:
|
||||||
response = requests.head(img_url)
|
response = requests.head(img_url)
|
||||||
@@ -1219,8 +1282,6 @@ def get_content_of_website_optimized(
|
|||||||
return None
|
return None
|
||||||
except InvalidSchema:
|
except InvalidSchema:
|
||||||
return None
|
return None
|
||||||
finally:
|
|
||||||
return
|
|
||||||
|
|
||||||
image_height = img.get("height")
|
image_height = img.get("height")
|
||||||
height_value, height_unit = parse_dimension(image_height)
|
height_value, height_unit = parse_dimension(image_height)
|
||||||
@@ -2845,5 +2906,73 @@ def preprocess_html_for_schema(html_content, text_threshold=100, attr_value_thre
|
|||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Fallback for parsing errors
|
# Fallback for parsing errors
|
||||||
return html_content[:max_size] if len(html_content) > max_size else html_content
|
return html_content[:max_size] if len(html_content) > max_size else html_content
|
||||||
|
|
||||||
|
def start_colab_display_server():
|
||||||
|
"""
|
||||||
|
Start virtual display server in Google Colab.
|
||||||
|
Raises error if not running in Colab environment.
|
||||||
|
"""
|
||||||
|
# Check if running in Google Colab
|
||||||
|
try:
|
||||||
|
import google.colab
|
||||||
|
from google.colab import output
|
||||||
|
from IPython.display import IFrame, display
|
||||||
|
except ImportError:
|
||||||
|
raise RuntimeError("This function must be run in Google Colab environment.")
|
||||||
|
|
||||||
|
import os, time, subprocess
|
||||||
|
|
||||||
|
os.environ["DISPLAY"] = ":99"
|
||||||
|
|
||||||
|
# Xvfb
|
||||||
|
xvfb = subprocess.Popen(["Xvfb", ":99", "-screen", "0", "1280x720x24"])
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
|
# minimal window manager
|
||||||
|
fluxbox = subprocess.Popen(["fluxbox"])
|
||||||
|
|
||||||
|
# VNC → X
|
||||||
|
x11vnc = subprocess.Popen(["x11vnc",
|
||||||
|
"-display", ":99",
|
||||||
|
"-nopw", "-forever", "-shared",
|
||||||
|
"-rfbport", "5900", "-quiet"])
|
||||||
|
|
||||||
|
# websockify → VNC
|
||||||
|
novnc = subprocess.Popen(["/opt/novnc/utils/websockify/run",
|
||||||
|
"6080", "localhost:5900",
|
||||||
|
"--web", "/opt/novnc"])
|
||||||
|
|
||||||
|
time.sleep(2) # give ports a moment
|
||||||
|
|
||||||
|
# Colab proxy url
|
||||||
|
url = output.eval_js("google.colab.kernel.proxyPort(6080)")
|
||||||
|
display(IFrame(f"{url}/vnc.html?autoconnect=true&resize=scale", width=1024, height=768))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def setup_colab_environment():
|
||||||
|
"""
|
||||||
|
Alternative setup using IPython magic commands
|
||||||
|
"""
|
||||||
|
from IPython import get_ipython
|
||||||
|
ipython = get_ipython()
|
||||||
|
|
||||||
|
print("🚀 Setting up Crawl4AI environment in Google Colab...")
|
||||||
|
|
||||||
|
# Run the bash commands
|
||||||
|
ipython.run_cell_magic('bash', '', '''
|
||||||
|
set -e
|
||||||
|
|
||||||
|
echo "📦 Installing system dependencies..."
|
||||||
|
apt-get update -y
|
||||||
|
apt-get install -y xvfb x11vnc fluxbox websockify git
|
||||||
|
|
||||||
|
echo "📥 Setting up virtual display..."
|
||||||
|
git clone https://github.com/novnc/noVNC /opt/novnc
|
||||||
|
git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify
|
||||||
|
|
||||||
|
pip install -q nest_asyncio google-colab
|
||||||
|
echo "✅ Setup complete!"
|
||||||
|
''')
|
||||||
|
|
||||||
|
|||||||
@@ -1263,7 +1263,7 @@ class LLMConfig:
|
|||||||
provider: str = DEFAULT_PROVIDER,
|
provider: str = DEFAULT_PROVIDER,
|
||||||
api_token: Optional[str] = None,
|
api_token: Optional[str] = None,
|
||||||
base_url: Optional[str] = None,
|
base_url: Optional[str] = None,
|
||||||
temprature: Optional[float] = None,
|
temperature: Optional[float] = None,
|
||||||
max_tokens: Optional[int] = None,
|
max_tokens: Optional[int] = None,
|
||||||
top_p: Optional[float] = None,
|
top_p: Optional[float] = None,
|
||||||
frequency_penalty: Optional[float] = None,
|
frequency_penalty: Optional[float] = None,
|
||||||
@@ -1291,7 +1291,7 @@ class LLMConfig:
|
|||||||
self.provider = DEFAULT_PROVIDER
|
self.provider = DEFAULT_PROVIDER
|
||||||
self.api_token = os.getenv(DEFAULT_PROVIDER_API_KEY)
|
self.api_token = os.getenv(DEFAULT_PROVIDER_API_KEY)
|
||||||
self.base_url = base_url
|
self.base_url = base_url
|
||||||
self.temprature = temprature
|
self.temperature = temperature
|
||||||
self.max_tokens = max_tokens
|
self.max_tokens = max_tokens
|
||||||
self.top_p = top_p
|
self.top_p = top_p
|
||||||
self.frequency_penalty = frequency_penalty
|
self.frequency_penalty = frequency_penalty
|
||||||
@@ -1305,7 +1305,7 @@ class LLMConfig:
|
|||||||
provider=kwargs.get("provider", DEFAULT_PROVIDER),
|
provider=kwargs.get("provider", DEFAULT_PROVIDER),
|
||||||
api_token=kwargs.get("api_token"),
|
api_token=kwargs.get("api_token"),
|
||||||
base_url=kwargs.get("base_url"),
|
base_url=kwargs.get("base_url"),
|
||||||
temprature=kwargs.get("temprature"),
|
temperature=kwargs.get("temperature"),
|
||||||
max_tokens=kwargs.get("max_tokens"),
|
max_tokens=kwargs.get("max_tokens"),
|
||||||
top_p=kwargs.get("top_p"),
|
top_p=kwargs.get("top_p"),
|
||||||
frequency_penalty=kwargs.get("frequency_penalty"),
|
frequency_penalty=kwargs.get("frequency_penalty"),
|
||||||
@@ -1319,7 +1319,7 @@ class LLMConfig:
|
|||||||
"provider": self.provider,
|
"provider": self.provider,
|
||||||
"api_token": self.api_token,
|
"api_token": self.api_token,
|
||||||
"base_url": self.base_url,
|
"base_url": self.base_url,
|
||||||
"temprature": self.temprature,
|
"temperature": self.temperature,
|
||||||
"max_tokens": self.max_tokens,
|
"max_tokens": self.max_tokens,
|
||||||
"top_p": self.top_p,
|
"top_p": self.top_p,
|
||||||
"frequency_penalty": self.frequency_penalty,
|
"frequency_penalty": self.frequency_penalty,
|
||||||
@@ -4075,7 +4075,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
api_token: The API token for the provider.
|
api_token: The API token for the provider.
|
||||||
base_url: The base URL for the API request.
|
base_url: The base URL for the API request.
|
||||||
api_base: The base URL for the API request.
|
api_base: The base URL for the API request.
|
||||||
extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
|
extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
|
||||||
"""
|
"""
|
||||||
super().__init__( input_format=input_format, **kwargs)
|
super().__init__( input_format=input_format, **kwargs)
|
||||||
self.llm_config = llm_config
|
self.llm_config = llm_config
|
||||||
|
|||||||
1305
docs/apps/linkdin/Crawl4ai_Workshop_Extract_Linkdin_Data.ipynb
Normal file
1305
docs/apps/linkdin/Crawl4ai_Workshop_Extract_Linkdin_Data.ipynb
Normal file
File diff suppressed because one or more lines are too long
@@ -1,7 +1,11 @@
|
|||||||
# Crawl4AI Prospect‑Wizard – step‑by‑step guide
|
# Crawl4AI Prospect‑Wizard – step‑by‑step guide
|
||||||
|
|
||||||
|
[](https://colab.research.google.com/drive/10nRCwmfxPjVrRUHyJsYlX7BH5bvPoGpx?usp=sharing)
|
||||||
|
|
||||||
A three‑stage demo that goes from **LinkedIn scraping** ➜ **LLM reasoning** ➜ **graph visualisation**.
|
A three‑stage demo that goes from **LinkedIn scraping** ➜ **LLM reasoning** ➜ **graph visualisation**.
|
||||||
|
|
||||||
|
**Try it in Google Colab!** Click the badge above to run this demo in a cloud environment with zero setup required.
|
||||||
|
|
||||||
```
|
```
|
||||||
prospect‑wizard/
|
prospect‑wizard/
|
||||||
├─ c4ai_discover.py # Stage 1 – scrape companies + people
|
├─ c4ai_discover.py # Stage 1 – scrape companies + people
|
||||||
|
|||||||
@@ -107,7 +107,14 @@ _COMPANY_SCHEMA_QUERY = dedent(
|
|||||||
|
|
||||||
IMPORTANT: Do not use the base64 kind of classes to target element. It's not reliable.
|
IMPORTANT: Do not use the base64 kind of classes to target element. It's not reliable.
|
||||||
The main div parent contains these li element is "div.search-results-container" you can use this.
|
The main div parent contains these li element is "div.search-results-container" you can use this.
|
||||||
The <ul> parent has "role" equal to "list". Using these two should be enough to target the <li> elements."
|
The <ul> parent has "role" equal to "list". Using these two should be enough to target the <li> elements.
|
||||||
|
|
||||||
|
IMPORTANT: Remember there might be multiple <a> tags that start with https://www.linkedin.com/company/[NAME],
|
||||||
|
so in case you refer to them for different fields, make sure to be more specific. One has the image, and one
|
||||||
|
has the person's name.
|
||||||
|
|
||||||
|
IMPORTANT: Be very smart in selecting the correct and unique way to address the element. You should ensure
|
||||||
|
your selector points to a single element and is unique to the place that contains the information.
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -235,6 +242,7 @@ async def crawl_people_page(
|
|||||||
cache_mode=CacheMode.BYPASS,
|
cache_mode=CacheMode.BYPASS,
|
||||||
magic=True,
|
magic=True,
|
||||||
wait_for=".org-people-profile-card__card-spacing",
|
wait_for=".org-people-profile-card__card-spacing",
|
||||||
|
wait_for_images=5000,
|
||||||
delay_before_return_html=1,
|
delay_before_return_html=1,
|
||||||
session_id="people_search",
|
session_id="people_search",
|
||||||
)
|
)
|
||||||
@@ -422,6 +430,7 @@ def main():
|
|||||||
# decide on debug defaults
|
# decide on debug defaults
|
||||||
if cli_opts.debug:
|
if cli_opts.debug:
|
||||||
opts = detect_debug_defaults(force=True)
|
opts = detect_debug_defaults(force=True)
|
||||||
|
cli_opts = opts
|
||||||
else:
|
else:
|
||||||
env_defaults = detect_debug_defaults()
|
env_defaults = detect_debug_defaults()
|
||||||
opts = env_defaults if env_defaults else cli_opts
|
opts = env_defaults if env_defaults else cli_opts
|
||||||
|
|||||||
@@ -29,9 +29,10 @@ from typing import List, Dict, Any
|
|||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
from rich.logging import RichHandler
|
from rich.logging import RichHandler
|
||||||
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn
|
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn
|
||||||
import logging
|
|
||||||
|
|
||||||
|
|
||||||
|
# ───────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
BASE_DIR = pathlib.Path(__file__).resolve().parent
|
BASE_DIR = pathlib.Path(__file__).resolve().parent
|
||||||
|
|
||||||
# ───────────────────────────────────────────────────────────────────────────────
|
# ───────────────────────────────────────────────────────────────────────────────
|
||||||
@@ -45,6 +46,8 @@ import hashlib
|
|||||||
|
|
||||||
from litellm import completion #Support any LLM Provider
|
from litellm import completion #Support any LLM Provider
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# ───────────────────────────────────────────────────────────────────────────────
|
# ───────────────────────────────────────────────────────────────────────────────
|
||||||
# Utils
|
# Utils
|
||||||
# ───────────────────────────────────────────────────────────────────────────────
|
# ───────────────────────────────────────────────────────────────────────────────
|
||||||
@@ -66,14 +69,16 @@ BASE_DIR = pathlib.Path(__file__).resolve().parent
|
|||||||
# ───────────────────────────────────────────────────────────────────────────────
|
# ───────────────────────────────────────────────────────────────────────────────
|
||||||
def dev_defaults() -> SimpleNamespace:
|
def dev_defaults() -> SimpleNamespace:
|
||||||
return SimpleNamespace(
|
return SimpleNamespace(
|
||||||
in_dir="./debug_out",
|
in_dir="./samples",
|
||||||
out_dir="./insights_debug",
|
out_dir="./samples/insights",
|
||||||
embed_model="all-MiniLM-L6-v2",
|
embed_model="all-MiniLM-L6-v2",
|
||||||
top_k=10,
|
top_k=10,
|
||||||
llm_provider="openai/gpt-4.1",
|
llm_provider="openai/gpt-4.1",
|
||||||
llm_api_key=None,
|
llm_api_key=None,
|
||||||
max_llm_tokens=8000,
|
max_llm_tokens=8000,
|
||||||
llm_temperature=1.0,
|
llm_temperature=1.0,
|
||||||
|
stub=False, # Set to True to use a stub for org-chart inference
|
||||||
|
llm_base_url=None, # e.g., "https://api.openai.com/v1" for OpenAI
|
||||||
workers=4
|
workers=4
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -82,8 +87,9 @@ def dev_defaults() -> SimpleNamespace:
|
|||||||
# ───────────────────────────────────────────────────────────────────────────────
|
# ───────────────────────────────────────────────────────────────────────────────
|
||||||
def embed_descriptions(companies, model_name:str, opts) -> np.ndarray:
|
def embed_descriptions(companies, model_name:str, opts) -> np.ndarray:
|
||||||
from sentence_transformers import SentenceTransformer
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
logging.debug(f"Using embedding model: {model_name}")
|
console = Console()
|
||||||
|
console.print(f"Using embedding model: [bold cyan]{model_name}[/]")
|
||||||
cache_path = BASE_DIR / Path(opts.out_dir) / "embeds_cache.json"
|
cache_path = BASE_DIR / Path(opts.out_dir) / "embeds_cache.json"
|
||||||
cache = {}
|
cache = {}
|
||||||
if cache_path.exists():
|
if cache_path.exists():
|
||||||
@@ -122,7 +128,6 @@ def build_company_graph(companies, embeds:np.ndarray, top_k:int) -> Dict[str,Any
|
|||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
sims = cosine_similarity(embeds)
|
sims = cosine_similarity(embeds)
|
||||||
nodes, edges = [], []
|
nodes, edges = [], []
|
||||||
idx_of = {c["handle"]: i for i,c in enumerate(companies)}
|
|
||||||
for i,c in enumerate(companies):
|
for i,c in enumerate(companies):
|
||||||
node = dict(
|
node = dict(
|
||||||
id=c["handle"].strip("/"),
|
id=c["handle"].strip("/"),
|
||||||
@@ -252,18 +257,18 @@ def render_html(out:Path, template_dir:Path):
|
|||||||
# ───────────────────────────────────────────────────────────────────────────────
|
# ───────────────────────────────────────────────────────────────────────────────
|
||||||
async def run(opts):
|
async def run(opts):
|
||||||
# ── silence SDK noise ──────────────────────────────────────────────────────
|
# ── silence SDK noise ──────────────────────────────────────────────────────
|
||||||
for noisy in ("openai", "httpx", "httpcore"):
|
# for noisy in ("openai", "httpx", "httpcore"):
|
||||||
lg = logging.getLogger(noisy)
|
# lg = logging.getLogger(noisy)
|
||||||
lg.setLevel(logging.WARNING) # or ERROR if you want total silence
|
# lg.setLevel(logging.WARNING) # or ERROR if you want total silence
|
||||||
lg.propagate = False # optional: stop them reaching root
|
# lg.propagate = False # optional: stop them reaching root
|
||||||
|
|
||||||
# ────────────── logging bootstrap ──────────────
|
# ────────────── logging bootstrap ──────────────
|
||||||
console = Console()
|
console = Console()
|
||||||
logging.basicConfig(
|
# logging.basicConfig(
|
||||||
level="INFO",
|
# level="INFO",
|
||||||
format="%(message)s",
|
# format="%(message)s",
|
||||||
handlers=[RichHandler(console=console, markup=True, rich_tracebacks=True)],
|
# handlers=[RichHandler(console=console, markup=True, rich_tracebacks=True)],
|
||||||
)
|
# )
|
||||||
|
|
||||||
in_dir = BASE_DIR / Path(opts.in_dir)
|
in_dir = BASE_DIR / Path(opts.in_dir)
|
||||||
out_dir = BASE_DIR / Path(opts.out_dir)
|
out_dir = BASE_DIR / Path(opts.out_dir)
|
||||||
@@ -272,12 +277,12 @@ async def run(opts):
|
|||||||
companies = load_jsonl(in_dir/"companies.jsonl")
|
companies = load_jsonl(in_dir/"companies.jsonl")
|
||||||
people = load_jsonl(in_dir/"people.jsonl")
|
people = load_jsonl(in_dir/"people.jsonl")
|
||||||
|
|
||||||
logging.info(f"[bold cyan]Loaded[/] {len(companies)} companies, {len(people)} people")
|
console.print(f"[bold cyan]Loaded[/] {len(companies)} companies, {len(people)} people")
|
||||||
|
|
||||||
logging.info("[bold]⇢[/] Embedding company descriptions…")
|
console.print("[bold]⇢[/] Embedding company descriptions…")
|
||||||
embeds = embed_descriptions(companies, opts.embed_model, opts)
|
embeds = embed_descriptions(companies, opts.embed_model, opts)
|
||||||
|
|
||||||
logging.info("[bold]⇢[/] Building similarity graph")
|
console.print("[bold]⇢[/] Building similarity graph")
|
||||||
company_graph = build_company_graph(companies, embeds, opts.top_k)
|
company_graph = build_company_graph(companies, embeds, opts.top_k)
|
||||||
dump_json(company_graph, out_dir/"company_graph.json")
|
dump_json(company_graph, out_dir/"company_graph.json")
|
||||||
|
|
||||||
@@ -286,19 +291,19 @@ async def run(opts):
|
|||||||
for comp in companies:
|
for comp in companies:
|
||||||
handle = comp["handle"].strip("/").replace("/","_")
|
handle = comp["handle"].strip("/").replace("/","_")
|
||||||
out_file = out_dir/f"org_chart_{handle}.json"
|
out_file = out_dir/f"org_chart_{handle}.json"
|
||||||
if out_file.exists() and False:
|
if out_file.exists():
|
||||||
logging.info(f"[green]✓[/] Skipping existing {comp['name']}")
|
console.print(f"[green]✓[/] Skipping existing {comp['name']}")
|
||||||
continue
|
continue
|
||||||
to_process.append(comp)
|
to_process.append(comp)
|
||||||
|
|
||||||
|
|
||||||
if not to_process:
|
if not to_process:
|
||||||
logging.info("[yellow]All companies already processed[/]")
|
console.print("[yellow]All companies already processed[/]")
|
||||||
else:
|
else:
|
||||||
workers = getattr(opts, 'workers', 1)
|
workers = getattr(opts, 'workers', 1)
|
||||||
parallel = workers > 1
|
parallel = workers > 1
|
||||||
|
|
||||||
logging.info(f"[bold]⇢[/] Inferring org-charts via LLM {f'(parallel={workers} workers)' if parallel else ''}")
|
console.print(f"[bold]⇢[/] Inferring org-charts via LLM {f'(parallel={workers} workers)' if parallel else ''}")
|
||||||
|
|
||||||
with Progress(
|
with Progress(
|
||||||
SpinnerColumn(),
|
SpinnerColumn(),
|
||||||
@@ -341,12 +346,11 @@ async def run(opts):
|
|||||||
# Run with concurrency control
|
# Run with concurrency control
|
||||||
await asyncio.gather(*(bounded_process(task) for task in tasks))
|
await asyncio.gather(*(bounded_process(task) for task in tasks))
|
||||||
|
|
||||||
logging.info("[bold]⇢[/] Flattening decision-makers CSV")
|
console.print("[bold]⇢[/] Flattening decision-makers CSV")
|
||||||
export_decision_makers(out_dir, out_dir/"decision_makers.csv")
|
export_decision_makers(out_dir, out_dir/"decision_makers.csv")
|
||||||
|
|
||||||
render_html(out_dir, template_dir=BASE_DIR/"templates")
|
render_html(out_dir, template_dir=BASE_DIR/"templates")
|
||||||
logging.success = lambda msg, **k: console.print(f"[bold green]✓[/] {msg}", **k)
|
console.print(f"[bold green]✓[/] Stage-2 artefacts written to {out_dir}")
|
||||||
logging.success(f"Stage-2 artefacts written to {out_dir}")
|
|
||||||
|
|
||||||
# ───────────────────────────────────────────────────────────────────────────────
|
# ───────────────────────────────────────────────────────────────────────────────
|
||||||
# CLI
|
# CLI
|
||||||
@@ -369,8 +373,8 @@ def build_arg_parser():
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
dbg = dev_defaults()
|
dbg = dev_defaults()
|
||||||
# opts = dbg if True else build_arg_parser().parse_args()
|
opts = dbg if True else build_arg_parser().parse_args()
|
||||||
opts = build_arg_parser().parse_args()
|
# opts = build_arg_parser().parse_args()
|
||||||
asyncio.run(run(opts))
|
asyncio.run(run(opts))
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
9
docs/apps/linkdin/samples/companies.jsonl
Normal file
9
docs/apps/linkdin/samples/companies.jsonl
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
{"handle": "https://www.linkedin.com/company/healthpartnersng/", "name": "Health Partners HMO", "descriptor": "Hospitals and Health Care • Ikoyi, LAGOS", "about": "Healthpartners Ltd is a leading HMO in Nigeria providing affordablehealthinsuranceandhealthmanagementservices for companies and individuals in Nigeria. We have several individual and group plans that meets yourhealthmanagementneeds. Call us now at 0807-460-9165, 0807-714-0759 or email...", "followers": null, "people_url": "https://www.linkedin.com/company/healthpartnersng/people/", "captured_at": "2025-04-29T10:46:08Z"}
|
||||||
|
{"handle": "https://www.linkedin.com/company/health-insurance-management-services-organization/", "name": "Health & Insurance Management Services Organization", "descriptor": "Non-profit Organizations • Mbeya", "about": "Health&InsuranceManagementServices Organization (HIMSO) was established and registered in 2012 as a Non- Government Organization (NGO) with the aim...", "followers": 35, "people_url": "https://www.linkedin.com/company/health-insurance-management-services-organization/people/", "captured_at": "2025-04-29T13:15:04Z"}
|
||||||
|
{"handle": "https://www.linkedin.com/company/national-health-insurance-management-authority/", "name": "National Health Insurance Management Authority", "descriptor": "Insurance • Lusaka, Lusaka", "about": "The NationalHealthInsuranceManagementAuthority (NHIMA) is established pursuant to section 4 of the NationalHealthInsurance(NHI) Act No. 2 of 2018. The compulsory NationalHealthInsurancescheme seeks to provide for a sound and reliable healthcare financing for Zambian households and the entirehealthsector...", "followers": null, "people_url": "https://www.linkedin.com/company/national-health-insurance-management-authority/people/", "captured_at": "2025-04-29T13:15:04Z"}
|
||||||
|
{"handle": "https://www.linkedin.com/company/health-alliance-plan/", "name": "Health Alliance Plan", "descriptor": "Hospitals and Health Care • Detroit, MI", "about": "...organizations to enhance the lives of those we touch. We offer six distincthealthinsurancelines: • Group Insured Commercial • Individual • Medicare • Medicaid • Self-Funded • Network Leasing HAP also provides: • Award-winning wellness programs • Community outreach • Digitalhealthtools • Diseasemanagement...", "followers": null, "people_url": "https://www.linkedin.com/company/health-alliance-plan/people/", "captured_at": "2025-04-29T13:15:04Z"}
|
||||||
|
{"handle": "https://www.linkedin.com/company/insurance-recruiting-solutions/", "name": "Insurance Recruiting Solutions", "descriptor": "Insurance • Waukee, Iowa", "about": "InsuranceRecruiting Solutions provides staffing and recruiting services exclusively to theinsuranceindustry. We are committed to providing highly personalized recruiting services, tailored to each candidate and employer. With years ofinsuranceindustry experience, we speak your language. As a leading national...", "followers": null, "people_url": "https://www.linkedin.com/company/insurance-recruiting-solutions/people/", "captured_at": "2025-04-29T13:15:04Z"}
|
||||||
|
{"handle": "https://www.linkedin.com/company/healthplanofsanmateo/", "name": "Health Plan of San Mateo (HPSM)", "descriptor": "Hospitals and Health Care • South San Francisco, California", "about": "TheHealthPlan of San Mateo (HPSM) is a local non-profithealthcare plan that offershealthcoverage and a provider network to San Mateo County's under-insured population. We currently serve more than 145,000 County residents.", "followers": null, "people_url": "https://www.linkedin.com/company/healthplanofsanmateo/people/", "captured_at": "2025-04-29T13:15:04Z"}
|
||||||
|
{"handle": "https://www.linkedin.com/company/insurance-management-group_2/", "name": "Insurance Management Group", "descriptor": "Insurance • Marion, Indiana", "about": "InsuranceManagementGroup is an all-riskinsuranceagency with over 140 years of experience, specializing in Home, Auto, BusinessInsurance, Individual Life &Health, and Employee Benefits. We represent highly rated and financially soundinsurancecarriers, to ensure that our clients are getting the best coverage...", "followers": null, "people_url": "https://www.linkedin.com/company/insurance-management-group_2/people/", "captured_at": "2025-04-29T13:15:04Z"}
|
||||||
|
{"handle": "https://www.linkedin.com/company/carecard-health-insurance-management-co/", "name": "CareCard Health Insurance Management Co", "descriptor": "Insurance • Damascus", "about": "CareCard offers Business Process Outsourcing (BPO) services toInsurance, Self Funded and Retireehealthplan market. CareCard provides operational outsourcing...", "followers": 187, "people_url": "https://www.linkedin.com/company/carecard-health-insurance-management-co/people/", "captured_at": "2025-04-29T13:15:04Z"}
|
||||||
|
{"handle": "https://www.linkedin.com/company/healthcluster/", "name": "Health Cluster", "descriptor": "Technology, Information and Internet • Dubai", "about": "..., knowledge and interaction. The company has solutions and products inHealthTech, eHealth, DigitalHealth, Revenue CycleManagement– RCM Solutions, AI & ML, Internet...", "followers": null, "people_url": "https://www.linkedin.com/company/healthcluster/people/", "captured_at": "2025-04-29T13:15:04Z"}
|
||||||
108
docs/apps/linkdin/samples/people.jsonl
Normal file
108
docs/apps/linkdin/samples/people.jsonl
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
{"profile_url": null, "name": "Yahya Ipuge", "headline": "Senior Health Specialist, Independent Consultant, Certified Board Director, Board Chair in NGO and Private Entities", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/C4E03AQFuqPObSyLPMQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1517757008397?e=1751500800&v=beta&t=zaHc2CY7AJ-eX1MCSvazp8ny37iBAu3YsyaZjwq6gB0", "company_handle": "https://www.linkedin.com/company/health-insurance-management-services-organization/", "captured_at": "2025-04-29T13:15:33Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Field officer at Health and Insurance Management Services Organization", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C5103AQEVmdDwTIhsjQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1540989154156?e=1751500800&v=beta&t=7N0baJNfZ26dbrNNbv2055sbGlacQUwQu07wUTN0whs", "company_handle": "https://www.linkedin.com/company/health-insurance-management-services-organization/", "captured_at": "2025-04-29T13:15:33Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Medical Practitioner @ Health & Insurance | Master's Degree in Infection Control", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D4D03AQHjMXy7dSmmLg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1725975429410?e=1751500800&v=beta&t=lDIL2KhDw471XYvtCrRfkHAnG3Q-npDJnwDdK0sYvpA", "company_handle": "https://www.linkedin.com/company/health-insurance-management-services-organization/", "captured_at": "2025-04-29T13:15:34Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "--", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/health-insurance-management-services-organization/", "captured_at": "2025-04-29T13:15:38Z"}
|
||||||
|
{"profile_url": null, "name": "Fadhy Mtanga", "headline": "Executive Director at Health & Insurance Management Services Organization (HIMSO) Author | Creative Writer | Social Scientist", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4E03AQEloEreyg3qVQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1704391866585?e=1751500800&v=beta&t=86am-v3cjBPBldLTwgt8-AY-YbxFY6QZQzObwLTtMEA", "company_handle": "https://www.linkedin.com/company/health-insurance-management-services-organization/", "captured_at": "2025-04-29T13:15:38Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Business Administrator at Consultancy Business investments", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D4D03AQEuKXJmknr2YA/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1714545221728?e=1751500800&v=beta&t=zJG-rDZgYJJ0eROibf-Wag-v_JecCghwU3ul4TaH2Eg", "company_handle": "https://www.linkedin.com/company/national-health-insurance-management-authority/", "captured_at": "2025-04-29T13:15:48Z"}
|
||||||
|
{"profile_url": null, "name": "Tamani Phiri", "headline": "Corporate Business Strategy | Thought Leadership | Corporate Governance", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4D03AQF4mFx8jY2n-w/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1730302954035?e=1751500800&v=beta&t=i4QIrHA6A9eLtKolwTRNhuoiaTad28sf5KHxAFuXG-w", "company_handle": "https://www.linkedin.com/company/national-health-insurance-management-authority/", "captured_at": "2025-04-29T13:15:48Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Member Service Assistant @ National Health Insurance Management Authority (NHIMA) | Clinical Officer | Health Insurance & Public Health | Claims Processing & Customer Support | Data & Policy Analyst", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D4D03AQGob13KyxrB0g/profile-displayphoto-shrink_100_100/B4DZYCgreeHIAU-/0/1743798848889?e=1751500800&v=beta&t=uXxTsMLi5s7hr8FBEzVTDw7V3eJ85kpTaIC7i_5fM-Y", "company_handle": "https://www.linkedin.com/company/national-health-insurance-management-authority/", "captured_at": "2025-04-29T13:15:48Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Economist/ Development Analyst/ Planner/ Customer Care", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D4E03AQFEc3EgfdpZeg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1727782989867?e=1751500800&v=beta&t=dWjKzSu5FDRgmxAVret9jQPhWF2VjcrnmEpR2LDMC1Q", "company_handle": "https://www.linkedin.com/company/national-health-insurance-management-authority/", "captured_at": "2025-04-29T13:15:48Z"}
|
||||||
|
{"profile_url": null, "name": "Samantha Ngandwe", "headline": "Quality Assurance and Accreditation Officer at National Health Insurance Management Authority", "followers": 382, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4D03AQHyOjyoz7d95g/profile-displayphoto-shrink_100_100/B4DZYvvhP5GwAY-/0/1744557712084?e=1751500800&v=beta&t=DLYRpz20zmwUWx1UY1Dn-ykvgWBnwn8XHWLaDMf199M", "company_handle": "https://www.linkedin.com/company/national-health-insurance-management-authority/", "captured_at": "2025-04-29T13:15:48Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Dental Surgery Assistant at Health Promotion Board", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/health-alliance-plan/", "captured_at": "2025-04-29T13:16:11Z"}
|
||||||
|
{"profile_url": null, "name": "Liz England Tucker", "headline": "Medical Performance Optimization", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D5603AQFY6yx360QunQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1713831102587?e=1751500800&v=beta&t=u-C8Ozpl_ITkTpdgt5QD-C5_Qt7MA0DagLRmiuGKngQ", "company_handle": "https://www.linkedin.com/company/health-alliance-plan/", "captured_at": "2025-04-29T13:16:11Z"}
|
||||||
|
{"profile_url": null, "name": "Merrill Hausenfluck", "headline": "Chief Financial Officer", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4E03AQGKxDKRJM_BCg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1696292650180?e=1751500800&v=beta&t=NbUVC-QP-XL3frBpQcn3GtGrZ04Fl0xdko4V-mHxPag", "company_handle": "https://www.linkedin.com/company/health-alliance-plan/", "captured_at": "2025-04-29T13:16:11Z"}
|
||||||
|
{"profile_url": null, "name": "Mike Treash", "headline": "Senior Vice President and Chief Operating Officer at Health Alliance Plan", "followers": 2000, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D5603AQH_c6tIq929gw/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1720478900599?e=1751500800&v=beta&t=l9RLnLDKBBJjJQTsFMJMa_1MpWCKcV4AUa3dcjGnSXQ", "company_handle": "https://www.linkedin.com/company/health-alliance-plan/", "captured_at": "2025-04-29T13:16:11Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Manager at Health Alliance Plan", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/health-alliance-plan/", "captured_at": "2025-04-29T13:16:11Z"}
|
||||||
|
{"profile_url": null, "name": "Scot Dickerson", "headline": "Insurance Industry Specialist, Insurance Recruiter, Talent Acquisition, Talent Sourcing, Hiring Consultant, Career Consultant, Staffing, Executive Recruiter at Insurance Recruiting Solutions #insurancejobs #insurance", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D5603AQGLFvtPPU3HEw/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1724950672124?e=1751500800&v=beta&t=uT4SFSMF32O1d50Z0dbnd6zRRKdABHxSGlOZdxWdXBM", "company_handle": "https://www.linkedin.com/company/insurance-recruiting-solutions/", "captured_at": "2025-04-29T13:16:24Z"}
|
||||||
|
{"profile_url": null, "name": "Steele Dickerson", "headline": "Insurance Recruiting Solutions", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D5603AQEyICWaE_PvXA/profile-displayphoto-shrink_100_100/B56ZQuDHyZH0Ac-/0/1735939358232?e=1751500800&v=beta&t=9FdnWHrjnPQ7LQ5FdwC7sY8sS6hm-R4zfWO5Vmwm46w", "company_handle": "https://www.linkedin.com/company/insurance-recruiting-solutions/", "captured_at": "2025-04-29T13:16:24Z"}
|
||||||
|
{"profile_url": null, "name": "Madeline Judas", "headline": "Recruiting Operations & Business Development Specialist", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D5603AQG6xiTaJ71UiA/profile-displayphoto-shrink_100_100/B56ZU_N_jPHoAY-/0/1740522388021?e=1751500800&v=beta&t=CxvAsYgU0zelghZsRhUJOC26ILVovP3ZPn4nMnWkEJE", "company_handle": "https://www.linkedin.com/company/insurance-recruiting-solutions/", "captured_at": "2025-04-29T13:16:24Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "All Lines Claims Adjuster / General Lines Agent (Property & Casualty : Life, Accident, Health & HMO)", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D5603AQFTjkb7SxTWWg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1725920318474?e=1751500800&v=beta&t=BGEzQg1c2l8qxuy2iKJ896nElsiYcaWnhkf-mqc-KhY", "company_handle": "https://www.linkedin.com/company/insurance-recruiting-solutions/", "captured_at": "2025-04-29T13:16:24Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Clinical Pharmacy Manager at Health Plan of San Mateo (HPSM)", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C5603AQEPO0pZOxznoA/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1551565536585?e=1751500800&v=beta&t=qwMGzWX_Zefkciq8h2m9daLMflT0WoDr5F1R5pXvyM4", "company_handle": "https://www.linkedin.com/company/healthplanofsanmateo/", "captured_at": "2025-04-29T13:16:40Z"}
|
||||||
|
{"profile_url": null, "name": "Tamana M.", "headline": "MPH Candidate at Brown University | Data Coordinator", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D5603AQEY3iDtFmpzlg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1714197678074?e=1751500800&v=beta&t=IsVT0uC7A-T-Tp22gZFDG9wiT7LMB5GmhccuI8f9c-I", "company_handle": "https://www.linkedin.com/company/healthplanofsanmateo/", "captured_at": "2025-04-29T13:16:40Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Program Manager", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/healthplanofsanmateo/", "captured_at": "2025-04-29T13:16:40Z"}
|
||||||
|
{"profile_url": null, "name": "Mackenzie Baysinger Moniz, MSW", "headline": "Program Manager at Health Plan of San Mateo", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D5603AQHAd3A4zLyuWA/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1675716742150?e=1751500800&v=beta&t=ot3fMyJFnHwwNfKJiA_YxZp6MOK_iVGtSCUgVNq867g", "company_handle": "https://www.linkedin.com/company/healthplanofsanmateo/", "captured_at": "2025-04-29T13:16:40Z"}
|
||||||
|
{"profile_url": null, "name": "John O.", "headline": "Healthcare Delivery Strategy Execution", "followers": null, "connection_degree": "· 3rd", "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/healthplanofsanmateo/", "captured_at": "2025-04-29T13:16:40Z"}
|
||||||
|
{"profile_url": null, "name": "Daniel McQuilkin", "headline": "Senior Vice President", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/C4E03AQFkScOqwhxvfQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1521406683682?e=1751500800&v=beta&t=iohhak3lrV1gpmA6dnoCxTRJidskfgmZUXKbNQbkxjs", "company_handle": "https://www.linkedin.com/company/insurance-management-group_2/", "captured_at": "2025-04-29T13:17:05Z"}
|
||||||
|
{"profile_url": null, "name": "Tony Bonacuse", "headline": "Senior Vice President at Insurance Management Group", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/C5603AQF_JJOFLjkZoQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1516269003018?e=1751500800&v=beta&t=0APZt5RNhvUj4IxsSdi7JO9KxezZzOH_WQCibn5Szgs", "company_handle": "https://www.linkedin.com/company/insurance-management-group_2/", "captured_at": "2025-04-29T13:17:05Z"}
|
||||||
|
{"profile_url": null, "name": "Mark Bilger", "headline": "Director - Sr. Vice President at Insurance Management Group", "followers": 1000, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/C4D03AQEzX5qUfqhd2g/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1663842785708?e=1751500800&v=beta&t=YyKXRQol0cDntoq8vbdxyaRvEFf0vWKNHPxk0cyWiG8", "company_handle": "https://www.linkedin.com/company/insurance-management-group_2/", "captured_at": "2025-04-29T13:17:05Z"}
|
||||||
|
{"profile_url": null, "name": "Adam Young, MBA", "headline": "Husband | Father | Traveler | Sports Fanatic | Food Enthusiast | Independent Insurance Professional", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/C5603AQErWIq1AVyxKg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1601480475688?e=1751500800&v=beta&t=jK_mhX0PkDdG8WBZaipIIYRDm1PnWIuFR7sCKDhDi6s", "company_handle": "https://www.linkedin.com/company/insurance-management-group_2/", "captured_at": "2025-04-29T13:17:05Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Senior Vice President at Insurance Management Group / Partner", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C5603AQH3dm30dXH82w/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1572228299104?e=1751500800&v=beta&t=iuBQYs4iLHJgRgjFbSA2YiNiAI8zDILqg-nVsLR9Qjk", "company_handle": "https://www.linkedin.com/company/insurance-management-group_2/", "captured_at": "2025-04-29T13:17:05Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Doctor at CareCard Health Insurance Management Co", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/carecard-health-insurance-management-co/", "captured_at": "2025-04-29T13:17:09Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Pharmacist", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C4D03AQHyPi4Amu_Dkw/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1640460490377?e=1751500800&v=beta&t=q7R_b7bD9CR-1-Dvu81WoEHN_ljHK16l6ioTIA0LN7Q", "company_handle": "https://www.linkedin.com/company/carecard-health-insurance-management-co/", "captured_at": "2025-04-29T13:17:09Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "IT Manager at CareCard Health Insurance Management Co", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/carecard-health-insurance-management-co/", "captured_at": "2025-04-29T13:17:09Z"}
|
||||||
|
{"profile_url": null, "name": "Amal Shabani", "headline": "at carecard", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/C5603AQFLzeP3yPkjgg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1519625412373?e=1751500800&v=beta&t=GULSoesSn83F_fYkkH_nPxWIjjs1d9Pucc3dUDNei6I", "company_handle": "https://www.linkedin.com/company/carecard-health-insurance-management-co/", "captured_at": "2025-04-29T13:17:09Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "--", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/carecard-health-insurance-management-co/", "captured_at": "2025-04-29T13:17:09Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Biologiste | Pharmaco-épidemiologie & Pharmaco-économie | Software Helath Care Management", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C4D03AQHOPXrX5-oeug/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1663013895834?e=1751500800&v=beta&t=yE2RGp0rfhcJkjh_vdM0VwpaPUtoPewM80lTlr20OHU", "company_handle": "https://www.linkedin.com/company/healthcluster/", "captured_at": "2025-04-29T13:17:14Z"}
|
||||||
|
{"profile_url": null, "name": "Ruqaia Ali Alkhalifa", "headline": " RN,BSN, MSN,NE Database Officer for Scholarship Programs and Central Committee rapporteur at Al-Ahsa Health Cluster.", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4E03AQGfNujqDnuZDA/profile-displayphoto-shrink_100_100/B4EZOvsQThH0AU-/0/1733819436577?e=1751500800&v=beta&t=jleAVvhbg0H85tSi9TG96x0fqdkS1oytfaU02LHsFEI", "company_handle": "https://www.linkedin.com/company/healthcluster/", "captured_at": "2025-04-29T13:17:14Z"}
|
||||||
|
{"profile_url": null, "name": "Fahad Mohyuddin", "headline": "Healthcare AI Strategist | Digital Health | SaaS | Telehealth | HIS | EHR | IoT", "followers": 7000, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/C4E03AQFLnPh8fu-HHg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1647320077586?e=1751500800&v=beta&t=S__knVzEVrGZuyqwszCe_5V_kawbG5tejmmEe3fkMJE", "company_handle": "https://www.linkedin.com/company/healthcluster/", "captured_at": "2025-04-29T13:17:14Z"}
|
||||||
|
{"profile_url": null, "name": "Muhammad Moid Shams", "headline": "Azure DevOps | AWS Cloud Infrastructure| Freight Tech | Health Tech | HL7- NABIDH | HL7+ FHIR | KSA -NPHIES | FHIR - MOPH | HL7- Riayati | Freight Tech | Insure Tech | with Azure, Azure AI , PowerApps, D365 , M365", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4D03AQEzousRurY2Zg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1711283874675?e=1751500800&v=beta&t=ZheuoRIAkS_9M8WXafdwB1nJEuy-a5HEsrXlfOANx80", "company_handle": "https://www.linkedin.com/company/healthcluster/", "captured_at": "2025-04-29T13:17:14Z"}
|
||||||
|
{"profile_url": null, "name": "Muhammad Shahzaib (PMP® - SCRUM®)", "headline": "PMP-Certified Project Manager | Health Care & Web Solutions Expert | Customer Success & Operations Management Expert | Business Transformation Expert", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4D35AQFyp7WcBZinYA/profile-framedphoto-shrink_100_100/profile-framedphoto-shrink_100_100/0/1730638721808?e=1746540000&v=beta&t=QoGze1AlotUfm3K9kMWG6ZGVHS3ADu38THVPlxlYUys", "company_handle": "https://www.linkedin.com/company/healthcluster/", "captured_at": "2025-04-29T13:17:14Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Music Professional at Health Options Worldwide", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C5103AQGF-Dp6v6nkGw/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1585401654822?e=1751500800&v=beta&t=7yeO-dGz1p_B66cJVSlTSdAYJLMFFwxPIhwwcR8uWWo", "company_handle": "https://www.linkedin.com/company/health-options-worldwide/", "captured_at": "2025-04-29T13:17:17Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Trainer/instructor at Health Options Worldwide", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/health-options-worldwide/", "captured_at": "2025-04-29T13:17:17Z"}
|
||||||
|
{"profile_url": null, "name": "Michael Akpoarebe-Isaac", "headline": "Chief Operating officer, Health Partners HMO", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/C4E03AQE7KNFaLMyqYg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1602714385413?e=1751500800&v=beta&t=In5GaREqoXtO3sPCx9ZJJBwIPY4008ii13RPRl0w0Fw", "company_handle": "https://www.linkedin.com/company/healthpartnersng/", "captured_at": "2025-04-29T13:17:22Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "REGISTERED NURSE/CLAIMS SUPERVISOR/HEALTH EDUCATOR/ CASE MANAGER/ Lekki.", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D4E35AQEEqf5i5pD76g/profile-framedphoto-shrink_100_100/profile-framedphoto-shrink_100_100/0/1724219552412?e=1746540000&v=beta&t=1PAfKEpQFL196LZHfY0wHAZ35TH0fRjku9ihSfDdOk4", "company_handle": "https://www.linkedin.com/company/healthpartnersng/", "captured_at": "2025-04-29T13:17:22Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Data Analyst|Dedicated Retention Officer Boosting Customer Loyalty| Business Developer/ Event planner", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D4D35AQHIgeS1H7w65w/profile-framedphoto-shrink_100_100/B4DZV1QfcCGcAk-/0/1741429012517?e=1746540000&v=beta&t=NCIbW7MWY7Cy4YEC2xzLoX54-Lm5CNhorbuSQe0lZSk", "company_handle": "https://www.linkedin.com/company/healthpartnersng/", "captured_at": "2025-04-29T13:17:22Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Former Group managing director at Health Partners Ltd", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C4E03AQHPQPvIQbPQPg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1583328612508?e=1751500800&v=beta&t=LpynArccJCWrdWMSBvYLH4SI5G-xae7ECoWUUAl_CeU", "company_handle": "https://www.linkedin.com/company/healthpartnersng/", "captured_at": "2025-04-29T13:17:22Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "HEAD, FINCON, @ HEALTH PARTNERS (HMO) LTD", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C4D03AQG8XOvnazEibQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1518882054975?e=1751500800&v=beta&t=5gT6GAWGTqYfpvkjOk0ArvV73I_KspkWXgoG-VhoStg", "company_handle": "https://www.linkedin.com/company/healthpartnersng/", "captured_at": "2025-04-29T13:17:22Z"}
|
||||||
|
{"profile_url": null, "name": "Yahya Ipuge", "headline": "Senior Health Specialist, Independent Consultant, Certified Board Director, Board Chair in NGO and Private Entities", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/C4E03AQFuqPObSyLPMQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1517757008397?e=1751500800&v=beta&t=zaHc2CY7AJ-eX1MCSvazp8ny37iBAu3YsyaZjwq6gB0", "company_handle": "https://www.linkedin.com/company/health-insurance-management-services-organization/", "captured_at": "2025-04-30T07:36:39Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Field officer at Health and Insurance Management Services Organization", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C5103AQEVmdDwTIhsjQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1540989154156?e=1751500800&v=beta&t=7N0baJNfZ26dbrNNbv2055sbGlacQUwQu07wUTN0whs", "company_handle": "https://www.linkedin.com/company/health-insurance-management-services-organization/", "captured_at": "2025-04-30T07:36:39Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Medical Practitioner @ Health & Insurance | Master's Degree in Infection Control", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D4D03AQHjMXy7dSmmLg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1725975429410?e=1751500800&v=beta&t=lDIL2KhDw471XYvtCrRfkHAnG3Q-npDJnwDdK0sYvpA", "company_handle": "https://www.linkedin.com/company/health-insurance-management-services-organization/", "captured_at": "2025-04-30T07:36:39Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "--", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/health-insurance-management-services-organization/", "captured_at": "2025-04-30T07:36:39Z"}
|
||||||
|
{"profile_url": null, "name": "Fadhy Mtanga", "headline": "Executive Director at Health & Insurance Management Services Organization (HIMSO) Author | Creative Writer | Social Scientist", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4E03AQEloEreyg3qVQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1704391866585?e=1751500800&v=beta&t=86am-v3cjBPBldLTwgt8-AY-YbxFY6QZQzObwLTtMEA", "company_handle": "https://www.linkedin.com/company/health-insurance-management-services-organization/", "captured_at": "2025-04-30T07:36:39Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Member Service Assistant @ National Health Insurance Management Authority (NHIMA) | Clinical Officer | Health Insurance & Public Health | Claims Processing & Customer Support | Data & Policy Analyst", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D4D03AQGob13KyxrB0g/profile-displayphoto-shrink_100_100/B4DZYCgreeHIAU-/0/1743798848889?e=1751500800&v=beta&t=uXxTsMLi5s7hr8FBEzVTDw7V3eJ85kpTaIC7i_5fM-Y", "company_handle": "https://www.linkedin.com/company/national-health-insurance-management-authority/", "captured_at": "2025-04-30T07:36:45Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Business Administrator at Consultancy Business investments", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D4D03AQEuKXJmknr2YA/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1714545221728?e=1751500800&v=beta&t=zJG-rDZgYJJ0eROibf-Wag-v_JecCghwU3ul4TaH2Eg", "company_handle": "https://www.linkedin.com/company/national-health-insurance-management-authority/", "captured_at": "2025-04-30T07:36:45Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Economist/ Development Analyst/ Planner/ Customer Care", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D4E03AQFEc3EgfdpZeg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1727782989867?e=1751500800&v=beta&t=dWjKzSu5FDRgmxAVret9jQPhWF2VjcrnmEpR2LDMC1Q", "company_handle": "https://www.linkedin.com/company/national-health-insurance-management-authority/", "captured_at": "2025-04-30T07:36:45Z"}
|
||||||
|
{"profile_url": null, "name": "Tamani Phiri", "headline": "Corporate Business Strategy | Thought Leadership | Corporate Governance", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4D03AQF4mFx8jY2n-w/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1730302954035?e=1751500800&v=beta&t=i4QIrHA6A9eLtKolwTRNhuoiaTad28sf5KHxAFuXG-w", "company_handle": "https://www.linkedin.com/company/national-health-insurance-management-authority/", "captured_at": "2025-04-30T07:36:45Z"}
|
||||||
|
{"profile_url": null, "name": "Samantha Ngandwe", "headline": "Quality Assurance and Accreditation Officer at National Health Insurance Management Authority", "followers": 382, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4D03AQHyOjyoz7d95g/profile-displayphoto-shrink_100_100/B4DZYvvhP5GwAY-/0/1744557712084?e=1751500800&v=beta&t=DLYRpz20zmwUWx1UY1Dn-ykvgWBnwn8XHWLaDMf199M", "company_handle": "https://www.linkedin.com/company/national-health-insurance-management-authority/", "captured_at": "2025-04-30T07:36:45Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Dental Surgery Assistant at Health Promotion Board", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/health-alliance-plan/", "captured_at": "2025-04-30T07:36:51Z"}
|
||||||
|
{"profile_url": null, "name": "Merrill Hausenfluck", "headline": "Chief Financial Officer", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4E03AQGKxDKRJM_BCg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1696292650180?e=1751500800&v=beta&t=NbUVC-QP-XL3frBpQcn3GtGrZ04Fl0xdko4V-mHxPag", "company_handle": "https://www.linkedin.com/company/health-alliance-plan/", "captured_at": "2025-04-30T07:36:51Z"}
|
||||||
|
{"profile_url": null, "name": "Mike Treash", "headline": "Senior Vice President and Chief Operating Officer at Health Alliance Plan", "followers": 2000, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D5603AQH_c6tIq929gw/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1720478900599?e=1751500800&v=beta&t=l9RLnLDKBBJjJQTsFMJMa_1MpWCKcV4AUa3dcjGnSXQ", "company_handle": "https://www.linkedin.com/company/health-alliance-plan/", "captured_at": "2025-04-30T07:36:51Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Manager at Health Alliance Plan", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/health-alliance-plan/", "captured_at": "2025-04-30T07:36:51Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Manager, Government Programs at Health Alliance Plan", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C5603AQF473eFGZeIpQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1654455840818?e=1751500800&v=beta&t=FllKCznSi0Ndm75QYy2i5UDtflCojNGkVzRpoChPC8c", "company_handle": "https://www.linkedin.com/company/health-alliance-plan/", "captured_at": "2025-04-30T07:36:51Z"}
|
||||||
|
{"profile_url": null, "name": "Steele Dickerson", "headline": "Insurance Recruiting Solutions", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D5603AQEyICWaE_PvXA/profile-displayphoto-shrink_100_100/B56ZQuDHyZH0Ac-/0/1735939358232?e=1751500800&v=beta&t=9FdnWHrjnPQ7LQ5FdwC7sY8sS6hm-R4zfWO5Vmwm46w", "company_handle": "https://www.linkedin.com/company/insurance-recruiting-solutions/", "captured_at": "2025-04-30T07:36:56Z"}
|
||||||
|
{"profile_url": null, "name": "Yahya Ipuge", "headline": "Senior Health Specialist, Independent Consultant, Certified Board Director, Board Chair in NGO and Private Entities", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/C4E03AQFuqPObSyLPMQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1517757008397?e=1751500800&v=beta&t=zaHc2CY7AJ-eX1MCSvazp8ny37iBAu3YsyaZjwq6gB0", "company_handle": "https://www.linkedin.com/company/health-insurance-management-services-organization/", "captured_at": "2025-04-30T07:44:32+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Field officer at Health and Insurance Management Services Organization", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C5103AQEVmdDwTIhsjQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1540989154156?e=1751500800&v=beta&t=7N0baJNfZ26dbrNNbv2055sbGlacQUwQu07wUTN0whs", "company_handle": "https://www.linkedin.com/company/health-insurance-management-services-organization/", "captured_at": "2025-04-30T07:44:32+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Medical Practitioner @ Health & Insurance | Master's Degree in Infection Control", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D4D03AQHjMXy7dSmmLg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1725975429410?e=1751500800&v=beta&t=lDIL2KhDw471XYvtCrRfkHAnG3Q-npDJnwDdK0sYvpA", "company_handle": "https://www.linkedin.com/company/health-insurance-management-services-organization/", "captured_at": "2025-04-30T07:44:32+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "--", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/health-insurance-management-services-organization/", "captured_at": "2025-04-30T07:44:32+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Fadhy Mtanga", "headline": "Executive Director at Health & Insurance Management Services Organization (HIMSO) Author | Creative Writer | Social Scientist", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4E03AQEloEreyg3qVQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1704391866585?e=1751500800&v=beta&t=86am-v3cjBPBldLTwgt8-AY-YbxFY6QZQzObwLTtMEA", "company_handle": "https://www.linkedin.com/company/health-insurance-management-services-organization/", "captured_at": "2025-04-30T07:44:32+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Member Service Assistant @ National Health Insurance Management Authority (NHIMA) | Clinical Officer | Health Insurance & Public Health | Claims Processing & Customer Support | Data & Policy Analyst", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D4D03AQGob13KyxrB0g/profile-displayphoto-shrink_100_100/B4DZYCgreeHIAU-/0/1743798848889?e=1751500800&v=beta&t=uXxTsMLi5s7hr8FBEzVTDw7V3eJ85kpTaIC7i_5fM-Y", "company_handle": "https://www.linkedin.com/company/national-health-insurance-management-authority/", "captured_at": "2025-04-30T07:44:38+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Business Administrator at Consultancy Business investments", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D4D03AQEuKXJmknr2YA/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1714545221728?e=1751500800&v=beta&t=zJG-rDZgYJJ0eROibf-Wag-v_JecCghwU3ul4TaH2Eg", "company_handle": "https://www.linkedin.com/company/national-health-insurance-management-authority/", "captured_at": "2025-04-30T07:44:38+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Economist/ Development Analyst/ Planner/ Customer Care", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D4E03AQFEc3EgfdpZeg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1727782989867?e=1751500800&v=beta&t=dWjKzSu5FDRgmxAVret9jQPhWF2VjcrnmEpR2LDMC1Q", "company_handle": "https://www.linkedin.com/company/national-health-insurance-management-authority/", "captured_at": "2025-04-30T07:44:38+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Tamani Phiri", "headline": "Corporate Business Strategy | Thought Leadership | Corporate Governance", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4D03AQF4mFx8jY2n-w/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1730302954035?e=1751500800&v=beta&t=i4QIrHA6A9eLtKolwTRNhuoiaTad28sf5KHxAFuXG-w", "company_handle": "https://www.linkedin.com/company/national-health-insurance-management-authority/", "captured_at": "2025-04-30T07:44:38+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Samantha Ngandwe", "headline": "Quality Assurance and Accreditation Officer at National Health Insurance Management Authority", "followers": 382, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4D03AQHyOjyoz7d95g/profile-displayphoto-shrink_100_100/B4DZYvvhP5GwAY-/0/1744557712084?e=1751500800&v=beta&t=DLYRpz20zmwUWx1UY1Dn-ykvgWBnwn8XHWLaDMf199M", "company_handle": "https://www.linkedin.com/company/national-health-insurance-management-authority/", "captured_at": "2025-04-30T07:44:38+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Dental Surgery Assistant at Health Promotion Board", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/health-alliance-plan/", "captured_at": "2025-04-30T07:44:43+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Merrill Hausenfluck", "headline": "Chief Financial Officer", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4E03AQGKxDKRJM_BCg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1696292650180?e=1751500800&v=beta&t=NbUVC-QP-XL3frBpQcn3GtGrZ04Fl0xdko4V-mHxPag", "company_handle": "https://www.linkedin.com/company/health-alliance-plan/", "captured_at": "2025-04-30T07:44:43+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Mike Treash", "headline": "Senior Vice President and Chief Operating Officer at Health Alliance Plan", "followers": 2000, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D5603AQH_c6tIq929gw/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1720478900599?e=1751500800&v=beta&t=l9RLnLDKBBJjJQTsFMJMa_1MpWCKcV4AUa3dcjGnSXQ", "company_handle": "https://www.linkedin.com/company/health-alliance-plan/", "captured_at": "2025-04-30T07:44:43+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Manager at Health Alliance Plan", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/health-alliance-plan/", "captured_at": "2025-04-30T07:44:43+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Manager, Government Programs at Health Alliance Plan", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C5603AQF473eFGZeIpQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1654455840818?e=1751500800&v=beta&t=FllKCznSi0Ndm75QYy2i5UDtflCojNGkVzRpoChPC8c", "company_handle": "https://www.linkedin.com/company/health-alliance-plan/", "captured_at": "2025-04-30T07:44:43+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Steele Dickerson", "headline": "Insurance Recruiting Solutions", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D5603AQEyICWaE_PvXA/profile-displayphoto-shrink_100_100/B56ZQuDHyZH0Ac-/0/1735939358232?e=1751500800&v=beta&t=9FdnWHrjnPQ7LQ5FdwC7sY8sS6hm-R4zfWO5Vmwm46w", "company_handle": "https://www.linkedin.com/company/insurance-recruiting-solutions/", "captured_at": "2025-04-30T07:44:48+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Scot Dickerson", "headline": "Insurance Industry Specialist, Insurance Recruiter, Talent Acquisition, Talent Sourcing, Hiring Consultant, Career Consultant, Staffing, Executive Recruiter at Insurance Recruiting Solutions #insurancejobs #insurance", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D5603AQGLFvtPPU3HEw/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1724950672124?e=1751500800&v=beta&t=uT4SFSMF32O1d50Z0dbnd6zRRKdABHxSGlOZdxWdXBM", "company_handle": "https://www.linkedin.com/company/insurance-recruiting-solutions/", "captured_at": "2025-04-30T07:44:48+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Madeline Judas", "headline": "Recruiting Operations & Business Development Specialist", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D5603AQG6xiTaJ71UiA/profile-displayphoto-shrink_100_100/B56ZU_N_jPHoAY-/0/1740522388021?e=1751500800&v=beta&t=CxvAsYgU0zelghZsRhUJOC26ILVovP3ZPn4nMnWkEJE", "company_handle": "https://www.linkedin.com/company/insurance-recruiting-solutions/", "captured_at": "2025-04-30T07:44:48+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "All Lines Claims Adjuster / General Lines Agent (Property & Casualty : Life, Accident, Health & HMO)", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D5603AQFTjkb7SxTWWg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1725920318474?e=1751500800&v=beta&t=BGEzQg1c2l8qxuy2iKJ896nElsiYcaWnhkf-mqc-KhY", "company_handle": "https://www.linkedin.com/company/insurance-recruiting-solutions/", "captured_at": "2025-04-30T07:44:48+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Clinical Pharmacy Manager at Health Plan of San Mateo (HPSM)", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C5603AQEPO0pZOxznoA/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1551565536585?e=1751500800&v=beta&t=qwMGzWX_Zefkciq8h2m9daLMflT0WoDr5F1R5pXvyM4", "company_handle": "https://www.linkedin.com/company/healthplanofsanmateo/", "captured_at": "2025-04-30T07:44:54+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Tamana M.", "headline": "MPH Candidate at Brown University | Data Coordinator", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D5603AQEY3iDtFmpzlg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1714197678074?e=1751500800&v=beta&t=IsVT0uC7A-T-Tp22gZFDG9wiT7LMB5GmhccuI8f9c-I", "company_handle": "https://www.linkedin.com/company/healthplanofsanmateo/", "captured_at": "2025-04-30T07:44:54+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Program Manager", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/healthplanofsanmateo/", "captured_at": "2025-04-30T07:44:54+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Mackenzie Baysinger Moniz, MSW", "headline": "Program Manager at Health Plan of San Mateo", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D5603AQHAd3A4zLyuWA/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1675716742150?e=1751500800&v=beta&t=ot3fMyJFnHwwNfKJiA_YxZp6MOK_iVGtSCUgVNq867g", "company_handle": "https://www.linkedin.com/company/healthplanofsanmateo/", "captured_at": "2025-04-30T07:44:54+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "John O.", "headline": "Healthcare Delivery Strategy Execution", "followers": null, "connection_degree": "· 3rd", "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/healthplanofsanmateo/", "captured_at": "2025-04-30T07:44:54+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Daniel McQuilkin", "headline": "Senior Vice President", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/C4E03AQFkScOqwhxvfQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1521406683682?e=1751500800&v=beta&t=iohhak3lrV1gpmA6dnoCxTRJidskfgmZUXKbNQbkxjs", "company_handle": "https://www.linkedin.com/company/insurance-management-group_2/", "captured_at": "2025-04-30T07:44:59+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Tony Bonacuse", "headline": "Senior Vice President at Insurance Management Group", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/C5603AQF_JJOFLjkZoQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1516269003018?e=1751500800&v=beta&t=0APZt5RNhvUj4IxsSdi7JO9KxezZzOH_WQCibn5Szgs", "company_handle": "https://www.linkedin.com/company/insurance-management-group_2/", "captured_at": "2025-04-30T07:44:59+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Mark Bilger", "headline": "Director - Sr. Vice President at Insurance Management Group", "followers": 1000, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/C4D03AQEzX5qUfqhd2g/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1663842785708?e=1751500800&v=beta&t=YyKXRQol0cDntoq8vbdxyaRvEFf0vWKNHPxk0cyWiG8", "company_handle": "https://www.linkedin.com/company/insurance-management-group_2/", "captured_at": "2025-04-30T07:44:59+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Senior Vice President at Insurance Management Group / Partner", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C5603AQH3dm30dXH82w/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1572228299104?e=1751500800&v=beta&t=iuBQYs4iLHJgRgjFbSA2YiNiAI8zDILqg-nVsLR9Qjk", "company_handle": "https://www.linkedin.com/company/insurance-management-group_2/", "captured_at": "2025-04-30T07:44:59+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Adam Young, MBA", "headline": "Husband | Father | Traveler | Sports Fanatic | Food Enthusiast | Independent Insurance Professional", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/C5603AQErWIq1AVyxKg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1601480475688?e=1751500800&v=beta&t=jK_mhX0PkDdG8WBZaipIIYRDm1PnWIuFR7sCKDhDi6s", "company_handle": "https://www.linkedin.com/company/insurance-management-group_2/", "captured_at": "2025-04-30T07:44:59+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Doctor at CareCard Health Insurance Management Co", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/carecard-health-insurance-management-co/", "captured_at": "2025-04-30T07:45:04+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Pharmacist", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C4D03AQHyPi4Amu_Dkw/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1640460490377?e=1751500800&v=beta&t=q7R_b7bD9CR-1-Dvu81WoEHN_ljHK16l6ioTIA0LN7Q", "company_handle": "https://www.linkedin.com/company/carecard-health-insurance-management-co/", "captured_at": "2025-04-30T07:45:04+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "IT Manager at CareCard Health Insurance Management Co", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/carecard-health-insurance-management-co/", "captured_at": "2025-04-30T07:45:04+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Amal Shabani", "headline": "at carecard", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/C5603AQFLzeP3yPkjgg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1519625412373?e=1751500800&v=beta&t=GULSoesSn83F_fYkkH_nPxWIjjs1d9Pucc3dUDNei6I", "company_handle": "https://www.linkedin.com/company/carecard-health-insurance-management-co/", "captured_at": "2025-04-30T07:45:04+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "--", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/carecard-health-insurance-management-co/", "captured_at": "2025-04-30T07:45:04+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Biologiste | Pharmaco-épidemiologie & Pharmaco-économie | Software Helath Care Management", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C4D03AQHOPXrX5-oeug/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1663013895834?e=1751500800&v=beta&t=yE2RGp0rfhcJkjh_vdM0VwpaPUtoPewM80lTlr20OHU", "company_handle": "https://www.linkedin.com/company/healthcluster/", "captured_at": "2025-04-30T07:45:09+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Ruqaia Ali Alkhalifa", "headline": " RN,BSN, MSN,NE Database Officer for Scholarship Programs and Central Committee rapporteur at Al-Ahsa Health Cluster.", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4E03AQGfNujqDnuZDA/profile-displayphoto-shrink_100_100/B4EZOvsQThH0AU-/0/1733819436577?e=1751500800&v=beta&t=jleAVvhbg0H85tSi9TG96x0fqdkS1oytfaU02LHsFEI", "company_handle": "https://www.linkedin.com/company/healthcluster/", "captured_at": "2025-04-30T07:45:09+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Fahad Mohyuddin", "headline": "Healthcare AI Strategist | Digital Health | SaaS | Telehealth | HIS | EHR | IoT", "followers": 7000, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/C4E03AQFLnPh8fu-HHg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1647320077586?e=1751500800&v=beta&t=S__knVzEVrGZuyqwszCe_5V_kawbG5tejmmEe3fkMJE", "company_handle": "https://www.linkedin.com/company/healthcluster/", "captured_at": "2025-04-30T07:45:09+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Muhammad Moid Shams", "headline": "Azure DevOps | AWS Cloud Infrastructure| Freight Tech | Health Tech | HL7- NABIDH | HL7+ FHIR | KSA -NPHIES | FHIR - MOPH | HL7- Riayati | Freight Tech | Insure Tech | with Azure, Azure AI , PowerApps, D365 , M365", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4D03AQEzousRurY2Zg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1711283874675?e=1751500800&v=beta&t=ZheuoRIAkS_9M8WXafdwB1nJEuy-a5HEsrXlfOANx80", "company_handle": "https://www.linkedin.com/company/healthcluster/", "captured_at": "2025-04-30T07:45:09+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Muhammad Shahzaib (PMP® - SCRUM®)", "headline": "PMP-Certified Project Manager | Health Care & Web Solutions Expert | Customer Success & Operations Management Expert | Business Transformation Expert", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/D4D35AQFyp7WcBZinYA/profile-framedphoto-shrink_100_100/profile-framedphoto-shrink_100_100/0/1730638721808?e=1746604800&v=beta&t=oewST3uZcxrt48z76eiJgTxl1EPoo63Cq-JcTwrFTbs", "company_handle": "https://www.linkedin.com/company/healthcluster/", "captured_at": "2025-04-30T07:45:09+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Music Professional at Health Options Worldwide", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C5103AQGF-Dp6v6nkGw/profile-displayphoto-shrink_800_800/profile-displayphoto-shrink_800_800/0/1585401654822?e=1751500800&v=beta&t=7yeO-dGz1p_B66cJVSlTSdAYJLMFFwxPIhwwcR8uWWo", "company_handle": "https://www.linkedin.com/company/health-options-worldwide/", "captured_at": "2025-04-30T07:45:13+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Trainer/instructor at Health Options Worldwide", "followers": null, "connection_degree": null, "avatar_url": "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", "company_handle": "https://www.linkedin.com/company/health-options-worldwide/", "captured_at": "2025-04-30T07:45:13+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "Michael Akpoarebe-Isaac", "headline": "Chief Operating officer, Health Partners HMO", "followers": null, "connection_degree": "· 3rd", "avatar_url": "https://media.licdn.com/dms/image/v2/C4E03AQE7KNFaLMyqYg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1602714385413?e=1751500800&v=beta&t=In5GaREqoXtO3sPCx9ZJJBwIPY4008ii13RPRl0w0Fw", "company_handle": "https://www.linkedin.com/company/healthpartnersng/", "captured_at": "2025-04-30T07:45:19+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "REGISTERED NURSE/CLAIMS SUPERVISOR/HEALTH EDUCATOR/ CASE MANAGER/ Lekki.", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D4E35AQEEqf5i5pD76g/profile-framedphoto-shrink_100_100/profile-framedphoto-shrink_100_100/0/1724219552412?e=1746604800&v=beta&t=h0kqmp2KnpqQxsCCwyy7NpA8CAkSQ6qgbsZ0p0H7mXM", "company_handle": "https://www.linkedin.com/company/healthpartnersng/", "captured_at": "2025-04-30T07:45:19+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Data Analyst|Dedicated Retention Officer Boosting Customer Loyalty| Business Developer/ Event planner", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/D4D35AQHIgeS1H7w65w/profile-framedphoto-shrink_100_100/B4DZV1QfcCGcAk-/0/1741429012517?e=1746604800&v=beta&t=zZi8WjnLpDrQD271jAId2mnfld_hO538QrN1-q2G4Zw", "company_handle": "https://www.linkedin.com/company/healthpartnersng/", "captured_at": "2025-04-30T07:45:19+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "Former Group managing director at Health Partners Ltd", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C4E03AQHPQPvIQbPQPg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1583328612508?e=1751500800&v=beta&t=LpynArccJCWrdWMSBvYLH4SI5G-xae7ECoWUUAl_CeU", "company_handle": "https://www.linkedin.com/company/healthpartnersng/", "captured_at": "2025-04-30T07:45:19+00:00Z"}
|
||||||
|
{"profile_url": null, "name": "LinkedIn Member", "headline": "HEAD, FINCON, @ HEALTH PARTNERS (HMO) LTD", "followers": null, "connection_degree": null, "avatar_url": "https://media.licdn.com/dms/image/v2/C4D03AQG8XOvnazEibQ/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1518882054975?e=1751500800&v=beta&t=5gT6GAWGTqYfpvkjOk0ArvV73I_KspkWXgoG-VhoStg", "company_handle": "https://www.linkedin.com/company/healthpartnersng/", "captured_at": "2025-04-30T07:45:19+00:00Z"}
|
||||||
@@ -1,39 +1,51 @@
|
|||||||
{
|
{
|
||||||
"name": "LinkedIn Company Card",
|
"name": "LinkedIn Company Search Result Card",
|
||||||
"baseSelector": "div.search-results-container ul[role='list'] > li",
|
"baseSelector": "div[data-chameleon-result-urn][data-view-name=\"search-entity-result-universal-template\"]",
|
||||||
|
"baseFields": [
|
||||||
|
{
|
||||||
|
"name": "chameleon_result_urn",
|
||||||
|
"type": "attribute",
|
||||||
|
"attribute": "data-chameleon-result-urn"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "view_name",
|
||||||
|
"type": "attribute",
|
||||||
|
"attribute": "data-view-name"
|
||||||
|
}
|
||||||
|
],
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"name": "handle",
|
"name": "handle",
|
||||||
"selector": "a[href*='/company/']",
|
"selector": "div.mb1 div.display-flex span a[data-test-app-aware-link]",
|
||||||
"type": "attribute",
|
"type": "attribute",
|
||||||
"attribute": "href"
|
"attribute": "href"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "profile_image",
|
"name": "profile_image",
|
||||||
"selector": "a[href*='/company/'] img",
|
"selector": "div.ivm-image-view-model img",
|
||||||
"type": "attribute",
|
"type": "attribute",
|
||||||
"attribute": "src"
|
"attribute": "src"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "name",
|
"name": "name",
|
||||||
"selector": "span[class*='t-16'] a",
|
"selector": "div.mb1 div.display-flex span a[data-test-app-aware-link]",
|
||||||
"type": "text"
|
"type": "text"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "descriptor",
|
"name": "descriptor",
|
||||||
"selector": "div[class*='t-black t-normal']",
|
"selector": "div.mb1 > div[class*=\"t-14 t-black\"]",
|
||||||
"type": "text"
|
"type": "text"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "about",
|
"name": "about",
|
||||||
"selector": "p[class*='entity-result__summary--2-lines']",
|
"selector": "p.entity-result__summary--2-lines",
|
||||||
"type": "text"
|
"type": "text"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "followers",
|
"name": "followers",
|
||||||
"selector": "div:contains('followers')",
|
"selector": "div.mb1 > div:nth-of-type(3)",
|
||||||
"type": "regex",
|
"type": "regex",
|
||||||
"pattern": "(\\d+)\\s*followers"
|
"pattern": "(\\d+[KM]?) followers"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@@ -1,38 +1,41 @@
|
|||||||
{
|
{
|
||||||
"name": "LinkedIn People Card",
|
"name": "LinkedIn People Profile Card",
|
||||||
"baseSelector": "li.org-people-profile-card__profile-card-spacing",
|
"baseSelector": "li.org-people-profile-card__profile-card-spacing",
|
||||||
|
"baseFields": [],
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"name": "profile_url",
|
"name": "profile_url",
|
||||||
"selector": "a.eETATgYTipaVsmrBChiBJJvFsdPhNpulhPZUVLHLo",
|
"selector": "div.artdeco-entity-lockup__title a[data-test-app-aware-link]",
|
||||||
"type": "attribute",
|
"type": "attribute",
|
||||||
"attribute": "href"
|
"attribute": "href"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"name": "avatar_url",
|
||||||
|
"selector": "div.artdeco-entity-lockup__image img",
|
||||||
|
"type": "attribute",
|
||||||
|
"attribute": "src"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "name",
|
"name": "name",
|
||||||
"selector": ".artdeco-entity-lockup__title .lt-line-clamp--single-line",
|
"selector": "div.artdeco-entity-lockup__title a div.lt-line-clamp--single-line",
|
||||||
"type": "text"
|
"type": "text"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "headline",
|
"name": "headline",
|
||||||
"selector": ".artdeco-entity-lockup__subtitle .lt-line-clamp--multi-line",
|
"selector": "div.artdeco-entity-lockup__subtitle div.lt-line-clamp--multi-line",
|
||||||
"type": "text"
|
"type": "text"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "followers",
|
"name": "followers",
|
||||||
"selector": ".lt-line-clamp--multi-line.t-12",
|
"selector": "span.text-align-center span.lt-line-clamp--multi-line",
|
||||||
"type": "text"
|
"type": "regex",
|
||||||
|
"pattern": "(\\d+)"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "connection_degree",
|
"name": "connection_degree",
|
||||||
"selector": ".artdeco-entity-lockup__badge .artdeco-entity-lockup__degree",
|
"selector": "span.artdeco-entity-lockup__degree",
|
||||||
"type": "text"
|
"type": "regex",
|
||||||
},
|
"pattern": "(\\d+\\w+)"
|
||||||
{
|
|
||||||
"name": "avatar_url",
|
|
||||||
"selector": ".artdeco-entity-lockup__image img",
|
|
||||||
"type": "attribute",
|
|
||||||
"attribute": "src"
|
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@@ -1,36 +1,34 @@
|
|||||||
<li class="yCLWzruNprmIzaZzFFonVFBtMrbaVYnuDFA">
|
<li class="kZRArQqqhjjrHYceWaFbyEGWHRZbtqjTMawKA">
|
||||||
<!----><!---->
|
<!----><!---->
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<div class="IxlEPbRZwQYrRltKPvHAyjBmCdIWTAoYo" data-chameleon-result-urn="urn:li:company:362492"
|
<div class="xAuWirHJDUTuhkfOpmJApZWziplUyPIc" data-chameleon-result-urn="urn:li:company:2095237"
|
||||||
data-view-name="search-entity-result-universal-template">
|
data-view-name="search-entity-result-universal-template">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<div class="linked-area flex-1
|
<div class="linked-area flex-1
|
||||||
cursor-pointer">
|
cursor-pointer">
|
||||||
|
|
||||||
<div class="BAEgVqVuxosMJZodcelsgPoyRcrkiqgVCGHXNQ">
|
<div class="qMGLeKnJyQnibGOueKodvnfLgWpsuA">
|
||||||
<div class="afcvrbGzNuyRlhPPQWrWirJtUdHAAtUlqxwvVA">
|
<div class="cBPGFfFovHsbNhBFmECDIsPgMWmtMozOUfIAbs">
|
||||||
<div class="display-flex align-items-center">
|
<div class="display-flex align-items-center">
|
||||||
<!---->
|
<!---->
|
||||||
|
|
||||||
<a class="eETATgYTipaVsmrBChiBJJvFsdPhNpulhPZUVLHLo scale-down " aria-hidden="true"
|
<a class="sDWEFrcVubKuUVGggeBOYqLlgYgPbojOc scale-down " aria-hidden="true" tabindex="-1"
|
||||||
tabindex="-1" href="https://www.linkedin.com/company/managment-research-services-inc./"
|
href="https://www.linkedin.com/company/health-insurance/" data-test-app-aware-link="">
|
||||||
data-test-app-aware-link="">
|
|
||||||
|
|
||||||
<div class="ivm-image-view-model ">
|
<div class="ivm-image-view-model ">
|
||||||
|
|
||||||
<div class="ivm-view-attr__img-wrapper
|
<div class="ivm-view-attr__img-wrapper
|
||||||
|
|
||||||
">
|
">
|
||||||
<!---->
|
<!---->
|
||||||
<!----> <img width="48"
|
<!----> <img width="48"
|
||||||
src="https://media.licdn.com/dms/image/v2/C560BAQFWpusEOgW-ww/company-logo_100_100/company-logo_100_100/0/1630583697877/managment_research_services_inc_logo?e=1750896000&v=beta&t=Ch9vyEZdfng-1D1m_XqP5kjNpVXUBKkk9cNhMZUhx0E"
|
src="https://media.licdn.com/dms/image/v2/C560BAQEXIoLSJbShlw/company-logo_100_100/company-logo_100_100/0/1662748332921/health_insurance_logo?e=1753920000&v=beta&t=p2ZNMYNsC9KSlp-sIqMYuc88avBTjKF4CqDobq1Xr2M"
|
||||||
loading="lazy" height="48" alt="Management Research Services, Inc. (MRS, Inc)"
|
loading="lazy" height="48" alt="Health Insurance" id="ember28"
|
||||||
id="ember28"
|
|
||||||
class="ivm-view-attr__img--centered EntityPhoto-square-3 evi-image lazy-image ember-view">
|
class="ivm-view-attr__img--centered EntityPhoto-square-3 evi-image lazy-image ember-view">
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@@ -42,7 +40,7 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div
|
<div
|
||||||
class="wympnVuDByXHvafWrMGJLZuchDmCRqLmWPwg MmzCPRicJimZvjJhvqTzDcDbdHhWPzspERzA pt3 pb3 t-12 t-black--light">
|
class="BNxZPngZfeRnDrIUbICgBZvQjRvMAUnwCHuDrmRg yNRlrJOHDflDBnYPLbVmiAkUsCUZKUznmAc pt3 pb3 t-12 t-black--light">
|
||||||
<div class="mb1">
|
<div class="mb1">
|
||||||
|
|
||||||
<div class="t-roman t-sans">
|
<div class="t-roman t-sans">
|
||||||
@@ -50,13 +48,14 @@
|
|||||||
|
|
||||||
|
|
||||||
<div class="display-flex">
|
<div class="display-flex">
|
||||||
<span class="TikBXjihYvcNUoIzkslUaEjfIuLmYxfs OoHEyXgsiIqGADjcOtTmfdpoYVXrLKTvkwI ">
|
<span
|
||||||
<span class="CgaWLOzmXNuKbRIRARSErqCJcBPYudEKo
|
class="kmApjJVnFerynwITxTBSCqzqgoHwVfkiA HHGiVqODTCkszDUDWwPGPJGUPfAeRpygAKwwLePrQ ">
|
||||||
t-16">
|
<span class="OjTMoZLoiuspGuWWptwqxZRcMcHZBoSDxfig
|
||||||
<a class="eETATgYTipaVsmrBChiBJJvFsdPhNpulhPZUVLHLo "
|
t-16">
|
||||||
href="https://www.linkedin.com/company/managment-research-services-inc./"
|
<a class="sDWEFrcVubKuUVGggeBOYqLlgYgPbojOc "
|
||||||
|
href="https://www.linkedin.com/company/health-insurance/"
|
||||||
data-test-app-aware-link="">
|
data-test-app-aware-link="">
|
||||||
<!---->Management Research Services, Inc. (MRS, Inc)<!---->
|
<!---->Health Insurance<!---->
|
||||||
<!----> </a>
|
<!----> </a>
|
||||||
<!----> </span>
|
<!----> </span>
|
||||||
</span>
|
</span>
|
||||||
@@ -69,14 +68,14 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
<div class="LjmdKCEqKITHihFOiQsBAQylkdnsWhqZii
|
<div class="kFTZPhxHBbvnnRxiRPmTxafKGLUNSiaeInag
|
||||||
t-14 t-black t-normal">
|
t-14 t-black t-normal">
|
||||||
<!---->Insurance • Milwaukee, Wisconsin<!---->
|
<!---->Insurance ⢠Cardiff, CA<!---->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="cTPhJiHyNLmxdQYFlsEOutjznmqrVHUByZwZ
|
<div class="FlWUwyrEUZpkVCgzGTDwUHTLntfZNseavlY
|
||||||
t-14 t-normal">
|
t-14 t-normal">
|
||||||
<!---->1K followers<!---->
|
<!---->3K followers<!---->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
||||||
@@ -86,23 +85,19 @@
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!---->
|
<!---->
|
||||||
<p class="yWzlqwKNlvCWVNoKqmzoDDEnBMUuyynaLg
|
<p class="JBUEKeXhPyClEtYwdsASPYsZsCkTvUBqsDUs
|
||||||
entity-result__summary--2-lines
|
entity-result__summary--2-lines
|
||||||
t-12 t-black--light
|
t-12 t-black--light
|
||||||
">
|
">
|
||||||
<!---->MRS combines 30 years of experience supporting the Life,<span class="white-space-pre">
|
<!---->Your<span class="white-space-pre"> </span><strong><!---->health<!----></strong><span
|
||||||
</span><strong><!---->Health<!----></strong><span class="white-space-pre"> </span>and
|
class="white-space-pre"> </span><!----><!----><strong><!---->insurance<!----></strong><span
|
||||||
Annuities<span class="white-space-pre"> </span><strong><!---->Insurance<!----></strong><span
|
class="white-space-pre"> </span>expert for all stages of your life; Medicare, Individuals,
|
||||||
class="white-space-pre"> </span>Industry with customized<span class="white-space-pre">
|
Families, Small Groups, CoveredCA.<!---->
|
||||||
</span><strong><!---->insurance<!----></strong><span class="white-space-pre">
|
|
||||||
</span>underwriting solutions that efficiently support clients’ workflows. Supported by the
|
|
||||||
Agenium Platform (www.agenium.ai) our innovative underwriting solutions are guaranteed to
|
|
||||||
optimize requirements...<!---->
|
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
<!---->
|
<!---->
|
||||||
</div>
|
</div>
|
||||||
<div class="qXxdnXtzRVFTnTnetmNpssucBwQBsWlUuk MmzCPRicJimZvjJhvqTzDcDbdHhWPzspERzA">
|
<div class="JZcKRppsWfaxfMaqtvfVwEeAtzNwryBOMdo yNRlrJOHDflDBnYPLbVmiAkUsCUZKUznmAc">
|
||||||
<!---->
|
<!---->
|
||||||
|
|
||||||
|
|
||||||
@@ -111,7 +106,7 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
<button aria-label="Follow Management Research Services, Inc. (MRS, Inc)" id="ember61"
|
<button aria-label="Follow Health Insurance" id="ember49"
|
||||||
class="artdeco-button artdeco-button--2 artdeco-button--secondary ember-view"
|
class="artdeco-button artdeco-button--2 artdeco-button--secondary ember-view"
|
||||||
type="button"><!---->
|
type="button"><!---->
|
||||||
<span class="artdeco-button__text">
|
<span class="artdeco-button__text">
|
||||||
|
|||||||
@@ -2,41 +2,40 @@
|
|||||||
<div>
|
<div>
|
||||||
|
|
||||||
|
|
||||||
<section class="artdeco-card full-width qQdPErXQkSAbwApNgNfuxukTIPPykttCcZGOHk">
|
<section class="artdeco-card full-width IxXiAcHfbZpayHVZUYdQwfYOkMbOirmr">
|
||||||
<!---->
|
<!---->
|
||||||
|
|
||||||
<img width="210" src="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7"
|
<img width="210" src="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7"
|
||||||
ariarole="presentation" loading="lazy" height="210" alt="" id="ember96"
|
ariarole="presentation" loading="lazy" height="210" alt="" id="ember102"
|
||||||
class="evi-image lazy-image ghost-default ember-view org-people-profile-card__cover-photo org-people-profile-card__cover-photo--people">
|
class="evi-image lazy-image ghost-default ember-view org-people-profile-card__cover-photo org-people-profile-card__cover-photo--people">
|
||||||
|
|
||||||
<div class="org-people-profile-card__profile-info">
|
<div class="org-people-profile-card__profile-info">
|
||||||
<div id="ember97"
|
<div id="ember103"
|
||||||
class="artdeco-entity-lockup artdeco-entity-lockup--stacked-center artdeco-entity-lockup--size-7 ember-view">
|
class="artdeco-entity-lockup artdeco-entity-lockup--stacked-center artdeco-entity-lockup--size-7 ember-view">
|
||||||
<div id="ember98"
|
<div id="ember104"
|
||||||
class="artdeco-entity-lockup__image artdeco-entity-lockup__image--type-circle ember-view"
|
class="artdeco-entity-lockup__image artdeco-entity-lockup__image--type-circle ember-view"
|
||||||
type="circle">
|
type="circle">
|
||||||
|
|
||||||
<a class="eETATgYTipaVsmrBChiBJJvFsdPhNpulhPZUVLHLo "
|
<a class="sDWEFrcVubKuUVGggeBOYqLlgYgPbojOc " id="org-people-profile-card__profile-image-0"
|
||||||
id="org-people-profile-card__profile-image-0"
|
href="https://www.linkedin.com/in/ericweberhcbd?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAABVh2MBFoyTaAxDqYQQcW8oGxVsqsKioHw"
|
||||||
href="https://www.linkedin.com/in/speakerrayna?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAABsqUBoBr5x071PuGGpNtK3NlvSARiVXPIs"
|
|
||||||
data-test-app-aware-link="">
|
data-test-app-aware-link="">
|
||||||
<img width="104"
|
<img width="104"
|
||||||
src="https://media.licdn.com/dms/image/v2/D5603AQGs2Vyju4xZ7A/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1681741067031?e=1750896000&v=beta&t=Hvj--IrrmpVIH7pec7-l_PQok8vsS__CGeUqBWOw7co"
|
src="https://media.licdn.com/dms/image/v2/C4D03AQHNP9KoXtSrkg/profile-displayphoto-shrink_100_100/profile-displayphoto-shrink_100_100/0/1573501774845?e=1753920000&v=beta&t=JYsY56biGUmDzbYj2ORZMcd1dSm2IRWCA-IM3KNFLw8"
|
||||||
loading="lazy" height="104" alt="Dr. Rayna S." id="ember99"
|
loading="lazy" height="104" alt="Eric Weber" id="ember105"
|
||||||
class="evi-image lazy-image ember-view">
|
class="evi-image lazy-image ember-view">
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
<div id="ember100" class="artdeco-entity-lockup__content ember-view">
|
<div id="ember106" class="artdeco-entity-lockup__content ember-view">
|
||||||
<div id="ember101" class="artdeco-entity-lockup__title ember-view">
|
<div id="ember107" class="artdeco-entity-lockup__title ember-view">
|
||||||
<a class="eETATgYTipaVsmrBChiBJJvFsdPhNpulhPZUVLHLo link-without-visited-state"
|
<a class="sDWEFrcVubKuUVGggeBOYqLlgYgPbojOc link-without-visited-state"
|
||||||
aria-label="View Dr. Rayna S.’s profile"
|
aria-label="View Eric Weberâs profile"
|
||||||
href="https://www.linkedin.com/in/speakerrayna?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAABsqUBoBr5x071PuGGpNtK3NlvSARiVXPIs"
|
href="https://www.linkedin.com/in/ericweberhcbd?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAABVh2MBFoyTaAxDqYQQcW8oGxVsqsKioHw"
|
||||||
data-test-app-aware-link="">
|
data-test-app-aware-link="">
|
||||||
<div id="ember103" class="ember-view lt-line-clamp lt-line-clamp--single-line AGabuksChUpCmjWshSnaZryLKSthOKkwclxY
|
<div id="ember109" class="ember-view lt-line-clamp lt-line-clamp--single-line rMKrzkehlCEvJWoQjDQJFaHmBFAYQLMGrNY
|
||||||
t-black" style="">
|
t-black" style="">
|
||||||
Dr. Rayna S.
|
Eric Weber
|
||||||
|
|
||||||
<!---->
|
<!---->
|
||||||
</div>
|
</div>
|
||||||
@@ -44,33 +43,33 @@
|
|||||||
</a>
|
</a>
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
<div id="ember104" class="artdeco-entity-lockup__badge ember-view"> <span class="a11y-text">3rd+
|
<div id="ember110" class="artdeco-entity-lockup__badge ember-view"> <span class="a11y-text">3rd+
|
||||||
degree connection</span>
|
degree connection</span>
|
||||||
<span class="artdeco-entity-lockup__degree" aria-hidden="true">
|
<span class="artdeco-entity-lockup__degree" aria-hidden="true">
|
||||||
· 3rd
|
· 3rd
|
||||||
</span>
|
</span>
|
||||||
<!----><!---->
|
<!----><!---->
|
||||||
</div>
|
</div>
|
||||||
<div id="ember105" class="artdeco-entity-lockup__subtitle ember-view">
|
<div id="ember111" class="artdeco-entity-lockup__subtitle ember-view">
|
||||||
<div class="t-14 t-black--light t-normal">
|
<div class="t-14 t-black--light t-normal">
|
||||||
<div id="ember107" class="ember-view lt-line-clamp lt-line-clamp--multi-line"
|
<div id="ember113" class="ember-view lt-line-clamp lt-line-clamp--multi-line"
|
||||||
style="-webkit-line-clamp: 2">
|
style="-webkit-line-clamp: 2">
|
||||||
Leadership and Talent Development Consultant and Professional Speaker
|
HIPN Executive Editor | Healthcare BizDev CEO â Health Insurance Plan News.
|
||||||
|
|
||||||
<!---->
|
<!---->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div id="ember108" class="artdeco-entity-lockup__caption ember-view"></div>
|
<div id="ember114" class="artdeco-entity-lockup__caption ember-view"></div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
<span class="text-align-center">
|
<span class="text-align-center">
|
||||||
<span id="ember110"
|
<span id="ember116"
|
||||||
class="ember-view lt-line-clamp lt-line-clamp--multi-line t-12 t-black--light mt2"
|
class="ember-view lt-line-clamp lt-line-clamp--multi-line t-12 t-black--light mt2"
|
||||||
style="-webkit-line-clamp: 3">
|
style="-webkit-line-clamp: 3">
|
||||||
727 followers
|
10K followers
|
||||||
|
|
||||||
<!----> </span>
|
<!----> </span>
|
||||||
|
|
||||||
@@ -78,7 +77,7 @@
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<footer class="ph3 pb3">
|
<footer class="ph3 pb3">
|
||||||
<button aria-label="Follow Dr. Rayna S." id="ember111"
|
<button aria-label="Follow Eric Weber" id="ember117"
|
||||||
class="artdeco-button artdeco-button--2 artdeco-button--secondary ember-view full-width"
|
class="artdeco-button artdeco-button--2 artdeco-button--secondary ember-view full-width"
|
||||||
type="button"><!---->
|
type="button"><!---->
|
||||||
<span class="artdeco-button__text">
|
<span class="artdeco-button__text">
|
||||||
|
|||||||
37
docs/md_v2/assets/feedback-overrides.css
Normal file
37
docs/md_v2/assets/feedback-overrides.css
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
/* docs/assets/feedback-overrides.css */
|
||||||
|
:root {
|
||||||
|
/* brand */
|
||||||
|
--feedback-primary-color: #09b5a5;
|
||||||
|
--feedback-highlight-color: #fed500; /* stars etc */
|
||||||
|
|
||||||
|
/* modal shell / text */
|
||||||
|
--feedback-modal-content-bg-color: var(--background-color);
|
||||||
|
--feedback-modal-content-text-color: var(--font-color);
|
||||||
|
--feedback-modal-content-border-color: var(--primary-dimmed-color);
|
||||||
|
--feedback-modal-content-border-radius: 4px;
|
||||||
|
|
||||||
|
/* overlay */
|
||||||
|
--feedback-overlay-bg-color: rgba(0,0,0,.75);
|
||||||
|
|
||||||
|
/* rating buttons */
|
||||||
|
--feedback-modal-rating-button-color: var(--secondary-color);
|
||||||
|
--feedback-modal-rating-button-selected-color: var(--primary-color);
|
||||||
|
|
||||||
|
/* inputs */
|
||||||
|
--feedback-modal-input-bg-color: var(--code-bg-color);
|
||||||
|
--feedback-modal-input-text-color: var(--font-color);
|
||||||
|
--feedback-modal-input-border-color: var(--primary-dimmed-color);
|
||||||
|
--feedback-modal-input-border-color-focused: var(--primary-color);
|
||||||
|
|
||||||
|
/* submit / secondary buttons */
|
||||||
|
--feedback-modal-button-submit-bg-color: var(--primary-color);
|
||||||
|
--feedback-modal-button-submit-bg-color-hover: var(--primary-dimmed-color);
|
||||||
|
--feedback-modal-button-submit-text-color: var(--invert-font-color);
|
||||||
|
|
||||||
|
--feedback-modal-button-bg-color: transparent; /* screenshot btn */
|
||||||
|
--feedback-modal-button-border-color: var(--primary-color);
|
||||||
|
--feedback-modal-button-icon-color: var(--primary-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* optional: keep the “Powered by” link subtle */
|
||||||
|
.feedback-logo a{color:var(--secondary-color);}
|
||||||
5
docs/md_v2/assets/gtag.js
Normal file
5
docs/md_v2/assets/gtag.js
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
window.dataLayer = window.dataLayer || [];
|
||||||
|
function gtag(){dataLayer.push(arguments);}
|
||||||
|
gtag('js', new Date());
|
||||||
|
|
||||||
|
gtag('config', 'G-58W0K2ZQ25');
|
||||||
13263
docs/md_v2/assets/llmtxt/crawl4ai_all_examples_content.llm.txt
Normal file
13263
docs/md_v2/assets/llmtxt/crawl4ai_all_examples_content.llm.txt
Normal file
File diff suppressed because one or more lines are too long
16658
docs/md_v2/assets/llmtxt/crawl4ai_all_memory_content.llm.txt
Normal file
16658
docs/md_v2/assets/llmtxt/crawl4ai_all_memory_content.llm.txt
Normal file
File diff suppressed because it is too large
Load Diff
7708
docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt
Normal file
7708
docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt
Normal file
File diff suppressed because it is too large
Load Diff
7615
docs/md_v2/assets/llmtxt/crawl4ai_config_objects.llm.full.txt
Normal file
7615
docs/md_v2/assets/llmtxt/crawl4ai_config_objects.llm.full.txt
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,556 @@
|
|||||||
|
# Detailed Outline for crawl4ai - config_objects Component
|
||||||
|
|
||||||
|
**Target Document Type:** memory
|
||||||
|
**Target Output Filename Suggestion:** `llm_memory_config_objects.md`
|
||||||
|
**Library Version Context:** 0.6.3
|
||||||
|
**Outline Generation Date:** 2024-05-24
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Introduction to Configuration Objects in Crawl4ai
|
||||||
|
|
||||||
|
* **1.1. Purpose of Configuration Objects**
|
||||||
|
* Explanation: Configuration objects in `crawl4ai` serve to centralize and manage settings for various components and behaviors of the library. This includes browser setup, individual crawl run parameters, LLM provider interactions, proxy settings, and more.
|
||||||
|
* Benefit: This approach enhances code readability by grouping related settings, improves maintainability by providing a clear structure for configurations, and offers ease of customization for users to tailor the library's behavior to their specific needs.
|
||||||
|
* **1.2. General Principles and Usage**
|
||||||
|
* **1.2.1. Immutability/Cloning:**
|
||||||
|
* Concept: Most configuration objects are designed with a `clone()` method, allowing users to create modified copies without altering the original configuration instance. This promotes safer state management, especially when reusing base configurations for multiple tasks.
|
||||||
|
* Method: `clone(**kwargs)` on most configuration objects.
|
||||||
|
* **1.2.2. Serialization and Deserialization:**
|
||||||
|
* Concept: `crawl4ai` configuration objects can be serialized to dictionary format (e.g., for saving to JSON) and deserialized back into their respective class instances.
|
||||||
|
* Methods:
|
||||||
|
* `dump() -> dict`: Serializes the object to a dictionary suitable for JSON, often using the internal `to_serializable_dict` helper.
|
||||||
|
* `load(data: dict) -> ConfigClass` (Static Method): Deserializes an object from a dictionary, often using the internal `from_serializable_dict` helper.
|
||||||
|
* `to_dict() -> dict`: Converts the object to a standard Python dictionary.
|
||||||
|
* `from_dict(data: dict) -> ConfigClass` (Static Method): Creates an instance from a standard Python dictionary.
|
||||||
|
* Helper Functions:
|
||||||
|
* `crawl4ai.async_configs.to_serializable_dict(obj: Any, ignore_default_value: bool = False) -> Dict`: Recursively converts objects into a serializable dictionary format, handling complex types like enums and nested objects.
|
||||||
|
* `crawl4ai.async_configs.from_serializable_dict(data: Any) -> Any`: Reconstructs Python objects from the serializable dictionary format.
|
||||||
|
* **1.3. Scope of this Document**
|
||||||
|
* Statement: This document provides a factual API reference for the primary configuration objects within the `crawl4ai` library, detailing their purpose, initialization parameters, attributes, and key methods.
|
||||||
|
|
||||||
|
## 2. Core Configuration Objects
|
||||||
|
|
||||||
|
### 2.1. `BrowserConfig`
|
||||||
|
Located in `crawl4ai.async_configs`.
|
||||||
|
|
||||||
|
* **2.1.1. Purpose:**
|
||||||
|
* Description: The `BrowserConfig` class is used to configure the settings for a browser instance and its associated contexts when using browser-based crawler strategies like `AsyncPlaywrightCrawlerStrategy`. It centralizes all parameters that affect the creation and behavior of the browser.
|
||||||
|
* **2.1.2. Initialization (`__init__`)**
|
||||||
|
* Signature:
|
||||||
|
```python
|
||||||
|
class BrowserConfig:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
browser_type: str = "chromium",
|
||||||
|
headless: bool = True,
|
||||||
|
browser_mode: str = "dedicated",
|
||||||
|
use_managed_browser: bool = False,
|
||||||
|
cdp_url: Optional[str] = None,
|
||||||
|
use_persistent_context: bool = False,
|
||||||
|
user_data_dir: Optional[str] = None,
|
||||||
|
chrome_channel: Optional[str] = "chromium", # Note: 'channel' is preferred
|
||||||
|
channel: Optional[str] = "chromium",
|
||||||
|
proxy: Optional[str] = None,
|
||||||
|
proxy_config: Optional[Union[ProxyConfig, dict]] = None,
|
||||||
|
viewport_width: int = 1080,
|
||||||
|
viewport_height: int = 600,
|
||||||
|
viewport: Optional[dict] = None,
|
||||||
|
accept_downloads: bool = False,
|
||||||
|
downloads_path: Optional[str] = None,
|
||||||
|
storage_state: Optional[Union[str, dict]] = None,
|
||||||
|
ignore_https_errors: bool = True,
|
||||||
|
java_script_enabled: bool = True,
|
||||||
|
sleep_on_close: bool = False,
|
||||||
|
verbose: bool = True,
|
||||||
|
cookies: Optional[List[dict]] = None,
|
||||||
|
headers: Optional[dict] = None,
|
||||||
|
user_agent: Optional[str] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36",
|
||||||
|
user_agent_mode: Optional[str] = "",
|
||||||
|
user_agent_generator_config: Optional[dict] = None, # Default is {} in __init__
|
||||||
|
text_mode: bool = False,
|
||||||
|
light_mode: bool = False,
|
||||||
|
extra_args: Optional[List[str]] = None,
|
||||||
|
debugging_port: int = 9222,
|
||||||
|
host: str = "localhost"
|
||||||
|
): ...
|
||||||
|
```
|
||||||
|
* Parameters:
|
||||||
|
* `browser_type (str, default: "chromium")`: Specifies the browser engine to use. Supported values: `"chromium"`, `"firefox"`, `"webkit"`.
|
||||||
|
* `headless (bool, default: True)`: If `True`, runs the browser without a visible GUI. Set to `False` for debugging or visual interaction.
|
||||||
|
* `browser_mode (str, default: "dedicated")`: Defines how the browser is initialized. Options: `"builtin"` (uses built-in CDP), `"dedicated"` (new instance each time), `"cdp"` (connects to an existing CDP endpoint specified by `cdp_url`), `"docker"` (runs browser in a Docker container).
|
||||||
|
* `use_managed_browser (bool, default: False)`: If `True`, launches the browser using a managed approach (e.g., via CDP or Docker), allowing for more advanced control. Automatically set to `True` if `browser_mode` is `"builtin"`, `"docker"`, or if `cdp_url` is provided, or if `use_persistent_context` is `True`.
|
||||||
|
* `cdp_url (Optional[str], default: None)`: The URL for the Chrome DevTools Protocol (CDP) endpoint. If not provided and `use_managed_browser` is active, it might be set by an internal browser manager.
|
||||||
|
* `use_persistent_context (bool, default: False)`: If `True`, uses a persistent browser context (profile), saving cookies, localStorage, etc., across sessions. Requires `user_data_dir`. Sets `use_managed_browser=True`.
|
||||||
|
* `user_data_dir (Optional[str], default: None)`: Path to a directory for storing user data for persistent sessions. If `None` and `use_persistent_context` is `True`, a temporary directory might be used.
|
||||||
|
* `chrome_channel (Optional[str], default: "chromium")`: Specifies the Chrome channel (e.g., "chrome", "msedge", "chromium-beta"). Only applicable if `browser_type` is "chromium".
|
||||||
|
* `channel (Optional[str], default: "chromium")`: Preferred alias for `chrome_channel`. Set to `""` for Firefox or WebKit.
|
||||||
|
* `proxy (Optional[str], default: None)`: A string representing the proxy server URL (e.g., "http://username:password@proxy.example.com:8080").
|
||||||
|
* `proxy_config (Optional[Union[ProxyConfig, dict]], default: None)`: A `ProxyConfig` object or a dictionary specifying detailed proxy settings. Overrides the `proxy` string if both are provided.
|
||||||
|
* `viewport_width (int, default: 1080)`: Default width of the browser viewport in pixels.
|
||||||
|
* `viewport_height (int, default: 600)`: Default height of the browser viewport in pixels.
|
||||||
|
* `viewport (Optional[dict], default: None)`: A dictionary specifying viewport dimensions, e.g., `{"width": 1920, "height": 1080}`. If set, overrides `viewport_width` and `viewport_height`.
|
||||||
|
* `accept_downloads (bool, default: False)`: If `True`, allows files to be downloaded by the browser.
|
||||||
|
* `downloads_path (Optional[str], default: None)`: Directory path where downloaded files will be stored. Required if `accept_downloads` is `True`.
|
||||||
|
* `storage_state (Optional[Union[str, dict]], default: None)`: Path to a JSON file or a dictionary containing the browser's storage state (cookies, localStorage, etc.) to load.
|
||||||
|
* `ignore_https_errors (bool, default: True)`: If `True`, HTTPS certificate errors will be ignored.
|
||||||
|
* `java_script_enabled (bool, default: True)`: If `True`, JavaScript execution is enabled on web pages.
|
||||||
|
* `sleep_on_close (bool, default: False)`: If `True`, introduces a small delay before the browser is closed.
|
||||||
|
* `verbose (bool, default: True)`: If `True`, enables verbose logging for browser operations.
|
||||||
|
* `cookies (Optional[List[dict]], default: None)`: A list of cookie dictionaries to be set in the browser context. Each dictionary should conform to Playwright's cookie format.
|
||||||
|
* `headers (Optional[dict], default: None)`: A dictionary of additional HTTP headers to be sent with every request made by the browser.
|
||||||
|
* `user_agent (Optional[str], default: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36")`: The User-Agent string the browser will use.
|
||||||
|
* `user_agent_mode (Optional[str], default: "")`: Mode for generating the User-Agent string. If set (e.g., to "random"), `user_agent_generator_config` can be used.
|
||||||
|
* `user_agent_generator_config (Optional[dict], default: {})`: Configuration dictionary for the User-Agent generator if `user_agent_mode` is active.
|
||||||
|
* `text_mode (bool, default: False)`: If `True`, attempts to disable images and other rich content to potentially speed up loading for text-focused crawls.
|
||||||
|
* `light_mode (bool, default: False)`: If `True`, disables certain background browser features for potential performance gains.
|
||||||
|
* `extra_args (Optional[List[str]], default: None)`: A list of additional command-line arguments to pass to the browser executable upon launch.
|
||||||
|
* `debugging_port (int, default: 9222)`: The port to use for the browser's remote debugging protocol (CDP).
|
||||||
|
* `host (str, default: "localhost")`: The host on which the browser's remote debugging protocol will listen.
|
||||||
|
* **2.1.3. Key Public Attributes/Properties:**
|
||||||
|
* All parameters listed in `__init__` are available as public attributes with the same names and types.
|
||||||
|
* `browser_hint (str)`: [Read-only] - A string representing client hints (Sec-CH-UA) generated based on the `user_agent` string. This is automatically set during initialization.
|
||||||
|
* **2.1.4. Key Public Methods:**
|
||||||
|
* `from_kwargs(cls, kwargs: dict) -> BrowserConfig` (Static Method):
|
||||||
|
* Purpose: Creates a `BrowserConfig` instance from a dictionary of keyword arguments.
|
||||||
|
* `to_dict(self) -> dict`:
|
||||||
|
* Purpose: Converts the `BrowserConfig` instance into a dictionary representation.
|
||||||
|
* `clone(self, **kwargs) -> BrowserConfig`:
|
||||||
|
* Purpose: Creates a deep copy of the current `BrowserConfig` instance. Keyword arguments can be provided to override specific attributes in the new instance.
|
||||||
|
* `dump(self) -> dict`:
|
||||||
|
* Purpose: Serializes the `BrowserConfig` object into a dictionary format that is suitable for JSON storage or transmission, utilizing the `to_serializable_dict` helper.
|
||||||
|
* `load(cls, data: dict) -> BrowserConfig` (Static Method):
|
||||||
|
* Purpose: Deserializes a `BrowserConfig` object from a dictionary (typically one created by `dump()`), utilizing the `from_serializable_dict` helper.
|
||||||
|
|
||||||
|
### 2.2. `CrawlerRunConfig`
|
||||||
|
Located in `crawl4ai.async_configs`.
|
||||||
|
|
||||||
|
* **2.2.1. Purpose:**
|
||||||
|
* Description: The `CrawlerRunConfig` class encapsulates all settings that control the behavior of a single crawl operation performed by `AsyncWebCrawler.arun()` or multiple operations within `AsyncWebCrawler.arun_many()`. This includes parameters for content processing, page interaction, caching, and media handling.
|
||||||
|
* **2.2.2. Initialization (`__init__`)**
|
||||||
|
* Signature:
|
||||||
|
```python
|
||||||
|
class CrawlerRunConfig:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
url: Optional[str] = None,
|
||||||
|
word_count_threshold: int = MIN_WORD_THRESHOLD,
|
||||||
|
extraction_strategy: Optional[ExtractionStrategy] = None,
|
||||||
|
chunking_strategy: Optional[ChunkingStrategy] = RegexChunking(),
|
||||||
|
markdown_generator: Optional[MarkdownGenerationStrategy] = DefaultMarkdownGenerator(),
|
||||||
|
only_text: bool = False,
|
||||||
|
css_selector: Optional[str] = None,
|
||||||
|
target_elements: Optional[List[str]] = None, # Default is [] in __init__
|
||||||
|
excluded_tags: Optional[List[str]] = None, # Default is [] in __init__
|
||||||
|
excluded_selector: Optional[str] = "", # Default is "" in __init__
|
||||||
|
keep_data_attributes: bool = False,
|
||||||
|
keep_attrs: Optional[List[str]] = None, # Default is [] in __init__
|
||||||
|
remove_forms: bool = False,
|
||||||
|
prettify: bool = False,
|
||||||
|
parser_type: str = "lxml",
|
||||||
|
scraping_strategy: Optional[ContentScrapingStrategy] = None, # Instantiated with WebScrapingStrategy() if None
|
||||||
|
proxy_config: Optional[Union[ProxyConfig, dict]] = None,
|
||||||
|
proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
|
||||||
|
locale: Optional[str] = None,
|
||||||
|
timezone_id: Optional[str] = None,
|
||||||
|
geolocation: Optional[GeolocationConfig] = None,
|
||||||
|
fetch_ssl_certificate: bool = False,
|
||||||
|
cache_mode: CacheMode = CacheMode.BYPASS,
|
||||||
|
session_id: Optional[str] = None,
|
||||||
|
shared_data: Optional[dict] = None,
|
||||||
|
wait_until: str = "domcontentloaded",
|
||||||
|
page_timeout: int = PAGE_TIMEOUT,
|
||||||
|
wait_for: Optional[str] = None,
|
||||||
|
wait_for_timeout: Optional[int] = None,
|
||||||
|
wait_for_images: bool = False,
|
||||||
|
delay_before_return_html: float = 0.1,
|
||||||
|
mean_delay: float = 0.1,
|
||||||
|
max_range: float = 0.3,
|
||||||
|
semaphore_count: int = 5,
|
||||||
|
js_code: Optional[Union[str, List[str]]] = None,
|
||||||
|
js_only: bool = False,
|
||||||
|
ignore_body_visibility: bool = True,
|
||||||
|
scan_full_page: bool = False,
|
||||||
|
scroll_delay: float = 0.2,
|
||||||
|
process_iframes: bool = False,
|
||||||
|
remove_overlay_elements: bool = False,
|
||||||
|
simulate_user: bool = False,
|
||||||
|
override_navigator: bool = False,
|
||||||
|
magic: bool = False,
|
||||||
|
adjust_viewport_to_content: bool = False,
|
||||||
|
screenshot: bool = False,
|
||||||
|
screenshot_wait_for: Optional[float] = None,
|
||||||
|
screenshot_height_threshold: int = SCREENSHOT_HEIGHT_THRESHOLD,
|
||||||
|
pdf: bool = False,
|
||||||
|
capture_mhtml: bool = False,
|
||||||
|
image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
||||||
|
image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
|
||||||
|
table_score_threshold: int = 7,
|
||||||
|
exclude_external_images: bool = False,
|
||||||
|
exclude_all_images: bool = False,
|
||||||
|
exclude_social_media_domains: Optional[List[str]] = None, # Uses SOCIAL_MEDIA_DOMAINS if None
|
||||||
|
exclude_external_links: bool = False,
|
||||||
|
exclude_social_media_links: bool = False,
|
||||||
|
exclude_domains: Optional[List[str]] = None, # Default is [] in __init__
|
||||||
|
exclude_internal_links: bool = False,
|
||||||
|
verbose: bool = True,
|
||||||
|
log_console: bool = False,
|
||||||
|
capture_network_requests: bool = False,
|
||||||
|
capture_console_messages: bool = False,
|
||||||
|
method: str = "GET",
|
||||||
|
stream: bool = False,
|
||||||
|
check_robots_txt: bool = False,
|
||||||
|
user_agent: Optional[str] = None,
|
||||||
|
user_agent_mode: Optional[str] = None,
|
||||||
|
user_agent_generator_config: Optional[dict] = None, # Default is {} in __init__
|
||||||
|
deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
|
||||||
|
experimental: Optional[Dict[str, Any]] = None # Default is {} in __init__
|
||||||
|
): ...
|
||||||
|
```
|
||||||
|
* Parameters:
|
||||||
|
* `url (Optional[str], default: None)`: The target URL for this specific crawl run.
|
||||||
|
* `word_count_threshold (int, default: MIN_WORD_THRESHOLD)`: Minimum word count for a text block to be considered significant during content processing.
|
||||||
|
* `extraction_strategy (Optional[ExtractionStrategy], default: None)`: Strategy for extracting structured data from the page. If `None`, `NoExtractionStrategy` is used.
|
||||||
|
* `chunking_strategy (Optional[ChunkingStrategy], default: RegexChunking())`: Strategy to split content into chunks before extraction.
|
||||||
|
* `markdown_generator (Optional[MarkdownGenerationStrategy], default: DefaultMarkdownGenerator())`: Strategy for converting HTML to Markdown.
|
||||||
|
* `only_text (bool, default: False)`: If `True`, attempts to extract only textual content, potentially ignoring structural elements beneficial for rich Markdown.
|
||||||
|
* `css_selector (Optional[str], default: None)`: A CSS selector defining the primary region of the page to focus on for content extraction. The raw HTML is reduced to this region.
|
||||||
|
* `target_elements (Optional[List[str]], default: [])`: A list of CSS selectors. If provided, only the content within these elements will be considered for Markdown generation and structured data extraction. Unlike `css_selector`, this does not reduce the raw HTML but scopes the processing.
|
||||||
|
* `excluded_tags (Optional[List[str]], default: [])`: A list of HTML tag names (e.g., "nav", "footer") to be removed from the HTML before processing.
|
||||||
|
* `excluded_selector (Optional[str], default: "")`: A CSS selector specifying elements to be removed from the HTML before processing.
|
||||||
|
* `keep_data_attributes (bool, default: False)`: If `True`, `data-*` attributes on HTML elements are preserved during cleaning.
|
||||||
|
* `keep_attrs (Optional[List[str]], default: [])`: A list of specific HTML attribute names to preserve during HTML cleaning.
|
||||||
|
* `remove_forms (bool, default: False)`: If `True`, all `<form>` elements are removed from the HTML.
|
||||||
|
* `prettify (bool, default: False)`: If `True`, the cleaned HTML output is "prettified" for better readability.
|
||||||
|
* `parser_type (str, default: "lxml")`: The HTML parser to be used by the scraping strategy (e.g., "lxml", "html.parser").
|
||||||
|
* `scraping_strategy (Optional[ContentScrapingStrategy], default: WebScrapingStrategy())`: The strategy for scraping content from the HTML.
|
||||||
|
* `proxy_config (Optional[Union[ProxyConfig, dict]], default: None)`: Proxy configuration for this specific run. Overrides any proxy settings in `BrowserConfig`.
|
||||||
|
* `proxy_rotation_strategy (Optional[ProxyRotationStrategy], default: None)`: Strategy to use for rotating proxies if multiple are available.
|
||||||
|
* `locale (Optional[str], default: None)`: Locale to set for the browser context (e.g., "en-US", "fr-FR"). Affects `Accept-Language` header and JavaScript `navigator.language`.
|
||||||
|
* `timezone_id (Optional[str], default: None)`: Timezone ID to set for the browser context (e.g., "America/New_York", "Europe/Paris"). Affects JavaScript `Date` objects.
|
||||||
|
* `geolocation (Optional[GeolocationConfig], default: None)`: A `GeolocationConfig` object or dictionary to set the browser's mock geolocation.
|
||||||
|
* `fetch_ssl_certificate (bool, default: False)`: If `True`, the SSL certificate information for the main URL will be fetched and included in the `CrawlResult`.
|
||||||
|
* `cache_mode (CacheMode, default: CacheMode.BYPASS)`: Defines caching behavior for this run. See `CacheMode` enum for options.
|
||||||
|
* `session_id (Optional[str], default: None)`: An identifier for a browser session. If provided, `crawl4ai` will attempt to reuse an existing page/context associated with this ID, or create a new one and associate it.
|
||||||
|
* `shared_data (Optional[dict], default: None)`: A dictionary for passing custom data between hooks during the crawl lifecycle.
|
||||||
|
* `wait_until (str, default: "domcontentloaded")`: Playwright's page navigation wait condition (e.g., "load", "domcontentloaded", "networkidle", "commit").
|
||||||
|
* `page_timeout (int, default: PAGE_TIMEOUT)`: Maximum time in milliseconds for page navigation and other page operations.
|
||||||
|
* `wait_for (Optional[str], default: None)`: A CSS selector or a JavaScript expression (prefixed with "js:"). The crawler will wait until this condition is met before proceeding.
|
||||||
|
* `wait_for_timeout (Optional[int], default: None)`: Specific timeout in milliseconds for the `wait_for` condition. If `None`, `page_timeout` is used.
|
||||||
|
* `wait_for_images (bool, default: False)`: If `True`, attempts to wait for all images on the page to finish loading.
|
||||||
|
* `delay_before_return_html (float, default: 0.1)`: Delay in seconds to wait just before the final HTML content is retrieved from the page.
|
||||||
|
* `mean_delay (float, default: 0.1)`: Used with `arun_many`. The mean base delay in seconds between processing URLs.
|
||||||
|
* `max_range (float, default: 0.3)`: Used with `arun_many`. The maximum additional random delay (added to `mean_delay`) between processing URLs.
|
||||||
|
* `semaphore_count (int, default: 5)`: Used with `arun_many` and semaphore-based dispatchers. The maximum number of concurrent crawl operations.
|
||||||
|
* `js_code (Optional[Union[str, List[str]]], default: None)`: A string or list of strings containing JavaScript code to be executed on the page after it loads.
|
||||||
|
* `js_only (bool, default: False)`: If `True`, indicates that this `arun` call is primarily for JavaScript execution on an already loaded page (within a session) and a full page navigation might not be needed.
|
||||||
|
* `ignore_body_visibility (bool, default: True)`: If `True`, proceeds with content extraction even if the `<body>` element is not deemed visible by Playwright.
|
||||||
|
* `scan_full_page (bool, default: False)`: If `True`, the crawler will attempt to scroll through the entire page to trigger lazy-loaded content.
|
||||||
|
* `scroll_delay (float, default: 0.2)`: Delay in seconds between each scroll step when `scan_full_page` is `True`.
|
||||||
|
* `process_iframes (bool, default: False)`: If `True`, attempts to extract and inline content from `<iframe>` elements.
|
||||||
|
* `remove_overlay_elements (bool, default: False)`: If `True`, attempts to identify and remove common overlay elements (popups, cookie banners) before content extraction.
|
||||||
|
* `simulate_user (bool, default: False)`: If `True`, enables heuristics to simulate user interactions (like mouse movements) to potentially bypass some anti-bot measures.
|
||||||
|
* `override_navigator (bool, default: False)`: If `True`, overrides certain JavaScript `navigator` properties to appear more like a standard browser.
|
||||||
|
* `magic (bool, default: False)`: If `True`, enables a combination of techniques (like `remove_overlay_elements`, `simulate_user`) to try and handle dynamic/obfuscated sites.
|
||||||
|
* `adjust_viewport_to_content (bool, default: False)`: If `True`, attempts to adjust the browser viewport size to match the dimensions of the page content.
|
||||||
|
* `screenshot (bool, default: False)`: If `True`, a screenshot of the page will be taken and included in `CrawlResult.screenshot`.
|
||||||
|
* `screenshot_wait_for (Optional[float], default: None)`: Additional delay in seconds to wait before taking the screenshot.
|
||||||
|
* `screenshot_height_threshold (int, default: SCREENSHOT_HEIGHT_THRESHOLD)`: If the page height exceeds this threshold, an alternative capture strategy may be used to produce the full-page screenshot.
|
||||||
|
* `pdf (bool, default: False)`: If `True`, a PDF version of the page will be generated and included in `CrawlResult.pdf`.
|
||||||
|
* `capture_mhtml (bool, default: False)`: If `True`, an MHTML archive of the page will be captured and included in `CrawlResult.mhtml`.
|
||||||
|
* `image_description_min_word_threshold (int, default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)`: Minimum word count for surrounding text to be considered as an image description.
|
||||||
|
* `image_score_threshold (int, default: IMAGE_SCORE_THRESHOLD)`: Heuristic score threshold for an image to be included in `CrawlResult.media`.
|
||||||
|
* `table_score_threshold (int, default: 7)`: Heuristic score threshold for an HTML table to be considered a data table and included in `CrawlResult.media`.
|
||||||
|
* `exclude_external_images (bool, default: False)`: If `True`, images hosted on different domains than the main page URL are excluded.
|
||||||
|
* `exclude_all_images (bool, default: False)`: If `True`, all images are excluded from `CrawlResult.media`.
|
||||||
|
* `exclude_social_media_domains (Optional[List[str]], default: SOCIAL_MEDIA_DOMAINS from config)`: List of social media domains whose links should be excluded.
|
||||||
|
* `exclude_external_links (bool, default: False)`: If `True`, all links pointing to external domains are excluded from `CrawlResult.links`.
|
||||||
|
* `exclude_social_media_links (bool, default: False)`: If `True`, links to domains in `exclude_social_media_domains` are excluded.
|
||||||
|
* `exclude_domains (Optional[List[str]], default: [])`: A list of specific domains whose links should be excluded.
|
||||||
|
* `exclude_internal_links (bool, default: False)`: If `True`, all links pointing to the same domain are excluded.
|
||||||
|
* `verbose (bool, default: True)`: Enables verbose logging for this specific crawl run. Overrides `BrowserConfig.verbose`.
|
||||||
|
* `log_console (bool, default: False)`: If `True`, browser console messages are captured (requires `capture_console_messages=True` to be effective).
|
||||||
|
* `capture_network_requests (bool, default: False)`: If `True`, captures details of network requests and responses made by the page.
|
||||||
|
* `capture_console_messages (bool, default: False)`: If `True`, captures messages logged to the browser's console.
|
||||||
|
* `method (str, default: "GET")`: HTTP method to use, primarily for `AsyncHTTPCrawlerStrategy`.
|
||||||
|
* `stream (bool, default: False)`: If `True` when using `arun_many`, results are yielded as an async generator instead of returned as a list at the end.
|
||||||
|
* `check_robots_txt (bool, default: False)`: If `True`, `robots.txt` rules for the domain will be checked and respected.
|
||||||
|
* `user_agent (Optional[str], default: None)`: User-Agent string for this specific run. Overrides `BrowserConfig.user_agent`.
|
||||||
|
* `user_agent_mode (Optional[str], default: None)`: User-Agent generation mode for this specific run.
|
||||||
|
* `user_agent_generator_config (Optional[dict], default: {})`: Configuration for User-Agent generator for this run.
|
||||||
|
* `deep_crawl_strategy (Optional[DeepCrawlStrategy], default: None)`: Strategy to use for deep crawling beyond the initial URL.
|
||||||
|
* `experimental (Optional[Dict[str, Any]], default: {})`: A dictionary for passing experimental or beta parameters.
|
||||||
|
* **2.2.3. Key Public Attributes/Properties:**
|
||||||
|
* All parameters listed in `__init__` are available as public attributes with the same names and types.
|
||||||
|
* **2.2.4. Deprecated Property Handling (`__getattr__`, `_UNWANTED_PROPS`)**
|
||||||
|
* Behavior: Attempting to access a deprecated property (e.g., `bypass_cache`, `disable_cache`, `no_cache_read`, `no_cache_write`) raises an `AttributeError`. The error message directs the user to use the `cache_mode` parameter with the appropriate `CacheMode` enum member instead.
|
||||||
|
* List of Deprecated Properties and their `CacheMode` Equivalents:
|
||||||
|
* `bypass_cache`: Use `cache_mode=CacheMode.BYPASS`.
|
||||||
|
* `disable_cache`: Use `cache_mode=CacheMode.DISABLE`.
|
||||||
|
* `no_cache_read`: Use `cache_mode=CacheMode.WRITE_ONLY`.
|
||||||
|
* `no_cache_write`: Use `cache_mode=CacheMode.READ_ONLY`.
|
||||||
|
* **2.2.5. Key Public Methods:**
|
||||||
|
* `from_kwargs(cls, kwargs: dict) -> CrawlerRunConfig` (Static Method):
|
||||||
|
* Purpose: Creates a `CrawlerRunConfig` instance from a dictionary of keyword arguments.
|
||||||
|
* `dump(self) -> dict`:
|
||||||
|
* Purpose: Serializes the `CrawlerRunConfig` object to a dictionary suitable for JSON storage, handling complex nested objects using `to_serializable_dict`.
|
||||||
|
* `load(cls, data: dict) -> CrawlerRunConfig` (Static Method):
|
||||||
|
* Purpose: Deserializes a `CrawlerRunConfig` object from a dictionary (typically one created by `dump()`), using `from_serializable_dict`.
|
||||||
|
* `to_dict(self) -> dict`:
|
||||||
|
* Purpose: Converts the `CrawlerRunConfig` instance into a dictionary representation. Complex objects like strategies are typically represented by their class name or a simplified form.
|
||||||
|
* `clone(self, **kwargs) -> CrawlerRunConfig`:
|
||||||
|
* Purpose: Creates a deep copy of the current `CrawlerRunConfig` instance. Keyword arguments can be provided to override specific attributes in the new instance.
|
||||||
|
|
||||||
|
### 2.3. `LLMConfig`
|
||||||
|
Located in `crawl4ai.async_configs`.
|
||||||
|
|
||||||
|
* **2.3.1. Purpose:**
|
||||||
|
* Description: The `LLMConfig` class provides configuration for interacting with Large Language Model (LLM) providers. It includes settings for the provider name, API token, base URL, and various model-specific parameters like temperature and max tokens.
|
||||||
|
* **2.3.2. Initialization (`__init__`)**
|
||||||
|
* Signature:
|
||||||
|
```python
|
||||||
|
class LLMConfig:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
provider: str = DEFAULT_PROVIDER, # e.g., "openai/gpt-4o-mini"
|
||||||
|
api_token: Optional[str] = None,
|
||||||
|
base_url: Optional[str] = None,
|
||||||
|
temperature: Optional[float] = None,
|
||||||
|
max_tokens: Optional[int] = None,
|
||||||
|
top_p: Optional[float] = None,
|
||||||
|
frequency_penalty: Optional[float] = None,
|
||||||
|
presence_penalty: Optional[float] = None,
|
||||||
|
stop: Optional[List[str]] = None,
|
||||||
|
n: Optional[int] = None,
|
||||||
|
): ...
|
||||||
|
```
|
||||||
|
* Parameters:
|
||||||
|
* `provider (str, default: DEFAULT_PROVIDER)`: The identifier for the LLM provider and model (e.g., "openai/gpt-4o-mini", "ollama/llama3.3", "gemini/gemini-1.5-pro").
|
||||||
|
* `api_token (Optional[str], default: None)`: The API token for authenticating with the LLM provider. If `None`, it attempts to load from environment variables based on the provider (e.g., `OPENAI_API_KEY` for OpenAI, `GEMINI_API_KEY` for Gemini). Can also be set as "env:YOUR_ENV_VAR_NAME".
|
||||||
|
* `base_url (Optional[str], default: None)`: A custom base URL for the LLM API endpoint, useful for self-hosted models or proxies.
|
||||||
|
* `temperature (Optional[float], default: None)`: Controls the randomness of the LLM's output. Higher values (e.g., 0.8) make output more random, lower values (e.g., 0.2) make it more deterministic.
|
||||||
|
* `max_tokens (Optional[int], default: None)`: The maximum number of tokens the LLM should generate in its response.
|
||||||
|
* `top_p (Optional[float], default: None)`: Nucleus sampling parameter. The model considers only tokens with cumulative probability mass up to `top_p`.
|
||||||
|
* `frequency_penalty (Optional[float], default: None)`: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
|
||||||
|
* `presence_penalty (Optional[float], default: None)`: Penalizes new tokens based on whether they have appeared in the text so far, increasing the model's likelihood to talk about new topics.
|
||||||
|
* `stop (Optional[List[str]], default: None)`: A list of sequences where the API will stop generating further tokens.
|
||||||
|
* `n (Optional[int], default: None)`: The number of completions to generate for each prompt.
|
||||||
|
* **2.3.3. Key Public Attributes/Properties:**
|
||||||
|
* All parameters listed in `__init__` are available as public attributes with the same names and types.
|
||||||
|
* **2.3.4. Key Public Methods:**
|
||||||
|
* `from_kwargs(cls, kwargs: dict) -> LLMConfig` (Static Method):
|
||||||
|
* Purpose: Creates an `LLMConfig` instance from a dictionary of keyword arguments.
|
||||||
|
* `to_dict(self) -> dict`:
|
||||||
|
* Purpose: Converts the `LLMConfig` instance into a dictionary representation.
|
||||||
|
* `clone(self, **kwargs) -> LLMConfig`:
|
||||||
|
* Purpose: Creates a deep copy of the current `LLMConfig` instance. Keyword arguments can be provided to override specific attributes in the new instance.
|
||||||
|
|
||||||
|
### 2.4. `GeolocationConfig`
|
||||||
|
Located in `crawl4ai.async_configs`.
|
||||||
|
|
||||||
|
* **2.4.1. Purpose:**
|
||||||
|
* Description: The `GeolocationConfig` class stores settings for mocking the browser's geolocation, including latitude, longitude, and accuracy.
|
||||||
|
* **2.4.2. Initialization (`__init__`)**
|
||||||
|
* Signature:
|
||||||
|
```python
|
||||||
|
class GeolocationConfig:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
latitude: float,
|
||||||
|
longitude: float,
|
||||||
|
accuracy: Optional[float] = 0.0
|
||||||
|
): ...
|
||||||
|
```
|
||||||
|
* Parameters:
|
||||||
|
* `latitude (float)`: The latitude coordinate (e.g., 37.7749 for San Francisco).
|
||||||
|
* `longitude (float)`: The longitude coordinate (e.g., -122.4194 for San Francisco).
|
||||||
|
* `accuracy (Optional[float], default: 0.0)`: The accuracy of the geolocation in meters.
|
||||||
|
* **2.4.3. Key Public Attributes/Properties:**
|
||||||
|
* `latitude (float)`: Stores the latitude.
|
||||||
|
* `longitude (float)`: Stores the longitude.
|
||||||
|
* `accuracy (Optional[float])`: Stores the accuracy.
|
||||||
|
* **2.4.4. Key Public Methods:**
|
||||||
|
* `from_dict(cls, geo_dict: dict) -> GeolocationConfig` (Static Method):
|
||||||
|
* Purpose: Creates a `GeolocationConfig` instance from a dictionary.
|
||||||
|
* `to_dict(self) -> dict`:
|
||||||
|
* Purpose: Converts the `GeolocationConfig` instance to a dictionary: `{"latitude": ..., "longitude": ..., "accuracy": ...}`.
|
||||||
|
* `clone(self, **kwargs) -> GeolocationConfig`:
|
||||||
|
* Purpose: Creates a copy of the `GeolocationConfig` instance, allowing for overriding specific attributes with `kwargs`.
|
||||||
|
|
||||||
|
### 2.5. `ProxyConfig`
|
||||||
|
Located in `crawl4ai.async_configs` (and `crawl4ai.proxy_strategy`).
|
||||||
|
|
||||||
|
* **2.5.1. Purpose:**
|
||||||
|
* Description: The `ProxyConfig` class encapsulates the configuration for a single proxy server, including its address, authentication credentials (if any), and optionally its public IP address.
|
||||||
|
* **2.5.2. Initialization (`__init__`)**
|
||||||
|
* Signature:
|
||||||
|
```python
|
||||||
|
class ProxyConfig:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
server: str,
|
||||||
|
username: Optional[str] = None,
|
||||||
|
password: Optional[str] = None,
|
||||||
|
ip: Optional[str] = None,
|
||||||
|
): ...
|
||||||
|
```
|
||||||
|
* Parameters:
|
||||||
|
* `server (str)`: The proxy server URL, including protocol and port (e.g., "http://127.0.0.1:8080", "socks5://proxy.example.com:1080").
|
||||||
|
* `username (Optional[str], default: None)`: The username for proxy authentication, if required.
|
||||||
|
* `password (Optional[str], default: None)`: The password for proxy authentication, if required.
|
||||||
|
* `ip (Optional[str], default: None)`: The public IP address of the proxy server. If not provided, it will be automatically extracted from the `server` string if possible.
|
||||||
|
* **2.5.3. Key Public Attributes/Properties:**
|
||||||
|
* `server (str)`: The proxy server URL.
|
||||||
|
* `username (Optional[str])`: The username for proxy authentication.
|
||||||
|
* `password (Optional[str])`: The password for proxy authentication.
|
||||||
|
* `ip (Optional[str])`: The public IP address of the proxy. This is either user-provided or automatically extracted from the `server` string during initialization via the internal `_extract_ip_from_server` method.
|
||||||
|
* **2.5.4. Key Public Methods:**
|
||||||
|
* `_extract_ip_from_server(self) -> Optional[str]` (Internal method):
|
||||||
|
* Purpose: Extracts the IP address component from the `self.server` URL string.
|
||||||
|
* `from_string(cls, proxy_str: str) -> ProxyConfig` (Static Method):
|
||||||
|
* Purpose: Creates a `ProxyConfig` instance from a string.
|
||||||
|
* Formats:
|
||||||
|
* `'ip:port:username:password'`
|
||||||
|
* `'ip:port'` (no authentication)
|
||||||
|
* `from_dict(cls, proxy_dict: dict) -> ProxyConfig` (Static Method):
|
||||||
|
* Purpose: Creates a `ProxyConfig` instance from a dictionary with keys "server", "username", "password", and "ip".
|
||||||
|
* `from_env(cls, env_var: str = "PROXIES") -> List[ProxyConfig]` (Static Method):
|
||||||
|
* Purpose: Loads a list of `ProxyConfig` objects from a comma-separated environment variable. Each proxy string in the variable should conform to the format accepted by `from_string`.
|
||||||
|
* `to_dict(self) -> dict`:
|
||||||
|
* Purpose: Converts the `ProxyConfig` instance to a dictionary: `{"server": ..., "username": ..., "password": ..., "ip": ...}`.
|
||||||
|
* `clone(self, **kwargs) -> ProxyConfig`:
|
||||||
|
* Purpose: Creates a copy of the `ProxyConfig` instance, allowing for overriding specific attributes with `kwargs`.
|
||||||
|
|
||||||
|
### 2.6. `HTTPCrawlerConfig`
|
||||||
|
Located in `crawl4ai.async_configs`.
|
||||||
|
|
||||||
|
* **2.6.1. Purpose:**
|
||||||
|
* Description: The `HTTPCrawlerConfig` class holds configuration settings specific to direct HTTP-based crawling strategies (e.g., `AsyncHTTPCrawlerStrategy`), which do not use a full browser environment.
|
||||||
|
* **2.6.2. Initialization (`__init__`)**
|
||||||
|
* Signature:
|
||||||
|
```python
|
||||||
|
class HTTPCrawlerConfig:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
method: str = "GET",
|
||||||
|
headers: Optional[Dict[str, str]] = None,
|
||||||
|
data: Optional[Dict[str, Any]] = None,
|
||||||
|
json: Optional[Dict[str, Any]] = None,
|
||||||
|
follow_redirects: bool = True,
|
||||||
|
verify_ssl: bool = True,
|
||||||
|
): ...
|
||||||
|
```
|
||||||
|
* Parameters:
|
||||||
|
* `method (str, default: "GET")`: The HTTP method to use for the request (e.g., "GET", "POST", "PUT").
|
||||||
|
* `headers (Optional[Dict[str, str]], default: None)`: A dictionary of custom HTTP headers to send with the request.
|
||||||
|
* `data (Optional[Dict[str, Any]], default: None)`: Data to be sent in the body of the request, typically for "POST" or "PUT" requests (e.g., form data).
|
||||||
|
* `json (Optional[Dict[str, Any]], default: None)`: JSON data to be sent in the body of the request. If provided, the `Content-Type` header is typically set to `application/json`.
|
||||||
|
* `follow_redirects (bool, default: True)`: If `True`, the crawler will automatically follow HTTP redirects.
|
||||||
|
* `verify_ssl (bool, default: True)`: If `True`, SSL certificates will be verified. Set to `False` to ignore SSL errors (use with caution).
|
||||||
|
* **2.6.3. Key Public Attributes/Properties:**
|
||||||
|
* All parameters listed in `__init__` are available as public attributes with the same names and types.
|
||||||
|
* **2.6.4. Key Public Methods:**
|
||||||
|
* `from_kwargs(cls, kwargs: dict) -> HTTPCrawlerConfig` (Static Method):
|
||||||
|
* Purpose: Creates an `HTTPCrawlerConfig` instance from a dictionary of keyword arguments.
|
||||||
|
* `to_dict(self) -> dict`:
|
||||||
|
* Purpose: Converts the `HTTPCrawlerConfig` instance into a dictionary representation.
|
||||||
|
* `clone(self, **kwargs) -> HTTPCrawlerConfig`:
|
||||||
|
* Purpose: Creates a deep copy of the current `HTTPCrawlerConfig` instance. Keyword arguments can be provided to override specific attributes in the new instance.
|
||||||
|
* `dump(self) -> dict`:
|
||||||
|
* Purpose: Serializes the `HTTPCrawlerConfig` object to a dictionary.
|
||||||
|
* `load(cls, data: dict) -> HTTPCrawlerConfig` (Static Method):
|
||||||
|
* Purpose: Deserializes an `HTTPCrawlerConfig` object from a dictionary.
|
||||||
|
|
||||||
|
## 3. Enumerations and Helper Constants
|
||||||
|
|
||||||
|
### 3.1. `CacheMode` (Enum)
|
||||||
|
Located in `crawl4ai.cache_context`.
|
||||||
|
|
||||||
|
* **3.1.1. Purpose:**
|
||||||
|
* Description: The `CacheMode` enumeration defines the different caching behaviors that can be applied to a crawl operation. It is used in `CrawlerRunConfig` to control how results are read from and written to the cache.
|
||||||
|
* **3.1.2. Enum Members:**
|
||||||
|
* `ENABLE (str)`: Value: "ENABLE". Description: Enables normal caching behavior. The crawler will attempt to read from the cache first, and if a result is not found or is stale, it will perform the crawl and write the new result to the cache.
|
||||||
|
* `DISABLE (str)`: Value: "DISABLE". Description: Disables all caching. The crawler will not read from or write to the cache. Every request will be a fresh crawl.
|
||||||
|
* `READ_ONLY (str)`: Value: "READ_ONLY". Description: The crawler will only attempt to read from the cache. If a result is found, it will be used. If not, the crawl will not proceed further for that URL, and no new data will be written to the cache.
|
||||||
|
* `WRITE_ONLY (str)`: Value: "WRITE_ONLY". Description: The crawler will not attempt to read from the cache. It will always perform a fresh crawl and then write the result to the cache.
|
||||||
|
* `BYPASS (str)`: Value: "BYPASS". Description: The crawler will skip reading from the cache for this specific operation and will perform a fresh crawl. The result of this crawl *will* be written to the cache. This is the default `cache_mode` for `CrawlerRunConfig`.
|
||||||
|
* **3.1.3. Usage:**
|
||||||
|
* Example:
|
||||||
|
```python
|
||||||
|
from crawl4ai import CrawlerRunConfig, CacheMode
|
||||||
|
config = CrawlerRunConfig(cache_mode=CacheMode.ENABLE) # Use cache fully
|
||||||
|
config_bypass = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) # Force fresh crawl, then cache
|
||||||
|
```
|
||||||
|
|
||||||
|
## 4. Serialization Helper Functions
|
||||||
|
Located in `crawl4ai.async_configs`.
|
||||||
|
|
||||||
|
### 4.1. `to_serializable_dict(obj: Any, ignore_default_value: bool = False) -> Dict`
|
||||||
|
|
||||||
|
* **4.1.1. Purpose:**
|
||||||
|
* Description: This utility function recursively converts various Python objects, including `crawl4ai` configuration objects, into a dictionary format that is suitable for JSON serialization. It uses a `{ "type": "ClassName", "params": { ... } }` structure for custom class instances to enable proper deserialization later.
|
||||||
|
* **4.1.2. Parameters:**
|
||||||
|
* `obj (Any)`: The Python object to be serialized.
|
||||||
|
* `ignore_default_value (bool, default: False)`: If `True`, when serializing class instances, parameters whose current values match their `__init__` default values might be excluded from the "params" dictionary. (Note: The exact behavior depends on the availability of default values in the class signature and handling of empty/None values).
|
||||||
|
* **4.1.3. Returns:**
|
||||||
|
* `Dict`: A dictionary representation of the input object, structured for easy serialization (e.g., to JSON) and later deserialization by `from_serializable_dict`.
|
||||||
|
* **4.1.4. Key Behaviors:**
|
||||||
|
* **Basic Types:** `str`, `int`, `float`, `bool`, `None` are returned as is.
|
||||||
|
* **Enums:** Serialized as `{"type": "EnumClassName", "params": enum_member.value}`.
|
||||||
|
* **Datetime Objects:** Serialized to their ISO 8601 string representation.
|
||||||
|
* **Lists, Tuples, Sets, Frozensets:** Serialized by recursively calling `to_serializable_dict` on each of their elements, returning a list.
|
||||||
|
* **Plain Dictionaries:** Serialized as `{"type": "dict", "value": {key: serialized_value, ...}}`.
|
||||||
|
* **Class Instances (e.g., Config Objects):**
|
||||||
|
* The object's class name is stored in the "type" field.
|
||||||
|
* Parameters from the `__init__` signature and attributes from `__slots__` (if defined) are collected.
|
||||||
|
* Their current values are recursively serialized and stored in the "params" dictionary.
|
||||||
|
* The structure is `{"type": "ClassName", "params": {"param_name": serialized_param_value, ...}}`.
|
||||||
|
|
||||||
|
### 4.2. `from_serializable_dict(data: Any) -> Any`
|
||||||
|
|
||||||
|
* **4.2.1. Purpose:**
|
||||||
|
* Description: This utility function reconstructs Python objects, including `crawl4ai` configuration objects, from the serializable dictionary format previously created by `to_serializable_dict`.
|
||||||
|
* **4.2.2. Parameters:**
|
||||||
|
* `data (Any)`: The dictionary (or basic data type) to be deserialized. This is typically the output of `to_serializable_dict` after being, for example, loaded from a JSON string.
|
||||||
|
* **4.2.3. Returns:**
|
||||||
|
* `Any`: The reconstructed Python object (e.g., an instance of `BrowserConfig`, `LLMConfig`, a list, a plain dictionary, etc.).
|
||||||
|
* **4.2.4. Key Behaviors:**
|
||||||
|
* **Basic Types:** `str`, `int`, `float`, `bool`, `None` are returned as is.
|
||||||
|
* **Typed Dictionaries (from `to_serializable_dict`):**
|
||||||
|
* If `data` is a dictionary and contains a "type" key:
|
||||||
|
* If `data["type"] == "dict"`, it reconstructs a plain Python dictionary from `data["value"]` by recursively deserializing its items.
|
||||||
|
* Otherwise, it attempts to locate the class specified by `data["type"]` within the `crawl4ai` module.
|
||||||
|
* If the class is an `Enum`, it instantiates the enum member using `data["params"]` (the enum value).
|
||||||
|
* If it's a regular class, it recursively deserializes the items in `data["params"]` and uses them as keyword arguments (`**kwargs`) to instantiate the class.
|
||||||
|
* **Lists:** If `data` is a list, it reconstructs a list by recursively calling `from_serializable_dict` on each of its elements.
|
||||||
|
* **Legacy Dictionaries:** If `data` is a dictionary but does not conform to the "type" key structure (for backward compatibility), it attempts to deserialize its values.
|
||||||
|
|
||||||
|
## 5. Cross-References and Relationships
|
||||||
|
|
||||||
|
* **5.1. `BrowserConfig` Usage:**
|
||||||
|
* Typically instantiated once and passed to the `AsyncWebCrawler` constructor via its `config` parameter.
|
||||||
|
* `browser_config = BrowserConfig(headless=False)`
|
||||||
|
* `crawler = AsyncWebCrawler(config=browser_config)`
|
||||||
|
* It defines the global browser settings that will be used for all subsequent crawl operations unless overridden by `CrawlerRunConfig` on a per-run basis.
|
||||||
|
* **5.2. `CrawlerRunConfig` Usage:**
|
||||||
|
* Passed to the `arun()` or `arun_many()` methods of `AsyncWebCrawler`.
|
||||||
|
* `run_config = CrawlerRunConfig(screenshot=True, cache_mode=CacheMode.BYPASS)`
|
||||||
|
* `result = await crawler.arun(url="https://example.com", config=run_config)`
|
||||||
|
* Allows for fine-grained control over individual crawl requests, overriding global settings from `BrowserConfig` or `AsyncWebCrawler`'s defaults where applicable (e.g., `user_agent`, `proxy_config`, `cache_mode`).
|
||||||
|
* **5.3. `LLMConfig` Usage:**
|
||||||
|
* Instantiated and passed to LLM-based extraction strategies (e.g., `LLMExtractionStrategy`) or content filters (`LLMContentFilter`) during their initialization.
|
||||||
|
* `llm_conf = LLMConfig(provider="openai/gpt-4o-mini", api_token="sk-...")`
|
||||||
|
* `extraction_strategy = LLMExtractionStrategy(llm_config=llm_conf, schema=my_schema)`
|
||||||
|
* **5.4. `GeolocationConfig` and `ProxyConfig` Usage:**
|
||||||
|
* `GeolocationConfig` is typically instantiated and assigned to the `geolocation` parameter of `CrawlerRunConfig`.
|
||||||
|
* `geo_conf = GeolocationConfig(latitude=34.0522, longitude=-118.2437)`
|
||||||
|
* `run_config = CrawlerRunConfig(geolocation=geo_conf)`
|
||||||
|
* `ProxyConfig` can be assigned to the `proxy_config` parameter of `BrowserConfig` (for a global proxy applied to all contexts) or `CrawlerRunConfig` (for a proxy specific to a single crawl run).
|
||||||
|
* `proxy_conf = ProxyConfig(server="http://myproxy:8080")`
|
||||||
|
* `browser_config = BrowserConfig(proxy_config=proxy_conf)` (global)
|
||||||
|
* `run_config = CrawlerRunConfig(proxy_config=proxy_conf)` (per-run)
|
||||||
|
* **5.5. `HTTPCrawlerConfig` Usage:**
|
||||||
|
* Used when the `crawler_strategy` for `AsyncWebCrawler` is set to `AsyncHTTPCrawlerStrategy` (for non-browser-based HTTP requests).
|
||||||
|
* `http_conf = HTTPCrawlerConfig(method="POST", json={"key": "value"})`
|
||||||
|
* `http_strategy = AsyncHTTPCrawlerStrategy(http_crawler_config=http_conf)`
|
||||||
|
* `crawler = AsyncWebCrawler(crawler_strategy=http_strategy)`
|
||||||
|
* Alternatively, when using `AsyncHTTPCrawlerStrategy`, request parameters such as `method`, `data`, and `json` can be supplied through the `CrawlerRunConfig` object passed to `arun()`, rather than through an `HTTPCrawlerConfig`.
|
||||||
File diff suppressed because it is too large
Load Diff
2803
docs/md_v2/assets/llmtxt/crawl4ai_core.llm.full.txt
Normal file
2803
docs/md_v2/assets/llmtxt/crawl4ai_core.llm.full.txt
Normal file
File diff suppressed because it is too large
Load Diff
356
docs/md_v2/assets/llmtxt/crawl4ai_core_examples_content.llm.txt
Normal file
356
docs/md_v2/assets/llmtxt/crawl4ai_core_examples_content.llm.txt
Normal file
@@ -0,0 +1,356 @@
|
|||||||
|
```markdown
|
||||||
|
# Examples Outline for crawl4ai - core Component
|
||||||
|
|
||||||
|
**Target Document Type:** Examples Collection
|
||||||
|
**Target Output Filename Suggestion:** `llm_examples_core.md`
|
||||||
|
**Library Version Context:** 0.6.3
|
||||||
|
**Outline Generation Date:** 2025-05-24 10:00:00
|
||||||
|
---
|
||||||
|
|
||||||
|
This document provides a collection of runnable code examples for the `core` component of the `crawl4ai` library. Each example is designed to showcase a specific feature or configuration.
|
||||||
|
|
||||||
|
## 1. Basic `AsyncWebCrawler` Usage
|
||||||
|
|
||||||
|
### 1.1. Example: Simplest crawl of a single URL with default `BrowserConfig` and `CrawlerRunConfig`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
|
||||||
|
async def simplest_crawl():
|
||||||
|
# Uses default BrowserConfig and CrawlerRunConfig
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="https://example.com")
|
||||||
|
if result.success:
|
||||||
|
print("Crawl successful!")
|
||||||
|
print(f"Markdown (first 300 chars):\n{result.markdown.raw_markdown[:300]}...")
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(simplest_crawl())
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
### 1.2. Example: Using `AsyncWebCrawler` as an asynchronous context manager (`async with`).
|
||||||
|
|
||||||
|
This is the recommended way to manage the crawler's lifecycle.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
|
||||||
|
async def context_manager_crawl():
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="https://example.com")
|
||||||
|
if result.success:
|
||||||
|
print("Crawl successful using context manager!")
|
||||||
|
print(f"Page title from metadata: {result.metadata.get('title')}")
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(context_manager_crawl())
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
### 1.3. Example: Explicitly starting and closing the `AsyncWebCrawler` using `start()` and `close()`.
|
||||||
|
|
||||||
|
Useful for scenarios where the crawler's lifecycle needs more manual control.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
|
||||||
|
async def explicit_lifecycle_crawl():
|
||||||
|
crawler = AsyncWebCrawler()
|
||||||
|
await crawler.start() # Explicitly start the crawler and browser
|
||||||
|
try:
|
||||||
|
result = await crawler.arun(url="https://example.com")
|
||||||
|
if result.success:
|
||||||
|
print("Crawl successful with explicit start/close!")
|
||||||
|
print(f"Cleaned HTML (first 300 chars):\n{result.cleaned_html[:300]}...")
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
finally:
|
||||||
|
await crawler.close() # Ensure the crawler is closed
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(explicit_lifecycle_crawl())
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
### 1.4. Example: Handling a failed crawl (e.g., non-existent URL, network error) and checking `CrawlResult.success` and `CrawlResult.error_message`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
|
||||||
|
async def failed_crawl_handling():
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
# Using a deliberately non-existent URL
|
||||||
|
result = await crawler.arun(url="https://thissitedoesnotexist.crawl4ai")
|
||||||
|
if not result.success:
|
||||||
|
print(f"Crawl failed as expected for URL: {result.url}")
|
||||||
|
print(f"Status Code: {result.status_code}")
|
||||||
|
print(f"Error Message: {result.error_message}")
|
||||||
|
else:
|
||||||
|
print("Crawl unexpectedly succeeded!")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(failed_crawl_handling())
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
### 1.5. Example: Processing raw HTML content directly using `crawler.aprocess_html()`.
|
||||||
|
|
||||||
|
This is useful if you already have HTML content and want to use Crawl4ai's processing capabilities.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
|
|
||||||
|
async def process_raw_html_directly():
|
||||||
|
raw_html_content = """
|
||||||
|
<html>
|
||||||
|
<head><title>My Test Page</title></head>
|
||||||
|
<body>
|
||||||
|
<h1>Welcome!</h1>
|
||||||
|
<p>This is a paragraph with a <a href="https://example.com">link</a>.</p>
|
||||||
|
<script>console.log("This should be removed");</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
# No need for BrowserConfig as we are not navigating
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
# Use CrawlerRunConfig if you need specific processing options
|
||||||
|
config = CrawlerRunConfig()
|
||||||
|
result = await crawler.aprocess_html(
|
||||||
|
url="raw://my_virtual_page", # Provide a conceptual URL
|
||||||
|
html=raw_html_content,
|
||||||
|
config=config
|
||||||
|
)
|
||||||
|
if result.success:
|
||||||
|
print("Raw HTML processed successfully!")
|
||||||
|
print(f"Markdown:\n{result.markdown.raw_markdown}")
|
||||||
|
print(f"Cleaned HTML:\n{result.cleaned_html}")
|
||||||
|
else:
|
||||||
|
print(f"HTML processing failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(process_raw_html_directly())
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
### 1.6. Example: Crawling a local HTML file using the `file:///` prefix.
|
||||||
|
|
||||||
|
First, create a dummy HTML file named `local_test.html` in the same directory as your script.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# local_test.html
|
||||||
|
# <!DOCTYPE html>
|
||||||
|
# <html>
|
||||||
|
# <head>
|
||||||
|
# <title>Local Test File</title>
|
||||||
|
# </head>
|
||||||
|
# <body>
|
||||||
|
# <h1>Hello from a local file!</h1>
|
||||||
|
# <p>This content is loaded from the local filesystem.</p>
|
||||||
|
# </body>
|
||||||
|
# </html>
|
||||||
|
```
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
|
||||||
|
async def crawl_local_file():
|
||||||
|
# Create a dummy local HTML file for the example
|
||||||
|
script_dir = Path(__file__).parent
|
||||||
|
local_file_path = script_dir / "local_test_for_crawl.html"
|
||||||
|
with open(local_file_path, "w", encoding="utf-8") as f:
|
||||||
|
f.write("<!DOCTYPE html><html><head><title>Local Test</title></head><body><h1>Local Content</h1></body></html>")
|
||||||
|
|
||||||
|
    file_url = local_file_path.resolve().as_uri()  # yields a correct file:/// URL on all platforms
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url=file_url)
|
||||||
|
if result.success:
|
||||||
|
print(f"Successfully crawled local file: {file_url}")
|
||||||
|
print(f"Markdown (first 100 chars): {result.markdown.raw_markdown[:100]}...")
|
||||||
|
else:
|
||||||
|
print(f"Failed to crawl local file: {result.error_message}")
|
||||||
|
|
||||||
|
# Clean up the dummy file
|
||||||
|
if os.path.exists(local_file_path):
|
||||||
|
os.remove(local_file_path)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(crawl_local_file())
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
### 1.7. Example: Accessing basic fields from `CrawlResult` (e.g., `url`, `html`, `markdown.raw_markdown`, `status_code`, `response_headers`).
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
|
||||||
|
async def access_crawl_result_fields():
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="https://example.com")
|
||||||
|
if result.success:
|
||||||
|
print(f"URL Crawled: {result.url}")
|
||||||
|
print(f"Status Code: {result.status_code}")
|
||||||
|
|
||||||
|
print("\n--- Response Headers (sample) ---")
|
||||||
|
if result.response_headers:
|
||||||
|
for key, value in list(result.response_headers.items())[:3]: # Print first 3 headers
|
||||||
|
print(f"{key}: {value}")
|
||||||
|
|
||||||
|
print(f"\n--- Raw HTML (first 100 chars) ---\n{result.html[:100]}...")
|
||||||
|
print(f"\n--- Cleaned HTML (first 100 chars) ---\n{result.cleaned_html[:100]}...")
|
||||||
|
|
||||||
|
if result.markdown:
|
||||||
|
print(f"\n--- Raw Markdown (first 100 chars) ---\n{result.markdown.raw_markdown[:100]}...")
|
||||||
|
|
||||||
|
print(f"\n--- Metadata (sample) ---")
|
||||||
|
if result.metadata:
|
||||||
|
for key, value in list(result.metadata.items())[:3]: # Print first 3 metadata items
|
||||||
|
print(f"{key}: {value}")
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(access_crawl_result_fields())
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
## 2. Configuring the Browser (`BrowserConfig`)
|
||||||
|
|
||||||
|
### 2.1. Example: Initializing `AsyncWebCrawler` with a custom `BrowserConfig` object.
|
||||||
|
|
||||||
|
This example sets the browser to run in non-headless mode and uses Firefox.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||||
|
|
||||||
|
async def custom_browser_config_init():
|
||||||
|
# Configure browser to be Firefox and visible
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
browser_type="firefox",
|
||||||
|
headless=False # Set to True to run without UI
|
||||||
|
)
|
||||||
|
|
||||||
|
# Pass the custom config to the crawler
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
result = await crawler.arun(url="https://example.com")
|
||||||
|
if result.success:
|
||||||
|
print(f"Crawl successful with custom BrowserConfig (Firefox, visible)!")
|
||||||
|
print(f"Page title: {result.metadata.get('title')}")
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# This example might open a visible browser window.
|
||||||
|
# Ensure Firefox is installed if you run this.
|
||||||
|
# asyncio.run(custom_browser_config_init())
|
||||||
|
print("Skipping custom_browser_config_init example in automated run to avoid GUI interaction.")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
### 2.2. Browser Type and Headless Mode
|
||||||
|
|
||||||
|
#### 2.2.1. Example: Using Chromium browser (default).
|
||||||
|
|
||||||
|
This shows the default behavior if no `browser_type` is specified.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||||
|
|
||||||
|
async def chromium_default_crawl():
|
||||||
|
# Chromium is the default, but we can explicitly set it
|
||||||
|
browser_config = BrowserConfig(browser_type="chromium", headless=True)
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
result = await crawler.arun(url="https://example.com")
|
||||||
|
if result.success:
|
||||||
|
print("Crawl successful with Chromium (default)!")
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(chromium_default_crawl())
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
#### 2.2.2. Example: Using Firefox browser (`browser_type="firefox"`).
|
||||||
|
|
||||||
|
Ensure Firefox is installed on your system for this example to run.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||||
|
|
||||||
|
async def firefox_crawl():
|
||||||
|
browser_config = BrowserConfig(browser_type="firefox", headless=True)
|
||||||
|
try:
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
result = await crawler.arun(url="https://example.com")
|
||||||
|
if result.success:
|
||||||
|
print("Crawl successful with Firefox!")
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed with Firefox: {result.error_message}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error running Firefox example: {e}. Ensure Firefox is installed and Playwright browsers are set up (`crawl4ai-setup`).")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# asyncio.run(firefox_crawl())
|
||||||
|
print("Skipping Firefox example in automated run. Uncomment to run if Firefox is installed.")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
#### 2.2.3. Example: Using WebKit browser (`browser_type="webkit"`).
|
||||||
|
|
||||||
|
Ensure WebKit (Safari's engine) is installed via Playwright.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||||
|
|
||||||
|
async def webkit_crawl():
|
||||||
|
browser_config = BrowserConfig(browser_type="webkit", headless=True)
|
||||||
|
try:
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
result = await crawler.arun(url="https://example.com")
|
||||||
|
if result.success:
|
||||||
|
print("Crawl successful with WebKit!")
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed with WebKit: {result.error_message}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error running WebKit example: {e}. Ensure WebKit is installed and Playwright browsers are set up (`crawl4ai-setup`).")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# asyncio.run(webkit_crawl())
|
||||||
|
print("Skipping WebKit example in automated run. Uncomment to run if WebKit is installed.")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
#### 2.2.4. Example: Running the browser in non-headless mode (`headless=False`) for visual debugging.
|
||||||
|
|
||||||
|
This will open a visible browser window.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||||
|
|
||||||
|
async def non_headless_crawl():
|
||||||
|
browser_config = BrowserConfig(headless=False) # Browser window will be visible
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
result = await crawler.arun(url="https
|
||||||
|
```
|
||||||
890
docs/md_v2/assets/llmtxt/crawl4ai_core_memory_content.llm.txt
Normal file
890
docs/md_v2/assets/llmtxt/crawl4ai_core_memory_content.llm.txt
Normal file
@@ -0,0 +1,890 @@
|
|||||||
|
The following is the detailed Markdown outline for a "Foundational Memory" document covering the `core` component of `crawl4ai`, generated from the library's source code as the primary source of truth for API details.
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
# Detailed Outline for crawl4ai - core Component
|
||||||
|
|
||||||
|
**Target Document Type:** memory
|
||||||
|
**Target Output Filename Suggestion:** `llm_memory_core.md`
|
||||||
|
**Library Version Context:** 0.6.3
|
||||||
|
**Outline Generation Date:** 2025-05-24
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Introduction to Core Components
|
||||||
|
* 1.1. Purpose: Provides the foundational classes, configurations, and data models for web crawling and scraping operations within the `crawl4ai` library.
|
||||||
|
* 1.2. Key Functionalities:
|
||||||
|
* Orchestration of asynchronous web crawling (`AsyncWebCrawler`).
|
||||||
|
* Configuration of browser behavior and specific crawl runs (`BrowserConfig`, `CrawlerRunConfig`).
|
||||||
|
* Standardized data structures for crawl results and associated data (`CrawlResult`, `Media`, `Links`, etc.).
|
||||||
|
* Strategies for fetching web content (`AsyncPlaywrightCrawlerStrategy`, `AsyncHTTPCrawlerStrategy`).
|
||||||
|
* Management of browser instances and sessions (`BrowserManager`, `ManagedBrowser`).
|
||||||
|
* Asynchronous logging (`AsyncLogger`).
|
||||||
|
* 1.3. Relationship with other `crawl4ai` components:
|
||||||
|
* The `core` component serves as the foundation upon which specialized strategies (e.g., PDF processing, Markdown generation, content extraction, chunking, content filtering) are built and integrated.
|
||||||
|
|
||||||
|
## 2. Main Class: `AsyncWebCrawler`
|
||||||
|
* 2.1. Purpose: The primary class for orchestrating asynchronous web crawling operations. It manages browser instances (via a `BrowserManager`), applies crawling strategies, and processes HTML content to produce structured results.
|
||||||
|
* 2.2. Initialization (`__init__`)
|
||||||
|
* 2.2.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
|
||||||
|
config: Optional[BrowserConfig] = None,
|
||||||
|
base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
|
||||||
|
thread_safe: bool = False,
|
||||||
|
logger: Optional[AsyncLoggerBase] = None,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* 2.2.2. Parameters:
|
||||||
|
* `crawler_strategy (Optional[AsyncCrawlerStrategy])`: The strategy to use for fetching web content. If `None`, defaults to `AsyncPlaywrightCrawlerStrategy` initialized with `config` and `logger`.
|
||||||
|
* `config (Optional[BrowserConfig])`: Configuration object for browser settings. If `None`, a default `BrowserConfig()` is created.
|
||||||
|
* `base_directory (str)`: The base directory for storing crawl4ai related files, such as cache and logs. Defaults to `os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())`.
|
||||||
|
* `thread_safe (bool)`: If `True`, uses an `asyncio.Lock` for thread-safe operations, particularly relevant for `arun_many`. Default: `False`.
|
||||||
|
* `logger (Optional[AsyncLoggerBase])`: An instance of a logger. If `None`, a default `AsyncLogger` is initialized using `base_directory` and `config.verbose`.
|
||||||
|
* `**kwargs`: Additional keyword arguments, primarily for backward compatibility, passed to the `AsyncPlaywrightCrawlerStrategy` if `crawler_strategy` is not provided.
|
||||||
|
* 2.3. Key Public Attributes/Properties:
|
||||||
|
* `browser_config (BrowserConfig)`: Read-only. The browser configuration object used by the crawler.
|
||||||
|
* `crawler_strategy (AsyncCrawlerStrategy)`: Read-only. The active crawling strategy instance.
|
||||||
|
* `logger (AsyncLoggerBase)`: Read-only. The logger instance used by the crawler.
|
||||||
|
* `ready (bool)`: Read-only. `True` if the crawler has been started and is ready to perform crawl operations, `False` otherwise.
|
||||||
|
* 2.4. Lifecycle Methods:
|
||||||
|
* 2.4.1. `async start() -> AsyncWebCrawler`:
|
||||||
|
* Purpose: Asynchronously initializes the crawler strategy (e.g., launches the browser). This must be called before `arun` or `arun_many` if the crawler is not used as an asynchronous context manager.
|
||||||
|
* Returns: The `AsyncWebCrawler` instance (`self`).
|
||||||
|
* 2.4.2. `async close() -> None`:
|
||||||
|
* Purpose: Asynchronously closes the crawler strategy and cleans up resources (e.g., closes the browser). This should be called if `start()` was used explicitly.
|
||||||
|
* 2.4.3. `async __aenter__() -> AsyncWebCrawler`:
|
||||||
|
* Purpose: Entry point for asynchronous context management. Calls `self.start()`.
|
||||||
|
* Returns: The `AsyncWebCrawler` instance (`self`).
|
||||||
|
* 2.4.4. `async __aexit__(exc_type, exc_val, exc_tb) -> None`:
|
||||||
|
* Purpose: Exit point for asynchronous context management. Calls `self.close()`.
|
||||||
|
* 2.5. Primary Crawl Methods:
|
||||||
|
* 2.5.1. `async arun(url: str, config: Optional[CrawlerRunConfig] = None, **kwargs) -> RunManyReturn`:
|
||||||
|
* Purpose: Performs a single crawl operation for the given URL or raw HTML content.
|
||||||
|
* Parameters:
|
||||||
|
* `url (str)`: The URL to crawl (e.g., "http://example.com", "file:///path/to/file.html") or raw HTML content prefixed with "raw:" (e.g., "raw:<html>...</html>").
|
||||||
|
* `config (Optional[CrawlerRunConfig])`: Configuration for this specific crawl run. If `None`, a default `CrawlerRunConfig()` is used.
|
||||||
|
* `**kwargs`: Additional parameters passed to the underlying `aprocess_html` method, can be used to override settings in `config`.
|
||||||
|
* Returns: `RunManyReturn` (which resolves to `CrawlResultContainer` containing a single `CrawlResult`).
|
||||||
|
* 2.5.2. `async arun_many(urls: List[str], config: Optional[CrawlerRunConfig] = None, dispatcher: Optional[BaseDispatcher] = None, **kwargs) -> RunManyReturn`:
|
||||||
|
* Purpose: Crawls multiple URLs concurrently using a specified or default dispatcher strategy.
|
||||||
|
* Parameters:
|
||||||
|
* `urls (List[str])`: A list of URLs to crawl.
|
||||||
|
* `config (Optional[CrawlerRunConfig])`: Configuration applied to all crawl runs in this batch.
|
||||||
|
* `dispatcher (Optional[BaseDispatcher])`: The dispatcher strategy to manage concurrent crawls. Defaults to `MemoryAdaptiveDispatcher`.
|
||||||
|
* `**kwargs`: Additional parameters passed to the underlying `arun` method for each URL.
|
||||||
|
* Returns: `RunManyReturn`. If `config.stream` is `True`, returns an `AsyncGenerator[CrawlResult, None]`. Otherwise, returns a `CrawlResultContainer` (list-like) of `CrawlResult` objects.
|
||||||
|
* 2.6. Internal Processing Method (User-Facing Effects):
|
||||||
|
* 2.6.1. `async aprocess_html(url: str, html: str, extracted_content: Optional[str], config: CrawlerRunConfig, screenshot_data: Optional[str], pdf_data: Optional[bytes], verbose: bool, **kwargs) -> CrawlResult`:
|
||||||
|
* Purpose: Processes the fetched HTML content. This method is called internally by `arun` after content is fetched (either from a live crawl or cache). It applies scraping strategies, content filtering, and Markdown generation based on the `config`.
|
||||||
|
* Parameters:
|
||||||
|
* `url (str)`: The URL of the content being processed.
|
||||||
|
* `html (str)`: The raw HTML content.
|
||||||
|
* `extracted_content (Optional[str])`: Pre-extracted content from a previous step or cache.
|
||||||
|
* `config (CrawlerRunConfig)`: Configuration for this processing run.
|
||||||
|
* `screenshot_data (Optional[str])`: Base64 encoded screenshot data, if available.
|
||||||
|
* `pdf_data (Optional[bytes])`: PDF data, if available.
|
||||||
|
* `verbose (bool)`: Verbosity setting for logging during processing.
|
||||||
|
* `**kwargs`: Additional parameters, including `is_raw_html` and `redirected_url`.
|
||||||
|
* Returns: A `CrawlResult` object containing the processed data.
|
||||||
|
|
||||||
|
## 3. Core Configuration Objects
|
||||||
|
|
||||||
|
* 3.1. Class `BrowserConfig` (from `crawl4ai.async_configs`)
|
||||||
|
* 3.1.1. Purpose: Configures the browser instance launched by Playwright, including its type, mode, display settings, proxy, user agent, and persistent storage options.
|
||||||
|
* 3.1.2. Initialization (`__init__`)
|
||||||
|
* Signature:
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
browser_type: str = "chromium",
|
||||||
|
headless: bool = True,
|
||||||
|
browser_mode: str = "dedicated",
|
||||||
|
use_managed_browser: bool = False,
|
||||||
|
cdp_url: Optional[str] = None,
|
||||||
|
use_persistent_context: bool = False,
|
||||||
|
user_data_dir: Optional[str] = None,
|
||||||
|
channel: Optional[str] = "chromium", # Note: 'channel' from code, outline had 'chrome_channel'
|
||||||
|
proxy: Optional[str] = None, # Note: 'proxy' from code, outline had 'proxy_config' for this level
|
||||||
|
proxy_config: Optional[Union[ProxyConfig, dict, None]] = None,
|
||||||
|
viewport_width: int = 1080,
|
||||||
|
viewport_height: int = 600,
|
||||||
|
viewport: Optional[dict] = None,
|
||||||
|
accept_downloads: bool = False,
|
||||||
|
downloads_path: Optional[str] = None,
|
||||||
|
storage_state: Optional[Union[str, dict, None]] = None,
|
||||||
|
ignore_https_errors: bool = True,
|
||||||
|
java_script_enabled: bool = True,
|
||||||
|
sleep_on_close: bool = False,
|
||||||
|
verbose: bool = True,
|
||||||
|
cookies: Optional[list] = None,
|
||||||
|
headers: Optional[dict] = None,
|
||||||
|
user_agent: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36",
|
||||||
|
user_agent_mode: str = "",
|
||||||
|
user_agent_generator_config: Optional[dict] = None, # Note: 'user_agent_generator_config' from code
|
||||||
|
text_mode: bool = False,
|
||||||
|
light_mode: bool = False,
|
||||||
|
extra_args: Optional[list] = None,
|
||||||
|
debugging_port: int = 9222,
|
||||||
|
host: str = "localhost",
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* Key Parameters:
|
||||||
|
* `browser_type (str)`: Type of browser to launch ("chromium", "firefox", "webkit"). Default: "chromium".
|
||||||
|
* `headless (bool)`: Whether to run the browser in headless mode. Default: `True`.
|
||||||
|
* `browser_mode (str)`: How the browser should be initialized ("builtin", "dedicated", "cdp", "docker"). Default: "dedicated".
|
||||||
|
* `use_managed_browser (bool)`: Whether to launch the browser using a managed approach (e.g., via CDP). Default: `False`.
|
||||||
|
* `cdp_url (Optional[str])`: URL for Chrome DevTools Protocol endpoint. Default: `None`.
|
||||||
|
* `use_persistent_context (bool)`: Use a persistent browser context (profile). Default: `False`.
|
||||||
|
* `user_data_dir (Optional[str])`: Path to user data directory for persistent sessions. Default: `None`.
|
||||||
|
* `channel (Optional[str])`: Browser channel (e.g., "chromium", "chrome", "msedge"). Default: "chromium".
|
||||||
|
* `proxy (Optional[str])`: Simple proxy server URL string.
|
||||||
|
* `proxy_config (Optional[Union[ProxyConfig, dict, None]])`: Detailed proxy configuration object or dictionary. Takes precedence over `proxy`.
|
||||||
|
* `viewport_width (int)`: Default viewport width. Default: `1080`.
|
||||||
|
* `viewport_height (int)`: Default viewport height. Default: `600`.
|
||||||
|
* `viewport (Optional[dict])`: Dictionary to set viewport dimensions, overrides `viewport_width` and `viewport_height` if set (e.g., `{"width": 1920, "height": 1080}`). Default: `None`.
|
||||||
|
* `accept_downloads (bool)`: Whether to allow file downloads. Default: `False`.
|
||||||
|
* `downloads_path (Optional[str])`: Directory to store downloaded files. Default: `None`.
|
||||||
|
* `storage_state (Optional[Union[str, dict, None]])`: Path to a file or a dictionary containing browser storage state (cookies, localStorage). Default: `None`.
|
||||||
|
* `ignore_https_errors (bool)`: Ignore HTTPS certificate errors. Default: `True`.
|
||||||
|
* `java_script_enabled (bool)`: Enable JavaScript execution. Default: `True`.
|
||||||
|
* `user_agent (str)`: Custom User-Agent string. Default: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36".
|
||||||
|
* `user_agent_mode (str)`: Mode for generating User-Agent (e.g., "random"). Default: `""` (uses provided `user_agent`).
|
||||||
|
* `user_agent_generator_config (Optional[dict])`: Configuration for User-Agent generation if `user_agent_mode` is active. Default: `{}`.
|
||||||
|
* `text_mode (bool)`: If `True`, disables images and rich content for faster loading. Default: `False`.
|
||||||
|
* `light_mode (bool)`: Disables certain background features for performance. Default: `False`.
|
||||||
|
* `extra_args (Optional[list])`: Additional command-line arguments for the browser. Default: `None` (resolves to `[]`).
|
||||||
|
* `debugging_port (int)`: Port for browser debugging protocol. Default: `9222`.
|
||||||
|
* `host (str)`: Host for browser debugging protocol. Default: "localhost".
|
||||||
|
* 3.1.3. Key Public Methods:
|
||||||
|
* `clone(**kwargs) -> BrowserConfig`: Creates a new `BrowserConfig` instance as a copy of the current one, with specified keyword arguments overriding existing values.
|
||||||
|
* `to_dict() -> dict`: Returns a dictionary representation of the configuration object's attributes.
|
||||||
|
* `dump() -> dict`: Serializes the configuration object to a JSON-serializable dictionary, including nested objects.
|
||||||
|
* `static load(data: dict) -> BrowserConfig`: Deserializes a `BrowserConfig` instance from a dictionary (previously created by `dump`).
|
||||||
|
* `static from_kwargs(kwargs: dict) -> BrowserConfig`: Creates a `BrowserConfig` instance directly from a dictionary of keyword arguments.
|
||||||
|
|
||||||
|
* 3.2. Class `CrawlerRunConfig` (from `crawl4ai.async_configs`)
|
||||||
|
* 3.2.1. Purpose: Specifies settings for an individual crawl operation initiated by `arun()` or `arun_many()`. These settings can override or augment the global `BrowserConfig`.
|
||||||
|
* 3.2.2. Initialization (`__init__`)
|
||||||
|
* Signature:
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
# Content Processing Parameters
|
||||||
|
word_count_threshold: int = MIN_WORD_THRESHOLD,
|
||||||
|
extraction_strategy: Optional[ExtractionStrategy] = None,
|
||||||
|
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||||
|
markdown_generator: MarkdownGenerationStrategy = DefaultMarkdownGenerator(),
|
||||||
|
only_text: bool = False,
|
||||||
|
css_selector: Optional[str] = None,
|
||||||
|
target_elements: Optional[List[str]] = None,
|
||||||
|
excluded_tags: Optional[list] = None,
|
||||||
|
excluded_selector: Optional[str] = None,
|
||||||
|
keep_data_attributes: bool = False,
|
||||||
|
keep_attrs: Optional[list] = None,
|
||||||
|
remove_forms: bool = False,
|
||||||
|
prettify: bool = False,
|
||||||
|
parser_type: str = "lxml",
|
||||||
|
scraping_strategy: Optional[ContentScrapingStrategy] = None,  # Will default to WebScrapingStrategy
|
||||||
|
proxy_config: Optional[Union[ProxyConfig, dict, None]] = None,
|
||||||
|
proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
|
||||||
|
# Browser Location and Identity Parameters
|
||||||
|
locale: Optional[str] = None,
|
||||||
|
timezone_id: Optional[str] = None,
|
||||||
|
geolocation: Optional[GeolocationConfig] = None,
|
||||||
|
# SSL Parameters
|
||||||
|
fetch_ssl_certificate: bool = False,
|
||||||
|
# Caching Parameters
|
||||||
|
cache_mode: CacheMode = CacheMode.BYPASS,
|
||||||
|
session_id: Optional[str] = None,
|
||||||
|
bypass_cache: bool = False, # Legacy
|
||||||
|
disable_cache: bool = False, # Legacy
|
||||||
|
no_cache_read: bool = False, # Legacy
|
||||||
|
no_cache_write: bool = False, # Legacy
|
||||||
|
shared_data: Optional[dict] = None,
|
||||||
|
# Page Navigation and Timing Parameters
|
||||||
|
wait_until: str = "domcontentloaded",
|
||||||
|
page_timeout: int = PAGE_TIMEOUT,
|
||||||
|
wait_for: Optional[str] = None,
|
||||||
|
wait_for_timeout: Optional[int] = None,
|
||||||
|
wait_for_images: bool = False,
|
||||||
|
delay_before_return_html: float = 0.1,
|
||||||
|
mean_delay: float = 0.1,
|
||||||
|
max_range: float = 0.3,
|
||||||
|
semaphore_count: int = 5,
|
||||||
|
# Page Interaction Parameters
|
||||||
|
js_code: Optional[Union[str, List[str]]] = None,
|
||||||
|
js_only: bool = False,
|
||||||
|
ignore_body_visibility: bool = True,
|
||||||
|
scan_full_page: bool = False,
|
||||||
|
scroll_delay: float = 0.2,
|
||||||
|
process_iframes: bool = False,
|
||||||
|
remove_overlay_elements: bool = False,
|
||||||
|
simulate_user: bool = False,
|
||||||
|
override_navigator: bool = False,
|
||||||
|
magic: bool = False,
|
||||||
|
adjust_viewport_to_content: bool = False,
|
||||||
|
# Media Handling Parameters
|
||||||
|
screenshot: bool = False,
|
||||||
|
screenshot_wait_for: Optional[float] = None,
|
||||||
|
screenshot_height_threshold: int = SCREENSHOT_HEIGHT_THRESHOLD,
|
||||||
|
pdf: bool = False,
|
||||||
|
capture_mhtml: bool = False,
|
||||||
|
image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
||||||
|
image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
|
||||||
|
table_score_threshold: int = 7,
|
||||||
|
exclude_external_images: bool = False,
|
||||||
|
exclude_all_images: bool = False,
|
||||||
|
# Link and Domain Handling Parameters
|
||||||
|
exclude_social_media_domains: Optional[list] = None,
|
||||||
|
exclude_external_links: bool = False,
|
||||||
|
exclude_social_media_links: bool = False,
|
||||||
|
exclude_domains: Optional[list] = None,
|
||||||
|
exclude_internal_links: bool = False,
|
||||||
|
# Debugging and Logging Parameters
|
||||||
|
verbose: bool = True,
|
||||||
|
log_console: bool = False,
|
||||||
|
# Network and Console Capturing Parameters
|
||||||
|
capture_network_requests: bool = False,
|
||||||
|
capture_console_messages: bool = False,
|
||||||
|
# Connection Parameters (for HTTPCrawlerStrategy)
|
||||||
|
method: str = "GET",
|
||||||
|
stream: bool = False,
|
||||||
|
url: Optional[str] = None,
|
||||||
|
# Robots.txt Handling
|
||||||
|
check_robots_txt: bool = False,
|
||||||
|
# User Agent Parameters
|
||||||
|
user_agent: Optional[str] = None,
|
||||||
|
user_agent_mode: Optional[str] = None,
|
||||||
|
user_agent_generator_config: Optional[dict] = None,
|
||||||
|
# Deep Crawl Parameters
|
||||||
|
deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
|
||||||
|
# Experimental Parameters
|
||||||
|
experimental: Optional[Dict[str, Any]] = None,
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* Key Parameters:
|
||||||
|
* `word_count_threshold (int)`: Minimum word count for a content block to be considered. Default: `MIN_WORD_THRESHOLD` (200).
|
||||||
|
* `extraction_strategy (Optional[ExtractionStrategy])`: Strategy for structured data extraction (e.g., `LLMExtractionStrategy`, `JsonCssExtractionStrategy`). Default: `None` (falls back to `NoExtractionStrategy`).
|
||||||
|
* `chunking_strategy (ChunkingStrategy)`: Strategy for splitting content into chunks before extraction. Default: `RegexChunking()`.
|
||||||
|
* `markdown_generator (MarkdownGenerationStrategy)`: Strategy for converting HTML to Markdown. Default: `DefaultMarkdownGenerator()`.
|
||||||
|
* `cache_mode (CacheMode)`: Caching behavior for this run. Default: `CacheMode.BYPASS`.
|
||||||
|
* `session_id (Optional[str])`: ID for session persistence (reusing browser tabs/contexts). Default: `None`.
|
||||||
|
* `js_code (Optional[Union[str, List[str]]])`: JavaScript code snippets to execute on the page. Default: `None`.
|
||||||
|
* `wait_for (Optional[str])`: CSS selector or JS condition (prefixed with "js:") to wait for before proceeding. Default: `None`.
|
||||||
|
* `page_timeout (int)`: Timeout for page operations (e.g., navigation) in milliseconds. Default: `PAGE_TIMEOUT` (60000ms).
|
||||||
|
* `screenshot (bool)`: If `True`, capture a screenshot of the page. Default: `False`.
|
||||||
|
* `pdf (bool)`: If `True`, generate a PDF of the page. Default: `False`.
|
||||||
|
* `capture_mhtml (bool)`: If `True`, capture an MHTML snapshot of the page. Default: `False`.
|
||||||
|
* `exclude_external_links (bool)`: If `True`, exclude external links from results. Default: `False`.
|
||||||
|
* `stream (bool)`: If `True` (used with `arun_many`), results are yielded as an `AsyncGenerator`. Default: `False`.
|
||||||
|
* `check_robots_txt (bool)`: If `True`, crawler will check and respect `robots.txt` rules. Default: `False`.
|
||||||
|
* `user_agent (Optional[str])`: Override the browser's User-Agent for this specific run. Default: `None`.
|
||||||
|
* 3.2.3. Key Public Methods:
|
||||||
|
* `clone(**kwargs) -> CrawlerRunConfig`: Creates a new `CrawlerRunConfig` instance as a copy of the current one, with specified keyword arguments overriding existing values.
|
||||||
|
* `to_dict() -> dict`: Returns a dictionary representation of the configuration object's attributes.
|
||||||
|
* `dump() -> dict`: Serializes the configuration object to a JSON-serializable dictionary, including nested objects.
|
||||||
|
* `static load(data: dict) -> CrawlerRunConfig`: Deserializes a `CrawlerRunConfig` instance from a dictionary (previously created by `dump`).
|
||||||
|
* `static from_kwargs(kwargs: dict) -> CrawlerRunConfig`: Creates a `CrawlerRunConfig` instance directly from a dictionary of keyword arguments.
|
||||||
|
|
||||||
|
* 3.3. Supporting Configuration Objects (from `crawl4ai.async_configs`)
|
||||||
|
* 3.3.1. Class `GeolocationConfig`
|
||||||
|
* Purpose: Defines geolocation (latitude, longitude, accuracy) to be emulated by the browser.
|
||||||
|
* Initialization (`__init__`):
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
latitude: float,
|
||||||
|
longitude: float,
|
||||||
|
accuracy: Optional[float] = 0.0
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* Parameters:
|
||||||
|
* `latitude (float)`: Latitude coordinate (e.g., 37.7749).
|
||||||
|
* `longitude (float)`: Longitude coordinate (e.g., -122.4194).
|
||||||
|
* `accuracy (Optional[float])`: Accuracy in meters. Default: `0.0`.
|
||||||
|
* Methods:
|
||||||
|
* `static from_dict(geo_dict: Dict) -> GeolocationConfig`: Creates an instance from a dictionary.
|
||||||
|
* `to_dict() -> Dict`: Converts the instance to a dictionary.
|
||||||
|
* `clone(**kwargs) -> GeolocationConfig`: Creates a copy with updated values.
|
||||||
|
* 3.3.2. Class `ProxyConfig`
|
||||||
|
* Purpose: Defines the settings for a single proxy server, including server address, authentication credentials, and optional IP.
|
||||||
|
* Initialization (`__init__`):
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
server: str,
|
||||||
|
username: Optional[str] = None,
|
||||||
|
password: Optional[str] = None,
|
||||||
|
ip: Optional[str] = None,
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* Parameters:
|
||||||
|
* `server (str)`: Proxy server URL (e.g., "http://127.0.0.1:8080", "socks5://user:pass@host:port").
|
||||||
|
* `username (Optional[str])`: Username for proxy authentication.
|
||||||
|
* `password (Optional[str])`: Password for proxy authentication.
|
||||||
|
* `ip (Optional[str])`: Optional IP address associated with the proxy for verification.
|
||||||
|
* Methods:
|
||||||
|
* `static from_string(proxy_str: str) -> ProxyConfig`: Creates an instance from a string (e.g., "ip:port:username:password" or "ip:port").
|
||||||
|
* `static from_dict(proxy_dict: Dict) -> ProxyConfig`: Creates an instance from a dictionary.
|
||||||
|
* `static from_env(env_var: str = "PROXIES") -> List[ProxyConfig]`: Loads a list of proxies from a comma-separated environment variable.
|
||||||
|
* `to_dict() -> Dict`: Converts the instance to a dictionary.
|
||||||
|
* `clone(**kwargs) -> ProxyConfig`: Creates a copy with updated values.
|
||||||
|
* 3.3.3. Class `HTTPCrawlerConfig`
|
||||||
|
* Purpose: Configuration for the `AsyncHTTPCrawlerStrategy`, specifying HTTP method, headers, data/JSON payload, and redirect/SSL verification behavior.
|
||||||
|
* Initialization (`__init__`):
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
method: str = "GET",
|
||||||
|
headers: Optional[Dict[str, str]] = None,
|
||||||
|
data: Optional[Dict[str, Any]] = None,
|
||||||
|
json: Optional[Dict[str, Any]] = None,
|
||||||
|
follow_redirects: bool = True,
|
||||||
|
verify_ssl: bool = True,
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* Parameters:
|
||||||
|
* `method (str)`: HTTP method (e.g., "GET", "POST"). Default: "GET".
|
||||||
|
* `headers (Optional[Dict[str, str]])`: Dictionary of HTTP request headers. Default: `None`.
|
||||||
|
* `data (Optional[Dict[str, Any]])`: Dictionary of form data to send in the request body. Default: `None`.
|
||||||
|
* `json (Optional[Dict[str, Any]])`: JSON data to send in the request body. Default: `None`.
|
||||||
|
* `follow_redirects (bool)`: Whether to automatically follow HTTP redirects. Default: `True`.
|
||||||
|
* `verify_ssl (bool)`: Whether to verify SSL certificates. Default: `True`.
|
||||||
|
* Methods:
|
||||||
|
* `static from_kwargs(kwargs: dict) -> HTTPCrawlerConfig`: Creates an instance from keyword arguments.
|
||||||
|
* `to_dict() -> dict`: Converts config to a dictionary.
|
||||||
|
* `clone(**kwargs) -> HTTPCrawlerConfig`: Creates a copy with updated values.
|
||||||
|
* `dump() -> dict`: Serializes the config to a dictionary.
|
||||||
|
* `static load(data: dict) -> HTTPCrawlerConfig`: Deserializes from a dictionary.
|
||||||
|
* 3.3.4. Class `LLMConfig`
|
||||||
|
* Purpose: Configures settings for interacting with Large Language Models, including provider choice, API credentials, and generation parameters.
|
||||||
|
* Initialization (`__init__`):
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
provider: str = DEFAULT_PROVIDER,
|
||||||
|
api_token: Optional[str] = None,
|
||||||
|
base_url: Optional[str] = None,
|
||||||
|
temperature: Optional[float] = None,
|
||||||
|
max_tokens: Optional[int] = None,
|
||||||
|
top_p: Optional[float] = None,
|
||||||
|
frequency_penalty: Optional[float] = None,
|
||||||
|
presence_penalty: Optional[float] = None,
|
||||||
|
stop: Optional[List[str]] = None,
|
||||||
|
n: Optional[int] = None,
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* Key Parameters:
|
||||||
|
* `provider (str)`: Name of the LLM provider (e.g., "openai/gpt-4o", "ollama/llama3.3", "groq/llama3-8b-8192"). Default: `DEFAULT_PROVIDER` (from `crawl4ai.config`).
|
||||||
|
* `api_token (Optional[str])`: API token for the LLM provider. If prefixed with "env:", it reads from the specified environment variable (e.g., "env:OPENAI_API_KEY"). If not provided, it attempts to load from default environment variables based on the provider.
|
||||||
|
* `base_url (Optional[str])`: Custom base URL for the LLM API endpoint.
|
||||||
|
* `temperature (Optional[float])`: Sampling temperature for generation.
|
||||||
|
* `max_tokens (Optional[int])`: Maximum number of tokens to generate.
|
||||||
|
* `top_p (Optional[float])`: Nucleus sampling parameter.
|
||||||
|
* `frequency_penalty (Optional[float])`: Penalty for token frequency.
|
||||||
|
* `presence_penalty (Optional[float])`: Penalty for token presence.
|
||||||
|
* `stop (Optional[List[str]])`: List of stop sequences for generation.
|
||||||
|
* `n (Optional[int])`: Number of completions to generate.
|
||||||
|
* Methods:
|
||||||
|
* `static from_kwargs(kwargs: dict) -> LLMConfig`: Creates an instance from keyword arguments.
|
||||||
|
* `to_dict() -> dict`: Converts config to a dictionary.
|
||||||
|
* `clone(**kwargs) -> LLMConfig`: Creates a copy with updated values.
|
||||||
|
|
||||||
|
## 4. Core Data Models (Results & Payloads from `crawl4ai.models`)
|
||||||
|
|
||||||
|
* 4.1. Class `CrawlResult(BaseModel)`
|
||||||
|
* Purpose: A Pydantic model representing the comprehensive result of a single crawl and processing operation.
|
||||||
|
* Key Fields:
|
||||||
|
* `url (str)`: The final URL that was crawled (after any redirects).
|
||||||
|
* `html (str)`: The raw HTML content fetched from the URL.
|
||||||
|
* `success (bool)`: `True` if the crawl operation (fetching and initial processing) was successful, `False` otherwise.
|
||||||
|
* `cleaned_html (Optional[str])`: HTML content after sanitization and removal of unwanted tags/attributes as per configuration. Default: `None`.
|
||||||
|
* `_markdown (Optional[MarkdownGenerationResult])`: (Private Attribute) Holds the `MarkdownGenerationResult` object if Markdown generation was performed. Use the `markdown` property to access. Default: `None`.
|
||||||
|
* `markdown (Optional[Union[str, MarkdownGenerationResult]])`: (Property) Provides access to Markdown content. Behaves as a string (raw markdown) by default but allows access to `MarkdownGenerationResult` attributes (e.g., `result.markdown.fit_markdown`).
|
||||||
|
* `extracted_content (Optional[str])`: JSON string representation of structured data extracted by an `ExtractionStrategy`. Default: `None`.
|
||||||
|
* `media (Media)`: An object containing lists of `MediaItem` for images, videos, audio, and extracted tables. Default: `Media()`.
|
||||||
|
* `links (Links)`: An object containing lists of `Link` for internal and external hyperlinks found on the page. Default: `Links()`.
|
||||||
|
* `downloaded_files (Optional[List[str]])`: A list of file paths if any files were downloaded during the crawl. Default: `None`.
|
||||||
|
* `js_execution_result (Optional[Dict[str, Any]])`: The result of any JavaScript code executed on the page. Default: `None`.
|
||||||
|
* `screenshot (Optional[str])`: Base64 encoded string of the page screenshot, if `screenshot=True` was set. Default: `None`.
|
||||||
|
* `pdf (Optional[bytes])`: Raw bytes of the PDF generated from the page, if `pdf=True` was set. Default: `None`.
|
||||||
|
* `mhtml (Optional[str])`: MHTML snapshot of the page, if `capture_mhtml=True` was set. Default: `None`.
|
||||||
|
* `metadata (Optional[dict])`: Dictionary of metadata extracted from the page (e.g., title, description, OpenGraph tags, Twitter card data). Default: `None`.
|
||||||
|
* `error_message (Optional[str])`: A message describing the error if `success` is `False`. Default: `None`.
|
||||||
|
* `session_id (Optional[str])`: The session ID used for this crawl, if applicable. Default: `None`.
|
||||||
|
* `response_headers (Optional[dict])`: HTTP response headers from the server. Default: `None`.
|
||||||
|
* `status_code (Optional[int])`: HTTP status code of the response. Default: `None`.
|
||||||
|
* `ssl_certificate (Optional[SSLCertificate])`: Information about the SSL certificate if `fetch_ssl_certificate=True`. Default: `None`.
|
||||||
|
* `dispatch_result (Optional[DispatchResult])`: Metadata about the task execution from the dispatcher (e.g., timings, memory usage). Default: `None`.
|
||||||
|
* `redirected_url (Optional[str])`: The original URL if the request was redirected. Default: `None`.
|
||||||
|
* `network_requests (Optional[List[Dict[str, Any]]])`: List of captured network requests if `capture_network_requests=True`. Default: `None`.
|
||||||
|
* `console_messages (Optional[List[Dict[str, Any]]])`: List of captured browser console messages if `capture_console_messages=True`. Default: `None`.
|
||||||
|
* Methods:
|
||||||
|
* `model_dump(*args, **kwargs)`: Serializes the `CrawlResult` model to a dictionary, ensuring the `_markdown` private attribute is correctly handled and included as "markdown" in the output if present.
|
||||||
|
|
||||||
|
* 4.2. Class `MarkdownGenerationResult(BaseModel)`
|
||||||
|
* Purpose: A Pydantic model that holds various forms of Markdown generated from HTML content.
|
||||||
|
* Fields:
|
||||||
|
* `raw_markdown (str)`: The basic, direct conversion of HTML to Markdown.
|
||||||
|
* `markdown_with_citations (str)`: Markdown content with inline citations (e.g., [^1^]) and a references section.
|
||||||
|
* `references_markdown (str)`: The Markdown content for the "References" section, listing all cited links.
|
||||||
|
* `fit_markdown (Optional[str])`: Markdown generated specifically from content deemed "relevant" by a content filter (like `PruningContentFilter` or `LLMContentFilter`), if such a filter was applied. Default: `None`.
|
||||||
|
* `fit_html (Optional[str])`: The filtered HTML content that was used to generate `fit_markdown`. Default: `None`.
|
||||||
|
* Methods:
|
||||||
|
* `__str__(self) -> str`: Returns `self.raw_markdown` when the object is cast to a string.
|
||||||
|
|
||||||
|
* 4.3. Class `ScrapingResult(BaseModel)`
|
||||||
|
* Purpose: A Pydantic model representing a standardized output from content scraping strategies.
|
||||||
|
* Fields:
|
||||||
|
* `cleaned_html (str)`: The primary sanitized and processed HTML content.
|
||||||
|
* `success (bool)`: Indicates if the scraping operation was successful.
|
||||||
|
* `media (Media)`: A `Media` object containing extracted images, videos, audio, and tables.
|
||||||
|
* `links (Links)`: A `Links` object containing extracted internal and external links.
|
||||||
|
* `metadata (Dict[str, Any])`: A dictionary of metadata extracted from the page (e.g., title, description).
|
||||||
|
|
||||||
|
* 4.4. Class `MediaItem(BaseModel)`
|
||||||
|
* Purpose: A Pydantic model representing a generic media item like an image, video, or audio file.
|
||||||
|
* Fields:
|
||||||
|
* `src (Optional[str])`: The source URL of the media item. Default: `""`.
|
||||||
|
* `data (Optional[str])`: Base64 encoded data for inline media. Default: `""`.
|
||||||
|
* `alt (Optional[str])`: Alternative text for the media item (e.g., image alt text). Default: `""`.
|
||||||
|
* `desc (Optional[str])`: A description or surrounding text related to the media item. Default: `""`.
|
||||||
|
* `score (Optional[int])`: A relevance or importance score, if calculated by a strategy. Default: `0`.
|
||||||
|
* `type (str)`: The type of media (e.g., "image", "video", "audio"). Default: "image".
|
||||||
|
* `group_id (Optional[int])`: An identifier to group related media variants (e.g., different resolutions of the same image from a srcset). Default: `0`.
|
||||||
|
* `format (Optional[str])`: The detected file format (e.g., "jpeg", "png", "mp4"). Default: `None`.
|
||||||
|
* `width (Optional[int])`: The width of the media item in pixels, if available. Default: `None`.
|
||||||
|
|
||||||
|
* 4.5. Class `Link(BaseModel)`
|
||||||
|
* Purpose: A Pydantic model representing an extracted hyperlink.
|
||||||
|
* Fields:
|
||||||
|
* `href (Optional[str])`: The URL (href attribute) of the link. Default: `""`.
|
||||||
|
* `text (Optional[str])`: The anchor text of the link. Default: `""`.
|
||||||
|
* `title (Optional[str])`: The title attribute of the link, if present. Default: `""`.
|
||||||
|
* `base_domain (Optional[str])`: The base domain extracted from the `href`. Default: `""`.
|
||||||
|
|
||||||
|
* 4.6. Class `Media(BaseModel)`
|
||||||
|
* Purpose: A Pydantic model that acts as a container for lists of different types of media items found on a page.
|
||||||
|
* Fields:
|
||||||
|
* `images (List[MediaItem])`: A list of `MediaItem` objects representing images. Default: `[]`.
|
||||||
|
* `videos (List[MediaItem])`: A list of `MediaItem` objects representing videos. Default: `[]`.
|
||||||
|
* `audios (List[MediaItem])`: A list of `MediaItem` objects representing audio files. Default: `[]`.
|
||||||
|
* `tables (List[Dict])`: A list of dictionaries, where each dictionary represents an extracted HTML table with keys like "headers", "rows", "caption", "summary". Default: `[]`.
|
||||||
|
|
||||||
|
* 4.7. Class `Links(BaseModel)`
|
||||||
|
* Purpose: A Pydantic model that acts as a container for lists of internal and external links.
|
||||||
|
* Fields:
|
||||||
|
* `internal (List[Link])`: A list of `Link` objects considered internal to the crawled site. Default: `[]`.
|
||||||
|
* `external (List[Link])`: A list of `Link` objects pointing to external sites. Default: `[]`.
|
||||||
|
|
||||||
|
* 4.8. Class `AsyncCrawlResponse(BaseModel)`
|
||||||
|
* Purpose: A Pydantic model representing the raw response from a crawler strategy's `crawl` method. This data is then processed further to create a `CrawlResult`.
|
||||||
|
* Fields:
|
||||||
|
* `html (str)`: The raw HTML content of the page.
|
||||||
|
* `response_headers (Dict[str, str])`: A dictionary of HTTP response headers.
|
||||||
|
* `js_execution_result (Optional[Dict[str, Any]])`: The result from any JavaScript code executed on the page. Default: `None`.
|
||||||
|
* `status_code (int)`: The HTTP status code of the response.
|
||||||
|
* `screenshot (Optional[str])`: Base64 encoded screenshot data, if captured. Default: `None`.
|
||||||
|
* `pdf_data (Optional[bytes])`: Raw PDF data, if captured. Default: `None`.
|
||||||
|
* `mhtml_data (Optional[str])`: MHTML snapshot data, if captured. Default: `None`.
|
||||||
|
* `downloaded_files (Optional[List[str]])`: A list of local file paths for any files downloaded during the crawl. Default: `None`.
|
||||||
|
* `ssl_certificate (Optional[SSLCertificate])`: SSL certificate information for the site. Default: `None`.
|
||||||
|
* `redirected_url (Optional[str])`: The original URL requested if the final URL is a result of redirection. Default: `None`.
|
||||||
|
* `network_requests (Optional[List[Dict[str, Any]]])`: Captured network requests if enabled. Default: `None`.
|
||||||
|
* `console_messages (Optional[List[Dict[str, Any]]])`: Captured console messages if enabled. Default: `None`.
|
||||||
|
|
||||||
|
* 4.9. Class `TokenUsage(BaseModel)`
|
||||||
|
* Purpose: A Pydantic model to track token usage statistics for interactions with Large Language Models.
|
||||||
|
* Fields:
|
||||||
|
* `completion_tokens (int)`: Number of tokens used for the LLM's completion/response. Default: `0`.
|
||||||
|
* `prompt_tokens (int)`: Number of tokens used for the input prompt to the LLM. Default: `0`.
|
||||||
|
* `total_tokens (int)`: Total number of tokens used (prompt + completion). Default: `0`.
|
||||||
|
* `completion_tokens_details (Optional[dict])`: Provider-specific detailed breakdown of completion tokens. Default: `None`.
|
||||||
|
* `prompt_tokens_details (Optional[dict])`: Provider-specific detailed breakdown of prompt tokens. Default: `None`.
|
||||||
|
|
||||||
|
* 4.10. Class `SSLCertificate(dict)` (from `crawl4ai.ssl_certificate`)
|
||||||
|
* Purpose: Represents an SSL certificate's information, behaving like a dictionary for direct JSON serialization and easy access to its fields.
|
||||||
|
* Key Fields (accessed as dictionary keys):
|
||||||
|
* `subject (dict)`: Dictionary of subject fields (e.g., `{"CN": "example.com", "O": "Example Inc."}`).
|
||||||
|
* `issuer (dict)`: Dictionary of issuer fields.
|
||||||
|
* `version (int)`: Certificate version number.
|
||||||
|
* `serial_number (str)`: Certificate serial number (hexadecimal string).
|
||||||
|
* `not_before (str)`: Validity start date and time (ASN.1/UTC format string, e.g., "YYYYMMDDHHMMSSZ").
|
||||||
|
* `not_after (str)`: Validity end date and time (ASN.1/UTC format string).
|
||||||
|
* `fingerprint (str)`: SHA-256 fingerprint of the certificate (lowercase hex string).
|
||||||
|
* `signature_algorithm (str)`: The algorithm used to sign the certificate (e.g., "sha256WithRSAEncryption").
|
||||||
|
* `raw_cert (str)`: Base64 encoded string of the raw DER-encoded certificate.
|
||||||
|
* `extensions (List[dict])`: A list of dictionaries, each representing a certificate extension with "name" and "value" keys.
|
||||||
|
* Static Methods:
|
||||||
|
* `from_url(url: str, timeout: int = 10) -> Optional[SSLCertificate]`: Fetches the SSL certificate from the given URL and returns an `SSLCertificate` instance, or `None` on failure.
|
||||||
|
* Instance Methods:
|
||||||
|
* `to_json(filepath: Optional[str] = None) -> Optional[str]`: Exports the certificate information as a JSON string. If `filepath` is provided, writes to the file and returns `None`.
|
||||||
|
* `to_pem(filepath: Optional[str] = None) -> Optional[str]`: Exports the certificate in PEM format as a string. If `filepath` is provided, writes to the file and returns `None`.
|
||||||
|
* `to_der(filepath: Optional[str] = None) -> Optional[bytes]`: Exports the raw certificate in DER format as bytes. If `filepath` is provided, writes to the file and returns `None`.
|
||||||
|
* Example:
|
||||||
|
```python
|
||||||
|
# Assuming 'cert' is an SSLCertificate instance
|
||||||
|
# print(cert["subject"]["CN"])
|
||||||
|
# cert.to_pem("my_cert.pem")
|
||||||
|
```
|
||||||
|
|
||||||
|
* 4.11. Class `DispatchResult(BaseModel)`
|
||||||
|
* Purpose: Contains metadata about a task's execution when processed by a dispatcher (e.g., in `arun_many`).
|
||||||
|
* Fields:
|
||||||
|
* `task_id (str)`: A unique identifier for the dispatched task.
|
||||||
|
* `memory_usage (float)`: Memory usage (in MB) recorded during the task's execution.
|
||||||
|
* `peak_memory (float)`: Peak memory usage (in MB) recorded during the task's execution.
|
||||||
|
* `start_time (Union[datetime, float])`: The start time of the task (can be a `datetime` object or a Unix timestamp float).
|
||||||
|
* `end_time (Union[datetime, float])`: The end time of the task.
|
||||||
|
* `error_message (str)`: Any error message if the task failed during dispatch or execution. Default: `""`.
|
||||||
|
|
||||||
|
* 4.12. `CrawlResultContainer(Generic[CrawlResultT])`
|
||||||
|
* Purpose: A generic container for `CrawlResult` objects, primarily used as the return type for `arun_many` when `stream=False`. It behaves like a list, allowing iteration, indexing, and length checking.
|
||||||
|
* Methods:
|
||||||
|
* `__iter__(self)`: Allows iteration over the contained `CrawlResult` objects.
|
||||||
|
* `__getitem__(self, index)`: Allows accessing `CrawlResult` objects by index.
|
||||||
|
* `__len__(self)`: Returns the number of `CrawlResult` objects contained.
|
||||||
|
* `__repr__(self)`: Provides a string representation of the container.
|
||||||
|
* Attribute:
|
||||||
|
* `_results (List[CrawlResultT])`: The internal list holding the `CrawlResult` objects.
|
||||||
|
|
||||||
|
* 4.13. `RunManyReturn` (Type Alias from `crawl4ai.models`)
|
||||||
|
* Purpose: A type alias defining the possible return types for the `arun_many` method of `AsyncWebCrawler`.
|
||||||
|
* Definition: `Union[CrawlResultContainer[CrawlResult], AsyncGenerator[CrawlResult, None]]`
|
||||||
|
* This means `arun_many` will return a `CrawlResultContainer` (a list-like object of all `CrawlResult` instances) if `CrawlerRunConfig.stream` is `False` (the default).
|
||||||
|
* It will return an `AsyncGenerator` yielding individual `CrawlResult` instances if `CrawlerRunConfig.stream` is `True`.
|
||||||
|
|
||||||
|
## 5. Core Crawler Strategies (from `crawl4ai.async_crawler_strategy`)
|
||||||
|
|
||||||
|
* 5.1. Abstract Base Class `AsyncCrawlerStrategy(ABC)`
|
||||||
|
* Purpose: Defines the common interface that all asynchronous crawler strategies must implement. This allows `AsyncWebCrawler` to use different fetching mechanisms (e.g., Playwright, HTTP requests) interchangeably.
|
||||||
|
* Initialization (`__init__`):
|
||||||
|
```python
|
||||||
|
def __init__(self, browser_config: BrowserConfig, logger: AsyncLoggerBase):
|
||||||
|
```
|
||||||
|
* Parameters:
|
||||||
|
* `browser_config (BrowserConfig)`: The browser configuration to be used by the strategy.
|
||||||
|
* `logger (AsyncLoggerBase)`: The logger instance for logging strategy-specific events.
|
||||||
|
* Key Abstract Methods (must be implemented by concrete subclasses):
|
||||||
|
* `async crawl(self, url: str, config: CrawlerRunConfig) -> AsyncCrawlResponse`:
|
||||||
|
* Purpose: Fetches the content from the given URL according to the `config`.
|
||||||
|
* Returns: An `AsyncCrawlResponse` object containing the raw fetched data.
|
||||||
|
* `async __aenter__(self)`:
|
||||||
|
* Purpose: Asynchronous context manager entry, typically for initializing resources (e.g., launching a browser).
|
||||||
|
* `async __aexit__(self, exc_type, exc_val, exc_tb)`:
|
||||||
|
* Purpose: Asynchronous context manager exit, for cleaning up resources.
|
||||||
|
* Key Concrete Methods (available to all strategies):
|
||||||
|
* `set_custom_headers(self, headers: dict) -> None`:
|
||||||
|
* Purpose: Sets custom HTTP headers to be used by the strategy for subsequent requests.
|
||||||
|
* `update_user_agent(self, user_agent: str) -> None`:
|
||||||
|
* Purpose: Updates the User-Agent string used by the strategy.
|
||||||
|
* `set_hook(self, hook_name: str, callback: Callable) -> None`:
|
||||||
|
* Purpose: Registers a callback function for a specific hook point in the crawling lifecycle.
|
||||||
|
* `async run_hook(self, hook_name: str, *args, **kwargs) -> Any`:
|
||||||
|
* Purpose: Executes a registered hook with the given arguments.
|
||||||
|
* `async get_default_context(self) -> BrowserContext`:
|
||||||
|
* Purpose: Retrieves the default browser context (Playwright specific, might raise `NotImplementedError` in non-Playwright strategies).
|
||||||
|
* `async create_new_page(self, context: BrowserContext) -> Page`:
|
||||||
|
* Purpose: Creates a new page within a given browser context (Playwright specific).
|
||||||
|
* `async get_page(self, url: str, config: CrawlerRunConfig, session_id: Optional[str]) -> Tuple[Page, BrowserContext]`:
|
||||||
|
* Purpose: Gets an existing page/context for a session or creates a new one (Playwright specific, managed by `BrowserManager`).
|
||||||
|
* `async close_page(self, page: Page, session_id: Optional[str]) -> None`:
|
||||||
|
* Purpose: Closes a page, potentially keeping the associated context/session alive (Playwright specific).
|
||||||
|
* `async kill_session(self, session_id: str) -> None`:
|
||||||
|
* Purpose: Kills (closes) a specific browser session, including its page and context (Playwright specific).
|
||||||
|
|
||||||
|
* 5.2. Class `AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy)`
|
||||||
|
* Purpose: The default crawler strategy, using Playwright to control a web browser for fetching and interacting with web pages. It supports complex JavaScript execution and provides hooks for various stages of the crawl.
|
||||||
|
* Initialization (`__init__`):
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
browser_config: Optional[BrowserConfig] = None,
|
||||||
|
logger: Optional[AsyncLoggerBase] = None,
|
||||||
|
browser_manager: Optional[BrowserManager] = None
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* Parameters:
|
||||||
|
* `browser_config (Optional[BrowserConfig])`: Browser configuration. Defaults to a new `BrowserConfig()` if not provided.
|
||||||
|
* `logger (Optional[AsyncLoggerBase])`: Logger instance. Defaults to a new `AsyncLogger()`.
|
||||||
|
* `browser_manager (Optional[BrowserManager])`: An instance of `BrowserManager` to manage browser lifecycles and contexts. If `None`, a new `BrowserManager` is created internally.
|
||||||
|
* Key Overridden/Implemented Methods:
|
||||||
|
* `async crawl(self, url: str, config: CrawlerRunConfig) -> AsyncCrawlResponse`:
|
||||||
|
* Purpose: Implements the crawling logic using Playwright. It navigates to the URL, executes JavaScript if specified, waits for conditions, captures screenshots/PDFs if requested, and returns the page content and other metadata.
|
||||||
|
* `async aprocess_html(self, url: str, html: str, config: CrawlerRunConfig, **kwargs) -> CrawlResult`:
|
||||||
|
* Purpose: (Note: While `AsyncWebCrawler` calls this, the default implementation is in `AsyncPlaywrightCrawlerStrategy` for convenience, acting as a bridge to the scraping strategy.) Processes the fetched HTML to produce a `CrawlResult`. This involves using the `scraping_strategy` from the `config` (defaults to `WebScrapingStrategy`) to clean HTML, extract media/links, and then uses the `markdown_generator` to produce Markdown.
|
||||||
|
* Specific Public Methods:
|
||||||
|
* `async_create_new_context(self, config: Optional[CrawlerRunConfig] = None) -> BrowserContext`:
|
||||||
|
* Purpose: Creates a new Playwright `BrowserContext` based on the global `BrowserConfig` and optional overrides from `CrawlerRunConfig`.
|
||||||
|
* `async_setup_context_default(self, context: BrowserContext, config: Optional[CrawlerRunConfig] = None) -> None`:
|
||||||
|
* Purpose: Applies default settings to a `BrowserContext`, such as viewport size, user agent, custom headers, locale, timezone, and geolocation, based on `BrowserConfig` and `CrawlerRunConfig`.
|
||||||
|
* `async_setup_context_hooks(self, context: BrowserContext, config: CrawlerRunConfig) -> None`:
|
||||||
|
* Purpose: Sets up event listeners on the context for capturing network requests and console messages if `config.capture_network_requests` or `config.capture_console_messages` is `True`.
|
||||||
|
* `async_handle_storage_state(self, context: BrowserContext, config: CrawlerRunConfig) -> None`:
|
||||||
|
* Purpose: Loads cookies and localStorage from a `storage_state` file or dictionary (specified in `BrowserConfig` or `CrawlerRunConfig`) into the given `BrowserContext`.
|
||||||
|
* Hooks (Callable via `set_hook(hook_name, callback)` and executed by `async_run_hook`):
|
||||||
|
* `on_browser_created`: Called after the Playwright browser instance is launched or connected. Callback receives `(browser, **kwargs)`.
|
||||||
|
* `on_page_context_created`: Called after a new Playwright `BrowserContext` and `Page` are created. Callback receives `(page, context, **kwargs)`.
|
||||||
|
* `before_goto`: Called just before `page.goto(url)` is executed. Callback receives `(page, context, url, **kwargs)`.
|
||||||
|
* `after_goto`: Called after `page.goto(url)` completes successfully. Callback receives `(page, context, url, response, **kwargs)`.
|
||||||
|
* `on_user_agent_updated`: Called when the User-Agent string is updated for a context. Callback receives `(page, context, user_agent, **kwargs)`.
|
||||||
|
* `on_execution_started`: Called when `js_code` execution begins on a page. Callback receives `(page, context, **kwargs)`.
|
||||||
|
* `before_retrieve_html`: Called just before the final HTML content is retrieved from the page. Callback receives `(page, context, **kwargs)`.
|
||||||
|
* `before_return_html`: Called just before the `AsyncCrawlResponse` is returned by the `crawl()` method of the strategy. Callback receives `(page, context, html_content, **kwargs)`.
|
||||||
|
|
||||||
|
* 5.3. Class `AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy)`
|
||||||
|
* Purpose: A lightweight crawler strategy that uses direct HTTP requests (via `httpx`) instead of a full browser. Suitable for static sites or when JavaScript execution is not needed.
|
||||||
|
* Initialization (`__init__`):
|
||||||
|
```python
|
||||||
|
def __init__(self, http_config: Optional[HTTPCrawlerConfig] = None, logger: Optional[AsyncLoggerBase] = None):
|
||||||
|
```
|
||||||
|
* Parameters:
|
||||||
|
* `http_config (Optional[HTTPCrawlerConfig])`: Configuration for HTTP requests (method, headers, data, etc.). Defaults to a new `HTTPCrawlerConfig()`.
|
||||||
|
* `logger (Optional[AsyncLoggerBase])`: Logger instance. Defaults to a new `AsyncLogger()`.
|
||||||
|
* Key Overridden/Implemented Methods:
|
||||||
|
* `async crawl(self, url: str, http_config: Optional[HTTPCrawlerConfig] = None, **kwargs) -> AsyncCrawlResponse`:
|
||||||
|
* Purpose: Fetches content from the URL using an HTTP GET or POST request via `httpx`. Does not execute JavaScript. Returns an `AsyncCrawlResponse` with HTML, status code, and headers. Screenshot, PDF, and MHTML capabilities are not available with this strategy.
|
||||||
|
|
||||||
|
## 6. Browser Management (from `crawl4ai.browser_manager`)
|
||||||
|
|
||||||
|
* 6.1. Class `BrowserManager`
|
||||||
|
* Purpose: Manages the lifecycle of Playwright browser instances and their contexts. It handles launching/connecting to browsers, creating new contexts with specific configurations, managing sessions for page reuse, and cleaning up resources.
|
||||||
|
* Initialization (`__init__`):
|
||||||
|
```python
|
||||||
|
def __init__(self, browser_config: BrowserConfig, logger: Optional[AsyncLoggerBase] = None):
|
||||||
|
```
|
||||||
|
* Parameters:
|
||||||
|
* `browser_config (BrowserConfig)`: The global browser configuration settings.
|
||||||
|
* `logger (Optional[AsyncLoggerBase])`: Logger instance for browser management events.
|
||||||
|
* Key Methods:
|
||||||
|
* `async start() -> None`: Initializes the Playwright instance and launches or connects to the browser based on `browser_config` (e.g., launches a new browser instance or connects to an existing CDP endpoint via `ManagedBrowser`).
|
||||||
|
* `async create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> playwright.async_api.BrowserContext`: Creates a new browser context. If `crawlerRunConfig` is provided, its settings (e.g., locale, viewport, proxy) can override the global `BrowserConfig`.
|
||||||
|
* `async setup_context(self, context: playwright.async_api.BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None, is_default: bool = False) -> None`: Applies various settings to a given browser context, including headers, cookies, viewport, geolocation, permissions, and storage state, based on `BrowserConfig` and `CrawlerRunConfig`.
|
||||||
|
* `async get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[playwright.async_api.Page, playwright.async_api.BrowserContext]`: Retrieves an existing page and context for a given `session_id` (if present in `crawlerRunConfig` and the session is active) or creates a new page and context. Manages context reuse based on a signature derived from `CrawlerRunConfig` to ensure contexts with different core settings (like proxy, locale) are isolated.
|
||||||
|
* `async kill_session(self, session_id: str) -> None`: Closes the page and browser context associated with the given `session_id`, effectively ending that session.
|
||||||
|
* `async close() -> None`: Closes all managed browser contexts and the main browser instance.
|
||||||
|
|
||||||
|
* 6.2. Class `ManagedBrowser`
|
||||||
|
* Purpose: Manages the lifecycle of a single, potentially persistent, browser process. It's used when `BrowserConfig.use_managed_browser` is `True` or `BrowserConfig.use_persistent_context` is `True`. It handles launching the browser with a specific user data directory and connecting via CDP.
|
||||||
|
* Initialization (`__init__`):
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
browser_type: str = "chromium",
|
||||||
|
user_data_dir: Optional[str] = None,
|
||||||
|
headless: bool = False,
|
||||||
|
logger=None,
|
||||||
|
host: str = "localhost",
|
||||||
|
debugging_port: int = 9222,
|
||||||
|
cdp_url: Optional[str] = None,
|
||||||
|
browser_config: Optional[BrowserConfig] = None
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* Parameters:
|
||||||
|
* `browser_type (str)`: "chromium", "firefox", or "webkit". Default: "chromium".
|
||||||
|
* `user_data_dir (Optional[str])`: Path to the user data directory for the browser profile. If `None`, a temporary directory might be created.
|
||||||
|
* `headless (bool)`: Whether to launch the browser in headless mode. Default: `False` (typically for managed/persistent scenarios).
|
||||||
|
* `logger`: Logger instance.
|
||||||
|
* `host (str)`: Host for the debugging port. Default: "localhost".
|
||||||
|
* `debugging_port (int)`: Port for the Chrome DevTools Protocol. Default: `9222`.
|
||||||
|
* `cdp_url (Optional[str])`: If provided, attempts to connect to an existing browser at this CDP URL instead of launching a new one.
|
||||||
|
* `browser_config (Optional[BrowserConfig])`: The `BrowserConfig` object providing overall browser settings.
|
||||||
|
* Key Methods:
|
||||||
|
* `async start() -> str`: Starts the browser process (if not connecting to an existing `cdp_url`). If a new browser is launched, it uses the specified `user_data_dir` and `debugging_port`.
|
||||||
|
* Returns: The CDP endpoint URL (e.g., "http://localhost:9222").
|
||||||
|
* `async cleanup() -> None`: Terminates the browser process (if launched by this instance) and removes any temporary user data directory created by it.
|
||||||
|
* Class Methods (each takes `cls` as its first parameter):
|
||||||
|
* `async create_profile(cls, browser_config: Optional[BrowserConfig] = None, profile_name: Optional[str] = None, logger=None) -> str`:
|
||||||
|
* Purpose: Launches a browser instance with a new or existing user profile, allowing interactive setup (e.g., manual login, cookie acceptance). The browser remains open until the user closes it.
|
||||||
|
* Parameters:
|
||||||
|
* `browser_config (Optional[BrowserConfig])`: Optional browser configuration to use.
|
||||||
|
* `profile_name (Optional[str])`: Name for the profile. If `None`, a default name is used.
|
||||||
|
* `logger`: Logger instance.
|
||||||
|
* Returns: The path to the created/used user data directory, which can then be passed to `BrowserConfig.user_data_dir`.
|
||||||
|
* `list_profiles(cls) -> List[str]`:
|
||||||
|
* Purpose: Lists the names of all browser profiles stored in the default Crawl4AI profiles directory (`~/.crawl4ai/profiles`).
|
||||||
|
* Returns: A list of profile name strings.
|
||||||
|
* `delete_profile(cls, profile_name_or_path: str) -> bool`:
|
||||||
|
* Purpose: Deletes a browser profile either by its name (if in the default directory) or by its full path.
|
||||||
|
* Returns: `True` if deletion was successful, `False` otherwise.
|
||||||
|
|
||||||
|
* 6.3. Function `clone_runtime_state(src: BrowserContext, dst: BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None, browserConfig: Optional[BrowserConfig] = None) -> None`
|
||||||
|
* Purpose: Asynchronously copies runtime state (cookies, localStorage, session storage) from a source `BrowserContext` to a destination `BrowserContext`. Can also apply headers and geolocation from `CrawlerRunConfig` or `BrowserConfig` to the destination context.
|
||||||
|
* Parameters:
|
||||||
|
* `src (BrowserContext)`: The source browser context.
|
||||||
|
* `dst (BrowserContext)`: The destination browser context.
|
||||||
|
* `crawlerRunConfig (Optional[CrawlerRunConfig])`: Optional run configuration to apply to `dst`.
|
||||||
|
* `browserConfig (Optional[BrowserConfig])`: Optional browser configuration to apply to `dst`.
|
||||||
|
|
||||||
|
## 7. Proxy Rotation Strategies (from `crawl4ai.proxy_strategy`)
|
||||||
|
|
||||||
|
* 7.1. Abstract Base Class `ProxyRotationStrategy(ABC)`
|
||||||
|
* Purpose: Defines the interface for strategies that provide a sequence of proxy configurations, enabling proxy rotation.
|
||||||
|
* Abstract Methods:
|
||||||
|
* `async get_next_proxy(self) -> Optional[ProxyConfig]`:
|
||||||
|
* Purpose: Asynchronously retrieves the next `ProxyConfig` from the strategy.
|
||||||
|
* Returns: A `ProxyConfig` object or `None` if no more proxies are available or an error occurs.
|
||||||
|
* `add_proxies(self, proxies: List[ProxyConfig]) -> None`:
|
||||||
|
* Purpose: Adds a list of `ProxyConfig` objects to the strategy's pool of proxies.
|
||||||
|
|
||||||
|
* 7.2. Class `RoundRobinProxyStrategy(ProxyRotationStrategy)`
|
||||||
|
* Purpose: A simple proxy rotation strategy that cycles through a list of provided proxies in a round-robin fashion.
|
||||||
|
* Initialization (`__init__`):
|
||||||
|
```python
|
||||||
|
def __init__(self, proxies: Optional[List[ProxyConfig]] = None):
|
||||||
|
```
|
||||||
|
* Parameters:
|
||||||
|
* `proxies (Optional[List[ProxyConfig]])`: An initial list of `ProxyConfig` objects. If `None`, the list is empty and proxies must be added via `add_proxies`.
|
||||||
|
* Methods:
|
||||||
|
* `add_proxies(self, proxies: List[ProxyConfig]) -> None`: Adds new `ProxyConfig` objects to the internal list of proxies and reinitializes the cycle.
|
||||||
|
* `async get_next_proxy(self) -> Optional[ProxyConfig]`: Returns the next `ProxyConfig` from the list, cycling back to the beginning when the end is reached. Returns `None` if the list is empty.
|
||||||
|
|
||||||
|
## 8. Logging (from `crawl4ai.async_logger`)
|
||||||
|
|
||||||
|
* 8.1. Abstract Base Class `AsyncLoggerBase(ABC)`
|
||||||
|
* Purpose: Defines the basic interface for an asynchronous logger. Concrete implementations should provide methods for logging messages at different levels.
|
||||||
|
* 8.2. Class `AsyncLogger(AsyncLoggerBase)`
|
||||||
|
* Purpose: The default asynchronous logger for `crawl4ai`. It provides structured logging to both the console and optionally to a file, with customizable icons, colors, and verbosity levels.
|
||||||
|
* Initialization (`__init__`):
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
log_file: Optional[str] = None,
|
||||||
|
verbose: bool = True,
|
||||||
|
tag_width: int = 15,
|
||||||
|
icons: Optional[Dict[str, str]] = None,
|
||||||
|
colors: Optional[Dict[LogLevel, LogColor]] = None,
|
||||||
|
log_level: LogLevel = LogLevel.INFO
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* Parameters:
|
||||||
|
* `log_file (Optional[str])`: Path to a file where logs should be written. If `None`, logs only to console.
|
||||||
|
* `verbose (bool)`: If `True`, enables more detailed logging (DEBUG level). Default: `True`.
|
||||||
|
* `tag_width (int)`: Width for the tag part of the log message. Default: `15`.
|
||||||
|
* `icons (Optional[Dict[str, str]])`: Custom icons for different log tags.
|
||||||
|
* `colors (Optional[Dict[LogLevel, LogColor]])`: Custom colors for different log levels.
|
||||||
|
* `log_level (LogLevel)`: Minimum log level to output.
|
||||||
|
* Key Methods (for logging):
|
||||||
|
* `info(self, message: str, tag: Optional[str] = None, **params) -> None`: Logs an informational message.
|
||||||
|
* `warning(self, message: str, tag: Optional[str] = None, **params) -> None`: Logs a warning message.
|
||||||
|
* `error(self, message: str, tag: Optional[str] = None, **params) -> None`: Logs an error message.
|
||||||
|
* `debug(self, message: str, tag: Optional[str] = None, **params) -> None`: Logs a debug message (only if `verbose=True` or `log_level` is DEBUG).
|
||||||
|
* `url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", **params) -> None`: Logs the status of a URL fetch operation, including success/failure and timing.
|
||||||
|
* `error_status(self, url: str, error: str, tag: str = "ERROR", **params) -> None`: Logs an error encountered for a specific URL.
|
||||||
|
|
||||||
|
## 9. Core Utility Functions (from `crawl4ai.async_configs`)
|
||||||
|
* 9.1. `to_serializable_dict(obj: Any, ignore_default_value: bool = False) -> Dict`
|
||||||
|
* Purpose: Recursively converts a Python object (often a Pydantic model or a dataclass instance used for configuration) into a dictionary that is safe for JSON serialization. It handles nested objects, enums, and basic types.
|
||||||
|
* Parameters:
|
||||||
|
* `obj (Any)`: The object to be serialized.
|
||||||
|
* `ignore_default_value (bool)`: If `True`, fields whose current value is the same as their default value (if applicable, e.g., for Pydantic models) might be omitted from the resulting dictionary. Default: `False`.
|
||||||
|
* Returns: `Dict` - A JSON-serializable dictionary representation of the object.
|
||||||
|
* 9.2. `from_serializable_dict(data: Any) -> Any`
|
||||||
|
* Purpose: Recursively reconstructs Python objects from a dictionary representation (typically one created by `to_serializable_dict`). It attempts to instantiate classes based on a "type" key in the dictionary if present.
|
||||||
|
* Parameters:
|
||||||
|
* `data (Any)`: The dictionary (or basic type) to be deserialized.
|
||||||
|
* Returns: `Any` - The reconstructed Python object or the original data if no special deserialization rule applies.
|
||||||
|
* 9.3. `is_empty_value(value: Any) -> bool`
|
||||||
|
* Purpose: Checks if a given value is considered "empty" (e.g., `None`, an empty string, an empty list, an empty dictionary).
|
||||||
|
* Returns: `bool` - `True` if the value is empty, `False` otherwise.
|
||||||
|
|
||||||
|
## 10. Enumerations (Key Enums used in Core)
|
||||||
|
* 10.1. `CacheMode` (from `crawl4ai.cache_context`, defined in `crawl4ai.async_configs` as per provided code)
|
||||||
|
* Purpose: Defines the caching behavior for crawl operations.
|
||||||
|
* Members:
|
||||||
|
* `ENABLE`: (Value: "enable") Normal caching behavior; read from cache if available, write to cache after fetching.
|
||||||
|
* `DISABLE`: (Value: "disable") No caching at all; always fetch fresh content and do not write to cache.
|
||||||
|
* `READ_ONLY`: (Value: "read_only") Only read from the cache; do not write new or updated content to the cache.
|
||||||
|
* `WRITE_ONLY`: (Value: "write_only") Only write to the cache after fetching; do not read from the cache.
|
||||||
|
* `BYPASS`: (Value: "bypass") Skip the cache entirely for this specific operation; fetch fresh content and do not write to cache. This is often the default for individual `CrawlerRunConfig` instances.
|
||||||
|
* 10.2. `DisplayMode` (from `crawl4ai.models`, used by `CrawlerMonitor`)
|
||||||
|
* Purpose: Defines the display mode for the `CrawlerMonitor`.
|
||||||
|
* Members:
|
||||||
|
* `DETAILED`: Shows detailed information for each task.
|
||||||
|
* `AGGREGATED`: Shows summary statistics and overall progress.
|
||||||
|
* 10.3. `CrawlStatus` (from `crawl4ai.models`, used by `CrawlStats`)
|
||||||
|
* Purpose: Represents the status of a crawl task.
|
||||||
|
* Members:
|
||||||
|
* `QUEUED`: Task is waiting to be processed.
|
||||||
|
* `IN_PROGRESS`: Task is currently being processed.
|
||||||
|
* `COMPLETED`: Task finished successfully.
|
||||||
|
* `FAILED`: Task failed.
|
||||||
|
|
||||||
|
## 11. Versioning
|
||||||
|
* 11.1. Accessing Library Version:
|
||||||
|
* The current version of the `crawl4ai` library can be accessed programmatically via the `__version__` attribute of the top-level `crawl4ai` package.
|
||||||
|
* Example:
|
||||||
|
```python
|
||||||
|
from crawl4ai import __version__ as crawl4ai_version
|
||||||
|
print(f"Crawl4AI Version: {crawl4ai_version}")
|
||||||
|
# Expected output based on provided code: Crawl4AI Version: 0.6.3
|
||||||
|
```
|
||||||
|
|
||||||
|
## 12. Basic Usage Examples
|
||||||
|
|
||||||
|
* 12.1. Minimal Crawl:
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="http://example.com")
|
||||||
|
if result.success:
|
||||||
|
print("Markdown (first 300 chars):")
|
||||||
|
print(result.markdown.raw_markdown[:300]) # Accessing raw_markdown
|
||||||
|
else:
|
||||||
|
print(f"Error: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
|
```
|
||||||
|
|
||||||
|
* 12.2. Crawl with Basic Configuration:
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
browser_cfg = BrowserConfig(headless=True, browser_type="firefox")
|
||||||
|
run_cfg = CrawlerRunConfig(
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
word_count_threshold=50
|
||||||
|
)
|
||||||
|
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||||
|
result = await crawler.arun(url="http://example.com", config=run_cfg)
|
||||||
|
if result.success:
|
||||||
|
print(f"Status Code: {result.status_code}")
|
||||||
|
print(f"Cleaned HTML length: {len(result.cleaned_html)}")
|
||||||
|
else:
|
||||||
|
print(f"Error: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
|
```
|
||||||
|
|
||||||
|
* 12.3. Accessing Links and Images from Result:
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="http://example.com")
|
||||||
|
if result.success:
|
||||||
|
print(f"Found {len(result.links.internal)} internal links.")
|
||||||
|
if result.links.internal:
|
||||||
|
print(f"First internal link: {result.links.internal[0].href}")
|
||||||
|
|
||||||
|
print(f"Found {len(result.media.images)} images.")
|
||||||
|
if result.media.images:
|
||||||
|
print(f"First image src: {result.media.images[0].src}")
|
||||||
|
else:
|
||||||
|
print(f"Error: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
|
```
|
||||||
|
```
|
||||||
1539
docs/md_v2/assets/llmtxt/crawl4ai_core_reasoning_content.llm.txt
Normal file
1539
docs/md_v2/assets/llmtxt/crawl4ai_core_reasoning_content.llm.txt
Normal file
File diff suppressed because it is too large
Load Diff
5460
docs/md_v2/assets/llmtxt/crawl4ai_deep_crawling.llm.full.txt
Normal file
5460
docs/md_v2/assets/llmtxt/crawl4ai_deep_crawling.llm.full.txt
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,813 @@
|
|||||||
|
The following is the detailed "Foundational Memory" document for the `deep_crawling` component of `crawl4ai`, generated from the preceding outline and the provided code snippets.
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
# Detailed Foundational Memory for crawl4ai - deep_crawling Component
|
||||||
|
|
||||||
|
**Target Document Type:** memory
|
||||||
|
**Target Output Filename Suggestion:** `llm_memory_deep_crawling.md`
|
||||||
|
**Library Version Context:** 0.6.3 (from `crawl4ai/__version__.py`)
|
||||||
|
**Outline Generation Date:** 2024-05-24
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Introduction to Deep Crawling
|
||||||
|
|
||||||
|
* 1.1. Purpose: The `deep_crawling` component provides functionalities for recursively crawling web pages starting from an initial URL. It includes strategies for different traversal orders (BFS, DFS, Best-First), mechanisms for filtering which URLs to visit, and methods for scoring URLs to prioritize crawling.
|
||||||
|
* 1.2. Core Concepts:
|
||||||
|
* 1.2.1. Definition of Deep Crawling in Crawl4ai context: The process of discovering and fetching multiple web pages by following links from an initial set of URLs, adhering to specified depth, page limits, and filtering/scoring rules.
|
||||||
|
* 1.2.2. Key Abstractions:
|
||||||
|
* `DeepCrawlStrategy`: Defines the algorithm for traversing linked web pages (e.g., BFS, DFS).
|
||||||
|
* `URLFilter`: Determines whether a discovered URL should be considered for crawling.
|
||||||
|
* `URLScorer`: Assigns a score to URLs to influence crawling priority, especially in strategies like Best-First.
|
||||||
|
|
||||||
|
## 2. `DeepCrawlStrategy` Interface and Implementations
|
||||||
|
|
||||||
|
* **2.1. `DeepCrawlStrategy` (Abstract Base Class)**
|
||||||
|
* Source: `crawl4ai/deep_crawling/base_strategy.py`
|
||||||
|
* 2.1.1. Purpose: Defines the abstract base class for all deep crawling strategies, outlining the core methods required for traversal logic, resource management, URL validation, and link discovery.
|
||||||
|
* 2.1.2. Key Abstract Methods:
|
||||||
|
* `async def _arun_batch(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig) -> List[CrawlResult]`:
|
||||||
|
* Description: Core logic for batch (non-streaming) deep crawling. Processes URLs level by level (or according to strategy) and returns all results once the crawl is complete or limits are met.
|
||||||
|
* `async def _arun_stream(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig) -> AsyncGenerator[CrawlResult, None]`:
|
||||||
|
* Description: Core logic for streaming deep crawling. Processes URLs and yields `CrawlResult` objects as they become available.
|
||||||
|
* `async def shutdown(self) -> None`:
|
||||||
|
* Description: Cleans up any resources used by the deep crawl strategy, such as signaling cancellation events.
|
||||||
|
* `async def can_process_url(self, url: str, depth: int) -> bool`:
|
||||||
|
* Description: Validates a given URL and current depth against configured filters and limits to decide if it should be processed.
|
||||||
|
* `async def link_discovery(self, result: CrawlResult, source_url: str, current_depth: int, visited: Set[str], next_level: List[tuple], depths: Dict[str, int]) -> None`:
|
||||||
|
* Description: Extracts links from a `CrawlResult`, validates them using `can_process_url`, optionally scores them, and appends valid URLs (and their parent references) to the `next_level` list. Updates the `depths` dictionary for newly discovered URLs.
|
||||||
|
* 2.1.3. Key Concrete Methods:
|
||||||
|
* `async def arun(self, start_url: str, crawler: AsyncWebCrawler, config: Optional[CrawlerRunConfig] = None) -> RunManyReturn`:
|
||||||
|
* Description: Main entry point for initiating a deep crawl. It checks if a `CrawlerRunConfig` is provided and then delegates to either `_arun_stream` or `_arun_batch` based on the `config.stream` flag.
|
||||||
|
* `def __call__(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig)`:
|
||||||
|
* Description: Makes the strategy instance callable, directly invoking the `arun` method.
|
||||||
|
* 2.1.4. Attributes:
|
||||||
|
* `_cancel_event (asyncio.Event)`: Event to signal cancellation of the crawl.
|
||||||
|
* `_pages_crawled (int)`: Counter for the number of pages successfully crawled.
|
||||||
|
|
||||||
|
* **2.2. `BFSDeepCrawlStrategy`**
|
||||||
|
* Source: `crawl4ai/deep_crawling/bfs_strategy.py`
|
||||||
|
* 2.2.1. Purpose: Implements a Breadth-First Search (BFS) deep crawling strategy, exploring all URLs at the current depth level before moving to the next.
|
||||||
|
* 2.2.2. Inheritance: `DeepCrawlStrategy`
|
||||||
|
* 2.2.3. Initialization (`__init__`)
|
||||||
|
* 2.2.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
max_depth: int,
|
||||||
|
filter_chain: FilterChain = FilterChain(),
|
||||||
|
url_scorer: Optional[URLScorer] = None,
|
||||||
|
include_external: bool = False,
|
||||||
|
score_threshold: float = -float('inf'),
|
||||||
|
max_pages: int = float('inf'),
|
||||||
|
logger: Optional[logging.Logger] = None,
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* 2.2.3.2. Parameters:
|
||||||
|
* `max_depth (int)`: Maximum depth to crawl relative to the `start_url`.
|
||||||
|
* `filter_chain (FilterChain, default: FilterChain())`: A `FilterChain` instance to apply to discovered URLs.
|
||||||
|
* `url_scorer (Optional[URLScorer], default: None)`: An optional `URLScorer` to score URLs. If provided, URLs below `score_threshold` are skipped, and for crawls exceeding `max_pages`, higher-scored URLs are prioritized.
|
||||||
|
* `include_external (bool, default: False)`: If `True`, allows crawling of URLs from external domains.
|
||||||
|
* `score_threshold (float, default: -float('inf'))`: Minimum score (if `url_scorer` is used) for a URL to be processed.
|
||||||
|
* `max_pages (int, default: float('inf'))`: Maximum total number of pages to crawl.
|
||||||
|
* `logger (Optional[logging.Logger], default: None)`: An optional logger instance. If `None`, a default logger is created.
|
||||||
|
* 2.2.4. Key Implemented Methods:
|
||||||
|
* `_arun_batch(...)`: Implements BFS traversal by processing URLs level by level. It collects all results from a level before discovering links for the next level. All results are returned as a list upon completion.
|
||||||
|
* `_arun_stream(...)`: Implements BFS traversal, yielding `CrawlResult` objects as soon as they are processed within a level. Link discovery for the next level happens after all URLs in the current level are processed and their results yielded.
|
||||||
|
* `can_process_url(...)`: Validates URL format, applies the `filter_chain`, and checks depth limits. For the start URL (depth 0), filtering is bypassed.
|
||||||
|
* `link_discovery(...)`: Extracts internal (and optionally external) links, normalizes them, checks against `visited` set and `can_process_url`. If a `url_scorer` is present and `max_pages` limit is a concern, it scores and sorts valid links, selecting the top ones within `remaining_capacity`.
|
||||||
|
* `shutdown(...)`: Sets an internal `_cancel_event` to signal graceful termination and records the end time in `stats`.
|
||||||
|
* 2.2.5. Key Attributes/Properties:
|
||||||
|
* `stats (TraversalStats)`: [Read-only] - Instance of `TraversalStats` tracking the progress and statistics of the crawl.
|
||||||
|
* `max_depth (int)`: Maximum crawl depth.
|
||||||
|
* `filter_chain (FilterChain)`: The filter chain used.
|
||||||
|
* `url_scorer (Optional[URLScorer])`: The URL scorer used.
|
||||||
|
* `include_external (bool)`: Flag for including external URLs.
|
||||||
|
* `score_threshold (float)`: URL score threshold.
|
||||||
|
* `max_pages (int)`: Maximum pages to crawl.
|
||||||
|
|
||||||
|
* **2.3. `DFSDeepCrawlStrategy`**
|
||||||
|
* Source: `crawl4ai/deep_crawling/dfs_strategy.py`
|
||||||
|
* 2.3.1. Purpose: Implements a Depth-First Search (DFS) deep crawling strategy, exploring as far as possible along each branch before backtracking.
|
||||||
|
* 2.3.2. Inheritance: `BFSDeepCrawlStrategy` (Note: Leverages much of the `BFSDeepCrawlStrategy`'s infrastructure but overrides traversal logic to use a stack.)
|
||||||
|
* 2.3.3. Initialization (`__init__`)
|
||||||
|
* 2.3.3.1. Signature: (Same as `BFSDeepCrawlStrategy`)
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
max_depth: int,
|
||||||
|
filter_chain: FilterChain = FilterChain(),
|
||||||
|
url_scorer: Optional[URLScorer] = None,
|
||||||
|
include_external: bool = False,
|
||||||
|
score_threshold: float = -float('inf'),
|
||||||
|
max_pages: int = float('inf'),
|
||||||
|
logger: Optional[logging.Logger] = None,
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* 2.3.3.2. Parameters: Same as `BFSDeepCrawlStrategy`.
|
||||||
|
* 2.3.4. Key Overridden/Implemented Methods:
|
||||||
|
* `_arun_batch(...)`: Implements DFS traversal using a LIFO stack. Processes one URL at a time, discovers its links, and adds them to the stack (typically in reverse order of discovery to maintain a natural DFS path). Collects all results in a list.
|
||||||
|
* `_arun_stream(...)`: Implements DFS traversal using a LIFO stack, yielding `CrawlResult` for each processed URL as it becomes available. Discovered links are added to the stack for subsequent processing.
|
||||||
|
|
||||||
|
* **2.4. `BestFirstCrawlingStrategy`**
|
||||||
|
* Source: `crawl4ai/deep_crawling/bff_strategy.py`
|
||||||
|
* 2.4.1. Purpose: Implements a Best-First Search deep crawling strategy, prioritizing URLs based on scores assigned by a `URLScorer`. It uses a priority queue to manage URLs to visit.
|
||||||
|
* 2.4.2. Inheritance: `DeepCrawlStrategy`
|
||||||
|
* 2.4.3. Initialization (`__init__`)
|
||||||
|
* 2.4.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
max_depth: int,
|
||||||
|
filter_chain: FilterChain = FilterChain(),
|
||||||
|
url_scorer: Optional[URLScorer] = None,
|
||||||
|
include_external: bool = False,
|
||||||
|
max_pages: int = float('inf'),
|
||||||
|
logger: Optional[logging.Logger] = None,
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* 2.4.3.2. Parameters:
|
||||||
|
* `max_depth (int)`: Maximum depth to crawl.
|
||||||
|
* `filter_chain (FilterChain`, default: `FilterChain()`)`: Chain of filters to apply.
|
||||||
|
* `url_scorer (Optional[URLScorer]`, default: `None`)`: Scorer to rank URLs. Crucial for this strategy; if not provided, URLs might effectively be processed in FIFO order (score 0).
|
||||||
|
* `include_external (bool`, default: `False`)`: Whether to include external links.
|
||||||
|
* `max_pages (int`, default: `float('inf')`)`: Maximum number of pages to crawl.
|
||||||
|
* `logger (Optional[logging.Logger]`, default: `None`)`: Logger instance.
|
||||||
|
* 2.4.4. Key Implemented Methods:
|
||||||
|
* `_arun_batch(...)`: Aggregates results from `_arun_best_first` into a list.
|
||||||
|
* `_arun_stream(...)`: Yields results from `_arun_best_first` as they are generated.
|
||||||
|
* `_arun_best_first(...)`: Core logic for best-first traversal. Uses an `asyncio.PriorityQueue` where items are `(score, depth, url, parent_url)`. URLs are processed in batches (default size 10) from the priority queue. Discovered links are scored and added to the queue.
|
||||||
|
* 2.4.5. Key Attributes/Properties:
|
||||||
|
* `stats (TraversalStats)`: [Read-only] - Traversal statistics object.
|
||||||
|
* `BATCH_SIZE (int)`: [Class constant, default: 10] - Number of URLs to process concurrently from the priority queue.
|
||||||
|
|
||||||
|
## 3. URL Filtering Mechanisms
|
||||||
|
|
||||||
|
* **3.1. `URLFilter` (Abstract Base Class)**
|
||||||
|
* Source: `crawl4ai/deep_crawling/filters.py`
|
||||||
|
* 3.1.1. Purpose: Defines the abstract base class for all URL filters, providing a common interface for deciding whether a URL should be processed.
|
||||||
|
* 3.1.2. Key Abstract Methods:
|
||||||
|
* `apply(self, url: str) -> bool`:
|
||||||
|
* Description: Abstract method that must be implemented by subclasses. It takes a URL string and returns `True` if the URL passes the filter (should be processed), and `False` otherwise.
|
||||||
|
* 3.1.3. Key Attributes/Properties:
|
||||||
|
* `name (str)`: [Read-only] - The name of the filter, typically the class name.
|
||||||
|
* `stats (FilterStats)`: [Read-only] - An instance of `FilterStats` to track how many URLs were processed, passed, and rejected by this filter.
|
||||||
|
* `logger (logging.Logger)`: [Read-only] - A logger instance specific to this filter, initialized lazily.
|
||||||
|
* 3.1.4. Key Concrete Methods:
|
||||||
|
* `_update_stats(self, passed: bool) -> None`: Updates the `stats` object (total, passed, rejected counts).
|
||||||
|
|
||||||
|
* **3.2. `FilterChain`**
|
||||||
|
* Source: `crawl4ai/deep_crawling/filters.py`
|
||||||
|
* 3.2.1. Purpose: Manages a sequence of `URLFilter` instances. A URL must pass all filters in the chain to be considered valid.
|
||||||
|
* 3.2.2. Initialization (`__init__`)
|
||||||
|
* 3.2.2.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(self, filters: List[URLFilter] = None):
|
||||||
|
```
|
||||||
|
* 3.2.2.2. Parameters:
|
||||||
|
* `filters (List[URLFilter]`, default: `None`)`: An optional list of `URLFilter` instances to initialize the chain with. If `None`, an empty chain is created.
|
||||||
|
* 3.2.3. Key Public Methods:
|
||||||
|
* `add_filter(self, filter_: URLFilter) -> FilterChain`:
|
||||||
|
* Description: Adds a new `URLFilter` instance to the end of the chain.
|
||||||
|
* Returns: `(FilterChain)` - The `FilterChain` instance itself, allowing for method chaining.
|
||||||
|
* `async def apply(self, url: str) -> bool`:
|
||||||
|
* Description: Applies each filter in the chain to the given URL. If any filter returns `False` (rejects the URL), this method immediately returns `False`. If all filters pass, it returns `True`. Handles both synchronous and asynchronous `apply` methods of individual filters.
|
||||||
|
* Returns: `(bool)` - `True` if the URL passes all filters, `False` otherwise.
|
||||||
|
* 3.2.4. Key Attributes/Properties:
|
||||||
|
* `filters (Tuple[URLFilter, ...])`: [Read-only] - An immutable tuple containing the `URLFilter` instances in the chain.
|
||||||
|
* `stats (FilterStats)`: [Read-only] - An instance of `FilterStats` tracking the aggregated statistics for the entire chain (total URLs processed, passed, and rejected by the chain as a whole).
|
||||||
|
|
||||||
|
* **3.3. `URLPatternFilter`**
|
||||||
|
* Source: `crawl4ai/deep_crawling/filters.py`
|
||||||
|
* 3.3.1. Purpose: Filters URLs based on whether they match a list of specified string patterns. Supports glob-style wildcards and regular expressions.
|
||||||
|
* 3.3.2. Inheritance: `URLFilter`
|
||||||
|
* 3.3.3. Initialization (`__init__`)
|
||||||
|
* 3.3.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
patterns: Union[str, Pattern, List[Union[str, Pattern]]],
|
||||||
|
use_glob: bool = True, # Deprecated, glob is always used for strings if not regex
|
||||||
|
reverse: bool = False,
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* 3.3.3.2. Parameters:
|
||||||
|
* `patterns (Union[str, Pattern, List[Union[str, Pattern]]])`: A single pattern string/compiled regex, or a list of such patterns. String patterns are treated as glob patterns by default unless they are identifiable as regex (e.g., start with `^`, end with `$`, contain `\d`).
|
||||||
|
* `use_glob (bool`, default: `True`)`: [Deprecated] This parameter's functionality is now implicitly handled by pattern detection.
|
||||||
|
* `reverse (bool`, default: `False`)`: If `True`, the filter rejects URLs that match any of the patterns. If `False` (default), it accepts URLs that match any pattern and rejects those that don't match any.
|
||||||
|
* 3.3.4. Key Implemented Methods:
|
||||||
|
* `apply(self, url: str) -> bool`:
|
||||||
|
* Description: Checks if the URL matches any of the configured patterns. Simple suffix/prefix/domain patterns are checked first for performance. For more complex patterns, it uses `fnmatch.translate` (for glob-like strings) or compiled regex objects. The outcome is affected by the `reverse` flag.
|
||||||
|
* 3.3.5. Internal Categorization:
|
||||||
|
* `PATTERN_TYPES`: A dictionary mapping pattern types (SUFFIX, PREFIX, DOMAIN, PATH, REGEX) to integer constants.
|
||||||
|
* `_simple_suffixes (Set[str])`: Stores simple suffix patterns (e.g., `.html`).
|
||||||
|
* `_simple_prefixes (Set[str])`: Stores simple prefix patterns (e.g., `/blog/`).
|
||||||
|
* `_domain_patterns (List[Pattern])`: Stores compiled regex for domain-specific patterns (e.g., `*.example.com`).
|
||||||
|
* `_path_patterns (List[Pattern])`: Stores compiled regex for more general path patterns.
|
||||||
|
|
||||||
|
* **3.4. `ContentTypeFilter`**
|
||||||
|
* Source: `crawl4ai/deep_crawling/filters.py`
|
||||||
|
* 3.4.1. Purpose: Filters URLs based on their expected content type, primarily by inferring it from the file extension in the URL.
|
||||||
|
* 3.4.2. Inheritance: `URLFilter`
|
||||||
|
* 3.4.3. Initialization (`__init__`)
|
||||||
|
* 3.4.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
allowed_types: Union[str, List[str]],
|
||||||
|
check_extension: bool = True,
|
||||||
|
ext_map: Dict[str, str] = _MIME_MAP, # _MIME_MAP is internal
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* 3.4.3.2. Parameters:
|
||||||
|
* `allowed_types (Union[str, List[str]])`: A single MIME type string (e.g., "text/html") or a list of allowed MIME types. Can also be partial types like "image/" to allow all image types.
|
||||||
|
* `check_extension (bool`, default: `True`)`: If `True` (default), the filter attempts to determine the content type by looking at the URL's file extension. If `False`, all URLs pass this filter (unless `allowed_types` is empty).
|
||||||
|
* `ext_map (Dict[str, str]`, default: `ContentTypeFilter._MIME_MAP`)`: A dictionary mapping file extensions to their corresponding MIME types. A comprehensive default map is provided.
|
||||||
|
* 3.4.4. Key Implemented Methods:
|
||||||
|
* `apply(self, url: str) -> bool`:
|
||||||
|
* Description: Extracts the file extension from the URL. If `check_extension` is `True` and an extension is found, it checks if the inferred MIME type (or the extension itself if MIME type is unknown) is among the `allowed_types`. If no extension is found, it typically allows the URL (assuming it might be an HTML page or similar).
|
||||||
|
* 3.4.5. Static Methods:
|
||||||
|
* `_extract_extension(url: str) -> str`: [Cached] Extracts the file extension from a URL path, handling query parameters and fragments.
|
||||||
|
* 3.4.6. Class Variables:
|
||||||
|
* `_MIME_MAP (Dict[str, str])`: A class-level dictionary mapping common file extensions to MIME types.
|
||||||
|
|
||||||
|
* **3.5. `DomainFilter`**
|
||||||
|
* Source: `crawl4ai/deep_crawling/filters.py`
|
||||||
|
* 3.5.1. Purpose: Filters URLs based on a whitelist of allowed domains or a blacklist of blocked domains. Supports subdomain matching.
|
||||||
|
* 3.5.2. Inheritance: `URLFilter`
|
||||||
|
* 3.5.3. Initialization (`__init__`)
|
||||||
|
* 3.5.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
allowed_domains: Union[str, List[str]] = None,
|
||||||
|
blocked_domains: Union[str, List[str]] = None,
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* 3.5.3.2. Parameters:
|
||||||
|
* `allowed_domains (Union[str, List[str]]`, default: `None`)`: A single domain string or a list of domain strings. If provided, only URLs whose domain (or a subdomain thereof) is in this list will pass.
|
||||||
|
* `blocked_domains (Union[str, List[str]]`, default: `None`)`: A single domain string or a list of domain strings. URLs whose domain (or a subdomain thereof) is in this list will be rejected.
|
||||||
|
* 3.5.4. Key Implemented Methods:
|
||||||
|
* `apply(self, url: str) -> bool`:
|
||||||
|
* Description: Extracts the domain from the URL. First, checks if the domain is in `_blocked_domains` (rejects if true). Then, if `_allowed_domains` is specified, checks if the domain is in that list (accepts if true). If `_allowed_domains` is not specified and the URL was not blocked, it passes.
|
||||||
|
* 3.5.5. Static Methods:
|
||||||
|
* `_normalize_domains(domains: Union[str, List[str]]) -> Set[str]`: Converts input domains to a set of lowercase strings.
|
||||||
|
* `_is_subdomain(domain: str, parent_domain: str) -> bool`: Checks if `domain` is a subdomain of (or equal to) `parent_domain`.
|
||||||
|
* `_extract_domain(url: str) -> str`: [Cached] Extracts the domain name from a URL.
|
||||||
|
|
||||||
|
* **3.6. `ContentRelevanceFilter`**
|
||||||
|
* Source: `crawl4ai/deep_crawling/filters.py`
|
||||||
|
* 3.6.1. Purpose: Filters URLs by fetching their `<head>` section, extracting text content (title, meta tags), and scoring its relevance against a given query using the BM25 algorithm.
|
||||||
|
* 3.6.2. Inheritance: `URLFilter`
|
||||||
|
* 3.6.3. Initialization (`__init__`)
|
||||||
|
* 3.6.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
query: str,
|
||||||
|
threshold: float,
|
||||||
|
k1: float = 1.2,
|
||||||
|
b: float = 0.75,
|
||||||
|
avgdl: int = 1000,
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* 3.6.3.2. Parameters:
|
||||||
|
* `query (str)`: The query string to assess relevance against.
|
||||||
|
* `threshold (float)`: The minimum BM25 score required for the URL to be considered relevant and pass the filter.
|
||||||
|
* `k1 (float`, default: `1.2`)`: BM25 k1 parameter (term frequency saturation).
|
||||||
|
* `b (float`, default: `0.75`)`: BM25 b parameter (length normalization).
|
||||||
|
* `avgdl (int`, default: `1000`)`: Assumed average document length for BM25 calculations (typically based on the head content).
|
||||||
|
* 3.6.4. Key Implemented Methods:
|
||||||
|
* `async def apply(self, url: str) -> bool`:
|
||||||
|
* Description: Asynchronously fetches the HTML `<head>` content of the URL using `HeadPeeker.peek_html`. Extracts title and meta description/keywords. Calculates the BM25 score of this combined text against the `query`. Returns `True` if the score is >= `threshold`.
|
||||||
|
* 3.6.5. Helper Methods:
|
||||||
|
* `_build_document(self, fields: Dict) -> str`: Constructs a weighted document string from title and meta tags.
|
||||||
|
* `_tokenize(self, text: str) -> List[str]`: Simple whitespace tokenizer.
|
||||||
|
* `_bm25(self, document: str) -> float`: Calculates the BM25 score.
|
||||||
|
|
||||||
|
* **3.7. `SEOFilter`**
|
||||||
|
* Source: `crawl4ai/deep_crawling/filters.py`
|
||||||
|
* 3.7.1. Purpose: Filters URLs by performing a quantitative SEO quality assessment based on the content of their `<head>` section (e.g., title length, meta description presence, canonical tags, robots meta tags, schema.org markup).
|
||||||
|
* 3.7.2. Inheritance: `URLFilter`
|
||||||
|
* 3.7.3. Initialization (`__init__`)
|
||||||
|
* 3.7.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
threshold: float = 0.65,
|
||||||
|
keywords: List[str] = None,
|
||||||
|
weights: Dict[str, float] = None,
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* 3.7.3.2. Parameters:
|
||||||
|
* `threshold (float`, default: `0.65`)`: The minimum aggregated SEO score (typically 0.0 to 1.0 range, though individual factor weights can exceed 1) required for the URL to pass.
|
||||||
|
* `keywords (List[str]`, default: `None`)`: A list of keywords to check for presence in the title.
|
||||||
|
* `weights (Dict[str, float]`, default: `None`)`: A dictionary to override default weights for various SEO factors (e.g., `{"title_length": 0.2, "canonical": 0.15}`).
|
||||||
|
* 3.7.4. Key Implemented Methods:
|
||||||
|
* `async def apply(self, url: str) -> bool`:
|
||||||
|
* Description: Asynchronously fetches the HTML `<head>` content. Calculates scores for individual SEO factors (title length, keyword presence, meta description, canonical tag, robots meta tag, schema.org presence, URL quality). Aggregates these scores using the defined `weights`. Returns `True` if the total score is >= `threshold`.
|
||||||
|
* 3.7.5. Helper Methods (Scoring Factors):
|
||||||
|
* `_score_title_length(self, title: str) -> float`
|
||||||
|
* `_score_keyword_presence(self, text: str) -> float`
|
||||||
|
* `_score_meta_description(self, desc: str) -> float`
|
||||||
|
* `_score_canonical(self, canonical: str, original: str) -> float`
|
||||||
|
* `_score_schema_org(self, html: str) -> float`
|
||||||
|
* `_score_url_quality(self, parsed_url) -> float`
|
||||||
|
* 3.7.6. Class Variables:
|
||||||
|
* `DEFAULT_WEIGHTS (Dict[str, float])`: Default weights for each SEO factor.
|
||||||
|
|
||||||
|
* **3.8. `FilterStats` Data Class**
|
||||||
|
* Source: `crawl4ai/deep_crawling/filters.py`
|
||||||
|
* 3.8.1. Purpose: A data class to track statistics for URL filtering operations, including total URLs processed, passed, and rejected.
|
||||||
|
* 3.8.2. Fields:
|
||||||
|
* `_counters (array.array)`: An array of unsigned integers storing counts for `[total, passed, rejected]`.
|
||||||
|
* 3.8.3. Properties:
|
||||||
|
* `total_urls (int)`: Returns the total number of URLs processed.
|
||||||
|
* `passed_urls (int)`: Returns the number of URLs that passed the filter.
|
||||||
|
* `rejected_urls (int)`: Returns the number of URLs that were rejected.
|
||||||
|
|
||||||
|
## 4. URL Scoring Mechanisms
|
||||||
|
|
||||||
|
* **4.1. `URLScorer` (Abstract Base Class)**
|
||||||
|
* Source: `crawl4ai/deep_crawling/scorers.py`
|
||||||
|
* 4.1.1. Purpose: Defines the abstract base class for all URL scorers. Scorers assign a numerical value to URLs, which can be used to prioritize crawling.
|
||||||
|
* 4.1.2. Key Abstract Methods:
|
||||||
|
* `_calculate_score(self, url: str) -> float`:
|
||||||
|
* Description: Abstract method to be implemented by subclasses. It takes a URL string and returns a raw numerical score.
|
||||||
|
* 4.1.3. Key Concrete Methods:
|
||||||
|
* `score(self, url: str) -> float`:
|
||||||
|
* Description: Calculates the final score for a URL by calling `_calculate_score` and multiplying the result by the scorer's `weight`. It also updates the internal `ScoringStats`.
|
||||||
|
* Returns: `(float)` - The weighted score.
|
||||||
|
* 4.1.4. Key Attributes/Properties:
|
||||||
|
* `weight (ctypes.c_float)`: [Read-write] - The weight assigned to this scorer. The raw score calculated by `_calculate_score` will be multiplied by this weight. Default is 1.0. Stored as `ctypes.c_float` for memory efficiency.
|
||||||
|
* `stats (ScoringStats)`: [Read-only] - An instance of `ScoringStats` that tracks statistics for this scorer (number of URLs scored, total score, min/max scores).
|
||||||
|
|
||||||
|
* **4.2. `KeywordRelevanceScorer`**
|
||||||
|
* Source: `crawl4ai/deep_crawling/scorers.py`
|
||||||
|
* 4.2.1. Purpose: Scores URLs based on the presence and frequency of specified keywords within the URL string itself.
|
||||||
|
* 4.2.2. Inheritance: `URLScorer`
|
||||||
|
* 4.2.3. Initialization (`__init__`)
|
||||||
|
* 4.2.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False):
|
||||||
|
```
|
||||||
|
* 4.2.3.2. Parameters:
|
||||||
|
* `keywords (List[str])`: A list of keyword strings to search for in the URL.
|
||||||
|
* `weight (float`, default: `1.0`)`: The weight to apply to the calculated score.
|
||||||
|
* `case_sensitive (bool`, default: `False`)`: If `True`, keyword matching is case-sensitive. Otherwise, both the URL and keywords are converted to lowercase for matching.
|
||||||
|
* 4.2.4. Key Implemented Methods:
|
||||||
|
* `_calculate_score(self, url: str) -> float`:
|
||||||
|
* Description: Counts how many of the provided `keywords` are present in the `url`. The score is the ratio of matched keywords to the total number of keywords (0.0 to 1.0).
|
||||||
|
* 4.2.5. Helper Methods:
|
||||||
|
* `_url_bytes(self, url: str) -> bytes`: [Cached] Converts URL to bytes, lowercasing if not case-sensitive.
|
||||||
|
|
||||||
|
* **4.3. `PathDepthScorer`**
|
||||||
|
* Source: `crawl4ai/deep_crawling/scorers.py`
|
||||||
|
* 4.3.1. Purpose: Scores URLs based on their path depth (number of segments in the URL path). It favors URLs closer to an `optimal_depth`.
|
||||||
|
* 4.3.2. Inheritance: `URLScorer`
|
||||||
|
* 4.3.3. Initialization (`__init__`)
|
||||||
|
* 4.3.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(self, optimal_depth: int = 3, weight: float = 1.0):
|
||||||
|
```
|
||||||
|
* 4.3.3.2. Parameters:
|
||||||
|
* `optimal_depth (int`, default: `3`)`: The path depth considered ideal. URLs at this depth get the highest score.
|
||||||
|
* `weight (float`, default: `1.0`)`: The weight to apply to the calculated score.
|
||||||
|
* 4.3.4. Key Implemented Methods:
|
||||||
|
* `_calculate_score(self, url: str) -> float`:
|
||||||
|
* Description: Calculates the path depth of the URL. The score is `1.0 / (1.0 + abs(depth - optimal_depth))`, meaning URLs at `optimal_depth` score 1.0, and scores decrease as depth deviates. Uses a lookup table for common small differences for speed.
|
||||||
|
* 4.3.5. Static Methods:
|
||||||
|
* `_quick_depth(path: str) -> int`: [Cached] Efficiently calculates path depth without full URL parsing.
|
||||||
|
|
||||||
|
* **4.4. `ContentTypeScorer`**
|
||||||
|
* Source: `crawl4ai/deep_crawling/scorers.py`
|
||||||
|
* 4.4.1. Purpose: Scores URLs based on their inferred content type, typically derived from the file extension.
|
||||||
|
* 4.4.2. Inheritance: `URLScorer`
|
||||||
|
* 4.4.3. Initialization (`__init__`)
|
||||||
|
* 4.4.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(self, type_weights: Dict[str, float], weight: float = 1.0):
|
||||||
|
```
|
||||||
|
* 4.4.3.2. Parameters:
|
||||||
|
* `type_weights (Dict[str, float])`: A dictionary mapping file extensions (e.g., "html", "pdf") or MIME type patterns (e.g., "text/html", "image/") to scores. Patterns ending with '$' are treated as exact extension matches.
|
||||||
|
* `weight (float`, default: `1.0`)`: The weight to apply to the calculated score.
|
||||||
|
* 4.4.4. Key Implemented Methods:
|
||||||
|
* `_calculate_score(self, url: str) -> float`:
|
||||||
|
* Description: Extracts the file extension from the URL. Looks up the score in `type_weights` first by exact extension match (if pattern ends with '$'), then by general extension. If no direct match, it might try matching broader MIME type categories if defined in `type_weights`. Returns 0.0 if no match found.
|
||||||
|
* 4.4.5. Static Methods:
|
||||||
|
* `_quick_extension(url: str) -> str`: [Cached] Efficiently extracts file extension.
|
||||||
|
|
||||||
|
* **4.5. `FreshnessScorer`**
|
||||||
|
* Source: `crawl4ai/deep_crawling/scorers.py`
|
||||||
|
* 4.5.1. Purpose: Scores URLs based on dates found within the URL string, giving higher scores to more recent dates.
|
||||||
|
* 4.5.2. Inheritance: `URLScorer`
|
||||||
|
* 4.5.3. Initialization (`__init__`)
|
||||||
|
* 4.5.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(self, weight: float = 1.0, current_year: int = datetime.date.today().year):  # default is evaluated dynamically
|
||||||
|
```
|
||||||
|
* 4.5.3.2. Parameters:
|
||||||
|
* `weight (float`, default: `1.0`)`: The weight to apply to the calculated score.
|
||||||
|
* `current_year (int`, default: `datetime.date.today().year`)`: The reference year to calculate freshness against.
|
||||||
|
* 4.5.4. Key Implemented Methods:
|
||||||
|
* `_calculate_score(self, url: str) -> float`:
|
||||||
|
* Description: Uses a regex to find year patterns (YYYY) in the URL. If multiple years are found, it uses the latest valid year. The score is higher for years closer to `current_year`, using a predefined lookup for small differences or a decay function for larger differences. If no year is found, a default score (0.5) is returned.
|
||||||
|
* 4.5.5. Helper Methods:
|
||||||
|
* `_extract_year(self, url: str) -> Optional[int]`: [Cached] Extracts the most recent valid year from the URL.
|
||||||
|
|
||||||
|
* **4.6. `DomainAuthorityScorer`**
|
||||||
|
* Source: `crawl4ai/deep_crawling/scorers.py`
|
||||||
|
* 4.6.1. Purpose: Scores URLs based on a predefined list of domain authority weights. This allows prioritizing or de-prioritizing URLs from specific domains.
|
||||||
|
* 4.6.2. Inheritance: `URLScorer`
|
||||||
|
* 4.6.3. Initialization (`__init__`)
|
||||||
|
* 4.6.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
domain_weights: Dict[str, float],
|
||||||
|
default_weight: float = 0.5,
|
||||||
|
weight: float = 1.0,
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* 4.6.3.2. Parameters:
|
||||||
|
* `domain_weights (Dict[str, float])`: A dictionary mapping domain names (e.g., "example.com") to their authority scores (typically between 0.0 and 1.0).
|
||||||
|
* `default_weight (float`, default: `0.5`)`: The score to assign to URLs whose domain is not found in `domain_weights`.
|
||||||
|
* `weight (float`, default: `1.0`)`: The overall weight to apply to the calculated score.
|
||||||
|
* 4.6.4. Key Implemented Methods:
|
||||||
|
* `_calculate_score(self, url: str) -> float`:
|
||||||
|
* Description: Extracts the domain from the URL. If the domain is in `_domain_weights`, its corresponding score is returned. Otherwise, `_default_weight` is returned. Prioritizes top domains for faster lookup.
|
||||||
|
* 4.6.5. Static Methods:
|
||||||
|
* `_extract_domain(url: str) -> str`: [Cached] Efficiently extracts the domain from a URL.
|
||||||
|
|
||||||
|
* **4.7. `CompositeScorer`**
|
||||||
|
* Source: `crawl4ai/deep_crawling/scorers.py`
|
||||||
|
* 4.7.1. Purpose: Combines the scores from multiple `URLScorer` instances. Each constituent scorer contributes its weighted score to the final composite score.
|
||||||
|
* 4.7.2. Inheritance: `URLScorer`
|
||||||
|
* 4.7.3. Initialization (`__init__`)
|
||||||
|
* 4.7.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(self, scorers: List[URLScorer], normalize: bool = True):
|
||||||
|
```
|
||||||
|
* 4.7.3.2. Parameters:
|
||||||
|
* `scorers (List[URLScorer])`: A list of `URLScorer` instances to be combined.
|
||||||
|
* `normalize (bool`, default: `True`)`: If `True`, the final composite score is normalized by dividing the sum of weighted scores by the number of scorers. This can help keep scores in a more consistent range.
|
||||||
|
* 4.7.4. Key Implemented Methods:
|
||||||
|
* `_calculate_score(self, url: str) -> float`:
|
||||||
|
* Description: Iterates through all scorers in its list, calls their `score(url)` method (which applies individual weights), and sums up these scores. If `normalize` is `True`, divides the total sum by the number of scorers.
|
||||||
|
* 4.7.5. Key Concrete Methods (overrides `URLScorer.score`):
|
||||||
|
* `score(self, url: str) -> float`:
|
||||||
|
* Description: Calculates the composite score and updates its own `ScoringStats`. Note: The individual scorers' stats are updated when their `score` methods are called internally.
|
||||||
|
|
||||||
|
* **4.8. `ScoringStats` Data Class**
|
||||||
|
* Source: `crawl4ai/deep_crawling/scorers.py`
|
||||||
|
* 4.8.1. Purpose: A data class to track statistics for URL scoring operations, including the number of URLs scored, total score, and min/max scores.
|
||||||
|
* 4.8.2. Fields:
|
||||||
|
* `_urls_scored (int)`: Count of URLs scored.
|
||||||
|
* `_total_score (float)`: Sum of all scores.
|
||||||
|
* `_min_score (Optional[float])`: Minimum score encountered.
|
||||||
|
* `_max_score (Optional[float])`: Maximum score encountered.
|
||||||
|
* 4.8.3. Key Methods:
|
||||||
|
* `update(self, score: float) -> None`: Updates the statistics with a new score.
|
||||||
|
* `get_average(self) -> float`: Calculates and returns the average score.
|
||||||
|
* `get_min(self) -> float`: Lazily initializes and returns the minimum score.
|
||||||
|
* `get_max(self) -> float`: Lazily initializes and returns the maximum score.
|
||||||
|
|
||||||
|
## 5. `DeepCrawlDecorator`
|
||||||
|
|
||||||
|
* Source: `crawl4ai/deep_crawling/base_strategy.py`
|
||||||
|
* 5.1. Purpose: A decorator class that transparently adds deep crawling functionality to the `AsyncWebCrawler.arun` method if a `deep_crawl_strategy` is specified in the `CrawlerRunConfig`.
|
||||||
|
* 5.2. Initialization (`__init__`)
|
||||||
|
* 5.2.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(self, crawler: AsyncWebCrawler):
|
||||||
|
```
|
||||||
|
* 5.2.2. Parameters:
|
||||||
|
* `crawler (AsyncWebCrawler)`: The `AsyncWebCrawler` instance whose `arun` method is to be decorated.
|
||||||
|
* 5.3. `__call__` Method
|
||||||
|
* 5.3.1. Signature:
|
||||||
|
```python
|
||||||
|
@wraps(original_arun)
|
||||||
|
async def wrapped_arun(url: str, config: CrawlerRunConfig = None, **kwargs):
|
||||||
|
```
|
||||||
|
* 5.3.2. Functionality: This method wraps the original `arun` method of the `AsyncWebCrawler`.
|
||||||
|
* It checks if `config` is provided, has a `deep_crawl_strategy` set, and if `DeepCrawlDecorator.deep_crawl_active` context variable is `False` (to prevent recursion).
|
||||||
|
* If these conditions are met:
|
||||||
|
* It sets `DeepCrawlDecorator.deep_crawl_active` to `True`.
|
||||||
|
* It calls the `arun` method of the specified `config.deep_crawl_strategy`.
|
||||||
|
* It handles potential streaming results from the strategy by wrapping them in an async generator.
|
||||||
|
* Finally, it resets `DeepCrawlDecorator.deep_crawl_active` to `False`.
|
||||||
|
* If the conditions are not met, it calls the original `arun` method of the crawler.
|
||||||
|
* 5.4. Class Variable:
|
||||||
|
* `deep_crawl_active (ContextVar)`:
|
||||||
|
* Purpose: A `contextvars.ContextVar` used as a flag to indicate if a deep crawl is currently in progress for the current asynchronous context. This prevents the decorator from re-triggering deep crawling if the strategy itself calls the crawler's `arun` or `arun_many` methods.
|
||||||
|
* Default Value: `False`.
|
||||||
|
|
||||||
|
## 6. `TraversalStats` Data Model
|
||||||
|
|
||||||
|
* Source: `crawl4ai/models.py`
|
||||||
|
* 6.1. Purpose: A data class for storing and tracking statistics related to a deep crawl traversal.
|
||||||
|
* 6.2. Fields:
|
||||||
|
* `start_time (datetime)`: The timestamp (Python `datetime` object) when the traversal process began. Default: `datetime.now()`.
|
||||||
|
* `end_time (Optional[datetime])`: The timestamp when the traversal process completed. Default: `None`.
|
||||||
|
* `urls_processed (int)`: The total number of URLs that were successfully fetched and processed. Default: `0`.
|
||||||
|
* `urls_failed (int)`: The total number of URLs that resulted in an error during fetching or processing. Default: `0`.
|
||||||
|
* `urls_skipped (int)`: The total number of URLs that were skipped (e.g., due to filters, already visited, or depth limits). Default: `0`.
|
||||||
|
* `total_depth_reached (int)`: The maximum depth reached from the start URL during the crawl. Default: `0`.
|
||||||
|
* `current_depth (int)`: The current depth level being processed by the crawler (can fluctuate during the crawl, especially for BFS). Default: `0`.
|
||||||
|
|
||||||
|
## 7. Configuration for Deep Crawling (`CrawlerRunConfig`)
|
||||||
|
|
||||||
|
* Source: `crawl4ai/async_configs.py`
|
||||||
|
* 7.1. Purpose: `CrawlerRunConfig` is the primary configuration object passed to `AsyncWebCrawler.arun()` and `AsyncWebCrawler.arun_many()`. It contains various settings that control the behavior of a single crawl run, including those specific to deep crawling.
|
||||||
|
* 7.2. Relevant Fields:
|
||||||
|
* `deep_crawl_strategy (Optional[DeepCrawlStrategy])`:
|
||||||
|
* Type: `Optional[DeepCrawlStrategy]` (where `DeepCrawlStrategy` is the ABC from `crawl4ai.deep_crawling.base_strategy`)
|
||||||
|
* Default: `None`
|
||||||
|
* Description: Specifies the deep crawling strategy instance (e.g., `BFSDeepCrawlStrategy`, `DFSDeepCrawlStrategy`, `BestFirstCrawlingStrategy`) to be used for the crawl. If `None`, deep crawling is disabled, and only the initial URL(s) will be processed.
|
||||||
|
* *Note: Parameters like `max_depth`, `max_pages`, `filter_chain`, `url_scorer`, `score_threshold`, and `include_external` are not direct attributes of `CrawlerRunConfig` for deep crawling. Instead, they are passed to the constructor of the chosen `DeepCrawlStrategy` instance, which is then assigned to `CrawlerRunConfig.deep_crawl_strategy`.*
|
||||||
|
|
||||||
|
## 8. Utility Functions
|
||||||
|
|
||||||
|
* **8.1. `normalize_url_for_deep_crawl(url: str, source_url: str) -> str`**
|
||||||
|
* Source: `crawl4ai/deep_crawling/utils.py` (or `crawl4ai/utils.py` if it's a general utility)
|
||||||
|
* 8.1.1. Purpose: Normalizes a URL found during deep crawling. This typically involves resolving relative URLs against the `source_url` to create absolute URLs and removing URL fragments (`#fragment`).
|
||||||
|
* 8.1.2. Signature: `def normalize_url_for_deep_crawl(url: str, source_url: str) -> str:`
|
||||||
|
* 8.1.3. Parameters:
|
||||||
|
* `url (str)`: The URL string to be normalized.
|
||||||
|
* `source_url (str)`: The URL of the page where the `url` was discovered. This is used as the base for resolving relative paths.
|
||||||
|
* 8.1.4. Returns: `(str)` - The normalized, absolute URL without fragments.
|
||||||
|
|
||||||
|
* **8.2. `efficient_normalize_url_for_deep_crawl(url: str, source_url: str) -> str`**
|
||||||
|
* Source: `crawl4ai/deep_crawling/utils.py` (or `crawl4ai/utils.py`)
|
||||||
|
* 8.2.1. Purpose: Provides a potentially more performant version of URL normalization specifically for deep crawling scenarios, likely employing optimizations to avoid repeated or complex parsing operations. (Note: Based on the provided code, this appears to be the same as `normalize_url_for_deep_crawl` if only one is present, or it might contain specific internal optimizations not exposed differently at the API level but used by strategies).
|
||||||
|
* 8.2.2. Signature: `def efficient_normalize_url_for_deep_crawl(url: str, source_url: str) -> str:`
|
||||||
|
* 8.2.3. Parameters:
|
||||||
|
* `url (str)`: The URL string to be normalized.
|
||||||
|
* `source_url (str)`: The URL of the page where the `url` was discovered.
|
||||||
|
* 8.2.4. Returns: `(str)` - The normalized, absolute URL, typically without fragments.
|
||||||
|
|
||||||
|
## 9. PDF Processing Integration (`crawl4ai.processors.pdf`)
|
||||||
|
* 9.1. Overview of PDF processing in Crawl4ai: While not directly part of the `deep_crawling` package, PDF processing components can be used in conjunction if a deep crawl discovers PDF URLs and they need to be processed. The `PDFCrawlerStrategy` can fetch PDFs, and `PDFContentScrapingStrategy` can extract content from them.
|
||||||
|
* **9.2. `PDFCrawlerStrategy`**
|
||||||
|
* Source: `crawl4ai/processors/pdf/__init__.py`
|
||||||
|
* 9.2.1. Purpose: An `AsyncCrawlerStrategy` designed to "crawl" PDF files. In practice, this usually means downloading the PDF content. It returns a minimal `AsyncCrawlResponse` that signals to a `ContentScrapingStrategy` (like `PDFContentScrapingStrategy`) that the content is a PDF.
|
||||||
|
* 9.2.2. Inheritance: `AsyncCrawlerStrategy`
|
||||||
|
* 9.2.3. Initialization (`__init__`)
|
||||||
|
* 9.2.3.1. Signature: `def __init__(self, logger: AsyncLogger = None):`
|
||||||
|
* 9.2.3.2. Parameters:
|
||||||
|
* `logger (AsyncLogger, default: None)`: An optional logger instance.
|
||||||
|
* 9.2.4. Key Methods:
|
||||||
|
* `async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse`:
|
||||||
|
* Description: For a PDF URL, this method typically signifies that the URL points to a PDF. It constructs an `AsyncCrawlResponse` with a `Content-Type` header of `application/pdf` and a placeholder HTML. The actual PDF processing (downloading and content extraction) is usually handled by a subsequent scraping strategy.
|
||||||
|
* **9.3. `PDFContentScrapingStrategy`**
|
||||||
|
* Source: `crawl4ai/processors/pdf/__init__.py`
|
||||||
|
* 9.3.1. Purpose: A `ContentScrapingStrategy` specialized in extracting text, images (optional), and metadata from PDF files. It uses a `PDFProcessorStrategy` (like `NaivePDFProcessorStrategy`) internally.
|
||||||
|
* 9.3.2. Inheritance: `ContentScrapingStrategy`
|
||||||
|
* 9.3.3. Initialization (`__init__`)
|
||||||
|
* 9.3.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(self,
|
||||||
|
save_images_locally: bool = False,
|
||||||
|
extract_images: bool = False,
|
||||||
|
image_save_dir: str = None,
|
||||||
|
batch_size: int = 4,
|
||||||
|
logger: AsyncLogger = None):
|
||||||
|
```
|
||||||
|
* 9.3.3.2. Parameters:
|
||||||
|
* `save_images_locally (bool, default: False)`: If `True`, extracted images will be saved to the local disk.
|
||||||
|
* `extract_images (bool, default: False)`: If `True`, attempts to extract images from the PDF.
|
||||||
|
* `image_save_dir (str, default: None)`: The directory where extracted images will be saved if `save_images_locally` is `True`.
|
||||||
|
* `batch_size (int, default: 4)`: The number of PDF pages to process in parallel batches (if the underlying processor supports it).
|
||||||
|
* `logger (AsyncLogger, default: None)`: An optional logger instance.
|
||||||
|
* 9.3.4. Key Methods:
|
||||||
|
* `scrape(self, url: str, html: str, **params) -> ScrapingResult`:
|
||||||
|
* Description: Takes the URL (which should point to a PDF or a local PDF path) and processes it. It downloads the PDF if it's a remote URL, then uses the internal `pdf_processor` to extract content. It formats the extracted text into basic HTML and collects image and link information.
|
||||||
|
* `async def ascrape(self, url: str, html: str, **kwargs) -> ScrapingResult`:
|
||||||
|
* Description: Asynchronous version of the `scrape` method, typically by running the synchronous `scrape` method in a separate thread.
|
||||||
|
* 9.3.5. Helper Methods:
|
||||||
|
* `_get_pdf_path(self, url: str) -> str`: Downloads a PDF from a URL to a temporary file if it's not a local path.
|
||||||
|
* **9.4. `NaivePDFProcessorStrategy`**
|
||||||
|
* Source: `crawl4ai/processors/pdf/processor.py`
|
||||||
|
* 9.4.1. Purpose: A concrete implementation of `PDFProcessorStrategy` that uses `PyPDF2` (or similar libraries if extended) to extract text, images, and metadata from PDF documents page by page or in batches.
|
||||||
|
* 9.4.2. Initialization (`__init__`)
|
||||||
|
* Signature: `def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True, save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4)`
|
||||||
|
* Parameters: [Details parameters for image extraction quality, saving, and batch processing size.]
|
||||||
|
* 9.4.3. Key Methods:
|
||||||
|
* `process(self, pdf_path: Path) -> PDFProcessResult`:
|
||||||
|
* Description: Processes a single PDF file sequentially, page by page. Extracts metadata, text, and optionally images from each page.
|
||||||
|
* `process_batch(self, pdf_path: Path) -> PDFProcessResult`:
|
||||||
|
* Description: Processes a PDF file by dividing its pages into batches and processing these batches in parallel using a thread pool, potentially speeding up extraction for large PDFs.
|
||||||
|
* 9.4.4. Helper Methods:
|
||||||
|
* `_process_page(self, page, image_dir: Optional[Path]) -> PDFPage`: Processes a single PDF page object.
|
||||||
|
* `_extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]`: Extracts images from a page.
|
||||||
|
* `_extract_links(self, page) -> List[str]`: Extracts hyperlinks from a page.
|
||||||
|
* `_extract_metadata(self, pdf_path: Path, reader=None) -> PDFMetadata`: Extracts metadata from the PDF.
|
||||||
|
* **9.5. PDF Data Models**
|
||||||
|
* Source: `crawl4ai/processors/pdf/processor.py`
|
||||||
|
* 9.5.1. `PDFMetadata`:
|
||||||
|
* Purpose: Stores metadata extracted from a PDF document.
|
||||||
|
* Fields:
|
||||||
|
* `title (Optional[str])`: The title of the PDF.
|
||||||
|
* `author (Optional[str])`: The author(s) of the PDF.
|
||||||
|
* `producer (Optional[str])`: The software used to produce the PDF.
|
||||||
|
* `created (Optional[datetime])`: The creation date of the PDF.
|
||||||
|
* `modified (Optional[datetime])`: The last modification date of the PDF.
|
||||||
|
* `pages (int)`: The total number of pages in the PDF. Default: `0`.
|
||||||
|
* `encrypted (bool)`: `True` if the PDF is encrypted, `False` otherwise. Default: `False`.
|
||||||
|
* `file_size (Optional[int])`: The size of the PDF file in bytes. Default: `None`.
|
||||||
|
* 9.5.2. `PDFPage`:
|
||||||
|
* Purpose: Stores content extracted from a single page of a PDF document.
|
||||||
|
* Fields:
|
||||||
|
* `page_number (int)`: The page number (1-indexed).
|
||||||
|
* `raw_text (str)`: The raw text extracted from the page. Default: `""`.
|
||||||
|
* `markdown (str)`: Markdown representation of the page content. Default: `""`.
|
||||||
|
* `html (str)`: Basic HTML representation of the page content. Default: `""`.
|
||||||
|
* `images (List[Dict])`: A list of dictionaries, each representing an extracted image with details like format, path/data, dimensions. Default: `[]`.
|
||||||
|
* `links (List[str])`: A list of hyperlink URLs found on the page. Default: `[]`.
|
||||||
|
* `layout (List[Dict])`: Information about the layout of text elements on the page (e.g., coordinates). Default: `[]`.
|
||||||
|
* 9.5.3. `PDFProcessResult`:
|
||||||
|
* Purpose: Encapsulates the results of processing a PDF document.
|
||||||
|
* Fields:
|
||||||
|
* `metadata (PDFMetadata)`: The metadata of the processed PDF.
|
||||||
|
* `pages (List[PDFPage])`: A list of `PDFPage` objects, one for each page processed.
|
||||||
|
* `processing_time (float)`: The time taken to process the PDF, in seconds. Default: `0.0`.
|
||||||
|
* `version (str)`: The version of the PDF processor. Default: `"1.1"`.
|
||||||
|
|
||||||
|
## 10. Version Information (`crawl4ai.__version__`)
|
||||||
|
* Source: `crawl4ai/__version__.py`
|
||||||
|
* 10.1. `__version__ (str)`: A string representing the current installed version of the `crawl4ai` library (e.g., "0.6.3").
|
||||||
|
|
||||||
|
## 11. Asynchronous Configuration (`crawl4ai.async_configs`)
|
||||||
|
* 11.1. Overview: The `crawl4ai.async_configs` module contains configuration classes used throughout the library, including those relevant for network requests like proxies (`ProxyConfig`) and general crawler/browser behavior.
|
||||||
|
* **11.2. `ProxyConfig`**
|
||||||
|
* Source: `crawl4ai/async_configs.py` (and `crawl4ai/proxy_strategy.py`)
|
||||||
|
* 11.2.1. Purpose: Represents the configuration for a single proxy server, including its address, port, and optional authentication credentials.
|
||||||
|
* 11.2.2. Initialization (`__init__`)
|
||||||
|
* 11.2.2.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
server: str,
|
||||||
|
username: Optional[str] = None,
|
||||||
|
password: Optional[str] = None,
|
||||||
|
ip: Optional[str] = None,
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* 11.2.2.2. Parameters:
|
||||||
|
* `server (str)`: The proxy server URL (e.g., "http://proxy.example.com:8080", "socks5://proxy.example.com:1080").
|
||||||
|
* `username (Optional[str], default: None)`: The username for proxy authentication, if required.
|
||||||
|
* `password (Optional[str], default: None)`: The password for proxy authentication, if required.
|
||||||
|
* `ip (Optional[str], default: None)`: Optionally, the specific IP address of the proxy server. If not provided, it's inferred from the `server` URL.
|
||||||
|
* 11.2.3. Key Static Methods:
|
||||||
|
* `from_string(proxy_str: str) -> ProxyConfig`:
|
||||||
|
* Description: Creates a `ProxyConfig` instance from a string representation. Expected format is "ip:port:username:password" or "ip:port".
|
||||||
|
* Returns: `(ProxyConfig)`
|
||||||
|
* `from_dict(proxy_dict: Dict) -> ProxyConfig`:
|
||||||
|
* Description: Creates a `ProxyConfig` instance from a dictionary.
|
||||||
|
* Returns: `(ProxyConfig)`
|
||||||
|
* `from_env(env_var: str = "PROXIES") -> List[ProxyConfig]`:
|
||||||
|
* Description: Loads a list of proxy configurations from a comma-separated string in an environment variable.
|
||||||
|
* Returns: `(List[ProxyConfig])`
|
||||||
|
* 11.2.4. Key Methods:
|
||||||
|
* `to_dict(self) -> Dict`: Converts the `ProxyConfig` instance to a dictionary.
|
||||||
|
* `clone(self, **kwargs) -> ProxyConfig`: Creates a copy of the instance, optionally updating attributes with `kwargs`.
|
||||||
|
|
||||||
|
* **11.3. `ProxyRotationStrategy` (ABC)**
|
||||||
|
* Source: `crawl4ai/proxy_strategy.py`
|
||||||
|
* 11.3.1. Purpose: Abstract base class defining the interface for proxy rotation strategies.
|
||||||
|
* 11.3.2. Key Abstract Methods:
|
||||||
|
* `async def get_next_proxy(self) -> Optional[ProxyConfig]`: Asynchronously gets the next `ProxyConfig` from the strategy.
|
||||||
|
* `def add_proxies(self, proxies: List[ProxyConfig])`: Adds a list of `ProxyConfig` objects to the strategy's pool.
|
||||||
|
* **11.4. `RoundRobinProxyStrategy`**
|
||||||
|
* Source: `crawl4ai/proxy_strategy.py`
|
||||||
|
* 11.4.1. Purpose: A simple proxy rotation strategy that cycles through a list of proxies in a round-robin fashion.
|
||||||
|
* 11.4.2. Inheritance: `ProxyRotationStrategy`
|
||||||
|
* 11.4.3. Initialization (`__init__`)
|
||||||
|
* 11.4.3.1. Signature: `def __init__(self, proxies: List[ProxyConfig] = None):`
|
||||||
|
* 11.4.3.2. Parameters:
|
||||||
|
* `proxies (List[ProxyConfig], default: None)`: An optional initial list of `ProxyConfig` objects.
|
||||||
|
* 11.4.4. Key Implemented Methods:
|
||||||
|
* `add_proxies(self, proxies: List[ProxyConfig])`: Adds new proxies to the internal list and reinitializes the cycle.
|
||||||
|
* `async def get_next_proxy(self) -> Optional[ProxyConfig]`: Returns the next proxy from the cycle. Returns `None` if no proxies are available.
|
||||||
|
|
||||||
|
## 12. HTML to Markdown Conversion (`crawl4ai.markdown_generation_strategy`)
|
||||||
|
* 12.1. `MarkdownGenerationStrategy` (ABC)
|
||||||
|
* Source: `crawl4ai/markdown_generation_strategy.py`
|
||||||
|
* 12.1.1. Purpose: Abstract base class defining the interface for strategies that convert HTML content to Markdown.
|
||||||
|
* 12.1.2. Key Abstract Methods:
|
||||||
|
* `generate_markdown(self, input_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult`:
|
||||||
|
* Description: Abstract method to convert the given `input_html` string into a `MarkdownGenerationResult` object.
|
||||||
|
* Parameters:
|
||||||
|
* `input_html (str)`: The HTML content to convert.
|
||||||
|
* `base_url (str, default: "")`: The base URL used for resolving relative links within the HTML.
|
||||||
|
* `html2text_options (Optional[Dict[str, Any]], default: None)`: Options to pass to the underlying HTML-to-text conversion library.
|
||||||
|
* `content_filter (Optional[RelevantContentFilter], default: None)`: An optional filter to apply to the HTML before Markdown conversion, potentially to extract only relevant parts.
|
||||||
|
* `citations (bool, default: True)`: If `True`, attempts to convert hyperlinks into Markdown citations with a reference list.
|
||||||
|
* `**kwargs`: Additional keyword arguments.
|
||||||
|
* Returns: `(MarkdownGenerationResult)`
|
||||||
|
* 12.2. `DefaultMarkdownGenerator`
|
||||||
|
* Source: `crawl4ai/markdown_generation_strategy.py`
|
||||||
|
* 12.2.1. Purpose: The default implementation of `MarkdownGenerationStrategy`. It uses the `CustomHTML2Text` class (an enhanced `html2text.HTML2Text`) for the primary conversion and can optionally apply a `RelevantContentFilter`.
|
||||||
|
* 12.2.2. Inheritance: `MarkdownGenerationStrategy`
|
||||||
|
* 12.2.3. Initialization (`__init__`)
|
||||||
|
* 12.2.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
content_filter: Optional[RelevantContentFilter] = None,
|
||||||
|
options: Optional[Dict[str, Any]] = None,
|
||||||
|
content_source: str = "cleaned_html", # "raw_html", "fit_html"
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* 12.2.3.2. Parameters:
|
||||||
|
* `content_filter (Optional[RelevantContentFilter], default: None)`: An instance of a content filter strategy (e.g., `BM25ContentFilter`, `PruningContentFilter`) to be applied to the `input_html` before Markdown conversion. If `None`, no pre-filtering is done.
|
||||||
|
* `options (Optional[Dict[str, Any]], default: None)`: A dictionary of options to configure the `CustomHTML2Text` converter (e.g., `{"body_width": 0, "ignore_links": False}`).
|
||||||
|
* `content_source (str, default: "cleaned_html")`: Specifies which HTML source to use for Markdown generation if multiple are available (e.g., from `CrawlResult`). Options: `"cleaned_html"` (default), `"raw_html"`, `"fit_html"`. This parameter is primarily used when the generator is part of a larger crawling pipeline.
|
||||||
|
* 12.2.4. Key Methods:
|
||||||
|
* `generate_markdown(self, input_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult`:
|
||||||
|
* Description: Converts HTML to Markdown. If a `content_filter` is provided (either at init or as an argument), it's applied first to get "fit_html". Then, `CustomHTML2Text` converts the chosen HTML (input_html or fit_html) to raw Markdown. If `citations` is True, links in the raw Markdown are converted to citation format.
|
||||||
|
* Returns: `(MarkdownGenerationResult)`
|
||||||
|
* `convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]`:
|
||||||
|
* Description: Parses Markdown text, identifies links, replaces them with citation markers (e.g., `[text]^(1)`), and generates a corresponding list of references.
|
||||||
|
* Returns: `(Tuple[str, str])` - A tuple containing the Markdown with citations and the Markdown string of references.
|
||||||
|
|
||||||
|
## 13. Content Filtering (`crawl4ai.content_filter_strategy`)
|
||||||
|
* 13.1. `RelevantContentFilter` (ABC)
|
||||||
|
* Source: `crawl4ai/content_filter_strategy.py`
|
||||||
|
* 13.1.1. Purpose: Abstract base class for strategies that filter HTML content to extract only the most relevant parts, typically before Markdown conversion or further processing.
|
||||||
|
* 13.1.2. Key Abstract Methods:
|
||||||
|
* `filter_content(self, html: str) -> List[str]`:
|
||||||
|
* Description: Abstract method that takes an HTML string and returns a list of strings, where each string is a chunk of HTML deemed relevant.
|
||||||
|
* 13.2. `BM25ContentFilter`
|
||||||
|
* Source: `crawl4ai/content_filter_strategy.py`
|
||||||
|
* 13.2.1. Purpose: Filters HTML content by extracting text chunks and scoring their relevance to a user query (or an inferred page query) using the BM25 algorithm.
|
||||||
|
* 13.2.2. Inheritance: `RelevantContentFilter`
|
||||||
|
* 13.2.3. Initialization (`__init__`)
|
||||||
|
* 13.2.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
user_query: Optional[str] = None,
|
||||||
|
bm25_threshold: float = 1.0,
|
||||||
|
language: str = "english",
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* 13.2.3.2. Parameters:
|
||||||
|
* `user_query (Optional[str], default: None)`: The query to compare content against. If `None`, the filter attempts to extract a query from the page's metadata.
|
||||||
|
* `bm25_threshold (float, default: 1.0)`: The minimum BM25 score for a text chunk to be considered relevant.
|
||||||
|
* `language (str, default: "english")`: The language used for stemming tokens.
|
||||||
|
* 13.2.4. Key Implemented Methods:
|
||||||
|
* `filter_content(self, html: str, min_word_threshold: int = None) -> List[str]`: Parses HTML, extracts text chunks (paragraphs, list items, etc.), scores them with BM25 against the query, and returns the HTML of chunks exceeding the threshold.
|
||||||
|
* 13.3. `PruningContentFilter`
|
||||||
|
* Source: `crawl4ai/content_filter_strategy.py`
|
||||||
|
* 13.3.1. Purpose: Filters HTML content by recursively pruning less relevant parts of the DOM tree based on a composite score (text density, link density, tag weights, etc.).
|
||||||
|
* 13.3.2. Inheritance: `RelevantContentFilter`
|
||||||
|
* 13.3.3. Initialization (`__init__`)
|
||||||
|
* 13.3.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
user_query: Optional[str] = None,
|
||||||
|
min_word_threshold: Optional[int] = None,
|
||||||
|
threshold_type: str = "fixed", # or "dynamic"
|
||||||
|
threshold: float = 0.48,
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* 13.3.3.2. Parameters:
|
||||||
|
* `user_query (Optional[str], default: None)`: [Not directly used by pruning logic but inherited].
|
||||||
|
* `min_word_threshold (Optional[int], default: None)`: Minimum word count for an element to be considered for scoring initially (default behavior might be more nuanced).
|
||||||
|
* `threshold_type (str, default: "fixed")`: Specifies how the `threshold` is applied. "fixed" uses the direct value. "dynamic" adjusts the threshold based on content characteristics.
|
||||||
|
* `threshold (float, default: 0.48)`: The score threshold for pruning. Elements below this score are removed.
|
||||||
|
* 13.3.4. Key Implemented Methods:
|
||||||
|
* `filter_content(self, html: str, min_word_threshold: int = None) -> List[str]`: Parses HTML, applies the pruning algorithm to the body, and returns the remaining significant HTML blocks as a list of strings.
|
||||||
|
* 13.4. `LLMContentFilter`
|
||||||
|
* Source: `crawl4ai/content_filter_strategy.py`
|
||||||
|
* 13.4.1. Purpose: Uses a Large Language Model (LLM) to determine the relevance of HTML content chunks based on a given instruction.
|
||||||
|
* 13.4.2. Inheritance: `RelevantContentFilter`
|
||||||
|
* 13.4.3. Initialization (`__init__`)
|
||||||
|
* 13.4.3.1. Signature:
|
||||||
|
```python
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
llm_config: Optional[LLMConfig] = None,
|
||||||
|
instruction: Optional[str] = None,
|
||||||
|
chunk_token_threshold: int = CHUNK_TOKEN_THRESHOLD, # Default from config
|
||||||
|
overlap_rate: float = OVERLAP_RATE, # Default from config
|
||||||
|
word_token_rate: float = WORD_TOKEN_RATE, # Default from config
|
||||||
|
verbose: bool = False,
|
||||||
|
logger: Optional[AsyncLogger] = None,
|
||||||
|
ignore_cache: bool = True
|
||||||
|
):
|
||||||
|
```
|
||||||
|
* 13.4.3.2. Parameters:
|
||||||
|
* `llm_config (Optional[LLMConfig])`: Configuration for the LLM (provider, API key, model, etc.).
|
||||||
|
* `instruction (Optional[str])`: The instruction given to the LLM to guide content filtering (e.g., "Extract only the main article content, excluding headers, footers, and ads.").
|
||||||
|
* `chunk_token_threshold (int)`: Maximum number of tokens per chunk sent to the LLM.
|
||||||
|
* `overlap_rate (float)`: Percentage of overlap between consecutive chunks.
|
||||||
|
* `word_token_rate (float)`: Estimated ratio of words to tokens, used for chunking.
|
||||||
|
* `verbose (bool, default: False)`: Enables verbose logging for LLM operations.
|
||||||
|
* `logger (Optional[AsyncLogger], default: None)`: Custom logger instance.
|
||||||
|
* `ignore_cache (bool, default: True)`: If `True`, bypasses any LLM response caching for this operation.
|
||||||
|
* 13.4.4. Key Implemented Methods:
|
||||||
|
* `filter_content(self, html: str, ignore_cache: bool = True) -> List[str]`:
|
||||||
|
* Description: Chunks the input HTML. For each chunk, it sends a request to the configured LLM with the chunk and the `instruction`. The LLM is expected to return the relevant part of the chunk. These relevant parts are then collected and returned.
|
||||||
|
```
|
||||||
File diff suppressed because it is too large
Load Diff
3837
docs/md_v2/assets/llmtxt/crawl4ai_deployment.llm.full.txt
Normal file
3837
docs/md_v2/assets/llmtxt/crawl4ai_deployment.llm.full.txt
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,537 @@
|
|||||||
|
```markdown
|
||||||
|
# Detailed Outline for crawl4ai - deployment Component
|
||||||
|
|
||||||
|
**Target Document Type:** memory
|
||||||
|
**Target Output Filename Suggestion:** `llm_memory_deployment.md`
|
||||||
|
**Library Version Context:** 0.6.0 (as per Dockerfile ARG `C4AI_VER` from provided `Dockerfile` content)
|
||||||
|
**Outline Generation Date:** 2025-05-24
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Introduction to Deployment
|
||||||
|
* 1.1. Purpose: This document provides a factual reference for installing the `crawl4ai` library and deploying its server component using Docker. It covers basic and advanced library installation, various Docker deployment methods, server configuration, and an overview of the API for interaction.
|
||||||
|
* 1.2. Scope:
|
||||||
|
* Installation of the `crawl4ai` Python library.
|
||||||
|
* Setup and diagnostic commands for the library.
|
||||||
|
* Deployment of the `crawl4ai` server using Docker, including pre-built images, Docker Compose, and manual builds.
|
||||||
|
* Explanation of Dockerfile parameters and server configuration via `config.yml`.
|
||||||
|
* Details of API interaction, including the Playground UI, Python SDK, and direct REST API calls.
|
||||||
|
* Overview of additional server API endpoints and Model Context Protocol (MCP) support.
|
||||||
|
* High-level understanding of the server's internal logic relevant to users.
|
||||||
|
* The library's version numbering scheme.
|
||||||
|
|
||||||
|
## 2. Library Installation
|
||||||
|
|
||||||
|
* 2.1. **Basic Library Installation**
|
||||||
|
* 2.1.1. Standard Installation
|
||||||
|
* Command: `pip install crawl4ai`
|
||||||
|
* Purpose: Installs the core `crawl4ai` library and its essential dependencies for performing web crawling and scraping tasks. This provides the fundamental `AsyncWebCrawler` and related configuration objects.
|
||||||
|
* 2.1.2. Post-Installation Setup
|
||||||
|
* Command: `crawl4ai-setup`
|
||||||
|
* Purpose:
|
||||||
|
* Initializes the user's home directory structure for Crawl4ai (e.g., `~/.crawl4ai/cache`).
|
||||||
|
* Installs or updates necessary Playwright browsers (Chromium is installed by default) required for browser-based crawling. The `crawl4ai-setup` script internally calls `playwright install --with-deps chromium`.
|
||||||
|
* Performs OS-level checks for common missing libraries that Playwright might depend on, providing guidance if issues are found.
|
||||||
|
* Creates a default `global.yml` configuration file if one doesn't exist.
|
||||||
|
* 2.1.3. Diagnostic Check
|
||||||
|
* Command: `crawl4ai-doctor`
|
||||||
|
* Purpose:
|
||||||
|
* Verifies Python version compatibility.
|
||||||
|
* Confirms Playwright installation and browser integrity by attempting a simple crawl of `https://crawl4ai.com`.
|
||||||
|
* Inspects essential environment variables and potential library conflicts that might affect Crawl4ai's operation.
|
||||||
|
* Provides diagnostic messages indicating success or failure of these checks, with suggestions for resolving common issues.
|
||||||
|
* 2.1.4. Verification Process
|
||||||
|
* Purpose: To confirm that the basic installation and setup were successful and Crawl4ai can perform a simple crawl.
|
||||||
|
* Script Example (as inferred from `crawl4ai-doctor` logic and typical usage):
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
headless=True,
|
||||||
|
browser_type="chromium",
|
||||||
|
ignore_https_errors=True,
|
||||||
|
light_mode=True,
|
||||||
|
viewport_width=1280,
|
||||||
|
viewport_height=720,
|
||||||
|
)
|
||||||
|
run_config = CrawlerRunConfig(
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
screenshot=True,
|
||||||
|
)
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
print("Testing crawling capabilities...")
|
||||||
|
result = await crawler.arun(url="https://crawl4ai.com", config=run_config)
|
||||||
|
if result and result.markdown:
|
||||||
|
print("✅ Crawling test passed!")
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
print("❌ Test failed: Failed to get content")
|
||||||
|
return False
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
|
```
|
||||||
|
* Expected Outcome: The script should print "✅ Crawling test passed!" and successfully output Markdown content from the crawled page.
|
||||||
|
|
||||||
|
* 2.2. **Advanced Library Installation (Optional Features)**
|
||||||
|
* 2.2.1. Installation of Optional Extras
|
||||||
|
* Purpose: To install additional dependencies required for specific advanced features of Crawl4ai, such as those involving machine learning models.
|
||||||
|
* Options (as defined in `pyproject.toml`):
|
||||||
|
* `pip install crawl4ai[pdf]`:
|
||||||
|
* Purpose: Installs `PyPDF2` for PDF processing capabilities.
|
||||||
|
* `pip install crawl4ai[torch]`:
|
||||||
|
* Purpose: Installs `torch`, `nltk`, and `scikit-learn`. Enables features relying on PyTorch models, such as some advanced text clustering or semantic analysis within extraction strategies.
|
||||||
|
* `pip install crawl4ai[transformer]`:
|
||||||
|
* Purpose: Installs `transformers` and `tokenizers`. Enables the use of Hugging Face Transformers models for tasks like summarization, question answering, or other advanced NLP features within Crawl4ai.
|
||||||
|
* `pip install crawl4ai[cosine]`:
|
||||||
|
* Purpose: Installs `torch`, `transformers`, and `nltk`. Specifically for features utilizing cosine similarity with embeddings (implies model usage).
|
||||||
|
* `pip install crawl4ai[sync]`:
|
||||||
|
* Purpose: Installs `selenium` for synchronous crawling capabilities (less common, as Crawl4ai primarily focuses on async).
|
||||||
|
* `pip install crawl4ai[all]`:
|
||||||
|
* Purpose: Installs all optional dependencies listed above (`PyPDF2`, `torch`, `nltk`, `scikit-learn`, `transformers`, `tokenizers`, `selenium`), providing the complete suite of Crawl4ai capabilities.
|
||||||
|
* 2.2.2. Model Pre-fetching
|
||||||
|
* Command: `crawl4ai-download-models` (maps to `crawl4ai.model_loader:main`)
|
||||||
|
* Purpose: Downloads and caches machine learning models (e.g., specific sentence transformers or classification models from Hugging Face) that are used by certain optional features, particularly those installed via `crawl4ai[transformer]` or `crawl4ai[cosine]`. This avoids runtime downloads and ensures models are available offline.
|
||||||
|
|
||||||
|
## 3. Docker Deployment (Server Mode)
|
||||||
|
|
||||||
|
* 3.1. **Prerequisites**
|
||||||
|
* 3.1.1. Docker: A working Docker installation. (Link: `https://docs.docker.com/get-docker/`)
|
||||||
|
* 3.1.2. Git: Required for cloning the `crawl4ai` repository if building locally or using Docker Compose from the repository. (Link: `https://git-scm.com/book/en/v2/Getting-Started-Installing-Git`)
|
||||||
|
* 3.1.3. RAM Requirements:
|
||||||
|
            * Minimum: 2GB for the basic server without intensive LLM tasks. The `Dockerfile` HEALTHCHECK indicates a warning if less than 2GB RAM is available.
|
||||||
|
* Recommended for LLM support: 4GB+ (as specified in `docker-compose.yml` limits).
|
||||||
|
* Shared Memory (`/dev/shm`): Recommended size is 1GB (`--shm-size=1g`) for optimal Chromium browser performance, as specified in `docker-compose.yml` and run commands.
|
||||||
|
* 3.2. **Installation Options**
|
||||||
|
* 3.2.1. **Using Pre-built Images from Docker Hub**
|
||||||
|
* 3.2.1.1. Image Source: `unclecode/crawl4ai:<tag>`
|
||||||
|
* Explanation of `<tag>`:
|
||||||
|
* `latest`: Points to the most recent stable release of Crawl4ai.
|
||||||
|
* Specific version tags (e.g., `0.6.0`, `0.5.1`): Correspond to specific library releases.
|
||||||
|
* Pre-release tags (e.g., `0.6.0-rc1`, `0.7.0-devN`): Development or release candidate versions for testing.
|
||||||
|
* 3.2.1.2. Pulling the Image
|
||||||
|
* Command: `docker pull unclecode/crawl4ai:<tag>` (e.g., `docker pull unclecode/crawl4ai:latest`)
|
||||||
|
* 3.2.1.3. Environment Setup (`.llm.env`)
|
||||||
|
* File Name: `.llm.env` (to be created by the user in the directory where `docker run` or `docker-compose` commands are executed).
|
||||||
|
* Purpose: To securely provide API keys for various LLM providers used by Crawl4ai for features like LLM-based extraction or Q&A.
|
||||||
|
* Example Content (based on `docker-compose.yml`):
|
||||||
|
```env
|
||||||
|
OPENAI_API_KEY=your_openai_api_key
|
||||||
|
DEEPSEEK_API_KEY=your_deepseek_api_key
|
||||||
|
ANTHROPIC_API_KEY=your_anthropic_api_key
|
||||||
|
GROQ_API_KEY=your_groq_api_key
|
||||||
|
TOGETHER_API_KEY=your_together_api_key
|
||||||
|
MISTRAL_API_KEY=your_mistral_api_key
|
||||||
|
GEMINI_API_TOKEN=your_gemini_api_token
|
||||||
|
```
|
||||||
|
* Creation: Users should create this file and populate it with their API keys. An example (`.llm.env.example`) might be provided in the repository.
|
||||||
|
* 3.2.1.4. Running the Container
|
||||||
|
* Basic Run (without LLM support):
|
||||||
|
* Command: `docker run -d -p 11235:11235 --shm-size=1g --name crawl4ai-server unclecode/crawl4ai:<tag>`
|
||||||
|
* Port Mapping: `-p 11235:11235` maps port 11235 on the host to port 11235 in the container (default server port).
|
||||||
|
* Shared Memory: `--shm-size=1g` allocates 1GB of shared memory for the browser.
|
||||||
|
* Run with LLM Support (mounting `.llm.env`):
|
||||||
|
* Command: `docker run -d -p 11235:11235 --env-file .llm.env --shm-size=1g --name crawl4ai-server unclecode/crawl4ai:<tag>`
|
||||||
|
* 3.2.1.5. Stopping the Container
|
||||||
|
* Command: `docker stop crawl4ai-server`
|
||||||
|
* Command (to remove): `docker rm crawl4ai-server`
|
||||||
|
* 3.2.1.6. Docker Hub Versioning:
|
||||||
|
* Docker image tags on Docker Hub (e.g., `unclecode/crawl4ai:0.6.0`) directly correspond to `crawl4ai` library releases. The `latest` tag usually points to the most recent stable release. Pre-release tags include suffixes like `-devN`, `-aN`, `-bN`, or `-rcN`.
|
||||||
|
|
||||||
|
* 3.2.2. **Using Docker Compose (`docker-compose.yml`)**
|
||||||
|
* 3.2.2.1. Cloning the Repository
|
||||||
|
* Command: `git clone https://github.com/unclecode/crawl4ai.git`
|
||||||
|
* Command: `cd crawl4ai`
|
||||||
|
* 3.2.2.2. Environment Setup (`.llm.env`)
|
||||||
|
* File Name: `.llm.env` (should be created in the root of the cloned `crawl4ai` repository).
|
||||||
|
* Purpose: Same as above, to provide LLM API keys.
|
||||||
|
* 3.2.2.3. Running Pre-built Images
|
||||||
|
* Command: `docker-compose up -d`
|
||||||
|
* Behavior: Uses the image specified in `docker-compose.yml` (e.g., `${IMAGE:-unclecode/crawl4ai}:${TAG:-latest}`).
|
||||||
|
* Overriding image tag: `TAG=0.6.0 docker-compose up -d` or `IMAGE=mycustom/crawl4ai TAG=mytag docker-compose up -d`.
|
||||||
|
* 3.2.2.4. Building Locally with Docker Compose
|
||||||
|
* Command: `docker-compose up -d --build`
|
||||||
|
* Build Arguments (passed from environment variables to `docker-compose.yml` which then passes to `Dockerfile`):
|
||||||
|
* `INSTALL_TYPE`: (e.g., `default`, `torch`, `all`)
|
||||||
|
* Purpose: To include optional Python dependencies during the Docker image build process.
|
||||||
|
* Example: `INSTALL_TYPE=all docker-compose up -d --build`
|
||||||
|
* `ENABLE_GPU`: (e.g., `true`, `false`)
|
||||||
|
* Purpose: To include GPU support (e.g., CUDA toolkits) in the Docker image if the build hardware and target runtime support it.
|
||||||
|
* Example: `ENABLE_GPU=true docker-compose up -d --build`
|
||||||
|
* 3.2.2.5. Stopping Docker Compose Services
|
||||||
|
* Command: `docker-compose down`
|
||||||
|
|
||||||
|
* 3.2.3. **Manual Local Build & Run**
|
||||||
|
* 3.2.3.1. Cloning the Repository: (As above)
|
||||||
|
* 3.2.3.2. Environment Setup (`.llm.env`): (As above)
|
||||||
|
* 3.2.3.3. Building with `docker buildx`
|
||||||
|
* Command Example:
|
||||||
|
```bash
|
||||||
|
docker buildx build --platform linux/amd64,linux/arm64 \
|
||||||
|
--build-arg C4AI_VER=0.6.0 \
|
||||||
|
--build-arg INSTALL_TYPE=all \
|
||||||
|
--build-arg ENABLE_GPU=false \
|
||||||
|
--build-arg USE_LOCAL=true \
|
||||||
|
-t my-crawl4ai-image:custom .
|
||||||
|
```
|
||||||
|
* Purpose of `docker buildx`: A Docker CLI plugin that extends the `docker build` command with full support for BuildKit builder capabilities, including multi-architecture builds.
|
||||||
|
* Explanation of `--platform`: Specifies the target platform(s) for the build (e.g., `linux/amd64`, `linux/arm64`).
|
||||||
|
* Explanation of `--build-arg`: Passes build-time variables defined in the `Dockerfile` (see section 3.3).
|
||||||
|
* 3.2.3.4. Running the Custom-Built Container
|
||||||
|
* Basic Run: `docker run -d -p 11235:11235 --shm-size=1g --name my-crawl4ai-server my-crawl4ai-image:custom`
|
||||||
|
* Run with LLM Support: `docker run -d -p 11235:11235 --env-file .llm.env --shm-size=1g --name my-crawl4ai-server my-crawl4ai-image:custom`
|
||||||
|
* 3.2.3.5. Stopping the Container: (As above)
|
||||||
|
|
||||||
|
* 3.3. **Dockerfile Parameters (`ARG` values)**
|
||||||
|
* 3.3.1. `C4AI_VER`: (Default: `0.6.0`)
|
||||||
|
* Role: Specifies the version of the `crawl4ai` library. Used for labeling the image and potentially for version-specific logic.
|
||||||
|
* 3.3.2. `APP_HOME`: (Default: `/app`)
|
||||||
|
* Role: Defines the working directory inside the Docker container where the application code and related files are stored and executed.
|
||||||
|
* 3.3.3. `GITHUB_REPO`: (Default: `https://github.com/unclecode/crawl4ai.git`)
|
||||||
|
* Role: The URL of the GitHub repository to clone if `USE_LOCAL` is set to `false`.
|
||||||
|
* 3.3.4. `GITHUB_BRANCH`: (Default: `main`)
|
||||||
|
* Role: The specific branch of the GitHub repository to clone if `USE_LOCAL` is `false`.
|
||||||
|
* 3.3.5. `USE_LOCAL`: (Default: `true`)
|
||||||
|
* Role: A boolean flag. If `true`, the `Dockerfile` installs `crawl4ai` from the local source code copied into `/tmp/project/` during the build context. If `false`, it clones the repository specified by `GITHUB_REPO` and `GITHUB_BRANCH`.
|
||||||
|
* 3.3.6. `PYTHON_VERSION`: (Default: `3.12`)
|
||||||
|
* Role: Specifies the Python version for the base image (e.g., `python:3.12-slim-bookworm`).
|
||||||
|
* 3.3.7. `INSTALL_TYPE`: (Default: `default`)
|
||||||
|
* Role: Controls which optional dependencies of `crawl4ai` are installed. Possible values: `default` (core), `pdf`, `torch`, `transformer`, `cosine`, `sync`, `all`.
|
||||||
|
* 3.3.8. `ENABLE_GPU`: (Default: `false`)
|
||||||
|
* Role: A boolean flag. If `true` and `TARGETARCH` is `amd64`, the `Dockerfile` attempts to install the NVIDIA CUDA toolkit for GPU acceleration.
|
||||||
|
* 3.3.9. `TARGETARCH`:
|
||||||
|
* Role: An automatic build argument provided by Docker, indicating the target architecture of the build (e.g., `amd64`, `arm64`). Used for conditional logic in the `Dockerfile`, such as installing platform-specific optimized libraries or CUDA for `amd64`.
|
||||||
|
|
||||||
|
* 3.4. **Server Configuration (`config.yml`)**
|
||||||
|
* 3.4.1. Location: The server loads its configuration from `/app/config.yml` inside the container by default. This path is relative to `APP_HOME`.
|
||||||
|
* 3.4.2. Structure Overview (based on `deploy/docker/config.yml`):
|
||||||
|
* `app`: General application settings.
|
||||||
|
* `title (str)`: API title (e.g., "Crawl4AI API").
|
||||||
|
* `version (str)`: API version (e.g., "1.0.0").
|
||||||
|
* `host (str)`: Host address for the server to bind to (e.g., "0.0.0.0").
|
||||||
|
* `port (int)`: Port for the server to listen on (e.g., 11234, though Docker usually maps to 11235).
|
||||||
|
* `reload (bool)`: Enable/disable auto-reload for development (default: `false`).
|
||||||
|
* `workers (int)`: Number of worker processes (default: 1).
|
||||||
|
* `timeout_keep_alive (int)`: Keep-alive timeout in seconds (default: 300).
|
||||||
|
* `llm`: Default LLM configuration.
|
||||||
|
* `provider (str)`: Default LLM provider string (e.g., "openai/gpt-4o-mini").
|
||||||
|
* `api_key_env (str)`: Environment variable name to read the API key from (e.g., "OPENAI_API_KEY").
|
||||||
|
* `api_key (Optional[str])`: Directly pass API key (overrides `api_key_env`).
|
||||||
|
* `redis`: Redis connection details.
|
||||||
|
* `host (str)`: Redis host (e.g., "localhost").
|
||||||
|
* `port (int)`: Redis port (e.g., 6379).
|
||||||
|
* `db (int)`: Redis database number (e.g., 0).
|
||||||
|
* `password (str)`: Redis password (default: "").
|
||||||
|
* `ssl (bool)`: Enable SSL for Redis connection (default: `false`).
|
||||||
|
* `ssl_cert_reqs (Optional[str])`: SSL certificate requirements (e.g., "none", "optional", "required").
|
||||||
|
* `ssl_ca_certs (Optional[str])`: Path to CA certificate file.
|
||||||
|
* `ssl_certfile (Optional[str])`: Path to SSL certificate file.
|
||||||
|
* `ssl_keyfile (Optional[str])`: Path to SSL key file.
|
||||||
|
* `rate_limiting`: Configuration for API rate limits.
|
||||||
|
* `enabled (bool)`: Enable/disable rate limiting (default: `true`).
|
||||||
|
* `default_limit (str)`: Default rate limit (e.g., "1000/minute").
|
||||||
|
* `trusted_proxies (List[str])`: List of trusted proxy IP addresses.
|
||||||
|
* `storage_uri (str)`: Storage URI for rate limit counters (e.g., "memory://", "redis://localhost:6379").
|
||||||
|
* `security`: Security-related settings.
|
||||||
|
* `enabled (bool)`: Master switch for security features (default: `false`).
|
||||||
|
* `jwt_enabled (bool)`: Enable/disable JWT authentication (default: `false`).
|
||||||
|
* `https_redirect (bool)`: Enable/disable HTTPS redirection (default: `false`).
|
||||||
|
* `trusted_hosts (List[str])`: List of allowed host headers (e.g., `["*"]` or specific domains).
|
||||||
|
* `headers (Dict[str, str])`: Default security headers to add to responses (e.g., `X-Content-Type-Options`, `Content-Security-Policy`).
|
||||||
|
* `crawler`: Default crawler behavior.
|
||||||
|
* `base_config (Dict[str, Any])`: Base parameters for `CrawlerRunConfig`.
|
||||||
|
* `simulate_user (bool)`: (default: `true`).
|
||||||
|
* `memory_threshold_percent (float)`: Memory usage threshold for adaptive dispatcher (default: `95.0`).
|
||||||
|
* `rate_limiter (Dict[str, Any])`: Configuration for the internal rate limiter for crawling.
|
||||||
|
* `enabled (bool)`: (default: `true`).
|
||||||
|
                * `base_delay (Tuple[float, float])`: Two-element min/max delay range in seconds (e.g., `[1.0, 2.0]`).
|
||||||
|
* `timeouts (Dict[str, float])`: Timeouts for different crawler operations.
|
||||||
|
* `stream_init (float)`: Timeout for stream initialization (default: `30.0`).
|
||||||
|
* `batch_process (float)`: Timeout for batch processing (default: `300.0`).
|
||||||
|
* `pool (Dict[str, Any])`: Browser pool settings.
|
||||||
|
* `max_pages (int)`: Max concurrent browser pages (default: `40`).
|
||||||
|
* `idle_ttl_sec (int)`: Time-to-live for idle crawlers in seconds (default: `1800`).
|
||||||
|
* `browser (Dict[str, Any])`: Default `BrowserConfig` parameters.
|
||||||
|
* `kwargs (Dict[str, Any])`: Keyword arguments for `BrowserConfig`.
|
||||||
|
* `headless (bool)`: (default: `true`).
|
||||||
|
* `text_mode (bool)`: (default: `true`).
|
||||||
|
* `extra_args (List[str])`: List of additional browser launch arguments (e.g., `"--no-sandbox"`).
|
||||||
|
* `logging`: Logging configuration.
|
||||||
|
* `level (str)`: Logging level (e.g., "INFO", "DEBUG").
|
||||||
|
* `format (str)`: Log message format string.
|
||||||
|
* `observability`: Observability settings.
|
||||||
|
* `prometheus (Dict[str, Any])`: Prometheus metrics configuration.
|
||||||
|
* `enabled (bool)`: (default: `true`).
|
||||||
|
* `endpoint (str)`: Metrics endpoint path (e.g., "/metrics").
|
||||||
|
* `health_check (Dict[str, str])`: Health check endpoint configuration.
|
||||||
|
* `endpoint (str)`: Health check endpoint path (e.g., "/health").
|
||||||
|
* 3.4.3. JWT Authentication
|
||||||
|
* Enabling: Set `security.enabled: true` and `security.jwt_enabled: true` in `config.yml`.
|
||||||
|
* Secret Key: Configured via `security.jwt_secret_key`. This value can be overridden by the environment variable `JWT_SECRET_KEY`.
|
||||||
|
* Algorithm: Configured via `security.jwt_algorithm` (default: `HS256`).
|
||||||
|
* Token Expiry: Configured via `security.jwt_expire_minutes` (default: `30`).
|
||||||
|
* Usage:
|
||||||
|
* 1. Client obtains a token by sending a POST request to the `/token` endpoint with an email in the request body (e.g., `{"email": "user@example.com"}`). The email domain might be validated if configured.
|
||||||
|
* 2. Client includes the received token in the `Authorization` header of subsequent requests to protected API endpoints: `Authorization: Bearer <your_jwt_token>`.
|
||||||
|
* 3.4.4. Customizing `config.yml`
|
||||||
|
* 3.4.4.1. Modifying Before Build:
|
||||||
|
* Method: Edit the `deploy/docker/config.yml` file within the cloned `crawl4ai` repository before building the Docker image. This new configuration will be baked into the image.
|
||||||
|
* 3.4.4.2. Runtime Mount:
|
||||||
|
* Method: Mount a custom `config.yml` file from the host machine to `/app/config.yml` (or the path specified by `APP_HOME`) inside the running Docker container.
|
||||||
|
* Example Command: `docker run -d -p 11235:11235 -v /path/on/host/my-config.yml:/app/config.yml --name crawl4ai-server unclecode/crawl4ai:latest`
|
||||||
|
* 3.4.5. Key Configuration Recommendations
|
||||||
|
* Security:
|
||||||
|
* Enable JWT (`security.jwt_enabled: true`) if the server is exposed to untrusted networks.
|
||||||
|
* Use a strong, unique `jwt_secret_key`.
|
||||||
|
* Configure `security.trusted_hosts` to a specific list of allowed hostnames instead of `["*"]` for production.
|
||||||
|
* If using a reverse proxy for SSL termination, ensure `https_redirect` is appropriately configured or disabled if the proxy handles it.
|
||||||
|
* Resource Management:
|
||||||
|
* Adjust `crawler.pool.max_pages` based on server resources to prevent overwhelming the system.
|
||||||
|
* Tune `crawler.pool.idle_ttl_sec` to balance resource usage and responsiveness for pooled browser instances.
|
||||||
|
* Monitoring:
|
||||||
|
* Keep `observability.prometheus.enabled: true` for production monitoring via the `/metrics` endpoint.
|
||||||
|
* Ensure the `/health` endpoint is accessible to health checking systems.
|
||||||
|
* Performance:
|
||||||
|
* Review and customize `crawler.browser.extra_args` for headless browser optimization (e.g., disabling GPU, sandbox if appropriate for your environment).
|
||||||
|
* Set reasonable `crawler.timeouts` to prevent long-stalled crawls.
|
||||||
|
|
||||||
|
* 3.5. **API Usage (Interacting with the Dockerized Server)**
|
||||||
|
* 3.5.1. **Playground Interface**
|
||||||
|
* Access URL: `http://localhost:11235/playground` (assuming default port mapping).
|
||||||
|
* Purpose: An interactive web UI (Swagger UI/OpenAPI) allowing users to explore API endpoints, view schemas, construct requests, and test API calls directly from their browser.
|
||||||
|
* 3.5.2. **Python SDK (`Crawl4aiDockerClient`)**
|
||||||
|
* Class Name: `Crawl4aiDockerClient`
|
||||||
|
* Location: (Typically imported as `from crawl4ai.docker_client import Crawl4aiDockerClient`) - Actual import might vary based on final library structure; refer to `docs/examples/docker_example.py` or `docs/examples/docker_python_sdk.py`.
|
||||||
|
* Initialization:
|
||||||
|
* Signature: `Crawl4aiDockerClient(base_url: str = "http://localhost:11235", api_token: Optional[str] = None, timeout: int = 300)`
|
||||||
|
* Parameters:
|
||||||
|
* `base_url (str)`: The base URL of the Crawl4ai server. Default: `"http://localhost:11235"`.
|
||||||
|
* `api_token (Optional[str])`: JWT token for authentication if enabled on the server. Default: `None`.
|
||||||
|
* `timeout (int)`: Default timeout in seconds for HTTP requests to the server. Default: `300`.
|
||||||
|
* Authentication (JWT):
|
||||||
|
* Method: Pass the `api_token` during client initialization. The token can be obtained from the server's `/token` endpoint or other authentication mechanisms.
|
||||||
|
* `crawl()` Method:
|
||||||
|
* Signature (Conceptual, based on typical SDK patterns and server capabilities): `async def crawl(self, urls: Union[str, List[str]], browser_config: Optional[Dict] = None, crawler_config: Optional[Dict] = None, stream: bool = False) -> Union[List[Dict], AsyncGenerator[Dict, None]]`
|
||||||
|
*Note: SDK might take `BrowserConfig` and `CrawlerRunConfig` objects directly, which it then serializes.*
|
||||||
|
* Key Parameters:
|
||||||
|
* `urls (Union[str, List[str]])`: A single URL string or a list of URL strings to crawl.
|
||||||
|
* `browser_config (Optional[Dict])`: A dictionary representing the `BrowserConfig` object, or a `BrowserConfig` instance itself.
|
||||||
|
* `crawler_config (Optional[Dict])`: A dictionary representing the `CrawlerRunConfig` object, or a `CrawlerRunConfig` instance itself.
|
||||||
|
* `stream (bool)`: If `True`, the method returns an async generator yielding individual `CrawlResult` dictionaries as they are processed by the server. If `False` (default), it returns a list containing all `CrawlResult` dictionaries after all URLs are processed.
|
||||||
|
* Return Type: `List[Dict]` (for `stream=False`) or `AsyncGenerator[Dict, None]` (for `stream=True`), where each `Dict` represents a `CrawlResult`.
|
||||||
|
* Streaming Behavior:
|
||||||
|
* `stream=True`: Allows processing of results incrementally, suitable for long crawl jobs or real-time data feeds.
|
||||||
|
* `stream=False`: Collects all results before returning, simpler for smaller batches.
|
||||||
|
* `get_schema()` Method:
|
||||||
|
* Signature: `async def get_schema(self) -> dict`
|
||||||
|
* Return Type: `dict`.
|
||||||
|
* Purpose: Fetches the JSON schemas for `BrowserConfig` and `CrawlerRunConfig` from the server's `/schema` endpoint. This helps in constructing valid configuration payloads.
|
||||||
|
* 3.5.3. **JSON Request Schema for Configurations**
|
||||||
|
* Structure: `{"type": "ClassName", "params": {...}}`
|
||||||
|
* Purpose: This structure is used by the server (and expected by the Python SDK internally) to deserialize JSON payloads back into Pydantic configuration objects like `BrowserConfig`, `CrawlerRunConfig`, and their nested strategy objects (e.g., `LLMExtractionStrategy`, `PruningContentFilter`). The `type` field specifies the Python class name, and `params` holds the keyword arguments for its constructor.
|
||||||
|
* Example (`BrowserConfig`):
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"type": "BrowserConfig",
|
||||||
|
"params": {
|
||||||
|
"headless": true,
|
||||||
|
"browser_type": "chromium",
|
||||||
|
"viewport_width": 1920,
|
||||||
|
"viewport_height": 1080
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
* Example (`CrawlerRunConfig` with a nested `LLMExtractionStrategy`):
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {
|
||||||
|
"cache_mode": {"type": "CacheMode", "params": "BYPASS"},
|
||||||
|
"screenshot": false,
|
||||||
|
"extraction_strategy": {
|
||||||
|
"type": "LLMExtractionStrategy",
|
||||||
|
"params": {
|
||||||
|
"llm_config": {
|
||||||
|
"type": "LLMConfig",
|
||||||
|
"params": {"provider": "openai/gpt-4o-mini"}
|
||||||
|
},
|
||||||
|
"instruction": "Extract the main title and summary."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
* 3.5.4. **REST API Examples**
|
||||||
|
* `/crawl` Endpoint:
|
||||||
|
* URL: `http://localhost:11235/crawl`
|
||||||
|
* HTTP Method: `POST`
|
||||||
|
* Payload Structure (`CrawlRequest` model from `deploy/docker/schemas.py`):
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"browser_config": { // JSON representation of BrowserConfig
|
||||||
|
"type": "BrowserConfig",
|
||||||
|
"params": {"headless": true}
|
||||||
|
},
|
||||||
|
"crawler_config": { // JSON representation of CrawlerRunConfig
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {"screenshot": true}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
* Response Structure: A JSON object, typically `{"success": true, "results": [CrawlResult, ...], "server_processing_time_s": float, ...}`.
|
||||||
|
* `/crawl/stream` Endpoint:
|
||||||
|
* URL: `http://localhost:11235/crawl/stream`
|
||||||
|
* HTTP Method: `POST`
|
||||||
|
* Payload Structure: Same as `/crawl` (`CrawlRequest` model).
|
||||||
|
* Response Structure: Newline Delimited JSON (NDJSON, `application/x-ndjson`). Each line is a JSON string representing a `CrawlResult` object.
|
||||||
|
* Headers: Includes `Content-Type: application/x-ndjson` and `X-Stream-Status: active` while streaming, and a final JSON object `{"status": "completed"}`.
|
||||||
|
|
||||||
|
* 3.6. **Additional API Endpoints (from `server.py`)**
|
||||||
|
* 3.6.1. `/html`
|
||||||
|
* Endpoint URL: `/html`
|
||||||
|
* HTTP Method: `POST`
|
||||||
|
* Purpose: Crawls the given URL, preprocesses its raw HTML content specifically for schema extraction purposes (e.g., by sanitizing and simplifying the structure), and returns the processed HTML.
|
||||||
|
* Request Body (`HTMLRequest` from `deploy/docker/schemas.py`):
|
||||||
|
* `url (str)`: The URL to fetch and process.
|
||||||
|
* Response Structure (JSON):
|
||||||
|
* `html (str)`: The preprocessed HTML string.
|
||||||
|
* `url (str)`: The original URL requested.
|
||||||
|
* `success (bool)`: Indicates if the operation was successful.
|
||||||
|
* 3.6.2. `/screenshot`
|
||||||
|
* Endpoint URL: `/screenshot`
|
||||||
|
* HTTP Method: `POST`
|
||||||
|
* Purpose: Captures a full-page PNG screenshot of the specified URL. Allows an optional delay before capture and an option to save the file server-side.
|
||||||
|
* Request Body (`ScreenshotRequest` from `deploy/docker/schemas.py`):
|
||||||
|
* `url (str)`: The URL to take a screenshot of.
|
||||||
|
* `screenshot_wait_for (Optional[float])`: Seconds to wait before taking the screenshot. Default: `2.0`.
|
||||||
|
* `output_path (Optional[str])`: If provided, the screenshot is saved to this path on the server, and the path is returned. Otherwise, the base64 encoded image is returned. Default: `None`.
|
||||||
|
* Response Structure (JSON):
|
||||||
|
* `success (bool)`: Indicates if the screenshot was successfully taken.
|
||||||
|
* `screenshot (Optional[str])`: Base64 encoded PNG image data, if `output_path` was not provided.
|
||||||
|
* `path (Optional[str])`: The absolute server-side path to the saved screenshot, if `output_path` was provided.
|
||||||
|
* 3.6.3. `/pdf`
|
||||||
|
* Endpoint URL: `/pdf`
|
||||||
|
* HTTP Method: `POST`
|
||||||
|
* Purpose: Generates a PDF document of the rendered content of the specified URL.
|
||||||
|
* Request Body (`PDFRequest` from `deploy/docker/schemas.py`):
|
||||||
|
* `url (str)`: The URL to convert to PDF.
|
||||||
|
* `output_path (Optional[str])`: If provided, the PDF is saved to this path on the server, and the path is returned. Otherwise, the base64 encoded PDF data is returned. Default: `None`.
|
||||||
|
* Response Structure (JSON):
|
||||||
|
* `success (bool)`: Indicates if the PDF generation was successful.
|
||||||
|
* `pdf (Optional[str])`: Base64 encoded PDF data, if `output_path` was not provided.
|
||||||
|
* `path (Optional[str])`: The absolute server-side path to the saved PDF, if `output_path` was provided.
|
||||||
|
* 3.6.4. `/execute_js`
|
||||||
|
* Endpoint URL: `/execute_js`
|
||||||
|
* HTTP Method: `POST`
|
||||||
|
* Purpose: Executes a list of JavaScript snippets on the specified URL in the browser context and returns the full `CrawlResult` object, including any modifications or data retrieved by the scripts.
|
||||||
|
* Request Body (`JSEndpointRequest` from `deploy/docker/schemas.py`):
|
||||||
|
* `url (str)`: The URL on which to execute the JavaScript.
|
||||||
|
* `scripts (List[str])`: A list of JavaScript code snippets to execute sequentially. Each script should be an expression that returns a value.
|
||||||
|
* Response Structure (JSON): A `CrawlResult` object (serialized to a dictionary) containing the state of the page after JS execution, including `js_execution_result`.
|
||||||
|
* 3.6.5. `/ask` (Endpoint defined as `/ask` in `server.py`)
|
||||||
|
* Endpoint URL: `/ask`
|
||||||
|
* HTTP Method: `GET`
|
||||||
|
* Purpose: Retrieves context about the Crawl4ai library itself, either code snippets or documentation sections, filtered by a query. This is designed for AI assistants or RAG systems needing information about Crawl4ai.
|
||||||
|
* Parameters (Query):
|
||||||
|
* `context_type (str, default="all", enum=["code", "doc", "all"])`: Specifies whether to return "code", "doc", or "all" (both).
|
||||||
|
* `query (Optional[str])`: A search query string used to filter relevant chunks using BM25 ranking. If `None`, returns all context of the specified type(s).
|
||||||
|
* `score_ratio (float, default=0.5, ge=0.0, le=1.0)`: The minimum score (as a fraction of the maximum possible score for the query) for a chunk to be included in the results.
|
||||||
|
* `max_results (int, default=20, ge=1)`: The maximum number of result chunks to return.
|
||||||
|
* Response Structure (JSON):
|
||||||
|
* If `query` is provided:
|
||||||
|
* `code_results (Optional[List[Dict[str, Union[str, float]]]])`: A list of dictionaries, where each dictionary contains `{"text": "code_chunk...", "score": bm25_score}`. Present if `context_type` is "code" or "all".
|
||||||
|
* `doc_results (Optional[List[Dict[str, Union[str, float]]]])`: A list of dictionaries, where each dictionary contains `{"text": "doc_chunk...", "score": bm25_score}`. Present if `context_type` is "doc" or "all".
|
||||||
|
* If `query` is not provided:
|
||||||
|
* `code_context (Optional[str])`: The full concatenated code context as a single string. Present if `context_type` is "code" or "all".
|
||||||
|
* `doc_context (Optional[str])`: The full concatenated documentation context as a single string. Present if `context_type` is "doc" or "all".
|
||||||
|
|
||||||
|
* 3.7. **MCP (Model Context Protocol) Support**
|
||||||
|
* 3.7.1. Explanation of MCP:
|
||||||
|
* Purpose: The Model Context Protocol (MCP) is a standardized way for AI models (like Anthropic's Claude with Code Interpreter capabilities) to discover and interact with external tools and data sources. Crawl4ai's MCP server exposes its functionalities as tools that an MCP-compatible AI can use.
|
||||||
|
* 3.7.2. Connection Endpoints (defined in `mcp_bridge.py` and attached to FastAPI app):
|
||||||
|
* `/mcp/sse`: Server-Sent Events (SSE) endpoint for MCP communication.
|
||||||
|
* `/mcp/ws`: WebSocket endpoint for MCP communication.
|
||||||
|
* `/mcp/messages`: Endpoint for clients to POST messages in the SSE transport.
|
||||||
|
* 3.7.3. Usage with Claude Code Example:
|
||||||
|
* Command: `claude mcp add -t sse c4ai-sse http://localhost:11235/mcp/sse`
|
||||||
|
* Purpose: This command (specific to the Claude Code CLI) registers the Crawl4ai MCP server as a tool provider named `c4ai-sse` using the SSE transport. The AI can then discover and invoke tools from this source.
|
||||||
|
* 3.7.4. List of Available MCP Tools (defined by `@mcp_tool` decorators in `server.py`):
|
||||||
|
* `md`: Fetches Markdown for a URL.
|
||||||
|
* Parameters (derived from `get_markdown` function signature): `url (str)`, `filter_type (FilterType)`, `query (Optional[str])`, `cache (Optional[str])`.
|
||||||
|
* `html`: Generates preprocessed HTML for a URL.
|
||||||
|
* Parameters (derived from `generate_html` function signature): `url (str)`.
|
||||||
|
* `screenshot`: Generates a screenshot of a URL.
|
||||||
|
* Parameters (derived from `generate_screenshot` function signature): `url (str)`, `screenshot_wait_for (Optional[float])`, `output_path (Optional[str])`.
|
||||||
|
* `pdf`: Generates a PDF of a URL.
|
||||||
|
* Parameters (derived from `generate_pdf` function signature): `url (str)`, `output_path (Optional[str])`.
|
||||||
|
* `execute_js`: Executes JavaScript on a URL.
|
||||||
|
* Parameters (derived from `execute_js` function signature): `url (str)`, `scripts (List[str])`.
|
||||||
|
* `crawl`: Performs a full crawl operation.
|
||||||
|
* Parameters (derived from `crawl` function signature): `urls (List[str])`, `browser_config (Optional[Dict])`, `crawler_config (Optional[Dict])`.
|
||||||
|
* `ask`: Retrieves library context.
|
||||||
|
* Parameters (derived from `get_context` function signature): `context_type (str)`, `query (Optional[str])`, `score_ratio (float)`, `max_results (int)`.
|
||||||
|
* 3.7.5. Testing MCP Connections:
|
||||||
|
* Method: Use an MCP client tool (e.g., `claude mcp call c4ai-sse.md url=https://example.com`) to invoke a tool and verify the response.
|
||||||
|
* 3.7.6. Accessing MCP Schemas:
|
||||||
|
* Endpoint URL: `/mcp/schema`
|
||||||
|
* Purpose: Returns a JSON response detailing all registered MCP tools, including their names, descriptions, and input schemas, enabling clients to understand how to use them.
|
||||||
|
|
||||||
|
* 3.8. **Metrics & Monitoring Endpoints**
|
||||||
|
* 3.8.1. `/health`
|
||||||
|
* Purpose: Provides a basic health check for the server, indicating if it's running and responsive.
|
||||||
|
* Response Structure (JSON from `server.py`): `{"status": "ok", "timestamp": float, "version": str}` (where version is `__version__` from `server.py`).
|
||||||
|
* Configuration: Path configurable via `observability.health_check.endpoint` in `config.yml`.
|
||||||
|
* 3.8.2. `/metrics`
|
||||||
|
* Purpose: Exposes application metrics in a format compatible with Prometheus for monitoring and alerting.
|
||||||
|
* Response Format: Prometheus text format.
|
||||||
|
* Configuration: Enabled via `observability.prometheus.enabled: true` and endpoint path via `observability.prometheus.endpoint` in `config.yml`.
|
||||||
|
|
||||||
|
* 3.9. **Underlying Server Logic (`server.py` - High-Level Understanding)**
|
||||||
|
* 3.9.1. FastAPI Application:
|
||||||
|
* Framework: The server is built using the FastAPI Python web framework for creating APIs.
|
||||||
|
* 3.9.2. `crawler_pool` (`CrawlerPool` from `deploy.docker.crawler_pool`):
|
||||||
|
* Role: Manages a pool of `AsyncWebCrawler` instances to reuse browser resources efficiently.
|
||||||
|
* `get_crawler(BrowserConfig)`: Fetches an existing idle crawler compatible with the `BrowserConfig` or creates a new one if none are available or compatible.
|
||||||
|
* `close_all()`: Iterates through all pooled crawlers and closes them.
|
||||||
|
* `janitor()`: An `asyncio.Task` that runs periodically to close and remove crawler instances that have been idle for longer than `crawler.pool.idle_ttl_sec` (configured in `config.yml`).
|
||||||
|
* 3.9.3. Global Page Semaphore (`GLOBAL_SEM`):
|
||||||
|
* Type: `asyncio.Semaphore`.
|
||||||
|
* Purpose: A global semaphore that limits the total number of concurrently open browser pages across all `AsyncWebCrawler` instances managed by the server. This acts as a hard cap to prevent excessive resource consumption.
|
||||||
|
* Configuration: The maximum number of concurrent pages is set by `crawler.pool.max_pages` in `config.yml` (default: `30` in `server.py`, but `40` in `config.yml`). The `AsyncWebCrawler.arun` method acquires this semaphore.
|
||||||
|
* 3.9.4. Job Router (`init_job_router` from `deploy.docker.job`):
|
||||||
|
* Role: Manages asynchronous, long-running tasks, particularly for the `/crawl` (non-streaming batch) endpoint.
|
||||||
|
* Mechanism: Uses Redis (configured in `config.yml`) as a backend for task queuing (storing task metadata like status, creation time, URL, result, error) and status tracking.
|
||||||
|
* User Interaction: When a job is submitted to an endpoint using this router (e.g., `/crawl/job`), a `task_id` is returned. The client then polls an endpoint like `/task/{task_id}` to get the status and eventual result or error.
|
||||||
|
* 3.9.5. Rate Limiting Middleware:
|
||||||
|
* Implementation: Uses the `slowapi` library, integrated with FastAPI.
|
||||||
|
* Purpose: To protect the server from abuse by limiting the number of requests an IP address can make within a specified time window.
|
||||||
|
* Configuration: Settings like `enabled`, `default_limit`, `storage_uri` (e.g., `memory://` or `redis://...`) are managed in the `rate_limiting` section of `config.yml`.
|
||||||
|
* 3.9.6. Security Middleware:
|
||||||
|
* Implementations: `HTTPSRedirectMiddleware` and `TrustedHostMiddleware` from FastAPI, plus custom logic for adding security headers.
|
||||||
|
* Purpose:
|
||||||
|
* `HTTPSRedirectMiddleware`: Redirects HTTP requests to HTTPS if `security.https_redirect` is true.
|
||||||
|
* `TrustedHostMiddleware`: Ensures requests are only served if their `Host` header matches an entry in `security.trusted_hosts`.
|
||||||
|
* Custom header logic: Adds HTTP security headers like `X-Content-Type-Options`, `X-Frame-Options`, `Content-Security-Policy`, `Strict-Transport-Security` to all responses if `security.enabled` is true. These are defined in `security.headers` in `config.yml`.
|
||||||
|
* 3.9.7. API Request Mapping:
|
||||||
|
* Request Models: Pydantic models defined in `deploy/docker/schemas.py` (e.g., `CrawlRequest`, `MarkdownRequest`, `HTMLRequest`, `ScreenshotRequest`, `PDFRequest`, `JSEndpointRequest`, `TokenRequest`, `RawCode`) define the expected JSON structure for incoming API request bodies.
|
||||||
|
* Endpoint Logic: Functions decorated with `@app.post(...)`, `@app.get(...)`, etc., in `server.py` handle incoming HTTP requests. These functions use FastAPI's dependency injection to parse and validate request bodies against the Pydantic models.
|
||||||
|
* `AsyncWebCrawler` Interaction:
|
||||||
|
* The parameters from the parsed request models (e.g., `CrawlRequest.urls`, `CrawlRequest.browser_config`, `CrawlRequest.crawler_config`) are used.
|
||||||
|
* `BrowserConfig` and `CrawlerRunConfig` objects are created by calling their respective `.load()` class methods with the dictionary payloads received in the request (e.g., `BrowserConfig.load(crawl_request.browser_config)`).
|
||||||
|
* These configuration objects are then passed to an `AsyncWebCrawler` instance obtained from the `crawler_pool`, typically to its `arun()` (for single URL or when JS execution context is critical) or `arun_many()` (for batch processing of multiple URLs) methods.
|
||||||
|
* Result Serialization: The `CrawlResult` objects (or lists/generators of them) returned by the `AsyncWebCrawler` are usually serialized to JSON using their `.model_dump()` method before being included in the HTTP response. For streaming endpoints, each `CrawlResult` is serialized and sent as a separate NDJSON line.
|
||||||
|
|
||||||
|
## 4. Version Numbering Scheme
|
||||||
|
|
||||||
|
* 4.1. **Standard Versioning (`MAJOR.MINOR.PATCH`)**
|
||||||
|
* `MAJOR`: Incremented when incompatible API changes are made.
|
||||||
|
* `MINOR`: Incremented when functionality is added in a backward-compatible manner.
|
||||||
|
* `PATCH`: Incremented for backward-compatible bug fixes.
|
||||||
|
* 4.2. **Pre-release Suffixes**
|
||||||
|
* `devN`: (e.g., `0.6.0.dev1`) Development release. These are typically unstable and used for internal testing or early feedback on new, unrefined features.
|
||||||
|
* `aN`: (e.g., `0.6.0a1`) Alpha release. Indicates an early preview of a new version, potentially unstable, and APIs might still change.
|
||||||
|
* `bN`: (e.g., `0.6.0b1`) Beta release. Generally feature-complete for the targeted minor or major version but may still contain bugs. APIs are mostly stable at this point.
|
||||||
|
* `rcN`: (e.g., `0.6.0rc1`) Release Candidate. A version that is potentially the final release, undergoing final testing to catch critical bugs before official release.
|
||||||
|
```
|
||||||
File diff suppressed because it is too large
Load Diff
5672
docs/md_v2/assets/llmtxt/crawl4ai_extraction.llm.full.txt
Normal file
5672
docs/md_v2/assets/llmtxt/crawl4ai_extraction.llm.full.txt
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,494 @@
|
|||||||
|
The following is the "Foundational Memory" document for the `extraction` component of `crawl4ai`, generated from the outline provided and the information processed from the codebase and existing documentation.
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
# Detailed Outline for crawl4ai - extraction Component
|
||||||
|
|
||||||
|
**Target Document Type:** memory
|
||||||
|
**Target Output Filename Suggestion:** `llm_memory_extraction.md`
|
||||||
|
**Library Version Context:** 0.6.3
|
||||||
|
**Outline Generation Date:** 2025-05-24
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Overview of Data Extraction in Crawl4ai
|
||||||
|
|
||||||
|
* 1.1. Purpose of the Extraction Component: The extraction component in Crawl4ai is responsible for parsing structured data from web content (HTML, text, Markdown) or PDF documents. It allows users to define how data should be identified and extracted, using various strategies ranging from rule-based (CSS, XPath, Regex) to LLM-powered approaches. Its goal is to transform raw crawled content into usable, structured information.
|
||||||
|
* 1.2. Core Concepts:
|
||||||
|
* 1.2.1. `ExtractionStrategy`: This is an abstract base class (interface) that defines the contract for all specific extraction methods. Each strategy implements how data is extracted from the provided content.
|
||||||
|
* 1.2.2. `ChunkingStrategy`: This is an abstract base class (interface) for strategies that preprocess content by splitting it into smaller, manageable chunks. This is particularly relevant for LLM-based extraction strategies that have token limits for their input.
|
||||||
|
* 1.2.3. Schemas: Schemas define the structure of the data to be extracted. For non-LLM strategies like `JsonCssExtractionStrategy` or `JsonXPathExtractionStrategy`, schemas are typically dictionary-based, specifying selectors and field types. For `LLMExtractionStrategy`, schemas can be Pydantic models or JSON schema dictionaries that guide the LLM in structuring its output.
|
||||||
|
* 1.2.4. `CrawlerRunConfig`: The `CrawlerRunConfig` object allows users to specify which `extraction_strategy` and `chunking_strategy` (if applicable) should be used for a particular crawl operation via its `arun()` method.
|
||||||
|
|
||||||
|
## 2. `ExtractionStrategy` Interface
|
||||||
|
|
||||||
|
* 2.1. Purpose: The `ExtractionStrategy` class, found in `crawl4ai.extraction_strategy`, serves as an abstract base class (ABC) defining the standard interface for all data extraction strategies within the Crawl4ai library. Implementations of this class provide specific methods for extracting structured data from content.
|
||||||
|
* 2.2. Key Abstract Methods:
|
||||||
|
* `extract(self, url: str, content: str, *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||||
|
* Description: Abstract method intended to extract meaningful blocks or chunks from the given content. Subclasses must implement this.
|
||||||
|
* Parameters:
|
||||||
|
* `url (str)`: The URL of the webpage.
|
||||||
|
* `content (str)`: The HTML, Markdown, or text content of the webpage.
|
||||||
|
* `*q`: Variable positional arguments.
|
||||||
|
* `**kwargs`: Variable keyword arguments.
|
||||||
|
* Returns: `List[Dict[str, Any]]` - A list of extracted blocks or chunks, typically as dictionaries.
|
||||||
|
* `run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||||
|
* Description: Abstract method to process sections of text, often in parallel by default implementations in subclasses. Subclasses must implement this.
|
||||||
|
* Parameters:
|
||||||
|
* `url (str)`: The URL of the webpage.
|
||||||
|
* `sections (List[str])`: List of sections (strings) to process.
|
||||||
|
* `*q`: Variable positional arguments.
|
||||||
|
* `**kwargs`: Variable keyword arguments.
|
||||||
|
* Returns: `List[Dict[str, Any]]` - A list of processed JSON blocks.
|
||||||
|
* 2.3. Input Format Property:
|
||||||
|
* `input_format (str)`: [Read-only] - An attribute indicating the expected input format for the content to be processed by the strategy (e.g., "markdown", "html", "fit_html", "text"). Default is "markdown".
|
||||||
|
|
||||||
|
## 3. Non-LLM Based Extraction Strategies
|
||||||
|
|
||||||
|
* ### 3.1. Class `NoExtractionStrategy`
|
||||||
|
* 3.1.1. Purpose: A baseline `ExtractionStrategy` that performs no actual data extraction. It returns the input content as is, typically useful for scenarios where only raw or cleaned HTML/Markdown is needed without further structuring.
|
||||||
|
* 3.1.2. Inheritance: `ExtractionStrategy`
|
||||||
|
* 3.1.3. Initialization (`__init__`):
|
||||||
|
* 3.1.3.1. Signature: `NoExtractionStrategy(**kwargs)`
|
||||||
|
* 3.1.3.2. Parameters:
|
||||||
|
* `**kwargs`: Passed to the base `ExtractionStrategy` initializer.
|
||||||
|
* 3.1.4. Key Public Methods:
|
||||||
|
* `extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||||
|
* Description: Returns the provided `html` content wrapped in a list containing a single dictionary: `[{"index": 0, "content": html}]`.
|
||||||
|
* `run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||||
|
* Description: Returns a list where each input section is wrapped in a dictionary: `[{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]`.
|
||||||
|
|
||||||
|
* ### 3.2. Class `JsonCssExtractionStrategy`
|
||||||
|
* 3.2.1. Purpose: Extracts structured data from HTML content using a JSON schema that defines CSS selectors to locate and extract data for specified fields. It uses BeautifulSoup4 for parsing and selection.
|
||||||
|
* 3.2.2. Inheritance: `JsonElementExtractionStrategy` (which inherits from `ExtractionStrategy`)
|
||||||
|
* 3.2.3. Initialization (`__init__`):
|
||||||
|
* 3.2.3.1. Signature: `JsonCssExtractionStrategy(schema: Dict[str, Any], **kwargs)`
|
||||||
|
* 3.2.3.2. Parameters:
|
||||||
|
* `schema (Dict[str, Any])`: The JSON schema defining extraction rules.
|
||||||
|
* `**kwargs`: Passed to the base class initializer. Includes `input_format` (default: "html").
|
||||||
|
* 3.2.4. Schema Definition for `JsonCssExtractionStrategy`:
|
||||||
|
* 3.2.4.1. `name (str)`: A descriptive name for the schema (e.g., "ProductDetails").
|
||||||
|
* 3.2.4.2. `baseSelector (str)`: The primary CSS selector that identifies each root element representing an item to be extracted (e.g., "div.product-item").
|
||||||
|
* 3.2.4.3. `fields (List[Dict[str, Any]])`: A list of dictionaries, each defining a field to be extracted from within each `baseSelector` element.
|
||||||
|
* Each field dictionary:
|
||||||
|
* `name (str)`: The key for this field in the output JSON object.
|
||||||
|
* `selector (str)`: The CSS selector for this field, relative to its parent element (either the `baseSelector` or a parent "nested" field).
|
||||||
|
* `type (str)`: Specifies how to extract the data. Common values:
|
||||||
|
* `"text"`: Extracts the text content of the selected element.
|
||||||
|
* `"attribute"`: Extracts the value of a specified HTML attribute.
|
||||||
|
* `"html"`: Extracts the raw inner HTML of the selected element.
|
||||||
|
* `"list"`: Extracts a list of items. The `fields` sub-key then defines the structure of each item in the list (if objects) or the `selector` directly targets list elements for primitive values.
|
||||||
|
* `"nested"`: Extracts a nested JSON object. The `fields` sub-key defines the structure of this nested object.
|
||||||
|
* `attribute (str, Optional)`: Required if `type` is "attribute". Specifies the name of the HTML attribute to extract (e.g., "href", "src").
|
||||||
|
* `fields (List[Dict[str, Any]], Optional)`: Required if `type` is "list" (for a list of objects) or "nested". Defines the structure of the nested object or list items.
|
||||||
|
* `transform (str, Optional)`: A string indicating a transformation to apply to the extracted value (e.g., "lowercase", "uppercase", "strip").
|
||||||
|
* `default (Any, Optional)`: A default value to use if the selector does not find an element or the attribute is missing.
|
||||||
|
* 3.2.5. Key Public Methods:
|
||||||
|
* `extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||||
|
* Description: Parses the `html_content` and applies the defined schema to extract structured data using CSS selectors.
|
||||||
|
* 3.2.6. Features:
|
||||||
|
* 3.2.6.1. Nested Extraction: Supports extracting complex, nested JSON objects by defining "nested" type fields within the schema.
|
||||||
|
* 3.2.6.2. List Handling: Supports extracting lists of primitive values (e.g., list of strings from multiple `<li>` tags) or lists of structured objects (e.g., a list of product details, each with its own fields).
|
||||||
|
|
||||||
|
* ### 3.3. Class `JsonXPathExtractionStrategy`
|
||||||
|
* 3.3.1. Purpose: Extracts structured data from HTML/XML content using a JSON schema that defines XPath expressions to locate and extract data. It uses `lxml` for parsing and XPath evaluation.
|
||||||
|
* 3.3.2. Inheritance: `JsonElementExtractionStrategy` (which inherits from `ExtractionStrategy`)
|
||||||
|
* 3.3.3. Initialization (`__init__`):
|
||||||
|
* 3.3.3.1. Signature: `JsonXPathExtractionStrategy(schema: Dict[str, Any], **kwargs)`
|
||||||
|
* 3.3.3.2. Parameters:
|
||||||
|
* `schema (Dict[str, Any])`: The JSON schema defining extraction rules, where selectors are XPath expressions.
|
||||||
|
* `**kwargs`: Passed to the base class initializer. Includes `input_format` (default: "html").
|
||||||
|
* 3.3.4. Schema Definition: The schema structure is identical to `JsonCssExtractionStrategy` (see 3.2.4), but the `baseSelector` and field `selector` values must be valid XPath expressions.
|
||||||
|
* 3.3.5. Key Public Methods:
|
||||||
|
* `extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||||
|
* Description: Parses the `html_content` using `lxml` and applies the defined schema to extract structured data using XPath expressions.
|
||||||
|
|
||||||
|
* ### 3.4. Class `JsonLxmlExtractionStrategy`
|
||||||
|
* 3.4.1. Purpose: Provides an alternative CSS selector-based extraction strategy leveraging the `lxml` library for parsing and selection, which can offer performance benefits over BeautifulSoup4 in some cases.
|
||||||
|
* 3.4.2. Inheritance: `JsonCssExtractionStrategy` (and thus `JsonElementExtractionStrategy`, `ExtractionStrategy`)
|
||||||
|
* 3.4.3. Initialization (`__init__`):
|
||||||
|
* 3.4.3.1. Signature: `JsonLxmlExtractionStrategy(schema: Dict[str, Any], **kwargs)`
|
||||||
|
* 3.4.3.2. Parameters:
|
||||||
|
* `schema (Dict[str, Any])`: The JSON schema defining extraction rules, using CSS selectors.
|
||||||
|
* `**kwargs`: Passed to the base class initializer. Includes `input_format` (default: "html").
|
||||||
|
* 3.4.4. Schema Definition: Identical to `JsonCssExtractionStrategy` (see 3.2.4).
|
||||||
|
* 3.4.5. Key Public Methods:
|
||||||
|
* `extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||||
|
* Description: Parses the `html_content` using `lxml` and applies the defined schema to extract structured data using lxml's CSS selector capabilities (which often translates CSS to XPath internally).
|
||||||
|
|
||||||
|
* ### 3.5. Class `RegexExtractionStrategy`
|
||||||
|
* 3.5.1. Purpose: Extracts data from text content (HTML, Markdown, or plain text) using a collection of regular expression patterns. Each match is returned as a structured dictionary.
|
||||||
|
* 3.5.2. Inheritance: `ExtractionStrategy`
|
||||||
|
* 3.5.3. Initialization (`__init__`):
|
||||||
|
* 3.5.3.1. Signature: `RegexExtractionStrategy(patterns: Union[Dict[str, str], List[Tuple[str, str]], "RegexExtractionStrategy._B"] = _B.NOTHING, input_format: str = "fit_html", **kwargs)`
|
||||||
|
* 3.5.3.2. Parameters:
|
||||||
|
* `patterns (Union[Dict[str, str], List[Tuple[str, str]], "_B"], default: _B.NOTHING)`:
|
||||||
|
* Description: Defines the regex patterns to use.
|
||||||
|
* Can be a dictionary mapping labels to regex strings (e.g., `{"email": r"..."}`).
|
||||||
|
* Can be a list of (label, regex_string) tuples.
|
||||||
|
* Can be a bitwise OR combination of `RegexExtractionStrategy._B` enum members for using built-in patterns (e.g., `RegexExtractionStrategy.Email | RegexExtractionStrategy.Url`).
|
||||||
|
* `input_format (str, default: "fit_html")`: Specifies the input format for the content. Options: "html" (raw HTML), "markdown" (Markdown from HTML), "text" (plain text from HTML), "fit_html" (content filtered for relevance before regex application).
|
||||||
|
* `**kwargs`: Passed to the base `ExtractionStrategy`.
|
||||||
|
* 3.5.4. Built-in Patterns (`RegexExtractionStrategy._B` Enum - an `IntFlag`):
|
||||||
|
* `EMAIL (auto())`: Matches email addresses. Example pattern: `r"[\\w.+-]+@[\\w-]+\\.[\\w.-]+"`
|
||||||
|
* `PHONE_INTL (auto())`: Matches international phone numbers. Example pattern: `r"\\+?\\d[\\d .()-]{7,}\\d"`
|
||||||
|
* `PHONE_US (auto())`: Matches US phone numbers. Example pattern: `r"\\(?\\d{3}\\)?[-. ]?\\d{3}[-. ]?\\d{4}"`
|
||||||
|
* `URL (auto())`: Matches URLs. Example pattern: `r"https?://[^\\s\\'\"<>]+"`
|
||||||
|
* `IPV4 (auto())`: Matches IPv4 addresses. Example pattern: `r"(?:\\d{1,3}\\.){3}\\d{1,3}"`
|
||||||
|
* `IPV6 (auto())`: Matches IPv6 addresses. Example pattern: `r"[A-F0-9]{1,4}(?::[A-F0-9]{1,4}){7}"`
|
||||||
|
* `UUID (auto())`: Matches UUIDs. Example pattern: `r"[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}"`
|
||||||
|
* `CURRENCY (auto())`: Matches currency amounts. Example pattern: `r"(?:USD|EUR|RM|\\$|€|¥|£)\\s?\\d+(?:[.,]\\d{2})?"`
|
||||||
|
* `PERCENTAGE (auto())`: Matches percentages. Example pattern: `r"\\d+(?:\\.\\d+)?%"`
|
||||||
|
* `NUMBER (auto())`: Matches numbers (integers, decimals). Example pattern: `r"\\b\\d{1,3}(?:[,.]?\\d{3})*(?:\\.\\d+)?\\b"`
|
||||||
|
* `DATE_ISO (auto())`: Matches ISO 8601 dates (YYYY-MM-DD). Example pattern: `r"\\d{4}-\\d{2}-\\d{2}"`
|
||||||
|
* `DATE_US (auto())`: Matches US-style dates (MM/DD/YYYY or MM/DD/YY). Example pattern: `r"\\d{1,2}/\\d{1,2}/\\d{2,4}"`
|
||||||
|
* `TIME_24H (auto())`: Matches 24-hour time formats (HH:MM or HH:MM:SS). Example pattern: `r"\\b(?:[01]?\\d|2[0-3]):[0-5]\\d(?:[:.][0-5]\\d)?\\b"`
|
||||||
|
* `POSTAL_US (auto())`: Matches US postal codes (ZIP codes). Example pattern: `r"\\b\\d{5}(?:-\\d{4})?\\b"`
|
||||||
|
* `POSTAL_UK (auto())`: Matches UK postal codes. Example pattern: `r"\\b[A-Z]{1,2}\\d[A-Z\\d]? ?\\d[A-Z]{2}\\b"`
|
||||||
|
* `HTML_COLOR_HEX (auto())`: Matches HTML hex color codes. Example pattern: `r"#[0-9A-Fa-f]{6}\\b"`
|
||||||
|
* `TWITTER_HANDLE (auto())`: Matches Twitter handles. Example pattern: `r"@[\\w]{1,15}"`
|
||||||
|
* `HASHTAG (auto())`: Matches hashtags. Example pattern: `r"#[\\w-]+"`
|
||||||
|
* `MAC_ADDR (auto())`: Matches MAC addresses. Example pattern: `r"(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}"`
|
||||||
|
* `IBAN (auto())`: Matches IBANs. Example pattern: `r"[A-Z]{2}\\d{2}[A-Z0-9]{11,30}"`
|
||||||
|
* `CREDIT_CARD (auto())`: Matches common credit card numbers. Example pattern: `r"\\b(?:4\\d{12}(?:\\d{3})?|5[1-5]\\d{14}|3[47]\\d{13}|6(?:011|5\\d{2})\\d{12})\\b"`
|
||||||
|
* `ALL (_B(-1).value & ~_B.NOTHING.value)`: Includes all built-in patterns except `NOTHING`.
|
||||||
|
* `NOTHING (_B(0).value)`: Includes no built-in patterns.
|
||||||
|
* 3.5.5. Key Public Methods:
|
||||||
|
* `extract(self, url: str, content: str, **kwargs) -> List[Dict[str, Any]]`:
|
||||||
|
* Description: Applies all configured regex patterns (built-in and custom) to the input `content`.
|
||||||
|
* Returns: `List[Dict[str, Any]]` - A list of dictionaries, where each dictionary represents a match and contains:
|
||||||
|
* `"url" (str)`: The source URL.
|
||||||
|
* `"label" (str)`: The label of the matching regex pattern.
|
||||||
|
* `"value" (str)`: The actual matched string.
|
||||||
|
* `"span" (Tuple[int, int])`: The start and end indices of the match within the content.
|
||||||
|
* 3.5.6. Static Method: `generate_pattern`
|
||||||
|
* 3.5.6.1. Signature: `staticmethod generate_pattern(label: str, html: str, query: Optional[str] = None, examples: Optional[List[str]] = None, llm_config: Optional[LLMConfig] = None, **kwargs) -> Dict[str, str]`
|
||||||
|
* 3.5.6.2. Purpose: Uses an LLM to automatically generate a Python-compatible regular expression pattern for a given label, based on sample HTML content, an optional natural language query describing the target, and/or examples of desired matches.
|
||||||
|
* 3.5.6.3. Parameters:
|
||||||
|
* `label (str)`: A descriptive label for the pattern to be generated (e.g., "product_price", "article_date").
|
||||||
|
* `html (str)`: The HTML content from which the pattern should be inferred.
|
||||||
|
* `query (Optional[str], default: None)`: A natural language description of what kind of data the regex should capture (e.g., "Extract the publication date", "Find all ISBN numbers").
|
||||||
|
* `examples (Optional[List[str]], default: None)`: A list of example strings that the generated regex should successfully match from the provided HTML.
|
||||||
|
* `llm_config (Optional[LLMConfig], default: None)`: Configuration for the LLM to be used. If `None`, uses default `LLMConfig`.
|
||||||
|
* `**kwargs`: Additional arguments passed to the LLM completion request (e.g., `temperature`, `max_tokens`).
|
||||||
|
* 3.5.6.4. Returns: `Dict[str, str]` - A dictionary containing the generated pattern, in the format `{label: "regex_pattern_string"}`.
|
||||||
|
|
||||||
|
## 4. LLM-Based Extraction Strategies
|
||||||
|
|
||||||
|
* ### 4.1. Class `LLMExtractionStrategy`
|
||||||
|
* 4.1.1. Purpose: Employs Large Language Models (LLMs) to extract either structured data according to a schema or relevant blocks of text based on natural language instructions from various content formats (HTML, Markdown, text).
|
||||||
|
* 4.1.2. Inheritance: `ExtractionStrategy`
|
||||||
|
* 4.1.3. Initialization (`__init__`):
|
||||||
|
* 4.1.3.1. Signature: `LLMExtractionStrategy(llm_config: Optional[LLMConfig] = None, instruction: Optional[str] = None, schema: Optional[Union[Dict[str, Any], "BaseModel"]] = None, extraction_type: str = "block", chunk_token_threshold: int = CHUNK_TOKEN_THRESHOLD, overlap_rate: float = OVERLAP_RATE, word_token_rate: float = WORD_TOKEN_RATE, apply_chunking: bool = True, force_json_response: bool = False, **kwargs)`
|
||||||
|
* 4.1.3.2. Parameters:
|
||||||
|
* `llm_config (Optional[LLMConfig], default: None)`: Configuration for the LLM. If `None`, a default `LLMConfig` is created.
|
||||||
|
* `instruction (Optional[str], default: None)`: Natural language instructions to guide the LLM's extraction process (e.g., "Extract the main article content", "Summarize the key points").
|
||||||
|
* `schema (Optional[Union[Dict[str, Any], "BaseModel"]], default: None)`: A Pydantic model class or a dictionary representing a JSON schema. Used when `extraction_type` is "schema" to define the desired output structure.
|
||||||
|
* `extraction_type (str, default: "block")`: Determines the extraction mode.
|
||||||
|
* `"block"`: LLM identifies and extracts relevant blocks/chunks of text based on the `instruction`.
|
||||||
|
* `"schema"`: LLM attempts to populate the fields defined in `schema` from the content.
|
||||||
|
* `chunk_token_threshold (int, default: CHUNK_TOKEN_THRESHOLD)`: The target maximum number of tokens for each chunk of content sent to the LLM. `CHUNK_TOKEN_THRESHOLD` is defined in `crawl4ai.config` (default value: 10000).
|
||||||
|
* `overlap_rate (float, default: OVERLAP_RATE)`: The percentage of overlap between consecutive chunks to ensure context continuity. `OVERLAP_RATE` is defined in `crawl4ai.config` (default value: 0.1, i.e., 10%).
|
||||||
|
* `word_token_rate (float, default: WORD_TOKEN_RATE)`: An estimated ratio of words to tokens (e.g., 0.75 words per token). Used for approximating chunk boundaries. `WORD_TOKEN_RATE` is defined in `crawl4ai.config` (default value: 0.75).
|
||||||
|
* `apply_chunking (bool, default: True)`: If `True`, the input content is chunked before being sent to the LLM. If `False`, the entire content is sent (which might exceed token limits for large inputs).
|
||||||
|
* `force_json_response (bool, default: False)`: If `True` and `extraction_type` is "schema", instructs the LLM to strictly adhere to JSON output format.
|
||||||
|
* `**kwargs`: Passed to `ExtractionStrategy` and potentially to the underlying LLM API calls (e.g., `temperature`, `max_tokens` if not set in `llm_config`).
|
||||||
|
* 4.1.4. Key Public Methods:
|
||||||
|
* `extract(self, url: str, content: str, *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||||
|
* Description: Processes the input `content`. If `apply_chunking` is `True`, it first chunks the content using the specified `chunking_strategy` (or a default one if `LLMExtractionStrategy` manages it internally). Then, for each chunk (or the whole content if not chunked), it constructs a prompt based on `instruction` and/or `schema` and sends it to the configured LLM.
|
||||||
|
* Returns: `List[Dict[str, Any]]` - A list of dictionaries.
|
||||||
|
* If `extraction_type` is "block", each dictionary typically contains `{"index": int, "content": str, "tags": List[str]}`.
|
||||||
|
* If `extraction_type` is "schema", each dictionary is an instance of the extracted structured data, ideally conforming to the provided `schema`. If the LLM returns multiple JSON objects in a list, they are parsed and returned.
|
||||||
|
* `run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||||
|
* Description: Processes a list of content `sections` in parallel (using `ThreadPoolExecutor`). Each section is passed to the `extract` method logic.
|
||||||
|
* Returns: `List[Dict[str, Any]]` - Aggregated list of results from processing all sections.
|
||||||
|
* 4.1.5. `TokenUsage` Tracking:
|
||||||
|
* `total_usage (TokenUsage)`: [Read-only Public Attribute] - An instance of `TokenUsage` that accumulates the token counts (prompt, completion, total) from all LLM API calls made by this `LLMExtractionStrategy` instance.
|
||||||
|
* `usages (List[TokenUsage])`: [Read-only Public Attribute] - A list containing individual `TokenUsage` objects for each separate LLM API call made during the extraction process. This allows for detailed tracking of token consumption per call.
|
||||||
|
|
||||||
|
## 5. `ChunkingStrategy` Interface and Implementations
|
||||||
|
|
||||||
|
* ### 5.1. Interface `ChunkingStrategy`
|
||||||
|
* 5.1.1. Purpose: The `ChunkingStrategy` class, found in `crawl4ai.chunking_strategy`, is an abstract base class (ABC) that defines the interface for different content chunking algorithms. Chunking is used to break down large pieces of text or HTML into smaller, manageable segments, often before feeding them to an LLM or other processing steps.
|
||||||
|
* 5.1.2. Key Abstract Methods:
|
||||||
|
* `chunk(self, content: str) -> List[str]`:
|
||||||
|
* Description: Abstract method that must be implemented by subclasses to split the input `content` string into a list of string chunks.
|
||||||
|
* Parameters:
|
||||||
|
* `content (str)`: The content to be chunked.
|
||||||
|
* Returns: `List[str]` - A list of content chunks.
|
||||||
|
|
||||||
|
* ### 5.2. Class `RegexChunking`
|
||||||
|
* 5.2.1. Purpose: Implements `ChunkingStrategy` by splitting content based on a list of regular expression patterns. It can also attempt to merge smaller chunks to meet a target `chunk_size`.
|
||||||
|
* 5.2.2. Inheritance: `ChunkingStrategy`
|
||||||
|
* 5.2.3. Initialization (`__init__`):
|
||||||
|
* 5.2.3.1. Signature: `RegexChunking(patterns: Optional[List[str]] = None, chunk_size: Optional[int] = None, overlap: Optional[int] = None, word_token_ratio: Optional[float] = WORD_TOKEN_RATE, **kwargs)`
|
||||||
|
* 5.2.3.2. Parameters:
|
||||||
|
* `patterns (Optional[List[str]], default: None)`: A list of regex patterns used to split the text. If `None`, defaults to paragraph-based splitting (`["\\n\\n+"]`).
|
||||||
|
* `chunk_size (Optional[int], default: None)`: The target token size for each chunk. If specified, the strategy will try to merge smaller chunks created by regex splitting to approximate this size.
|
||||||
|
* `overlap (Optional[int], default: None)`: The target token overlap between consecutive chunks when `chunk_size` is active.
|
||||||
|
* `word_token_ratio (Optional[float], default: WORD_TOKEN_RATE)`: The estimated ratio of words to tokens, used if `chunk_size` or `overlap` are specified. `WORD_TOKEN_RATE` is defined in `crawl4ai.config` (default value: 0.75).
|
||||||
|
* `**kwargs`: Additional keyword arguments.
|
||||||
|
* 5.2.4. Key Public Methods:
|
||||||
|
* `chunk(self, content: str) -> List[str]`:
|
||||||
|
* Description: Splits the input `content` using the configured regex patterns. If `chunk_size` is set, it then merges these initial chunks to meet the target size with the specified overlap.
|
||||||
|
|
||||||
|
* ### 5.3. Class `IdentityChunking`
|
||||||
|
* 5.3.1. Purpose: A `ChunkingStrategy` that does not perform any actual chunking. It returns the input content as a single chunk in a list.
|
||||||
|
* 5.3.2. Inheritance: `ChunkingStrategy`
|
||||||
|
* 5.3.3. Initialization (`__init__`):
|
||||||
|
* 5.3.3.1. Signature: `IdentityChunking(**kwargs)`
|
||||||
|
* 5.3.3.2. Parameters:
|
||||||
|
* `**kwargs`: Additional keyword arguments.
|
||||||
|
* 5.3.4. Key Public Methods:
|
||||||
|
* `chunk(self, content: str) -> List[str]`:
|
||||||
|
* Description: Returns the input `content` as a single-element list: `[content]`.
|
||||||
|
|
||||||
|
## 6. Defining Schemas for Extraction
|
||||||
|
|
||||||
|
* 6.1. Purpose: Schemas provide a structured way to define what data needs to be extracted from content and how it should be organized. This allows for consistent and predictable output from the extraction process.
|
||||||
|
* 6.2. Schemas for CSS/XPath/LXML-based Extraction (`JsonCssExtractionStrategy`, etc.):
|
||||||
|
* 6.2.1. Format: These strategies use a dictionary-based JSON-like schema.
|
||||||
|
* 6.2.2. Key elements: As detailed in section 3.2.4 for `JsonCssExtractionStrategy`:
|
||||||
|
* `name (str)`: Name of the schema.
|
||||||
|
* `baseSelector (str)`: CSS selector (for CSS strategies) or XPath expression (for XPath strategy) identifying the repeating parent elements.
|
||||||
|
* `fields (List[Dict[str, Any]])`: A list defining each field to extract. Each field definition includes:
|
||||||
|
* `name (str)`: Output key for the field.
|
||||||
|
* `selector (str)`: CSS/XPath selector relative to the `baseSelector` or parent "nested" element.
|
||||||
|
* `type (str)`: "text", "attribute", "html", "list", "nested".
|
||||||
|
* `attribute (str, Optional)`: Name of HTML attribute (if type is "attribute").
|
||||||
|
* `fields (List[Dict], Optional)`: For "list" (of objects) or "nested" types.
|
||||||
|
* `transform (str, Optional)`: e.g., "lowercase".
|
||||||
|
* `default (Any, Optional)`: Default value if not found.
|
||||||
|
* 6.3. Schemas for LLM-based Extraction (`LLMExtractionStrategy`):
|
||||||
|
* 6.3.1. Format: `LLMExtractionStrategy` accepts schemas in two main formats when `extraction_type="schema"`:
|
||||||
|
* Pydantic models: The Pydantic model class itself.
|
||||||
|
* Dictionary: A Python dictionary representing a valid JSON schema.
|
||||||
|
* 6.3.2. Pydantic Models:
|
||||||
|
* Definition: Users can define a Pydantic `BaseModel` where each field represents a piece of data to be extracted. Field types and descriptions are automatically inferred.
|
||||||
|
* Conversion: `LLMExtractionStrategy` internally converts the Pydantic model to its JSON schema representation (`model_json_schema()`) to guide the LLM.
|
||||||
|
* 6.3.3. Dictionary-based JSON Schema:
|
||||||
|
* Structure: Users can provide a dictionary that conforms to the JSON Schema specification. This typically includes a `type: "object"` at the root and a `properties` dictionary defining each field, its type (e.g., "string", "number", "array", "object"), and optionally a `description`.
|
||||||
|
* Usage: This schema is passed to the LLM to instruct it on the desired output format.
|
||||||
|
|
||||||
|
## 7. Configuration with `CrawlerRunConfig`
|
||||||
|
|
||||||
|
* 7.1. Purpose: The `CrawlerRunConfig` class (from `crawl4ai.async_configs`) is used to configure the behavior of a specific `arun()` or `arun_many()` call on an `AsyncWebCrawler` instance. It allows specifying various runtime parameters, including the extraction and chunking strategies.
|
||||||
|
* 7.2. Key Attributes:
|
||||||
|
* `extraction_strategy (Optional[ExtractionStrategy], default: None)`:
|
||||||
|
* Purpose: Specifies the `ExtractionStrategy` instance to be used for processing the content obtained during the crawl. If `None`, no structured extraction beyond basic Markdown generation occurs (unless a default is applied by the crawler).
|
||||||
|
* Type: An instance of a class inheriting from `ExtractionStrategy`.
|
||||||
|
* `chunking_strategy (Optional[ChunkingStrategy], default: RegexChunking())`:
|
||||||
|
* Purpose: Specifies the `ChunkingStrategy` instance to be used for breaking down content into smaller pieces before it's passed to an `ExtractionStrategy` (particularly `LLMExtractionStrategy`).
|
||||||
|
* Type: An instance of a class inheriting from `ChunkingStrategy`.
|
||||||
|
* Default: An instance of `RegexChunking()` with its default parameters (paragraph-based splitting).
|
||||||
|
|
||||||
|
## 8. LLM-Specific Configuration and Models
|
||||||
|
|
||||||
|
* ### 8.1. Class `LLMConfig`
|
||||||
|
* 8.1.1. Purpose: The `LLMConfig` class (from `crawl4ai.async_configs`) centralizes configuration parameters for interacting with Large Language Models (LLMs) through various providers.
|
||||||
|
* 8.1.2. Initialization (`__init__`):
|
||||||
|
* 8.1.2.1. Signature:
|
||||||
|
```python
|
||||||
|
class LLMConfig:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
provider: str = DEFAULT_PROVIDER,
|
||||||
|
api_token: Optional[str] = None,
|
||||||
|
base_url: Optional[str] = None,
|
||||||
|
temperature: Optional[float] = None,
|
||||||
|
max_tokens: Optional[int] = None,
|
||||||
|
top_p: Optional[float] = None,
|
||||||
|
frequency_penalty: Optional[float] = None,
|
||||||
|
presence_penalty: Optional[float] = None,
|
||||||
|
stop: Optional[List[str]] = None,
|
||||||
|
n: Optional[int] = None,
|
||||||
|
): ...
|
||||||
|
```
|
||||||
|
* 8.1.2.2. Parameters:
|
||||||
|
* `provider (str, default: DEFAULT_PROVIDER)`: Specifies the LLM provider and model, e.g., "openai/gpt-4o-mini", "ollama/llama3.3". `DEFAULT_PROVIDER` is "openai/gpt-4o-mini".
|
||||||
|
* `api_token (Optional[str], default: None)`: API token for the LLM provider. If `None`, the system attempts to read it from environment variables (e.g., `OPENAI_API_KEY`, `GEMINI_API_KEY`, `GROQ_API_KEY` based on provider). Can also be prefixed with "env:" (e.g., "env:MY_CUSTOM_LLM_KEY").
|
||||||
|
* `base_url (Optional[str], default: None)`: Custom base URL for the LLM API endpoint, for self-hosted or alternative provider endpoints.
|
||||||
|
* `temperature (Optional[float], default: None)`: Controls randomness in LLM generation. Higher values (e.g., 0.8) make output more random, lower (e.g., 0.2) more deterministic.
|
||||||
|
* `max_tokens (Optional[int], default: None)`: Maximum number of tokens the LLM should generate in its response.
|
||||||
|
* `top_p (Optional[float], default: None)`: Nucleus sampling parameter. An alternative to temperature; controls the cumulative probability mass of tokens considered for generation.
|
||||||
|
* `frequency_penalty (Optional[float], default: None)`: Penalizes new tokens based on their existing frequency in the text so far, decreasing repetition.
|
||||||
|
* `presence_penalty (Optional[float], default: None)`: Penalizes new tokens based on whether they have appeared in the text so far, encouraging new topics.
|
||||||
|
* `stop (Optional[List[str]], default: None)`: A list of sequences where the API will stop generating further tokens.
|
||||||
|
* `n (Optional[int], default: None)`: Number of completions to generate for each prompt.
|
||||||
|
* 8.1.3. Helper Methods:
|
||||||
|
* `from_kwargs(kwargs: dict) -> LLMConfig`:
|
||||||
|
* Description: [Static method] Creates an `LLMConfig` instance from a dictionary of keyword arguments.
|
||||||
|
* `to_dict() -> dict`:
|
||||||
|
* Description: Converts the `LLMConfig` instance into a dictionary representation.
|
||||||
|
* `clone(**kwargs) -> LLMConfig`:
|
||||||
|
* Description: Creates a new `LLMConfig` instance as a copy of the current one, allowing specific attributes to be overridden with `kwargs`.
|
||||||
|
|
||||||
|
* ### 8.2. Dataclass `TokenUsage`
|
||||||
|
* 8.2.1. Purpose: The `TokenUsage` dataclass (from `crawl4ai.models`) is used to store information about the number of tokens consumed during an LLM API call.
|
||||||
|
* 8.2.2. Fields:
|
||||||
|
* `completion_tokens (int, default: 0)`: The number of tokens generated by the LLM in the completion.
|
||||||
|
* `prompt_tokens (int, default: 0)`: The number of tokens in the prompt sent to the LLM.
|
||||||
|
* `total_tokens (int, default: 0)`: The sum of `completion_tokens` and `prompt_tokens`.
|
||||||
|
* `completion_tokens_details (Optional[dict], default: None)`: Provider-specific detailed breakdown of completion tokens, if available.
|
||||||
|
* `prompt_tokens_details (Optional[dict], default: None)`: Provider-specific detailed breakdown of prompt tokens, if available.
|
||||||
|
|
||||||
|
## 9. PDF Processing and Extraction
|
||||||
|
|
||||||
|
* ### 9.1. Overview of PDF Processing
|
||||||
|
* 9.1.1. Purpose: Crawl4ai provides specialized strategies to handle PDF documents, enabling the fetching of PDF content and subsequent extraction of text, images, and metadata. This allows PDFs to be treated as a primary content source similar to HTML web pages.
|
||||||
|
* 9.1.2. Key Components:
|
||||||
|
* `PDFCrawlerStrategy`: For fetching/identifying PDF content.
|
||||||
|
* `PDFContentScrapingStrategy`: For processing PDF content using an underlying PDF processor.
|
||||||
|
* `NaivePDFProcessorStrategy`: The default logic for parsing PDF files.
|
||||||
|
|
||||||
|
* ### 9.2. Class `PDFCrawlerStrategy`
|
||||||
|
* 9.2.1. Purpose: An implementation of `AsyncCrawlerStrategy` specifically for handling PDF documents. It doesn't perform typical browser interactions but focuses on fetching PDF content and setting the appropriate response headers to indicate a PDF document, which then allows `PDFContentScrapingStrategy` to process it.
|
||||||
|
* 9.2.2. Inheritance: `AsyncCrawlerStrategy` (from `crawl4ai.async_crawler_strategy`)
|
||||||
|
* 9.2.3. Initialization (`__init__`):
|
||||||
|
* 9.2.3.1. Signature: `PDFCrawlerStrategy(logger: Optional[AsyncLogger] = None)`
|
||||||
|
* 9.2.3.2. Parameters:
|
||||||
|
* `logger (Optional[AsyncLogger], default: None)`: An optional logger instance for logging messages.
|
||||||
|
* 9.2.4. Key Public Methods:
|
||||||
|
* `crawl(self, url: str, **kwargs) -> AsyncCrawlResponse`:
|
||||||
|
* Description: Fetches the content from the given `url`. If the content is identified as a PDF (either by URL extension or `Content-Type` header for remote URLs), it sets `response_headers={"Content-Type": "application/pdf"}` in the returned `AsyncCrawlResponse`. The `html` field of the response will contain a placeholder message as the actual PDF processing happens in the scraping strategy.
|
||||||
|
* `close(self) -> None`:
|
||||||
|
* Description: Placeholder for cleanup, typically does nothing in this strategy.
|
||||||
|
* `__aenter__(self) -> "PDFCrawlerStrategy"`:
|
||||||
|
* Description: Async context manager entry point.
|
||||||
|
* `__aexit__(self, exc_type, exc_val, exc_tb) -> None`:
|
||||||
|
* Description: Async context manager exit point, calls `close()`.
|
||||||
|
|
||||||
|
* ### 9.3. Class `PDFContentScrapingStrategy`
|
||||||
|
* 9.3.1. Purpose: An implementation of `ContentScrapingStrategy` designed to process PDF documents. It uses an underlying `PDFProcessorStrategy` (by default, `NaivePDFProcessorStrategy`) to extract text, images, and metadata from the PDF, then formats this information into a `ScrapingResult`.
|
||||||
|
* 9.3.2. Inheritance: `ContentScrapingStrategy` (from `crawl4ai.content_scraping_strategy`)
|
||||||
|
* 9.3.3. Initialization (`__init__`):
|
||||||
|
* 9.3.3.1. Signature: `PDFContentScrapingStrategy(save_images_locally: bool = False, extract_images: bool = False, image_save_dir: Optional[str] = None, batch_size: int = 4, logger: Optional[AsyncLogger] = None)`
|
||||||
|
* 9.3.3.2. Parameters:
|
||||||
|
* `save_images_locally (bool, default: False)`: If `True`, extracted images will be saved to the local filesystem.
|
||||||
|
* `extract_images (bool, default: False)`: If `True`, the strategy will attempt to extract images from the PDF.
|
||||||
|
* `image_save_dir (Optional[str], default: None)`: The directory where extracted images will be saved if `save_images_locally` is `True`. If `None`, a default or temporary directory might be used.
|
||||||
|
* `batch_size (int, default: 4)`: The number of PDF pages to process in parallel by the underlying `NaivePDFProcessorStrategy`.
|
||||||
|
* `logger (Optional[AsyncLogger], default: None)`: An optional logger instance.
|
||||||
|
* 9.3.4. Key Attributes:
|
||||||
|
* `pdf_processor (NaivePDFProcessorStrategy)`: An instance of `NaivePDFProcessorStrategy` configured with the provided image and batch settings, used to do the actual PDF parsing.
|
||||||
|
* 9.3.5. Key Public Methods:
|
||||||
|
* `scrape(self, url: str, html: str, **params) -> ScrapingResult`:
|
||||||
|
* Description: Takes a `url` (which can be a local file path or a remote HTTP/HTTPS URL pointing to a PDF) and processes it. The `html` parameter is typically a placeholder like "Scraper will handle the real work" as the content comes from the PDF file itself. It downloads remote PDFs to a temporary local file before processing.
|
||||||
|
* Returns: `ScrapingResult` containing the extracted PDF data, including `cleaned_html` (concatenated HTML of pages), `media` (extracted images), `links`, and `metadata`.
|
||||||
|
* `ascrape(self, url: str, html: str, **kwargs) -> ScrapingResult`:
|
||||||
|
* Description: Asynchronous version of `scrape`. Internally calls `scrape` using `asyncio.to_thread`.
|
||||||
|
* 9.3.6. Internal Methods (Conceptual):
|
||||||
|
* `_get_pdf_path(self, url: str) -> str`:
|
||||||
|
* Description: If `url` is an HTTP/HTTPS URL, downloads the PDF to a temporary file and returns its path. If `url` starts with "file://", it strips the prefix and returns the local path. Otherwise, assumes `url` is already a local path. Handles download timeouts and errors.
|
||||||
|
|
||||||
|
* ### 9.4. Class `NaivePDFProcessorStrategy`
|
||||||
|
* 9.4.1. Purpose: The default implementation of `PDFProcessorStrategy` in Crawl4ai. It uses the PyPDF2 library (and Pillow for image processing) to parse PDF files, extract text content page by page, attempt to extract embedded images, and gather document metadata.
|
||||||
|
* 9.4.2. Inheritance: `PDFProcessorStrategy` (from `crawl4ai.processors.pdf.processor`)
|
||||||
|
* 9.4.3. Dependencies: Requires `PyPDF2` and `Pillow`. These are installed with the `crawl4ai[pdf]` extra.
|
||||||
|
* 9.4.4. Initialization (`__init__`):
|
||||||
|
* 9.4.4.1. Signature: `NaivePDFProcessorStrategy(image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True, save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4)`
|
||||||
|
* 9.4.4.2. Parameters:
|
||||||
|
* `image_dpi (int, default: 144)`: DPI used when rendering PDF pages to images (if direct image extraction is not possible or disabled).
|
||||||
|
* `image_quality (int, default: 85)`: Quality setting (1-100) for images saved in lossy formats like JPEG.
|
||||||
|
* `extract_images (bool, default: True)`: If `True`, attempts to extract embedded images directly from the PDF's XObjects.
|
||||||
|
* `save_images_locally (bool, default: False)`: If `True`, extracted images are saved to disk. Otherwise, they are base64 encoded and returned in the `PDFPage.images` data.
|
||||||
|
* `image_save_dir (Optional[Path], default: None)`: If `save_images_locally` is True, this specifies the directory to save images. If `None`, a temporary directory (prefixed `pdf_images_`) is created and used.
|
||||||
|
* `batch_size (int, default: 4)`: The number of pages to process in parallel when using the `process_batch` method.
|
||||||
|
* 9.4.5. Key Public Methods:
|
||||||
|
* `process(self, pdf_path: Path) -> PDFProcessResult`:
|
||||||
|
* Description: Processes the PDF specified by `pdf_path` page by page sequentially.
|
||||||
|
* Returns: `PDFProcessResult` containing metadata and a list of `PDFPage` objects.
|
||||||
|
* `process_batch(self, pdf_path: Path) -> PDFProcessResult`:
|
||||||
|
* Description: Processes the PDF specified by `pdf_path` by handling pages in parallel batches using a `ThreadPoolExecutor` with `max_workers` set to `batch_size`.
|
||||||
|
* Returns: `PDFProcessResult` containing metadata and a list of `PDFPage` objects, assembled in the correct page order.
|
||||||
|
* 9.4.6. Internal Methods (Conceptual High-Level):
|
||||||
|
* `_process_page(self, page: PyPDF2PageObject, image_dir: Optional[Path]) -> PDFPage`: Extracts text, images (if `extract_images` is True), and links from a single PyPDF2 page object.
|
||||||
|
* `_extract_images(self, page: PyPDF2PageObject, image_dir: Optional[Path]) -> List[Dict]`: Iterates through XObjects on a page, identifies images, decodes them (handling FlateDecode, DCTDecode, CCITTFaxDecode, JPXDecode), and either saves them locally or base64 encodes them.
|
||||||
|
* `_extract_links(self, page: PyPDF2PageObject) -> List[str]`: Extracts URI actions from page annotations to get hyperlinks.
|
||||||
|
* `_extract_metadata(self, pdf_path: Path, reader: PyPDF2PdfReader) -> PDFMetadata`: Reads metadata from the PDF document information dictionary (e.g., /Title, /Author, /CreationDate).
|
||||||
|
|
||||||
|
* ### 9.5. Data Models for PDF Processing
|
||||||
|
* 9.5.1. Dataclass `PDFMetadata` (from `crawl4ai.processors.pdf.processor`)
|
||||||
|
* Fields:
|
||||||
|
* `title (Optional[str], default: None)`
|
||||||
|
* `author (Optional[str], default: None)`
|
||||||
|
* `producer (Optional[str], default: None)`
|
||||||
|
* `created (Optional[datetime], default: None)`
|
||||||
|
* `modified (Optional[datetime], default: None)`
|
||||||
|
* `pages (int, default: 0)`
|
||||||
|
* `encrypted (bool, default: False)`
|
||||||
|
* `file_size (Optional[int], default: None)`
|
||||||
|
* 9.5.2. Dataclass `PDFPage` (from `crawl4ai.processors.pdf.processor`)
|
||||||
|
* Fields:
|
||||||
|
* `page_number (int)`
|
||||||
|
* `raw_text (str, default: "")`
|
||||||
|
* `markdown (str, default: "")`: Markdown representation of the page's text content, processed by `clean_pdf_text`.
|
||||||
|
* `html (str, default: "")`: HTML representation of the page's text content, processed by `clean_pdf_text_to_html`.
|
||||||
|
* `images (List[Dict], default_factory: list)`: List of image dictionaries. Each dictionary contains:
|
||||||
|
* `format (str)`: e.g., "png", "jpeg", "tiff", "jp2", "bin".
|
||||||
|
* `width (int)`
|
||||||
|
* `height (int)`
|
||||||
|
* `color_space (str)`: e.g., "/DeviceRGB", "/DeviceGray".
|
||||||
|
* `bits_per_component (int)`
|
||||||
|
* `path (str, Optional)`: If `save_images_locally` was True, path to the saved image file.
|
||||||
|
* `data (str, Optional)`: If `save_images_locally` was False, base64 encoded image data.
|
||||||
|
* `page (int)`: The page number this image was extracted from.
|
||||||
|
* `links (List[str], default_factory: list)`: List of hyperlink URLs found on the page.
|
||||||
|
* `layout (List[Dict], default_factory: list)`: List of dictionaries representing text layout elements, primarily: `{"type": "text", "text": str, "x": float, "y": float}`.
|
||||||
|
* 9.5.3. Dataclass `PDFProcessResult` (from `crawl4ai.processors.pdf.processor`)
|
||||||
|
* Fields:
|
||||||
|
* `metadata (PDFMetadata)`
|
||||||
|
* `pages (List[PDFPage])`
|
||||||
|
* `processing_time (float, default: 0.0)`: Time in seconds taken to process the PDF.
|
||||||
|
* `version (str, default: "1.1")`: Version of the PDF processor strategy (e.g., "1.1" for current `NaivePDFProcessorStrategy`).
|
||||||
|
|
||||||
|
* ### 9.6. Using PDF Strategies with `AsyncWebCrawler`
|
||||||
|
* 9.6.1. Workflow:
|
||||||
|
1. Instantiate `AsyncWebCrawler`. The `crawler_strategy` parameter of `AsyncWebCrawler` should be set to an instance of `PDFCrawlerStrategy` if you intend to primarily crawl PDF URLs or local PDF files directly. If crawling mixed content where PDFs are discovered via links on HTML pages, the default `AsyncPlaywrightCrawlerStrategy` might be used initially, and then a PDF-specific scraping strategy would be applied when a PDF content type is detected.
|
||||||
|
2. In `CrawlerRunConfig`, set the `scraping_strategy` attribute to an instance of `PDFContentScrapingStrategy`. Configure this strategy with desired options like `extract_images`, `save_images_locally`, etc.
|
||||||
|
3. When `crawler.arun(url="path/to/document.pdf", config=run_config)` is called for a PDF URL or local file path:
|
||||||
|
* `PDFCrawlerStrategy` (if used) or the default crawler strategy fetches the file.
|
||||||
|
* `PDFContentScrapingStrategy.scrape()` is invoked. It uses its internal `NaivePDFProcessorStrategy` instance to parse the PDF.
|
||||||
|
* The extracted text, image data, and metadata are populated into the `CrawlResult` object (e.g., `result.markdown`, `result.media["images"]`, `result.metadata`).
|
||||||
|
* 9.6.2. Example Snippet:
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from pathlib import Path
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, PDFCrawlerStrategy
|
||||||
|
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
# Setup for PDF processing
|
||||||
|
pdf_crawler_strategy = PDFCrawlerStrategy() # Use if directly targeting PDF URLs
|
||||||
|
pdf_scraping_strategy = PDFContentScrapingStrategy(
|
||||||
|
extract_images=True,
|
||||||
|
save_images_locally=True,
|
||||||
|
image_save_dir="./pdf_images_output" # Ensure this directory exists
|
||||||
|
)
|
||||||
|
Path("./pdf_images_output").mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# If crawling a website that links to PDFs, you might use the default crawler strategy
|
||||||
|
# and rely on content-type detection to switch to PDFContentScrapingStrategy if needed.
|
||||||
|
# For direct PDF URL:
|
||||||
|
async with AsyncWebCrawler(crawler_strategy=pdf_crawler_strategy) as crawler:
|
||||||
|
run_config = CrawlerRunConfig(scraping_strategy=pdf_scraping_strategy)
|
||||||
|
# Example PDF URL (replace with a real one for testing)
|
||||||
|
pdf_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
|
||||||
|
result = await crawler.arun(url=pdf_url, config=run_config)
|
||||||
|
|
||||||
|
if result.success:
|
||||||
|
print(f"Successfully processed PDF: {result.url}")
|
||||||
|
if result.markdown:
|
||||||
|
print(f"Markdown content (first 500 chars): {result.markdown.raw_markdown[:500]}")
|
||||||
|
if result.media and result.media.images:
|
||||||
|
print(f"Extracted {len(result.media.images)} images.")
|
||||||
|
for img in result.media.images:
|
||||||
|
print(f" - Image source/path: {img.src or img.path}, Page: {img.page}")
|
||||||
|
if result.metadata:
|
||||||
|
print(f"PDF Metadata: {result.metadata}")
|
||||||
|
else:
|
||||||
|
print(f"Failed to process PDF: {result.url}, Error: {result.error_message}")
|
||||||
|
|
||||||
|
# if __name__ == "__main__":
|
||||||
|
# asyncio.run(main())
|
||||||
|
```
|
|
||||||
File diff suppressed because it is too large
Load Diff
1824
docs/md_v2/assets/llmtxt/crawl4ai_markdown.llm.full.txt
Normal file
1824
docs/md_v2/assets/llmtxt/crawl4ai_markdown.llm.full.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,818 @@
|
|||||||
|
# Examples Outline for crawl4ai - markdown Component
|
||||||
|
|
||||||
|
**Target Document Type:** Examples Collection
|
||||||
|
**Target Output Filename Suggestion:** `llm_examples_markdown.md`
|
||||||
|
**Library Version Context:** 0.6.3
|
||||||
|
**Outline Generation Date:** 2025-05-24
|
||||||
|
---
|
||||||
|
|
||||||
|
This document provides practical, runnable code examples for the `markdown` component of the `crawl4ai` library, focusing on the `DefaultMarkdownGenerator` and its various configurations.
|
||||||
|
|
||||||
|
## 1. Basic Markdown Generation with `DefaultMarkdownGenerator`
|
||||||
|
|
||||||
|
### 1.1. Example: Generating Markdown with default `DefaultMarkdownGenerator` settings via `AsyncWebCrawler`.
|
||||||
|
This example demonstrates the most basic usage of `DefaultMarkdownGenerator` within an `AsyncWebCrawler` run.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||||
|
|
||||||
|
async def basic_markdown_generation_via_crawler():
|
||||||
|
# DefaultMarkdownGenerator will be used by default if markdown_generator is not specified,
|
||||||
|
# but we explicitly set it here for clarity.
|
||||||
|
md_generator = DefaultMarkdownGenerator()
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
markdown_generator=md_generator,
|
||||||
|
cache_mode=CacheMode.BYPASS # Use BYPASS for fresh content in examples
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="https://example.com", config=config)
|
||||||
|
if result.success and result.markdown:
|
||||||
|
print("--- Raw Markdown (First 300 chars) ---")
|
||||||
|
print(result.markdown.raw_markdown[:300])
|
||||||
|
print("\n--- Markdown with Citations (First 300 chars) ---")
|
||||||
|
print(result.markdown.markdown_with_citations[:300])
|
||||||
|
print("\n--- References Markdown ---")
|
||||||
|
print(result.markdown.references_markdown) # example.com typically has only a single outbound link (to iana.org)
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(basic_markdown_generation_via_crawler())
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### 1.2. Example: Direct instantiation and use of `DefaultMarkdownGenerator`.
|
||||||
|
You can use `DefaultMarkdownGenerator` directly if you already have HTML content.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
def direct_markdown_generation():
|
||||||
|
generator = DefaultMarkdownGenerator()
|
||||||
|
html_content = """
|
||||||
|
<html>
|
||||||
|
<head><title>Test Page</title></head>
|
||||||
|
<body>
|
||||||
|
<h1>Welcome to Example</h1>
|
||||||
|
<p>This is a paragraph with a <a href="https://example.org/another-page">link</a>.</p>
|
||||||
|
<p>Another paragraph follows.</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
# base_url is important for resolving relative links if any, and for citation context
|
||||||
|
result_md = generator.generate_markdown(input_html=html_content, base_url="https://example.com")
|
||||||
|
|
||||||
|
print("--- Raw Markdown (Direct Generation) ---")
|
||||||
|
print(result_md.raw_markdown)
|
||||||
|
print("\n--- Markdown with Citations (Direct Generation) ---")
|
||||||
|
print(result_md.markdown_with_citations)
|
||||||
|
print("\n--- References Markdown (Direct Generation) ---")
|
||||||
|
print(result_md.references_markdown)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
direct_markdown_generation()
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Citation Management in Markdown
|
||||||
|
|
||||||
|
### 2.1. Example: Default citation behavior (citations enabled).
|
||||||
|
By default, `DefaultMarkdownGenerator` generates citations for links.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
def default_citation_behavior():
|
||||||
|
generator = DefaultMarkdownGenerator()
|
||||||
|
html_content = """
|
||||||
|
<html><body>
|
||||||
|
<p>Check out <a href="https://crawl4ai.com" title="Crawl4ai Homepage">Crawl4ai</a> and
|
||||||
|
<a href="/docs">our documentation</a>.</p>
|
||||||
|
</body></html>
|
||||||
|
"""
|
||||||
|
result_md = generator.generate_markdown(input_html=html_content, base_url="https://example.com")
|
||||||
|
|
||||||
|
print("--- Raw Markdown ---")
|
||||||
|
print(result_md.raw_markdown)
|
||||||
|
print("\n--- Markdown with Citations ---")
|
||||||
|
print(result_md.markdown_with_citations)
|
||||||
|
print("\n--- References Markdown ---")
|
||||||
|
print(result_md.references_markdown)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
default_citation_behavior()
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2.2. Example: Disabling citations in `DefaultMarkdownGenerator`.
|
||||||
|
You can disable citation generation by setting `citations=False` in the `generate_markdown` method.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
def disabling_citations():
|
||||||
|
generator = DefaultMarkdownGenerator()
|
||||||
|
html_content = """
|
||||||
|
<html><body>
|
||||||
|
<p>A link to <a href="https://anothersite.com">another site</a> will not be cited.</p>
|
||||||
|
</body></html>
|
||||||
|
"""
|
||||||
|
# Disable citations for this specific call
|
||||||
|
result_md_no_citations = generator.generate_markdown(
|
||||||
|
input_html=html_content,
|
||||||
|
base_url="https://example.com",
|
||||||
|
citations=False
|
||||||
|
)
|
||||||
|
|
||||||
|
print("--- Raw Markdown (Citations Disabled) ---")
|
||||||
|
print(result_md_no_citations.raw_markdown)
|
||||||
|
print("\n--- Markdown with Citations (Citations Disabled) ---")
|
||||||
|
# This should be the same as raw_markdown when citations=False
|
||||||
|
print(result_md_no_citations.markdown_with_citations)
|
||||||
|
print("\n--- References Markdown (Citations Disabled) ---")
|
||||||
|
# This should be empty or minimal
|
||||||
|
print(result_md_no_citations.references_markdown)
|
||||||
|
|
||||||
|
# For comparison, with citations enabled (default)
|
||||||
|
result_md_with_citations = generator.generate_markdown(
|
||||||
|
input_html=html_content,
|
||||||
|
base_url="https://example.com",
|
||||||
|
citations=True # Default
|
||||||
|
)
|
||||||
|
print("\n--- For Comparison: Markdown with Citations (Enabled) ---")
|
||||||
|
print(result_md_with_citations.markdown_with_citations)
|
||||||
|
print("\n--- For Comparison: References Markdown (Enabled) ---")
|
||||||
|
print(result_md_with_citations.references_markdown)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
disabling_citations()
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2.3. Example: Impact of `base_url` on citation links for relative URLs.
|
||||||
|
The `base_url` parameter is crucial for correctly resolving relative URLs in your HTML content into absolute URLs in the references.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
def base_url_impact_on_citations():
|
||||||
|
generator = DefaultMarkdownGenerator()
|
||||||
|
html_content = """
|
||||||
|
<html><body>
|
||||||
|
<p>Links: <a href="/features">Features</a>, <a href="pricing.html">Pricing</a>,
|
||||||
|
and an absolute link to <a href="https://external.com/resource">External Resource</a>.</p>
|
||||||
|
</body></html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
print("--- Case 1: With base_url='https://example.com/products/' ---")
|
||||||
|
result_md_case1 = generator.generate_markdown(
|
||||||
|
input_html=html_content,
|
||||||
|
base_url="https://example.com/products/"
|
||||||
|
)
|
||||||
|
print(result_md_case1.references_markdown)
|
||||||
|
|
||||||
|
print("\n--- Case 2: With base_url='https://another-domain.net/' ---")
|
||||||
|
result_md_case2 = generator.generate_markdown(
|
||||||
|
input_html=html_content,
|
||||||
|
base_url="https://another-domain.net/"
|
||||||
|
)
|
||||||
|
print(result_md_case2.references_markdown)
|
||||||
|
|
||||||
|
print("\n--- Case 3: Without base_url (relative links might be incomplete) ---")
|
||||||
|
result_md_case3 = generator.generate_markdown(input_html=html_content)
|
||||||
|
print(result_md_case3.references_markdown)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
base_url_impact_on_citations()
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2.4. Example: Handling HTML with no links (empty `references_markdown`).
|
||||||
|
If the input HTML contains no hyperlinks, the `references_markdown` will be empty.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
def no_links_in_html():
|
||||||
|
generator = DefaultMarkdownGenerator()
|
||||||
|
html_content = "<html><body><p>This is a paragraph with no links at all.</p><b>Just some bold text.</b></body></html>"
|
||||||
|
result_md = generator.generate_markdown(input_html=html_content, base_url="https://example.com")
|
||||||
|
|
||||||
|
print("--- Raw Markdown ---")
|
||||||
|
print(result_md.raw_markdown)
|
||||||
|
print("\n--- Markdown with Citations ---")
|
||||||
|
print(result_md.markdown_with_citations) # Should be same as raw_markdown
|
||||||
|
print("\n--- References Markdown ---")
|
||||||
|
print(f"'{result_md.references_markdown}'") # Should be empty or contain minimal boilerplate
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
no_links_in_html()
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Controlling `html2text` Conversion Options
|
||||||
|
The `DefaultMarkdownGenerator` uses the `html2text` library internally. You can pass options to `html2text` either during generator initialization (`options` parameter) or during the `generate_markdown` call (`html2text_options` parameter).
|
||||||
|
|
||||||
|
### 3.1. Example: Initializing `DefaultMarkdownGenerator` with `options` to ignore links.
|
||||||
|
This will prevent links from appearing in the Markdown output altogether (different from `citations=False` which keeps link text but omits citation markers).
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
def ignore_links_option():
|
||||||
|
# Initialize with html2text option to ignore links
|
||||||
|
generator = DefaultMarkdownGenerator(options={"ignore_links": True})
|
||||||
|
html_content = "<html><body><p>A link to <a href='https://example.com'>Example Site</a> and some text.</p></body></html>"
|
||||||
|
result_md = generator.generate_markdown(input_html=html_content)
|
||||||
|
|
||||||
|
print("--- Markdown (ignore_links=True) ---")
|
||||||
|
print(result_md.raw_markdown) # Link text might be present or absent based on html2text behavior
|
||||||
|
print("--- Markdown with Citations (ignore_links=True) ---")
|
||||||
|
print(result_md.markdown_with_citations) # No citations as links are ignored
|
||||||
|
print("--- References (ignore_links=True) ---")
|
||||||
|
print(f"'{result_md.references_markdown}'") # Should be empty
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
ignore_links_option()
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3.2. Example: Initializing `DefaultMarkdownGenerator` with `options` to ignore images.
|
||||||
|
This will prevent image references (like ``) from appearing in the Markdown.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
def ignore_images_option():
|
||||||
|
generator = DefaultMarkdownGenerator(options={"ignore_images": True})
|
||||||
|
html_content = "<html><body><p>An image: <img src='image.png' alt='My Test Image'></p></body></html>"
|
||||||
|
result_md = generator.generate_markdown(input_html=html_content)
|
||||||
|
|
||||||
|
print("--- Markdown (ignore_images=True) ---")
|
||||||
|
print(result_md.raw_markdown) # Image markdown should be absent
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
ignore_images_option()
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3.3. Example: Initializing `DefaultMarkdownGenerator` with `options` for `body_width=0` (no line wrapping).
|
||||||
|
`body_width=0` tells `html2text` not to wrap lines.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
def no_line_wrapping_option():
|
||||||
|
generator = DefaultMarkdownGenerator(options={"body_width": 0})
|
||||||
|
long_text = "This is a very long line of text that would normally be wrapped by html2text. " * 5
|
||||||
|
html_content = f"<html><body><p>{long_text}</p></body></html>"
|
||||||
|
result_md = generator.generate_markdown(input_html=html_content)
|
||||||
|
|
||||||
|
print("--- Markdown (body_width=0) ---")
|
||||||
|
print(result_md.raw_markdown) # Observe the long line without soft wraps
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
no_line_wrapping_option()
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3.4. Example: Initializing `DefaultMarkdownGenerator` to disable emphasis.
|
||||||
|
This will remove formatting for `<em>` and `<strong>` tags.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
def ignore_emphasis_option():
|
||||||
|
generator = DefaultMarkdownGenerator(options={"ignore_emphasis": True})
|
||||||
|
html_content = "<html><body><p>Normal, <em>emphasized</em>, and <strong>strongly emphasized</strong> text.</p></body></html>"
|
||||||
|
result_md = generator.generate_markdown(input_html=html_content)
|
||||||
|
|
||||||
|
print("--- Markdown (ignore_emphasis=True) ---")
|
||||||
|
print(result_md.raw_markdown) # Emphasis should be gone
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
ignore_emphasis_option()
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3.5. Example: Overriding `html2text_options` at `generate_markdown` call time.
|
||||||
|
Options passed to `generate_markdown` via `html2text_options` take precedence.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
def override_html2text_options():
|
||||||
|
# Initial generator might have some defaults
|
||||||
|
generator = DefaultMarkdownGenerator(options={"ignore_links": False})
|
||||||
|
html_content = "<html><body><p>Link: <a href='https://example.com'>Example</a>.</p></body></html>"
|
||||||
|
|
||||||
|
# Override at call time to protect links
|
||||||
|
result_md = generator.generate_markdown(
|
||||||
|
input_html=html_content,
|
||||||
|
html2text_options={"protect_links": True} # Links will be <URL>
|
||||||
|
)
|
||||||
|
|
||||||
|
print("--- Markdown (protect_links=True via call-time override) ---")
|
||||||
|
print(result_md.raw_markdown)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
override_html2text_options()
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3.6. Example: Combining multiple `html2text` options.
|
||||||
|
Multiple options can be combined for fine-grained control over the Markdown output.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
def combined_html2text_options():
|
||||||
|
generator = DefaultMarkdownGenerator(options={
|
||||||
|
"ignore_links": True,
|
||||||
|
"ignore_images": True,
|
||||||
|
"body_width": 60 # Wrap at 60 characters
|
||||||
|
})
|
||||||
|
html_content = """
|
||||||
|
<html><body>
|
||||||
|
<p>This is a paragraph with a <a href='https://example.com'>link to ignore</a> and an
|
||||||
|
<img src='image.png' alt='image to ignore'>. It also has some long text to demonstrate wrapping.
|
||||||
|
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
|
||||||
|
</p>
|
||||||
|
</body></html>
|
||||||
|
"""
|
||||||
|
result_md = generator.generate_markdown(input_html=html_content)
|
||||||
|
|
||||||
|
print("--- Markdown (Combined Options: ignore_links, ignore_images, body_width=60) ---")
|
||||||
|
print(result_md.raw_markdown)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
combined_html2text_options()
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Selecting the HTML Content Source for Markdown Generation
|
||||||
|
The `DefaultMarkdownGenerator` can generate Markdown from different HTML sources within the `CrawlResult`.
|
||||||
|
|
||||||
|
### 4.1. Example: Markdown from `cleaned_html` (default `content_source`).
|
||||||
|
This is the default behavior. `cleaned_html` is the HTML after `WebScrapingStrategy` (e.g., `LXMLWebScrapingStrategy`) has processed it.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||||
|
|
||||||
|
async def markdown_from_cleaned_html():
|
||||||
|
# Default content_source is "cleaned_html"
|
||||||
|
md_generator = DefaultMarkdownGenerator()
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
markdown_generator=md_generator,
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
# Using a more complex page to see the effect of cleaning
|
||||||
|
result = await crawler.arun(url="https://news.ycombinator.com", config=config)
|
||||||
|
if result.success and result.markdown:
|
||||||
|
print("--- Markdown from Cleaned HTML (Default - First 300 chars) ---")
|
||||||
|
print(result.markdown.raw_markdown[:300])
|
||||||
|
# For comparison, show a snippet of cleaned_html
|
||||||
|
print("\n--- Cleaned HTML (Source - First 300 chars) ---")
|
||||||
|
print(result.cleaned_html[:300])
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(markdown_from_cleaned_html())
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### 4.2. Example: Markdown from `raw_html`.
|
||||||
|
This example uses the original, unprocessed HTML fetched from the URL as the source for Markdown generation.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||||
|
|
||||||
|
async def markdown_from_raw_html():
|
||||||
|
md_generator = DefaultMarkdownGenerator(content_source="raw_html")
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
markdown_generator=md_generator,
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="https://example.com", config=config)
|
||||||
|
if result.success and result.markdown:
|
||||||
|
print("--- Markdown from Raw HTML (First 300 chars) ---")
|
||||||
|
print(result.markdown.raw_markdown[:300])
|
||||||
|
print("\n--- Raw Page HTML (Source - First 300 chars for comparison) ---")
|
||||||
|
print(result.html[:300]) # result.html contains the raw HTML
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(markdown_from_raw_html())
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### 4.3. Example: Markdown from `fit_html` (requires a `ContentFilterStrategy`).
|
||||||
|
`fit_html` is the HTML content after a `ContentFilterStrategy` (like `PruningContentFilter`) has processed it.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||||
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||||
|
|
||||||
|
async def markdown_from_fit_html():
|
||||||
|
# A content filter must run to produce fit_html
|
||||||
|
pruning_filter = PruningContentFilter()
|
||||||
|
md_generator = DefaultMarkdownGenerator(
|
||||||
|
content_filter=pruning_filter,
|
||||||
|
content_source="fit_html" # Explicitly use the output of the filter
|
||||||
|
)
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
markdown_generator=md_generator,
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
# Using a news site which PruningContentFilter can work on
|
||||||
|
result = await crawler.arun(url="https://news.ycombinator.com", config=config)
|
||||||
|
if result.success and result.markdown:
|
||||||
|
print("--- Markdown from Fit HTML (Output of PruningFilter - First 300 chars) ---")
|
||||||
|
# When content_source="fit_html", result.markdown.raw_markdown IS from fit_html
|
||||||
|
print(result.markdown.raw_markdown[:300])
|
||||||
|
print("\n--- Fit HTML itself (Source - First 300 chars for comparison) ---")
|
||||||
|
print(result.markdown.fit_html[:300])
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(markdown_from_fit_html())
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Integration with Content Filters
|
||||||
|
`DefaultMarkdownGenerator` can work in conjunction with `ContentFilterStrategy` instances. If a filter is provided, it will produce `fit_html` and `fit_markdown`.
|
||||||
|
|
||||||
|
### 5.1. Example: `DefaultMarkdownGenerator` with `PruningContentFilter`.
|
||||||
|
The `PruningContentFilter` attempts to remove boilerplate and keep main content.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||||
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||||
|
|
||||||
|
async def md_with_pruning_filter():
|
||||||
|
pruning_filter = PruningContentFilter()
|
||||||
|
# By default, raw_markdown is from cleaned_html, fit_markdown is from fit_html
|
||||||
|
md_generator = DefaultMarkdownGenerator(content_filter=pruning_filter)
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
markdown_generator=md_generator,
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="https://news.ycombinator.com", config=config)
|
||||||
|
if result.success and result.markdown:
|
||||||
|
print("--- Raw Markdown (from cleaned_html - First 200 chars) ---")
|
||||||
|
print(result.markdown.raw_markdown[:200])
|
||||||
|
print("\n--- Fit Markdown (from PruningFilter's fit_html - First 200 chars) ---")
|
||||||
|
print(result.markdown.fit_markdown[:200])
|
||||||
|
print("\n--- Fit HTML (Source for Fit Markdown - First 200 chars) ---")
|
||||||
|
print(result.markdown.fit_html[:200])
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(md_with_pruning_filter())
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### 5.2. Example: `DefaultMarkdownGenerator` with `BM25ContentFilter`.
|
||||||
|
`BM25ContentFilter` filters content based on relevance to a user query.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||||
|
from crawl4ai.content_filter_strategy import BM25ContentFilter
|
||||||
|
|
||||||
|
async def md_with_bm25_filter():
|
||||||
|
bm25_filter = BM25ContentFilter(user_query="Python programming language features")
|
||||||
|
md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
markdown_generator=md_generator,
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
# Using a relevant page for the query
|
||||||
|
result = await crawler.arun(url="https://docs.python.org/3/tutorial/classes.html", config=config)
|
||||||
|
if result.success and result.markdown:
|
||||||
|
print("--- Fit Markdown (from BM25Filter - First 300 chars) ---")
|
||||||
|
print(result.markdown.fit_markdown[:300])
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(md_with_bm25_filter())
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### 5.3. Example: `DefaultMarkdownGenerator` with `LLMContentFilter`.
|
||||||
|
`LLMContentFilter` uses an LLM to intelligently filter or summarize content based on instructions. (Requires API Key)
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, LLMConfig, CacheMode
|
||||||
|
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||||
|
|
||||||
|
async def md_with_llm_filter():
|
||||||
|
openai_api_key = os.getenv("OPENAI_API_KEY")
|
||||||
|
if not openai_api_key:
|
||||||
|
print("OPENAI_API_KEY not found. Skipping LLMContentFilter example.")
|
||||||
|
return
|
||||||
|
|
||||||
|
llm_config = LLMConfig(api_token=openai_api_key, provider="openai/gpt-3.5-turbo")
|
||||||
|
llm_filter = LLMContentFilter(
|
||||||
|
llm_config=llm_config,
|
||||||
|
instruction="Summarize the main arguments presented in this Hacker News discussion thread."
|
||||||
|
)
|
||||||
|
md_generator = DefaultMarkdownGenerator(content_filter=llm_filter)
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
markdown_generator=md_generator,
|
||||||
|
cache_mode=CacheMode.BYPASS # Fresh run for LLM
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
# Example Hacker News discussion
|
||||||
|
result = await crawler.arun(url="https://news.ycombinator.com/item?id=39000000", config=config) # A past popular item
|
||||||
|
if result.success and result.markdown:
|
||||||
|
print("--- Fit Markdown (from LLMContentFilter - First 500 chars) ---")
|
||||||
|
print(result.markdown.fit_markdown[:500])
|
||||||
|
llm_filter.show_usage() # Show token usage
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(md_with_llm_filter())
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### 5.4. Example: Forcing Markdown generation from `fit_html` when a filter is active.
|
||||||
|
This example shows how to ensure the `raw_markdown` itself is generated from the `fit_html` (output of the filter) rather than `cleaned_html`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||||
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||||
|
|
||||||
|
async def md_forced_from_fit_html():
|
||||||
|
pruning_filter = PruningContentFilter()
|
||||||
|
# Explicitly set content_source to "fit_html"
|
||||||
|
md_generator = DefaultMarkdownGenerator(
|
||||||
|
content_filter=pruning_filter,
|
||||||
|
content_source="fit_html"
|
||||||
|
)
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
markdown_generator=md_generator,
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="https://news.ycombinator.com", config=config)
|
||||||
|
if result.success and result.markdown:
|
||||||
|
print("--- Raw Markdown (forced from fit_html - First 300 chars) ---")
|
||||||
|
# This raw_markdown is now generated from the output of PruningFilter
|
||||||
|
print(result.markdown.raw_markdown[:300])
|
||||||
|
print("\n--- Fit HTML (Source for Raw Markdown - First 300 chars) ---")
|
||||||
|
print(result.markdown.fit_html[:300])
|
||||||
|
print("\n--- Fit Markdown (should be same as Raw Markdown here - First 300 chars) ---")
|
||||||
|
print(result.markdown.fit_markdown[:300])
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(md_forced_from_fit_html())
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### 5.5. Example: Markdown generation when no filter is active.
|
||||||
|
If no `content_filter` is provided to `DefaultMarkdownGenerator`, `fit_markdown` and `fit_html` will be empty or None.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||||
|
|
||||||
|
async def md_no_filter():
|
||||||
|
md_generator = DefaultMarkdownGenerator() # No filter provided
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
markdown_generator=md_generator,
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="https://example.com", config=config)
|
||||||
|
if result.success and result.markdown:
|
||||||
|
print("--- Raw Markdown (First 300 chars) ---")
|
||||||
|
print(result.markdown.raw_markdown[:300])
|
||||||
|
print("\n--- Fit Markdown (Expected: None or empty) ---")
|
||||||
|
print(result.markdown.fit_markdown)
|
||||||
|
print("\n--- Fit HTML (Expected: None or empty) ---")
|
||||||
|
print(result.markdown.fit_html)
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(md_no_filter())
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Understanding `MarkdownGenerationResult` Output Fields
|
||||||
|
|
||||||
|
### 6.1. Example: Accessing all fields of `MarkdownGenerationResult`.
|
||||||
|
This example demonstrates how to access all the different Markdown and HTML outputs available in the `MarkdownGenerationResult` object.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||||
|
from crawl4ai.content_filter_strategy import PruningContentFilter # Using a filter to populate fit_html/fit_markdown
|
||||||
|
|
||||||
|
async def access_all_markdown_fields():
|
||||||
|
# Setup with a filter to ensure fit_html and fit_markdown are generated
|
||||||
|
content_filter = PruningContentFilter()
|
||||||
|
md_generator = DefaultMarkdownGenerator(
|
||||||
|
content_filter=content_filter,
|
||||||
|
content_source="cleaned_html" # raw_markdown will be from cleaned_html
|
||||||
|
)
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
markdown_generator=md_generator,
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
# Using a content-rich page
|
||||||
|
result = await crawler.arun(url="https://en.wikipedia.org/wiki/Python_(programming_language)", config=config)
|
||||||
|
if result.success and result.markdown:
|
||||||
|
md_result = result.markdown
|
||||||
|
|
||||||
|
print("--- Accessing MarkdownGenerationResult Fields ---")
|
||||||
|
|
||||||
|
print(f"\n1. Raw Markdown (from '{md_generator.content_source}' - snippet):")
|
||||||
|
print(md_result.raw_markdown[:300] + "...")
|
||||||
|
|
||||||
|
print(f"\n2. Markdown with Citations (snippet):")
|
||||||
|
print(md_result.markdown_with_citations[:300] + "...")
|
||||||
|
|
||||||
|
print(f"\n3. References Markdown (snippet):")
|
||||||
|
print(md_result.references_markdown[:200] + "...")
|
||||||
|
|
||||||
|
print(f"\n4. Fit HTML (from ContentFilter - snippet):")
|
||||||
|
if md_result.fit_html:
|
||||||
|
print(md_result.fit_html[:300] + "...")
|
||||||
|
else:
|
||||||
|
print("None (No filter or filter produced no output)")
|
||||||
|
|
||||||
|
print(f"\n5. Fit Markdown (from fit_html - snippet):")
|
||||||
|
if md_result.fit_markdown:
|
||||||
|
print(md_result.fit_markdown[:300] + "...")
|
||||||
|
else:
|
||||||
|
print("None (No filter or filter produced no output)")
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(access_all_markdown_fields())
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Advanced and Specific Scenarios
|
||||||
|
|
||||||
|
### 7.1. Example: Handling HTML with complex table structures.
|
||||||
|
`DefaultMarkdownGenerator` (via `html2text`) attempts to render HTML tables into Markdown tables.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
def markdown_for_tables():
|
||||||
|
generator = DefaultMarkdownGenerator()
|
||||||
|
html_content = """
|
||||||
|
<html><body>
|
||||||
|
<h3>Product Comparison</h3>
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr><th>Feature</th><th>Product A</th><th>Product B</th></tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>Price</td><td>$100</td><td>$120</td></tr>
|
||||||
|
<tr><td>Rating</td><td>4.5 stars</td><td>4.2 stars</td></tr>
|
||||||
|
<tr><td>Multi-row<br/>Feature</td><td colspan="2">Supported by Both</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</body></html>
|
||||||
|
"""
|
||||||
|
result_md = generator.generate_markdown(input_html=html_content)
|
||||||
|
|
||||||
|
print("--- Markdown for Table ---")
|
||||||
|
print(result_md.raw_markdown)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
markdown_for_tables()
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### 7.2. Example: Handling HTML with code blocks.
|
||||||
|
Code blocks are generally preserved in Markdown format.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
def markdown_for_code_blocks():
|
||||||
|
generator = DefaultMarkdownGenerator()
|
||||||
|
html_content = """
|
||||||
|
<html><body>
|
||||||
|
<p>Here is some Python code:</p>
|
||||||
|
<pre><code class="language-python">
|
||||||
|
def greet(name):
|
||||||
|
print(f"Hello, {name}!")
|
||||||
|
|
||||||
|
greet("World")
|
||||||
|
</code></pre>
|
||||||
|
<p>And an inline <code>example_function()</code>.</p>
|
||||||
|
</body></html>
|
||||||
|
"""
|
||||||
|
result_md = generator.generate_markdown(input_html=html_content)
|
||||||
|
|
||||||
|
print("--- Markdown for Code Blocks ---")
|
||||||
|
print(result_md.raw_markdown)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
markdown_for_code_blocks()
|
||||||
|
```
|
||||||
|
---
|
||||||
|
|
||||||
|
### 7.3. Example: Using a custom `MarkdownGenerationStrategy` (conceptual).
|
||||||
|
You can create your own Markdown generation logic by subclassing `MarkdownGenerationStrategy`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import (
|
||||||
|
AsyncWebCrawler, CrawlerRunConfig, CacheMode,
|
||||||
|
MarkdownGenerationStrategy, MarkdownGenerationResult
|
||||||
|
)
|
||||||
|
|
||||||
|
# Define a minimal custom Markdown generator
|
||||||
|
class CustomMarkdownGenerator(MarkdownGenerationStrategy):
|
||||||
|
def __init__(self, prefix="CUSTOM MD: ", **kwargs):
|
||||||
|
super().__init__(**kwargs) # Pass along any other options
|
||||||
|
self.prefix = prefix
|
||||||
|
|
||||||
|
def generate_markdown(
|
||||||
|
self,
|
||||||
|
input_html: str,
|
||||||
|
base_url: str = "",
|
||||||
|
html2text_options: dict = None, # Can be used by html2text
|
||||||
|
citations: bool = True, # Standard param
|
||||||
|
**kwargs # For other potential strategy-specific params
|
||||||
|
) -> MarkdownGenerationResult:
|
||||||
|
# Simplified custom logic: just prefix and take a snippet
|
||||||
|
# A real custom generator would do more sophisticated parsing/conversion
|
||||||
|
custom_raw_md = self.prefix + input_html[:100].strip() + "..."
|
||||||
|
|
||||||
|
# For simplicity, we'll just return the custom raw markdown for all fields
|
||||||
|
return MarkdownGenerationResult(
|
||||||
|
raw_markdown=custom_raw_md,
|
||||||
|
markdown_with_citations=custom_raw_md, # No real citation logic here
|
||||||
|
references_markdown="",
|
||||||
|
fit_markdown=None, # Not implementing filtering here
|
||||||
|
fit_html=None
|
||||||
|
)
|
||||||
|
|
||||||
|
async def use_custom_markdown_generator():
|
||||||
|
custom_generator = CustomMarkdownGenerator(prefix="[MyGenerator Says]: ")
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
markdown_generator=custom_generator,
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="https://example.com", config=config)
|
||||||
|
if result.success and result.markdown:
|
||||||
|
print("--- Output from CustomMarkdownGenerator ---")
|
||||||
|
print(result.markdown.raw_markdown)
|
||||||
|
# Since our custom generator doesn't really do citations or filtering:
|
||||||
|
print(f"Citations: '{result.markdown.markdown_with_citations}'")
|
||||||
|
print(f"References: '{result.markdown.references_markdown}'")
|
||||||
|
print(f"Fit Markdown: '{result.markdown.fit_markdown}'")
|
||||||
|
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(use_custom_markdown_generator())
|
||||||
|
```
|
||||||
|
---
|
||||||
|
**End of Examples Document**
|
||||||
|
```
|
||||||
@@ -0,0 +1,228 @@
|
|||||||
|
Okay, I have processed the instructions and the provided information. I will now generate the detailed "Foundational Memory" document for the `markdown` component of `crawl4ai`, adhering strictly to the outline you've provided and focusing on factual API details.
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
# Detailed Outline for crawl4ai - markdown Component
|
||||||
|
|
||||||
|
**Target Document Type:** memory
|
||||||
|
**Target Output Filename Suggestion:** `llm_memory_markdown.md`
|
||||||
|
**Library Version Context:** 0.6.3
|
||||||
|
**Outline Generation Date:** 2025-05-24
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Introduction to Markdown Generation in Crawl4ai
|
||||||
|
|
||||||
|
* 1.1. Purpose: This section outlines the `markdown` component of the `crawl4ai` library. Its primary role is to convert HTML content, obtained during web crawling, into various Markdown formats. These formats are designed to be suitable for consumption by Large Language Models (LLMs), as well as for other applications requiring structured text from web pages.
|
||||||
|
* 1.2. Key Abstractions:
|
||||||
|
* `MarkdownGenerationStrategy`: An abstract base class that defines the interface for different markdown generation algorithms and approaches. This allows for customizable Markdown conversion processes.
|
||||||
|
* `DefaultMarkdownGenerator`: The standard, out-of-the-box implementation of `MarkdownGenerationStrategy`. It handles the conversion of HTML to Markdown, including features like link-to-citation conversion and integration with content filtering.
|
||||||
|
* `MarkdownGenerationResult`: A Pydantic data model that encapsulates the various outputs of the markdown generation process, such as raw markdown, markdown with citations, and markdown derived from filtered content.
|
||||||
|
* `CrawlerRunConfig.markdown_generator`: An attribute within the `CrawlerRunConfig` class that allows users to specify which instance of a `MarkdownGenerationStrategy` should be used for a particular crawl operation.
|
||||||
|
* 1.3. Relationship with Content Filtering: The markdown generation process can be integrated with `RelevantContentFilter` strategies. When a content filter is applied, it first refines the input HTML, and then this filtered HTML is used to produce a `fit_markdown` output, providing a more focused version of the content.
|
||||||
|
|
||||||
|
## 2. Core Interface: `MarkdownGenerationStrategy`
|
||||||
|
|
||||||
|
* 2.1. Purpose: The `MarkdownGenerationStrategy` class is an abstract base class (ABC) that defines the contract for all markdown generation strategies within `crawl4ai`. It ensures that any custom markdown generator will adhere to a common interface, making them pluggable into the crawling process.
|
||||||
|
* 2.2. Source File: `crawl4ai/markdown_generation_strategy.py`
|
||||||
|
* 2.3. Initialization (`__init__`)
|
||||||
|
* 2.3.1. Signature:
|
||||||
|
```python
|
||||||
|
class MarkdownGenerationStrategy(ABC):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
content_filter: Optional[RelevantContentFilter] = None,
|
||||||
|
options: Optional[Dict[str, Any]] = None,
|
||||||
|
verbose: bool = False,
|
||||||
|
content_source: str = "cleaned_html",
|
||||||
|
):
|
||||||
|
# ...
|
||||||
|
```
|
||||||
|
* 2.3.2. Parameters:
|
||||||
|
* `content_filter (Optional[RelevantContentFilter]`, default: `None`)`: An optional `RelevantContentFilter` instance. If provided, this filter will be used to process the HTML before generating the `fit_markdown` and `fit_html` outputs in the `MarkdownGenerationResult`.
|
||||||
|
* `options (Optional[Dict[str, Any]]`, default: `None`)`: A dictionary for strategy-specific custom options. This allows subclasses to receive additional configuration parameters. Defaults to an empty dictionary if `None`.
|
||||||
|
* `verbose (bool`, default: `False`)`: If `True`, enables verbose logging for the markdown generation process.
|
||||||
|
* `content_source (str`, default: `"cleaned_html"`)`: A string indicating the source of HTML to use for Markdown generation. Common values might include `"raw_html"` (original HTML from the page), `"cleaned_html"` (HTML after initial cleaning by the scraping strategy), or `"fit_html"` (HTML after being processed by `content_filter`). The actual available sources depend on the `ScrapingResult` provided to the markdown generator.
|
||||||
|
* 2.4. Abstract Methods:
|
||||||
|
* 2.4.1. `generate_markdown(self, input_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult`
|
||||||
|
* Purpose: This abstract method must be implemented by concrete subclasses. It is responsible for taking an HTML string and converting it into various Markdown representations, encapsulated within a `MarkdownGenerationResult` object.
|
||||||
|
* Parameters:
|
||||||
|
* `input_html (str)`: The HTML string content to be converted to Markdown.
|
||||||
|
* `base_url (str`, default: `""`)`: The base URL of the crawled page. This is crucial for resolving relative URLs, especially when converting links to citations.
|
||||||
|
* `html2text_options (Optional[Dict[str, Any]]`, default: `None`)`: A dictionary of options to be passed to the underlying HTML-to-text conversion engine (e.g., `CustomHTML2Text`).
|
||||||
|
* `content_filter (Optional[RelevantContentFilter]`, default: `None`)`: An optional `RelevantContentFilter` instance. If provided, this filter is used to generate `fit_markdown` and `fit_html`. This parameter overrides any filter set during the strategy's initialization for this specific call.
|
||||||
|
* `citations (bool`, default: `True`)`: A boolean flag indicating whether to convert Markdown links into a citation format (e.g., `[text]^[1]^`) with a corresponding reference list.
|
||||||
|
* `**kwargs`: Additional keyword arguments to allow for future extensions or strategy-specific parameters.
|
||||||
|
* Returns: (`MarkdownGenerationResult`) An object containing the results of the Markdown generation, including `raw_markdown`, `markdown_with_citations`, `references_markdown`, and potentially `fit_markdown` and `fit_html`.
|
||||||
|
|
||||||
|
## 3. Default Implementation: `DefaultMarkdownGenerator`
|
||||||
|
|
||||||
|
* 3.1. Purpose: `DefaultMarkdownGenerator` is the standard concrete implementation of `MarkdownGenerationStrategy`. It provides a robust mechanism for converting HTML to Markdown, featuring link-to-citation conversion and the ability to integrate with `RelevantContentFilter` strategies for focused content output.
|
||||||
|
* 3.2. Source File: `crawl4ai/markdown_generation_strategy.py`
|
||||||
|
* 3.3. Inheritance: Inherits from `MarkdownGenerationStrategy`.
|
||||||
|
* 3.4. Initialization (`__init__`)
|
||||||
|
* 3.4.1. Signature:
|
||||||
|
```python
|
||||||
|
class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
content_filter: Optional[RelevantContentFilter] = None,
|
||||||
|
options: Optional[Dict[str, Any]] = None,
|
||||||
|
# content_source and verbose parameters from the parent are accepted
|
||||||
|
**kwargs,  # forwarded to super().__init__ (carries content_source and verbose)
|
||||||
|
):
|
||||||
|
super().__init__(content_filter, options, content_source=kwargs.get("content_source", "cleaned_html"), verbose=kwargs.get("verbose", False))
|
||||||
|
```
|
||||||
|
*(Note: The provided code snippet for `DefaultMarkdownGenerator.__init__` does not explicitly list `verbose` and `content_source`, but they are passed to `super().__init__` through `**kwargs` in the actual library code, so their effective signature matches the parent.)*
|
||||||
|
* 3.4.2. Parameters:
|
||||||
|
* `content_filter (Optional[RelevantContentFilter]`, default: `None`)`: As defined in `MarkdownGenerationStrategy`.
|
||||||
|
* `options (Optional[Dict[str, Any]]`, default: `None`)`: As defined in `MarkdownGenerationStrategy`.
|
||||||
|
* `verbose (bool`, default: `False`)`: (Passed via `kwargs` to parent) As defined in `MarkdownGenerationStrategy`.
|
||||||
|
* `content_source (str`, default: `"cleaned_html"`)`: (Passed via `kwargs` to parent) As defined in `MarkdownGenerationStrategy`.
|
||||||
|
* 3.5. Key Class Attributes:
|
||||||
|
* 3.5.1. `LINK_PATTERN (re.Pattern)`: A compiled regular expression pattern used to find Markdown links and images. The pattern is `r'!?\[(.[^\]]*)\]\(([^)]*?)(?:\s*\"(.*)\")?\)'` — the optional leading `!` lets it match image syntax (`![alt](url)`) as well as plain links (`[text](url)`).
|
||||||
|
* 3.6. Key Public Methods:
|
||||||
|
* 3.6.1. `generate_markdown(self, input_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult`
|
||||||
|
* Purpose: Implements the conversion of HTML to Markdown. It uses `CustomHTML2Text` for the base conversion, handles link-to-citation transformation, and integrates with an optional `RelevantContentFilter` to produce `fit_markdown`.
|
||||||
|
* Parameters:
|
||||||
|
* `input_html (str)`: The HTML content to convert.
|
||||||
|
* `base_url (str`, default: `""`)`: Base URL for resolving relative links.
|
||||||
|
* `html2text_options (Optional[Dict[str, Any]]`, default: `None`)`: Options for the `CustomHTML2Text` converter. If not provided, it uses `self.options`.
|
||||||
|
* `content_filter (Optional[RelevantContentFilter]`, default: `None`)`: Overrides the instance's `content_filter` for this call.
|
||||||
|
* `citations (bool`, default: `True`)`: Whether to convert links to citations.
|
||||||
|
* `**kwargs`: Additional arguments (not currently used by this specific implementation beyond parent class).
|
||||||
|
* Core Logic:
|
||||||
|
1. Instantiates `CustomHTML2Text` using `base_url` and the resolved `html2text_options` (merged from method arg, `self.options`, and defaults).
|
||||||
|
2. Converts `input_html` to `raw_markdown` using the `CustomHTML2Text` instance.
|
||||||
|
3. If `citations` is `True`, calls `self.convert_links_to_citations(raw_markdown, base_url)` to get `markdown_with_citations` and `references_markdown`.
|
||||||
|
4. If `citations` is `False`, `markdown_with_citations` is set to `raw_markdown`, and `references_markdown` is an empty string.
|
||||||
|
5. Determines the active `content_filter` (parameter or instance's `self.content_filter`).
|
||||||
|
6. If an active `content_filter` exists:
|
||||||
|
* Calls `active_filter.filter_content(input_html)` to get a list of filtered HTML strings.
|
||||||
|
* Joins these strings with `\n` and wraps them in `<div>` tags to form `fit_html`.
|
||||||
|
* Uses a new `CustomHTML2Text` instance to convert `fit_html` into `fit_markdown`.
|
||||||
|
7. Otherwise, `fit_html` and `fit_markdown` are set to `None` (or empty strings based on implementation details).
|
||||||
|
8. Constructs and returns a `MarkdownGenerationResult` object with all generated Markdown variants.
|
||||||
|
* 3.6.2. `convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]`
|
||||||
|
* Purpose: Transforms standard Markdown links within the input `markdown` string into a citation format (e.g., `[Link Text]^[1]^`) and generates a corresponding numbered list of references.
|
||||||
|
* Parameters:
|
||||||
|
* `markdown (str)`: The input Markdown string.
|
||||||
|
* `base_url (str`, default: `""`)`: The base URL used to resolve relative link URLs before they are added to the reference list.
|
||||||
|
* Returns: (`Tuple[str, str]`) A tuple where the first element is the Markdown string with links converted to citations, and the second element is a string containing the formatted list of references.
|
||||||
|
* Internal Logic:
|
||||||
|
* Uses the `LINK_PATTERN` regex to find all Markdown links.
|
||||||
|
* For each link, it resolves the URL using `fast_urljoin(base_url, url)` if `base_url` is provided and the link is relative.
|
||||||
|
* Assigns a unique citation number to each unique URL.
|
||||||
|
* Replaces the original link markup with the citation format (e.g., `[Text]^[Number]^`).
|
||||||
|
* Constructs a Markdown formatted reference list string.
|
||||||
|
* 3.7. Role of `CustomHTML2Text`:
|
||||||
|
* `CustomHTML2Text` is a customized version of an HTML-to-Markdown converter, likely based on the `html2text` library.
|
||||||
|
* It's instantiated by `DefaultMarkdownGenerator` to perform the core HTML to plain Markdown conversion.
|
||||||
|
* Its behavior is controlled by options passed via `html2text_options` in `generate_markdown` or `self.options` of the `DefaultMarkdownGenerator`. These options can include `body_width`, `ignore_links`, `ignore_images`, etc., influencing the final Markdown output. (Refer to `crawl4ai/html2text.py` for specific options).
|
||||||
|
|
||||||
|
## 4. Output Data Model: `MarkdownGenerationResult`
|
||||||
|
|
||||||
|
* 4.1. Purpose: `MarkdownGenerationResult` is a Pydantic `BaseModel` designed to structure and encapsulate the various Markdown outputs generated by any `MarkdownGenerationStrategy`. It provides a consistent way to access different versions of the converted content.
|
||||||
|
* 4.2. Source File: `crawl4ai/models.py`
|
||||||
|
* 4.3. Fields:
|
||||||
|
* 4.3.1. `raw_markdown (str)`: The direct result of converting the input HTML to Markdown, before any citation processing or specific content filtering (by the generator itself) is applied. This represents the most basic Markdown version of the content.
|
||||||
|
* 4.3.2. `markdown_with_citations (str)`: Markdown content where hyperlinks have been converted into a citation style (e.g., `[Link Text]^[1]^`). This is typically derived from `raw_markdown`.
|
||||||
|
* 4.3.3. `references_markdown (str)`: A string containing a formatted list of references (e.g., numbered list of URLs) corresponding to the citations found in `markdown_with_citations`.
|
||||||
|
* 4.3.4. `fit_markdown (Optional[str]`, default: `None`)`: Markdown content generated from HTML that has been processed by a `RelevantContentFilter`. This version is intended to be more concise or focused on relevant parts of the original content. It is `None` if no content filter was applied or if the filter resulted in no content.
|
||||||
|
* 4.3.5. `fit_html (Optional[str]`, default: `None`)`: The HTML content that remains after being processed by a `RelevantContentFilter`. `fit_markdown` is generated from this `fit_html`. It is `None` if no content filter was applied or if the filter resulted in no content.
|
||||||
|
* 4.4. Methods:
|
||||||
|
* 4.4.1. `__str__(self) -> str`:
|
||||||
|
* Purpose: Defines the string representation of a `MarkdownGenerationResult` object.
|
||||||
|
* Signature: `__str__(self) -> str`
|
||||||
|
* Returns: (`str`) The content of the `raw_markdown` field.
|
||||||
|
|
||||||
|
## 5. Integration with Content Filtering (`RelevantContentFilter`)
|
||||||
|
|
||||||
|
* 5.1. Purpose of Integration: `DefaultMarkdownGenerator` allows integration with `RelevantContentFilter` strategies to produce a `fit_markdown` output. This enables generating Markdown from a version of the HTML that has been refined or focused based on relevance criteria defined by the filter (e.g., keywords, semantic similarity, or LLM-based assessment).
|
||||||
|
* 5.2. Mechanism:
|
||||||
|
* A `RelevantContentFilter` instance can be passed to `DefaultMarkdownGenerator` either during its initialization (via the `content_filter` parameter) or directly to its `generate_markdown` method. The filter passed to `generate_markdown` takes precedence if both are provided.
|
||||||
|
* When an active filter is present, `DefaultMarkdownGenerator.generate_markdown` calls the filter's `filter_content(input_html)` method. This method is expected to return a list of HTML string chunks deemed relevant.
|
||||||
|
* These chunks are then joined (typically with `\n` and wrapped in `<div>` tags) to form the `fit_html` string.
|
||||||
|
* This `fit_html` is then converted to Markdown using `CustomHTML2Text`, and the result is stored as `fit_markdown`.
|
||||||
|
* 5.3. Impact on `MarkdownGenerationResult`:
|
||||||
|
* If a `RelevantContentFilter` is successfully used:
|
||||||
|
* `MarkdownGenerationResult.fit_markdown` will contain the Markdown derived from the filtered HTML.
|
||||||
|
* `MarkdownGenerationResult.fit_html` will contain the actual filtered HTML string.
|
||||||
|
* If no filter is used, or if the filter returns an empty list of chunks (indicating no content passed the filter), `fit_markdown` and `fit_html` will be `None` (or potentially empty strings, depending on the exact implementation details of joining an empty list).
|
||||||
|
* 5.4. Supported Filter Types (High-Level Mention):
|
||||||
|
* `PruningContentFilter`: A filter that likely removes irrelevant HTML sections based on predefined rules or structural analysis (e.g., removing common boilerplate like headers, footers, navbars).
|
||||||
|
* `BM25ContentFilter`: A filter that uses the BM25 ranking algorithm to score and select HTML chunks based on their relevance to a user-provided query.
|
||||||
|
* `LLMContentFilter`: A filter that leverages a Large Language Model to assess the relevance of HTML chunks, potentially based on a user query or a general understanding of content importance.
|
||||||
|
* *Note: Detailed descriptions and usage of each filter strategy are covered in their respective documentation sections.*
|
||||||
|
|
||||||
|
## 6. Configuration via `CrawlerRunConfig`
|
||||||
|
|
||||||
|
* 6.1. `CrawlerRunConfig.markdown_generator`
|
||||||
|
* Purpose: This attribute of the `CrawlerRunConfig` class allows a user to specify a custom `MarkdownGenerationStrategy` instance to be used for the markdown conversion phase of a crawl. This provides flexibility in how HTML content is transformed into Markdown.
|
||||||
|
* Type: `MarkdownGenerationStrategy` (accepts any concrete implementation of this ABC).
|
||||||
|
* Default Value: If not specified, an instance of `DefaultMarkdownGenerator()` is used by default within the `AsyncWebCrawler`'s `aprocess_html` method when `config.markdown_generator` is `None`.
|
||||||
|
* Usage Example:
|
||||||
|
```python
|
||||||
|
from crawl4ai import CrawlerRunConfig, DefaultMarkdownGenerator, AsyncWebCrawler
|
||||||
|
from crawl4ai.content_filter_strategy import BM25ContentFilter
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
# Example: Configure a markdown generator with a BM25 filter
|
||||||
|
bm25_filter = BM25ContentFilter(user_query="Python programming language")
|
||||||
|
custom_md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
|
||||||
|
|
||||||
|
run_config_with_custom_md = CrawlerRunConfig(
|
||||||
|
markdown_generator=custom_md_generator,
|
||||||
|
# Other run configurations...
|
||||||
|
)
|
||||||
|
|
||||||
|
async def example_crawl():
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
url="https://en.wikipedia.org/wiki/Python_(programming_language)",
|
||||||
|
config=run_config_with_custom_md
|
||||||
|
)
|
||||||
|
if result.success and result.markdown:
|
||||||
|
print("Raw Markdown (snippet):", result.markdown.raw_markdown[:200])
|
||||||
|
if result.markdown.fit_markdown:
|
||||||
|
print("Fit Markdown (snippet):", result.markdown.fit_markdown[:200])
|
||||||
|
|
||||||
|
# asyncio.run(example_crawl())
|
||||||
|
```
|
||||||
|
|
||||||
|
## 7. Influencing Markdown Output for LLM Consumption
|
||||||
|
|
||||||
|
* 7.1. Role of `DefaultMarkdownGenerator.options` and `html2text_options`:
|
||||||
|
* The `options` parameter in `DefaultMarkdownGenerator.__init__` and the `html2text_options` parameter in its `generate_markdown` method are used to pass configuration settings directly to the underlying `CustomHTML2Text` instance.
|
||||||
|
* `html2text_options` provided to `generate_markdown` will take precedence over `self.options` set during initialization.
|
||||||
|
* These options control various aspects of the HTML-to-Markdown conversion, such as line wrapping, handling of links, images, and emphasis, which can be crucial for preparing text for LLMs.
|
||||||
|
* 7.2. Key `CustomHTML2Text` Options (via `html2text_options` or `DefaultMarkdownGenerator.options`):
|
||||||
|
* `bodywidth (int`, default: `0` when `DefaultMarkdownGenerator` calls `CustomHTML2Text` for `raw_markdown` and `fit_markdown` if not otherwise specified): Determines the width for wrapping lines. A value of `0` disables line wrapping, which is often preferred for LLM processing as it preserves sentence structure across lines.
|
||||||
|
* `ignore_links (bool`, default: `False` in `CustomHTML2Text`): If `True`, all hyperlinks (`<a>` tags) are removed from the output, leaving only their anchor text.
|
||||||
|
* `ignore_images (bool`, default: `False` in `CustomHTML2Text`): If `True`, all image tags (`<img>`) are removed from the output.
|
||||||
|
* `ignore_emphasis (bool`, default: `False` in `CustomHTML2Text`): If `True`, emphasized text (e.g., `<em>`, `<strong>`) is rendered as plain text without Markdown emphasis characters (like `*` or `_`).
|
||||||
|
* `bypass_tables (bool`, default: `False` in `CustomHTML2Text`)`: If `True`, tables are not converted to Markdown table syntax but are instead passed through in their original HTML form, which may be preferable when downstream consumers can parse HTML tables directly.
|
||||||
|
* `default_image_alt (str`, default: `""` in `CustomHTML2Text`): Specifies a default alt text for images that do not have an `alt` attribute.
|
||||||
|
* `protect_links (bool`, default: `False` in `CustomHTML2Text`)`: If `True`, link URLs are protected from being broken across lines during wrapping (they are surrounded with angle brackets).
|
||||||
|
* `single_line_break (bool`, default: `True` in `CustomHTML2Text`)`: If `True`, a single newline is emitted after block elements instead of two, producing more compact output. (html2text expects `body_width=0` for this option to behave as intended.)
|
||||||
|
* `mark_code (bool`, default: `True` in `CustomHTML2Text`): If `True`, `<code>` and `<pre>` blocks are appropriately marked in Markdown.
|
||||||
|
* `escape_snob (bool`, default: `False` in `CustomHTML2Text`): If `True`, more aggressive escaping of special Markdown characters is performed.
|
||||||
|
* *Note: This list is based on common `html2text` options; refer to `crawl4ai/html2text.py` for the exact implementation and default behaviors within `CustomHTML2Text`.*
|
||||||
|
* 7.3. Impact of `citations (bool)` in `generate_markdown`:
|
||||||
|
* When `citations=True` (default in `DefaultMarkdownGenerator.generate_markdown`):
|
||||||
|
* Standard Markdown links `[text](url)` are converted to `[text]^[citation_number]^`.
|
||||||
|
* A `references_markdown` string is generated, listing all unique URLs with their corresponding citation numbers. This helps LLMs trace information back to its source and can reduce token count if URLs are long or repetitive.
|
||||||
|
* When `citations=False`:
|
||||||
|
* Links remain in their original Markdown format `[text](url)`.
|
||||||
|
* `references_markdown` will be an empty string.
|
||||||
|
* This might be preferred if the LLM needs to directly process the URLs or if the citation format is not desired.
|
||||||
|
* 7.4. Role of `content_source` in `MarkdownGenerationStrategy`:
|
||||||
|
* This parameter (defaulting to `"cleaned_html"` in `DefaultMarkdownGenerator`) specifies which HTML version is used as the primary input for the `generate_markdown` method.
|
||||||
|
* `"cleaned_html"`: Typically refers to HTML that has undergone initial processing by the `ContentScrapingStrategy` (e.g., removal of scripts, styles, and potentially some boilerplate based on the scraping strategy's rules). This is usually the recommended source for general Markdown conversion.
|
||||||
|
* `"raw_html"`: The original, unmodified HTML content fetched from the web page. Using this source would bypass any initial cleaning done by the scraping strategy.
|
||||||
|
* `"fit_html"`: This source is relevant when a `RelevantContentFilter` is used. `fit_html` is the HTML output *after* the `RelevantContentFilter` has processed the `input_html` (which itself is determined by `content_source`). If `content_source` is, for example, `"cleaned_html"`, then `fit_html` is the result of filtering that cleaned HTML. `fit_markdown` is then generated from this `fit_html`.
|
||||||
|
* 7.5. `fit_markdown` vs. `raw_markdown`/`markdown_with_citations`:
|
||||||
|
* `raw_markdown` (or `markdown_with_citations` if `citations=True`) is generated from the HTML specified by `content_source` (e.g., `"cleaned_html"`). It represents a general conversion of that source.
|
||||||
|
* `fit_markdown` is generated *only if* a `RelevantContentFilter` is active (either set in `DefaultMarkdownGenerator` or passed to `generate_markdown`). It is derived from the `fit_html` (the output of the content filter).
|
||||||
|
* **Choosing which to use for LLMs:**
|
||||||
|
* Use `fit_markdown` when you need a concise, highly relevant subset of the page's content tailored to a specific query or set of criteria defined by the filter. This can reduce noise and token count for the LLM.
|
||||||
|
* Use `raw_markdown` or `markdown_with_citations` when you need a more comprehensive representation of the page's textual content, or when no specific filtering criteria are applied.
|
||||||
|
```
|
||||||
@@ -0,0 +1,760 @@
|
|||||||
|
```markdown
|
||||||
|
# Detailed Outline for crawl4ai - markdown Component
|
||||||
|
|
||||||
|
**Target Document Type:** reasoning
|
||||||
|
**Target Output Filename Suggestion:** `llm_reasoning_markdown_generation.md`
|
||||||
|
**Library Version Context:** 0.6.3
|
||||||
|
**Outline Generation Date:** 2025-05-24
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Introduction to Markdown Generation in Crawl4AI
|
||||||
|
|
||||||
|
* 1.1. **Why Markdown Generation Matters for LLMs**
|
||||||
|
* 1.1.1. The role of clean, structured text for Large Language Model consumption.
|
||||||
|
* **Explanation:** LLMs perform significantly better when input data is well-structured and free of irrelevant noise (like HTML tags, scripts, or complex layouts not meant for textual understanding). Markdown, with its simple syntax, provides a human-readable and machine-parseable format that captures essential semantic structure (headings, lists, paragraphs, code blocks, tables) without the clutter of full HTML. This makes it easier for LLMs to understand the content's hierarchy, identify key information, and perform tasks like summarization, question-answering, or RAG (Retrieval Augmented Generation) more accurately and efficiently.
|
||||||
|
* 1.1.2. Benefits of Markdown: readability, structure preservation, common format.
|
||||||
|
* **Explanation:**
|
||||||
|
* **Readability:** Markdown is designed to be easily readable in its raw form, making it simple for developers and users to inspect and understand the crawled content.
|
||||||
|
* **Structure Preservation:** It effectively preserves the semantic structure of the original HTML (headings, lists, emphasis, etc.), which is crucial context for LLMs.
|
||||||
|
* **Common Format:** Markdown is a widely adopted standard, ensuring compatibility with a vast ecosystem of tools, editors, and LLM input pipelines.
|
||||||
|
* 1.1.3. How Crawl4AI's Markdown generation facilitates RAG and other LLM applications.
|
||||||
|
* **Explanation:** For RAG, Crawl4AI's Markdown output, especially when combined with content filtering, provides clean, relevant text chunks that can be easily embedded and indexed. This improves the quality of retrieved context for LLM prompts. For fine-tuning or direct prompting, the structured Markdown helps the LLM focus on the core content, leading to better quality responses and reducing token consumption by eliminating HTML overhead.
|
||||||
|
|
||||||
|
* 1.2. **Overview of Crawl4AI's Markdown Generation Pipeline**
|
||||||
|
* 1.2.1. High-level flow: HTML -> (Optional Filtering) -> Markdown Conversion -> (Optional Citation Handling).
|
||||||
|
* **Explanation:**
|
||||||
|
1. **Input HTML:** The process starts with either raw HTML from the crawled page or a cleaned/selected HTML segment.
|
||||||
|
2. **Optional Content Filtering:** Before Markdown conversion, a `RelevantContentFilter` can be applied to the HTML. This step aims to remove boilerplate, ads, or irrelevant sections, resulting in `fit_html`. This is crucial for generating `fit_markdown`.
|
||||||
|
3. **Markdown Conversion:** The selected HTML (either the original, cleaned, or filtered `fit_html`) is converted into Markdown using an underlying `html2text` library, specifically `CustomHTML2Text` in Crawl4AI for enhanced control.
|
||||||
|
4. **Optional Citation Handling:** If enabled, inline links in the generated Markdown are converted to a citation format (e.g., `[text]^[1]^`), and a separate list of references is created.
|
||||||
|
* 1.2.2. Key components involved: `MarkdownGenerationStrategy`, `DefaultMarkdownGenerator`, `CustomHTML2Text`, `RelevantContentFilter`.
|
||||||
|
* **Explanation:**
|
||||||
|
* **`MarkdownGenerationStrategy`:** An interface defining how Markdown should be generated. Allows for custom implementations.
|
||||||
|
* **`DefaultMarkdownGenerator`:** The standard implementation of `MarkdownGenerationStrategy`, using `CustomHTML2Text`. It orchestrates filtering (if provided) and citation handling.
|
||||||
|
* **`CustomHTML2Text`:** An enhanced version of the `html2text` library, providing fine-grained control over the HTML-to-Markdown conversion.
|
||||||
|
* **`RelevantContentFilter`:** An interface for strategies that filter HTML content before it's converted to Markdown, producing `fit_html` and consequently `fit_markdown`.
|
||||||
|
* 1.2.3. How `CrawlerRunConfig` ties these components together.
|
||||||
|
* **Explanation:** The `CrawlerRunConfig` object allows you to specify which `MarkdownGenerationStrategy` (and by extension, which filters and `CustomHTML2Text` options) should be used for a particular crawl run via its `markdown_generator` parameter. This provides run-specific control over the Markdown output.
|
||||||
|
|
||||||
|
* 1.3. **Goals of this Guide**
|
||||||
|
* 1.3.1. Understanding how to configure and customize Markdown output.
|
||||||
|
* **Explanation:** This guide will walk you through the various configuration options available, from choosing HTML sources and content filters to fine-tuning the `html2text` conversion itself.
|
||||||
|
* 1.3.2. Best practices for generating LLM-friendly Markdown.
|
||||||
|
* **Explanation:** We'll discuss tips and techniques to produce Markdown that is optimally structured and cleaned for consumption by Large Language Models.
|
||||||
|
* 1.3.3. Troubleshooting common Markdown generation issues.
|
||||||
|
* **Explanation:** We'll cover common problems encountered during Markdown generation (e.g., noisy output, missing content) and provide strategies for diagnosing and resolving them.
|
||||||
|
|
||||||
|
## 2. Core Concepts in Markdown Generation
|
||||||
|
|
||||||
|
* 2.1. **The `MarkdownGenerationStrategy` Interface**
|
||||||
|
* 2.1.1. **Purpose and Design Rationale:**
|
||||||
|
* Why use a strategy pattern for Markdown generation? (Flexibility, extensibility).
|
||||||
|
* **Explanation:** The strategy pattern allows Crawl4AI to define a common interface for Markdown generation while enabling different concrete implementations. This means users can easily swap out the default Markdown generator for a custom one without altering the core crawler logic. It promotes flexibility and makes the system extensible for future Markdown conversion needs or integration with other libraries.
|
||||||
|
* Core problem it solves: Decoupling Markdown generation logic from the crawler.
|
||||||
|
* **Explanation:** By abstracting Markdown generation into a strategy, the `AsyncWebCrawler` itself doesn't need to know the specifics of *how* Markdown is created. It simply delegates the task to the configured strategy. This separation of concerns makes the codebase cleaner and easier to maintain.
|
||||||
|
* 2.1.2. **When to Implement a Custom `MarkdownGenerationStrategy`:**
|
||||||
|
* Scenarios requiring completely different Markdown conversion logic.
|
||||||
|
* **Example:** If you need to convert HTML to a very specific dialect of Markdown not supported by `html2text`, or if you want to use a different underlying conversion library entirely.
|
||||||
|
* Integrating third-party Markdown conversion libraries.
|
||||||
|
* **Example:** If you prefer to use a library like `turndown` or `mistune` for its specific features or output style.
|
||||||
|
* Advanced pre/post-processing of Markdown.
|
||||||
|
* **Example:** If you need to perform complex transformations on the Markdown *after* initial generation, such as custom table formatting, complex footnote handling beyond standard citations, or domain-specific semantic tagging within the Markdown.
|
||||||
|
* 2.1.3. **How to Implement a Custom `MarkdownGenerationStrategy`:**
|
||||||
|
* Key methods to override (`generate_markdown`).
|
||||||
|
* **Explanation:** The primary method to implement is `generate_markdown(self, input_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult`. This method will receive the HTML (based on `content_source`), and it's responsible for returning a `MarkdownGenerationResult` object.
|
||||||
|
* Input parameters and expected output (`MarkdownGenerationResult`).
|
||||||
|
* **Explanation:** Your custom strategy will receive the `input_html`, the `base_url` (for resolving relative links if needed), `html2text_options` (which you can choose to use or ignore), an optional `content_filter`, and a `citations` flag. It must return an instance of `MarkdownGenerationResult` populated with the relevant Markdown strings.
|
||||||
|
* *Code Example:*
|
||||||
|
```python
|
||||||
|
from crawl4ai import MarkdownGenerationStrategy, MarkdownGenerationResult, RelevantContentFilter
|
||||||
|
from typing import Optional, Dict, Any
|
||||||
|
|
||||||
|
class MyCustomMarkdownStrategy(MarkdownGenerationStrategy):
|
||||||
|
def __init__(self, content_source: str = "cleaned_html", **kwargs):
|
||||||
|
super().__init__(content_source=content_source, **kwargs)
|
||||||
|
# Initialize any custom resources if needed
|
||||||
|
|
||||||
|
def generate_markdown(
|
||||||
|
self,
|
||||||
|
input_html: str,
|
||||||
|
base_url: str = "",
|
||||||
|
html2text_options: Optional[Dict[str, Any]] = None, # You can use or ignore these
|
||||||
|
content_filter: Optional[RelevantContentFilter] = None,
|
||||||
|
citations: bool = True, # You can decide how to handle this
|
||||||
|
**kwargs
|
||||||
|
) -> MarkdownGenerationResult:
|
||||||
|
|
||||||
|
# 1. Apply content filter if provided and desired
|
||||||
|
fit_html_output = ""
|
||||||
|
if content_filter:
|
||||||
|
# Assuming content_filter.filter_content returns a list of HTML strings
|
||||||
|
filtered_html_blocks = content_filter.filter_content(input_html)
|
||||||
|
fit_html_output = "\n".join(filtered_html_blocks)
|
||||||
|
|
||||||
|
# 2. Your custom HTML to Markdown conversion logic
|
||||||
|
# This is where you'd use your preferred library or custom logic
|
||||||
|
raw_markdown_text = f"# Custom Markdown for {base_url}\n\n{input_html[:200]}..." # Placeholder
|
||||||
|
|
||||||
|
markdown_with_citations_text = raw_markdown_text # Placeholder for citation logic
|
||||||
|
references_markdown_text = "" # Placeholder for references
|
||||||
|
|
||||||
|
# If you used a filter, also generate fit_markdown
|
||||||
|
fit_markdown_text = ""
|
||||||
|
if fit_html_output:
|
||||||
|
fit_markdown_text = f"# Custom Filtered Markdown\n\n{fit_html_output[:200]}..." # Placeholder
|
||||||
|
|
||||||
|
return MarkdownGenerationResult(
|
||||||
|
raw_markdown=raw_markdown_text,
|
||||||
|
markdown_with_citations=markdown_with_citations_text,
|
||||||
|
references_markdown=references_markdown_text,
|
||||||
|
fit_markdown=fit_markdown_text,
|
||||||
|
fit_html=fit_html_output
|
||||||
|
)
|
||||||
|
|
||||||
|
# Usage:
|
||||||
|
# custom_md_generator = MyCustomMarkdownStrategy()
|
||||||
|
# run_config = CrawlerRunConfig(markdown_generator=custom_md_generator)
|
||||||
|
```
|
||||||
|
* Common pitfalls when creating custom strategies.
|
||||||
|
* **Explanation:**
|
||||||
|
* Forgetting to handle all fields in `MarkdownGenerationResult` (even if some are empty strings).
|
||||||
|
* Incorrectly managing `base_url` for relative links if your custom converter doesn't handle it.
|
||||||
|
* Performance bottlenecks if your custom logic is inefficient.
|
||||||
|
* Not properly integrating with the `content_filter` if one is provided.
|
||||||
|
* 2.1.4. **Understanding `content_source` in `MarkdownGenerationStrategy`**
|
||||||
|
* 2.1.4.1. Purpose: What HTML source should be used for Markdown generation?
|
||||||
|
* **Explanation:** The `content_source` attribute of a `MarkdownGenerationStrategy` (including `DefaultMarkdownGenerator`) tells the strategy which version of the HTML to use as the primary input for generating `raw_markdown` and `markdown_with_citations`.
|
||||||
|
* 2.1.4.2. Available options: `"cleaned_html"`, `"raw_html"`, `"fit_html"`.
|
||||||
|
* **`"cleaned_html"` (Default):** This is the HTML after Crawl4AI's internal `ContentScrapingStrategy` (e.g., `WebScrapingStrategy` or `LXMLWebScrapingStrategy`) has processed it. This usually involves removing scripts, styles, and applying structural cleaning or selection based on `target_elements` or `css_selector` in `CrawlerRunConfig`.
|
||||||
|
* **`"raw_html"`:** The original, unmodified HTML fetched from the page. This is useful if you want to apply your own complete cleaning and Markdown conversion pipeline.
|
||||||
|
* **`"fit_html"`:** The HTML *after* a `RelevantContentFilter` (if provided to the `MarkdownGenerationStrategy`) has processed the input HTML (which would be `cleaned_html` or `raw_html` depending on the initial source). This option is powerful when you want Markdown generated *only* from the most relevant parts of the page.
|
||||||
|
* 2.1.4.3. **Decision Guide: Choosing the Right `content_source`**:
|
||||||
|
* **When to use `"cleaned_html"`:** This is the recommended default for most LLM use cases. It provides a good balance of structured content without excessive noise, as common boilerplate is often removed by the scraping strategy.
|
||||||
|
* **When to use `"raw_html"`:** Choose this if you need absolute control over the HTML input for your Markdown converter, or if Crawl4AI's default cleaning removes elements you wish to keep. Be aware that this might result in noisier Markdown.
|
||||||
|
* **When to use `"fit_html"`:** Opt for this when you are using a `RelevantContentFilter` with your `MarkdownGenerationStrategy` and you want the `raw_markdown` and `markdown_with_citations` to be based *only* on the filtered content. This is distinct from just using the `fit_markdown` field in the result, as it makes the filtered content the *primary* source for all main Markdown outputs.
|
||||||
|
* **Impact on performance and output quality:**
|
||||||
|
* `"raw_html"` might be slightly faster if Crawl4AI's cleaning is complex, but could lead to lower quality Markdown due to more noise.
|
||||||
|
* `"cleaned_html"` offers a good trade-off.
|
||||||
|
* `"fit_html"` depends on the performance of the `RelevantContentFilter` itself.
|
||||||
|
* 2.1.4.4. *Example Scenarios:*
|
||||||
|
* **General Summarization:** `"cleaned_html"` is usually best.
|
||||||
|
* **Highly Specific Q&A on a Section:** Use a `RelevantContentFilter` to produce `fit_html`, then set `content_source="fit_html"` (or just use the `fit_markdown` from the result if `raw_markdown` from `"cleaned_html"` is also desired).
|
||||||
|
* **Archiving Raw Structure:** `"raw_html"` might be chosen if the goal is to convert the entire, unmodified page structure to Markdown, perhaps for later, more nuanced processing.
|
||||||
|
|
||||||
|
* 2.2. **The `MarkdownGenerationResult` Model**
|
||||||
|
* 2.2.1. **Understanding its Purpose:** Why a structured result object?
|
||||||
|
* **Explanation:** A structured object like `MarkdownGenerationResult` is used instead of a single Markdown string to provide different views or versions of the generated Markdown, catering to various use cases. This allows users to pick the representation that best suits their needs (e.g., with or without citations, raw vs. filtered) without re-processing. It also clearly separates the main content from metadata like references or the intermediate `fit_html`.
|
||||||
|
* 2.2.2. **Deep Dive into `MarkdownGenerationResult` Fields:**
|
||||||
|
* `raw_markdown`:
|
||||||
|
* **What it is:** This is the direct, primary Markdown output generated from the `content_source` (e.g., `cleaned_html`) defined in the `MarkdownGenerationStrategy`. It does *not* have inline links converted to citation format.
|
||||||
|
* **How to use it:** Use this when you need the most "vanilla" Markdown, perhaps for LLMs that are sensitive to citation formats or if you plan to implement your own link/reference handling.
|
||||||
|
* **When it's useful:** For direct input to LLMs that don't require source attribution within the text, or as a base for further custom Markdown processing.
|
||||||
|
* `markdown_with_citations`:
|
||||||
|
* **What it is:** This takes the `raw_markdown` and converts its inline links (e.g., `[link text](http://example.com)`) into a citation format (e.g., `link text [^1^]`).
|
||||||
|
* **How it's generated:** The `DefaultMarkdownGenerator` (via `CustomHTML2Text`) scans `raw_markdown` for links, assigns unique numerical IDs to each unique URL, replaces the inline link with the text and citation marker, and populates `references_markdown`.
|
||||||
|
* **How to use it:** This is often the most useful Markdown for LLM tasks requiring RAG or for generating human-readable documents where sources are important. Combine it with `references_markdown`.
|
||||||
|
* *Example:*
|
||||||
|
```html
|
||||||
|
<!-- Input HTML fragment -->
|
||||||
|
<p>Crawl4AI is an <a href="https://github.com/unclecode/crawl4ai">open-source</a> library.</p>
|
||||||
|
```
|
||||||
|
```markdown
|
||||||
|
// Resulting markdown_with_citations (simplified)
|
||||||
|
Crawl4AI is an open-source [^1^] library.
|
||||||
|
```
|
||||||
|
* `references_markdown`:
|
||||||
|
* **What it is:** A separate Markdown string that lists all unique URLs found and converted to citations, formatted typically as a numbered list.
|
||||||
|
* **How to use it:** Append this string to the end of `markdown_with_citations` to create a complete document with a bibliography or reference section.
|
||||||
|
* **Why it's separate:** This provides flexibility. You can choose to display references at the end, in a sidebar, or not at all.
|
||||||
|
* *Example:*
|
||||||
|
```markdown
|
||||||
|
## References
|
||||||
|
|
||||||
|
[^1^]: https://github.com/unclecode/crawl4ai
|
||||||
|
```
|
||||||
|
* `fit_markdown`:
|
||||||
|
* **What it is:** This is Markdown generated *exclusively* from the `fit_html`. `fit_html` itself is the output of a `RelevantContentFilter` if one was provided to the `MarkdownGenerationStrategy`. If no filter was used, `fit_markdown` will likely be empty or reflect the `raw_markdown`.
|
||||||
|
* **How to use it:** When your primary goal is to feed an LLM with the most relevant, filtered content. This is excellent for tasks like generating concise summaries or providing highly focused context for RAG.
|
||||||
|
* **Relationship with `raw_markdown`:** If a filter is active, `fit_markdown` is based on a *subset* or *transformed version* of the HTML that `raw_markdown` was based on (assuming `content_source` wasn't `"fit_html"`). If `content_source` *was* `"fit_html"`, then `raw_markdown` and `fit_markdown` would be derived from the same filtered HTML, but `fit_markdown` might still undergo different processing if the strategy handles it distinctly.
|
||||||
|
* *Example:* Imagine a news article page. `raw_markdown` might contain the article, comments, ads, and navigation. If a `BM25ContentFilter` is used with a query about "stock market impact", `fit_markdown` would ideally only contain paragraphs related to that topic, stripped of other page elements.
|
||||||
|
* `fit_html`:
|
||||||
|
* **What it is:** The actual HTML string *after* a `RelevantContentFilter` (like `PruningContentFilter` or `LLMContentFilter`) has processed the input HTML. If no filter is applied, this field will be empty.
|
||||||
|
* **How to use it:** Primarily for debugging your content filters. You can inspect `fit_html` to see exactly what HTML content was deemed "relevant" by your filter before it was converted to `fit_markdown`. It can also be useful if you need this filtered HTML for purposes other than Markdown generation.
|
||||||
|
* **Why it's included:** It provides transparency into the filtering process and allows advanced users to work with the intermediate filtered HTML directly.
|
||||||
|
|
||||||
|
## 3. The `DefaultMarkdownGenerator` - Your Go-To Solution
|
||||||
|
|
||||||
|
* 3.1. **Understanding the `DefaultMarkdownGenerator`**
|
||||||
|
* 3.1.1. **Purpose and Design:** The `DefaultMarkdownGenerator` is Crawl4AI's standard, out-of-the-box mechanism for converting HTML content into various Markdown representations. It's designed to be a robust and generally applicable solution for most common use cases, especially when targeting LLM consumption.
|
||||||
|
* 3.1.2. Core Functionality: Its primary task is to orchestrate the HTML-to-Markdown conversion. It internally uses an instance of `CustomHTML2Text` (Crawl4AI's enhanced `html2text` wrapper) to perform the actual conversion.
|
||||||
|
* 3.1.3. How it handles citations and references by default.
|
||||||
|
* **Explanation:** If the `citations` parameter in its `generate_markdown` method is `True` (which it is by default), `DefaultMarkdownGenerator` will post-process the initially generated Markdown to convert inline links into citation markers (e.g., `[^1^]`) and generate a corresponding `references_markdown` block. This is done by its internal `CustomHTML2Text` instance.
|
||||||
|
|
||||||
|
* 3.2. **Configuring `DefaultMarkdownGenerator`**
|
||||||
|
* 3.2.1. **Initialization Options:**
|
||||||
|
* `content_filter (Optional[RelevantContentFilter])`:
|
||||||
|
* **Why use it:** To refine the HTML *before* it's converted to Markdown. This is essential if you want `fit_markdown` (and consequently `fit_html`) to contain only the most relevant parts of the page, leading to a more focused Markdown output.
|
||||||
|
* **How it integrates:** When `generate_markdown` is called, if a `content_filter` is present, `DefaultMarkdownGenerator` first passes the `input_html` (determined by `content_source`) to this filter. The filter returns a list of HTML strings (or a single string if merged). This filtered HTML becomes the `fit_html`. Then, `fit_markdown` is generated from this `fit_html`. The `raw_markdown` and `markdown_with_citations` are still generated from the original `content_source` unless `content_source` itself is set to `"fit_html"`.
|
||||||
|
* *Impact:* Directly influences `fit_markdown` and `fit_html` fields in `MarkdownGenerationResult`. Can significantly reduce the noise and improve the relevance of the final Markdown for LLMs.
|
||||||
|
* *Code Example:*
|
||||||
|
```python
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator, CrawlerRunConfig
|
||||||
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||||
|
|
||||||
|
# Initialize a filter
|
||||||
|
pruning_filter = PruningContentFilter(threshold_type="fixed", threshold=0.5)
|
||||||
|
|
||||||
|
# Initialize DefaultMarkdownGenerator with the filter
|
||||||
|
md_generator_with_filter = DefaultMarkdownGenerator(content_filter=pruning_filter)
|
||||||
|
|
||||||
|
# This generator will now produce 'fit_markdown' based on pruning.
|
||||||
|
# run_config = CrawlerRunConfig(markdown_generator=md_generator_with_filter)
|
||||||
|
# result = await crawler.arun(url="...", config=run_config)
|
||||||
|
# print(result.markdown.fit_markdown)
|
||||||
|
```
|
||||||
|
* `options (Optional[Dict[str, Any]])`:
|
||||||
|
* **What it is:** This dictionary allows you to pass configuration options directly to the underlying `CustomHTML2Text` instance. These options control the specifics of the HTML-to-Markdown conversion process.
|
||||||
|
* **How to use it:** Provide a dictionary where keys are `html2text` option names (e.g., `body_width`, `ignore_links`) and values are their desired settings.
|
||||||
|
* *See Section 6: Mastering `CustomHTML2Text` for detailed options.*
|
||||||
|
* `content_source (str)`:
|
||||||
|
* **Reiteration:** As discussed in section 2.1.4, this determines the primary HTML input for `raw_markdown` and `markdown_with_citations`.
|
||||||
|
* **How it interacts with `content_filter`:**
|
||||||
|
* If `content_source` is, for example, `"cleaned_html"` and a `content_filter` is also provided, the `content_filter` will process this `"cleaned_html"` to produce `fit_html`. The `fit_markdown` field in `MarkdownGenerationResult` will be based on this `fit_html`.
|
||||||
|
* However, `raw_markdown` and `markdown_with_citations` will still be based on the original `"cleaned_html"` (unless `content_source` was explicitly set to `"fit_html"`). This allows you to have both a "fuller" Markdown and a "filtered" Markdown from a single generation step.
|
||||||
|
|
||||||
|
* 3.3. **Common Workflows with `DefaultMarkdownGenerator`**
|
||||||
|
* 3.3.1. **Workflow: Generating Basic Markdown with Citations**
|
||||||
|
* Steps: Instantiate `DefaultMarkdownGenerator` (or use the crawler's default). The crawler calls its `generate_markdown` method. Access `result.markdown.markdown_with_citations` and `result.markdown.references_markdown`.
|
||||||
|
* *Code Example:*
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
async def basic_markdown_workflow():
|
||||||
|
# DefaultMarkdownGenerator is used implicitly if none is specified in CrawlerRunConfig
|
||||||
|
# Or explicitly:
|
||||||
|
md_generator = DefaultMarkdownGenerator()
|
||||||
|
run_config = CrawlerRunConfig(markdown_generator=md_generator)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="https://example.com", config=run_config)
|
||||||
|
if result.success:
|
||||||
|
print("--- Markdown with Citations ---")
|
||||||
|
print(result.markdown.markdown_with_citations[:500]) # Show first 500 chars
|
||||||
|
print("\n--- References ---")
|
||||||
|
print(result.markdown.references_markdown)
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
```
|
||||||
|
* 3.3.2. **Workflow: Generating Focused Markdown using a Content Filter**
|
||||||
|
* Steps:
|
||||||
|
1. Choose and instantiate a `RelevantContentFilter` (e.g., `BM25ContentFilter`).
|
||||||
|
2. Instantiate `DefaultMarkdownGenerator`, passing the filter to its `content_filter` parameter.
|
||||||
|
3. Set this `DefaultMarkdownGenerator` instance in `CrawlerRunConfig.markdown_generator`.
|
||||||
|
4. After crawling, access `result.markdown.fit_markdown`.
|
||||||
|
* Key configuration considerations for the filter and generator:
|
||||||
|
* For `BM25ContentFilter`, ensure you provide a relevant `user_query`.
|
||||||
|
* Adjust filter thresholds (e.g., `bm25_threshold`) as needed.
|
||||||
|
* The `content_source` for `DefaultMarkdownGenerator` will be the input to the filter.
|
||||||
|
* *Code Example:*
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||||
|
from crawl4ai.content_filter_strategy import BM25ContentFilter
|
||||||
|
|
||||||
|
async def filtered_markdown_workflow():
|
||||||
|
user_query = "information about Crawl4AI library"
|
||||||
|
bm25_filter = BM25ContentFilter(user_query=user_query, bm25_threshold=0.1)
|
||||||
|
|
||||||
|
md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
|
||||||
|
|
||||||
|
run_config = CrawlerRunConfig(
|
||||||
|
markdown_generator=md_generator,
|
||||||
|
cache_mode=CacheMode.BYPASS # For consistent demo results
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
# Using a page that hopefully has content related to the query
|
||||||
|
result = await crawler.arun(url="https://github.com/unclecode/crawl4ai", config=run_config)
|
||||||
|
if result.success:
|
||||||
|
print("--- Fit Markdown (BM25 Filtered) ---")
|
||||||
|
print(result.markdown.fit_markdown) # This is the key output
|
||||||
|
# You can also inspect fit_html to see what the filter selected
|
||||||
|
# print("\n--- Fit HTML ---")
|
||||||
|
# print(result.markdown.fit_html[:500])
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
```
|
||||||
|
* 3.3.3. **Workflow: Customizing Markdown Style via `html2text_options`**
|
||||||
|
* Steps: Instantiate `DefaultMarkdownGenerator` passing a dictionary of `html2text` options to its `options` parameter.
|
||||||
|
* *Code Example:*
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
async def custom_style_markdown_workflow():
|
||||||
|
# Example: Disable line wrapping and ignore images
|
||||||
|
html2text_opts = {
|
||||||
|
"body_width": 0, # Disable line wrapping
|
||||||
|
            "ignore_images": True # Don't include image markdown
|
||||||
|
}
|
||||||
|
md_generator = DefaultMarkdownGenerator(options=html2text_opts)
|
||||||
|
|
||||||
|
run_config = CrawlerRunConfig(markdown_generator=md_generator)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="https://example.com", config=run_config)
|
||||||
|
if result.success:
|
||||||
|
print("--- Custom Styled Markdown (No Wrap, No Images) ---")
|
||||||
|
print(result.markdown.raw_markdown[:500]) # raw_markdown will reflect these options
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
```
|
||||||
|
* 3.4. **Best Practices for `DefaultMarkdownGenerator`**
|
||||||
|
* **When to use `DefaultMarkdownGenerator` vs. a custom strategy:**
|
||||||
|
* Use `DefaultMarkdownGenerator` for most cases. It's robust and highly configurable through `content_filter` and `html2text_options`.
|
||||||
|
* Opt for a custom strategy only if you need fundamentally different conversion logic or integration with external Markdown libraries that `CustomHTML2Text` doesn't cover.
|
||||||
|
* **Tips for choosing the right `content_source` and `content_filter`:**
|
||||||
|
* Start with `content_source="cleaned_html"` (default) and no filter.
|
||||||
|
* If the output is too noisy, introduce a `RelevantContentFilter`. `PruningContentFilter` is a good first step for general boilerplate. Use `BM25ContentFilter` or `LLMContentFilter` for more targeted filtering based on semantic relevance.
|
||||||
|
* If your filter is very effective and you *only* want Markdown from the filtered content, consider setting `content_source="fit_html"` in your `DefaultMarkdownGenerator` instance.
|
||||||
|
* **How to leverage `MarkdownGenerationResult` effectively:**
|
||||||
|
* For LLM input where source attribution is important, use `markdown_with_citations` + `references_markdown`.
|
||||||
|
* For tasks needing maximum conciseness based on relevance, use `fit_markdown` (after configuring a `content_filter`).
|
||||||
|
* Use `raw_markdown` if you need the "purest" Markdown conversion without citation processing.
|
||||||
|
* Inspect `fit_html` to debug your content filters.
|
||||||
|
|
||||||
|
## 4. Integrating Content Filters for Smarter Markdown (`fit_markdown`)
|
||||||
|
|
||||||
|
* 4.1. **The "Why": Purpose of Content Filtering Before Markdown Generation**
|
||||||
|
* 4.1.1. Reducing noise and improving relevance for LLMs.
|
||||||
|
* **Explanation:** Web pages often contain much more than just the main article content (e.g., navigation, ads, footers, related articles). These can be detrimental to LLM performance, increasing token count, processing time, and potentially confusing the model. Content filters aim to isolate the core, relevant information.
|
||||||
|
* 4.1.2. Generating more concise and focused Markdown (`fit_markdown`).
|
||||||
|
* **Explanation:** By filtering the HTML *before* converting it to Markdown, the resulting `fit_markdown` is inherently more concise and focused on what the filter deemed important. This is ideal for tasks where brevity and relevance are key.
|
||||||
|
* 4.1.3. How `fit_html` is generated and its role.
|
||||||
|
* **Explanation:** When a `RelevantContentFilter` is used with a `MarkdownGenerationStrategy`, the strategy first passes the input HTML (e.g., `cleaned_html`) to the filter's `filter_content` method. This method returns a list of HTML strings (or a single merged string). This output is stored as `fit_html` in the `MarkdownGenerationResult`. `fit_markdown` is then generated by converting this `fit_html` to Markdown.
|
||||||
|
|
||||||
|
* 4.2. **Overview of `RelevantContentFilter` Strategies**
|
||||||
|
* 4.2.1. **`PruningContentFilter`**:
|
||||||
|
* **How it works:** Applies heuristic rules to remove common boilerplate. For example, it might remove elements with very short text content, elements with a high link-to-text ratio, or elements matching common boilerplate CSS classes/IDs (like "footer", "nav", "sidebar").
|
||||||
|
* **When to use it:** A good first-pass filter for general-purpose cleaning. It's fast and doesn't require LLM calls or complex configuration.
|
||||||
|
* **Impact on `fit_markdown`:** Typically good at removing obvious non-content sections, resulting in a cleaner, more article-focused Markdown.
|
||||||
|
* 4.2.2. **`BM25ContentFilter`**:
|
||||||
|
* **How it works:** This filter uses the BM25 algorithm, a classical information retrieval technique. It tokenizes the HTML content into chunks and scores each chunk's relevance against a `user_query`. Chunks exceeding a `bm25_threshold` are kept.
|
||||||
|
* **When to use it:** When you want to extract content specifically related to a user's query from a larger page. Excellent for targeted information retrieval.
|
||||||
|
* **Impact on `fit_markdown`:** The output will be highly tailored to the query. If the query is "Tell me about Crawl4AI's caching", `fit_markdown` should primarily contain sections discussing caching.
|
||||||
|
* 4.2.3. **`LLMContentFilter`**:
|
||||||
|
* **How it works:** This is the most powerful and flexible filter. It chunks the input HTML and sends each chunk (or a summary) to an LLM with specific `instructions` (e.g., "Extract only the paragraphs discussing financial results"). The LLM decides which chunks are relevant.
|
||||||
|
* **When to use it:** For complex filtering criteria that are hard to express with rules or keywords, or when nuanced understanding of content is required.
|
||||||
|
* **Impact on `fit_markdown`:** Can produce very precise and contextually relevant Markdown. However, it's generally slower and can be more expensive due to LLM API calls.
|
||||||
|
* 4.3. **Decision Guide: Choosing the Right `RelevantContentFilter`**
|
||||||
|
* *Table:*
|
||||||
|
| Filter | Speed | Cost (LLM API) | Accuracy/Nuance | Use Case Examples | Configuration Complexity |
|
||||||
|
|-----------------------|------------|----------------|-----------------|----------------------------------------------------|--------------------------|
|
||||||
|
        | `PruningContentFilter` | Very Fast  | None           | Low-Medium      | General boilerplate removal, quick cleaning.       | Low                      |
|
||||||
|
| `BM25ContentFilter` | Fast | None | Medium | Query-focused extraction, finding relevant sections. | Medium (query, threshold)|
|
||||||
|
        | `LLMContentFilter`    | Slow       | Potentially High | High          | Complex criteria, nuanced extraction, summarization. | High (prompt engineering) |
|
||||||
|
* Factors to consider:
|
||||||
|
* **Desired Output Quality:** For the highest semantic relevance, `LLMContentFilter` is often best, but at a cost.
|
||||||
|
* **Performance Constraints:** If speed is critical, `PruningContentFilter` or `BM25ContentFilter` are preferred.
|
||||||
|
* **Nature of the HTML Content:** For well-structured articles, `PruningContentFilter` might be sufficient. For diverse content or Q&A, `BM25ContentFilter` or `LLMContentFilter` might be better.
|
||||||
|
* **Specificity of Task:** If you have a clear query, `BM25ContentFilter` excels. If you have complex instructions, `LLMContentFilter` is suitable.
|
||||||
|
* 4.4. **Code Examples: Combining Filters with `DefaultMarkdownGenerator`**
|
||||||
|
* 4.4.1. *Example:* [Using `PruningContentFilter` to generate `fit_markdown`].
|
||||||
|
```python
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator, CrawlerRunConfig, AsyncWebCrawler, CacheMode
|
||||||
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||||
|
|
||||||
|
async def pruning_filter_example():
|
||||||
|
pruning_filter = PruningContentFilter(threshold=0.4, threshold_type="fixed") # Adjust threshold as needed
|
||||||
|
md_generator = DefaultMarkdownGenerator(content_filter=pruning_filter)
|
||||||
|
run_config = CrawlerRunConfig(markdown_generator=md_generator, cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="https://en.wikipedia.org/wiki/Python_(programming_language)", config=run_config)
|
||||||
|
if result.success:
|
||||||
|
print("--- Fit Markdown (Pruned) ---")
|
||||||
|
print(result.markdown.fit_markdown[:1000]) # Show first 1000 chars
|
||||||
|
# print("\n--- Original Raw Markdown (for comparison) ---")
|
||||||
|
# print(result.markdown.raw_markdown[:1000])
|
||||||
|
```
|
||||||
|
* 4.4.2. *Example:* [Using `BM25ContentFilter` with a query to generate query-focused `fit_markdown`].
|
||||||
|
```python
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator, CrawlerRunConfig, AsyncWebCrawler, CacheMode
|
||||||
|
from crawl4ai.content_filter_strategy import BM25ContentFilter
|
||||||
|
|
||||||
|
async def bm25_filter_example():
|
||||||
|
user_query = "Python syntax and semantics"
|
||||||
|
bm25_filter = BM25ContentFilter(user_query=user_query, bm25_threshold=0.1)
|
||||||
|
md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
|
||||||
|
run_config = CrawlerRunConfig(markdown_generator=md_generator, cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="https://en.wikipedia.org/wiki/Python_(programming_language)", config=run_config)
|
||||||
|
if result.success:
|
||||||
|
print(f"--- Fit Markdown (BM25 Filtered for query: '{user_query}') ---")
|
||||||
|
print(result.markdown.fit_markdown)
|
||||||
|
```
|
||||||
|
* 4.4.3. *Example:* [Using `LLMContentFilter` for nuanced content selection before Markdown generation].
|
||||||
|
```python
|
||||||
|
from crawl4ai import DefaultMarkdownGenerator, CrawlerRunConfig, AsyncWebCrawler, LLMConfig, CacheMode
|
||||||
|
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||||
|
import os
|
||||||
|
|
||||||
|
async def llm_filter_example():
|
||||||
|
# Ensure OPENAI_API_KEY is set in your environment
|
||||||
|
if not os.getenv("OPENAI_API_KEY"):
|
||||||
|
print("OPENAI_API_KEY not set. Skipping LLMContentFilter example.")
|
||||||
|
return
|
||||||
|
|
||||||
|
llm_config_obj = LLMConfig(provider="openai/gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
|
||||||
|
|
||||||
|
instruction = "Extract only the sections that discuss Python's history and its creator."
|
||||||
|
llm_filter = LLMContentFilter(
|
||||||
|
llm_config=llm_config_obj,
|
||||||
|
instruction=instruction,
|
||||||
|
# chunk_token_threshold=1000 # Adjust as needed
|
||||||
|
)
|
||||||
|
|
||||||
|
md_generator = DefaultMarkdownGenerator(content_filter=llm_filter, content_source="cleaned_html")
|
||||||
|
|
||||||
|
run_config = CrawlerRunConfig(markdown_generator=md_generator, cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="https://en.wikipedia.org/wiki/Python_(programming_language)", config=run_config)
|
||||||
|
if result.success:
|
||||||
|
print(f"--- Fit Markdown (LLM Filtered with instruction: '{instruction}') ---")
|
||||||
|
print(result.markdown.fit_markdown)
|
||||||
|
llm_filter.show_usage() # Display token usage
|
||||||
|
else:
|
||||||
|
print(f"Crawl failed: {result.error_message}")
|
||||||
|
```
|
||||||
|
* 4.5. **Best Practices for Content Filtering for Markdown**
|
||||||
|
* **Start Simple:** Begin with `PruningContentFilter` for general cleanup. It's fast and often effective for removing common boilerplate.
|
||||||
|
* **Query-Specific Tasks:** If your goal is to extract information relevant to a specific query, `BM25ContentFilter` is a great, cost-effective choice.
|
||||||
|
* **Nuanced Selection:** Reserve `LLMContentFilter` for tasks requiring deeper semantic understanding or complex filtering logic that rules-based or keyword-based approaches can't handle. Be mindful of its cost and latency.
|
||||||
|
* **Iterate and Test:** Content filtering is often an iterative process. Test your filter configurations on various pages to ensure they behave as expected. Inspect `fit_html` to understand what the filter is selecting/discarding.
|
||||||
|
* **Combine with `content_source`:** Remember that `fit_markdown` is derived from the output of the filter. If you also need Markdown from the pre-filtered content, ensure your `MarkdownGenerationStrategy`'s `content_source` is set appropriately (e.g., `"cleaned_html"`) so that `raw_markdown` reflects that, while `fit_markdown` reflects the filtered version.
|
||||||
|
|
||||||
|
## 5. Customizing Markdown Output via `CrawlerRunConfig`
|
||||||
|
|
||||||
|
* 5.1. **The Role of `CrawlerRunConfig.markdown_generator`**
|
||||||
|
* 5.1.1. How it allows specifying a custom Markdown generation strategy for a crawl run.
|
||||||
|
* **Explanation:** The `markdown_generator` parameter within the `CrawlerRunConfig` object is the primary way to control how Markdown is generated for a specific crawl operation (i.e., a call to `crawler.arun()` or tasks within `crawler.arun_many()`). You can assign an instance of any class that adheres to the `MarkdownGenerationStrategy` interface to it.
|
||||||
|
* 5.1.2. Overriding the default Markdown generation behavior.
|
||||||
|
* **Explanation:** If `CrawlerRunConfig.markdown_generator` is not set (i.e., it's `None`), Crawl4AI will use a default instance of `DefaultMarkdownGenerator` with its standard settings. By providing your own `MarkdownGenerationStrategy` instance (be it a configured `DefaultMarkdownGenerator` or a custom class), you override this default behavior for that particular run.
|
||||||
|
|
||||||
|
* 5.2. **Scenarios for Using `CrawlerRunConfig.markdown_generator`**
|
||||||
|
* 5.2.1. Applying a pre-configured `DefaultMarkdownGenerator` with specific filters or options.
|
||||||
|
* **Why:** You might want different filtering logic or `html2text` options for different URLs or types of content you're crawling, even within the same `AsyncWebCrawler` instance.
|
||||||
|
* 5.2.2. Plugging in a completely custom `MarkdownGenerationStrategy`.
|
||||||
|
* **Why:** As discussed in section 2.1.2, if you have unique Markdown requirements or want to use a different conversion library.
|
||||||
|
* 5.2.3. Disabling Markdown generation entirely by setting it to `None` (if applicable, or by using a "NoOp" strategy).
|
||||||
|
* **Why:** If, for a specific crawl, you only need the HTML or extracted structured data and don't require Markdown output, you can pass `markdown_generator=None` (or a strategy that does nothing) to save processing time.
|
||||||
|
* *Note:* To truly disable Markdown generation and its associated `CustomHTML2Text` processing, you might need a "NoOpMarkdownGenerator". If `markdown_generator` is `None`, the crawler might still fall back to a default. A NoOp strategy would explicitly do nothing.
|
||||||
|
```python
|
||||||
|
# class NoOpMarkdownGenerator(MarkdownGenerationStrategy):
|
||||||
|
# def generate_markdown(self, input_html: str, **kwargs) -> MarkdownGenerationResult:
|
||||||
|
# return MarkdownGenerationResult(raw_markdown="", markdown_with_citations="", references_markdown="")
|
||||||
|
# run_config = CrawlerRunConfig(markdown_generator=NoOpMarkdownGenerator())
|
||||||
|
```
|
||||||
|
|
||||||
|
* 5.3. **Code Examples:**
|
||||||
|
* 5.3.1. *Example:* [Setting a `DefaultMarkdownGenerator` with a `PruningContentFilter` in `CrawlerRunConfig`].
|
||||||
|
```python
|
||||||
|
from crawl4ai import (
|
||||||
|
AsyncWebCrawler,
|
||||||
|
CrawlerRunConfig,
|
||||||
|
DefaultMarkdownGenerator,
|
||||||
|
CacheMode
|
||||||
|
)
|
||||||
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||||
|
|
||||||
|
async def run_with_specific_md_generator():
|
||||||
|
# Configure a specific markdown generator
|
||||||
|
pruning_filter = PruningContentFilter(threshold=0.6)
|
||||||
|
specific_md_generator = DefaultMarkdownGenerator(
|
||||||
|
content_filter=pruning_filter,
|
||||||
|
options={"body_width": 0, "ignore_links": True}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Configure the crawl run to use this generator
|
||||||
|
run_config = CrawlerRunConfig(
|
||||||
|
markdown_generator=specific_md_generator,
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url="https://example.com/article1", config=run_config)
|
||||||
|
if result.success:
|
||||||
|
print("--- Markdown from Article 1 (Pruned, No Links, No Wrap) ---")
|
||||||
|
print(result.markdown.fit_markdown[:500])
|
||||||
|
# raw_markdown would also reflect no-wrap and no-links from html2text_options
|
||||||
|
|
||||||
|
# For another URL, you could use a different (or default) generator
|
||||||
|
# default_run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
# result2 = await crawler.arun(url="https://example.com/article2", config=default_run_config)
|
||||||
|
|
||||||
|
# asyncio.run(run_with_specific_md_generator())
|
||||||
|
```
|
||||||
|
* 5.3.2. *Example:* [Setting a custom `MyMarkdownStrategy` in `CrawlerRunConfig` (assuming `MyCustomMarkdownStrategy` from 2.1.3)].
|
||||||
|
```python
|
||||||
|
# Assuming MyCustomMarkdownStrategy is defined as in section 2.1.3
|
||||||
|
# from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
|
||||||
|
# from your_module import MyCustomMarkdownStrategy # If it's in another file
|
||||||
|
|
||||||
|
# async def run_with_custom_md_strategy():
|
||||||
|
# custom_strategy = MyCustomMarkdownStrategy(content_source="raw_html")
|
||||||
|
# run_config_custom = CrawlerRunConfig(
|
||||||
|
# markdown_generator=custom_strategy,
|
||||||
|
# cache_mode=CacheMode.BYPASS
|
||||||
|
# )
|
||||||
|
|
||||||
|
# async with AsyncWebCrawler() as crawler:
|
||||||
|
# result = await crawler.arun(url="https://example.com", config=run_config_custom)
|
||||||
|
# if result.success:
|
||||||
|
# print("--- Markdown from Custom Strategy ---")
|
||||||
|
# print(result.markdown.raw_markdown) # Or other fields your strategy populates
|
||||||
|
|
||||||
|
# asyncio.run(run_with_custom_md_strategy())
|
||||||
|
```
|
||||||
|
* 5.4. **Interaction with Global vs. Run-Specific Configurations**
|
||||||
|
* **Explanation:** `AsyncWebCrawler` itself does not have a global `markdown_generator` setting during its initialization. Markdown generation is configured *per run* via `CrawlerRunConfig`. This design choice provides maximum flexibility, allowing different Markdown strategies for different URLs or tasks within the same crawler instance lifecycle. If `CrawlerRunConfig.markdown_generator` is not provided, a default `DefaultMarkdownGenerator` instance is used for that specific run.
|
||||||
|
|
||||||
|
## 6. Mastering `CustomHTML2Text` for Fine-Grained Control
|
||||||
|
|
||||||
|
* 6.1. **Understanding `CustomHTML2Text`**
|
||||||
|
* 6.1.1. **Purpose:** Why Crawl4AI includes its own `html2text` extension.
|
||||||
|
* **Enhanced control:** `CustomHTML2Text` is a subclass of the standard `html2text.HTML2Text` library. Crawl4AI uses this custom version to gain more precise control over the HTML-to-Markdown conversion process, particularly to make the output more suitable for LLMs.
|
||||||
|
* **Specific adaptations:** It includes logic for handling Crawl4AI's citation and reference generation (`convert_links_to_citations`), and potentially other tweaks that improve the quality and utility of the Markdown output for AI applications.
|
||||||
|
* 6.1.2. **How it's used by `DefaultMarkdownGenerator`**.
|
||||||
|
* **Explanation:** `DefaultMarkdownGenerator` instantiates `CustomHTML2Text` internally. When you pass `options` to `DefaultMarkdownGenerator`, these are ultimately used to configure this `CustomHTML2Text` instance. The `handle()` method of `CustomHTML2Text` is what performs the core HTML to Markdown conversion.
|
||||||
|
|
||||||
|
* 6.2. **Key `html2text_options` and Their Impact**
|
||||||
|
* (These options are passed via `DefaultMarkdownGenerator(options=...)`)
|
||||||
|
* 6.2.1. `body_width`:
|
||||||
|
* **What it does:** Controls the maximum width of lines in the generated Markdown before wrapping.
|
||||||
|
* **Why configure it:** For LLM consumption, it's often best to disable automatic line wrapping to allow the LLM to process text based on natural paragraph breaks. Setting `body_width=0` achieves this.
|
||||||
|
* *Example:*
|
||||||
|
* `body_width=80` (default-ish for some tools):
|
||||||
|
```markdown
|
||||||
|
This is a longer sentence that will be wrapped by html2text if the body_width is
|
||||||
|
set to a value like 80 characters.
|
||||||
|
```
|
||||||
|
* `body_width=0`:
|
||||||
|
```markdown
|
||||||
|
This is a longer sentence that will not be wrapped by html2text if body_width is 0, allowing the LLM to handle line breaks.
|
||||||
|
```
|
||||||
|
* 6.2.2. `ignore_links`:
|
||||||
|
* **What it does:** If `True`, all hyperlink information (`[text](url)`) is removed, leaving only the link text.
|
||||||
|
* **Why configure it:** Set to `True` if links are considered noise for your LLM task and you don't need source attribution. If `False` (default for Crawl4AI's `CustomHTML2Text` unless overridden), links are preserved and can then be converted to citations by `DefaultMarkdownGenerator`.
|
||||||
|
* *Example:*
|
||||||
|
* `ignore_links=False` (then processed for citations): `Visit [Crawl4AI](https://crawl4ai.com)` -> `Visit Crawl4AI [^1^]`
|
||||||
|
* `ignore_links=True`: `Visit [Crawl4AI](https://crawl4ai.com)` -> `Visit Crawl4AI`
|
||||||
|
* 6.2.3. `ignore_images`:
|
||||||
|
* **What it does:** If `True`, image tags (`<img>`) are completely ignored, and no Markdown image syntax (`![alt text](image_url)`) is generated.
|
||||||
|
* **Why configure it:** Useful if image information is irrelevant to your LLM task and you want cleaner, more text-focused Markdown.
|
||||||
|
* *Example:*
|
||||||
|
* HTML: `<img src="logo.png" alt="My Logo">`
|
||||||
|
* `ignore_images=False`: `![My Logo](logo.png)`
|
||||||
|
* `ignore_images=True`: (nothing is output for the image)
|
||||||
|
* 6.2.4. `protect_links`:
|
||||||
|
* **What it does:** If `True`, surrounds link URLs with `<` and `>`. E.g., `[text](<url>)`.
|
||||||
|
* **Why configure it:** This can sometimes help Markdown parsers that might misinterpret URLs containing special characters. However, with Crawl4AI's citation handling, this is generally not needed, as the raw URLs are moved to the reference section.
|
||||||
|
* 6.2.5. `mark_code`:
|
||||||
|
* **What it does:** Controls how `<pre>` and `<code>` tags are handled. If `True`, it attempts to use Markdown code block syntax (backticks).
|
||||||
|
* **Why configure it:** Essential for preserving code snippets correctly. Usually, you'd want this to be `True`.
|
||||||
|
* 6.2.6. `default_image_alt`:
|
||||||
|
* **What it does:** Provides a default alt text string if an `<img>` tag is missing an `alt` attribute.
|
||||||
|
* **Why configure it:** Can make Markdown more consistent if you choose to include images.
|
||||||
|
* 6.2.7. `bypass_tables`:
|
||||||
|
* **What it does:** If `True`, `<table>` elements are not converted into Markdown table syntax. Their content might be rendered as plain text or omitted, depending on other settings.
|
||||||
|
* **Why configure it:** Standard Markdown table syntax is limited and may not handle complex tables (with `colspan`, `rowspan`, nested tables) well. If you encounter mangled tables, setting this to `True` and processing the table HTML separately (e.g., by extracting the `<table>` HTML and using a specialized table-to-text or table-to-JSON library) might be a better approach.
|
||||||
|
* 6.2.8. `pad_tables`:
|
||||||
|
* **What it does:** If `True`, adds padding spaces around cell content in Markdown tables for better visual alignment in raw Markdown.
|
||||||
|
* **Why configure it:** Mostly an aesthetic choice for human readability of the raw Markdown; LLMs typically don't care about this padding.
|
||||||
|
* *Other relevant options identified from `CustomHTML2Text` (or base `html2text`) source:*
|
||||||
|
* `escape_snob`: If `True`, escapes `>` and `&` characters. Default is `False`.
|
||||||
|
* `skip_internal_links`: If `True`, ignores links that start with `#`. Default is `False`.
|
||||||
|
* `links_each_paragraph`: If `True`, puts a link list after each paragraph. Default is `False`. Crawl4AI's citation system provides a better alternative.
|
||||||
|
* `unicode_snob`: If `True`, uses Unicode characters instead of ASCII approximations. Default is `False` in base `html2text`, but `CustomHTML2Text` might behave differently or Crawl4AI ensures UTF-8 handling.
|
||||||
|
* 6.3. **Best Practices for Configuring `CustomHTML2Text`**
|
||||||
|
* 6.3.1. **General recommendations for LLM-friendly output:**
|
||||||
|
* Set `body_width=0` to disable line wrapping and let paragraphs flow naturally.
|
||||||
|
* Consider `ignore_images=True` if images are not relevant to the LLM's task.
|
||||||
|
* Usually, keep `ignore_links=False` (Crawl4AI default) to allow `DefaultMarkdownGenerator` to handle citations properly.
|
||||||
|
* 6.3.2. **How to balance information preservation with conciseness:**
|
||||||
|
* Be selective with `ignore_*` options. Removing too much might discard useful context.
|
||||||
|
* Use content filters (Section 4) for semantic reduction rather than relying solely on `html2text` options to remove large irrelevant sections.
|
||||||
|
* 6.3.3. **Experimenting with options to achieve desired Markdown style:**
|
||||||
|
* Create a small test HTML snippet.
|
||||||
|
* Instantiate `DefaultMarkdownGenerator` with different `options` dictionaries.
|
||||||
|
* Call its `generate_markdown` method directly (or `_html_to_markdown` on its internal `CustomHTML2Text` instance if you want to bypass citation logic for testing) and observe the output.
|
||||||
|
* 6.4. **Handling Citations and References (`convert_links_to_citations` method in `CustomHTML2Text`)**
|
||||||
|
* 6.4.1. **How it works:**
|
||||||
|
* The `convert_links_to_citations` method (called by `DefaultMarkdownGenerator` if citations are enabled) iterates through the Markdown produced by `html2text.handle()`.
|
||||||
|
* It uses a regular expression (`LINK_PATTERN`) to find all Markdown links (`[text](url "optional title")`).
|
||||||
|
* For each unique URL, it assigns an incremental citation number.
|
||||||
|
* It replaces the original Markdown link with `text [^N^]` (or `![text][^N^]` for images if not ignored).
|
||||||
|
* It builds up a list of reference strings like `[^N^]: url "optional title - text if different from title"`.
|
||||||
|
* 6.4.2. **When it's called:** This method is invoked by `DefaultMarkdownGenerator.generate_markdown()` *after* the initial HTML-to-Markdown conversion by `CustomHTML2Text.handle()` if the `citations` flag is `True`.
|
||||||
|
* 6.4.3. **Impact on `MarkdownGenerationResult` fields:**
|
||||||
|
* The modified Markdown (with `[^N^]` markers) is stored in `markdown_with_citations`.
|
||||||
|
* The collected reference list is stored in `references_markdown`.
|
||||||
|
* `raw_markdown` remains the version *before* citation processing.
|
||||||
|
* 6.4.4. **Customizing Citation Behavior (if possible through options or by subclassing)**.
|
||||||
|
* **Explanation:** Direct customization of the citation format (e.g., changing `[^N^]` to `(N)`) via options is not explicitly provided in `CustomHTML2Text`.
|
||||||
|
* To change this, you would need to:
|
||||||
|
1. Create your own class inheriting from `DefaultMarkdownGenerator`.
|
||||||
|
2. Override the `generate_markdown` method.
|
||||||
|
3. In your override, you could either:
|
||||||
|
* Call the parent's `generate_markdown`, get the `MarkdownGenerationResult`, and then post-process `markdown_with_citations` and `references_markdown` to your desired format.
|
||||||
|
* Or, more invasively, replicate the logic but modify the citation generation part. This might involve creating a custom version of `CustomHTML2Text` or its `convert_links_to_citations` method.
|
||||||
|
* For most users, the default citation format is standard and widely accepted.
|
||||||
|
|
||||||
|
## 7. Advanced Markdown Generation Techniques & Best Practices
|
||||||
|
|
||||||
|
* 7.1. **Achieving LLM-Friendly Markdown Output**
|
||||||
|
* 7.1.1. Prioritizing semantic structure (headings, lists, paragraphs).
|
||||||
|
* **Why:** LLMs leverage structural cues to understand context and hierarchy. Ensure your `html2text_options` (e.g., for headings, list indentation) preserve this structure faithfully.
|
||||||
|
* **How:** Rely on `CustomHTML2Text`'s default handling of semantic HTML tags. If specific tags are problematic, consider pre-processing the HTML.
|
||||||
|
* 7.1.2. Handling complex HTML structures (nested tables, complex layouts).
|
||||||
|
* **Strategies for simplifying or selectively extracting from them:**
|
||||||
|
* **Tables:** For very complex tables, consider `html2text_options={'bypass_tables': True}`. Then, extract the table HTML separately (e.g., using `CrawlResult.html` and a CSS selector for the table) and process it with a specialized table parsing library or even an LLM call focused just on table interpretation.
|
||||||
|
* **Layouts:** Aggressive `RelevantContentFilter` strategies can help. If parts of a complex layout are consistently noise, use `CrawlerRunConfig.excluded_selector` to remove them before they even reach the Markdown generator.
|
||||||
|
* 7.1.3. When to prefer `fit_markdown` over `raw_markdown` (or `markdown_with_citations`).
|
||||||
|
* **Reasoning:**
|
||||||
|
* **`fit_markdown`:** Best for tasks requiring high relevance and conciseness (e.g., RAG context, focused summarization). It reflects the output of your content filtering.
|
||||||
|
* **`raw_markdown` / `markdown_with_citations`:** Better when you need a broader representation of the page's textual content, or when the filtering might be too aggressive and discard potentially useful context. Also, if your `content_source` is already very clean (e.g., from a targeted CSS selector), the difference might be minimal.
|
||||||
|
* 7.1.4. Balancing detail vs. conciseness for different LLM tasks (e.g., summarization vs. Q&A).
|
||||||
|
* **Summarization:** `fit_markdown` from a well-configured `LLMContentFilter` or `BM25ContentFilter` is often ideal. You might also use more aggressive `html2text_options` to remove minor elements.
|
||||||
|
* **Q&A / RAG:** You might prefer a slightly less aggressive filter or even `raw_markdown` (if `content_source` is clean) to ensure all potentially relevant details are available. Citations (`markdown_with_citations` and `references_markdown`) are crucial here for source tracking.
|
||||||
|
|
||||||
|
* 7.2. **Pre-processing HTML for Better Markdown**
|
||||||
|
* 7.2.1. Using `CrawlerRunConfig.excluded_tags` or `excluded_selector` to remove noise before Markdown generation.
|
||||||
|
* **How:** These parameters in `CrawlerRunConfig` are applied by the `ContentScrapingStrategy` *before* the HTML even reaches the `MarkdownGenerationStrategy`.
|
||||||
|
* **Why:** This is the most efficient way to remove large, consistently irrelevant sections (like global headers, footers, sidebars, ad blocks) across all outputs (HTML, Markdown, etc.).
|
||||||
|
* *Code Example:*
|
||||||
|
```python
|
||||||
|
# In CrawlerRunConfig
|
||||||
|
# config = CrawlerRunConfig(
|
||||||
|
# excluded_tags=["nav", "footer", "script", "style"],
|
||||||
|
# excluded_selector=".ads, #social-share-buttons"
|
||||||
|
# )
|
||||||
|
```
|
||||||
|
* 7.2.2. The role of `ContentScrapingStrategy` (e.g., `LXMLWebScrapingStrategy` or the default `WebScrapingStrategy` using BeautifulSoup) in preparing the HTML that `DefaultMarkdownGenerator` receives.
|
||||||
|
* **Explanation:** The `ContentScrapingStrategy` is responsible for the initial cleaning of the HTML. Its output (what becomes `cleaned_html`) is the direct input to `DefaultMarkdownGenerator` if `content_source` is `"cleaned_html"`. Understanding how your chosen scraping strategy cleans HTML is key to predicting the input for Markdown generation. `LXMLWebScrapingStrategy` is generally faster and can be more robust for heavily malformed HTML.
|
||||||
|
|
||||||
|
* 7.3. **Post-processing Generated Markdown**
|
||||||
|
* 7.3.1. When and why you might need to further process Markdown from `MarkdownGenerationResult`.
|
||||||
|
* **Scenarios:**
|
||||||
|
* Custom formatting not achievable with `html2text` options (e.g., specific table styles, unique list markers).
|
||||||
|
* Domain-specific transformations (e.g., converting certain patterns to custom shortcodes).
|
||||||
|
* Further cleaning or condensing based on rules `html2text` or content filters don't cover.
|
||||||
|
* 7.3.2. *Example:* [Python snippet for custom regex replacements or structural adjustments on `raw_markdown`].
|
||||||
|
```python
|
||||||
|
import re
|
||||||
|
|
||||||
|
def custom_post_process_markdown(markdown_text):
|
||||||
|
# Example: Replace all occurrences of "Crawl4AI" with "**Crawl4AI**"
|
||||||
|
markdown_text = re.sub(r"Crawl4AI", r"**Crawl4AI**", markdown_text)
|
||||||
|
|
||||||
|
# Example: Add a horizontal rule after every H2 heading
|
||||||
|
markdown_text = re.sub(r"(^## .*)", r"\1\n\n---", markdown_text, flags=re.MULTILINE)
|
||||||
|
return markdown_text
|
||||||
|
|
||||||
|
# result = await crawler.arun(...)
|
||||||
|
# if result.success:
|
||||||
|
# final_markdown = custom_post_process_markdown(result.markdown.raw_markdown)
|
||||||
|
# print(final_markdown)
|
||||||
|
```
|
||||||
|
|
||||||
|
* 7.4. **Combining Different Strategies for Optimal Results**
|
||||||
|
* 7.4.1. *Scenario:* Using a `RelevantContentFilter` to get `fit_html`, then passing `fit_html` to a custom Markdown generator that expects highly focused input.
|
||||||
|
* **How:**
|
||||||
|
1. Instantiate your filter (e.g., `LLMContentFilter`).
|
||||||
|
2. Instantiate your custom Markdown generator (`MyCustomMarkdownStrategy`).
|
||||||
|
3. In `CrawlerRunConfig`, set `markdown_generator` to your custom generator.
|
||||||
|
4. Crucially, within your custom generator's `generate_markdown` method, ensure you *first* apply the `content_filter` (passed as an argument) to the `input_html` to get the `fit_html`, and then process this `fit_html` with your custom logic. Or, configure your custom generator's `content_source="fit_html"` and pass the filter during its initialization.
|
||||||
|
* 7.4.2. *Scenario:* Using one set of `html2text_options` for `raw_markdown` and another for generating an alternative Markdown representation (perhaps for a different LLM or purpose).
|
||||||
|
* **How:** This would typically require two separate calls to `crawler.arun()` with different `CrawlerRunConfig` objects, each specifying a `DefaultMarkdownGenerator` with different `options`. Alternatively, a custom `MarkdownGenerationStrategy` could internally generate multiple Markdown versions with different settings and include them in custom fields within `MarkdownGenerationResult` (though this would require modifying or extending `MarkdownGenerationResult`).
|
||||||
|
|
||||||
|
## 8. Troubleshooting Common Markdown Generation Issues
|
||||||
|
|
||||||
|
* 8.1. **Problem: Markdown is too noisy / includes boilerplate**
|
||||||
|
* 8.1.1. **Solutions:**
|
||||||
|
* **Use a `RelevantContentFilter`**:
|
||||||
|
* Start with `PruningContentFilter`. It's fast and good for common boilerplate.
|
||||||
|
```python
|
||||||
|
# from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||||
|
# from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
# md_generator = DefaultMarkdownGenerator(content_filter=PruningContentFilter(threshold=0.5))
|
||||||
|
```
|
||||||
|
* If more precision is needed, try `BM25ContentFilter` with a relevant query or `LLMContentFilter` with clear instructions.
|
||||||
|
* **Refine `excluded_tags` or `excluded_selector` in `CrawlerRunConfig`**: This removes elements *before* any Markdown strategy sees them.
|
||||||
|
```python
|
||||||
|
# run_config = CrawlerRunConfig(
|
||||||
|
# excluded_tags=["nav", "footer", "aside", "script"],
|
||||||
|
# excluded_selector=".ad-banner, #social-links"
|
||||||
|
# )
|
||||||
|
```
|
||||||
|
* **Adjust `html2text_options`**: Options like `ignore_links`, `ignore_images`, `skip_internal_links` can reduce clutter.
|
||||||
|
```python
|
||||||
|
# from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
# md_generator = DefaultMarkdownGenerator(options={"ignore_images": True, "ignore_links": True})
|
||||||
|
```
|
||||||
|
|
||||||
|
* 8.2. **Problem: Important content is missing from Markdown**
|
||||||
|
* 8.2.1. **Solutions:**
|
||||||
|
* **Check if `content_filter` is too aggressive**: If using a filter, try lowering its threshold (e.g., `bm25_threshold` for `BM25ContentFilter`) or simplifying instructions for `LLMContentFilter`. Temporarily disable the filter to see if the content appears in `raw_markdown`.
|
||||||
|
* **Ensure `word_count_threshold` in `CrawlerRunConfig` (or scraping strategy) is not too high**: The default `WebScrapingStrategy` might have its own cleaning. If `CrawlerRunConfig.word_count_threshold` is too high, it might remove short but important paragraphs.
|
||||||
|
* **Verify `html2text_options` are not inadvertently removing desired content**: For example, if `ignore_links=True` is set, link text itself might still be there, but the link URL will be gone.
|
||||||
|
* **Examine `cleaned_html` or `fit_html`**: Inspect `result.markdown.fit_html` (if a filter was used) or `result.cleaned_html` (if no filter and `content_source` was `cleaned_html`). If the content is missing here, the issue is with HTML cleaning or filtering, not the Markdown conversion itself. If it's present in these HTML versions but not in the final Markdown, the issue is likely with `html2text_options` or the conversion process.
|
||||||
|
|
||||||
|
* 8.3. **Problem: Tables are mangled or poorly formatted**
|
||||||
|
* 8.3.1. **Solutions:**
|
||||||
|
* **Try `html2text_options={'bypass_tables': True}`**: This tells `html2text` to skip converting tables.
|
||||||
|
```python
|
||||||
|
# from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
# md_generator = DefaultMarkdownGenerator(options={"bypass_tables": True})
|
||||||
|
# run_config = CrawlerRunConfig(markdown_generator=md_generator)
|
||||||
|
# result = await crawler.arun(...)
|
||||||
|
# # Now result.markdown.raw_markdown will not have Markdown tables.
|
||||||
|
# # You'd need to parse tables from result.cleaned_html or result.markdown.fit_html
|
||||||
|
```
|
||||||
|
You can then extract the table HTML directly from `result.cleaned_html` (or `result.markdown.fit_html`) using BeautifulSoup or lxml and parse it with a library better suited for complex tables (e.g., pandas `read_html`, or a custom parser).
|
||||||
|
* **Experiment with other `html2text` table formatting options**: Options like `pad_tables` might slightly improve appearance, but won't fix fundamentally complex table structures.
|
||||||
|
* **Consider if the table is truly a data table or a layout table**: Layout tables are often problematic for Markdown conversion and should ideally be filtered out by `PruningContentFilter` or more aggressive cleaning.
|
||||||
|
|
||||||
|
* 8.4. **Problem: Citations or references are incorrect/missing**
|
||||||
|
* 8.4.1. **Solutions:**
|
||||||
|
* **Ensure links are present in the HTML input to `DefaultMarkdownGenerator`**: If the links were removed during an earlier HTML cleaning stage (e.g., by an aggressive `ContentScrapingStrategy` or `excluded_tags`), they can't be converted to citations.
|
||||||
|
* **Verify `ignore_links` is not `True` in `html2text_options`**: `DefaultMarkdownGenerator` relies on `CustomHTML2Text` to see the links to convert them. If `ignore_links=True`, the links are stripped before citation processing can occur.
|
||||||
|
* **Check for unusual link structures in the HTML**: Very non-standard link formats (e.g., heavily JavaScript-driven links without `href` attributes) might not be picked up. `CustomHTML2Text` primarily looks for standard `<a href="...">` tags.
|
||||||
|
|
||||||
|
* 8.5. **Problem: Markdown formatting is not ideal for a specific LLM**
|
||||||
|
* 8.5.1. **Solutions:**
|
||||||
|
* **Fine-tune `html2text_options` extensively**: This is the first line of defense. Experiment with all available options (see Section 6.2) to control aspects like heading styles, list formatting, code block rendering, etc.
|
||||||
|
* **Consider a custom `MarkdownGenerationStrategy`**: If `html2text` options are insufficient, you might need to build your own strategy, possibly using a different Markdown conversion library or implementing custom transformation logic (see Section 2.1.3).
|
||||||
|
* **Implement post-processing steps**: After getting the Markdown from `MarkdownGenerationResult`, apply your own Python scripts (e.g., using regex) to further refine the formatting (see Section 7.3.2).
|
||||||
|
|
||||||
|
* 8.6. **Debugging Workflow**
|
||||||
|
* 8.6.1. **Start with `raw_html` from `CrawlResult`**: `print(result.html)` This is the very first HTML fetched, before any processing. Is your target content even here?
|
||||||
|
* 8.6.2. **Examine `cleaned_html` (or `fit_html`)**:
|
||||||
|
* If no content filter is used in `MarkdownGenerationStrategy`, inspect `result.cleaned_html`. This is what `DefaultMarkdownGenerator` (with `content_source="cleaned_html"`) will use.
|
||||||
|
* If a content filter *is* used, inspect `result.markdown.fit_html`. This is what `DefaultMarkdownGenerator` will use to produce `fit_markdown`.
|
||||||
|
* Is your target content present in these intermediate HTML stages?
|
||||||
|
* 8.6.3. **Isolate the issue**:
|
||||||
|
* **HTML Cleaning/Scraping:** If content is missing from `cleaned_html` (but present in `raw_html`), the issue lies with the `ContentScrapingStrategy` or `CrawlerRunConfig` parameters like `excluded_tags`, `css_selector`, `target_elements`.
|
||||||
|
* **Content Filtering:** If content is in `cleaned_html` but missing from `fit_html`, the issue is with your `RelevantContentFilter` configuration.
|
||||||
|
* **Markdown Conversion:** If content is in `cleaned_html`/`fit_html` but malformed or missing in the final Markdown fields (`raw_markdown`, `fit_markdown`), the issue is likely with `html2text_options` or the `CustomHTML2Text` conversion process.
|
||||||
|
* 8.6.4. **Use `verbose=True` in relevant configs**: Set `verbose=True` in `BrowserConfig` and `CrawlerRunConfig` for more detailed logging output from Crawl4AI, which can provide clues.
|
||||||
|
|
||||||
|
## 9. Conclusion and Next Steps
|
||||||
|
|
||||||
|
* 9.1. Recap of key strategies for effective Markdown generation.
|
||||||
|
* **Summary:** Crawl4AI provides a flexible Markdown generation pipeline. Start with `DefaultMarkdownGenerator`. Use `html2text_options` for stylistic control. Employ `RelevantContentFilter` strategies (`PruningContentFilter`, `BM25ContentFilter`, `LLMContentFilter`) to create focused `fit_markdown` for LLMs. Choose the appropriate `content_source` based on your needs. For highly custom requirements, implement your own `MarkdownGenerationStrategy`.
|
||||||
|
* 9.2. Pointers to other relevant documentation sections (e.g., `RelevantContentFilter` deep dive, `CustomHTML2Text` options in API reference).
|
||||||
|
* **Suggestion:** For a detailed breakdown of each `RelevantContentFilter`, see the "Content Filtering Strategies" guide. For an exhaustive list of `html2text` options, refer to the `CustomHTML2Text` API documentation or the original `html2text` library's documentation.
|
||||||
|
* 9.3. Encouragement for experimentation and community contributions.
|
||||||
|
* **Call to Action:** The best way to master Markdown generation is to experiment with different configurations and content types. If you develop useful custom strategies or identify improvements, consider contributing them back to the Crawl4AI community!
|
||||||
|
|
||||||
|
---
|
||||||
|
```
|
||||||
4374
docs/md_v2/assets/llmtxt/crawl4ai_vibe.llm.full.txt
Normal file
4374
docs/md_v2/assets/llmtxt/crawl4ai_vibe.llm.full.txt
Normal file
File diff suppressed because it is too large
Load Diff
3525
docs/md_v2/assets/llmtxt/crawl4ai_vibe_examples_content.llm.txt
Normal file
3525
docs/md_v2/assets/llmtxt/crawl4ai_vibe_examples_content.llm.txt
Normal file
File diff suppressed because it is too large
Load Diff
188
docs/md_v2/assets/llmtxt/crawl4ai_vibe_memory_content.llm.txt
Normal file
188
docs/md_v2/assets/llmtxt/crawl4ai_vibe_memory_content.llm.txt
Normal file
@@ -0,0 +1,188 @@
|
|||||||
|
Okay, I have read the "vibe" description for `crawl4ai`. Based on this, and adhering to the "memory" document type requirements, here is the detailed Markdown outline:
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
# Detailed Outline for crawl4ai - vibe Component
|
||||||
|
|
||||||
|
**Target Document Type:** memory
|
||||||
|
**Target Output Filename Suggestion:** `llm_memory_vibe_coding.md`
|
||||||
|
**Library Version Context:** 0.6.3
|
||||||
|
**Outline Generation Date:** 2025-05-24
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Vibe Coding with Crawl4AI: Core Concept
|
||||||
|
|
||||||
|
* 1.1. Purpose:
|
||||||
|
* Provides a conceptual framework for interacting with the `crawl4ai` library, particularly when using AI coding assistants.
|
||||||
|
* Aims to simplify the process of building web data applications by focusing on high-level capabilities and key building blocks, enabling users to guide AI assistants effectively even with limited direct `crawl4ai` API knowledge.
|
||||||
|
* 1.2. Principle:
|
||||||
|
* Describes how users can communicate their web scraping and data extraction goals to an AI assistant, which then translates these "vibes" or high-level intentions into `crawl4ai` Python code by leveraging knowledge of the library's core components and configurations.
|
||||||
|
|
||||||
|
## 2. `crawl4ai` High-Level Capabilities (for Vibe Prompts)
|
||||||
|
|
||||||
|
* 2.1. Fetching Webpages
|
||||||
|
* 2.1.1. Description: The library can retrieve content from specified web URLs.
|
||||||
|
* 2.2. Converting Web Content to Clean Markdown
|
||||||
|
* 2.2.1. Description: The library can process raw HTML content and convert it into a cleaned, structured Markdown format.
|
||||||
|
* 2.2.2. Applications: Suitable for content summarization, input for Question & Answering systems, and as a pre-processing step for other LLMs.
|
||||||
|
* 2.3. Extracting Specific Information (JSON)
|
||||||
|
* 2.3.1. Description: The library can extract targeted data elements from webpages and organize them into a JSON structure.
|
||||||
|
* 2.3.2. Examples: Can be used to extract product names and prices from e-commerce sites, article headlines, author names, etc.
|
||||||
|
* 2.4. Crawling Multiple Pages
|
||||||
|
* 2.4.1. Description: The library supports concurrent fetching and processing of a list of URLs.
|
||||||
|
* 2.5. Taking Screenshots and Generating PDFs
|
||||||
|
* 2.5.1. Description: The library can capture visual representations of webpages as PNG screenshots or generate PDF documents.
|
||||||
|
* 2.6. Handling Simple Page Interactions
|
||||||
|
* 2.6.1. Description: The library can execute JavaScript to simulate basic user interactions on a webpage, such as clicking buttons (e.g., "load more") or scrolling.
|
||||||
|
|
||||||
|
## 3. Key `crawl4ai` Building Blocks (API Reference for Vibe Coding Context)
|
||||||
|
|
||||||
|
* 3.1. Class `AsyncWebCrawler`
|
||||||
|
* 3.1.1. Purpose: The primary entry point and main tool within `crawl4ai` for orchestrating web crawling and data extraction tasks.
|
||||||
|
* 3.1.2. Initialization (`__init__`):
|
||||||
|
* Signature: `AsyncWebCrawler(self, crawler_strategy: Optional[AsyncCrawlerStrategy] = None, config: Optional[BrowserConfig] = None, base_directory: str = ..., thread_safe: bool = False, logger: Optional[AsyncLoggerBase] = None, **kwargs)`
|
||||||
|
* Parameters:
|
||||||
|
* `crawler_strategy (Optional[AsyncCrawlerStrategy])`: The underlying strategy for web crawling (e.g., `AsyncPlaywrightCrawlerStrategy`). Defaults to `AsyncPlaywrightCrawlerStrategy`.
|
||||||
|
* `config (Optional[BrowserConfig])`: Configuration for the browser instance. See section 3.5 for details.
|
||||||
|
* Other parameters are generally handled by defaults for vibe coding.
|
||||||
|
* 3.2. Method `AsyncWebCrawler.arun()`
|
||||||
|
* 3.2.1. Purpose: Executes a crawl operation on a single URL or resource.
|
||||||
|
* 3.2.2. Signature: `async def arun(self, url: str, config: Optional[CrawlerRunConfig] = None, **kwargs) -> RunManyReturn`
|
||||||
|
* 3.2.3. Parameters:
|
||||||
|
* `url (str)`: The target resource.
|
||||||
|
* Description: Can be a standard web URL (e.g., "https://example.com"), a local file path (e.g., "file:///path/to/file.html"), or raw HTML content (e.g., "raw:<html>...</html>").
|
||||||
|
* `config (Optional[CrawlerRunConfig])`: An instance of `CrawlerRunConfig` specifying how this particular crawl run should be executed. See section 3.4 for details.
|
||||||
|
* 3.3. Method `AsyncWebCrawler.arun_many()`
|
||||||
|
* 3.3.1. Purpose: Executes crawl operations on a list of URLs or resources, often concurrently.
|
||||||
|
* 3.3.2. Signature: `async def arun_many(self, urls: List[str], config: Optional[CrawlerRunConfig] = None, dispatcher: Optional[BaseDispatcher] = None, **kwargs) -> RunManyReturn`
|
||||||
|
* 3.3.3. Parameters:
|
||||||
|
* `urls (List[str])`: A list of target resources (URLs, file paths, raw HTML strings).
|
||||||
|
* `config (Optional[CrawlerRunConfig])`: An instance of `CrawlerRunConfig` applied to all URLs in the list. See section 3.4 for details.
|
||||||
|
* 3.4. Class `CrawlerRunConfig`
|
||||||
|
* 3.4.1. Purpose: Configuration object for individual crawl runs, controlling aspects like content extraction, page interaction, and output formats.
|
||||||
|
* 3.4.2. Key Parameters for Vibe Coding Context:
|
||||||
|
* `markdown_generator (Optional[MarkdownGenerationStrategy])`:
|
||||||
|
* Description: Specifies the strategy for generating Markdown.
|
||||||
|
* Default: An instance of `DefaultMarkdownGenerator`.
|
||||||
|
* Note for Vibe Coding: Can be `DefaultMarkdownGenerator(content_filter=PruningContentFilter())` for cleaner output.
|
||||||
|
* `extraction_strategy (Optional[ExtractionStrategy])`:
|
||||||
|
* Description: Specifies the strategy for extracting structured data.
|
||||||
|
* Supported Strategies (for Vibe Coding):
|
||||||
|
* `JsonCssExtractionStrategy`: For extracting data based on CSS selectors from structured HTML. Requires a `schema` dictionary.
|
||||||
|
* `LLMExtractionStrategy`: For extracting data using an LLM, often for complex or unstructured HTML. Requires an `LLMConfig` and an `instruction` or Pydantic model defining the desired output.
|
||||||
|
* `js_code (Optional[Union[str, List[str]]])`:
|
||||||
|
* Description: JavaScript code (or a list of code snippets) to be executed on the page after it loads.
|
||||||
|
* `wait_for (Optional[str])`:
|
||||||
|
* Description: A CSS selector or JavaScript expression. The crawler will wait for this condition to be met after `js_code` execution before proceeding.
|
||||||
|
* `session_id (Optional[str])`:
|
||||||
|
* Description: An identifier used to maintain the state of a browser page across multiple `arun` calls. Essential for multi-step interactions on the same page.
|
||||||
|
* `js_only (bool)`:
|
||||||
|
* Description: If `True` (and `session_id` is used), only executes `js_code` on the existing page without a full navigation/reload. Default is `False`.
|
||||||
|
* `screenshot (bool)`:
|
||||||
|
            * Description: If `True`, captures a screenshot of the page. The result is stored in `CrawlResult.screenshot`. Default is `False`.
|
||||||
|
* `pdf (bool)`:
|
||||||
|
            * Description: If `True`, generates a PDF of the page. The result is stored in `CrawlResult.pdf`. Default is `False`.
|
||||||
|
* `cache_mode (Optional[CacheMode])`:
|
||||||
|
* Description: Controls caching behavior.
|
||||||
|
* Type: `crawl4ai.cache_context.CacheMode` (Enum).
|
||||||
|
* Common Values: `CacheMode.ENABLED`, `CacheMode.BYPASS`.
|
||||||
|
* 3.5. Class `BrowserConfig`
|
||||||
|
* 3.5.1. Purpose: Configures persistent browser-level settings for an `AsyncWebCrawler` instance.
|
||||||
|
* 3.5.2. Key Parameters for Vibe Coding Context:
|
||||||
|
* `headless (bool)`:
|
||||||
|
* Description: If `True`, the browser runs without a visible UI. If `False`, the browser UI is shown.
|
||||||
|
* Default: `True`.
|
||||||
|
* `proxy_config (Optional[Union[ProxyConfig, Dict[str, str]]])`:
|
||||||
|
* Description: Configuration for using a proxy server.
|
||||||
|
* Structure (if dict): `{"server": "http://<host>:<port>", "username": "<user>", "password": "<pass>"}`.
|
||||||
|
* `user_agent (Optional[str])`:
|
||||||
|
* Description: Custom User-Agent string to be used by the browser.
|
||||||
|
* 3.6. Class `LLMConfig`
|
||||||
|
* 3.6.1. Purpose: Configures settings for interacting with Large Language Models, used by `LLMExtractionStrategy`.
|
||||||
|
* 3.6.2. Key Parameters:
|
||||||
|
* `provider (str)`:
|
||||||
|
* Description: Specifies the LLM provider and model identifier.
|
||||||
|
* Examples: "openai/gpt-4o-mini", "ollama/llama3", "anthropic/claude-3-opus-20240229".
|
||||||
|
* `api_token (Optional[str])`:
|
||||||
|
* Description: API key for the LLM provider. Can be the actual key or an environment variable reference (e.g., "env:OPENAI_API_KEY").
|
||||||
|
* 3.7. Class `CrawlResult`
|
||||||
|
* 3.7.1. Purpose: The data object returned by `crawl4ai` operations, containing the results and metadata of a crawl.
|
||||||
|
* 3.7.2. Key Attributes:
|
||||||
|
* `success (bool)`: `True` if the crawl was successful, `False` otherwise.
|
||||||
|
* `markdown (MarkdownGenerationResult)`: Object containing Markdown representations.
|
||||||
|
* `markdown.raw_markdown (str)`: Markdown generated directly from the cleaned HTML.
|
||||||
|
* `markdown.fit_markdown (str)`: Markdown potentially further processed by content filters.
|
||||||
|
* `extracted_content (Optional[str])`: JSON string of structured data if an `ExtractionStrategy` was used and successful.
|
||||||
|
* `links (Links)`: Object containing `internal` and `external` lists of `Link` objects. Each `Link` object has `href`, `text`, `title`.
|
||||||
|
* `media (Media)`: Object containing lists of `MediaItem` for `images`, `videos`, `audios`, and `tables`. Each `MediaItem` has `src`, `alt`, `score`, etc.
|
||||||
|
* `screenshot (Optional[str])`: Base64 encoded string of the PNG screenshot, if `screenshot=True`.
|
||||||
|
* `pdf (Optional[bytes])`: Raw bytes of the PDF document, if `pdf=True`.
|
||||||
|
* `error_message (Optional[str])`: Description of the error if `success` is `False`.
|
||||||
|
|
||||||
|
## 4. Common `crawl4ai` Usage Patterns (Vibe Recipes Mapped to Components)
|
||||||
|
|
||||||
|
* 4.1. Task: Get Clean Markdown from a Page
|
||||||
|
* 4.1.1. Description: Fetch a single webpage and convert its main content into clean Markdown.
|
||||||
|
* 4.1.2. Key `crawl4ai` elements:
|
||||||
|
* `AsyncWebCrawler`
|
||||||
|
* `arun()` method.
|
||||||
|
* `CrawlerRunConfig`:
|
||||||
|
* `markdown_generator`: Typically `DefaultMarkdownGenerator()`. For very clean output, `DefaultMarkdownGenerator(content_filter=PruningContentFilter())`.
|
||||||
|
* 4.2. Task: Extract All Product Names and Prices from an E-commerce Category Page
|
||||||
|
* 4.2.1. Description: Scrape structured data (e.g., product names, prices) from a page with repeating elements.
|
||||||
|
* 4.2.2. Key `crawl4ai` elements:
|
||||||
|
* `AsyncWebCrawler`
|
||||||
|
* `arun()` method.
|
||||||
|
* `CrawlerRunConfig`:
|
||||||
|
* `extraction_strategy`: `JsonCssExtractionStrategy(schema={"name_field": "h2.product-title", "price_field": "span.price"})`. The schema's CSS selectors identify where to find the data.
|
||||||
|
* 4.3. Task: Extract Key Information from an Article using an LLM
|
||||||
|
* 4.3.1. Description: Use an LLM to parse an article and extract specific fields like author, date, and a summary into a JSON format.
|
||||||
|
* 4.3.2. Key `crawl4ai` elements:
|
||||||
|
* `AsyncWebCrawler`
|
||||||
|
* `arun()` method.
|
||||||
|
* `CrawlerRunConfig`:
|
||||||
|
* `extraction_strategy`: `LLMExtractionStrategy(llm_config=..., instruction=..., schema=...)`.
|
||||||
|
* `LLMConfig`: Instance specifying `provider` (e.g., "openai/gpt-4o-mini") and `api_token`.
|
||||||
|
* Schema for `LLMExtractionStrategy`: Can be a Pydantic model definition or a dictionary describing the target JSON structure.
|
||||||
|
* 4.4. Task: Crawl Multiple Pages of a Blog (Clicking "Next Page")
|
||||||
|
* 4.4.1. Description: Navigate through paginated content by simulating clicks on "Next Page" or similar links, collecting data from each page.
|
||||||
|
* 4.4.2. Key `crawl4ai` elements:
|
||||||
|
* `AsyncWebCrawler`
|
||||||
|
* Multiple sequential calls to `arun()` (typically in a loop).
|
||||||
|
* `CrawlerRunConfig` (reused or cloned for each step):
|
||||||
|
* `session_id`: A consistent identifier (e.g., "blog_pagination_session") to maintain the browser state across `arun` calls.
|
||||||
|
* `js_code`: JavaScript to trigger the "Next Page" action (e.g., `document.querySelector('a.next-page-link').click();`).
|
||||||
|
* `wait_for`: A CSS selector or JavaScript condition to ensure the new page content has loaded before proceeding.
|
||||||
|
* `js_only=True`: For subsequent `arun` calls after the initial page load to indicate only JS interaction without full navigation.
|
||||||
|
* 4.5. Task: Get Screenshots of a List of URLs
|
||||||
|
* 4.5.1. Description: Capture screenshots for a batch of URLs.
|
||||||
|
* 4.5.2. Key `crawl4ai` elements:
|
||||||
|
* `AsyncWebCrawler`
|
||||||
|
* `arun_many()` method.
|
||||||
|
* `CrawlerRunConfig`:
|
||||||
|
* `screenshot=True`.
|
||||||
|
|
||||||
|
## 5. Key Input Considerations for `crawl4ai` Operations (Inferred from Vibe Prompting Tips)
|
||||||
|
|
||||||
|
* 5.1. Clear Objective: `crawl4ai` operations are guided by the configuration. The configuration should reflect the user's goal (e.g., Markdown generation, specific data extraction, media capture).
|
||||||
|
* 5.2. URL Input: The `arun` method requires a single `url` string. `arun_many` requires a `List[str]` of URLs.
|
||||||
|
* 5.3. Structured Data Extraction Guidance:
|
||||||
|
* For `JsonCssExtractionStrategy`, the `schema` parameter (a dictionary mapping desired field names to CSS selectors) is essential.
|
||||||
|
* For `LLMExtractionStrategy`, the `instruction` parameter (natural language description of desired data) and/or a `schema` (Pydantic model or dictionary) are crucial, along with a configured `LLMConfig`.
|
||||||
|
* 5.4. LLM Configuration: When `LLMExtractionStrategy` is used, an `LLMConfig` instance specifying `provider` and `api_token` (if applicable) must be provided.
|
||||||
|
* 5.5. Dynamic Page Handling: For pages requiring interaction, `CrawlerRunConfig` parameters like `js_code`, `wait_for`, `session_id`, and `js_only` are used.
|
||||||
|
|
||||||
|
## 6. Expected Output Data from `crawl4ai` Operations (Accessing `CrawlResult`)
|
||||||
|
|
||||||
|
* 6.1. Generated Python Code: When using an AI assistant with `crawl4ai` context, the AI is expected to generate Python code that utilizes `crawl4ai` classes and methods.
|
||||||
|
* 6.2. `CrawlResult` Object: The primary output of `arun()` and `arun_many()` calls.
|
||||||
|
* `result.success (bool)`: Indicates if the individual crawl operation was successful.
|
||||||
|
* `result.markdown.raw_markdown (str)` / `result.markdown.fit_markdown (str)`: Contains the generated Markdown content.
|
||||||
|
* `result.extracted_content (Optional[str])`: Contains the JSON string of structured data if an extraction strategy was successful.
|
||||||
|
* `result.links (Links)`: Provides access to lists of internal and external links.
|
||||||
|
* `result.media (Media)`: Provides access to lists of images, videos, audio files, and tables.
|
||||||
|
* `result.screenshot (Optional[str])`: Base64 encoded screenshot data.
|
||||||
|
* `result.pdf (Optional[bytes])`: Raw PDF data.
|
||||||
|
* `result.error_message (Optional[str])`: Error details if `success` is `False`.
|
||||||
|
* 6.3. Files on Disk: Operations like screenshot or PDF generation, or custom code within an AI-generated script, might save files to the local disk (e.g., PNGs, PDFs, JSON files). The paths depend on the configuration or the custom code.
|
||||||
|
|
||||||
|
```
|
||||||
643
docs/md_v2/assets/llmtxt/crawl4ai_vibe_reasoning_content.llm.txt
Normal file
643
docs/md_v2/assets/llmtxt/crawl4ai_vibe_reasoning_content.llm.txt
Normal file
@@ -0,0 +1,643 @@
|
|||||||
|
```markdown
|
||||||
|
# Detailed Outline for crawl4ai - vibe Component
|
||||||
|
|
||||||
|
**Target Document Type:** reasoning
|
||||||
|
**Target Output Filename Suggestion:** `llm_reasoning_vibe.md`
|
||||||
|
**Library Version Context:** 0.6.3
|
||||||
|
**Outline Generation Date:** 2025-05-24
|
||||||
|
---
|
||||||
|
|
||||||
|
# Vibe Coding with Crawl4AI: A Guide for AI Coding Assistants
|
||||||
|
|
||||||
|
## 1. Introduction: What is "Vibe Coding" with Crawl4AI?
|
||||||
|
|
||||||
|
* **1.1. Explaining the Core Idea:**
|
||||||
|
"Vibe Coding" with Crawl4AI is about empowering users, even those with limited direct coding experience, to build powerful web data applications. Instead of meticulously writing every line of Python, you focus on clearly describing your data extraction or web interaction goals to an AI coding assistant. This guide teaches you how to provide that AI assistant with the *right context* about Crawl4AI, so it can generate effective and efficient code for you. The goal is to translate your "vibe" – your high-level intent – into working Crawl4AI solutions quickly.
|
||||||
|
|
||||||
|
* **1.2. Who is this Guide For?**
|
||||||
|
This guide is designed for:
|
||||||
|
* **Users new to web scraping or `crawl4ai`:** If you prefer to articulate your needs in natural language and have an AI assistant handle the code generation, this guide is for you.
|
||||||
|
* **Data analysts, researchers, and product managers:** Anyone who needs web data but doesn't want to get bogged down in the intricacies of web scraping libraries.
|
||||||
|
* **Developers looking for rapid prototyping:** Even experienced developers can use "vibe coding" to quickly generate boilerplate or test ideas with `crawl4ai` before refining the code.
|
||||||
|
* **AI Coding Assistant Users:** This guide helps you understand what information to feed your AI to get the best `crawl4ai` code.
|
||||||
|
|
||||||
|
* **1.3. How this Guide Helps You (and Your AI Assistant):**
|
||||||
|
By understanding the concepts in this guide, you (and by extension, your AI assistant) will:
|
||||||
|
* Grasp the high-level capabilities of `crawl4ai` that are most relevant for prompting an AI.
|
||||||
|
* Learn the key terminology and building blocks of `crawl4ai` to include in your prompts for precise code generation.
|
||||||
|
* Discover common "vibe recipes" – typical data extraction tasks and how to prompt an AI to solve them using `crawl4ai`.
|
||||||
|
* Pick up effective prompting patterns to maximize the quality of AI-generated `crawl4ai` code.
|
||||||
|
|
||||||
|
## 2. High-Level Capabilities of Crawl4AI (What to Tell Your AI Assistant Crawl4AI Can Do)
|
||||||
|
|
||||||
|
When you're "vibe coding" with your AI assistant, you don't need to explain every nuance of `crawl4ai`. Instead, focus on what it *can do* for you. Here's a high-level overview of capabilities you can confidently tell your AI assistant about:
|
||||||
|
|
||||||
|
* **2.1. Fetching Any Webpage:**
|
||||||
|
* **How to tell your AI:** "Crawl4AI can fetch the content of any webpage, whether it's a simple static page or a complex JavaScript-heavy application."
|
||||||
|
* **Why it's important:** This establishes the fundamental capability – getting the raw HTML from a target URL.
|
||||||
|
|
||||||
|
* **2.2. Converting Web Content into Clean Markdown:**
|
||||||
|
* **How to tell your AI:** "Crawl4AI is great at turning messy web pages into clean, readable Markdown. This is perfect if I need to summarize an article, feed content into another LLM for Q&A, or just get the main text."
|
||||||
|
* **Why it's important:** Markdown is often the desired end-format for LLM-based tasks, and `crawl4ai` simplifies this conversion.
|
||||||
|
|
||||||
|
* **2.3. Extracting Specific Pieces of Information (Structured Data/JSON):**
|
||||||
|
* **How to tell your AI:** "If I need specific data from a page, like all the product names and prices from an e-commerce site, or all the headlines from a news page, Crawl4AI can extract that and give it to me as structured JSON."
|
||||||
|
* **Why it's important:** This highlights `crawl4ai`'s ability to go beyond simple text extraction and pull out specific, targeted information.
|
||||||
|
|
||||||
|
* **2.4. Crawling Multiple Pages at Once:**
|
||||||
|
* **How to tell your AI:** "If I have a list of URLs, Crawl4AI can process them all efficiently, often in parallel, instead of me having to do them one by one."
|
||||||
|
* **Why it's important:** This points to the library's capability for batch processing, saving significant time for larger tasks.
|
||||||
|
|
||||||
|
* **2.5. Taking Screenshots or Making PDFs of Pages:**
|
||||||
|
* **How to tell your AI:** "Crawl4AI can take a screenshot of a webpage or save it as a PDF document. This is useful for archiving, visual checks, or sharing."
|
||||||
|
* **Why it's important:** Visual capture and PDF generation are common requirements that `crawl4ai` handles easily.
|
||||||
|
|
||||||
|
* **2.6. Handling Simple Page Interactions:**
|
||||||
|
* **How to tell your AI:** "For pages that need a bit of interaction, like clicking a 'load more' button to reveal more content or scrolling down, Crawl4AI can execute JavaScript to do that."
|
||||||
|
    * **Why it's important:** This shows that `crawl4ai` isn't limited to static content and can handle basic dynamic page elements to access more data.
|
||||||
|
|
||||||
|
## 3. Key Building Blocks: Communicating Crawl4AI Concepts to Your AI Assistant
|
||||||
|
|
||||||
|
To get the best results from your AI assistant, you'll want to mention the key components of `crawl4ai` it should use. Think of these as the essential vocabulary for your "vibe" prompts.
|
||||||
|
|
||||||
|
* **3.1. `AsyncWebCrawler`**: The Main Tool
|
||||||
|
* **3.1.1. Explaining its Role:**
|
||||||
|
* **What to tell your AI:** "The main tool we'll use in `crawl4ai` is called `AsyncWebCrawler`. We always need to create an instance of this first. It's best to use it with an `async with` statement to make sure resources are managed correctly."
|
||||||
|
* **Why it's important for the AI:** This directs the AI to the central class for all crawling operations and hints at the asynchronous nature and proper resource management.
|
||||||
|
* **3.1.2. Basic Usage Pattern:**
|
||||||
|
* **Vibe Prompt:** "Let's start a `crawl4ai` script. We'll need an `AsyncWebCrawler`."
|
||||||
|
* **Expected AI action:** The AI should generate the basic setup:
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig # Common imports
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
# browser_config = BrowserConfig(...) # Optional, if needed
|
||||||
|
async with AsyncWebCrawler(# config=browser_config
|
||||||
|
) as crawler:
|
||||||
|
# ... your crawl logic will go here ...
|
||||||
|
pass
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
|
```
|
||||||
|
|
||||||
|
* **3.2. `arun(url, config)`**: Crawling a Single Page
|
||||||
|
* **3.2.1. Explaining its Purpose:**
|
||||||
|
* **What to tell your AI:** "To crawl a single page, we'll use the `arun` method of the `AsyncWebCrawler` instance. It takes the URL and an optional configuration object."
|
||||||
|
* **Why it's important for the AI:** This points to the primary method for individual page crawls.
|
||||||
|
* **3.2.2. Guiding the AI on `url` Specification:**
|
||||||
|
* **What to tell your AI:** "The `url` for `arun` can be a regular web address (like 'https://example.com'), a local file path (if it starts with `file://`), or even raw HTML content (if it starts with `raw:`)."
|
||||||
|
* **Vibe Prompts & AI Interpretation:**
|
||||||
|
* "Crawl this URL: https://my-target-site.com" -> AI uses `url="https://my-target-site.com"`
|
||||||
|
* "Process this local HTML file: /path/to/my/page.html" -> AI uses `url="file:///path/to/my/page.html"`
|
||||||
|
* "I have some raw HTML I want to parse: `<html><body><h1>Test</h1></body></html>`" -> AI uses `url="raw:<html><body><h1>Test</h1></body></html>"`
|
||||||
|
* **3.2.3. Emphasizing `config` (`CrawlerRunConfig`):**
|
||||||
|
* **What to tell your AI:** "The most important part for customizing a crawl is the `config` parameter of `arun`. This is where we'll use `CrawlerRunConfig` to tell `crawl4ai` *how* to crawl, what to extract, and what to do with the page."
|
||||||
|
* **Vibe Prompt:** "...and when you call `arun`, pass in a `CrawlerRunConfig` object with these settings: [describe settings]."
|
||||||
|
|
||||||
|
* **3.3. `arun_many(urls, config)`**: Crawling Multiple Pages
|
||||||
|
* **3.3.1. Explaining its Purpose:**
|
||||||
|
* **What to tell your AI:** "If I have a list of URLs, we should use `arun_many` instead of calling `arun` in a loop. It's more efficient for batch crawling."
|
||||||
|
* **Why it's important for the AI:** Guides the AI to use the optimized method for multiple URLs, which handles concurrency internally.
|
||||||
|
* **3.3.2. Vibe Prompt:**
|
||||||
|
* "Crawl all these pages: https://example.com/page1, https://example.com/page2, https://example.com/page3."
|
||||||
|
* **Expected AI action:**
|
||||||
|
```python
|
||||||
|
# ... inside async def main() and async with AsyncWebCrawler ...
|
||||||
|
list_of_urls = [
|
||||||
|
"https://example.com/page1",
|
||||||
|
"https://example.com/page2",
|
||||||
|
"https://example.com/page3"
|
||||||
|
]
|
||||||
|
# run_config = CrawlerRunConfig(...) # Define config for all URLs
|
||||||
|
results = await crawler.arun_many(urls=list_of_urls, config=run_config)
|
||||||
|
for result in results:
|
||||||
|
# process each result
|
||||||
|
pass
|
||||||
|
```
|
||||||
|
|
||||||
|
* **3.4. `CrawlerRunConfig`**: Tailoring the Crawl (Simplified for Vibe Coding)
|
||||||
|
* **3.4.1. Getting Markdown:**
|
||||||
|
* **Basic Request Vibe:** "Get the Markdown for this page [URL]."
|
||||||
|
* **AI's understanding:** The AI should know that `crawl4ai` produces Markdown by default, so a very simple `CrawlerRunConfig()` or even `None` might suffice if no other settings are needed.
|
||||||
|
* **For Cleaner Markdown Vibe:** "I need very clean Markdown from [URL], suitable for feeding into another LLM. Remove boilerplate like headers, footers, and navigation."
|
||||||
|
* **AI's understanding:** "Ah, the user wants cleaner output. I should suggest or use `DefaultMarkdownGenerator` combined with `PruningContentFilter`. The `PruningContentFilter` heuristically tries to remove common non-content sections."
|
||||||
|
* **Code Snippet AI might generate:**
|
||||||
|
```python
|
||||||
|
from crawl4ai import CrawlerRunConfig, DefaultMarkdownGenerator
|
||||||
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||||
|
|
||||||
|
markdown_generator = DefaultMarkdownGenerator(
|
||||||
|
content_filter=PruningContentFilter()
|
||||||
|
)
|
||||||
|
run_config = CrawlerRunConfig(
|
||||||
|
markdown_generator=markdown_generator
|
||||||
|
)
|
||||||
|
# result = await crawler.arun(url="...", config=run_config)
|
||||||
|
# print(result.markdown.fit_markdown) # Access the filtered markdown
|
||||||
|
```
|
||||||
|
* **Reasoning to convey (implicitly or explicitly):** `PruningContentFilter` is a good first step for general boilerplate removal. `DefaultMarkdownGenerator` then converts the filtered HTML to Markdown. Accessing `result.markdown.fit_markdown` gives this cleaner version.
|
||||||
|
* **3.4.2. Extracting Specific Data (JSON):**
|
||||||
|
* **3.4.2.1. Using `JsonCssExtractionStrategy` for Structured Pages:**
|
||||||
|
* **When to suggest it (Vibe):** "The page I want to crawl ([URL]) has a list of items, like products on an e-commerce category page, and each item has a similar layout. I want to extract these repeating items into a JSON list."
|
||||||
|
* **Information to provide the AI (Vibe):** "For each item, I want to get the 'product_name', which is usually in an `<h2>` tag, and the 'price', which seems to be in a `<span>` tag with a class like 'price-tag' or 'current-price'."
|
||||||
|
* **AI's Role & Reasoning:** The AI should recognize this pattern and suggest `JsonCssExtractionStrategy`. It understands that the user is describing a schema. The AI's job is to translate "name from h2" into `{"name": "product_name", "selector": "h2", "type": "text"}` within the `fields` list of a schema dictionary, and the overall repeating item selector into `baseSelector`. The AI should also know to set `extraction_type="schema"` on `LLMExtractionStrategy` if it were using that for schema generation, but here it's direct CSS.
|
||||||
|
* **Code Snippet AI might generate:**
|
||||||
|
```python
|
||||||
|
from crawl4ai import CrawlerRunConfig
|
||||||
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||||
|
|
||||||
|
# AI would help construct this schema based on user's description
|
||||||
|
schema = {
|
||||||
|
"name": "ProductList",
|
||||||
|
"baseSelector": "div.product-item", # Example selector for each product block
|
||||||
|
"fields": [
|
||||||
|
{"name": "product_name", "selector": "h2.product-title", "type": "text"},
|
||||||
|
{"name": "price", "selector": "span.price-tag", "type": "text"}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
extraction_strategy = JsonCssExtractionStrategy(schema=schema)
|
||||||
|
run_config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
|
||||||
|
# result = await crawler.arun(url="...", config=run_config)
|
||||||
|
# if result.success and result.extracted_content:
|
||||||
|
# products = json.loads(result.extracted_content)
|
||||||
|
# for product in products:
|
||||||
|
# print(f"Name: {product.get('product_name')}, Price: {product.get('price')}")
|
||||||
|
```
|
||||||
|
* **3.4.2.2. Using `LLMExtractionStrategy` for Complex/Unclear Structures:**
|
||||||
|
* **When to suggest it (Vibe):** "The page ([URL]) has the information I want, but it's not in a clear, repeating list, or it's mixed in with a lot of text. I need the AI to understand the content to pull out specific details." Or, "I want to extract information that requires some interpretation, like summarizing a paragraph."
|
||||||
|
* **Information to provide the AI (Vibe):**
|
||||||
|
* "Use `LLMExtractionStrategy` for this."
|
||||||
|
* "The LLM I want to use is [LLM provider/model, e.g., 'openai/gpt-4o-mini'] and my API key is [YOUR_API_KEY_OR_ENV_VAR_NAME] (or tell it to look for an env var)."
|
||||||
|
* **Option A (Describing fields):** "I need a JSON object with the following fields: 'author_name', 'article_publish_date', and a 'short_summary' (about 2 sentences)."
|
||||||
|
* **Option B (Example JSON):** "The JSON output should look something like this: `{\"author\": \"Jane Doe\", \"published_on\": \"2024-05-23\", \"summary\": \"This article discusses...\"}`."
|
||||||
|
* **Option C (Pydantic Model - more advanced but best for AI):** "Here's a Pydantic model that defines the structure I want: [Pydantic Class Code Snippet]. Use this for the schema."
|
||||||
|
* **AI's Role & Reasoning:** The AI needs to construct an `LLMConfig` and an `LLMExtractionStrategy`. If the user provides field descriptions or an example JSON, the AI can generate a simple schema dictionary. If a Pydantic model is provided, the AI should use `MyPydanticModel.model_json_schema()` to create the schema for `LLMExtractionStrategy`. This strategy is powerful because it leverages the LLM's understanding.
|
||||||
|
* **Code Snippet AI might generate (with Pydantic example):**
|
||||||
|
```python
|
||||||
|
from crawl4ai import CrawlerRunConfig, LLMConfig
|
||||||
|
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||||
|
from pydantic import BaseModel, Field # Assuming user might provide this
|
||||||
|
|
||||||
|
# User might provide this, or AI generates it from description
|
||||||
|
class ArticleInfo(BaseModel):
|
||||||
|
author_name: str = Field(description="The main author of the article")
|
||||||
|
publication_date: str = Field(description="The date the article was published, e.g., YYYY-MM-DD")
|
||||||
|
short_summary: str = Field(description="A concise 2-3 sentence summary of the article")
|
||||||
|
|
||||||
|
llm_config = LLMConfig(
|
||||||
|
provider="openai/gpt-4o-mini", # Or user's choice
|
||||||
|
api_token="env:OPENAI_API_KEY" # Or direct key if user insists and understands risk
|
||||||
|
)
|
||||||
|
extraction_strategy = LLMExtractionStrategy(
|
||||||
|
llm_config=llm_config,
|
||||||
|
schema=ArticleInfo.model_json_schema(),
|
||||||
|
# instruction="Extract author, publication date, and a summary." # Could also be used
|
||||||
|
extraction_type="schema" # Important for Pydantic/JSON schema
|
||||||
|
)
|
||||||
|
run_config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
|
||||||
|
# result = await crawler.arun(url="...", config=run_config)
|
||||||
|
# if result.success and result.extracted_content:
|
||||||
|
# article_data = json.loads(result.extracted_content) # Or ArticleInfo.model_validate_json(result.extracted_content)
|
||||||
|
# print(article_data)
|
||||||
|
```
|
||||||
|
* **3.4.3. Interacting with Pages (Dynamic Content):**
|
||||||
|
* **How to tell your AI (Vibe):** "This page ([URL]) loads more content when you scroll down, or when you click a 'Show More' button. `crawl4ai` needs to perform this interaction."
|
||||||
|
* **For clicking (Vibe):** "To get all the data, we need to click the button with text 'Load All Comments'."
|
||||||
|
* **AI's understanding:** This requires `js_code` to find and click the button. The AI should be guided that finding elements by text might involve more complex JS like `Array.from(document.querySelectorAll('button')).find(btn => btn.textContent.includes('Load All Comments')).click();`.
|
||||||
|
* **For scrolling (Vibe):** "Scroll to the bottom of the page to make sure everything loads."
|
||||||
|
* **AI's understanding:** `js_code` like `window.scrollTo(0, document.body.scrollHeight);`
|
||||||
|
* **Ensuring actions complete (Vibe):** "After clicking 'Load More', wait for the new items to appear. They usually show up in a `div` with class `comment-list` and we expect more than 10 comments."
|
||||||
|
* **AI's understanding:** Use `wait_for`. This can be a CSS selector (e.g., `wait_for="css:.comment-list .comment-item:nth-child(11)"`) or a JS condition (e.g., `wait_for="js:() => document.querySelectorAll('.comment-item').length > 10"`).
|
||||||
|
* **For multi-step interactions on the same page (Vibe):** "I need to first click 'Agree to Cookies', then click 'Show Details', then extract the text. Make sure these happen on the same page view."
|
||||||
|
* **AI's understanding:** "This requires a `session_id` to persist the page state across multiple `arun` calls. Each `arun` call will perform one step of the interaction."
|
||||||
|
* **If only JS interaction is needed (Vibe):** "After the first page load, the next actions (like clicking 'Next Page') only update part of the page with JavaScript, they don't reload everything."
|
||||||
|
* **AI's understanding:** "For these subsequent `arun` calls within the same session, set `js_only=True` in `CrawlerRunConfig` to prevent unnecessary full page navigations, making it faster."
|
||||||
|
* **3.4.4. Taking Screenshots or PDFs:**
|
||||||
|
* **Simple request (Vibe):** "Get me a screenshot of [URL]."
|
||||||
|
* **AI's understanding:** Set `screenshot=True` in `CrawlerRunConfig`.
|
||||||
|
* **Or (Vibe):** "I need a PDF version of [URL]."
|
||||||
|
* **AI's understanding:** Set `pdf=True` in `CrawlerRunConfig`.
|
||||||
|
* **3.4.5. Caching for Speed and Freshness:**
|
||||||
|
* **For faster testing/development (Vibe):** "I'm testing my script for [URL]. Can you make it use the cache so it's faster after the first run?"
|
||||||
|
* **AI's understanding:** Use `cache_mode=CacheMode.ENABLED`.
|
||||||
|
* **Reasoning:** This saves time during development by re-using previously fetched content.
|
||||||
|
* **To get the latest data (Vibe):** "I need the absolute latest version of [URL], don't use any cached data."
|
||||||
|
* **AI's understanding:** Use `cache_mode=CacheMode.BYPASS`.
|
||||||
|
* **Reasoning:** Ensures `crawl4ai` fetches a fresh copy from the web, ignoring any local cache. This is crucial for production or when data changes frequently.
|
||||||
|
|
||||||
|
* **3.5. `BrowserConfig`**: Browser-Level Settings (Simplified for Vibe Coding)
|
||||||
|
* **3.5.1. When it's usually not needed:**
|
||||||
|
* **What to tell your AI (or what it should infer):** "For most simple crawls where you just need to fetch a page and get its content, you probably don't need to worry about `BrowserConfig`. `crawl4ai` uses sensible defaults."
|
||||||
|
* **3.5.2. Headless Mode:**
|
||||||
|
* **Default behavior (Vibe):** "By default, `crawl4ai` runs 'headless', which means you won't see a browser window pop up. This is usually what we want for automated scripts."
|
||||||
|
* **To see the browser (Vibe):** "I'm having trouble with [URL], and I want to see what the browser is doing. Can you make it run in non-headless mode?"
|
||||||
|
* **AI's understanding:** The user wants to visually debug. Set `headless=False` in `BrowserConfig`.
|
||||||
|
* **3.5.3. Using a Proxy:**
|
||||||
|
* **How to ask (Vibe):** "I need to make `crawl4ai` use a proxy server for this crawl."
|
||||||
|
* **Information to provide (Vibe):** "The proxy server address is `http://myproxy.server.com:8080`. If it needs a username and password, they are `myuser` and `mypass`. Make sure to use the `proxy_config` setting in `BrowserConfig`."
|
||||||
|
* **AI's understanding:** The AI should construct a `ProxyConfig` object (or dictionary that `BrowserConfig` can handle) and pass it to `BrowserConfig`.
|
||||||
|
* **3.5.4. Changing User Agent:**
|
||||||
|
* **How to ask (Vibe):** "The website [URL] might be blocking default user agents. Can we make `crawl4ai` look like it's Firefox on a Mac?"
|
||||||
|
* **Information to provide (Vibe):** "You can set a custom `user_agent` string in `BrowserConfig`. For example, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/115.0'."
|
||||||
|
* **AI's understanding:** The AI should pass the provided string to the `user_agent` parameter of `BrowserConfig`.
|
||||||
|
|
||||||
|
* **3.6. `LLMConfig`**: Configuring Language Models (Simplified for Vibe Coding)
|
||||||
|
* **3.6.1. When it's needed:**
|
||||||
|
* **What to tell your AI:** "If we're using `LLMExtractionStrategy` to extract structured data or `LLMContentFilter` to clean up content, we need to tell `crawl4ai` which language model to use. This is done with an `LLMConfig` object."
|
||||||
|
* **3.6.2. Information to provide the AI (Vibe):**
|
||||||
|
* **Model choice:** "For this task, let's use the `provider` called 'openai/gpt-4o-mini'." (Other examples: 'ollama/llama3', 'anthropic/claude-3-opus-20240229').
|
||||||
|
* **API Key:** "My `api_token` for this provider is [YOUR_API_KEY_PLACEHOLDER]. (Best practice is to tell the AI to get it from an environment variable, e.g., 'env:OPENAI_API_KEY')."
|
||||||
|
* **AI's understanding:** The AI will create an `LLMConfig(provider="...", api_token="...")` and pass it to the relevant strategy.
|
||||||
|
* **Code Snippet AI might generate:**
|
||||||
|
```python
|
||||||
|
from crawl4ai import LLMConfig
|
||||||
|
# For OpenAI
|
||||||
|
llm_conf = LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY")
|
||||||
|
# For Ollama (locally running Llama3)
|
||||||
|
# llm_conf = LLMConfig(provider="ollama/llama3") # api_token often not needed for local Ollama
|
||||||
|
```
|
||||||
|
|
||||||
|
* **3.7. The `CrawlResult`**: Understanding What You Get Back
|
||||||
|
* **3.7.1. Checking for Success:**
|
||||||
|
* **What to tell your AI (Crucial Vibe):** "When `crawl4ai` finishes an `arun` or `arun_many` call, the most important first step is to check if it was successful. Tell the AI to always generate code that checks `result.success`. This will be `True` or `False`."
|
||||||
|
* **If `False` (Vibe):** "If `result.success` is `False`, the AI should print or log `result.error_message` to tell us what went wrong."
|
||||||
|
* **3.7.2. Accessing Markdown Content:**
|
||||||
|
* **Raw Markdown (Vibe):** "The main text content of the page, converted to Markdown, is usually in `result.markdown.raw_markdown`."
|
||||||
|
* **Filtered Markdown (Vibe):** "If we used a content filter (like `PruningContentFilter`), the cleaner, more focused Markdown will be in `result.markdown.fit_markdown`."
|
||||||
|
* **3.7.3. Accessing Extracted Structured Data (JSON):**
|
||||||
|
* **Where to find it (Vibe):** "If we asked `crawl4ai` to extract specific structured data (using `JsonCssExtractionStrategy` or `LLMExtractionStrategy`), that data will be in `result.extracted_content`."
|
||||||
|
* **How to use it (Vibe):** "The `result.extracted_content` is a JSON string. To use it in Python, tell the AI to parse it using `json.loads()`."
|
||||||
|
* **3.7.4. Accessing Links:**
|
||||||
|
* **What it contains (Vibe):** "`result.links` is a dictionary. It usually has two keys: 'internal' and 'external'. Each of these is a list of links found on the page, with details like the link text and URL."
|
||||||
|
* **3.7.5. Accessing Media Information:**
|
||||||
|
* **What it contains (Vibe):** "`result.media` is another dictionary. It can contain lists of 'images', 'videos', or 'audios' found on the page, along with their URLs and sometimes alt text or descriptions."
|
||||||
|
* **3.7.6. Accessing Screenshots and PDFs:**
|
||||||
|
* **Screenshots (Vibe):** "If we set `screenshot=True` in the config, the screenshot image data (which is a base64 encoded string) will be in `result.screenshot`."
|
||||||
|
* **PDFs (Vibe):** "If we set `pdf=True`, the PDF data (as raw bytes) will be in `result.pdf`."
|
||||||
|
|
||||||
|
## 4. Common "Vibe" Recipes & Prompting Patterns for AI Assistants
|
||||||
|
|
||||||
|
Here are some common scenarios and how you can prompt your AI assistant to use `crawl4ai` to solve them.
|
||||||
|
|
||||||
|
* **4.1. Recipe 1: Get Clean Markdown from a Page**
|
||||||
|
* **4.1.1. User Vibe Prompt to AI:**
|
||||||
|
* *Example 1 (Basic):* "Hey AI, can you use `crawl4ai` to get me the clean Markdown content of `https://blog.example.com/my-latest-post`?"
|
||||||
|
* *Example 2 (Requesting more cleaning):* "AI, I need *very* clean Markdown from `https://some.news.site/article123`. It's for an LLM, so please try to remove ads, navigation, and footers."
|
||||||
|
* **4.1.2. Key `crawl4ai` elements the AI should use (and why):**
|
||||||
|
* **`AsyncWebCrawler`:** The starting point for any crawl.
|
||||||
|
* **`arun`:** Since it's a single URL.
|
||||||
|
* **`CrawlerRunConfig`:**
|
||||||
|
* For Example 1, the default `DefaultMarkdownGenerator` might be sufficient.
|
||||||
|
* For Example 2 ("very clean"), the AI should be guided to use `DefaultMarkdownGenerator(content_filter=PruningContentFilter())`.
|
||||||
|
* **Reasoning:** `PruningContentFilter` is designed to heuristically remove common boilerplate like headers, footers, and navigation bars before Markdown conversion, leading to cleaner text suitable for LLMs.
|
||||||
|
* **4.1.3. Expected Output from AI-generated code:**
|
||||||
|
* A Python script that initializes `AsyncWebCrawler`, calls `arun` with the appropriate URL and config.
|
||||||
|
* The script should then access and print (or save) `result.markdown.raw_markdown` (for basic) or `result.markdown.fit_markdown` (if `PruningContentFilter` was used).
|
||||||
|
* **Code Example (for "very clean"):**
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
|
||||||
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||||
|
|
||||||
|
async def get_clean_markdown(url_to_crawl):
|
||||||
|
markdown_generator = DefaultMarkdownGenerator(
|
||||||
|
content_filter=PruningContentFilter()
|
||||||
|
)
|
||||||
|
run_config = CrawlerRunConfig(
|
||||||
|
markdown_generator=markdown_generator,
|
||||||
|
cache_mode="BYPASS" # Ensure fresh crawl for demo
|
||||||
|
)
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url=url_to_crawl, config=run_config)
|
||||||
|
if result.success:
|
||||||
|
print(f"--- Fit Markdown for {url_to_crawl} ---")
|
||||||
|
print(result.markdown.fit_markdown)
|
||||||
|
# You might also want to see raw_markdown to compare
|
||||||
|
# print(f"--- Raw Markdown for {url_to_crawl} ---")
|
||||||
|
# print(result.markdown.raw_markdown)
|
||||||
|
else:
|
||||||
|
print(f"Failed to crawl {url_to_crawl}: {result.error_message}")
|
||||||
|
|
||||||
|
# asyncio.run(get_clean_markdown("https://en.wikipedia.org/wiki/Python_(programming_language)"))
|
||||||
|
```
|
||||||
|
|
||||||
|
* **4.2. Recipe 2: Extract All Product Names and Prices from an E-commerce Category Page**
|
||||||
|
* **4.2.1. User Vibe Prompt to AI:**
|
||||||
|
* *Example:* "AI, I need to use `crawl4ai` to get all product names and their prices from `https://www.example-store.com/laptops`. On that page, product names look like they are in `<h3>` tags with a class `product-title`, and prices are in `<span>` elements with the class `final-price`."
|
||||||
|
* **4.2.2. Key `crawl4ai` elements AI should use (and why):**
|
||||||
|
* **`AsyncWebCrawler`**, **`arun`**.
|
||||||
|
* **`CrawlerRunConfig`** with **`JsonCssExtractionStrategy`**.
|
||||||
|
* **Reasoning:** The user described a page with repeating structured items. `JsonCssExtractionStrategy` is ideal for this as it uses CSS selectors to pinpoint the data. The AI's task is to translate the user's description of element locations into a valid schema for the strategy.
|
||||||
|
* The AI needs to understand that `baseSelector` in the schema should target the container for each product, and `fields` will target individual pieces of data within that container.
|
||||||
|
* **4.2.3. Expected Output from AI-generated code:**
|
||||||
|
* A Python script that defines the schema dictionary.
|
||||||
|
* Initializes `JsonCssExtractionStrategy` with this schema.
|
||||||
|
* Passes the strategy to `CrawlerRunConfig`.
|
||||||
|
* After `arun`, it parses `result.extracted_content` using `json.loads()` and likely iterates through the list of extracted product dictionaries.
|
||||||
|
* **Code Example:**
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||||
|
|
||||||
|
async def extract_products(url_to_crawl):
|
||||||
|
# AI helps create this schema based on user's description
|
||||||
|
product_schema = {
|
||||||
|
"name": "LaptopList",
|
||||||
|
"baseSelector": "div.product-listing-item", # Hypothetical selector for each product's container
|
||||||
|
"fields": [
|
||||||
|
{"name": "product_name", "selector": "h3.product-title", "type": "text"},
|
||||||
|
{"name": "price", "selector": "span.final-price", "type": "text"}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
extraction_strategy = JsonCssExtractionStrategy(schema=product_schema)
|
||||||
|
run_config = CrawlerRunConfig(
|
||||||
|
extraction_strategy=extraction_strategy,
|
||||||
|
cache_mode="BYPASS"
|
||||||
|
)
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url=url_to_crawl, config=run_config)
|
||||||
|
if result.success and result.extracted_content:
|
||||||
|
products = json.loads(result.extracted_content)
|
||||||
|
print(f"Found {len(products)} products:")
|
||||||
|
for i, product in enumerate(products[:3]): # Print first 3
|
||||||
|
print(f" Product {i+1}: Name='{product.get('product_name')}', Price='{product.get('price')}'")
|
||||||
|
else:
|
||||||
|
print(f"Failed to extract products from {url_to_crawl}: {result.error_message}")
|
||||||
|
|
||||||
|
# asyncio.run(extract_products("https://www.example-store.com/laptops")) # Replace with a real URL for testing
|
||||||
|
```
|
||||||
|
|
||||||
|
* **4.3. Recipe 3: Extract Key Information from an Article using an LLM**
|
||||||
|
* **4.3.1. User Vibe Prompt to AI:**
|
||||||
|
* *Example:* "AI, I want `crawl4ai` to read this article: `https://example.com/news/ai-breakthrough`. Use `openai/gpt-4o-mini` to extract the author's name, the publication date, and a short (2-3 sentence) summary. The output should be JSON. My OpenAI API key is in the `OPENAI_API_KEY` environment variable."
|
||||||
|
* **4.3.2. Key `crawl4ai` elements AI should use (and why):**
|
||||||
|
* **`AsyncWebCrawler`**, **`arun`**.
|
||||||
|
* **`CrawlerRunConfig`** with **`LLMExtractionStrategy`**.
|
||||||
|
* **`LLMConfig`**: To specify the `provider` ("openai/gpt-4o-mini") and `api_token` ("env:OPENAI_API_KEY").
|
||||||
|
* **Reasoning:** The task requires understanding and summarization, making `LLMExtractionStrategy` suitable. The AI needs to construct a schema (either a simple dictionary or a Pydantic model `model_json_schema()`) that tells the LLM what fields to populate. The instruction to the LLM will be implicitly derived from the schema field descriptions or can be explicitly provided.
|
||||||
|
* **4.3.3. Expected Output from AI-generated code:**
|
||||||
|
* Python script that defines a Pydantic model (or a dictionary schema).
|
||||||
|
* Initializes `LLMConfig` and `LLMExtractionStrategy`.
|
||||||
|
* Parses `result.extracted_content`.
|
||||||
|
* **Code Example (using Pydantic):**
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||||
|
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||||
|
|
||||||
|
class ArticleDetails(BaseModel):
|
||||||
|
author_name: str = Field(..., description="The main author of the article.")
|
||||||
|
publication_date: str = Field(..., description="The date the article was published (e.g., YYYY-MM-DD).")
|
||||||
|
summary: str = Field(..., description="A concise 2-3 sentence summary of the article.")
|
||||||
|
|
||||||
|
async def extract_article_info_llm(url_to_crawl):
|
||||||
|
if not os.getenv("OPENAI_API_KEY"): # Or your specific key variable
|
||||||
|
print("API key environment variable not set. Skipping LLM extraction.")
|
||||||
|
return
|
||||||
|
|
||||||
|
llm_config = LLMConfig(
|
||||||
|
provider="openai/gpt-4o-mini", # Use a cost-effective model for demos
|
||||||
|
api_token="env:OPENAI_API_KEY"
|
||||||
|
)
|
||||||
|
extraction_strategy = LLMExtractionStrategy(
|
||||||
|
llm_config=llm_config,
|
||||||
|
schema=ArticleDetails.model_json_schema(),
|
||||||
|
extraction_type="schema" # Crucial for Pydantic/JSON schema
|
||||||
|
)
|
||||||
|
run_config = CrawlerRunConfig(
|
||||||
|
extraction_strategy=extraction_strategy,
|
||||||
|
cache_mode="BYPASS"
|
||||||
|
)
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url=url_to_crawl, config=run_config)
|
||||||
|
if result.success and result.extracted_content:
|
||||||
|
try:
|
||||||
|
article_data = ArticleDetails.model_validate_json(result.extracted_content)
|
||||||
|
print(f"Extracted Article Info for {url_to_crawl}:")
|
||||||
|
print(json.dumps(article_data.model_dump(), indent=2))
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error parsing LLM output: {e}")
|
||||||
|
print(f"Raw LLM output: {result.extracted_content}")
|
||||||
|
else:
|
||||||
|
print(f"Failed to extract article info from {url_to_crawl}: {result.error_message}")
|
||||||
|
|
||||||
|
# asyncio.run(extract_article_info_llm("https://www.example.com/news/ai-breakthrough")) # Replace with real article
|
||||||
|
```
|
||||||
|
|
||||||
|
* **4.4. Recipe 4: Crawl the first 3 pages of a blog (clicking "Next Page")**
|
||||||
|
* **4.4.1. User Vibe Prompt to AI:**
|
||||||
|
* *Example:* "AI, can you use `crawl4ai` to get the Markdown from the first 3 pages of `https://myblog.example.com/archive`? To get to the next page, I think you need to click a link that says 'Older Posts'."
|
||||||
|
* **4.4.2. Key `crawl4ai` elements AI should use (and why):**
|
||||||
|
* **`AsyncWebCrawler`**.
|
||||||
|
* **Multiple `arun` calls** in a loop (3 iterations).
|
||||||
|
* **`CrawlerRunConfig`** with:
|
||||||
|
* `session_id="blog_session"`: **Crucial** for maintaining the browser state (cookies, current page) across the multiple clicks.
|
||||||
|
* `js_code`: JavaScript to find and click the "Older Posts" link. The AI might need to generate robust JS like:
|
||||||
|
`Array.from(document.querySelectorAll('a')).find(a => a.textContent.trim() === 'Older Posts')?.click();`
|
||||||
|
* `wait_for`: After clicking, wait for a condition that indicates the next page has loaded (e.g., a specific element on the new page, or a change in an existing element). This can be tricky and might require some iteration. Note that `wait_for` takes a CSS selector or JS condition, not a duration; for a simple fixed pause as a starting point, use `delay_before_return_html=3.0` (seconds) instead, and use `wait_for_timeout` (milliseconds) to cap how long a `wait_for` condition is awaited.
|
||||||
|
* `js_only=True`: For the second and third `arun` calls, after the initial page load. This tells `crawl4ai` to only execute the JS and not perform a full new navigation to the original URL.
|
||||||
|
* **4.4.3. Expected Output from AI-generated code:**
|
||||||
|
* A Python script with a loop that calls `arun` three times.
|
||||||
|
* The script should collect and potentially print or save the Markdown from each page.
|
||||||
|
* **Code Example:**
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
|
async def crawl_blog_pages(start_url, num_pages=3):
|
||||||
|
session_id = "my_blog_crawl_session"
|
||||||
|
all_markdowns = []
|
||||||
|
|
||||||
|
# JavaScript to find and click "Older Posts" (example)
|
||||||
|
js_click_older_posts = """
|
||||||
|
(() => {
|
||||||
|
const links = Array.from(document.querySelectorAll('a'));
|
||||||
|
const olderPostsLink = links.find(a => a.textContent.trim().toLowerCase() === 'older posts');
|
||||||
|
if (olderPostsLink) {
|
||||||
|
olderPostsLink.click();
|
||||||
|
return true; // Indicate click was attempted
|
||||||
|
}
|
||||||
|
return false; // Indicate link not found
|
||||||
|
})();
|
||||||
|
"""
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
current_url = start_url
|
||||||
|
for i in range(num_pages):
|
||||||
|
print(f"Crawling page {i+1}...")
|
||||||
|
run_config_dict = {
|
||||||
|
"session_id": session_id,
|
||||||
|
"cache_mode": CacheMode.BYPASS,
|
||||||
|
                    "delay_before_return_html": 2.0 # Wait 2s for content to load after click ("wait_for" expects a selector/JS condition, not a duration)
|
||||||
|
}
|
||||||
|
if i > 0: # For subsequent pages, click and don't re-navigate
|
||||||
|
run_config_dict["js_code"] = js_click_older_posts
|
||||||
|
run_config_dict["js_only"] = True
|
||||||
|
|
||||||
|
run_config = CrawlerRunConfig(**run_config_dict)
|
||||||
|
|
||||||
|
result = await crawler.arun(url=current_url, config=run_config) # URL is mainly for context in js_only
|
||||||
|
|
||||||
|
if result.success:
|
||||||
|
print(f" Page {i+1} ({result.url}) - Markdown length: {len(result.markdown.raw_markdown)}")
|
||||||
|
all_markdowns.append({"url": result.url, "markdown": result.markdown.raw_markdown})
|
||||||
|
if i < num_pages - 1 and i > 0 and not run_config_dict.get("js_code_executed_successfully", True): # Hypothetical flag
|
||||||
|
print(f" 'Older Posts' link might not have been found or clicked on page {i+1}. Stopping.")
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
print(f" Failed to crawl page {i+1}: {result.error_message}")
|
||||||
|
break
|
||||||
|
|
||||||
|
# Important: Clean up the session
|
||||||
|
await crawler.crawler_strategy.kill_session(session_id)
|
||||||
|
|
||||||
|
print(f"\nCollected markdown for {len(all_markdowns)} pages.")
|
||||||
|
# For demo, print first 100 chars of each
|
||||||
|
# for i, md_data in enumerate(all_markdowns):
|
||||||
|
# print(f"\n--- Page {i+1} URL: {md_data['url']} ---")
|
||||||
|
# print(md_data['markdown'][:100] + "...")
|
||||||
|
|
||||||
|
# asyncio.run(crawl_blog_pages("YOUR_BLOG_START_URL_HERE"))
|
||||||
|
```
|
||||||
|
|
||||||
|
* **4.5. Recipe 5: Get Screenshots of a List of URLs**
|
||||||
|
* **4.5.1. User Vibe Prompt to AI:**
|
||||||
|
* *Example:* "AI, use `crawl4ai` to take a screenshot of each of these pages: `https://example.com`, `https://crawl4ai.com`, `https://github.com`. Save them as `example_com.png`, `crawl4ai_com.png`, and `github_com.png`."
|
||||||
|
* **4.5.2. Key `crawl4ai` elements AI should use (and why):**
|
||||||
|
* **`AsyncWebCrawler`**.
|
||||||
|
* **`arun_many`**: Efficient for processing a list of URLs.
|
||||||
|
* **`CrawlerRunConfig`** with `screenshot=True`.
|
||||||
|
* **Reasoning:** `arun_many` will process each URL with the same config. The AI needs to add logic to iterate through the results and save each `result.screenshot` (which is base64 data) to a uniquely named file.
|
||||||
|
* **4.5.3. Expected Output from AI-generated code:**
|
||||||
|
* Python script.
|
||||||
|
* PNG files saved to the current directory or a specified output directory.
|
||||||
|
* **Code Example:**
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import os
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
|
async def take_screenshots(urls_to_screenshot):
|
||||||
|
run_config = CrawlerRunConfig(
|
||||||
|
screenshot=True,
|
||||||
|
cache_mode=CacheMode.BYPASS # Get fresh screenshots
|
||||||
|
)
|
||||||
|
output_dir = "screenshots_output"
|
||||||
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
results = await crawler.arun_many(urls=urls_to_screenshot, config=run_config)
|
||||||
|
|
||||||
|
for result in results:
|
||||||
|
if result.success and result.screenshot:
|
||||||
|
# Create a filename from the URL
|
||||||
|
parsed_url = urlparse(result.url)
|
||||||
|
filename = "".join(c if c.isalnum() else '_' for c in parsed_url.netloc + parsed_url.path)
|
||||||
|
if not filename or filename == "_": # Handle root path or empty paths
|
||||||
|
filename = "homepage"
|
||||||
|
                    filepath = os.path.join(output_dir, f"{filename}.png")
|
||||||
|
|
||||||
|
try:
|
||||||
|
screenshot_data = base64.b64decode(result.screenshot)
|
||||||
|
with open(filepath, "wb") as f:
|
||||||
|
f.write(screenshot_data)
|
||||||
|
print(f"Screenshot saved to {filepath}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error saving screenshot for {result.url}: {e}")
|
||||||
|
elif not result.success:
|
||||||
|
print(f"Failed to crawl {result.url}: {result.error_message}")
|
||||||
|
elif not result.screenshot:
|
||||||
|
print(f"Crawled {result.url} but no screenshot data was returned.")
|
||||||
|
|
||||||
|
# urls = ["https://example.com", "https://crawl4ai.com", "https://github.com"]
|
||||||
|
# asyncio.run(take_screenshots(urls))
|
||||||
|
```
|
||||||
|
|
||||||
|
## 5. Tips for Effective Prompting Your AI Assistant for Crawl4AI Tasks
|
||||||
|
|
||||||
|
To get the best code from your AI assistant when working with `crawl4ai`, consider these prompting tips:
|
||||||
|
|
||||||
|
* **5.1. Be Clear About Your Goal:**
|
||||||
|
* Start with a high-level objective. Instead of just "Crawl a page," say "I need to extract all article titles from the homepage of this news site," or "Get the main content of this blog post as clean Markdown," or "Take full-page screenshots of these product pages." This helps the AI choose the right strategies and configurations.
|
||||||
|
|
||||||
|
* **5.2. Always Provide the URL(s):**
|
||||||
|
* This seems obvious, but be precise. If it's a list, provide the list.
|
||||||
|
* Remember to use the `file:///` prefix for local files (e.g., `file:///Users/me/Documents/mypage.html`) and `raw:` for inline HTML (e.g., `raw:<html><body>...</body></html>`). The AI might not always infer this correctly without a hint.
|
||||||
|
|
||||||
|
* **5.3. Describe Data for Extraction (Especially for `JsonCssExtractionStrategy` or `LLMExtractionStrategy`):**
|
||||||
|
* **What you want:** List the specific pieces of information you need (e.g., "product name," "price," "author," "publication_date," "article summary").
|
||||||
|
* **Where to find it (for CSS/XPath):** If you have an idea of the HTML structure, share it. "Product names seem to be in `<h2>` tags with class `item-title`." "The price is always in a `<span>` element right after a `<strong>` tag that says 'Price:'." This helps the AI generate accurate CSS selectors or XPath expressions for `JsonCssExtractionStrategy`.
|
||||||
|
* **Desired structure (for LLM):** For `LLMExtractionStrategy`, tell the AI the desired JSON structure. "I want a list of objects, where each object has a 'title' and a 'link'." Or even better, "Can you define a Pydantic model for me that has 'title' as a string and 'link' as a string, and then use that for extraction?"
|
||||||
|
|
||||||
|
* **5.4. Specify LLM Details for LLM Extraction or Filtering:**
|
||||||
|
* **Model/Provider:** "Use `openai/gpt-4o-mini` for this extraction." or "I want to use my local Ollama model, `ollama/llama3`."
|
||||||
|
* **API Key:** Clearly state where the API key should come from. "My API key is in the environment variable `OPENAI_API_KEY`." (This is safer than putting the key directly in the prompt). If you must provide it directly, be aware of the security implications.
|
||||||
|
|
||||||
|
* **5.5. Mention Page Dynamics and Interactions:**
|
||||||
|
* "This page loads more items when you scroll down."
|
||||||
|
* "You need to click the 'View All Reviews' button to see all the reviews."
|
||||||
|
* "The data I want only appears after selecting 'Category X' from a dropdown."
|
||||||
|
* This signals to the AI that `js_code`, `wait_for`, and possibly `session_id` will be necessary. You might need to guide it on *how* to identify the elements to interact with (e.g., "The 'Load More' button has the ID `load-more-btn`").
|
||||||
|
|
||||||
|
* **5.6. Iterative Refinement is Key:**
|
||||||
|
* Your first prompt might not yield perfect code. That's okay!
|
||||||
|
* Treat it as a conversation. If the AI-generated code misses something or makes a mistake:
|
||||||
|
* "That was close, but it missed extracting the product ratings. Ratings seem to be in a `div` with class `star-rating` inside each product item."
|
||||||
|
* "The script timed out. Can we increase the `page_timeout` in `CrawlerRunConfig` to 90 seconds?"
|
||||||
|
* "It didn't click the 'Next' button correctly. The button actually has the text '>>' instead of 'Next Page'."
|
||||||
|
* Provide the error messages or incorrect output back to the AI for context.
|
||||||
|
|
||||||
|
## 6. What to Expect as Output (From AI-Generated Code)
|
||||||
|
|
||||||
|
When you use "Vibe Coding" with an AI assistant for `crawl4ai`, you should generally expect the following:
|
||||||
|
|
||||||
|
* **6.1. Python Code:**
|
||||||
|
* The primary output will be a Python script that uses the `crawl4ai` library.
|
||||||
|
* It should include necessary imports like `asyncio`, `AsyncWebCrawler`, `CrawlerRunConfig`, etc.
|
||||||
|
* It will typically define an `async def main():` function and run it with `asyncio.run(main())`.
|
||||||
|
|
||||||
|
* **6.2. Accessing the `CrawlResult`:**
|
||||||
|
* The core of the script will involve one or more calls to `crawler.arun(...)` or `crawler.arun_many(...)`.
|
||||||
|
* These calls return `CrawlResult` objects (or a list of them for `arun_many`).
|
||||||
|
* The AI-generated code should then show you how to access the specific data you asked for from these `CrawlResult` objects. For example:
|
||||||
|
* `print(result.markdown.raw_markdown)` or `print(result.markdown.fit_markdown)`
|
||||||
|
* `data = json.loads(result.extracted_content)`
|
||||||
|
* `screenshot_data = base64.b64decode(result.screenshot)`
|
||||||
|
* `if not result.success: print(result.error_message)`
|
||||||
|
|
||||||
|
* **6.3. Files Saved to Disk (if requested):**
|
||||||
|
* If your vibe prompt included saving data (e.g., "save the screenshots as PNG files," "write the extracted JSON to `output.json`"), the AI-generated code should include the Python logic to perform these file operations.
|
||||||
|
* **Example for saving a screenshot:**
|
||||||
|
```python
|
||||||
|
import base64
|
||||||
|
# ... inside your async function, after getting 'result' ...
|
||||||
|
if result.success and result.screenshot:
|
||||||
|
with open("myscreenshot.png", "wb") as f:
|
||||||
|
f.write(base64.b64decode(result.screenshot))
|
||||||
|
print("Screenshot saved to myscreenshot.png")
|
||||||
|
```
|
||||||
|
|
||||||
|
## 7. Conclusion: Vibe Your Way to Web Data!
|
||||||
|
|
||||||
|
* **7.1. Recap of "Vibe Coding" Benefits with `crawl4ai`:**
|
||||||
|
"Vibe Coding" empowers you to leverage the full capabilities of `crawl4ai` without needing to memorize every API detail. By understanding the high-level concepts and key building blocks outlined in this guide, you can effectively communicate your data extraction and web interaction needs to an AI coding assistant. This leads to faster prototyping, easier access to web data for non-programmers, and a more intuitive way to build data-driven applications.
|
||||||
|
|
||||||
|
* **7.2. Encouragement to experiment with different prompts and `crawl4ai` features:**
|
||||||
|
The key to successful "Vibe Coding" is experimentation. Try different ways of describing your goals to your AI assistant. If the first attempt doesn't yield the perfect `crawl4ai` code, refine your prompt with more specific details or hints. Don't be afraid to mention `crawl4ai` specific terms like `CrawlerRunConfig`, `js_code`, or `LLMExtractionStrategy` – this guide has equipped you with the essential vocabulary. The more context you provide, the better the AI can assist you.
|
||||||
|
|
||||||
|
* **7.3. Pointers to more detailed `crawl4ai` documentation for users who want to learn direct coding or advanced configurations:**
|
||||||
|
While "Vibe Coding" is a great way to get started and be productive quickly, you might eventually want to dive deeper into `crawl4ai`'s capabilities or fine-tune the generated code yourself. For that, refer to:
|
||||||
|
* **The Official Crawl4AI API Reference:** (Assuming this exists or will exist - replace with actual link if available, e.g., `https://docs.crawl4ai.com/api/`) For detailed information on all classes, methods, and parameters.
|
||||||
|
* **Specific "Reasoning & Problem-Solving" Guides:** Check the `crawl4ai` documentation for other guides that delve into specific components like advanced `CrawlerRunConfig` options, deep crawling strategies, or custom extraction techniques.
|
||||||
|
|
||||||
|
Happy Vibe Coding, and may your web data adventures be fruitful!
|
||||||
|
```
|
||||||
217
docs/md_v2/blog/articles/llm-context-revolution.md
Normal file
217
docs/md_v2/blog/articles/llm-context-revolution.md
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
# The LLM Context Protocol: Why Your AI Assistant Needs Memory, Reasoning, and Examples
|
||||||
|
|
||||||
|
*Published on January 24, 2025 • 8 min read*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## The Problem with Teaching Robots to Code
|
||||||
|
|
||||||
|
Picture this: You hand someone a dictionary and ask them to write poetry. They know every word, its spelling, its definition—but do they know how words dance together? How certain combinations evoke emotion while others fall flat? This is exactly what we're doing when we throw API documentation at our AI assistants and expect magic.
|
||||||
|
|
||||||
|
I've spent countless hours watching my AI coding assistant struggle with my own library, Crawl4AI. Despite feeding it comprehensive documentation, it would generate code that was *technically* correct but practically useless. Like a tourist speaking from a phrasebook—grammatically sound, culturally tone-deaf.
|
||||||
|
|
||||||
|
## Enter the Three-Dimensional Context Protocol
|
||||||
|
|
||||||
|
What if, instead of dumping information, we provided *wisdom*? Not just the "what," but the "how" and "why"? This led me to develop what I call the **LLM Context Protocol**—a structured approach that mirrors how humans actually master libraries.
|
||||||
|
|
||||||
|
Think of it as HTTP for AI context. Just as HTTP doesn't dictate your website's content but provides a reliable structure for communication, this protocol doesn't prescribe *how* you write your documentation—it provides a framework for *what* your AI needs to truly understand your code.
|
||||||
|
|
||||||
|
### The Three Pillars of Library Wisdom
|
||||||
|
|
||||||
|
<div style="background-color: #1a1a1c; border: 1px solid #3f3f44; padding: 20px; margin: 20px 0;">
|
||||||
|
|
||||||
|
#### 🧠 **Memory: The Foundation**
|
||||||
|
```markdown
|
||||||
|
# AsyncWebCrawler.arun() - Memory Context
|
||||||
|
|
||||||
|
## Signature
|
||||||
|
async def arun(
|
||||||
|
url: str,
|
||||||
|
config: CrawlerConfig = None,
|
||||||
|
session_id: str = None,
|
||||||
|
**kwargs
|
||||||
|
) -> CrawlResult
|
||||||
|
|
||||||
|
## Parameters
|
||||||
|
- url: Target URL to crawl
|
||||||
|
- config: Optional configuration object
|
||||||
|
- session_id: Optional session identifier for caching
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
This is your API reference—the facts, the parameters, the return types. It's the easiest part to generate and, ironically, the least useful in isolation. It's like memorizing a dictionary without understanding grammar.
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div style="background-color: #1a1a1c; border: 1px solid #3f3f44; padding: 20px; margin: 20px 0;">
|
||||||
|
|
||||||
|
#### 🎯 **Reasoning: The Soul**
|
||||||
|
```markdown
|
||||||
|
# AsyncWebCrawler Design Philosophy - Reasoning Context
|
||||||
|
|
||||||
|
## Why Async-First Architecture?
|
||||||
|
|
||||||
|
Crawl4AI uses AsyncWebCrawler as its primary interface because modern web
|
||||||
|
scraping demands concurrency. Here's the thinking:
|
||||||
|
|
||||||
|
1. **Network I/O is slow**: Waiting synchronously wastes 90% of execution time
|
||||||
|
2. **Modern sites are complex**: Multiple resources load in parallel
|
||||||
|
3. **Scale matters**: You're rarely crawling just one page
|
||||||
|
|
||||||
|
## When to Use Session Management
|
||||||
|
|
||||||
|
Session management isn't just about performance—it's about appearing human:
|
||||||
|
- Use sessions when crawling multiple pages from the same domain
|
||||||
|
- Reuse browser contexts to maintain cookies and local storage
|
||||||
|
- But don't overdo it: overly long sessions look suspicious
|
||||||
|
|
||||||
|
## The Cache Strategy Decision Tree
|
||||||
|
if static_content and infrequent_updates:
|
||||||
|
use_cache_mode('read_write')
|
||||||
|
elif dynamic_content and real_time_needed:
|
||||||
|
use_cache_mode('bypass')
|
||||||
|
else:
|
||||||
|
use_cache_mode('read_only') # Safe default
|
||||||
|
```
|
||||||
|
|
||||||
|
This is where the library creator's philosophy lives. It's not just *what* the library does, but *why* it does it that way. This is the hardest part to write because it requires genuine understanding—and it's a red flag when a library lacks it.
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div style="background-color: #1a1a1c; border: 1px solid #3f3f44; padding: 20px; margin: 20px 0;">
|
||||||
|
|
||||||
|
#### 💻 **Examples: The Practice**
|
||||||
|
```python
|
||||||
|
# Crawling with JavaScript execution
|
||||||
|
result = await crawler.arun(
|
||||||
|
url="https://example.com",
|
||||||
|
js_code="window.scrollTo(0, document.body.scrollHeight);",
|
||||||
|
wait_for="css:.lazy-loaded-content"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Extracting structured data with CSS selectors
|
||||||
|
result = await crawler.arun(
|
||||||
|
url="https://shop.example.com",
|
||||||
|
extraction_strategy=CSSExtractionStrategy({
|
||||||
|
"prices": "span.price::text",
|
||||||
|
"titles": "h2.product-title::text"
|
||||||
|
})
|
||||||
|
)
|
||||||
|
|
||||||
|
# Session-based crawling with custom headers
|
||||||
|
async with crawler:
|
||||||
|
result1 = await crawler.arun(url1, session_id="product_scan")
|
||||||
|
result2 = await crawler.arun(url2, session_id="product_scan")
|
||||||
|
```
|
||||||
|
|
||||||
|
Pure code. No fluff. Just patterns in action. Because sometimes, you just need to see how it's done.
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
## Why This Matters (Especially for Smaller LLMs)
|
||||||
|
|
||||||
|
Here's the thing about AI assistants: the smaller ones can't think their way out of a paper bag. They're like eager interns—full of potential but needing clear guidance. When you rely on a large language model to "figure it out" from raw API docs, you're asking it to reinvent your library's philosophy from scratch. Every. Single. Time.
|
||||||
|
|
||||||
|
By providing structured context across these three dimensions, we're not just documenting—we're teaching. We're transferring not just knowledge, but wisdom.
|
||||||
|
|
||||||
|
## The Cultural DNA of Your Library
|
||||||
|
|
||||||
|
<div style="display: flex; align-items: center; background-color: #3f3f44; padding: 20px; margin: 20px 0; border-left: 4px solid #09b5a5;">
|
||||||
|
<div style="font-size: 48px; margin-right: 20px;">🧬</div>
|
||||||
|
<div>
|
||||||
|
<strong>Your library's reasoning is its cultural DNA.</strong><br>
|
||||||
|
It reflects your taste, your architectural decisions, your opinions about how things should be done. A library without reasoning is like a recipe without techniques—sure, you have the ingredients, but good luck making something edible.
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
Think about it: When you learn a new library, what are you really after? You want mastery. And mastery comes from understanding:
|
||||||
|
- **Memory** tells you what's possible
|
||||||
|
- **Reasoning** tells you what's sensible
|
||||||
|
- **Examples** show you what's practical
|
||||||
|
|
||||||
|
Together, they create wisdom.
|
||||||
|
|
||||||
|
## Beyond Manual Documentation
|
||||||
|
|
||||||
|
Now, here's where it gets interesting. I didn't hand-craft thousands of lines of structured documentation for Crawl4AI. Who has that kind of time? Instead, I built a tool that:
|
||||||
|
|
||||||
|
1. Analyzes your codebase
|
||||||
|
2. Extracts API signatures and structures (Memory)
|
||||||
|
3. Identifies patterns and architectural decisions (Reasoning)
|
||||||
|
4. Collects real-world usage from tests and examples (Examples)
|
||||||
|
5. Generates structured LLM context files
|
||||||
|
|
||||||
|
The beauty? This tool is becoming part of Crawl4AI itself. Because if we're going to revolutionize how AI understands our code, we might as well automate it.
|
||||||
|
|
||||||
|
## The Protocol, Not the Prescription
|
||||||
|
|
||||||
|
Remember: this is a protocol, not a prescription. Just as HTTP doesn't tell you what website to build, the LLM Context Protocol doesn't dictate your documentation style. It simply says:
|
||||||
|
|
||||||
|
> "If you want an AI to truly understand your library, it needs three things: facts, philosophy, and patterns."
|
||||||
|
|
||||||
|
How you deliver those is up to you. The protocol just ensures nothing important gets lost in translation.
|
||||||
|
|
||||||
|
## Try It Yourself
|
||||||
|
|
||||||
|
Curious about implementing this for your own library? The context generation tool will be open-sourced as part of Crawl4AI. If you're interested in early access or want to discuss the approach, drop me a DM on X [@unclecode](https://twitter.com/unclecode).
|
||||||
|
|
||||||
|
Because let's face it: if we're going to live in a world where AI writes half our code, we might as well teach it properly.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## A Final Thought
|
||||||
|
|
||||||
|
<div style="text-align: center; padding: 40px; background-color: #1a1a1c; border: 1px dashed #3f3f44; margin: 30px 0;">
|
||||||
|
<div style="font-size: 24px; color: #09b5a5; margin-bottom: 10px;">
|
||||||
|
Memory + Reasoning + Examples = Wisdom
|
||||||
|
</div>
|
||||||
|
<div style="color: #a3abba;">
|
||||||
|
And wisdom, not information, is what makes great developers—human or artificial.
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
*Want to see this in action? Check out the [Crawl4AI LLM Context Builder](/core/llmtxt/) and experience the difference structured context makes.*
|
||||||
|
|
||||||
|
<style>
|
||||||
|
/* Custom styles for this article */
|
||||||
|
.markdown-body pre {
|
||||||
|
background-color: #1e1e1e !important;
|
||||||
|
border: 1px solid #3f3f44;
|
||||||
|
}
|
||||||
|
|
||||||
|
.markdown-body code {
|
||||||
|
background-color: #3f3f44;
|
||||||
|
color: #50ffff;
|
||||||
|
padding: 2px 6px;
|
||||||
|
border-radius: 3px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.markdown-body pre code {
|
||||||
|
background-color: transparent;
|
||||||
|
color: #e8e9ed;
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.markdown-body blockquote {
|
||||||
|
border-left: 4px solid #09b5a5;
|
||||||
|
background-color: #1a1a1c;
|
||||||
|
padding: 15px 20px;
|
||||||
|
margin: 20px 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.markdown-body h2 {
|
||||||
|
color: #50ffff;
|
||||||
|
border-bottom: 1px dashed #3f3f44;
|
||||||
|
padding-bottom: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.markdown-body h3 {
|
||||||
|
color: #09b5a5;
|
||||||
|
}
|
||||||
|
|
||||||
|
.markdown-body strong {
|
||||||
|
color: #50ffff;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
@@ -137,7 +137,7 @@ if __name__ == "__main__":
|
|||||||
- Higher → fewer chunks but more relevant.
|
- Higher → fewer chunks but more relevant.
|
||||||
- Lower → more inclusive.
|
- Lower → more inclusive.
|
||||||
|
|
||||||
> In more advanced scenarios, you might see parameters like `use_stemming`, `case_sensitive`, or `priority_tags` to refine how text is tokenized or weighted.
|
> In more advanced scenarios, you might see parameters like `language`, `case_sensitive`, or `priority_tags` to refine how text is tokenized or weighted.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -242,4 +242,4 @@ class MyCustomFilter(RelevantContentFilter):
|
|||||||
|
|
||||||
With these tools, you can **zero in** on the text that truly matters, ignoring spammy or boilerplate content, and produce a concise, relevant “fit markdown” for your AI or data pipelines. Happy pruning and searching!
|
With these tools, you can **zero in** on the text that truly matters, ignoring spammy or boilerplate content, and produce a concise, relevant “fit markdown” for your AI or data pipelines. Happy pruning and searching!
|
||||||
|
|
||||||
- Last Updated: 2025-01-01
|
- Last Updated: 2025-01-01
|
||||||
|
|||||||
61
docs/md_v2/core/llmtxt.md
Normal file
61
docs/md_v2/core/llmtxt.md
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
<div class="llmtxt-container">
|
||||||
|
<iframe id="llmtxt-frame" src="../../llmtxt/index.html" width="100%" style="border:none; display: block;" title="Crawl4AI LLM Context Builder"></iframe>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// Iframe height adjustment
|
||||||
|
function resizeLLMtxtIframe() {
|
||||||
|
const iframe = document.getElementById('llmtxt-frame');
|
||||||
|
if (iframe) {
|
||||||
|
const headerHeight = parseFloat(getComputedStyle(document.documentElement).getPropertyValue('--header-height') || '55');
|
||||||
|
const topOffset = headerHeight + 20;
|
||||||
|
const availableHeight = window.innerHeight - topOffset;
|
||||||
|
iframe.style.height = Math.max(800, availableHeight) + 'px';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run immediately and on resize/load
|
||||||
|
resizeLLMtxtIframe();
|
||||||
|
let resizeTimer;
|
||||||
|
window.addEventListener('load', resizeLLMtxtIframe);
|
||||||
|
window.addEventListener('resize', () => {
|
||||||
|
clearTimeout(resizeTimer);
|
||||||
|
resizeTimer = setTimeout(resizeLLMtxtIframe, 150);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Remove Footer & HR from parent page
|
||||||
|
document.addEventListener('DOMContentLoaded', () => {
|
||||||
|
setTimeout(() => {
|
||||||
|
const footer = window.parent.document.querySelector('footer');
|
||||||
|
if (footer) {
|
||||||
|
const hrBeforeFooter = footer.previousElementSibling;
|
||||||
|
if (hrBeforeFooter && hrBeforeFooter.tagName === 'HR') {
|
||||||
|
hrBeforeFooter.remove();
|
||||||
|
}
|
||||||
|
footer.remove();
|
||||||
|
resizeLLMtxtIframe();
|
||||||
|
}
|
||||||
|
}, 100);
|
||||||
|
});
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
#terminal-mkdocs-main-content {
|
||||||
|
padding: 0 !important;
|
||||||
|
margin: 0;
|
||||||
|
width: 100%;
|
||||||
|
height: 100%;
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
|
||||||
|
#terminal-mkdocs-main-content .llmtxt-container {
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
max-width: none;
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
|
||||||
|
#terminal-mkdocs-toc-panel {
|
||||||
|
display: none !important;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
BIN
docs/md_v2/favicon.ico
Normal file
BIN
docs/md_v2/favicon.ico
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 3.4 KiB |
BIN
docs/md_v2/img/favicon-32x32.png
Normal file
BIN
docs/md_v2/img/favicon-32x32.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.6 KiB |
BIN
docs/md_v2/img/favicon-x-32x32.png
Normal file
BIN
docs/md_v2/img/favicon-x-32x32.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.4 KiB |
BIN
docs/md_v2/img/favicon.ico
Normal file
BIN
docs/md_v2/img/favicon.ico
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 3.4 KiB |
75
docs/md_v2/llmtxt/build.md
Normal file
75
docs/md_v2/llmtxt/build.md
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
|
||||||
|
**Prompt for AI Coding Assistant: Create an Interactive LLM Context Builder Page**
|
||||||
|
|
||||||
|
**Objective:**
|
||||||
|
|
||||||
|
Your task is to create an interactive HTML webpage with JavaScript functionality that allows users to select and combine different `crawl4ai` LLM context files into a single downloadable Markdown (`.md`) file. This tool will empower users to craft tailored context for their AI assistants based on their specific needs.
|
||||||
|
|
||||||
|
**Core Functionality:**
|
||||||
|
|
||||||
|
1. **Display `crawl4ai` Components:** The page will list all available `crawl4ai` documentation components.
|
||||||
|
2. **Select Context Types:** For each component, users can select which types of context they want to include:
|
||||||
|
* Memory (API facts)
|
||||||
|
* Reasoning (How-to/why)
|
||||||
|
* Examples (Code snippets)
|
||||||
|
(All should be selected by default for each initially selected component).
|
||||||
|
3. **Special "Aggregate" Contexts:** Include options for special, pre-combined contexts:
|
||||||
|
* "Vibe Coding" (a curated mix for general AI prompting)
|
||||||
|
* "All Library Context" (a comprehensive aggregation of all memory, reasoning, and examples for the entire library).
|
||||||
|
4. **Fetch and Concatenate:** When the user clicks a "Download Combined Context" button:
|
||||||
|
* The JavaScript will fetch the content of all selected Markdown files from the server (from a predefined folder, e.g., `/llmtxt/`).
|
||||||
|
* It will concatenate the content of these files into a single string.
|
||||||
|
5. **Client-Side Download:** The concatenated content will be offered to the user as a download (e.g., `custom_crawl4ai_context.md`).
|
||||||
|
|
||||||
|
**Input/Assumptions:**
|
||||||
|
|
||||||
|
* **Context Files Location:** All individual context Markdown files are located on the server in a publicly accessible folder named `llmtxt/`.
|
||||||
|
* **File Naming Convention:** Files follow the pattern: `crawl4ai_{{component_name}}_[memory|reasoning|examples]_content.llm.md`.
|
||||||
|
* `{{component_name}}` can contain underscores (e.g., `deep_crawling`, `config_objects`).
|
||||||
|
* The special contexts will have names like `crawl4ai_vibe_content.llm.md` and `crawl4ai_all_content.llm.md`.
|
||||||
|
* **Component List:** You will be provided with a list of `crawl4ai` components. For this implementation, use the following list:
|
||||||
|
* `core`
|
||||||
|
* `config_objects`
|
||||||
|
* `deep_crawling`
|
||||||
|
* `deployment` (covers Installation & Docker Deployment)
|
||||||
|
* `extraction` (covers Structured Data Extraction)
|
||||||
|
* `markdown` (covers Markdown Generation Algorithm)
|
||||||
|
* `pdf_processing`
|
||||||
|
* *(No separate "Vibe Coding" or "All Library Context" in this list, as they are special top-level selections)*
|
||||||
|
|
||||||
|
**Detailed UI/UX Requirements:**
|
||||||
|
|
||||||
|
1. **Main Page Structure:**
|
||||||
|
* **Header:** "Crawl4AI Interactive LLM Context Builder"
|
||||||
|
* **Introduction:** Briefly explain the purpose of the tool (from the `USING_LLM_CONTEXTS.md` content you helped draft: "Supercharging Your AI Assistant...").
|
||||||
|
* **Selection Area:**
|
||||||
|
* **Special Aggregate Contexts (Radio Buttons or Prominent Checkboxes):**
|
||||||
|
* [ ] "Vibe Coding Context" (`crawl4ai_vibe_content.llm.md`)
|
||||||
|
* [ ] "All Library Context (Comprehensive)" (`crawl4ai_all_content.llm.md`)
|
||||||
|
* *Behavior:* Selecting one of these might disable individual component selections (or vice-versa) to avoid redundancy, or simply add them to the list. Consider user experience here. A simple approach is that if an aggregate is selected, it's the *only* thing downloaded.
|
||||||
|
* **Individual Component Selection (Table or List of Checkboxes):**
|
||||||
|
* A section titled "Select Individual Components & Context Types:"
|
||||||
|
* For each component in the provided list:
|
||||||
|
* A master checkbox for the component itself (e.g., `[ ] Core Functionality`). Selected by default.
|
||||||
|
* Nested checkboxes (indented or grouped) for context types, enabled only if the parent component is checked:
|
||||||
|
* `[x] Memory (API Facts)`
|
||||||
|
* `[x] Reasoning (How-to/Why)`
|
||||||
|
* `[x] Examples (Code Snippets)`
|
||||||
|
(These three sub-checkboxes should be selected by default if the parent component is selected).
|
||||||
|
* **Action Button:**
|
||||||
|
* A button: "Generate & Download Combined Context"
|
||||||
|
* **Status/Feedback Area:** (Optional, but good UX)
|
||||||
|
* Display messages like "Fetching files...", "Combining context...", "Download starting..." or error messages.
|
||||||
|
|
||||||
|
|
||||||
|
**Final Output:**
|
||||||
|
|
||||||
|
* A single HTML file (e.g., `interactive_context_builder.html`).
|
||||||
|
* Associated JavaScript code (can be inline within `<script>` tags or in a separate `.js` file).
|
||||||
|
* Associated CSS code (can be inline within `<style>` tags or in a separate `.css` file).
|
||||||
|
|
||||||
|
This interactive tool will greatly enhance the user experience for `crawl4ai` developers looking to leverage your specialized LLM contexts. Please ensure the JavaScript is robust and provides good user feedback.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
This prompt should give your AI coding assistant a very clear set of requirements and guidelines for building the interactive context builder. Remember to provide it with the list of components as mentioned in the "Input/Assumptions" section.
|
||||||
142
docs/md_v2/llmtxt/index.html
Normal file
142
docs/md_v2/llmtxt/index.html
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Crawl4AI LLM Context Builder</title>
|
||||||
|
<link rel="stylesheet" href="llmtxt.css">
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="container">
|
||||||
|
<header class="header">
|
||||||
|
<h1><span class="logo">🚀🤖</span> Crawl4AI LLM Context Builder</h1>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<section class="intro">
|
||||||
|
<div class="intro-header">
|
||||||
|
<h2>🧠 A New Approach to LLM Context</h2>
|
||||||
|
<p>
|
||||||
|
Traditional <code>llm.txt</code> files often fail with complex libraries like Crawl4AI. They dump massive amounts of API documentation, causing <strong>information overload</strong> and <strong>lost focus</strong>. They provide the "what" but miss the crucial "how" and "why" that makes AI assistants truly helpful.
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="intro-solution">
|
||||||
|
<h3>💡 The Solution: Multi-Dimensional, Modular Contexts</h3>
|
||||||
|
<p>
|
||||||
|
Inspired by modular libraries like Lodash, I've redesigned how we provide context to AI assistants. Instead of one monolithic file, Crawl4AI's documentation is organized by <strong>components</strong> and <strong>perspectives</strong>.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<div class="dimensions">
|
||||||
|
<div class="dimension">
|
||||||
|
<span class="badge memory">Memory</span>
|
||||||
|
<h4>The "What"</h4>
|
||||||
|
<p>Precise API facts, parameters, signatures, and configuration objects. Your unambiguous reference.</p>
|
||||||
|
</div>
|
||||||
|
<div class="dimension">
|
||||||
|
<span class="badge reasoning">Reasoning</span>
|
||||||
|
<h4>The "How" & "Why"</h4>
|
||||||
|
<p>Design principles, best practices, trade-offs, and workflows. Teaches AI to think like an expert.</p>
|
||||||
|
</div>
|
||||||
|
<div class="dimension">
|
||||||
|
<span class="badge examples">Examples</span>
|
||||||
|
<h4>The "Show Me"</h4>
|
||||||
|
<p>Runnable code snippets demonstrating patterns in action. Pure practical implementation.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="intro-benefits">
|
||||||
|
<p>
|
||||||
|
<strong>Why this matters:</strong> You can now give your AI assistant exactly what it needs - whether that's quick API lookups, help designing solutions, or seeing practical implementations. No more information overload, just focused, relevant context.
|
||||||
|
</p>
|
||||||
|
<p class="learn-more">
|
||||||
|
<a href="/blog/articles/llm-context-revolution" class="learn-more-link" target="_parent">📖 Read the full story behind this approach →</a>
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section class="builder">
|
||||||
|
<div class="special-contexts">
|
||||||
|
<h2>Quick Presets</h2>
|
||||||
|
<div class="preset-options">
|
||||||
|
<label class="preset-option">
|
||||||
|
<input type="radio" name="preset" value="vibe" id="preset-vibe">
|
||||||
|
<div class="preset-card">
|
||||||
|
<h3>🎯 Vibe Coding</h3>
|
||||||
|
<p>Curated context for general AI prompting - perfect for exploring capabilities</p>
|
||||||
|
</div>
|
||||||
|
</label>
|
||||||
|
<label class="preset-option">
|
||||||
|
<input type="radio" name="preset" value="all" id="preset-all">
|
||||||
|
<div class="preset-card">
|
||||||
|
<h3>📚 Complete Library</h3>
|
||||||
|
<p>Comprehensive context including all components and perspectives</p>
|
||||||
|
</div>
|
||||||
|
</label>
|
||||||
|
<label class="preset-option">
|
||||||
|
<input type="radio" name="preset" value="custom" id="preset-custom" checked>
|
||||||
|
<div class="preset-card">
|
||||||
|
<h3>🔧 Custom Selection</h3>
|
||||||
|
<p>Choose specific components and context types</p>
|
||||||
|
</div>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="component-selector" id="component-selector">
|
||||||
|
<h2>Select Components & Context Types</h2>
|
||||||
|
<div class="select-all-controls">
|
||||||
|
<button class="btn-small" id="select-all">Select All</button>
|
||||||
|
<button class="btn-small" id="deselect-all">Deselect All</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="component-table-wrapper">
|
||||||
|
<table class="component-selection-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th width="50"></th>
|
||||||
|
<th>Component</th>
|
||||||
|
<th class="clickable-header" data-type="memory">Memory</th>
|
||||||
|
<th class="clickable-header" data-type="reasoning">Reasoning</th>
|
||||||
|
<th class="clickable-header" data-type="examples">Examples</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody id="components-tbody">
|
||||||
|
<!-- Components will be dynamically inserted here -->
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="action-area">
|
||||||
|
<button class="download-btn" id="download-btn">
|
||||||
|
<span class="icon">⬇</span> Generate & Download Context
|
||||||
|
</button>
|
||||||
|
<div class="status" id="status"></div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section class="reference-table">
|
||||||
|
<h2>Available Context Files</h2>
|
||||||
|
<div class="table-wrapper">
|
||||||
|
<table class="context-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Component</th>
|
||||||
|
<th>Memory</th>
|
||||||
|
<th>Reasoning</th>
|
||||||
|
<th>Examples</th>
|
||||||
|
<th>Full</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody id="reference-table-body">
|
||||||
|
<!-- Table rows will be dynamically inserted here -->
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script src="llmtxt.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
460
docs/md_v2/llmtxt/llmtxt.css
Normal file
460
docs/md_v2/llmtxt/llmtxt.css
Normal file
@@ -0,0 +1,460 @@
|
|||||||
|
/* Terminal Theme CSS for LLM Context Builder */
|
||||||
|
|
||||||
|
:root {
|
||||||
|
--background-color: #070708;
|
||||||
|
--font-color: #e8e9ed;
|
||||||
|
--primary-color: #50ffff;
|
||||||
|
--primary-dimmed: #09b5a5;
|
||||||
|
--secondary-color: #d5cec0;
|
||||||
|
--tertiary-color: #a3abba;
|
||||||
|
--accent-color: rgb(243, 128, 245);
|
||||||
|
--error-color: #ff3c74;
|
||||||
|
--code-bg-color: #3f3f44;
|
||||||
|
--border-color: #3f3f44;
|
||||||
|
--hover-bg: #1a1a1c;
|
||||||
|
--success-color: #50ff50;
|
||||||
|
}
|
||||||
|
|
||||||
|
* {
|
||||||
|
box-sizing: border-box;
|
||||||
|
}
|
||||||
|
|
||||||
|
body {
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
font-family: dm, Monaco, Courier New, monospace;
|
||||||
|
font-size: 14px;
|
||||||
|
line-height: 1.5;
|
||||||
|
background-color: var(--background-color);
|
||||||
|
color: var(--font-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
.container {
|
||||||
|
max-width: 1200px;
|
||||||
|
margin: 0 auto;
|
||||||
|
padding: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Header */
|
||||||
|
.header {
|
||||||
|
text-align: center;
|
||||||
|
margin-bottom: 40px;
|
||||||
|
border-bottom: 1px dashed var(--tertiary-color);
|
||||||
|
padding-bottom: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.header h1 {
|
||||||
|
font-size: 24px;
|
||||||
|
color: var(--primary-color);
|
||||||
|
margin: 0;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 2px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.logo {
|
||||||
|
font-size: 28px;
|
||||||
|
vertical-align: middle;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Intro Section */
|
||||||
|
.intro {
|
||||||
|
background-color: var(--code-bg-color);
|
||||||
|
border: 1px solid var(--border-color);
|
||||||
|
padding: 30px;
|
||||||
|
margin-bottom: 30px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.intro-header h2 {
|
||||||
|
color: var(--primary-color);
|
||||||
|
margin: 0 0 15px 0;
|
||||||
|
font-size: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.intro-header p {
|
||||||
|
line-height: 1.6;
|
||||||
|
margin-bottom: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.intro-header code {
|
||||||
|
background-color: var(--hover-bg);
|
||||||
|
padding: 2px 6px;
|
||||||
|
color: var(--primary-dimmed);
|
||||||
|
}
|
||||||
|
|
||||||
|
.intro-solution {
|
||||||
|
margin-top: 5px;
|
||||||
|
padding-top: 25px;
|
||||||
|
border-top: 1px dashed var(--border-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
.intro-solution h3 {
|
||||||
|
color: var(--secondary-color);
|
||||||
|
margin: 0 0 15px 0;
|
||||||
|
font-size: 18px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.dimensions {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
|
||||||
|
gap: 20px;
|
||||||
|
margin: 20px 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.dimension {
|
||||||
|
background-color: var(--hover-bg);
|
||||||
|
padding: 20px;
|
||||||
|
border: 1px solid var(--border-color);
|
||||||
|
transition: all 0.2s ease;
|
||||||
|
}
|
||||||
|
|
||||||
|
.dimension:hover {
|
||||||
|
border-color: var(--primary-dimmed);
|
||||||
|
}
|
||||||
|
|
||||||
|
.dimension h4 {
|
||||||
|
color: var(--font-color);
|
||||||
|
margin: 10px 0 8px 0;
|
||||||
|
font-size: 16px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.dimension p {
|
||||||
|
font-size: 13px;
|
||||||
|
line-height: 1.5;
|
||||||
|
color: var(--tertiary-color);
|
||||||
|
margin: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Benefits paragraph at the bottom of the intro card. */
.intro-benefits {
    margin-top: 0px;
    /* was `0x` — an invalid length, so browsers silently dropped the
       declaration; `0` is the intended value */
    padding-top: 0;
    border-top: 1px dashed var(--border-color);
}
|
||||||
|
|
||||||
|
.intro-benefits strong {
|
||||||
|
color: var(--primary-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
.learn-more {
|
||||||
|
margin-top: 15px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.learn-more-link {
|
||||||
|
color: var(--primary-dimmed);
|
||||||
|
text-decoration: none;
|
||||||
|
font-weight: bold;
|
||||||
|
transition: color 0.2s ease;
|
||||||
|
}
|
||||||
|
|
||||||
|
.learn-more-link:hover {
|
||||||
|
color: var(--primary-color);
|
||||||
|
text-decoration: underline;
|
||||||
|
}
|
||||||
|
|
||||||
|
.badge {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 2px 8px;
|
||||||
|
font-size: 12px;
|
||||||
|
text-transform: uppercase;
|
||||||
|
margin-right: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.badge.memory {
|
||||||
|
background-color: var(--primary-dimmed);
|
||||||
|
color: var(--background-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
.badge.reasoning {
|
||||||
|
background-color: var(--accent-color);
|
||||||
|
color: var(--background-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
.badge.examples {
|
||||||
|
background-color: var(--secondary-color);
|
||||||
|
color: var(--background-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Builder Section */
|
||||||
|
.builder {
|
||||||
|
margin-bottom: 40px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.builder h2 {
|
||||||
|
color: var(--primary-color);
|
||||||
|
font-size: 18px;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
text-transform: uppercase;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Preset Options */
|
||||||
|
.preset-options {
|
||||||
|
display: flex;
|
||||||
|
gap: 20px;
|
||||||
|
margin-bottom: 30px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.preset-option {
|
||||||
|
flex: 1;
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
|
||||||
|
.preset-option input[type="radio"] {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.preset-card {
|
||||||
|
border: 2px solid var(--border-color);
|
||||||
|
padding: 20px;
|
||||||
|
transition: all 0.2s ease;
|
||||||
|
background-color: var(--code-bg-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
.preset-card h3 {
|
||||||
|
margin: 0 0 10px 0;
|
||||||
|
color: var(--secondary-color);
|
||||||
|
font-size: 16px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.preset-card p {
|
||||||
|
margin: 0;
|
||||||
|
font-size: 12px;
|
||||||
|
color: var(--tertiary-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
.preset-option input:checked + .preset-card {
|
||||||
|
border-color: var(--primary-color);
|
||||||
|
background-color: var(--hover-bg);
|
||||||
|
}
|
||||||
|
|
||||||
|
.preset-card:hover {
|
||||||
|
border-color: var(--primary-dimmed);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Component Selector */
|
||||||
|
.component-selector {
|
||||||
|
margin-bottom: 30px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.select-all-controls {
|
||||||
|
margin-bottom: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.btn-small {
|
||||||
|
background-color: var(--code-bg-color);
|
||||||
|
color: var(--font-color);
|
||||||
|
border: 1px solid var(--border-color);
|
||||||
|
padding: 5px 15px;
|
||||||
|
margin-right: 10px;
|
||||||
|
cursor: pointer;
|
||||||
|
font-family: inherit;
|
||||||
|
font-size: 12px;
|
||||||
|
text-transform: uppercase;
|
||||||
|
transition: all 0.2s ease;
|
||||||
|
}
|
||||||
|
|
||||||
|
.btn-small:hover {
|
||||||
|
background-color: var(--primary-dimmed);
|
||||||
|
color: var(--background-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Component Selection Table */
|
||||||
|
.component-table-wrapper {
|
||||||
|
overflow-x: auto;
|
||||||
|
border: 1px solid var(--border-color);
|
||||||
|
margin-top: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.component-selection-table {
|
||||||
|
width: 100%;
|
||||||
|
border-collapse: collapse;
|
||||||
|
background-color: var(--code-bg-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
.component-selection-table th,
|
||||||
|
.component-selection-table td {
|
||||||
|
padding: 12px;
|
||||||
|
text-align: left;
|
||||||
|
border-bottom: 1px solid var(--border-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
.component-selection-table th {
|
||||||
|
background-color: var(--hover-bg);
|
||||||
|
color: var(--primary-color);
|
||||||
|
text-transform: uppercase;
|
||||||
|
font-size: 12px;
|
||||||
|
letter-spacing: 1px;
|
||||||
|
font-weight: bold;
|
||||||
|
}
|
||||||
|
|
||||||
|
.component-selection-table th.clickable-header {
|
||||||
|
cursor: pointer;
|
||||||
|
user-select: none;
|
||||||
|
transition: background-color 0.2s ease;
|
||||||
|
}
|
||||||
|
|
||||||
|
.component-selection-table th.clickable-header:hover {
|
||||||
|
background-color: var(--primary-dimmed);
|
||||||
|
color: var(--background-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
.component-selection-table th:nth-child(3),
|
||||||
|
.component-selection-table th:nth-child(4),
|
||||||
|
.component-selection-table th:nth-child(5) {
|
||||||
|
text-align: center;
|
||||||
|
width: 120px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.component-selection-table td {
|
||||||
|
font-size: 13px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.component-selection-table td:nth-child(3),
|
||||||
|
.component-selection-table td:nth-child(4),
|
||||||
|
.component-selection-table td:nth-child(5) {
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.component-selection-table tr:hover td {
|
||||||
|
background-color: var(--hover-bg);
|
||||||
|
}
|
||||||
|
|
||||||
|
.component-name {
|
||||||
|
color: var(--primary-color);
|
||||||
|
font-weight: bold;
|
||||||
|
}
|
||||||
|
|
||||||
|
.component-selection-table input[type="checkbox"] {
|
||||||
|
cursor: pointer;
|
||||||
|
width: 16px;
|
||||||
|
height: 16px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Disabled row state */
|
||||||
|
.component-selection-table tr.disabled td:not(:first-child) {
|
||||||
|
opacity: 0.5;
|
||||||
|
pointer-events: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Action Area */
|
||||||
|
.action-area {
|
||||||
|
text-align: center;
|
||||||
|
margin: 40px 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.download-btn {
|
||||||
|
background-color: var(--primary-dimmed);
|
||||||
|
color: var(--background-color);
|
||||||
|
border: none;
|
||||||
|
padding: 15px 40px;
|
||||||
|
font-size: 16px;
|
||||||
|
font-family: inherit;
|
||||||
|
cursor: pointer;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 1px;
|
||||||
|
transition: all 0.2s ease;
|
||||||
|
display: inline-flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.download-btn:hover {
|
||||||
|
background-color: var(--primary-color);
|
||||||
|
transform: translateY(-2px);
|
||||||
|
}
|
||||||
|
|
||||||
|
.download-btn .icon {
|
||||||
|
font-size: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.status {
|
||||||
|
margin-top: 20px;
|
||||||
|
font-size: 14px;
|
||||||
|
min-height: 30px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.status.loading {
|
||||||
|
color: var(--primary-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
.status.success {
|
||||||
|
color: var(--success-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
.status.error {
|
||||||
|
color: var(--error-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Reference Table */
|
||||||
|
.reference-table {
|
||||||
|
margin-top: 60px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.reference-table h2 {
|
||||||
|
color: var(--primary-color);
|
||||||
|
font-size: 18px;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
text-transform: uppercase;
|
||||||
|
}
|
||||||
|
|
||||||
|
.table-wrapper {
|
||||||
|
overflow-x: auto;
|
||||||
|
border: 1px solid var(--border-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
.context-table {
|
||||||
|
width: 100%;
|
||||||
|
border-collapse: collapse;
|
||||||
|
background-color: var(--code-bg-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
.context-table th,
|
||||||
|
.context-table td {
|
||||||
|
padding: 12px;
|
||||||
|
text-align: left;
|
||||||
|
border-bottom: 1px solid var(--border-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
.context-table th {
|
||||||
|
background-color: var(--hover-bg);
|
||||||
|
color: var(--primary-color);
|
||||||
|
text-transform: uppercase;
|
||||||
|
font-size: 12px;
|
||||||
|
letter-spacing: 1px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.context-table td {
|
||||||
|
font-size: 13px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.context-table tr:hover td {
|
||||||
|
background-color: var(--hover-bg);
|
||||||
|
}
|
||||||
|
|
||||||
|
.file-link {
|
||||||
|
color: var(--primary-dimmed);
|
||||||
|
text-decoration: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.file-link:hover {
|
||||||
|
color: var(--primary-color);
|
||||||
|
text-decoration: underline;
|
||||||
|
}
|
||||||
|
|
||||||
|
.file-size {
|
||||||
|
color: var(--tertiary-color);
|
||||||
|
font-size: 11px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Responsive Design */
|
||||||
|
@media (max-width: 768px) {
|
||||||
|
.preset-options {
|
||||||
|
flex-direction: column;
|
||||||
|
}
|
||||||
|
|
||||||
|
.components-grid {
|
||||||
|
grid-template-columns: 1fr;
|
||||||
|
}
|
||||||
|
|
||||||
|
.container {
|
||||||
|
padding: 10px;
|
||||||
|
}
|
||||||
|
}
|
||||||
458
docs/md_v2/llmtxt/llmtxt.js
Normal file
458
docs/md_v2/llmtxt/llmtxt.js
Normal file
@@ -0,0 +1,458 @@
|
|||||||
|
// Crawl4AI LLM Context Builder JavaScript
|
||||||
|
|
||||||
|
// Component definitions
|
||||||
|
const components = [
|
||||||
|
{
|
||||||
|
id: 'all',
|
||||||
|
name: 'All Components',
|
||||||
|
description: 'All components with all context types',
|
||||||
|
special: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: 'core',
|
||||||
|
name: 'Core Functionality',
|
||||||
|
description: 'Basic crawling and scraping features'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: 'config_objects',
|
||||||
|
name: 'Configuration Objects',
|
||||||
|
description: 'Browser and crawler configuration'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: 'deep_crawling',
|
||||||
|
name: 'Deep Crawling',
|
||||||
|
description: 'Multi-page crawling strategies'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: 'deployment',
|
||||||
|
name: 'Deployment',
|
||||||
|
description: 'Installation and Docker setup'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: 'extraction',
|
||||||
|
name: 'Data Extraction',
|
||||||
|
description: 'Structured data extraction strategies'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: 'markdown',
|
||||||
|
name: 'Markdown Generation',
|
||||||
|
description: 'Content-to-markdown conversion'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: 'vibe',
|
||||||
|
name: 'Vibe Coding',
|
||||||
|
description: 'General-purpose AI context',
|
||||||
|
special: false
|
||||||
|
}
|
||||||
|
];
|
||||||
|
|
||||||
|
// Context types
|
||||||
|
const contextTypes = ['memory', 'reasoning', 'examples'];
|
||||||
|
|
||||||
|
// State management
|
||||||
|
const state = {
|
||||||
|
preset: 'custom',
|
||||||
|
selectedComponents: new Set(),
|
||||||
|
selectedContextTypes: new Map()
|
||||||
|
};
|
||||||
|
|
||||||
|
// Bootstrap the page once the DOM is ready.
document.addEventListener('DOMContentLoaded', () => {
    // Wire up every interactive control first.
    setupPresetHandlers();
    renderComponents();
    renderReferenceTable();
    setupActionHandlers();
    setupColumnHeaderHandlers();

    // Default selection: only the "core" component, with every context
    // type enabled.
    const defaultId = 'core';
    state.selectedComponents.add(defaultId);
    state.selectedContextTypes.set(defaultId, new Set(contextTypes));
    updateComponentUI();
});
|
||||||
|
|
||||||
|
// Listen for preset radio changes and keep state.preset in sync.
function setupPresetHandlers() {
    const radios = document.querySelectorAll('input[name="preset"]');
    for (const radio of radios) {
        radio.addEventListener('change', (event) => {
            state.preset = event.target.value;
            updatePresetSelection();
        });
    }
}
|
||||||
|
|
||||||
|
// Show the component picker only when the "custom" preset is active.
function updatePresetSelection() {
    const selector = document.getElementById('component-selector');
    selector.style.display = state.preset === 'custom' ? 'block' : 'none';
}
|
||||||
|
|
||||||
|
// Rebuild the component selection table body, one row per
// non-special component.
function renderComponents() {
    const tbody = document.getElementById('components-tbody');
    tbody.innerHTML = '';

    for (const component of components) {
        if (component.special) {
            continue; // aggregate entries are not individually selectable
        }
        tbody.appendChild(createComponentRow(component));
    }
}
|
||||||
|
|
||||||
|
// Build one <tr> for a component: a master checkbox, the component name,
// and one checkbox per context type. Elements are created via the DOM API
// (equivalent to the markup previously injected through innerHTML).
function createComponentRow(component) {
    const tr = document.createElement('tr');
    tr.id = `component-${component.id}`;

    // Master checkbox — toggles the whole component on/off.
    const checkboxCell = document.createElement('td');
    const mainCheckbox = document.createElement('input');
    mainCheckbox.type = 'checkbox';
    mainCheckbox.id = `check-${component.id}`;
    mainCheckbox.dataset.component = component.id;
    mainCheckbox.addEventListener('change', (event) => {
        handleComponentToggle(component.id, event.target.checked);
    });
    checkboxCell.appendChild(mainCheckbox);
    tr.appendChild(checkboxCell);

    // Component display name.
    const nameCell = document.createElement('td');
    const nameSpan = document.createElement('span');
    nameSpan.className = 'component-name';
    nameSpan.textContent = component.name;
    nameCell.appendChild(nameSpan);
    tr.appendChild(nameCell);

    // One checkbox per context type (memory / reasoning / examples).
    for (const type of contextTypes) {
        const td = document.createElement('td');
        const typeCheckbox = document.createElement('input');
        typeCheckbox.type = 'checkbox';
        typeCheckbox.id = `check-${component.id}-${type}`;
        typeCheckbox.dataset.component = component.id;
        typeCheckbox.dataset.type = type;
        typeCheckbox.addEventListener('change', (event) => {
            handleContextTypeToggle(component.id, type, event.target.checked);
        });
        td.appendChild(typeCheckbox);
        tr.appendChild(td);
    }

    return tr;
}
|
||||||
|
|
||||||
|
// Handle the master checkbox of a component.
// Checking selects the component with ALL context types; unchecking
// removes the component and its type selections entirely.
function handleComponentToggle(componentId, checked) {
    if (checked) {
        state.selectedComponents.add(componentId);
        // Selecting a component always (re)selects every context type.
        // The original code branched on whether a type set already
        // existed, but both branches performed this exact assignment —
        // a dead branch, collapsed into one statement.
        state.selectedContextTypes.set(componentId, new Set(contextTypes));
    } else {
        state.selectedComponents.delete(componentId);
        state.selectedContextTypes.delete(componentId);
    }
    updateComponentUI();
}
|
||||||
|
|
||||||
|
// Recompute a component's membership in selectedComponents: it stays
// selected iff it still has at least one context type checked.
function updateComponentSelection(componentId) {
    const chosen = state.selectedContextTypes.get(componentId);
    if (chosen && chosen.size > 0) {
        state.selectedComponents.add(componentId);
    } else {
        state.selectedComponents.delete(componentId);
    }
}
|
||||||
|
|
||||||
|
// Handle a single context-type checkbox toggle for a component.
function handleContextTypeToggle(componentId, type, checked) {
    // Lazily create the per-component type set on first toggle.
    let types = state.selectedContextTypes.get(componentId);
    if (!types) {
        types = new Set();
        state.selectedContextTypes.set(componentId, types);
    }

    if (checked) {
        types.add(type);
    } else {
        types.delete(type);
    }

    updateComponentSelection(componentId);
    updateComponentUI();
}
|
||||||
|
|
||||||
|
// Sync every component row (master checkbox, disabled styling, and the
// per-type checkboxes) with the current selection state.
function updateComponentUI() {
    for (const component of components) {
        if (component.special) {
            continue; // aggregate entries have no table row
        }

        const row = document.getElementById(`component-${component.id}`);
        const isSelected = state.selectedComponents.has(component.id);
        const chosenTypes = state.selectedContextTypes.get(component.id) || new Set();

        row.querySelector(`#check-${component.id}`).checked = isSelected;
        // Deselected rows are greyed out and non-interactive (see CSS).
        row.classList.toggle('disabled', !isSelected);

        for (const type of contextTypes) {
            row.querySelector(`#check-${component.id}-${type}`).checked = chosenTypes.has(type);
        }
    }
}
|
||||||
|
|
||||||
|
// Wire the select-all / deselect-all buttons and the download button.
function setupActionHandlers() {
    document.getElementById('select-all').addEventListener('click', () => {
        for (const comp of components.filter(c => !c.special)) {
            state.selectedComponents.add(comp.id);
            state.selectedContextTypes.set(comp.id, new Set(contextTypes));
        }
        updateComponentUI();
    });

    document.getElementById('deselect-all').addEventListener('click', () => {
        state.selectedComponents.clear();
        state.selectedContextTypes.clear();
        updateComponentUI();
    });

    document.getElementById('download-btn').addEventListener('click', handleDownload);
}
|
||||||
|
|
||||||
|
// Clicking a context-type column header toggles that entire column.
function setupColumnHeaderHandlers() {
    for (const header of document.querySelectorAll('.clickable-header')) {
        header.addEventListener('click', () => {
            toggleColumnSelection(header.getAttribute('data-type'));
        });
    }
}
|
||||||
|
|
||||||
|
// Toggle an entire context-type column.
// If every non-special component already has `type` selected, the click
// clears the column; otherwise it selects `type` for all of them.
function toggleColumnSelection(type) {
    const selectable = components.filter(c => !c.special);

    // `every` short-circuits on the first miss, unlike the original
    // forEach-and-flag scan that always walked the full list.
    const allSelected = selectable.every(comp => {
        const types = state.selectedContextTypes.get(comp.id);
        return Boolean(types && types.has(type));
    });

    selectable.forEach(comp => {
        if (!state.selectedContextTypes.has(comp.id)) {
            state.selectedContextTypes.set(comp.id, new Set());
        }

        const types = state.selectedContextTypes.get(comp.id);
        if (allSelected) {
            types.delete(type);
        } else {
            types.add(type);
        }

        updateComponentSelection(comp.id);
    });

    updateComponentUI();
}
|
||||||
|
|
||||||
|
// Resolve the selected file list, fetch every file, stitch the results
// into one markdown document and trigger a browser download. Progress
// and errors are surfaced through the #status element.
async function handleDownload() {
    const statusEl = document.getElementById('status');
    const report = (text, className) => {
        statusEl.textContent = text;
        statusEl.className = className;
    };

    report('Preparing context files...', 'status loading');

    try {
        const files = getSelectedFiles();
        if (files.length === 0) {
            throw new Error('No files selected. Please select at least one component or preset.');
        }

        statusEl.textContent = `Fetching ${files.length} files...`;

        const contents = await fetchFiles(files);
        downloadFile(combineContents(contents), 'crawl4ai_custom_context.md');

        report('Download complete!', 'status success');

        // Clear the success message after a short delay.
        setTimeout(() => report('', 'status'), 3000);
    } catch (error) {
        report(`Error: ${error.message}`, 'status error');
    }
}
|
||||||
|
|
||||||
|
// Resolve the current preset/selection into a list of logical file names.
// Presets map to fixed aggregate files; the custom preset expands the
// selection into one file per (component, context type) pair.
function getSelectedFiles() {
    if (state.preset === 'vibe') {
        return ['crawl4ai_vibe.llm.full.md'];
    }

    if (state.preset === 'all') {
        // Dedicated aggregated files covering all components.
        return [
            'crawl4ai_all_memory_content.llm.md',
            'crawl4ai_all_reasoning_content.llm.md',
            'crawl4ai_all_examples_content.llm.md'
        ];
    }

    // Custom selection.
    const files = [];
    state.selectedComponents.forEach(compId => {
        const types = state.selectedContextTypes.get(compId);
        if (types) {
            types.forEach(type => {
                files.push(`crawl4ai_${compId}_${type}_content.llm.md`);
            });
        }
    });
    return files;
}
|
||||||
|
|
||||||
|
// Fetch multiple context files in parallel.
// Files are served from /assets/llmtxt/ with a .txt extension, while the
// logical names end in .md, so the suffix is swapped per file. A failed
// or erroring fetch resolves to a markdown-comment placeholder instead of
// rejecting, so one missing file never aborts the whole download.
async function fetchFiles(fileNames) {
    const baseUrl = '/assets/llmtxt/';
    const promises = fileNames.map(async (fileName) => {
        // Anchor the replacement to the END of the name. The original
        // `.replace('.md', '.txt')` rewrote the first ".md" substring,
        // which would corrupt any name containing ".md" mid-string.
        const txtFileName = fileName.replace(/\.md$/, '.txt');
        try {
            const response = await fetch(baseUrl + txtFileName);
            if (!response.ok) {
                console.warn(`Failed to fetch ${txtFileName} from ${baseUrl + txtFileName}`);
                return { fileName, content: `<!-- Failed to load ${fileName} -->` };
            }
            const content = await response.text();
            return { fileName, content };
        } catch (error) {
            console.warn(`Error fetching ${txtFileName} from ${baseUrl + txtFileName}:`, error);
            return { fileName, content: `<!-- Error loading ${fileName} -->` };
        }
    });

    return Promise.all(promises);
}
|
||||||
|
|
||||||
|
// Combine file contents with headers.
// Produces one markdown document: a generated preamble (timestamp and
// file count) followed by a "## <Component> - <ContextType>" section per
// fetched file, each terminated by a horizontal rule. The template
// literals below are emitted verbatim into the download, so their exact
// whitespace is intentional.
function combineContents(fileContents) {
    const header = `# Crawl4AI Custom LLM Context
Generated on: ${new Date().toISOString()}
Total files: ${fileContents.length}

---

`;

    const sections = fileContents.map(({ fileName, content }) => {
        // Human-readable labels derived from the file-name convention.
        const componentName = extractComponentName(fileName);
        const contextType = extractContextType(fileName);

        return `## ${componentName} - ${contextType}
Source: ${fileName}

${content}

---

`;
    });

    return header + sections.join('\n');
}
|
||||||
|
|
||||||
|
// Derive a human-readable component name from a context file name.
// Names follow crawl4ai_{component}_{type}_content.llm.md or
// crawl4ai_{component}.llm.full.md.
function extractComponentName(fileName) {
    const match = fileName.match(/crawl4ai_(.+?)_(memory|reasoning|examples|llm\.full)/);
    if (!match) {
        return 'Unknown Component';
    }

    const compId = match[1];
    const known = components.find(c => c.id === compId);
    if (known) {
        return known.name;
    }
    // Fall back to title-casing the raw id, e.g. "deep_crawling" -> "Deep Crawling".
    return compId.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase());
}
|
||||||
|
|
||||||
|
// Extract context type from filename
|
||||||
|
function extractContextType(fileName) {
|
||||||
|
if (fileName.includes('_memory_')) return 'Memory';
|
||||||
|
if (fileName.includes('_reasoning_')) return 'Reasoning';
|
||||||
|
if (fileName.includes('_examples_')) return 'Examples';
|
||||||
|
if (fileName.includes('.llm.full')) return 'Complete Context';
|
||||||
|
return 'Context';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save `content` to the user's machine as a markdown file.
// Wraps the text in a Blob, clicks a temporary anchor pointing at an
// object URL, then cleans both up.
function downloadFile(content, fileName) {
    const blob = new Blob([content], { type: 'text/markdown' });
    const url = URL.createObjectURL(blob);

    const link = document.createElement('a');
    link.href = url;
    link.download = fileName;

    document.body.appendChild(link);
    link.click();
    link.remove();

    // Release the object URL so the blob can be garbage-collected.
    URL.revokeObjectURL(url);
}
|
||||||
|
|
||||||
|
// Populate the reference table: one row per component with direct links
// to its memory / reasoning / examples / full context files.
function renderReferenceTable() {
    const tbody = document.getElementById('reference-table-body');
    tbody.innerHTML = '';

    // Every component (including aggregates) is listed the same way.
    for (const component of components) {
        const base = `/assets/llmtxt/crawl4ai_${component.id}`;
        const row = document.createElement('tr');
        row.innerHTML = `
            <td><strong>${component.name}</strong></td>
            <td><a href="${base}_memory_content.llm.txt" class="file-link" target="_blank">Memory</a></td>
            <td><a href="${base}_reasoning_content.llm.txt" class="file-link" target="_blank">Reasoning</a></td>
            <td><a href="${base}_examples_content.llm.txt" class="file-link" target="_blank">Examples</a></td>
            <td><a href="${base}.llm.full.txt" class="file-link" target="_blank">Full</a></td>
        `;
        tbody.appendChild(row);
    }
}
|
||||||
|
|
||||||
|
// Check if examples file exists (all components have examples)
|
||||||
|
function hasExamplesFile(componentId) {
|
||||||
|
// All components have examples files
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if full file exists (all components have full files)
|
||||||
|
function hasFullFile(componentId) {
|
||||||
|
// All components have full files
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Utility function to capitalize first letter
|
||||||
|
function capitalizeFirst(str) {
|
||||||
|
return str.charAt(0).toUpperCase() + str.slice(1);
|
||||||
|
}
|
||||||
37
docs/md_v2/llmtxt/why.md
Normal file
37
docs/md_v2/llmtxt/why.md
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
# Supercharging Your AI Assistant: My Journey to Better LLM Contexts for `crawl4ai`
|
||||||
|
|
||||||
|
When I started diving deep into using AI coding assistants with my own libraries, particularly `crawl4ai`, I quickly realized that the common approach to providing context via a simple `llm.txt` or even a beefed-up `README.md` just wasn't cutting it. This document explains the problems I encountered and how I've tried to create a more effective system for `crawl4ai`, allowing you (and your AI assistant) to get precisely the right information.
|
||||||
|
|
||||||
|
## My Frustration with Standard `llm.txt` Files
|
||||||
|
|
||||||
|
My experience with generic `llm.txt` files for complex libraries like `crawl4ai` revealed several pain points:
|
||||||
|
|
||||||
|
1. **Information Overload & Lost Focus:** I found that when I threw a massive, monolithic context file at an LLM, it often struggled. The sheer volume of information seemed to dilute its focus. If I asked a specific question about a niche feature, the LLM might get sidetracked by more prominent but currently irrelevant parts of the library. It felt like trying to find a single sentence in a thousand-page novel – the information was *there*, but not always accessible or prioritized correctly by the AI.
|
||||||
|
|
||||||
|
2. **The "What" Without the "How" or "Why":** Most `llm.txt` files I encountered were essentially API dumps – a list of functions, classes, and parameters. This is the "what" of a library. But to truly use a library effectively, especially one as flexible as `crawl4ai`, you need the "how" (idiomatic usage patterns, best practices for common tasks) and the "why" (the design rationale behind certain features). Without this, I noticed my AI assistant would often generate syntactically correct but practically inefficient or non-idiomatic code. It was guessing the *intent* and the *best way* to use the library, and those guesses weren't always right.
|
||||||
|
|
||||||
|
3. **No Guidance on "Thinking" Like an Expert:** A static list of facts doesn't teach an LLM the *art* of using the library. It doesn't convey the trade-offs an experienced developer considers, the common pitfalls they've learned to avoid, or the clever ways to combine features to solve complex problems. I wanted my AI assistant to not just recall an API, but to help me *reason* about the best way to build a solution with `crawl4ai`.
|
||||||
|
|
||||||
|
## Inspiration: Selective Inclusion & Multi-Dimensional Understanding
|
||||||
|
|
||||||
|
I've always admired how libraries like Lodash or jQuery (in its modular days) allowed developers to pick and choose only the parts they needed, resulting in smaller, more focused bundles. This idea of modularity and selective inclusion resonated deeply with me as I thought about LLM context. Why force-feed an LLM the entire library's details when I'm only working on a specific component or task?
|
||||||
|
|
||||||
|
This led me to develop a new approach for `crawl4ai`: **multi-dimensional, modular contexts**.
|
||||||
|
|
||||||
|
Instead of one giant `llm.txt`, I've broken down the `crawl4ai` documentation into:
|
||||||
|
|
||||||
|
1. **Logical Components:** Context is organized around the major functional areas of the library (e.g., Core, Data Extraction, Deep Crawling, Markdown Generation, etc.). This allows you to select context relevant only to the task at hand.
|
||||||
|
2. **Three Dimensions of Context for Each Component:**
|
||||||
|
* **`_memory.md` (Foundational Memory):** This is the "what." It contains the precise, factual information about the component's public API, data structures, configuration objects, parameters, and method signatures. It's the detailed, unambiguous reference.
|
||||||
|
* **`_reasoning.md` (Reasoning & Problem-Solving Framework):** This is the "how" and "why." It includes design principles, common task workflows with decision guides, best practices, anti-patterns, illustrative code examples solving real problems, and explanations of trade-offs. It aims to guide the LLM in "thinking" like an expert `crawl4ai` user.
|
||||||
|
* **`_examples.md` (Practical Code Examples):** This is pure "show-me-the-code." It's a collection of runnable snippets demonstrating various ways to use the component's features and configurations, with minimal explanatory text. It’s for quickly seeing different patterns in action.
|
||||||
|
|
||||||
|
**The Goal:**
|
||||||
|
My aim is to provide you with a flexible system. You can give your AI assistant:
|
||||||
|
* Just the **memory** files for quick API lookups.
|
||||||
|
* The **reasoning** files (perhaps with memory) for help designing solutions.
|
||||||
|
* The **examples** files for seeing practical implementations.
|
||||||
|
* A **combination** of these across one or more components tailored to your specific task.
|
||||||
|
* Or, for broader understanding, special aggregate contexts like the "Vibe Coding" context or the "All Library Context."
|
||||||
|
|
||||||
|
By providing these structured, multi-faceted contexts, I hope to significantly improve the quality and relevance of the assistance you get when using AI to code with `crawl4ai`. The following sections will guide you on how to select and use these different context files.
|
||||||
47
docs/md_v2/overrides/main.html
Normal file
47
docs/md_v2/overrides/main.html
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
{% set extra_html_attrs = 'data-theme="dark"' %}
|
||||||
|
{% extends "base.html" %}
|
||||||
|
|
||||||
|
{% block extrahead %}
|
||||||
|
{{ super() }}
|
||||||
|
<script>
|
||||||
|
document.documentElement.setAttribute("data-theme", "dark");
|
||||||
|
</script>
|
||||||
|
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pushfeedback/dist/pushfeedback/pushfeedback.css">
|
||||||
|
|
||||||
|
<style>
|
||||||
|
:root {
|
||||||
|
/* brand */
|
||||||
|
--feedback-primary-color: #09b5a5;
|
||||||
|
--feedback-highlight-color: #fed500;
|
||||||
|
|
||||||
|
|
||||||
|
/* align with the value you really use in :root */
|
||||||
|
--header-height: 65px;
|
||||||
|
|
||||||
|
/* Push modal content down */
|
||||||
|
--feedback-modal-content-position-top: var(--header-height);
|
||||||
|
|
||||||
|
--feedback-modal-modal-wrapper-z-index: 1100;
|
||||||
|
/* > header’s 1000 */
|
||||||
|
--feedback-modal-content-z-index: 1101;
|
||||||
|
}
|
||||||
|
|
||||||
|
feedback-modal::part(overlay) {
|
||||||
|
top: var(--header-height);
|
||||||
|
/* start below header */
|
||||||
|
height: calc(100vh - var(--header-height));
|
||||||
|
/* fill the rest */
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
<script type="module"
|
||||||
|
src="https://cdn.jsdelivr.net/npm/pushfeedback@latest/dist/pushfeedback/pushfeedback.esm.js"></script>
|
||||||
|
{% endblock %}
|
||||||
|
|
||||||
|
{% block footer %}
|
||||||
|
<feedback-button project="w8plzp8vjp" button-style="dark" button-position="center-right" modal-position="sidebar-right">
|
||||||
|
>
|
||||||
|
Feedback
|
||||||
|
</feedback-button>
|
||||||
|
{% endblock %}
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
site_name: Crawl4AI Documentation (v0.6.x)
|
site_name: Crawl4AI Documentation (v0.6.x)
|
||||||
|
site_favicon: docs/md_v2/favicon.ico
|
||||||
site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
|
site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
|
||||||
site_url: https://docs.crawl4ai.com
|
site_url: https://docs.crawl4ai.com
|
||||||
repo_url: https://github.com/unclecode/crawl4ai
|
repo_url: https://github.com/unclecode/crawl4ai
|
||||||
@@ -8,6 +9,7 @@ docs_dir: docs/md_v2
|
|||||||
nav:
|
nav:
|
||||||
- Home: 'index.md'
|
- Home: 'index.md'
|
||||||
- "Ask AI": "core/ask-ai.md"
|
- "Ask AI": "core/ask-ai.md"
|
||||||
|
- "LLM Context": "core/llmtxt.md"
|
||||||
- "Quick Start": "core/quickstart.md"
|
- "Quick Start": "core/quickstart.md"
|
||||||
- "Code Examples": "core/examples.md"
|
- "Code Examples": "core/examples.md"
|
||||||
- Setup & Installation:
|
- Setup & Installation:
|
||||||
@@ -57,6 +59,8 @@ nav:
|
|||||||
theme:
|
theme:
|
||||||
name: 'terminal'
|
name: 'terminal'
|
||||||
palette: 'dark'
|
palette: 'dark'
|
||||||
|
custom_dir: docs/md_v2/overrides
|
||||||
|
color_mode: 'dark'
|
||||||
icon:
|
icon:
|
||||||
repo: fontawesome/brands/github
|
repo: fontawesome/brands/github
|
||||||
|
|
||||||
@@ -82,8 +86,11 @@ extra_css:
|
|||||||
- assets/styles.css
|
- assets/styles.css
|
||||||
- assets/highlight.css
|
- assets/highlight.css
|
||||||
- assets/dmvendor.css
|
- assets/dmvendor.css
|
||||||
|
- assets/feedback-overrides.css
|
||||||
|
|
||||||
extra_javascript:
|
extra_javascript:
|
||||||
|
- https://www.googletagmanager.com/gtag/js?id=G-58W0K2ZQ25
|
||||||
|
- assets/gtag.js
|
||||||
- assets/highlight.min.js
|
- assets/highlight.min.js
|
||||||
- assets/highlight_init.js
|
- assets/highlight_init.js
|
||||||
- https://buttons.github.io/buttons.js
|
- https://buttons.github.io/buttons.js
|
||||||
|
|||||||
@@ -26,7 +26,6 @@ dependencies = [
|
|||||||
"xxhash~=3.4",
|
"xxhash~=3.4",
|
||||||
"rank-bm25~=0.2",
|
"rank-bm25~=0.2",
|
||||||
"aiofiles>=24.1.0",
|
"aiofiles>=24.1.0",
|
||||||
"colorama~=0.4",
|
|
||||||
"snowballstemmer~=2.2",
|
"snowballstemmer~=2.2",
|
||||||
"pydantic>=2.10",
|
"pydantic>=2.10",
|
||||||
"pyOpenSSL>=24.3.0",
|
"pyOpenSSL>=24.3.0",
|
||||||
|
|||||||
Reference in New Issue
Block a user