diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index 4f9da890..57b3fc4b 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -926,6 +926,8 @@ class CrawlerRunConfig():
Default: False.
scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True.
Default: 0.2.
+ max_scroll_steps (Optional[int]): Maximum number of scroll steps to perform during full page scan.
+ If None, scrolls until the entire page is loaded. Default: None.
process_iframes (bool): If True, attempts to process and inline iframe content.
Default: False.
remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
@@ -1066,6 +1068,7 @@ class CrawlerRunConfig():
ignore_body_visibility: bool = True,
scan_full_page: bool = False,
scroll_delay: float = 0.2,
+ max_scroll_steps: Optional[int] = None,
process_iframes: bool = False,
remove_overlay_elements: bool = False,
simulate_user: bool = False,
@@ -1170,6 +1173,7 @@ class CrawlerRunConfig():
self.ignore_body_visibility = ignore_body_visibility
self.scan_full_page = scan_full_page
self.scroll_delay = scroll_delay
+ self.max_scroll_steps = max_scroll_steps
self.process_iframes = process_iframes
self.remove_overlay_elements = remove_overlay_elements
self.simulate_user = simulate_user
@@ -1387,6 +1391,7 @@ class CrawlerRunConfig():
ignore_body_visibility=kwargs.get("ignore_body_visibility", True),
scan_full_page=kwargs.get("scan_full_page", False),
scroll_delay=kwargs.get("scroll_delay", 0.2),
+ max_scroll_steps=kwargs.get("max_scroll_steps"),
process_iframes=kwargs.get("process_iframes", False),
remove_overlay_elements=kwargs.get("remove_overlay_elements", False),
simulate_user=kwargs.get("simulate_user", False),
@@ -1499,6 +1504,7 @@ class CrawlerRunConfig():
"ignore_body_visibility": self.ignore_body_visibility,
"scan_full_page": self.scan_full_page,
"scroll_delay": self.scroll_delay,
+ "max_scroll_steps": self.max_scroll_steps,
"process_iframes": self.process_iframes,
"remove_overlay_elements": self.remove_overlay_elements,
"simulate_user": self.simulate_user,
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index b94e68ca..9fdb0fe2 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -469,9 +469,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
console_messages=captured_console,
)
- elif url.startswith("raw:") or url.startswith("raw://"):
+ #####
+        # Since both "raw:" and "raw://" start with "raw:", the old check
+        # url[:4] == "raw:" matched both prefixes, so "raw://" URLs were sliced
+        # as url[4:], leaving an incorrect leading "//".
+        # Fix: check for "raw://" first, then fall back to "raw:".
+        # Also, "raw://" is 6 characters long, not 7, so it must be sliced as url[6:].
+ #####
+ elif url.startswith("raw://") or url.startswith("raw:"):
# Process raw HTML content
- raw_html = url[4:] if url[:4] == "raw:" else url[7:]
+ # raw_html = url[4:] if url[:4] == "raw:" else url[7:]
+ raw_html = url[6:] if url.startswith("raw://") else url[4:]
html = raw_html
if config.screenshot:
screenshot_data = await self._generate_screenshot_from_html(html)
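
A quick standalone illustration of the slice offsets (plain Python, no crawl4ai imports needed):

```python
url_double = "raw://<p>hello</p>"
url_single = "raw:<p>hello</p>"

# len("raw://") == 6, len("raw:") == 4
assert url_double[6:] == "<p>hello</p>"
assert url_single[4:] == "<p>hello</p>"

# The old code tested url[:4] == "raw:" first, which also matches "raw://",
# so url_double was sliced as url[4:], leaving a stray "//":
assert url_double[4:] == "//<p>hello</p>"
```
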
@@ -930,7 +936,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Handle full page scanning
if config.scan_full_page:
- await self._handle_full_page_scan(page, config.scroll_delay)
+ # await self._handle_full_page_scan(page, config.scroll_delay)
+ await self._handle_full_page_scan(page, config.scroll_delay, config.max_scroll_steps)
# Handle virtual scroll if configured
if config.virtual_scroll_config:
@@ -1122,7 +1129,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Close the page
await page.close()
- async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):
+ # async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):
+ async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, max_scroll_steps: Optional[int] = None):
"""
Helper method to handle full page scanning.
@@ -1137,6 +1145,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
Args:
page (Page): The Playwright page object
scroll_delay (float): The delay between page scrolls
+ max_scroll_steps (Optional[int]): Maximum number of scroll steps to perform. If None, scrolls until end.
"""
try:
@@ -1161,9 +1170,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
dimensions = await self.get_page_dimensions(page)
total_height = dimensions["height"]
+ scroll_step_count = 0
while current_position < total_height:
+ ####
+ # NEW FEATURE: Check if we've reached the maximum allowed scroll steps
+            # This prevents unbounded scrolling on very long or infinitely loading pages
+ # If max_scroll_steps is None, this check is skipped (unlimited scrolling - original behavior)
+ ####
+ if max_scroll_steps is not None and scroll_step_count >= max_scroll_steps:
+ break
current_position = min(current_position + viewport_height, total_height)
await self.safe_scroll(page, 0, current_position, delay=scroll_delay)
+
+ # Increment the step counter for max_scroll_steps tracking
+ scroll_step_count += 1
+
# await page.evaluate(f"window.scrollTo(0, {current_position})")
# await asyncio.sleep(scroll_delay)
@@ -1804,12 +1825,31 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# then wait for the new page to load before continuing
result = None
try:
+ # OLD VERSION:
+ # result = await page.evaluate(
+ # f"""
+ # (async () => {{
+ # try {{
+ # const script_result = {script};
+ # return {{ success: true, result: script_result }};
+ # }} catch (err) {{
+ # return {{ success: false, error: err.toString(), stack: err.stack }};
+ # }}
+ # }})();
+ # """
+ # )
+
+            # NEW VERSION:
+            # When {script} contains statements (e.g., `const link = ...; link.click();`),
+            # interpolating it into `const script_result = {script};` formed invalid
+            # JavaScript, causing a Playwright execution error:
+            # SyntaxError: Unexpected token 'const'.
+            # Wrapping {script} in its own async IIFE makes statements legal.
result = await page.evaluate(
f"""
(async () => {{
try {{
- const script_result = {script};
- return {{ success: true, result: script_result }};
+ return await (async () => {{
+ {script}
+ }})();
}} catch (err) {{
return {{ success: false, error: err.toString(), stack: err.stack }};
}}
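
A minimal sketch of the kind of multi-statement script that previously failed and now runs, assuming the standard `js_code` option on `CrawlerRunConfig`; the URL and selector are placeholders:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

# Multi-statement snippet: invalid when interpolated as
# `const script_result = {script};`, but fine inside its own async IIFE.
js = """
const link = document.querySelector('a.next-page');
if (link) { link.click(); }
"""

async def main():
    config = CrawlerRunConfig(js_code=[js])
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com/articles", config=config)
        print(result.success)

asyncio.run(main())
```
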
diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py
index 122be482..b65112e2 100644
--- a/crawl4ai/deep_crawling/filters.py
+++ b/crawl4ai/deep_crawling/filters.py
@@ -227,10 +227,21 @@ class URLPatternFilter(URLFilter):
# Prefix check (/foo/*)
if self._simple_prefixes:
path = url.split("?")[0]
- if any(path.startswith(p) for p in self._simple_prefixes):
- result = True
- self._update_stats(result)
- return not result if self._reverse else result
+ # if any(path.startswith(p) for p in self._simple_prefixes):
+ # result = True
+ # self._update_stats(result)
+ # return not result if self._reverse else result
+ ####
+            # Modified the prefix matching logic to enforce path-boundary checks:
+            # - The matched prefix must be followed by a path separator ("/"), a query ("?"),
+            #   a fragment ("#"), or the end of the path.
+            # - This ensures a prefix like "/api" matches "/api" and "/api/users" but not
+            #   substrings such as "/apiv2/users".
+ ####
+ for prefix in self._simple_prefixes:
+ if path.startswith(prefix):
+ if len(path) == len(prefix) or path[len(prefix)] in ['/', '?', '#']:
+ result = True
+ self._update_stats(result)
+ return not result if self._reverse else result
# Complex patterns
if self._path_patterns:
@@ -337,6 +348,15 @@ class ContentTypeFilter(URLFilter):
"sqlite": "application/vnd.sqlite3",
# Placeholder
"unknown": "application/octet-stream", # Fallback for unknown file types
+ # php
+ "php": "application/x-httpd-php",
+ "php3": "application/x-httpd-php",
+ "php4": "application/x-httpd-php",
+ "php5": "application/x-httpd-php",
+ "php7": "application/x-httpd-php",
+ "phtml": "application/x-httpd-php",
+ "phps": "application/x-httpd-php-source",
+
}
@staticmethod
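
A standalone sketch of the boundary check introduced above (illustrative only, not library code):

```python
def prefix_matches(path: str, prefix: str) -> bool:
    """Prefix match that only succeeds at a path-segment boundary."""
    if not path.startswith(prefix):
        return False
    return len(path) == len(prefix) or path[len(prefix)] in ("/", "?", "#")

assert prefix_matches("/api", "/api")              # exact match
assert prefix_matches("/api/users", "/api")        # complete segment
assert not prefix_matches("/apiv2/users", "/api")  # substring is rejected
```
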
diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md
index 4538eb0c..74ad794f 100644
--- a/deploy/docker/c4ai-doc-context.md
+++ b/deploy/docker/c4ai-doc-context.md
@@ -332,7 +332,7 @@ The `clone()` method:
### Key fields to note
1. **`provider`**:
-- Which LLM provoder to use.
+- Which LLM provider to use.
- Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`
*(default: `"openai/gpt-4o-mini"`)*
2. **`api_token`**:
diff --git a/docs/examples/llm_extraction_openai_pricing.py b/docs/examples/llm_extraction_openai_pricing.py
index 27a1c310..de9c1c4a 100644
--- a/docs/examples/llm_extraction_openai_pricing.py
+++ b/docs/examples/llm_extraction_openai_pricing.py
@@ -1,43 +1,55 @@
-from crawl4ai import LLMConfig
-from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
import asyncio
-import os
-import json
from pydantic import BaseModel, Field
-
-url = "https://openai.com/api/pricing/"
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig, BrowserConfig, CacheMode
+from crawl4ai.extraction_strategy import LLMExtractionStrategy
+from typing import Dict
+import os
class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
- output_fee: str = Field(
- ..., description="Fee for output token for the OpenAI model."
+ output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
+
+
+async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
+ print(f"\n--- Extracting Structured Data with {provider} ---")
+
+ if api_token is None and provider != "ollama":
+ print(f"API token is required for {provider}. Skipping this example.")
+ return
+
+ browser_config = BrowserConfig(headless=True)
+
+ extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
+ if extra_headers:
+ extra_args["extra_headers"] = extra_headers
+
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ word_count_threshold=1,
+ page_timeout=80000,
+ extraction_strategy=LLMExtractionStrategy(
+ llm_config=LLMConfig(provider=provider, api_token=api_token),
+ schema=OpenAIModelFee.model_json_schema(),
+ extraction_type="schema",
+ instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
+ Do not miss any models in the entire content.""",
+ extra_args=extra_args,
+ ),
)
-async def main():
- # Use AsyncWebCrawler
- async with AsyncWebCrawler() as crawler:
+ async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
- url=url,
- word_count_threshold=1,
- extraction_strategy=LLMExtractionStrategy(
- # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
- llm_config=LLMConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
- schema=OpenAIModelFee.model_json_schema(),
- extraction_type="schema",
- instruction="From the crawled content, extract all mentioned model names along with their "
- "fees for input and output tokens. Make sure not to miss anything in the entire content. "
- "One extracted model JSON format should look like this: "
- '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }',
- ),
+ url="https://openai.com/api/pricing/",
+ config=crawler_config
)
- print("Success:", result.success)
- model_fees = json.loads(result.extracted_content)
- print(len(model_fees))
-
- with open(".data/data.json", "w", encoding="utf-8") as f:
- f.write(result.extracted_content)
+ print(result.extracted_content)
-asyncio.run(main())
+if __name__ == "__main__":
+ asyncio.run(
+ extract_structured_data_using_llm(
+ provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
+ )
+ )
diff --git a/docs/md_v2/advanced/pdf-parsing.md b/docs/md_v2/advanced/pdf-parsing.md
new file mode 100644
index 00000000..909c0dd1
--- /dev/null
+++ b/docs/md_v2/advanced/pdf-parsing.md
@@ -0,0 +1,201 @@
+# PDF Processing Strategies
+
+Crawl4AI provides specialized strategies for handling and extracting content from PDF files. These strategies allow you to seamlessly integrate PDF processing into your crawling workflows, whether the PDFs are hosted online or stored locally.
+
+## `PDFCrawlerStrategy`
+
+### Overview
+`PDFCrawlerStrategy` is an implementation of `AsyncCrawlerStrategy` designed specifically for PDF documents. Instead of interpreting the input URL as an HTML webpage, this strategy treats it as a pointer to a PDF file. It doesn't perform deep crawling or HTML parsing itself but rather prepares the PDF source for a dedicated PDF scraping strategy. Its primary role is to identify the PDF source (web URL or local file) and pass it along the processing pipeline in a way that `AsyncWebCrawler` can handle.
+
+### When to Use
+Use `PDFCrawlerStrategy` when you need to:
+- Process PDF files using the `AsyncWebCrawler`.
+- Handle PDFs from both web URLs (e.g., `https://example.com/document.pdf`) and local file paths (e.g., `file:///path/to/your/document.pdf`).
+- Integrate PDF content extraction into a unified `CrawlResult` object, allowing consistent handling of PDF data alongside web page data.
+
+### Key Methods and Their Behavior
+- **`__init__(self, logger: AsyncLogger = None)`**:
+ - Initializes the strategy.
+ - `logger`: An optional `AsyncLogger` instance (from `crawl4ai.async_logger`) for logging purposes.
+- **`async crawl(self, url: str, **kwargs) -> AsyncCrawlResponse`**:
+ - This method is called by the `AsyncWebCrawler` during the `arun` process.
+ - It takes the `url` (which should point to a PDF) and creates a minimal `AsyncCrawlResponse`.
+ - The `html` attribute of this response is typically empty or a placeholder, as the actual PDF content processing is deferred to the `PDFContentScrapingStrategy` (or a similar PDF-aware scraping strategy).
+ - It sets `response_headers` to indicate "application/pdf" and `status_code` to 200.
+- **`async close(self)`**:
+ - A method for cleaning up any resources used by the strategy. For `PDFCrawlerStrategy`, this is usually minimal.
+- **`async __aenter__(self)` / `async __aexit__(self, exc_type, exc_val, exc_tb)`**:
+ - Enables asynchronous context management for the strategy, allowing it to be used with `async with`.
+
+### Example Usage
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
+
+async def main():
+ # Initialize the PDF crawler strategy
+ pdf_crawler_strategy = PDFCrawlerStrategy()
+
+ # PDFCrawlerStrategy is typically used in conjunction with PDFContentScrapingStrategy
+ # The scraping strategy handles the actual PDF content extraction
+ pdf_scraping_strategy = PDFContentScrapingStrategy()
+ run_config = CrawlerRunConfig(scraping_strategy=pdf_scraping_strategy)
+
+ async with AsyncWebCrawler(crawler_strategy=pdf_crawler_strategy) as crawler:
+ # Example with a remote PDF URL
+ pdf_url = "https://arxiv.org/pdf/2310.06825.pdf" # A public PDF from arXiv
+
+ print(f"Attempting to process PDF: {pdf_url}")
+ result = await crawler.arun(url=pdf_url, config=run_config)
+
+ if result.success:
+ print(f"Successfully processed PDF: {result.url}")
+ print(f"Metadata Title: {result.metadata.get('title', 'N/A')}")
+ # Further processing of result.markdown, result.media, etc.
+ # would be done here, based on what PDFContentScrapingStrategy extracts.
+ if result.markdown and hasattr(result.markdown, 'raw_markdown'):
+ print(f"Extracted text (first 200 chars): {result.markdown.raw_markdown[:200]}...")
+ else:
+ print("No markdown (text) content extracted.")
+ else:
+ print(f"Failed to process PDF: {result.error_message}")
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+### Pros and Cons
+**Pros:**
+- Enables `AsyncWebCrawler` to handle PDF sources directly using familiar `arun` calls.
+- Provides a consistent interface for specifying PDF sources (URLs or local paths).
+- Abstracts the source handling, allowing a separate scraping strategy to focus on PDF content parsing.
+
+**Cons:**
+- Does not perform any PDF data extraction itself; it strictly relies on a compatible scraping strategy (like `PDFContentScrapingStrategy`) to process the PDF.
+- Has limited utility on its own; most of its value comes from being paired with a PDF-specific content scraping strategy.
+
+---
+
+## `PDFContentScrapingStrategy`
+
+### Overview
+`PDFContentScrapingStrategy` is an implementation of `ContentScrapingStrategy` designed to extract text, metadata, and optionally images from PDF documents. It is intended to be used in conjunction with a crawler strategy that can provide it with a PDF source, such as `PDFCrawlerStrategy`. This strategy uses the `NaivePDFProcessorStrategy` internally to perform the low-level PDF parsing.
+
+### When to Use
+Use `PDFContentScrapingStrategy` when your `AsyncWebCrawler` (often configured with `PDFCrawlerStrategy`) needs to:
+- Extract textual content page by page from a PDF document.
+- Retrieve standard metadata embedded within the PDF (e.g., title, author, subject, creation date, page count).
+- Optionally, extract images contained within the PDF pages. These images can be saved to a local directory or made available for further processing.
+- Produce a `ScrapingResult` that can be converted into a `CrawlResult`, making PDF content accessible in a manner similar to HTML web content (e.g., text in `result.markdown`, metadata in `result.metadata`).
+
+### Key Configuration Attributes
+When initializing `PDFContentScrapingStrategy`, you can configure its behavior using the following attributes:
+- **`extract_images: bool = False`**: If `True`, the strategy will attempt to extract images from the PDF.
+- **`save_images_locally: bool = False`**: If `True` (and `extract_images` is also `True`), extracted images will be saved to disk in the `image_save_dir`. If `False`, image data might be available in another form (e.g., base64, depending on the underlying processor) but not saved as separate files by this strategy.
+- **`image_save_dir: str = None`**: Specifies the directory where extracted images should be saved if `save_images_locally` is `True`. If `None`, a default or temporary directory might be used.
+- **`batch_size: int = 4`**: Defines how many PDF pages are processed in a single batch. This can be useful for managing memory when dealing with very large PDF documents.
+- **`logger: AsyncLogger = None`**: An optional `AsyncLogger` instance for logging.
+
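+A brief configuration sketch (the image directory is a placeholder):
+
+```python
+from crawl4ai.processors.pdf import PDFContentScrapingStrategy
+
+# Extract text and images, saving images to a local folder
+strategy = PDFContentScrapingStrategy(
+    extract_images=True,
+    save_images_locally=True,
+    image_save_dir="./pdf_images",
+    batch_size=4,  # pages processed per batch
+)
+```
+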
+### Key Methods and Their Behavior
+- **`__init__(self, save_images_locally: bool = False, extract_images: bool = False, image_save_dir: str = None, batch_size: int = 4, logger: AsyncLogger = None)`**:
+ - Initializes the strategy with configurations for image handling, batch processing, and logging. It sets up an internal `NaivePDFProcessorStrategy` instance which performs the actual PDF parsing.
+- **`scrap(self, url: str, html: str, **params) -> ScrapingResult`**:
+ - This is the primary synchronous method called by the crawler (via `ascrap`) to process the PDF.
+ - `url`: The path or URL to the PDF file (provided by `PDFCrawlerStrategy` or similar).
+ - `html`: Typically an empty string when used with `PDFCrawlerStrategy`, as the content is a PDF, not HTML.
+ - It first ensures the PDF is accessible locally (downloads it to a temporary file if `url` is remote).
+ - It then uses its internal PDF processor to extract text, metadata, and images (if configured).
+ - The extracted information is compiled into a `ScrapingResult` object:
+      - `cleaned_html`: Contains an HTML-like representation of the PDF, where each page's content is often wrapped in a `<div>` element.
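+
+As a minimal sketch, the strategy can also be invoked directly (the file path is a placeholder; `html` is an empty string because the source is a PDF rather than a web page):
+
+```python
+from crawl4ai.processors.pdf import PDFContentScrapingStrategy
+
+strategy = PDFContentScrapingStrategy()
+result = strategy.scrap(url="file:///path/to/your/document.pdf", html="")
+print(result.cleaned_html[:200])
+```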