diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..0632736a --- /dev/null +++ b/.gitattributes @@ -0,0 +1,5 @@ +# Ignore Markdown files for language statistics +*.md linguist-documentation=true + +# Force Python files to be detected +*.py linguist-language=Python diff --git a/README.md b/README.md index af4278ab..dc84b0ed 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,15 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant 🎉 **Version 0.4.24x is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog) +
+
📦 My Personal Story
I’ve always loved exploring web development, going back to when HTML and JavaScript were barely intertwined. My curiosity drove me into web development, mathematics, AI, and machine learning, always keeping a close tie to real industrial applications. In 2009–2010, as a postgraduate student, I created platforms to gather and organize published papers for Master’s and PhD researchers. Faced with post-grad students’ data challenges, I built a helper app to crawl newly published papers and public data. Relying on Internet Explorer and DLL hacks was far more cumbersome than modern tooling, but it grounded my longtime background in data extraction.

Fast-forward to 2023: I needed to fetch web data and transform it into neat **markdown** for my AI pipeline. All the solutions I found were either **closed-source**, overpriced, or produced low-quality output. As someone who has built large edu-tech ventures (like KidoCode), I believe **data belongs to the people**. We shouldn’t pay $16 just to parse the web’s publicly available content. This friction led me to create my own library, **Crawl4AI**, in a matter of days to meet my immediate needs. Unexpectedly, it went **viral**, accumulating thousands of GitHub stars.

Now, in **January 2025**, Crawl4AI has surpassed **21,000 stars** and remains the #1 trending repository. It’s my way of giving back to the community after benefiting from open source for years, and I’m thrilled by how many of you share that passion. Thank you for being here. Join our Discord, file issues, submit PRs, or just spread the word. Let’s build the best data extraction, crawling, and scraping library **together**.
+ ## 🧐 Why Crawl4AI? 1. **Built for LLMs**: Creates smart, concise Markdown optimized for RAG and fine-tuning applications. diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 8ec3d053..ea8194f4 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.247" +__version__ = "0.4.248" diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index b879413c..cf376f9a 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -453,12 +453,7 @@ class BrowserManager: return browser_args - async def setup_context( - self, - context: BrowserContext, - crawlerRunConfig: CrawlerRunConfig, - is_default=False, - ): + async def setup_context(self, context: BrowserContext, crawlerRunConfig: CrawlerRunConfig = None, is_default=False): """ Set up a browser context with the configured options. @@ -516,16 +511,17 @@ class BrowserManager: # Add default cookie await context.add_cookies( - [{"name": "cookiesEnabled", "value": "true", "url": crawlerRunConfig.url}] + [{"name": "cookiesEnabled", "value": "true", "url": crawlerRunConfig.url if crawlerRunConfig else "https://crawl4ai.com/"}] ) # Handle navigator overrides - if ( - crawlerRunConfig.override_navigator - or crawlerRunConfig.simulate_user - or crawlerRunConfig.magic - ): - await context.add_init_script(load_js_script("navigator_overrider")) + if crawlerRunConfig: + if ( + crawlerRunConfig.override_navigator + or crawlerRunConfig.simulate_user + or crawlerRunConfig.magic + ): + await context.add_init_script(load_js_script("navigator_overrider")) async def create_browser_context(self): """ diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 3e688f13..f6c62cc9 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -74,280 +74,7 @@ class NoExtractionStrategy(ExtractionStrategy): def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)] -####################################################### -# Strategies using LLM-based extraction for text data # -####################################################### -class LLMExtractionStrategy(ExtractionStrategy): - """ - A strategy that uses an LLM to extract meaningful content from the HTML. - - Attributes: - provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". - api_token: The API token for the provider. - instruction: The instruction to use for the LLM model. - schema: Pydantic model schema for structured data. - extraction_type: "block" or "schema". - chunk_token_threshold: Maximum tokens per chunk. - overlap_rate: Overlap between chunks. - word_token_rate: Word to token conversion rate. - apply_chunking: Whether to apply chunking. - base_url: The base URL for the API request. - api_base: The base URL for the API request. - extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. - verbose: Whether to print verbose output. - usages: List of individual token usages. - total_usage: Accumulated token usage. - """ - def __init__(self, - provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, - instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs): - """ - Initialize the strategy with clustering parameters. - - Args: - provider: The provider to use for extraction. 
It follows the format /, e.g., "ollama/llama3.3". - api_token: The API token for the provider. - instruction: The instruction to use for the LLM model. - schema: Pydantic model schema for structured data. - extraction_type: "block" or "schema". - chunk_token_threshold: Maximum tokens per chunk. - overlap_rate: Overlap between chunks. - word_token_rate: Word to token conversion rate. - apply_chunking: Whether to apply chunking. - base_url: The base URL for the API request. - api_base: The base URL for the API request. - extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. - verbose: Whether to print verbose output. - usages: List of individual token usages. - total_usage: Accumulated token usage. - - """ - super().__init__(**kwargs) - self.provider = provider - self.api_token = api_token or PROVIDER_MODELS.get(provider, "no-token") or os.getenv("OPENAI_API_KEY") - self.instruction = instruction - self.extract_type = extraction_type - self.schema = schema - if schema: - self.extract_type = "schema" - - self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD) - self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE) - self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE) - self.apply_chunking = kwargs.get("apply_chunking", True) - self.base_url = kwargs.get("base_url", None) - self.api_base = kwargs.get("api_base", kwargs.get("base_url", None)) - self.extra_args = kwargs.get("extra_args", {}) - if not self.apply_chunking: - self.chunk_token_threshold = 1e9 - - self.verbose = kwargs.get("verbose", False) - self.usages = [] # Store individual usages - self.total_usage = TokenUsage() # Accumulated usage - - if not self.api_token: - raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.") - - - def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]: - """ - Extract meaningful blocks or chunks from the given HTML using an LLM. - - How it works: - 1. Construct a prompt with variables. - 2. Make a request to the LLM using the prompt. - 3. Parse the response and extract blocks or chunks. - - Args: - url: The URL of the webpage. - ix: Index of the block. - html: The HTML content of the webpage. - - Returns: - A list of extracted blocks or chunks. 
- """ - if self.verbose: - # print("[LOG] Extracting blocks from URL:", url) - print(f"[LOG] Call LLM for {url} - block index: {ix}") - - variable_values = { - "URL": url, - "HTML": escape_json_string(sanitize_html(html)), - } - - prompt_with_variables = PROMPT_EXTRACT_BLOCKS - if self.instruction: - variable_values["REQUEST"] = self.instruction - prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION - - if self.extract_type == "schema" and self.schema: - variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) - prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION - - for variable in variable_values: - prompt_with_variables = prompt_with_variables.replace( - "{" + variable + "}", variable_values[variable] - ) - - response = perform_completion_with_backoff( - self.provider, - prompt_with_variables, - self.api_token, - base_url=self.api_base or self.base_url, - extra_args = self.extra_args - ) # , json_response=self.extract_type == "schema") - # Track usage - usage = TokenUsage( - completion_tokens=response.usage.completion_tokens, - prompt_tokens=response.usage.prompt_tokens, - total_tokens=response.usage.total_tokens, - completion_tokens_details=response.usage.completion_tokens_details.__dict__ if response.usage.completion_tokens_details else {}, - prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ if response.usage.prompt_tokens_details else {} - ) - self.usages.append(usage) - - # Update totals - self.total_usage.completion_tokens += usage.completion_tokens - self.total_usage.prompt_tokens += usage.prompt_tokens - self.total_usage.total_tokens += usage.total_tokens - - try: - blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks'] - blocks = json.loads(blocks) - for block in blocks: - block['error'] = False - except Exception as e: - parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content) - blocks = parsed - if unparsed: - blocks.append({ - "index": 0, - "error": True, - "tags": ["error"], - "content": unparsed - }) - - if self.verbose: - print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix) - return blocks - - def _merge(self, documents, chunk_token_threshold, overlap): - """ - Merge documents into sections based on chunk_token_threshold and overlap. 
- """ - chunks = [] - sections = [] - total_tokens = 0 - - # Calculate the total tokens across all documents - for document in documents: - total_tokens += len(document.split(' ')) * self.word_token_rate - - # Calculate the number of sections needed - num_sections = math.floor(total_tokens / chunk_token_threshold) - if num_sections < 1: - num_sections = 1 # Ensure there is at least one section - adjusted_chunk_threshold = total_tokens / num_sections - - total_token_so_far = 0 - current_chunk = [] - - for document in documents: - tokens = document.split(' ') - token_count = len(tokens) * self.word_token_rate - - if total_token_so_far + token_count <= adjusted_chunk_threshold: - current_chunk.extend(tokens) - total_token_so_far += token_count - else: - # Ensure to handle the last section properly - if len(sections) == num_sections - 1: - current_chunk.extend(tokens) - continue - - # Add overlap if specified - if overlap > 0 and current_chunk: - overlap_tokens = current_chunk[-overlap:] - current_chunk.extend(overlap_tokens) - - sections.append(' '.join(current_chunk)) - current_chunk = tokens - total_token_so_far = token_count - - # Add the last chunk - if current_chunk: - sections.append(' '.join(current_chunk)) - - return sections - - - def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]: - """ - Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy. - - Args: - url: The URL of the webpage. - sections: List of sections (strings) to process. - - Returns: - A list of extracted blocks or chunks. - """ - - merged_sections = self._merge( - sections, self.chunk_token_threshold, - overlap= int(self.chunk_token_threshold * self.overlap_rate) - ) - extracted_content = [] - if self.provider.startswith("groq/"): - # Sequential processing with a delay - for ix, section in enumerate(merged_sections): - extract_func = partial(self.extract, url) - extracted_content.extend(extract_func(ix, sanitize_input_encode(section))) - time.sleep(0.5) # 500 ms delay between each processing - else: - # Parallel processing using ThreadPoolExecutor - # extract_func = partial(self.extract, url) - # for ix, section in enumerate(merged_sections): - # extracted_content.append(extract_func(ix, section)) - - with ThreadPoolExecutor(max_workers=4) as executor: - extract_func = partial(self.extract, url) - futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)] - - for future in as_completed(futures): - try: - extracted_content.extend(future.result()) - except Exception as e: - if self.verbose: - print(f"Error in thread execution: {e}") - # Add error information to extracted_content - extracted_content.append({ - "index": 0, - "error": True, - "tags": ["error"], - "content": str(e) - }) - - - return extracted_content - - - def show_usage(self) -> None: - """Print a detailed token usage report showing total and per-request usage.""" - print("\n=== Token Usage Summary ===") - print(f"{'Type':<15} {'Count':>12}") - print("-" * 30) - print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}") - print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}") - print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}") - - print("\n=== Usage History ===") - print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}") - print("-" * 48) - for i, usage in enumerate(self.usages, 1): - print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} 
{usage.total_tokens:>12,}") - ####################################################### # Strategies using clustering for text data extraction # ####################################################### @@ -664,6 +391,284 @@ class CosineStrategy(ExtractionStrategy): # This strategy processes all sections together return self.extract(url, self.DEL.join(sections), **kwargs) + + + +####################################################### +# Strategies using LLM-based extraction for text data # +####################################################### +class LLMExtractionStrategy(ExtractionStrategy): + """ + A strategy that uses an LLM to extract meaningful content from the HTML. + + Attributes: + provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". + api_token: The API token for the provider. + instruction: The instruction to use for the LLM model. + schema: Pydantic model schema for structured data. + extraction_type: "block" or "schema". + chunk_token_threshold: Maximum tokens per chunk. + overlap_rate: Overlap between chunks. + word_token_rate: Word to token conversion rate. + apply_chunking: Whether to apply chunking. + base_url: The base URL for the API request. + api_base: The base URL for the API request. + extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. + verbose: Whether to print verbose output. + usages: List of individual token usages. + total_usage: Accumulated token usage. + """ + + def __init__(self, + provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, + instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs): + """ + Initialize the strategy with clustering parameters. + + Args: + provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". + api_token: The API token for the provider. + instruction: The instruction to use for the LLM model. + schema: Pydantic model schema for structured data. + extraction_type: "block" or "schema". + chunk_token_threshold: Maximum tokens per chunk. + overlap_rate: Overlap between chunks. + word_token_rate: Word to token conversion rate. + apply_chunking: Whether to apply chunking. + base_url: The base URL for the API request. + api_base: The base URL for the API request. + extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. + verbose: Whether to print verbose output. + usages: List of individual token usages. + total_usage: Accumulated token usage. 
+ + """ + super().__init__(**kwargs) + self.provider = provider + self.api_token = api_token or PROVIDER_MODELS.get(provider, "no-token") or os.getenv("OPENAI_API_KEY") + self.instruction = instruction + self.extract_type = extraction_type + self.schema = schema + if schema: + self.extract_type = "schema" + + self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD) + self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE) + self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE) + self.apply_chunking = kwargs.get("apply_chunking", True) + self.base_url = kwargs.get("base_url", None) + self.api_base = kwargs.get("api_base", kwargs.get("base_url", None)) + self.extra_args = kwargs.get("extra_args", {}) + if not self.apply_chunking: + self.chunk_token_threshold = 1e9 + + self.verbose = kwargs.get("verbose", False) + self.usages = [] # Store individual usages + self.total_usage = TokenUsage() # Accumulated usage + + if not self.api_token: + raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.") + + + def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML using an LLM. + + How it works: + 1. Construct a prompt with variables. + 2. Make a request to the LLM using the prompt. + 3. Parse the response and extract blocks or chunks. + + Args: + url: The URL of the webpage. + ix: Index of the block. + html: The HTML content of the webpage. + + Returns: + A list of extracted blocks or chunks. + """ + if self.verbose: + # print("[LOG] Extracting blocks from URL:", url) + print(f"[LOG] Call LLM for {url} - block index: {ix}") + + variable_values = { + "URL": url, + "HTML": escape_json_string(sanitize_html(html)), + } + + prompt_with_variables = PROMPT_EXTRACT_BLOCKS + if self.instruction: + variable_values["REQUEST"] = self.instruction + prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION + + if self.extract_type == "schema" and self.schema: + variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) + prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION + + for variable in variable_values: + prompt_with_variables = prompt_with_variables.replace( + "{" + variable + "}", variable_values[variable] + ) + + response = perform_completion_with_backoff( + self.provider, + prompt_with_variables, + self.api_token, + base_url=self.api_base or self.base_url, + extra_args = self.extra_args + ) # , json_response=self.extract_type == "schema") + # Track usage + usage = TokenUsage( + completion_tokens=response.usage.completion_tokens, + prompt_tokens=response.usage.prompt_tokens, + total_tokens=response.usage.total_tokens, + completion_tokens_details=response.usage.completion_tokens_details.__dict__ if response.usage.completion_tokens_details else {}, + prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ if response.usage.prompt_tokens_details else {} + ) + self.usages.append(usage) + + # Update totals + self.total_usage.completion_tokens += usage.completion_tokens + self.total_usage.prompt_tokens += usage.prompt_tokens + self.total_usage.total_tokens += usage.total_tokens + + try: + blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks'] + blocks = json.loads(blocks) + for block in blocks: + block['error'] = False + except Exception as e: + parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content) + blocks = 
parsed + if unparsed: + blocks.append({ + "index": 0, + "error": True, + "tags": ["error"], + "content": unparsed + }) + + if self.verbose: + print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix) + return blocks + + def _merge(self, documents, chunk_token_threshold, overlap): + """ + Merge documents into sections based on chunk_token_threshold and overlap. + """ + chunks = [] + sections = [] + total_tokens = 0 + + # Calculate the total tokens across all documents + for document in documents: + total_tokens += len(document.split(' ')) * self.word_token_rate + + # Calculate the number of sections needed + num_sections = math.floor(total_tokens / chunk_token_threshold) + if num_sections < 1: + num_sections = 1 # Ensure there is at least one section + adjusted_chunk_threshold = total_tokens / num_sections + + total_token_so_far = 0 + current_chunk = [] + + for document in documents: + tokens = document.split(' ') + token_count = len(tokens) * self.word_token_rate + + if total_token_so_far + token_count <= adjusted_chunk_threshold: + current_chunk.extend(tokens) + total_token_so_far += token_count + else: + # Ensure to handle the last section properly + if len(sections) == num_sections - 1: + current_chunk.extend(tokens) + continue + + # Add overlap if specified + if overlap > 0 and current_chunk: + overlap_tokens = current_chunk[-overlap:] + current_chunk.extend(overlap_tokens) + + sections.append(' '.join(current_chunk)) + current_chunk = tokens + total_token_so_far = token_count + + # Add the last chunk + if current_chunk: + sections.append(' '.join(current_chunk)) + + return sections + + + def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]: + """ + Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy. + + Args: + url: The URL of the webpage. + sections: List of sections (strings) to process. + + Returns: + A list of extracted blocks or chunks. 
+ """ + + merged_sections = self._merge( + sections, self.chunk_token_threshold, + overlap= int(self.chunk_token_threshold * self.overlap_rate) + ) + extracted_content = [] + if self.provider.startswith("groq/"): + # Sequential processing with a delay + for ix, section in enumerate(merged_sections): + extract_func = partial(self.extract, url) + extracted_content.extend(extract_func(ix, sanitize_input_encode(section))) + time.sleep(0.5) # 500 ms delay between each processing + else: + # Parallel processing using ThreadPoolExecutor + # extract_func = partial(self.extract, url) + # for ix, section in enumerate(merged_sections): + # extracted_content.append(extract_func(ix, section)) + + with ThreadPoolExecutor(max_workers=4) as executor: + extract_func = partial(self.extract, url) + futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)] + + for future in as_completed(futures): + try: + extracted_content.extend(future.result()) + except Exception as e: + if self.verbose: + print(f"Error in thread execution: {e}") + # Add error information to extracted_content + extracted_content.append({ + "index": 0, + "error": True, + "tags": ["error"], + "content": str(e) + }) + + + return extracted_content + + + def show_usage(self) -> None: + """Print a detailed token usage report showing total and per-request usage.""" + print("\n=== Token Usage Summary ===") + print(f"{'Type':<15} {'Count':>12}") + print("-" * 30) + print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}") + print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}") + print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}") + + print("\n=== Usage History ===") + print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}") + print("-" * 48) + for i, usage in enumerate(self.usages, 1): + print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} {usage.total_tokens:>12,}") + + ####################################################### # New extraction strategies for JSON-based extraction # diff --git a/docs/examples/hooks_example.py b/docs/examples/hooks_example.py index 09e0bc17..06b509bd 100644 --- a/docs/examples/hooks_example.py +++ b/docs/examples/hooks_example.py @@ -36,7 +36,7 @@ async def main(): 'domain': '.example.com', 'path': '/' }]) - await page.set_viewport_size({"width": 1920, "height": 1080}) + await page.set_viewport_size({"width": 1080, "height": 800}) return page async def on_user_agent_updated(page: Page, context: BrowserContext, user_agent: str, **kwargs): diff --git a/docs/md_v3/tutorials/advanced-features.md b/docs/md_v2/advanced/advanced-features.md similarity index 88% rename from docs/md_v3/tutorials/advanced-features.md rename to docs/md_v2/advanced/advanced-features.md index 16f85874..1f402948 100644 --- a/docs/md_v3/tutorials/advanced-features.md +++ b/docs/md_v2/advanced/advanced-features.md @@ -1,15 +1,16 @@ -# Advanced Features (Proxy, PDF, Screenshot, SSL, Headers, & Storage State) +# Overview of Some Important Advanced Features +(Proxy, PDF, Screenshot, SSL, Headers, & Storage State) Crawl4AI offers multiple power-user features that go beyond simple crawling. This tutorial covers: -1. **Proxy Usage** -2. **Capturing PDFs & Screenshots** -3. **Handling SSL Certificates** -4. **Custom Headers** -5. **Session Persistence & Local Storage** +1. **Proxy Usage** +2. **Capturing PDFs & Screenshots** +3. **Handling SSL Certificates** +4. **Custom Headers** +5. 
**Session Persistence & Local Storage** > **Prerequisites** -> - You have a basic grasp of [AsyncWebCrawler Basics](./async-webcrawler-basics.md) +> - You have a basic grasp of [AsyncWebCrawler Basics](../core/simple-crawling.md) > - You know how to run or configure your Python environment with Playwright installed --- @@ -84,7 +85,7 @@ async def main(): # Save PDF if result.pdf: with open("wikipedia_page.pdf", "wb") as f: - f.write(b64decode(result.pdf)) + f.write(result.pdf) print("[OK] PDF & screenshot captured.") else: @@ -186,7 +187,7 @@ if __name__ == "__main__": **Notes** - Some sites may react differently to certain headers (e.g., `Accept-Language`). -- If you need advanced user-agent randomization or client hints, see [Identity-Based Crawling (Anti-Bot)](./identity-anti-bot.md) or use `UserAgentGenerator`. +- If you need advanced user-agent randomization or client hints, see [Identity-Based Crawling (Anti-Bot)](./identity-based-crawling.md) or use `UserAgentGenerator`. --- @@ -246,7 +247,7 @@ You can sign in once, export the browser context, and reuse it later—without r - **`await context.storage_state(path="my_storage.json")`**: Exports cookies, localStorage, etc. to a file. - Provide `storage_state="my_storage.json"` on subsequent runs to skip the login step. -**See**: [Detailed session management tutorial](./hooks-custom.md#using-storage_state) or [Explanations → Browser Context & Managed Browser](../../explanations/browser-management.md) for more advanced scenarios (like multi-step logins, or capturing after interactive pages). +**See**: [Detailed session management tutorial](./session-management.md) or [Explanations → Browser Context & Managed Browser](./identity-based-crawling.md) for more advanced scenarios (like multi-step logins, or capturing after interactive pages). --- @@ -283,7 +284,10 @@ async def main(): # 3. Crawl async with AsyncWebCrawler(config=browser_cfg) as crawler: - result = await crawler.arun("https://secure.example.com/protected", config=crawler_cfg) + result = await crawler.arun( + url = "https://secure.example.com/protected", + config=crawler_cfg + ) if result.success: print("[OK] Crawled the secure page. Links found:", len(result.links.get("internal", []))) @@ -318,12 +322,6 @@ You’ve now explored several **advanced** features: - **Custom Headers** for language or specialized requests - **Session Persistence** via storage state -**Where to go next**: - -- **[Hooks & Custom Code](./hooks-custom.md)**: For multi-step interactions (clicking “Load More,” performing logins, etc.) -- **[Identity-Based Crawling & Anti-Bot](./identity-anti-bot.md)**: If you need more sophisticated user simulation or stealth. -- **[Reference → BrowserConfig & CrawlerRunConfig](../../reference/configuration.md)**: Detailed param descriptions for everything you’ve seen here and more. - With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, and manage sessions across multiple runs—streamlining your entire data collection pipeline. 
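For quick reference, here is a minimal sketch of the storage-state round trip described above. It assumes you have already signed in once and exported `my_storage.json` via `context.storage_state(path=...)`; passing `storage_state` through `BrowserConfig` is the assumed entry point here, and the URL and CSS selector are placeholders.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

async def main():
    # Reuse the previously exported session (cookies, localStorage)
    # so this run starts out already authenticated.
    browser_cfg = BrowserConfig(
        headless=True,
        storage_state="my_storage.json",  # assumed: file created earlier via context.storage_state(path=...)
    )
    crawler_cfg = CrawlerRunConfig(
        wait_for="css:.logged-in-content"  # placeholder: an element only visible when signed in
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(
            url="https://example.com/dashboard",  # placeholder URL
            config=crawler_cfg,
        )
        print("Crawl succeeded:", result.success)

if __name__ == "__main__":
    asyncio.run(main())
```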
-**Last Updated**: 2024-XX-XX \ No newline at end of file +**Last Updated**: 2025-01-01 \ No newline at end of file diff --git a/docs/md_v2/advanced/content-processing.md b/docs/md_v2/advanced/content-processing.md deleted file mode 100644 index 25ed6172..00000000 --- a/docs/md_v2/advanced/content-processing.md +++ /dev/null @@ -1,136 +0,0 @@ -# Content Processing - -Crawl4AI provides powerful content processing capabilities that help you extract clean, relevant content from web pages. This guide covers content cleaning, media handling, link analysis, and metadata extraction. - -## Media Processing - -Crawl4AI provides comprehensive media extraction and analysis capabilities. It automatically detects and processes various types of media elements while maintaining their context and relevance. - -### Image Processing - -The library handles various image scenarios, including: -- Regular images -- Lazy-loaded images -- Background images -- Responsive images -- Image metadata and context - -```python -from crawl4ai.async_configs import CrawlerRunConfig - -config = CrawlerRunConfig() -result = await crawler.arun(url="https://example.com", config=config) - -for image in result.media["images"]: - # Each image includes rich metadata - print(f"Source: {image['src']}") - print(f"Alt text: {image['alt']}") - print(f"Description: {image['desc']}") - print(f"Context: {image['context']}") # Surrounding text - print(f"Relevance score: {image['score']}") # 0-10 score -``` - -### Handling Lazy-Loaded Content - -Crawl4AI already handles lazy loading for media elements. You can customize the wait time for lazy-loaded content with `CrawlerRunConfig`: - -```python -config = CrawlerRunConfig( - wait_for="css:img[data-src]", # Wait for lazy images - delay_before_return_html=2.0 # Additional wait time -) -result = await crawler.arun(url="https://example.com", config=config) -``` - -### Video and Audio Content - -The library extracts video and audio elements with their metadata: - -```python -from crawl4ai.async_configs import CrawlerRunConfig - -config = CrawlerRunConfig() -result = await crawler.arun(url="https://example.com", config=config) - -# Process videos -for video in result.media["videos"]: - print(f"Video source: {video['src']}") - print(f"Type: {video['type']}") - print(f"Duration: {video.get('duration')}") - print(f"Thumbnail: {video.get('poster')}") - -# Process audio -for audio in result.media["audios"]: - print(f"Audio source: {audio['src']}") - print(f"Type: {audio['type']}") - print(f"Duration: {audio.get('duration')}") -``` - -## Link Analysis - -Crawl4AI provides sophisticated link analysis capabilities, helping you understand the relationship between pages and identify important navigation patterns. - -### Link Classification - -The library automatically categorizes links into: -- Internal links (same domain) -- External links (different domains) -- Social media links -- Navigation links -- Content links - -```python -from crawl4ai.async_configs import CrawlerRunConfig - -config = CrawlerRunConfig() -result = await crawler.arun(url="https://example.com", config=config) - -# Analyze internal links -for link in result.links["internal"]: - print(f"Internal: {link['href']}") - print(f"Link text: {link['text']}") - print(f"Context: {link['context']}") # Surrounding text - print(f"Type: {link['type']}") # nav, content, etc. 
- -# Analyze external links -for link in result.links["external"]: - print(f"External: {link['href']}") - print(f"Domain: {link['domain']}") - print(f"Type: {link['type']}") -``` - -### Smart Link Filtering - -Control which links are included in the results with `CrawlerRunConfig`: - -```python -config = CrawlerRunConfig( - exclude_external_links=True, # Remove external links - exclude_social_media_links=True, # Remove social media links - exclude_social_media_domains=[ # Custom social media domains - "facebook.com", "twitter.com", "instagram.com" - ], - exclude_domains=["ads.example.com"] # Exclude specific domains -) -result = await crawler.arun(url="https://example.com", config=config) -``` - -## Metadata Extraction - -Crawl4AI automatically extracts and processes page metadata, providing valuable information about the content: - -```python -from crawl4ai.async_configs import CrawlerRunConfig - -config = CrawlerRunConfig() -result = await crawler.arun(url="https://example.com", config=config) - -metadata = result.metadata -print(f"Title: {metadata['title']}") -print(f"Description: {metadata['description']}") -print(f"Keywords: {metadata['keywords']}") -print(f"Author: {metadata['author']}") -print(f"Published Date: {metadata['published_date']}") -print(f"Modified Date: {metadata['modified_date']}") -print(f"Language: {metadata['language']}") -``` diff --git a/docs/md_v2/advanced/crawl-dispatcher.md b/docs/md_v2/advanced/crawl-dispatcher.md new file mode 100644 index 00000000..e4059f25 --- /dev/null +++ b/docs/md_v2/advanced/crawl-dispatcher.md @@ -0,0 +1,12 @@ +# Crawl Dispatcher + +We’re excited to announce a **Crawl Dispatcher** module that can handle **thousands** of crawling tasks simultaneously. By efficiently managing system resources (memory, CPU, network), this dispatcher ensures high-performance data extraction at scale. It also provides **real-time monitoring** of each crawler’s status, memory usage, and overall progress. + +Stay tuned—this feature is **coming soon** in an upcoming release of Crawl4AI! For the latest news, keep an eye on our changelogs and follow [@unclecode](https://twitter.com/unclecode) on X. + +Below is a **sample** of how the dispatcher’s performance monitor might look in action: + +![Crawl Dispatcher Performance Monitor](../assets/images/dispatcher.png) + + +We can’t wait to bring you this streamlined, **scalable** approach to multi-URL crawling—**watch this space** for updates! \ No newline at end of file diff --git a/docs/md_v2/basic/file-download.md b/docs/md_v2/advanced/file-downloading.md similarity index 92% rename from docs/md_v2/basic/file-download.md rename to docs/md_v2/advanced/file-downloading.md index eac0f5cb..2fa3759f 100644 --- a/docs/md_v2/basic/file-download.md +++ b/docs/md_v2/advanced/file-downloading.md @@ -17,18 +17,6 @@ async def main(): asyncio.run(main()) ``` -Or, enable it for a specific crawl by using `CrawlerRunConfig`: - -```python -from crawl4ai.async_configs import CrawlerRunConfig - -async def main(): - async with AsyncWebCrawler() as crawler: - config = CrawlerRunConfig(accept_downloads=True) - result = await crawler.arun(url="https://example.com", config=config) - # ... -``` - ## Specifying Download Location Specify the download directory using the `downloads_path` attribute in the `BrowserConfig` object. If not provided, Crawl4AI defaults to creating a "downloads" directory inside the `.crawl4ai` folder in your home directory. 
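As a minimal sketch of the two settings above (the URL and folder are placeholders, and `result.downloaded_files` is assumed to list any files the crawl triggered):

```python
import asyncio
import os
from crawl4ai import AsyncWebCrawler, BrowserConfig

async def main():
    browser_config = BrowserConfig(
        accept_downloads=True,  # enable downloads for this browser instance
        downloads_path=os.path.join(os.getcwd(), "my_downloads"),  # custom download directory
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url="https://example.com/downloads")  # placeholder URL
        # Assumed: downloaded_files holds the paths of files saved during the crawl, if any
        print("Downloaded:", result.downloaded_files)

if __name__ == "__main__":
    asyncio.run(main())
```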
@@ -98,7 +86,8 @@ async def download_multiple_files(url: str, download_path: str): const downloadLinks = document.querySelectorAll('a[download]'); for (const link of downloadLinks) { link.click(); - await new Promise(r => setTimeout(r, 2000)); // Delay between clicks + // Delay between clicks + await new Promise(r => setTimeout(r, 2000)); } """, wait_for=10 # Wait for all downloads to start diff --git a/docs/md_v2/advanced/hooks-auth.md b/docs/md_v2/advanced/hooks-auth.md index 66042229..6787abd9 100644 --- a/docs/md_v2/advanced/hooks-auth.md +++ b/docs/md_v2/advanced/hooks-auth.md @@ -1,121 +1,254 @@ -# Hooks & Auth for AsyncWebCrawler +# Hooks & Auth in AsyncWebCrawler -Crawl4AI's `AsyncWebCrawler` allows you to customize the behavior of the web crawler using hooks. Hooks are asynchronous functions called at specific points in the crawling process, allowing you to modify the crawler's behavior or perform additional actions. This updated documentation demonstrates how to use hooks, including the new `on_page_context_created` hook, and ensures compatibility with `BrowserConfig` and `CrawlerRunConfig`. +Crawl4AI’s **hooks** let you customize the crawler at specific points in the pipeline: -## Example: Using Crawler Hooks with AsyncWebCrawler +1. **`on_browser_created`** – After browser creation. +2. **`on_page_context_created`** – After a new context & page are created. +3. **`before_goto`** – Just before navigating to a page. +4. **`after_goto`** – Right after navigation completes. +5. **`on_user_agent_updated`** – Whenever the user agent changes. +6. **`on_execution_started`** – Once custom JavaScript execution begins. +7. **`before_retrieve_html`** – Just before the crawler retrieves final HTML. +8. **`before_return_html`** – Right before returning the HTML content. -In this example, we'll: +**Important**: Avoid heavy tasks in `on_browser_created` since you don’t yet have a page context. If you need to *log in*, do so in **`on_page_context_created`**. -1. Configure the browser and set up authentication when it's created. -2. Apply custom routing and initial actions when the page context is created. -3. Add custom headers before navigating to the URL. -4. Log the current URL after navigation. -5. Perform actions after JavaScript execution. -6. Log the length of the HTML before returning it. +> note "Important Hook Usage Warning" + **Avoid Misusing Hooks**: Do not manipulate page objects in the wrong hook or at the wrong time, as it can crash the pipeline or produce incorrect results. A common mistake is attempting to handle authentication prematurely—such as creating or closing pages in `on_browser_created`. -### Hook Definitions +> **Use the Right Hook for Auth**: If you need to log in or set tokens, use `on_page_context_created`. This ensures you have a valid page/context to work with, without disrupting the main crawling flow. + +> **Identity-Based Crawling**: For robust auth, consider identity-based crawling (or passing a session ID) to preserve state. Run your initial login steps in a separate, well-defined process, then feed that session to your main crawl—rather than shoehorning complex authentication into early hooks. Check out [Identity-Based Crawling](../advanced/identity-based-crawling.md) for more details. + +> **Be Cautious**: Overwriting or removing elements in the wrong hook can compromise the final crawl. Keep hooks focused on smaller tasks (like route filters, custom headers), and let your main logic (crawling, data extraction) proceed normally. 
+ + +Below is an example demonstration. + +--- + +## Example: Using Hooks in AsyncWebCrawler ```python import asyncio -from crawl4ai import AsyncWebCrawler -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig -from playwright.async_api import Page, Browser, BrowserContext +import json +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from playwright.async_api import Page, BrowserContext -def log_routing(route): - # Example: block loading images - if route.request.resource_type == "image": - print(f"[HOOK] Blocking image request: {route.request.url}") - asyncio.create_task(route.abort()) - else: - asyncio.create_task(route.continue_()) - -async def on_browser_created(browser: Browser, **kwargs): - print("[HOOK] on_browser_created") - # Example: Set browser viewport size and log in - context = await browser.new_context(viewport={"width": 1920, "height": 1080}) - page = await context.new_page() - await page.goto("https://example.com/login") - await page.fill("input[name='username']", "testuser") - await page.fill("input[name='password']", "password123") - await page.click("button[type='submit']") - await page.wait_for_selector("#welcome") - await context.add_cookies([{"name": "auth_token", "value": "abc123", "url": "https://example.com"}]) - await page.close() - await context.close() - -async def on_page_context_created(context: BrowserContext, page: Page, **kwargs): - print("[HOOK] on_page_context_created") - await context.route("**", log_routing) - -async def before_goto(page: Page, context: BrowserContext, **kwargs): - print("[HOOK] before_goto") - await page.set_extra_http_headers({"X-Test-Header": "test"}) - -async def after_goto(page: Page, context: BrowserContext, **kwargs): - print("[HOOK] after_goto") - print(f"Current URL: {page.url}") - -async def on_execution_started(page: Page, context: BrowserContext, **kwargs): - print("[HOOK] on_execution_started") - await page.evaluate("console.log('Custom JS executed')") - -async def before_return_html(page: Page, context: BrowserContext, html: str, **kwargs): - print("[HOOK] before_return_html") - print(f"HTML length: {len(html)}") - return page -``` - -### Using the Hooks with AsyncWebCrawler - -```python async def main(): - print("\n🔗 Using Crawler Hooks: Customize AsyncWebCrawler with hooks!") + print("🔗 Hooks Example: Demonstrating recommended usage") - # Configure browser and crawler settings + # 1) Configure the browser browser_config = BrowserConfig( headless=True, - viewport_width=1920, - viewport_height=1080 + verbose=True ) - + + # 2) Configure the crawler run crawler_run_config = CrawlerRunConfig( js_code="window.scrollTo(0, document.body.scrollHeight);", - wait_for="footer" + wait_for="body", + cache_mode=CacheMode.BYPASS ) - # Initialize crawler - async with AsyncWebCrawler(config=browser_config) as crawler: - crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) - crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) - crawler.crawler_strategy.set_hook("before_goto", before_goto) - crawler.crawler_strategy.set_hook("after_goto", after_goto) - crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) - crawler.crawler_strategy.set_hook("before_return_html", before_return_html) + # 3) Create the crawler instance + crawler = AsyncWebCrawler(config=browser_config) - # Run the crawler - result = await crawler.arun(url="https://example.com", config=crawler_run_config) + # + # Define Hook Functions + # - 
print("\n📦 Crawler Hooks Result:") - print(result) + async def on_browser_created(browser, **kwargs): + # Called once the browser instance is created (but no pages or contexts yet) + print("[HOOK] on_browser_created - Browser created successfully!") + # Typically, do minimal setup here if needed + return browser -asyncio.run(main()) + async def on_page_context_created(page: Page, context: BrowserContext, **kwargs): + # Called right after a new page + context are created (ideal for auth or route config). + print("[HOOK] on_page_context_created - Setting up page & context.") + + # Example 1: Route filtering (e.g., block images) + async def route_filter(route): + if route.request.resource_type == "image": + print(f"[HOOK] Blocking image request: {route.request.url}") + await route.abort() + else: + await route.continue_() + + await context.route("**", route_filter) + + # Example 2: (Optional) Simulate a login scenario + # (We do NOT create or close pages here, just do quick steps if needed) + # e.g., await page.goto("https://example.com/login") + # e.g., await page.fill("input[name='username']", "testuser") + # e.g., await page.fill("input[name='password']", "password123") + # e.g., await page.click("button[type='submit']") + # e.g., await page.wait_for_selector("#welcome") + # e.g., await context.add_cookies([...]) + # Then continue + + # Example 3: Adjust the viewport + await page.set_viewport_size({"width": 1080, "height": 600}) + return page + + async def before_goto( + page: Page, context: BrowserContext, url: str, **kwargs + ): + # Called before navigating to each URL. + print(f"[HOOK] before_goto - About to navigate: {url}") + # e.g., inject custom headers + await page.set_extra_http_headers({ + "Custom-Header": "my-value" + }) + return page + + async def after_goto( + page: Page, context: BrowserContext, + url: str, response, **kwargs + ): + # Called after navigation completes. + print(f"[HOOK] after_goto - Successfully loaded: {url}") + # e.g., wait for a certain element if we want to verify + try: + await page.wait_for_selector('.content', timeout=1000) + print("[HOOK] Found .content element!") + except: + print("[HOOK] .content not found, continuing anyway.") + return page + + async def on_user_agent_updated( + page: Page, context: BrowserContext, + user_agent: str, **kwargs + ): + # Called whenever the user agent updates. + print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}") + return page + + async def on_execution_started(page: Page, context: BrowserContext, **kwargs): + # Called after custom JavaScript execution begins. + print("[HOOK] on_execution_started - JS code is running!") + return page + + async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs): + # Called before final HTML retrieval. + print("[HOOK] before_retrieve_html - We can do final actions") + # Example: Scroll again + await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") + return page + + async def before_return_html( + page: Page, context: BrowserContext, html: str, **kwargs + ): + # Called just before returning the HTML in the result. 
+ print(f"[HOOK] before_return_html - HTML length: {len(html)}") + return page + + # + # Attach Hooks + # + + crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) + crawler.crawler_strategy.set_hook( + "on_page_context_created", on_page_context_created + ) + crawler.crawler_strategy.set_hook("before_goto", before_goto) + crawler.crawler_strategy.set_hook("after_goto", after_goto) + crawler.crawler_strategy.set_hook( + "on_user_agent_updated", on_user_agent_updated + ) + crawler.crawler_strategy.set_hook( + "on_execution_started", on_execution_started + ) + crawler.crawler_strategy.set_hook( + "before_retrieve_html", before_retrieve_html + ) + crawler.crawler_strategy.set_hook( + "before_return_html", before_return_html + ) + + await crawler.start() + + # 4) Run the crawler on an example page + url = "https://example.com" + result = await crawler.arun(url, config=crawler_run_config) + + if result.success: + print("\nCrawled URL:", result.url) + print("HTML length:", len(result.html)) + else: + print("Error:", result.error_message) + + await crawler.close() + +if __name__ == "__main__": + asyncio.run(main()) ``` -### Explanation of Hooks +--- -- **`on_browser_created`**: Called when the browser is created. Use this to configure the browser or handle authentication (e.g., logging in and setting cookies). -- **`on_page_context_created`**: Called when a new page context is created. Use this to apply routing, block resources, or inject custom logic before navigating to the URL. -- **`before_goto`**: Called before navigating to the URL. Use this to add custom headers or perform other pre-navigation actions. -- **`after_goto`**: Called after navigation. Use this to verify content or log the URL. -- **`on_execution_started`**: Called after executing custom JavaScript. Use this to perform additional actions. -- **`before_return_html`**: Called before returning the HTML content. Use this to log details or preprocess the content. +## Hook Lifecycle Summary -### Additional Customizations +1. **`on_browser_created`**: + - Browser is up, but **no** pages or contexts yet. + - Light setup only—don’t try to open or close pages here (that belongs in `on_page_context_created`). -- **Resource Management**: Use `on_page_context_created` to block or modify requests (e.g., block images, fonts, or third-party scripts). -- **Dynamic Headers**: Use `before_goto` to add or modify headers dynamically based on the URL. -- **Authentication**: Use `on_browser_created` to handle login processes and set authentication cookies or tokens. -- **Content Analysis**: Use `before_return_html` to analyze or modify the extracted HTML content. +2. **`on_page_context_created`**: + - Perfect for advanced **auth** or route blocking. + - You have a **page** + **context** ready but haven’t navigated to the target URL yet. -These hooks provide powerful customization options for tailoring the crawling process to your needs. +3. **`before_goto`**: + - Right before navigation. Typically used for setting **custom headers** or logging the target URL. + +4. **`after_goto`**: + - After page navigation is done. Good place for verifying content or waiting on essential elements. + +5. **`on_user_agent_updated`**: + - Whenever the user agent changes (for stealth or different UA modes). + +6. **`on_execution_started`**: + - If you set `js_code` or run custom scripts, this runs once your JS is about to start. + +7. **`before_retrieve_html`**: + - Just before the final HTML snapshot is taken. 
Often you do a final scroll or lazy-load triggers here. + +8. **`before_return_html`**: + - The last hook before returning HTML to the `CrawlResult`. Good for logging HTML length or minor modifications. + +--- + +## When to Handle Authentication + +**Recommended**: Use **`on_page_context_created`** if you need to: + +- Navigate to a login page or fill forms +- Set cookies or localStorage tokens +- Block resource routes to avoid ads + +This ensures the newly created context is under your control **before** `arun()` navigates to the main URL. + +--- + +## Additional Considerations + +- **Session Management**: If you want multiple `arun()` calls to reuse a single session, pass `session_id=` in your `CrawlerRunConfig`. Hooks remain the same. +- **Performance**: Hooks can slow down crawling if they do heavy tasks. Keep them concise. +- **Error Handling**: If a hook fails, the overall crawl might fail. Catch exceptions or handle them gracefully. +- **Concurrency**: If you run `arun_many()`, each URL triggers these hooks in parallel. Ensure your hooks are thread/async-safe. + +--- + +## Conclusion + +Hooks provide **fine-grained** control over: + +- **Browser** creation (light tasks only) +- **Page** and **context** creation (auth, route blocking) +- **Navigation** phases +- **Final HTML** retrieval + +Follow the recommended usage: +- **Login** or advanced tasks in `on_page_context_created` +- **Custom headers** or logs in `before_goto` / `after_goto` +- **Scrolling** or final checks in `before_retrieve_html` / `before_return_html` diff --git a/docs/md_v2/advanced/identity-based-crawling.md b/docs/md_v2/advanced/identity-based-crawling.md new file mode 100644 index 00000000..702d9475 --- /dev/null +++ b/docs/md_v2/advanced/identity-based-crawling.md @@ -0,0 +1,180 @@ +# Preserve Your Identity with Crawl4AI + +Crawl4AI empowers you to navigate and interact with the web using your **authentic digital identity**, ensuring you’re recognized as a human and not mistaken for a bot. This tutorial covers: + +1. **Managed Browsers** – The recommended approach for persistent profiles and identity-based crawling. +2. **Magic Mode** – A simplified fallback solution for quick automation without persistent identity. + +--- + +## 1. Managed Browsers: Your Digital Identity Solution + +**Managed Browsers** let developers create and use **persistent browser profiles**. These profiles store local storage, cookies, and other session data, letting you browse as your **real self**—complete with logins, preferences, and cookies. + +### Key Benefits + +- **Authentic Browsing Experience**: Retain session data and browser fingerprints as though you’re a normal user. +- **Effortless Configuration**: Once you log in or solve CAPTCHAs in your chosen data directory, you can re-run crawls without repeating those steps. +- **Empowered Data Access**: If you can see the data in your own browser, you can automate its retrieval with your genuine identity. + +--- + +Below is a **partial update** to your **Managed Browsers** tutorial, specifically the section about **creating a user-data directory** using **Playwright’s Chromium** binary rather than a system-wide Chrome/Edge. We’ll show how to **locate** that binary and launch it with a `--user-data-dir` argument to set up your profile. You can then point `BrowserConfig.user_data_dir` to that folder for subsequent crawls. 
+ +--- + +### Creating a User Data Directory (Command-Line Approach via Playwright) + +If you installed Crawl4AI (which installs Playwright under the hood), you already have a Playwright-managed Chromium on your system. Follow these steps to launch that **Chromium** from your command line, specifying a **custom** data directory: + +1. **Find** the Playwright Chromium binary: + - On most systems, installed browsers go under a `~/.cache/ms-playwright/` folder or similar path. + - To see an overview of installed browsers, run: + ```bash + python -m playwright install --dry-run + ``` + or + ```bash + playwright install --dry-run + ``` + (depending on your environment). This shows where Playwright keeps Chromium. + + - For instance, you might see a path like: + ``` + ~/.cache/ms-playwright/chromium-1234/chrome-linux/chrome + ``` + on Linux, or a corresponding folder on macOS/Windows. + +2. **Launch** the Playwright Chromium binary with a **custom** user-data directory: + ```bash + # Linux example + ~/.cache/ms-playwright/chromium-1234/chrome-linux/chrome \ + --user-data-dir=/home//my_chrome_profile + ``` + ```bash + # macOS example (Playwright’s internal binary) + ~/Library/Caches/ms-playwright/chromium-1234/chrome-mac/Chromium.app/Contents/MacOS/Chromium \ + --user-data-dir=/Users//my_chrome_profile + ``` + ```powershell + # Windows example (PowerShell/cmd) + "C:\Users\\AppData\Local\ms-playwright\chromium-1234\chrome-win\chrome.exe" ^ + --user-data-dir="C:\Users\\my_chrome_profile" + ``` + + **Replace** the path with the actual subfolder indicated in your `ms-playwright` cache structure. + - This **opens** a fresh Chromium with your new or existing data folder. + - **Log into** any sites or configure your browser the way you want. + - **Close** when done—your profile data is saved in that folder. + +3. **Use** that folder in **`BrowserConfig.user_data_dir`**: + ```python + from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + + browser_config = BrowserConfig( + headless=True, + use_managed_browser=True, + user_data_dir="/home//my_chrome_profile", + browser_type="chromium" + ) + ``` + - Next time you run your code, it reuses that folder—**preserving** your session data, cookies, local storage, etc. + +--- + +## 3. Using Managed Browsers in Crawl4AI + +Once you have a data directory with your session data, pass it to **`BrowserConfig`**: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def main(): + # 1) Reference your persistent data directory + browser_config = BrowserConfig( + headless=True, # 'True' for automated runs + verbose=True, + use_managed_browser=True, # Enables persistent browser strategy + browser_type="chromium", + user_data_dir="/path/to/my-chrome-profile" + ) + + # 2) Standard crawl config + crawl_config = CrawlerRunConfig( + wait_for="css:.logged-in-content" + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com/private", config=crawl_config) + if result.success: + print("Successfully accessed private data with your identity!") + else: + print("Error:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### Workflow + +1. **Login** externally (via CLI or your normal Chrome with `--user-data-dir=...`). +2. **Close** that browser. +3. **Use** the same folder in `user_data_dir=` in Crawl4AI. +4. **Crawl** – The site sees your identity as if you’re the same user who just logged in. + +--- + +## 4. 
Magic Mode: Simplified Automation + +If you **don’t** need a persistent profile or identity-based approach, **Magic Mode** offers a quick way to simulate human-like browsing without storing long-term data. + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + config=CrawlerRunConfig( + magic=True, # Simplifies a lot of interaction + remove_overlay_elements=True, + page_timeout=60000 + ) + ) +``` + +**Magic Mode**: + +- Simulates a user-like experience +- Randomizes user agent & navigator +- Randomizes interactions & timings +- Masks automation signals +- Attempts pop-up handling + +**But** it’s no substitute for **true** user-based sessions if you want a fully legitimate identity-based solution. + +--- + +## 5. Comparing Managed Browsers vs. Magic Mode + +| Feature | **Managed Browsers** | **Magic Mode** | +|----------------------------|---------------------------------------------------------------|-----------------------------------------------------| +| **Session Persistence** | Full localStorage/cookies retained in user_data_dir | No persistent data (fresh each run) | +| **Genuine Identity** | Real user profile with full rights & preferences | Emulated user-like patterns, but no actual identity | +| **Complex Sites** | Best for login-gated sites or heavy config | Simple tasks, minimal login or config needed | +| **Setup** | External creation of user_data_dir, then use in Crawl4AI | Single-line approach (`magic=True`) | +| **Reliability** | Extremely consistent (same data across runs) | Good for smaller tasks, can be less stable | + +--- + +## 6. Summary + +- **Create** your user-data directory by launching Chrome/Chromium externally with `--user-data-dir=/some/path`. +- **Log in** or configure sites as needed, then close the browser. +- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`. +- Enjoy **persistent** sessions that reflect your real identity. +- If you only need quick, ephemeral automation, **Magic Mode** might suffice. + +**Recommended**: Always prefer a **Managed Browser** for robust, identity-based crawling and simpler interactions with complex sites. Use **Magic Mode** for quick tasks or prototypes where persistent data is unnecessary. + +With these approaches, you preserve your **authentic** browsing environment, ensuring the site sees you exactly as a normal user—no repeated logins or wasted time. \ No newline at end of file diff --git a/docs/md_v2/advanced/identity_based_crawling.md b/docs/md_v2/advanced/identity_based_crawling.md deleted file mode 100644 index c0ab7fd5..00000000 --- a/docs/md_v2/advanced/identity_based_crawling.md +++ /dev/null @@ -1,156 +0,0 @@ -### Preserve Your Identity with Crawl4AI - -Crawl4AI empowers you to navigate and interact with the web using your authentic digital identity, ensuring that you are recognized as a human and not mistaken for a bot. This document introduces Managed Browsers, the recommended approach for preserving your rights to access the web, and Magic Mode, a simplified solution for specific scenarios. - ---- - -### Managed Browsers: Your Digital Identity Solution - -**Managed Browsers** enable developers to create and use persistent browser profiles. These profiles store local storage, cookies, and other session-related data, allowing you to interact with websites as a recognized user. 
By leveraging your unique identity, Managed Browsers ensure that your experience reflects your rights as a human browsing the web. - -#### Why Use Managed Browsers? -1. **Authentic Browsing Experience**: Managed Browsers retain session data and browser fingerprints, mirroring genuine user behavior. -2. **Effortless Configuration**: Once you interact with the site using the browser (e.g., solving a CAPTCHA), the session data is saved and reused, providing seamless access. -3. **Empowered Data Access**: By using your identity, Managed Browsers empower users to access data they can view on their own screens without artificial restrictions. - -#### Steps to Use Managed Browsers - -1. **Setup the Browser Configuration**: - ```python - from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig - from crawl4ai.extraction_strategy import JsonCssExtractionStrategy - - browser_config = BrowserConfig( - headless=False, # Set to False for initial setup to view browser actions - verbose=True, - user_agent_mode="random", - use_managed_browser=True, # Enables persistent browser sessions - browser_type="chromium", - user_data_dir="/path/to/user_profile_data" # Path to save session data - ) - ``` - -2. **Perform an Initial Run**: - - Run the crawler with `headless=False`. - - Manually interact with the site (e.g., solve CAPTCHA or log in). - - The browser session saves cookies, local storage, and other required data. - -3. **Subsequent Runs**: - - Switch to `headless=True` for automation. - - The session data is reused, allowing seamless crawling. - -#### Example: Extracting Data Using Managed Browsers - -```python -import asyncio -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy - -async def main(): - # Define schema for structured data extraction - schema = { - "name": "Example Data", - "baseSelector": "div.example", - "fields": [ - {"name": "title", "selector": "h1", "type": "text"}, - {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"} - ] - } - - # Configure crawler - browser_config = BrowserConfig( - headless=True, # Automate subsequent runs - verbose=True, - use_managed_browser=True, - user_data_dir="/path/to/user_profile_data" - ) - - crawl_config = CrawlerRunConfig( - extraction_strategy=JsonCssExtractionStrategy(schema), - wait_for="css:div.example" # Wait for the targeted element to load - ) - - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun( - url="https://example.com", - config=crawl_config - ) - - if result.success: - print("Extracted Data:", result.extracted_content) - -if __name__ == "__main__": - asyncio.run(main()) -``` - -### Benefits of Managed Browsers Over Other Methods -Managed Browsers eliminate the need for manual detection workarounds by enabling developers to work directly with their identity and user profile data. This approach ensures maximum compatibility with websites and simplifies the crawling process while preserving your right to access data freely. - ---- - -### Magic Mode: Simplified Automation - -While Managed Browsers are the preferred approach, **Magic Mode** provides an alternative for scenarios where persistent user profiles are unnecessary or infeasible. Magic Mode automates user-like behavior and simplifies configuration. - -#### What Magic Mode Does: -- Simulates human browsing by randomizing interaction patterns and timing. -- Masks browser automation signals. 
-- Handles cookie popups and modals. -- Modifies navigator properties for enhanced compatibility. - -#### Using Magic Mode - -```python -async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - url="https://example.com", - magic=True # Enables all automation features - ) -``` - -Magic Mode is particularly useful for: -- Quick prototyping when a Managed Browser setup is not available. -- Basic sites requiring minimal interaction or configuration. - -#### Example: Combining Magic Mode with Additional Options - -```python -async def crawl_with_magic_mode(url: str): - async with AsyncWebCrawler(headless=True) as crawler: - result = await crawler.arun( - url=url, - magic=True, - remove_overlay_elements=True, # Remove popups/modals - page_timeout=60000 # Increased timeout for complex pages - ) - - return result.markdown if result.success else None -``` - -### Magic Mode vs. Managed Browsers -While Magic Mode simplifies many tasks, it cannot match the reliability and authenticity of Managed Browsers. By using your identity and persistent profiles, Managed Browsers render Magic Mode largely unnecessary. However, Magic Mode remains a viable fallback for specific situations where user identity is not a factor. - ---- - -### Key Comparison: Managed Browsers vs. Magic Mode - -| Feature | **Managed Browsers** | **Magic Mode** | -|-------------------------|------------------------------------------|-------------------------------------| -| **Session Persistence** | Retains cookies and local storage. | No session retention. | -| **Human Interaction** | Uses real user profiles and data. | Simulates human-like patterns. | -| **Complex Sites** | Best suited for heavily configured sites.| Works well with simpler challenges.| -| **Setup Complexity** | Requires initial manual interaction. | Fully automated, one-line setup. | - -#### Recommendation: -- Use **Managed Browsers** for reliable, session-based crawling and data extraction. -- Use **Magic Mode** for quick prototyping or when persistent profiles are not required. - ---- - -### Conclusion - -- **Use Managed Browsers** to preserve your digital identity and ensure reliable, identity-based crawling with persistent sessions. This approach works seamlessly for even the most complex websites. -- **Leverage Magic Mode** for quick automation or in scenarios where persistent user profiles are not needed. - -By combining these approaches, Crawl4AI provides unparalleled flexibility and capability for your crawling needs. - diff --git a/docs/md_v2/advanced/lazy-loading.md b/docs/md_v2/advanced/lazy-loading.md new file mode 100644 index 00000000..04688264 --- /dev/null +++ b/docs/md_v2/advanced/lazy-loading.md @@ -0,0 +1,104 @@ +## Handling Lazy-Loaded Images + +Many websites now load images **lazily** as you scroll. If you need to ensure they appear in your final crawl (and in `result.media`), consider: + +1. **`wait_for_images=True`** – Wait for images to fully load. +2. **`scan_full_page`** – Force the crawler to scroll the entire page, triggering lazy loads. +3. **`scroll_delay`** – Add small delays between scroll steps. + +**Note**: If the site requires multiple “Load More” triggers or complex interactions, see the [Page Interaction docs](../core/page-interaction.md). 
+ +### Example: Ensuring Lazy Images Appear + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig +from crawl4ai.async_configs import CacheMode + +async def main(): + config = CrawlerRunConfig( + # Force the crawler to wait until images are fully loaded + wait_for_images=True, + + # Option 1: If you want to automatically scroll the page to load images + scan_full_page=True, # Tells the crawler to try scrolling the entire page + scroll_delay=0.5, # Delay (seconds) between scroll steps + + # Option 2: If the site uses a 'Load More' or JS triggers for images, + # you can also specify js_code or wait_for logic here. + + cache_mode=CacheMode.BYPASS, + verbose=True + ) + + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + result = await crawler.arun("https://www.example.com/gallery", config=config) + + if result.success: + images = result.media.get("images", []) + print("Images found:", len(images)) + for i, img in enumerate(images[:5]): + print(f"[Image {i}] URL: {img['src']}, Score: {img.get('score','N/A')}") + else: + print("Error:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Explanation**: + +- **`wait_for_images=True`** + The crawler tries to ensure images have finished loading before finalizing the HTML. +- **`scan_full_page=True`** + Tells the crawler to attempt scrolling from top to bottom. Each scroll step helps trigger lazy loading. +- **`scroll_delay=0.5`** + Pause half a second between each scroll step. Helps the site load images before continuing. + +**When to Use**: + +- **Lazy-Loading**: If images appear only when the user scrolls into view, `scan_full_page` + `scroll_delay` helps the crawler see them. +- **Heavier Pages**: If a page is extremely long, be mindful that scanning the entire page can be slow. Adjust `scroll_delay` or the max scroll steps as needed. + +--- + +## Combining with Other Link & Media Filters + +You can still combine **lazy-load** logic with the usual **exclude_external_images**, **exclude_domains**, or link filtration: + +```python +config = CrawlerRunConfig( + wait_for_images=True, + scan_full_page=True, + scroll_delay=0.5, + + # Filter out external images if you only want local ones + exclude_external_images=True, + + # Exclude certain domains for links + exclude_domains=["spammycdn.com"], +) +``` + +This approach ensures you see **all** images from the main domain while ignoring external ones, and the crawler physically scrolls the entire page so that lazy-loading triggers. + +--- + +## Tips & Troubleshooting + +1. **Long Pages** + - Setting `scan_full_page=True` on extremely long or infinite-scroll pages can be resource-intensive. + - Consider using [hooks](../core/page-interaction.md) or specialized logic to load specific sections or “Load More” triggers repeatedly. + +2. **Mixed Image Behavior** + - Some sites load images in batches as you scroll. If you’re missing images, increase your `scroll_delay` or call multiple partial scrolls in a loop with JS code or hooks. + +3. **Combining with Dynamic Wait** + - If the site has a placeholder that only changes to a real image after a certain event, you might do `wait_for="css:img.loaded"` or a custom JS `wait_for`. + +4. **Caching** + - If `cache_mode` is enabled, repeated crawls might skip some network fetches. If you suspect caching is missing new images, set `cache_mode=CacheMode.BYPASS` for fresh fetches. 
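+
+For tips 2 and 3 above, here is a minimal sketch of driving the scrolling yourself with repeated partial scrolls. It reuses a single tab via `session_id` and `js_only` (described in the Session Management docs); the URL and step count are placeholders you would adapt to your site:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+async def scroll_in_steps(url: str, steps: int = 5):
+    async with AsyncWebCrawler() as crawler:
+        session_id = "lazy_image_session"  # keep the same tab across calls
+
+        # Initial navigation, waiting for the first batch of images
+        result = await crawler.arun(
+            url=url,
+            config=CrawlerRunConfig(
+                session_id=session_id,
+                wait_for_images=True,
+                cache_mode=CacheMode.BYPASS,
+            ),
+        )
+
+        # Scroll one viewport at a time, pausing so each batch can load
+        for _ in range(steps):
+            result = await crawler.arun(
+                url=url,
+                config=CrawlerRunConfig(
+                    session_id=session_id,
+                    js_only=True,  # run JS in the same tab, no re-navigation
+                    js_code="window.scrollBy(0, window.innerHeight);",
+                    delay_before_return_html=0.5,
+                    cache_mode=CacheMode.BYPASS,
+                ),
+            )
+
+        return result
+
+# asyncio.run(scroll_in_steps("https://www.example.com/gallery"))
+```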
+ +--- + +With **lazy-loading** support, **wait_for_images**, and **scan_full_page** settings, you can capture the entire gallery or feed of images you expect—even if the site only loads them as the user scrolls. Combine these with the standard media filtering and domain exclusion for a complete link & media handling strategy. \ No newline at end of file diff --git a/docs/md_v2/advanced/magic-mode.md b/docs/md_v2/advanced/magic-mode.md deleted file mode 100644 index 16c7229e..00000000 --- a/docs/md_v2/advanced/magic-mode.md +++ /dev/null @@ -1,52 +0,0 @@ -# Magic Mode & Anti-Bot Protection - -Crawl4AI provides powerful anti-detection capabilities, with Magic Mode being the simplest and most comprehensive solution. - -## Magic Mode - -The easiest way to bypass anti-bot protections: - -```python -async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - url="https://example.com", - magic=True # Enables all anti-detection features - ) -``` - -Magic Mode automatically: -- Masks browser automation signals -- Simulates human-like behavior -- Overrides navigator properties -- Handles cookie consent popups -- Manages browser fingerprinting -- Randomizes timing patterns - -## Manual Anti-Bot Options - -While Magic Mode is recommended, you can also configure individual anti-detection features: - -```python -result = await crawler.arun( - url="https://example.com", - simulate_user=True, # Simulate human behavior - override_navigator=True # Mask automation signals -) -``` - -Note: When `magic=True` is used, you don't need to set these individual options. - -## Example: Handling Protected Sites - -```python -async def crawl_protected_site(url: str): - async with AsyncWebCrawler(headless=True) as crawler: - result = await crawler.arun( - url=url, - magic=True, - remove_overlay_elements=True, # Remove popups/modals - page_timeout=60000 # Increased timeout for protection checks - ) - - return result.markdown if result.success else None -``` diff --git a/docs/md_v2/advanced/managed_browser.md b/docs/md_v2/advanced/managed_browser.md deleted file mode 100644 index bbe07f2f..00000000 --- a/docs/md_v2/advanced/managed_browser.md +++ /dev/null @@ -1,188 +0,0 @@ -# Creating Browser Instances, Contexts, and Pages - -## 1 Introduction - -### Overview of Browser Management in Crawl4AI -Crawl4AI's browser management system is designed to provide developers with advanced tools for handling complex web crawling tasks. By managing browser instances, contexts, and pages, Crawl4AI ensures optimal performance, anti-bot measures, and session persistence for high-volume, dynamic web crawling. - -### Key Objectives -- **Anti-Bot Handling**: - - Implements stealth techniques to evade detection mechanisms used by modern websites. - - Simulates human-like behavior, such as mouse movements, scrolling, and key presses. - - Supports integration with third-party services to bypass CAPTCHA challenges. -- **Persistent Sessions**: - - Retains session data (cookies, local storage) for workflows requiring user authentication. - - Allows seamless continuation of tasks across multiple runs without re-authentication. -- **Scalable Crawling**: - - Optimized resource utilization for handling thousands of URLs concurrently. - - Flexible configuration options to tailor crawling behavior to specific requirements. - ---- - -## 2 Browser Creation Methods - -### Standard Browser Creation -Standard browser creation initializes a browser instance with default or minimal configurations. 
It is suitable for tasks that do not require session persistence or heavy customization. - -#### Features and Limitations -- **Features**: - - Quick and straightforward setup for small-scale tasks. - - Supports headless and headful modes. -- **Limitations**: - - Lacks advanced customization options like session reuse. - - May struggle with sites employing strict anti-bot measures. - -#### Example Usage -```python -from crawl4ai import AsyncWebCrawler, BrowserConfig - -browser_config = BrowserConfig(browser_type="chromium", headless=True) -async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun("https://crawl4ai.com") - print(result.markdown) -``` - -### Persistent Contexts -Persistent contexts create browser sessions with stored data, enabling workflows that require maintaining login states or other session-specific information. - -#### Benefits of Using `user_data_dir` -- **Session Persistence**: - - Stores cookies, local storage, and cache between crawling sessions. - - Reduces overhead for repetitive logins or multi-step workflows. -- **Enhanced Performance**: - - Leverages pre-loaded resources for faster page loading. -- **Flexibility**: - - Adapts to complex workflows requiring user-specific configurations. - -#### Example: Setting Up Persistent Contexts -```python -config = BrowserConfig(user_data_dir="/path/to/user/data") -async with AsyncWebCrawler(config=config) as crawler: - result = await crawler.arun("https://crawl4ai.com") - print(result.markdown) -``` - -### Managed Browser -The `ManagedBrowser` class offers a high-level abstraction for managing browser instances, emphasizing resource management, debugging capabilities, and anti-bot measures. - -#### How It Works -- **Browser Process Management**: - - Automates initialization and cleanup of browser processes. - - Optimizes resource usage by pooling and reusing browser instances. -- **Debugging Support**: - - Integrates with debugging tools like Chrome Developer Tools for real-time inspection. -- **Anti-Bot Measures**: - - Implements stealth plugins to mimic real user behavior and bypass bot detection. - -#### Features -- **Customizable Configurations**: - - Supports advanced options such as viewport resizing, proxy settings, and header manipulation. -- **Debugging and Logging**: - - Logs detailed browser interactions for debugging and performance analysis. -- **Scalability**: - - Handles multiple browser instances concurrently, scaling dynamically based on workload. - -#### Example: Using `ManagedBrowser` -```python -from crawl4ai import AsyncWebCrawler, BrowserConfig - -config = BrowserConfig(headless=False, debug_port=9222) -async with AsyncWebCrawler(config=config) as crawler: - result = await crawler.arun("https://crawl4ai.com") - print(result.markdown) -``` - ---- - -## 3 Context and Page Management - -### Creating and Configuring Browser Contexts -Browser contexts act as isolated environments within a single browser instance, enabling independent browsing sessions with their own cookies, cache, and storage. - -#### Customizations -- **Headers and Cookies**: - - Define custom headers to mimic specific devices or browsers. - - Set cookies for authenticated sessions. -- **Session Reuse**: - - Retain and reuse session data across multiple requests. - - Example: Preserve login states for authenticated crawls. 
- -#### Example: Context Initialization -```python -from crawl4ai import CrawlerRunConfig - -config = CrawlerRunConfig(headers={"User-Agent": "Crawl4AI/1.0"}) -async with AsyncWebCrawler() as crawler: - result = await crawler.arun("https://crawl4ai.com", config=config) - print(result.markdown) -``` - -### Creating Pages -Pages represent individual tabs or views within a browser context. They are responsible for rendering content, executing JavaScript, and handling user interactions. - -#### Key Features -- **IFrame Handling**: - - Extract content from embedded iframes. - - Navigate and interact with nested content. -- **Viewport Customization**: - - Adjust viewport size to match target device dimensions. -- **Lazy Loading**: - - Ensure dynamic elements are fully loaded before extraction. - -#### Example: Page Initialization -```python -config = CrawlerRunConfig(viewport_width=1920, viewport_height=1080) -async with AsyncWebCrawler() as crawler: - result = await crawler.arun("https://crawl4ai.com", config=config) - print(result.markdown) -``` - ---- - -## 4 Advanced Features and Best Practices - -### Debugging and Logging -Remote debugging provides a powerful way to troubleshoot complex crawling workflows. - -#### Example: Enabling Remote Debugging -```python -config = BrowserConfig(debug_port=9222) -async with AsyncWebCrawler(config=config) as crawler: - result = await crawler.arun("https://crawl4ai.com") -``` - -### Anti-Bot Techniques -- **Human Behavior Simulation**: - - Mimic real user actions, such as scrolling, clicking, and typing. - - Example: Use JavaScript to simulate interactions. -- **Captcha Handling**: - - Integrate with third-party services like 2Captcha or AntiCaptcha for automated solving. - -#### Example: Simulating User Actions -```python -js_code = """ -(async () => { - document.querySelector('input[name="search"]').value = 'test'; - document.querySelector('button[type="submit"]').click(); -})(); -""" -config = CrawlerRunConfig(js_code=[js_code]) -async with AsyncWebCrawler() as crawler: - result = await crawler.arun("https://crawl4ai.com", config=config) -``` - -### Optimizations for Performance and Scalability -- **Persistent Contexts**: - - Reuse browser contexts to minimize resource consumption. -- **Concurrent Crawls**: - - Use `arun_many` with a controlled semaphore count for efficient batch processing. - -#### Example: Scaling Crawls -```python -urls = ["https://example1.com", "https://example2.com"] -config = CrawlerRunConfig(semaphore_count=10) -async with AsyncWebCrawler() as crawler: - results = await crawler.arun_many(urls, config=config) - for result in results: - print(result.url, result.markdown) -``` diff --git a/docs/md_v2/advanced/multi-url-crawling.md b/docs/md_v2/advanced/multi-url-crawling.md new file mode 100644 index 00000000..a1d2b423 --- /dev/null +++ b/docs/md_v2/advanced/multi-url-crawling.md @@ -0,0 +1,264 @@ +# Optimized Multi-URL Crawling + +> **Note**: We’re developing a new **executor module** that uses a sophisticated algorithm to dynamically manage multi-URL crawling, optimizing for speed and memory usage. The approaches in this document remain fully valid, but keep an eye on **Crawl4AI**’s upcoming releases for this powerful feature! Follow [@unclecode](https://twitter.com/unclecode) on X and check the changelogs to stay updated. + + +Crawl4AI’s **AsyncWebCrawler** can handle multiple URLs in a single run, which can greatly reduce overhead and speed up crawling. This guide shows how to: + +1. 
**Sequentially** crawl a list of URLs using the **same** session, avoiding repeated browser creation.
+2. **Parallel**-crawl subsets of URLs in batches, again reusing the same browser.
+
+When the entire process finishes, you close the browser once—**minimizing** memory and resource usage.
+
+---
+
+## 1. Why Avoid Simple Loops per URL?
+
+If you naively do:
+
+```python
+for url in urls:
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url)
+```
+
+You end up:
+
+1. Spinning up a **new** browser for each URL
+2. Closing it immediately after the single crawl
+3. Potentially using a lot of CPU/memory for short-lived browsers
+4. Missing out on session reuse when you need login or other persistent state
+
+**Better** approaches ensure you **create** the browser once, then crawl multiple URLs with minimal overhead.
+
+---
+
+## 2. Sequential Crawling with Session Reuse
+
+### 2.1 Overview
+
+1. **One** `AsyncWebCrawler` instance for **all** URLs.
+2. **One** session (via `session_id`) so we can preserve local storage or cookies across URLs if needed.
+3. The crawler is only closed at the **end**.
+
+**This** is the simplest pattern if your workload is moderate (dozens to a few hundred URLs).
+
+### 2.2 Example Code
+
+```python
+import asyncio
+from typing import List
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+async def crawl_sequential(urls: List[str]):
+    print("\n=== Sequential Crawling with Session Reuse ===")
+
+    browser_config = BrowserConfig(
+        headless=True,
+        # For better performance in Docker or low-memory environments:
+        extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
+    )
+
+    crawl_config = CrawlerRunConfig(
+        markdown_generator=DefaultMarkdownGenerator()
+    )
+
+    # Create the crawler (opens the browser)
+    crawler = AsyncWebCrawler(config=browser_config)
+    await crawler.start()
+
+    try:
+        session_id = "session1"  # Reuse the same session across all URLs
+        for url in urls:
+            result = await crawler.arun(
+                url=url,
+                config=crawl_config,
+                session_id=session_id
+            )
+            if result.success:
+                print(f"Successfully crawled: {url}")
+                # E.g. check markdown length
+                print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
+            else:
+                print(f"Failed: {url} - Error: {result.error_message}")
+    finally:
+        # After all URLs are done, close the crawler (and the browser)
+        await crawler.close()
+
+async def main():
+    urls = [
+        "https://example.com/page1",
+        "https://example.com/page2",
+        "https://example.com/page3"
+    ]
+    await crawl_sequential(urls)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Why It’s Good**:
+
+- **One** browser launch.
+- Minimal memory usage.
+- If the site requires login, you can log in once in `session_id` context and preserve auth across all URLs.
+
+---
+
+## 3. Parallel Crawling with Browser Reuse
+
+### 3.1 Overview
+
+To speed up crawling further, you can crawl multiple URLs in **parallel** (batches or a concurrency limit). The crawler still uses **one** browser, but spawns different sessions (or the same, depending on your logic) for each task.
+
+### 3.2 Example Code
+
+For this example, make sure to install the [psutil](https://pypi.org/project/psutil/) package.
+
+```bash
+pip install psutil
+```
+
+Then you can run the following code:
+
+```python
+import os
+import sys
+import psutil
+import asyncio
+
+__location__ = os.path.dirname(os.path.abspath(__file__))
+__output__ = os.path.join(__location__, "output")
+
+# Append parent directory to system path
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+
+from typing import List
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+
+async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
+    print("\n=== Parallel Crawling with Browser Reuse + Memory Check ===")
+
+    # We'll keep track of peak memory usage across all tasks
+    peak_memory = 0
+    process = psutil.Process(os.getpid())
+
+    def log_memory(prefix: str = ""):
+        nonlocal peak_memory
+        current_mem = process.memory_info().rss  # in bytes
+        if current_mem > peak_memory:
+            peak_memory = current_mem
+        print(f"{prefix} Current Memory: {current_mem // (1024 * 1024)} MB, Peak: {peak_memory // (1024 * 1024)} MB")
+
+    # Minimal browser config
+    browser_config = BrowserConfig(
+        headless=True,
+        verbose=False,
+        extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
+    )
+    crawl_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+    # Create the crawler instance
+    crawler = AsyncWebCrawler(config=browser_config)
+    await crawler.start()
+
+    try:
+        # We'll chunk the URLs in batches of 'max_concurrent'
+        success_count = 0
+        fail_count = 0
+        for i in range(0, len(urls), max_concurrent):
+            batch = urls[i : i + max_concurrent]
+            tasks = []
+
+            for j, url in enumerate(batch):
+                # Unique session_id per concurrent sub-task
+                session_id = f"parallel_session_{i + j}"
+                task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
+                tasks.append(task)
+
+            # Check memory usage prior to launching tasks
+            log_memory(prefix=f"Before batch {i//max_concurrent + 1}: ")
+
+            # Gather results
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+            # Check memory usage after tasks complete
+            log_memory(prefix=f"After batch {i//max_concurrent + 1}: ")
+
+            # Evaluate results
+            for url, result in zip(batch, results):
+                if isinstance(result, Exception):
+                    print(f"Error crawling {url}: {result}")
+                    fail_count += 1
+                elif result.success:
+                    success_count += 1
+                else:
+                    fail_count += 1
+
+        print("\nSummary:")
+        print(f"  - Successfully crawled: {success_count}")
+        print(f"  - Failed: {fail_count}")
+
+    finally:
+        print("\nClosing crawler...")
+        await crawler.close()
+        # Final memory log
+        log_memory(prefix="Final: ")
+        print(f"\nPeak memory usage (MB): {peak_memory // (1024 * 1024)}")
+
+async def main():
+    urls = [
+        "https://example.com/page1",
+        "https://example.com/page2",
+        "https://example.com/page3",
+        "https://example.com/page4"
+    ]
+    await crawl_parallel(urls, max_concurrent=2)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+
+```
+
+**Notes**:
+
+- We **reuse** the same `AsyncWebCrawler` instance for all parallel tasks, launching **one** browser.
+- Each parallel sub-task might get its own `session_id` so they don’t share cookies/localStorage (unless that’s desired).
+- We limit concurrency to `max_concurrent=2` or 3 to avoid saturating CPU/memory.
+
+---
+
+## 4. Performance Tips
+
+1. **Extra Browser Args**
+   - `--disable-gpu`, `--no-sandbox` can help in Docker or restricted environments.
+   - `--disable-dev-shm-usage` avoids using `/dev/shm` which can be small on some systems.
+
+2. 
**Session Reuse** + - If your site requires a login or you want to maintain local data across URLs, share the **same** `session_id`. + - If you want isolation (each URL fresh), create unique sessions. + +3. **Batching** + - If you have **many** URLs (like thousands), you can do parallel crawling in chunks (like `max_concurrent=5`). + - Use `arun_many()` for a built-in approach if you prefer, but the example above is often more flexible. + +4. **Cache** + - If your pages share many resources or you’re re-crawling the same domain repeatedly, consider setting `cache_mode=CacheMode.ENABLED` in `CrawlerRunConfig`. + - If you need fresh data each time, keep `cache_mode=CacheMode.BYPASS`. + +5. **Hooks** + - You can set up global hooks for each crawler (like to block images) or per-run if you want. + - Keep them consistent if you’re reusing sessions. + +--- + +## 5. Summary + +- **One** `AsyncWebCrawler` + multiple calls to `.arun()` is far more efficient than launching a new crawler per URL. +- **Sequential** approach with a shared session is simple and memory-friendly for moderate sets of URLs. +- **Parallel** approach can speed up large crawls by concurrency, but keep concurrency balanced to avoid overhead. +- Close the crawler once at the end, ensuring the browser is only opened/closed once. + +For even more advanced memory optimizations or dynamic concurrency patterns, see future sections on hooking or distributed crawling. The patterns above suffice for the majority of multi-URL scenarios—**giving you speed, simplicity, and minimal resource usage**. Enjoy your optimized crawling! \ No newline at end of file diff --git a/docs/md_v2/advanced/proxy-security.md b/docs/md_v2/advanced/proxy-security.md index 8989777b..b98c17e5 100644 --- a/docs/md_v2/advanced/proxy-security.md +++ b/docs/md_v2/advanced/proxy-security.md @@ -1,6 +1,4 @@ -# Proxy & Security - -Configure proxy settings and enhance security features in Crawl4AI for reliable data extraction. 
+# Proxy ## Basic Proxy Setup @@ -58,38 +56,3 @@ async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun(url=url, config=browser_config) ``` -## Custom Headers - -Add security-related headers via `BrowserConfig`: - -```python -from crawl4ai.async_configs import BrowserConfig - -headers = { - "X-Forwarded-For": "203.0.113.195", - "Accept-Language": "en-US,en;q=0.9", - "Cache-Control": "no-cache", - "Pragma": "no-cache" -} - -browser_config = BrowserConfig(headers=headers) -async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(url="https://example.com") -``` - -## Combining with Magic Mode - -For maximum protection, combine proxy with Magic Mode via `CrawlerRunConfig` and `BrowserConfig`: - -```python -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig - -browser_config = BrowserConfig( - proxy="http://proxy.example.com:8080", - headers={"Accept-Language": "en-US"} -) -crawler_config = CrawlerRunConfig(magic=True) # Enable all anti-detection features - -async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(url="https://example.com", config=crawler_config) -``` diff --git a/docs/md_v2/advanced/session-management-advanced.md b/docs/md_v2/advanced/session-management-advanced.md deleted file mode 100644 index ba1ae0a0..00000000 --- a/docs/md_v2/advanced/session-management-advanced.md +++ /dev/null @@ -1,179 +0,0 @@ -### Session-Based Crawling for Dynamic Content - -In modern web applications, content is often loaded dynamically without changing the URL. Examples include "Load More" buttons, infinite scrolling, or paginated content that updates via JavaScript. Crawl4AI provides session-based crawling capabilities to handle such scenarios effectively. - -This guide explores advanced techniques for crawling dynamic content using Crawl4AI's session management features. - ---- - -## Understanding Session-Based Crawling - -Session-based crawling allows you to reuse a persistent browser session across multiple actions. This means the same browser tab (or page object) is used throughout, enabling: - -1. **Efficient handling of dynamic content** without reloading the page. -2. **JavaScript actions before and after crawling** (e.g., clicking buttons or scrolling). -3. **State maintenance** for authenticated sessions or multi-step workflows. -4. **Faster sequential crawling**, as it avoids reopening tabs or reallocating resources. - -**Note:** Session-based crawling is ideal for sequential operations, not parallel tasks. - ---- - -## Basic Concepts - -Before diving into examples, here are some key concepts: - -- **Session ID**: A unique identifier for a browsing session. Use the same `session_id` across multiple requests to maintain state. -- **BrowserConfig & CrawlerRunConfig**: These configuration objects control browser settings and crawling behavior. -- **JavaScript Execution**: Use `js_code` to perform actions like clicking buttons. -- **CSS Selectors**: Target specific elements for interaction or data extraction. -- **Extraction Strategy**: Define rules to extract structured data. -- **Wait Conditions**: Specify conditions to wait for before proceeding. 
- ---- - -## Example 1: Basic Session-Based Crawling - -A simple example using session-based crawling: - -```python -import asyncio -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig -from crawl4ai.cache_context import CacheMode - -async def basic_session_crawl(): - async with AsyncWebCrawler() as crawler: - session_id = "dynamic_content_session" - url = "https://example.com/dynamic-content" - - for page in range(3): - config = CrawlerRunConfig( - url=url, - session_id=session_id, - js_code="document.querySelector('.load-more-button').click();" if page > 0 else None, - css_selector=".content-item", - cache_mode=CacheMode.BYPASS - ) - - result = await crawler.arun(config=config) - print(f"Page {page + 1}: Found {result.extracted_content.count('.content-item')} items") - - await crawler.crawler_strategy.kill_session(session_id) - -asyncio.run(basic_session_crawl()) -``` - -This example shows: -1. Reusing the same `session_id` across multiple requests. -2. Executing JavaScript to load more content dynamically. -3. Properly closing the session to free resources. - ---- - -## Advanced Technique 1: Custom Execution Hooks - -Use custom hooks to handle complex scenarios, such as waiting for content to load dynamically: - -```python -async def advanced_session_crawl_with_hooks(): - first_commit = "" - - async def on_execution_started(page): - nonlocal first_commit - try: - while True: - await page.wait_for_selector("li.commit-item h4") - commit = await page.query_selector("li.commit-item h4") - commit = await commit.evaluate("(element) => element.textContent").strip() - if commit and commit != first_commit: - first_commit = commit - break - await asyncio.sleep(0.5) - except Exception as e: - print(f"Warning: New content didn't appear: {e}") - - async with AsyncWebCrawler() as crawler: - session_id = "commit_session" - url = "https://github.com/example/repo/commits/main" - crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) - - js_next_page = """document.querySelector('a.pagination-next').click();""" - - for page in range(3): - config = CrawlerRunConfig( - url=url, - session_id=session_id, - js_code=js_next_page if page > 0 else None, - css_selector="li.commit-item", - js_only=page > 0, - cache_mode=CacheMode.BYPASS - ) - - result = await crawler.arun(config=config) - print(f"Page {page + 1}: Found {len(result.extracted_content)} commits") - - await crawler.crawler_strategy.kill_session(session_id) - -asyncio.run(advanced_session_crawl_with_hooks()) -``` - -This technique ensures new content loads before the next action. 
- ---- - -## Advanced Technique 2: Integrated JavaScript Execution and Waiting - -Combine JavaScript execution and waiting logic for concise handling of dynamic content: - -```python -async def integrated_js_and_wait_crawl(): - async with AsyncWebCrawler() as crawler: - session_id = "integrated_session" - url = "https://github.com/example/repo/commits/main" - - js_next_page_and_wait = """ - (async () => { - const getCurrentCommit = () => document.querySelector('li.commit-item h4').textContent.trim(); - const initialCommit = getCurrentCommit(); - document.querySelector('a.pagination-next').click(); - while (getCurrentCommit() === initialCommit) { - await new Promise(resolve => setTimeout(resolve, 100)); - } - })(); - """ - - for page in range(3): - config = CrawlerRunConfig( - url=url, - session_id=session_id, - js_code=js_next_page_and_wait if page > 0 else None, - css_selector="li.commit-item", - js_only=page > 0, - cache_mode=CacheMode.BYPASS - ) - - result = await crawler.arun(config=config) - print(f"Page {page + 1}: Found {len(result.extracted_content)} commits") - - await crawler.crawler_strategy.kill_session(session_id) - -asyncio.run(integrated_js_and_wait_crawl()) -``` - ---- - -## Best Practices for Session-Based Crawling - -1. **Unique Session IDs**: Assign descriptive and unique `session_id` values. -2. **Close Sessions**: Always clean up sessions with `kill_session` after use. -3. **Error Handling**: Anticipate and handle errors gracefully. -4. **Respect Websites**: Follow terms of service and robots.txt. -5. **Delays**: Add delays to avoid overwhelming servers. -6. **Optimize JavaScript**: Keep scripts concise for better performance. -7. **Monitor Resources**: Track memory and CPU usage for long sessions. - ---- - -## Conclusion - -Session-based crawling in Crawl4AI is a robust solution for handling dynamic content and multi-step workflows. By combining session management, JavaScript execution, and structured extraction strategies, you can effectively navigate and extract data from modern web applications. Always adhere to ethical web scraping practices and respect website policies. \ No newline at end of file diff --git a/docs/md_v2/advanced/session-management.md b/docs/md_v2/advanced/session-management.md index e9348223..180dfc85 100644 --- a/docs/md_v2/advanced/session-management.md +++ b/docs/md_v2/advanced/session-management.md @@ -1,4 +1,4 @@ -### Session Management +# Session Management Session management in Crawl4AI is a powerful feature that allows you to maintain state across multiple requests, making it particularly suitable for handling complex multi-step crawling tasks. 
It enables you to reuse the same browser tab (or page object) across sequential actions and crawls, which is beneficial for:
@@ -20,8 +20,12 @@ async with AsyncWebCrawler() as crawler:
     session_id = "my_session"
 
     # Define configurations
-    config1 = CrawlerRunConfig(url="https://example.com/page1", session_id=session_id)
-    config2 = CrawlerRunConfig(url="https://example.com/page2", session_id=session_id)
+    config1 = CrawlerRunConfig(
+        url="https://example.com/page1", session_id=session_id
+    )
+    config2 = CrawlerRunConfig(
+        url="https://example.com/page2", session_id=session_id
+    )
 
     # First request
     result1 = await crawler.arun(config=config1)
@@ -54,7 +58,9 @@ async def crawl_dynamic_content():
     schema = {
         "name": "Commit Extractor",
         "baseSelector": "li.Box-sc-g0xbh4-0",
-        "fields": [{"name": "title", "selector": "h4.markdown-title", "type": "text"}],
+        "fields": [{
+            "name": "title", "selector": "h4.markdown-title", "type": "text"
+        }],
     }
     extraction_strategy = JsonCssExtractionStrategy(schema)
 
@@ -87,51 +93,146 @@ async def crawl_dynamic_content():
 
 ---
 
-#### Session Best Practices
+## Example 1: Basic Session-Based Crawling
 
-1. **Descriptive Session IDs**:
-   Use meaningful names for session IDs to organize workflows:
-   ```python
-   session_id = "login_flow_session"
-   session_id = "product_catalog_session"
-   ```
+A simple example using session-based crawling:
 
-2. **Resource Management**:
-   Always ensure sessions are cleaned up to free resources:
-   ```python
-   try:
-       # Your crawling code here
-       pass
-   finally:
-       await crawler.crawler_strategy.kill_session(session_id)
-   ```
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.cache_context import CacheMode
 
-3. **State Maintenance**:
-   Reuse the session for subsequent actions within the same workflow:
-   ```python
-   # Step 1: Login
-   login_config = CrawlerRunConfig(
-       url="https://example.com/login",
-       session_id=session_id,
-       js_code="document.querySelector('form').submit();"
-   )
-   await crawler.arun(config=login_config)
+async def basic_session_crawl():
+    async with AsyncWebCrawler() as crawler:
+        session_id = "dynamic_content_session"
+        url = "https://example.com/dynamic-content"
 
-   # Step 2: Verify login success
-   dashboard_config = CrawlerRunConfig(
-       url="https://example.com/dashboard",
-       session_id=session_id,
-       wait_for="css:.user-profile"  # Wait for authenticated content
-   )
-   result = await crawler.arun(config=dashboard_config)
-   ```
+        for page in range(3):
+            config = CrawlerRunConfig(
+                url=url,
+                session_id=session_id,
+                js_code="document.querySelector('.load-more-button').click();" if page > 0 else None,
+                css_selector=".content-item",
+                cache_mode=CacheMode.BYPASS
+            )
+
+            result = await crawler.arun(config=config)
+            print(f"Page {page + 1}: Found {result.extracted_content.count('.content-item')} items")
+
+        await crawler.crawler_strategy.kill_session(session_id)
+
+asyncio.run(basic_session_crawl())
+```
+
+This example shows:
+1. Reusing the same `session_id` across multiple requests.
+2. Executing JavaScript to load more content dynamically.
+3. Properly closing the session to free resources.
+
+---
+
+## Advanced Technique 1: Custom Execution Hooks
+
+> Warning: The next few examples build on one another 😅, so go through them in order and make sure you’re comfortable with each part before moving on.
+
+Use custom hooks to handle complex scenarios, such as waiting for content to load dynamically:
+
+```python
+async def advanced_session_crawl_with_hooks():
+    first_commit = ""
+
+    async def on_execution_started(page):
+        nonlocal first_commit
+        try:
+            while True:
+                await page.wait_for_selector("li.commit-item h4")
+                commit = await page.query_selector("li.commit-item h4")
+                commit = (await commit.evaluate("(element) => element.textContent")).strip()
+                if commit and commit != first_commit:
+                    first_commit = commit
+                    break
+                await asyncio.sleep(0.5)
+        except Exception as e:
+            print(f"Warning: New content didn't appear: {e}")
+
+    async with AsyncWebCrawler() as crawler:
+        session_id = "commit_session"
+        url = "https://github.com/example/repo/commits/main"
+        crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
+
+        js_next_page = """document.querySelector('a.pagination-next').click();"""
+
+        for page in range(3):
+            config = CrawlerRunConfig(
+                url=url,
+                session_id=session_id,
+                js_code=js_next_page if page > 0 else None,
+                css_selector="li.commit-item",
+                js_only=page > 0,
+                cache_mode=CacheMode.BYPASS
+            )
+
+            result = await crawler.arun(config=config)
+            print(f"Page {page + 1}: Found {len(result.extracted_content)} commits")
+
+        await crawler.crawler_strategy.kill_session(session_id)
+
+asyncio.run(advanced_session_crawl_with_hooks())
+```
+
+This technique ensures new content loads before the next action.
+
+---
+
+## Advanced Technique 2: Integrated JavaScript Execution and Waiting
+
+Combine JavaScript execution and waiting logic for concise handling of dynamic content:
+
+```python
+async def integrated_js_and_wait_crawl():
+    async with AsyncWebCrawler() as crawler:
+        session_id = "integrated_session"
+        url = "https://github.com/example/repo/commits/main"
+
+        js_next_page_and_wait = """
+        (async () => {
+            const getCurrentCommit = () => document.querySelector('li.commit-item h4').textContent.trim();
+            const initialCommit = getCurrentCommit();
+            document.querySelector('a.pagination-next').click();
+            while (getCurrentCommit() === initialCommit) {
+                await new Promise(resolve => setTimeout(resolve, 100));
+            }
+        })();
+        """
+
+        for page in range(3):
+            config = CrawlerRunConfig(
+                url=url,
+                session_id=session_id,
+                js_code=js_next_page_and_wait if page > 0 else None,
+                css_selector="li.commit-item",
+                js_only=page > 0,
+                cache_mode=CacheMode.BYPASS
+            )
+
+            result = await crawler.arun(config=config)
+            print(f"Page {page + 1}: Found {len(result.extracted_content)} commits")
+
+        await crawler.crawler_strategy.kill_session(session_id)
+
+asyncio.run(integrated_js_and_wait_crawl())
+```
 
 ---
 
 #### Common Use Cases for Sessions
 
-1. **Authentication Flows**: Login and interact with secured pages.
-2. **Pagination Handling**: Navigate through multiple pages.
-3. **Form Submissions**: Fill forms, submit, and process results.
-4. **Multi-step Processes**: Complete workflows that span multiple actions.
-5. **Dynamic Content Navigation**: Handle JavaScript-rendered or event-triggered content.
+1. **Authentication Flows**: Login and interact with secured pages (see the sketch below).
+
+2. **Pagination Handling**: Navigate through multiple pages.
+
+3. **Form Submissions**: Fill forms, submit, and process results.
+
+4. **Multi-step Processes**: Complete workflows that span multiple actions.
+
+5. **Dynamic Content Navigation**: Handle JavaScript-rendered or event-triggered content.
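+
+For use case 1, here is a minimal login-flow sketch: log in once, then reuse the same session for authenticated pages. The URLs, the `.user-profile` selector, and the form-submit JS are placeholders for your target site:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def login_flow():
+    async with AsyncWebCrawler() as crawler:
+        session_id = "login_flow_session"
+
+        # Step 1: Login (placeholder form-submit JS)
+        login_config = CrawlerRunConfig(
+            url="https://example.com/login",
+            session_id=session_id,
+            js_code="document.querySelector('form').submit();"
+        )
+        await crawler.arun(config=login_config)
+
+        # Step 2: Same session_id, so the authenticated state carries over
+        dashboard_config = CrawlerRunConfig(
+            url="https://example.com/dashboard",
+            session_id=session_id,
+            wait_for="css:.user-profile"  # Wait for authenticated content
+        )
+        result = await crawler.arun(config=dashboard_config)
+        print("Dashboard fetched:", result.success)
+
+        await crawler.crawler_strategy.kill_session(session_id)
+
+asyncio.run(login_flow())
+```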
diff --git a/docs/md_v2/advanced/ssl-certificate.md b/docs/md_v2/advanced/ssl-certificate.md new file mode 100644 index 00000000..fa04716a --- /dev/null +++ b/docs/md_v2/advanced/ssl-certificate.md @@ -0,0 +1,179 @@ +# `SSLCertificate` Reference + +The **`SSLCertificate`** class encapsulates an SSL certificate’s data and allows exporting it in various formats (PEM, DER, JSON, or text). It’s used within **Crawl4AI** whenever you set **`fetch_ssl_certificate=True`** in your **`CrawlerRunConfig`**. + +## 1. Overview + +**Location**: `crawl4ai/ssl_certificate.py` + +```python +class SSLCertificate: + """ + Represents an SSL certificate with methods to export in various formats. + + Main Methods: + - from_url(url, timeout=10) + - from_file(file_path) + - from_binary(binary_data) + - to_json(filepath=None) + - to_pem(filepath=None) + - to_der(filepath=None) + ... + + Common Properties: + - issuer + - subject + - valid_from + - valid_until + - fingerprint + """ +``` + +### Typical Use Case +1. You **enable** certificate fetching in your crawl by: + ```python + CrawlerRunConfig(fetch_ssl_certificate=True, ...) + ``` +2. After `arun()`, if `result.ssl_certificate` is present, it’s an instance of **`SSLCertificate`**. +3. You can **read** basic properties (issuer, subject, validity) or **export** them in multiple formats. + +--- + +## 2. Construction & Fetching + +### 2.1 **`from_url(url, timeout=10)`** +Manually load an SSL certificate from a given URL (port 443). Typically used internally, but you can call it directly if you want: + +```python +cert = SSLCertificate.from_url("https://example.com") +if cert: + print("Fingerprint:", cert.fingerprint) +``` + +### 2.2 **`from_file(file_path)`** +Load from a file containing certificate data in ASN.1 or DER. Rarely needed unless you have local cert files: + +```python +cert = SSLCertificate.from_file("/path/to/cert.der") +``` + +### 2.3 **`from_binary(binary_data)`** +Initialize from raw binary. E.g., if you captured it from a socket or another source: + +```python +cert = SSLCertificate.from_binary(raw_bytes) +``` + +--- + +## 3. Common Properties + +After obtaining a **`SSLCertificate`** instance (e.g. `result.ssl_certificate` from a crawl), you can read: + +1. **`issuer`** *(dict)* + - E.g. `{"CN": "My Root CA", "O": "..."}` +2. **`subject`** *(dict)* + - E.g. `{"CN": "example.com", "O": "ExampleOrg"}` +3. **`valid_from`** *(str)* + - NotBefore date/time. Often in ASN.1/UTC format. +4. **`valid_until`** *(str)* + - NotAfter date/time. +5. **`fingerprint`** *(str)* + - The SHA-256 digest (lowercase hex). + - E.g. `"d14d2e..."` + +--- + +## 4. Export Methods + +Once you have a **`SSLCertificate`** object, you can **export** or **inspect** it: + +### 4.1 **`to_json(filepath=None)` → `Optional[str]`** +- Returns a JSON string containing the parsed certificate fields. +- If `filepath` is provided, saves it to disk instead, returning `None`. + +**Usage**: +```python +json_data = cert.to_json() # returns JSON string +cert.to_json("certificate.json") # writes file, returns None +``` + +### 4.2 **`to_pem(filepath=None)` → `Optional[str]`** +- Returns a PEM-encoded string (common for web servers). +- If `filepath` is provided, saves it to disk instead. + +```python +pem_str = cert.to_pem() # in-memory PEM string +cert.to_pem("/path/to/cert.pem") # saved to file +``` + +### 4.3 **`to_der(filepath=None)` → `Optional[bytes]`** +- Returns the original DER (binary ASN.1) bytes. +- If `filepath` is specified, writes the bytes there instead. 
+ +```python +der_bytes = cert.to_der() +cert.to_der("certificate.der") +``` + +### 4.4 (Optional) **`export_as_text()`** +- If you see a method like `export_as_text()`, it typically returns an OpenSSL-style textual representation. +- Not always needed, but can help for debugging or manual inspection. + +--- + +## 5. Example Usage in Crawl4AI + +Below is a minimal sample showing how the crawler obtains an SSL cert from a site, then reads or exports it. The code snippet: + +```python +import asyncio +import os +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +async def main(): + tmp_dir = "tmp" + os.makedirs(tmp_dir, exist_ok=True) + + config = CrawlerRunConfig( + fetch_ssl_certificate=True, + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=config) + if result.success and result.ssl_certificate: + cert = result.ssl_certificate + # 1. Basic Info + print("Issuer CN:", cert.issuer.get("CN", "")) + print("Valid until:", cert.valid_until) + print("Fingerprint:", cert.fingerprint) + + # 2. Export + cert.to_json(os.path.join(tmp_dir, "certificate.json")) + cert.to_pem(os.path.join(tmp_dir, "certificate.pem")) + cert.to_der(os.path.join(tmp_dir, "certificate.der")) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +## 6. Notes & Best Practices + +1. **Timeout**: `SSLCertificate.from_url` internally uses a default **10s** socket connect and wraps SSL. +2. **Binary Form**: The certificate is loaded in ASN.1 (DER) form, then re-parsed by `OpenSSL.crypto`. +3. **Validation**: This does **not** validate the certificate chain or trust store. It only fetches and parses. +4. **Integration**: Within Crawl4AI, you typically just set `fetch_ssl_certificate=True` in `CrawlerRunConfig`; the final result’s `ssl_certificate` is automatically built. +5. **Export**: If you need to store or analyze a cert, the `to_json` and `to_pem` are quite universal. + +--- + +### Summary + +- **`SSLCertificate`** is a convenience class for capturing and exporting the **TLS certificate** from your crawled site(s). +- Common usage is in the **`CrawlResult.ssl_certificate`** field, accessible after setting `fetch_ssl_certificate=True`. +- Offers quick access to essential certificate details (`issuer`, `subject`, `fingerprint`) and is easy to export (PEM, DER, JSON) for further analysis or server usage. + +Use it whenever you need **insight** into a site’s certificate or require some form of cryptographic or compliance check. \ No newline at end of file diff --git a/docs/md_v2/api/arun.md b/docs/md_v2/api/arun.md index 509991e5..d1d5eae9 100644 --- a/docs/md_v2/api/arun.md +++ b/docs/md_v2/api/arun.md @@ -1,244 +1,305 @@ -# Complete Parameter Guide for arun() +Below is a **revised parameter guide** for **`arun()`** in **AsyncWebCrawler**, reflecting the **new** approach where all parameters are passed via a **`CrawlerRunConfig`** instead of directly to `arun()`. Each section includes example usage in the new style, ensuring a clear, modern approach. -The following parameters can be passed to the `arun()` method. They are organized by their primary usage context and functionality. +--- -## Core Parameters +# `arun()` Parameter Guide (New Approach) + +In Crawl4AI’s **latest** configuration model, nearly all parameters that once went directly to `arun()` are now part of **`CrawlerRunConfig`**. 
When calling `arun()`, you provide: ```python await crawler.arun( - url="https://example.com", # Required: URL to crawl - verbose=True, # Enable detailed logging - cache_mode=CacheMode.ENABLED, # Control cache behavior - warmup=True # Whether to run warmup check + url="https://example.com", + config=my_run_config ) ``` -## Cache Control +Below is an organized look at the parameters that can go inside `CrawlerRunConfig`, divided by their functional areas. For **Browser** settings (e.g., `headless`, `browser_type`), see [BrowserConfig](./parameters.md). + +--- + +## 1. Core Usage ```python -from crawl4ai import CacheMode +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -await crawler.arun( - cache_mode=CacheMode.ENABLED, # Normal caching (read/write) - # Other cache modes: - # cache_mode=CacheMode.DISABLED # No caching at all - # cache_mode=CacheMode.READ_ONLY # Only read from cache - # cache_mode=CacheMode.WRITE_ONLY # Only write to cache - # cache_mode=CacheMode.BYPASS # Skip cache for this operation +async def main(): + run_config = CrawlerRunConfig( + verbose=True, # Detailed logging + cache_mode=CacheMode.ENABLED, # Use normal read/write cache + # ... other parameters + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + config=run_config + ) + print(result.cleaned_html[:500]) + +``` + +**Key Fields**: +- `verbose=True` logs each crawl step. +- `cache_mode` decides how to read/write the local crawl cache. + +--- + +## 2. Cache Control + +**`cache_mode`** (default: `CacheMode.ENABLED`) +Use a built-in enum from `CacheMode`: +- `ENABLED`: Normal caching—reads if available, writes if missing. +- `DISABLED`: No caching—always refetch pages. +- `READ_ONLY`: Reads from cache only; no new writes. +- `WRITE_ONLY`: Writes to cache but doesn’t read existing data. +- `BYPASS`: Skips reading cache for this crawl (though it might still write if set up that way). + +```python +run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS ) ``` -## Content Processing Parameters +**Additional flags**: +- `bypass_cache=True` acts like `CacheMode.BYPASS`. +- `disable_cache=True` acts like `CacheMode.DISABLED`. +- `no_cache_read=True` acts like `CacheMode.WRITE_ONLY`. +- `no_cache_write=True` acts like `CacheMode.READ_ONLY`. + +--- + +## 3. Content Processing & Selection + +### 3.1 Text Processing -### Text Processing ```python -await crawler.arun( - word_count_threshold=10, # Minimum words per content block - image_description_min_word_threshold=5, # Minimum words for image descriptions - only_text=False, # Extract only text content - excluded_tags=['form', 'nav'], # HTML tags to exclude - keep_data_attributes=False, # Preserve data-* attributes +run_config = CrawlerRunConfig( + word_count_threshold=10, # Ignore text blocks <10 words + only_text=False, # If True, tries to remove non-text elements + keep_data_attributes=False # Keep or discard data-* attributes ) ``` -### Content Selection +### 3.2 Content Selection + ```python -await crawler.arun( - css_selector=".main-content", # CSS selector for content extraction - remove_forms=True, # Remove all form elements - remove_overlay_elements=True, # Remove popups/modals/overlays +run_config = CrawlerRunConfig( + css_selector=".main-content", # Focus on .main-content region only + excluded_tags=["form", "nav"], # Remove entire tag blocks + remove_forms=True, # Specifically strip
<form> elements
+    remove_overlay_elements=True,    # Attempt to remove modals/popups
+)
+```
+
+### 3.3 Link Handling
+
+```python
+run_config = CrawlerRunConfig(
+    exclude_external_links=True,          # Remove external links from final content
+    exclude_social_media_links=True,      # Remove links to known social sites
+    exclude_domains=["ads.example.com"],  # Exclude links to these domains
+    exclude_social_media_domains=["facebook.com","twitter.com"],  # Extend the default list
+)
+```
+
+### 3.4 Media Filtering
+
+```python
+run_config = CrawlerRunConfig(
+    exclude_external_images=True  # Strip images from other domains
+)
+```
+
+---
+
+## 4. Page Navigation & Timing
+
+### 4.1 Basic Browser Flow
+
+```python
+run_config = CrawlerRunConfig(
+    wait_for="css:.dynamic-content",  # Wait for .dynamic-content
+    delay_before_return_html=2.0,     # Wait 2s before capturing final HTML
+    page_timeout=60000,               # Navigation & script timeout (ms)
+)
+```
+
+**Key Fields**:
+- `wait_for`:
+  - `"css:selector"` or
+  - `"js:() => boolean"`
+    e.g. `js:() => document.querySelectorAll('.item').length > 10`.
+
+- `mean_delay` & `max_range`: define random delays for `arun_many()` calls.
+- `semaphore_count`: concurrency limit when crawling multiple URLs.
+
+### 4.2 JavaScript Execution
+
+```python
+run_config = CrawlerRunConfig(
+    js_code=[
+        "window.scrollTo(0, document.body.scrollHeight);",
+        "document.querySelector('.load-more')?.click();"
+    ],
+    js_only=False
+)
+```
+
+- `js_code` can be a single string or a list of strings.
+- `js_only=True` means “I’m continuing in the same session with new JS steps, no new full navigation.”
+
+### 4.3 Anti-Bot
+
+```python
+run_config = CrawlerRunConfig(
+    magic=True,
+    simulate_user=True,
+    override_navigator=True
+)
+```
+- `magic=True` tries multiple stealth features.
+- `simulate_user=True` mimics mouse movements or random delays.
+- `override_navigator=True` fakes some navigator properties (like user agent checks).
+
+---
+
+## 5. Session Management
+
+**`session_id`**:
+```python
+run_config = CrawlerRunConfig(
+    session_id="my_session123"
+)
+```
+If re-used in subsequent `arun()` calls, the same tab/page context is continued (helpful for multi-step tasks or stateful browsing).
+
+---
+
+## 6. 
+
+---
+
+## 6. Screenshot, PDF & Media Options
+
+```python
+run_config = CrawlerRunConfig(
+    screenshot=True,             # Grab a screenshot as base64
+    screenshot_wait_for=1.0,     # Wait 1s before capturing
+    pdf=True,                    # Also produce a PDF
+    image_description_min_word_threshold=5,  # If analyzing alt text
+    image_score_threshold=3,     # Filter out low-score images
+)
+```
+**Where they appear**:
+- `result.screenshot` → Base64 screenshot string.
+- `result.pdf` → Byte array with PDF data.
+
+---
+
+## 7. Extraction Strategy
+
+**For advanced data extraction** (CSS/LLM-based), set `extraction_strategy`:
+
+```python
+run_config = CrawlerRunConfig(
+    extraction_strategy=my_css_or_llm_strategy
)
```

-### Session Management
-```python
-await crawler.arun(
-    session_id="my_session",  # Session identifier for persistent browsing
-)
-```
+The extracted data will appear in `result.extracted_content`.

-### Screenshot Options
-```python
-await crawler.arun(
-    screenshot=True,           # Take page screenshot
-    screenshot_wait_for=2.0,   # Wait before screenshot (seconds)
-)
-```
+---
+
+## 8. Comprehensive Example
+
+Below is a snippet combining many parameters:

-### Proxy Configuration
```python
-await crawler.arun(
-    proxy="http://proxy.example.com:8080",  # Simple proxy URL
-    proxy_config={  # Advanced proxy settings
-        "server": "http://proxy.example.com:8080",
-        "username": "user",
-        "password": "pass"
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+async def main():
+    # Example schema
+    schema = {
+        "name": "Articles",
+        "baseSelector": "article.post",
+        "fields": [
+            {"name": "title", "selector": "h2", "type": "text"},
+            {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
+        ]
    }
-)
-```

-## Content Extraction Parameters
-
-### Extraction Strategy
-```python
-await crawler.arun(
-    extraction_strategy=LLMExtractionStrategy(
-        provider="ollama/llama2",
-        schema=MySchema.schema(),
-        instruction="Extract specific data"
+    run_config = CrawlerRunConfig(
+        # Core
+        verbose=True,
+        cache_mode=CacheMode.ENABLED,
+
+        # Content
+        word_count_threshold=10,
+        css_selector="main.content",
+        excluded_tags=["nav", "footer"],
+        exclude_external_links=True,
+
+        # Page & JS
+        js_code="document.querySelector('.show-more')?.click();",
+        wait_for="css:.loaded-block",
+        page_timeout=30000,
+
+        # Extraction
+        extraction_strategy=JsonCssExtractionStrategy(schema),
+
+        # Session
+        session_id="persistent_session",
+
+        # Media
+        screenshot=True,
+        pdf=True,
+
+        # Anti-bot
+        simulate_user=True,
+        magic=True,
    )
-)
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com/posts", config=run_config)
+        if result.success:
+            print("HTML length:", len(result.cleaned_html))
+            print("Extraction JSON:", result.extracted_content)
+            if result.screenshot:
+                print("Screenshot length:", len(result.screenshot))
+            if result.pdf:
+                print("PDF bytes length:", len(result.pdf))
+        else:
+            print("Error:", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
```

-### Chunking Strategy
-```python
-await crawler.arun(
-    chunking_strategy=RegexChunking(
-        patterns=[r'\n\n', r'\.\s+']
-    )
-)
-```
+**What we covered**:
+1. **Crawling** the main content region, ignoring external links.
+2. Running **JavaScript** to click “.show-more”.
+3. **Waiting** for “.loaded-block” to appear.
+4. Generating a **screenshot** & **PDF** of the final page.
+5. 
Extracting repeated “article.post” elements with a **CSS-based** extraction strategy. -### HTML to Text Options -```python -await crawler.arun( - html2text={ - "ignore_links": False, - "ignore_images": False, - "escape_dot": False, - "body_width": 0, - "protect_links": True, - "unicode_snob": True - } -) -``` +--- -## Debug Options -```python -await crawler.arun( - log_console=True, # Log browser console messages -) -``` +## 9. Best Practices -## Parameter Interactions and Notes +1. **Use `BrowserConfig` for global browser** settings (headless, user agent). +2. **Use `CrawlerRunConfig`** to handle the **specific** crawl needs: content filtering, caching, JS, screenshot, extraction, etc. +3. Keep your **parameters consistent** in run configs—especially if you’re part of a large codebase with multiple crawls. +4. **Limit** large concurrency (`semaphore_count`) if the site or your system can’t handle it. +5. For dynamic pages, set `js_code` or `scan_full_page` so you load all content. -1. **Cache and Performance Setup** - ```python - # Optimal caching for repeated crawls - await crawler.arun( - cache_mode=CacheMode.ENABLED, - word_count_threshold=10, - process_iframes=False - ) - ``` +--- -2. **Dynamic Content Handling** - ```python - # Handle lazy-loaded content - await crawler.arun( - js_code="window.scrollTo(0, document.body.scrollHeight);", - wait_for="css:.lazy-content", - delay_before_return_html=2.0, - cache_mode=CacheMode.WRITE_ONLY # Cache results after dynamic load - ) - ``` +## 10. Conclusion -3. **Content Extraction Pipeline** - ```python - # Complete extraction setup - await crawler.arun( - css_selector=".main-content", - word_count_threshold=20, - extraction_strategy=my_strategy, - chunking_strategy=my_chunking, - process_iframes=True, - remove_overlay_elements=True, - cache_mode=CacheMode.ENABLED - ) - ``` +All parameters that used to be direct arguments to `arun()` now belong in **`CrawlerRunConfig`**. This approach: -## Best Practices +- Makes code **clearer** and **more maintainable**. +- Minimizes confusion about which arguments affect global vs. per-crawl behavior. +- Allows you to create **reusable** config objects for different pages or tasks. -1. **Performance Optimization** - ```python - await crawler.arun( - cache_mode=CacheMode.ENABLED, # Use full caching - word_count_threshold=10, # Filter out noise - process_iframes=False # Skip iframes if not needed - ) - ``` +For a **full** reference, check out the [CrawlerRunConfig Docs](./parameters.md). -2. **Reliable Scraping** - ```python - await crawler.arun( - magic=True, # Enable anti-detection - delay_before_return_html=1.0, # Wait for dynamic content - page_timeout=60000, # Longer timeout for slow pages - cache_mode=CacheMode.WRITE_ONLY # Cache results after successful crawl - ) - ``` - -3. **Clean Content** - ```python - await crawler.arun( - remove_overlay_elements=True, # Remove popups - excluded_tags=['nav', 'aside'],# Remove unnecessary elements - keep_data_attributes=False, # Remove data attributes - cache_mode=CacheMode.ENABLED # Use cache for faster processing - ) - ``` \ No newline at end of file +Happy crawling with your **structured, flexible** config approach! 
\ No newline at end of file diff --git a/docs/md_v2/api/async-webcrawler.md b/docs/md_v2/api/async-webcrawler.md index be956101..51a6ccce 100644 --- a/docs/md_v2/api/async-webcrawler.md +++ b/docs/md_v2/api/async-webcrawler.md @@ -1,320 +1,283 @@ +Below is the **updated** guide for the **AsyncWebCrawler** class, reflecting the **new** recommended approach of configuring the browser via **`BrowserConfig`** and each crawl via **`CrawlerRunConfig`**. While the crawler still accepts legacy parameters for backward compatibility, the modern, maintainable way is shown below. + +--- + # AsyncWebCrawler -The `AsyncWebCrawler` class is the main interface for web crawling operations. It provides asynchronous web crawling capabilities with extensive configuration options. +The **`AsyncWebCrawler`** is the core class for asynchronous web crawling in Crawl4AI. You typically create it **once**, optionally customize it with a **`BrowserConfig`** (e.g., headless, user agent), then **run** multiple **`arun()`** calls with different **`CrawlerRunConfig`** objects. -## Constructor +**Recommended usage**: +1. **Create** a `BrowserConfig` for global browser settings. +2. **Instantiate** `AsyncWebCrawler(config=browser_config)`. +3. **Use** the crawler in an async context manager (`async with`) or manage start/close manually. +4. **Call** `arun(url, config=crawler_run_config)` for each page you want. + +--- + +## 1. Constructor Overview ```python -AsyncWebCrawler( - # Browser Settings - browser_type: str = "chromium", # Options: "chromium", "firefox", "webkit" - headless: bool = True, # Run browser in headless mode - verbose: bool = False, # Enable verbose logging - - # Cache Settings - always_by_pass_cache: bool = False, # Always bypass cache - base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), # Base directory for cache - - # Network Settings - proxy: str = None, # Simple proxy URL - proxy_config: Dict = None, # Advanced proxy configuration - - # Browser Behavior - sleep_on_close: bool = False, # Wait before closing browser - - # Custom Settings - user_agent: str = None, # Custom user agent - headers: Dict[str, str] = {}, # Custom HTTP headers - js_code: Union[str, List[str]] = None, # Default JavaScript to execute -) +class AsyncWebCrawler: + def __init__( + self, + crawler_strategy: Optional[AsyncCrawlerStrategy] = None, + config: Optional[BrowserConfig] = None, + always_bypass_cache: bool = False, # deprecated + always_by_pass_cache: Optional[bool] = None, # also deprecated + base_directory: str = ..., + thread_safe: bool = False, + **kwargs, + ): + """ + Create an AsyncWebCrawler instance. + + Args: + crawler_strategy: (Advanced) Provide a custom crawler strategy if needed. + config: A BrowserConfig object specifying how the browser is set up. + always_bypass_cache: (Deprecated) Use CrawlerRunConfig.cache_mode instead. + base_directory: Folder for storing caches/logs (if relevant). + thread_safe: If True, attempts some concurrency safeguards. Usually False. + **kwargs: Additional legacy or debugging parameters. 
+ """ ``` -### Parameters in Detail +### Typical Initialization -#### Browser Settings +```python +from crawl4ai import AsyncWebCrawler, BrowserConfig -- **browser_type** (str, optional) - - Default: `"chromium"` - - Options: `"chromium"`, `"firefox"`, `"webkit"` - - Controls which browser engine to use - ```python - # Example: Using Firefox - crawler = AsyncWebCrawler(browser_type="firefox") - ``` +browser_cfg = BrowserConfig( + browser_type="chromium", + headless=True, + verbose=True +) -- **headless** (bool, optional) - - Default: `True` - - When `True`, browser runs without GUI - - Set to `False` for debugging - ```python - # Visible browser for debugging - crawler = AsyncWebCrawler(headless=False) - ``` +crawler = AsyncWebCrawler(config=browser_cfg) +``` -- **verbose** (bool, optional) - - Default: `False` - - Enables detailed logging - ```python - # Enable detailed logging - crawler = AsyncWebCrawler(verbose=True) - ``` +**Notes**: +- **Legacy** parameters like `always_bypass_cache` remain for backward compatibility, but prefer to set **caching** in `CrawlerRunConfig`. -#### Cache Settings +--- -- **always_by_pass_cache** (bool, optional) - - Default: `False` - - When `True`, always fetches fresh content - ```python - # Always fetch fresh content - crawler = AsyncWebCrawler(always_by_pass_cache=True) - ``` +## 2. Lifecycle: Start/Close or Context Manager -- **base_directory** (str, optional) - - Default: User's home directory - - Base path for cache storage - ```python - # Custom cache directory - crawler = AsyncWebCrawler(base_directory="/path/to/cache") - ``` +### 2.1 Context Manager (Recommended) -#### Network Settings +```python +async with AsyncWebCrawler(config=browser_cfg) as crawler: + result = await crawler.arun("https://example.com") + # The crawler automatically starts/closes resources +``` -- **proxy** (str, optional) - - Simple proxy URL - ```python - # Using simple proxy - crawler = AsyncWebCrawler(proxy="http://proxy.example.com:8080") - ``` +When the `async with` block ends, the crawler cleans up (closes the browser, etc.). -- **proxy_config** (Dict, optional) - - Advanced proxy configuration with authentication - ```python - # Advanced proxy with auth - crawler = AsyncWebCrawler(proxy_config={ - "server": "http://proxy.example.com:8080", - "username": "user", - "password": "pass" - }) - ``` +### 2.2 Manual Start & Close -#### Browser Behavior +```python +crawler = AsyncWebCrawler(config=browser_cfg) +await crawler.start() -- **sleep_on_close** (bool, optional) - - Default: `False` - - Adds delay before closing browser - ```python - # Wait before closing - crawler = AsyncWebCrawler(sleep_on_close=True) - ``` +result1 = await crawler.arun("https://example.com") +result2 = await crawler.arun("https://another.com") -#### Custom Settings +await crawler.close() +``` -- **user_agent** (str, optional) - - Custom user agent string - ```python - # Custom user agent - crawler = AsyncWebCrawler( - user_agent="Mozilla/5.0 (Custom Agent) Chrome/90.0" - ) - ``` +Use this style if you have a **long-running** application or need full control of the crawler’s lifecycle. 
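+
+If the crawler lives inside a long-running service, it is worth guarding `close()` so the browser is released even when a crawl raises. A minimal sketch:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig
+
+async def run_service():
+    crawler = AsyncWebCrawler(config=BrowserConfig(headless=True))
+    await crawler.start()
+    try:
+        result = await crawler.arun("https://example.com")
+        print("OK" if result.success else result.error_message)
+    finally:
+        # Release the browser even if arun() raised
+        await crawler.close()
+
+asyncio.run(run_service())
+```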
-- **headers** (Dict[str, str], optional) - - Custom HTTP headers - ```python - # Custom headers - crawler = AsyncWebCrawler( - headers={ - "Accept-Language": "en-US", - "Custom-Header": "Value" - } - ) - ``` +--- -- **js_code** (Union[str, List[str]], optional) - - Default JavaScript to execute on each page - ```python - # Default JavaScript - crawler = AsyncWebCrawler( - js_code=[ - "window.scrollTo(0, document.body.scrollHeight);", - "document.querySelector('.load-more').click();" - ] - ) - ``` - -## Methods - -### arun() - -The primary method for crawling web pages. +## 3. Primary Method: `arun()` ```python async def arun( - # Required - url: str, # URL to crawl - - # Content Selection - css_selector: str = None, # CSS selector for content - word_count_threshold: int = 10, # Minimum words per block - - # Cache Control - bypass_cache: bool = False, # Bypass cache for this request - - # Session Management - session_id: str = None, # Session identifier - - # Screenshot Options - screenshot: bool = False, # Take screenshot - screenshot_wait_for: float = None, # Wait before screenshot - - # Content Processing - process_iframes: bool = False, # Process iframe content - remove_overlay_elements: bool = False, # Remove popups/modals - - # Anti-Bot Settings - simulate_user: bool = False, # Simulate human behavior - override_navigator: bool = False, # Override navigator properties - magic: bool = False, # Enable all anti-detection - - # Content Filtering - excluded_tags: List[str] = None, # HTML tags to exclude - exclude_external_links: bool = False, # Remove external links - exclude_social_media_links: bool = False, # Remove social media links - - # JavaScript Handling - js_code: Union[str, List[str]] = None, # JavaScript to execute - wait_for: str = None, # Wait condition - - # Page Loading - page_timeout: int = 60000, # Page load timeout (ms) - delay_before_return_html: float = None, # Wait before return - - # Extraction - extraction_strategy: ExtractionStrategy = None # Extraction strategy + self, + url: str, + config: Optional[CrawlerRunConfig] = None, + # Legacy parameters for backward compatibility... ) -> CrawlResult: + ... ``` -### Usage Examples +### 3.1 New Approach -#### Basic Crawling -```python -async with AsyncWebCrawler() as crawler: - result = await crawler.arun(url="https://example.com") -``` - -#### Advanced Crawling -```python -async with AsyncWebCrawler( - browser_type="firefox", - verbose=True, - headers={"Custom-Header": "Value"} -) as crawler: - result = await crawler.arun( - url="https://example.com", - css_selector=".main-content", - word_count_threshold=20, - process_iframes=True, - magic=True, - wait_for="css:.dynamic-content", - screenshot=True - ) -``` - -#### Session Management -```python -async with AsyncWebCrawler() as crawler: - # First request - result1 = await crawler.arun( - url="https://example.com/login", - session_id="my_session" - ) - - # Subsequent request using same session - result2 = await crawler.arun( - url="https://example.com/protected", - session_id="my_session" - ) -``` - -## Context Manager - -AsyncWebCrawler implements the async context manager protocol: +You pass a `CrawlerRunConfig` object that sets up everything about a crawl—content filtering, caching, session reuse, JS code, screenshots, etc. 
```python
+import asyncio
+from crawl4ai import CrawlerRunConfig, CacheMode

-async def __aenter__(self) -> 'AsyncWebCrawler':
-    # Initialize browser and resources
-    return self
+run_cfg = CrawlerRunConfig(
+    cache_mode=CacheMode.BYPASS,
+    css_selector="main.article",
+    word_count_threshold=10,
+    screenshot=True
)

-async def __aexit__(self, *args):
-    # Cleanup resources
-    pass
-```
-
-Always use AsyncWebCrawler with async context manager:
-```python
-async with AsyncWebCrawler() as crawler:
-    # Your crawling code here
-    pass
-```
-
-## Best Practices
-
-1. **Resource Management**
-```python
-# Always use context manager
-async with AsyncWebCrawler() as crawler:
-    # Crawler will be properly cleaned up
-    pass
-```
-
-2. **Error Handling**
-```python
-try:
-    async with AsyncWebCrawler() as crawler:
-        result = await crawler.arun(url="https://example.com")
-        if not result.success:
-            print(f"Crawl failed: {result.error_message}")
-except Exception as e:
-    print(f"Error: {str(e)}")
-```
-
-3. **Performance Optimization**
-```python
-# Enable caching for better performance
-crawler = AsyncWebCrawler(
-    always_by_pass_cache=False,
-    verbose=True
+async with AsyncWebCrawler(config=browser_cfg) as crawler:
+    result = await crawler.arun("https://example.com/news", config=run_cfg)
+    print("Crawled HTML length:", len(result.cleaned_html))
+    if result.screenshot:
+        print("Screenshot base64 length:", len(result.screenshot))
```

-4. **Anti-Detection**
+### 3.2 Legacy Parameters Still Accepted
+
+For **backward** compatibility, `arun()` can still accept direct arguments like `css_selector=...`, `word_count_threshold=...`, etc., but we strongly advise migrating them into a **`CrawlerRunConfig`**.
+
+---
+
+## 4. Helper Methods
+
+### 4.1 `arun_many()`
+
```python
-crawler = AsyncWebCrawler(
-    headless=True,
-    user_agent="Mozilla/5.0...",
-    headers={"Accept-Language": "en-US"}
-)
-result = await crawler.arun(
-    url="https://example.com",
-    magic=True,
-    simulate_user=True
-)
+async def arun_many(
+    self,
+    urls: List[str],
+    config: Optional[CrawlerRunConfig] = None,
+    # Legacy parameters...
+) -> List[CrawlResult]:
+    ...
```

-## Note on Browser Types
+Crawls multiple URLs concurrently. Accepts the same style of `CrawlerRunConfig`. Example:

-Each browser type has its characteristics:
-
-- **chromium**: Best overall compatibility
-- **firefox**: Good for specific use cases
-- **webkit**: Lighter weight, good for basic crawling
-
-Choose based on your specific needs:
```python
-# High compatibility
-crawler = AsyncWebCrawler(browser_type="chromium")
+run_cfg = CrawlerRunConfig(
+    # e.g., concurrency, wait_for, caching, extraction, etc.
+    semaphore_count=5
)

-# Memory efficient
-crawler = AsyncWebCrawler(browser_type="webkit")
-```
\ No newline at end of file
+async with AsyncWebCrawler(config=browser_cfg) as crawler:
+    results = await crawler.arun_many(
+        urls=["https://example.com", "https://another.com"],
+        config=run_cfg
+    )
+    for r in results:
+        print(r.url, ":", len(r.cleaned_html))
+```
+
+### 4.2 `start()` & `close()`
+
+Allows manual lifecycle usage instead of the context manager:
+
+```python
+crawler = AsyncWebCrawler(config=browser_cfg)
+await crawler.start()
+
+# Perform multiple operations
+resultA = await crawler.arun("https://exampleA.com", config=run_cfg)
+resultB = await crawler.arun("https://exampleB.com", config=run_cfg)
+
+await crawler.close()
+```
+
+---
+
+## 5. `CrawlResult` Output
+
+Each `arun()` returns a **`CrawlResult`** containing:
+
+- `url`: Final URL (if redirected).
+- `html`: Original HTML.
+- `cleaned_html`: Sanitized HTML.
+- `markdown_v2` (or future `markdown`): Markdown outputs (raw, fit, etc.).
+- `extracted_content`: If an extraction strategy was used (JSON for CSS/LLM strategies).
+- `screenshot`, `pdf`: If screenshots/PDF requested.
+- `media`, `links`: Information about discovered images/links.
+- `success`, `error_message`: Status info.
+
+For details, see [CrawlResult doc](./crawl-result.md).
+
+---
+
+## 6. Quick Example
+
+Below is an example hooking it all together:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+import json
+
+async def main():
+    # 1. Browser config
+    browser_cfg = BrowserConfig(
+        browser_type="firefox",
+        headless=False,
+        verbose=True
+    )
+
+    # 2. Run config
+    schema = {
+        "name": "Articles",
+        "baseSelector": "article.post",
+        "fields": [
+            {"name": "title", "selector": "h2", "type": "text"},
+            {"name": "url", "selector": "a", "type": "attribute", "attribute": "href"}
+        ]
+    }
+
+    run_cfg = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        extraction_strategy=JsonCssExtractionStrategy(schema),
+        word_count_threshold=15,
+        remove_overlay_elements=True,
+        wait_for="css:.post"  # Wait for posts to appear
+    )
+
+    async with AsyncWebCrawler(config=browser_cfg) as crawler:
+        result = await crawler.arun(
+            url="https://example.com/blog",
+            config=run_cfg
+        )
+
+        if result.success:
+            print("Cleaned HTML length:", len(result.cleaned_html))
+            if result.extracted_content:
+                articles = json.loads(result.extracted_content)
+                print("Extracted articles:", articles[:2])
+        else:
+            print("Error:", result.error_message)
+
+asyncio.run(main())
+```
+
+**Explanation**:
+- We define a **`BrowserConfig`** with Firefox, headless mode disabled, and `verbose=True`.
+- We define a **`CrawlerRunConfig`** that **bypasses cache**, uses a **CSS** extraction schema, has a `word_count_threshold=15`, etc.
+- We pass them to `AsyncWebCrawler(config=...)` and `arun(url=..., config=...)`.
+
+---
+
+## 7. Best Practices & Migration Notes
+
+1. **Use** `BrowserConfig` for **global** settings about the browser’s environment.
+2. **Use** `CrawlerRunConfig` for **per-crawl** logic (caching, content filtering, extraction strategies, wait conditions).
+3. **Avoid** legacy parameters like `css_selector` or `word_count_threshold` directly in `arun()`. Instead:
+
+   ```python
+   run_cfg = CrawlerRunConfig(css_selector=".main-content", word_count_threshold=20)
+   result = await crawler.arun(url="...", config=run_cfg)
+   ```
+
+4. **Context Manager** usage is simplest unless you want a persistent crawler across many calls.
+
+---
+
+## 8. Summary
+
+**AsyncWebCrawler** is your entry point to asynchronous crawling:
+
+- **Constructor** accepts **`BrowserConfig`** (or defaults).
+- **`arun(url, config=CrawlerRunConfig)`** is the main method for single-page crawls.
+- **`arun_many(urls, config=CrawlerRunConfig)`** handles concurrency across multiple URLs.
+- For advanced lifecycle control, use `start()` and `close()` explicitly.
+
+**Migration**:
+- If you used `AsyncWebCrawler(browser_type="chromium", css_selector="...")`, move browser settings to `BrowserConfig(...)` and content/crawl logic to `CrawlerRunConfig(...)`.
+
+This modular approach ensures your code is **clean**, **scalable**, and **easy to maintain**.
For any advanced or rarely used parameters, see the [BrowserConfig docs](../api/parameters.md). \ No newline at end of file diff --git a/docs/md_v2/api/crawl-config.md b/docs/md_v2/api/crawl-config.md deleted file mode 100644 index 928ae1e2..00000000 --- a/docs/md_v2/api/crawl-config.md +++ /dev/null @@ -1,85 +0,0 @@ -# CrawlerRunConfig Parameters Documentation - -## Content Processing Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `word_count_threshold` | int | 200 | Minimum word count threshold before processing content | -| `extraction_strategy` | ExtractionStrategy | None | Strategy to extract structured data from crawled pages. When None, uses NoExtractionStrategy | -| `chunking_strategy` | ChunkingStrategy | RegexChunking() | Strategy to chunk content before extraction | -| `markdown_generator` | MarkdownGenerationStrategy | None | Strategy for generating markdown from extracted content | -| `content_filter` | RelevantContentFilter | None | Optional filter to prune irrelevant content | -| `only_text` | bool | False | If True, attempt to extract text-only content where applicable | -| `css_selector` | str | None | CSS selector to extract a specific portion of the page | -| `excluded_tags` | list[str] | [] | List of HTML tags to exclude from processing | -| `keep_data_attributes` | bool | False | If True, retain `data-*` attributes while removing unwanted attributes | -| `remove_forms` | bool | False | If True, remove all `` elements from the HTML | -| `prettiify` | bool | False | If True, apply `fast_format_html` to produce prettified HTML output | - -## Caching Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `cache_mode` | CacheMode | None | Defines how caching is handled. 
Defaults to CacheMode.ENABLED internally | -| `session_id` | str | None | Optional session ID to persist browser context and page instance | -| `bypass_cache` | bool | False | Legacy parameter, if True acts like CacheMode.BYPASS | -| `disable_cache` | bool | False | Legacy parameter, if True acts like CacheMode.DISABLED | -| `no_cache_read` | bool | False | Legacy parameter, if True acts like CacheMode.WRITE_ONLY | -| `no_cache_write` | bool | False | Legacy parameter, if True acts like CacheMode.READ_ONLY | - -## Page Navigation and Timing Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `wait_until` | str | "domcontentloaded" | The condition to wait for when navigating | -| `page_timeout` | int | 60000 | Timeout in milliseconds for page operations like navigation | -| `wait_for` | str | None | CSS selector or JS condition to wait for before extracting content | -| `wait_for_images` | bool | True | If True, wait for images to load before extracting content | -| `delay_before_return_html` | float | 0.1 | Delay in seconds before retrieving final HTML | -| `mean_delay` | float | 0.1 | Mean base delay between requests when calling arun_many | -| `max_range` | float | 0.3 | Max random additional delay range for requests in arun_many | -| `semaphore_count` | int | 5 | Number of concurrent operations allowed | - -## Page Interaction Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `js_code` | str or list[str] | None | JavaScript code/snippets to run on the page | -| `js_only` | bool | False | If True, indicates subsequent calls are JS-driven updates | -| `ignore_body_visibility` | bool | True | If True, ignore whether the body is visible before proceeding | -| `scan_full_page` | bool | False | If True, scroll through the entire page to load all content | -| `scroll_delay` | float | 0.2 | Delay in seconds between scroll steps if scan_full_page is True | -| `process_iframes` | bool | False | If True, attempts to process and inline iframe content | -| `remove_overlay_elements` | bool | False | If True, remove overlays/popups before extracting HTML | -| `simulate_user` | bool | False | If True, simulate user interactions for anti-bot measures | -| `override_navigator` | bool | False | If True, overrides navigator properties for more human-like behavior | -| `magic` | bool | False | If True, attempts automatic handling of overlays/popups | -| `adjust_viewport_to_content` | bool | False | If True, adjust viewport according to page content dimensions | - -## Media Handling Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `screenshot` | bool | False | Whether to take a screenshot after crawling | -| `screenshot_wait_for` | float | None | Additional wait time before taking a screenshot | -| `screenshot_height_threshold` | int | 20000 | Threshold for page height to decide screenshot strategy | -| `pdf` | bool | False | Whether to generate a PDF of the page | -| `image_description_min_word_threshold` | int | 50 | Minimum words for image description extraction | -| `image_score_threshold` | int | 3 | Minimum score threshold for processing an image | -| `exclude_external_images` | bool | False | If True, exclude all external images from processing | - -## Link and Domain Handling Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `exclude_social_media_domains` | list[str] | 
SOCIAL_MEDIA_DOMAINS | List of domains to exclude for social media links | -| `exclude_external_links` | bool | False | If True, exclude all external links from the results | -| `exclude_social_media_links` | bool | False | If True, exclude links pointing to social media domains | -| `exclude_domains` | list[str] | [] | List of specific domains to exclude from results | - -## Debugging and Logging Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `verbose` | bool | True | Enable verbose logging | -| `log_console` | bool | False | If True, log console messages from the page | \ No newline at end of file diff --git a/docs/md_v2/api/crawl-result.md b/docs/md_v2/api/crawl-result.md index 7e3bda98..929114c7 100644 --- a/docs/md_v2/api/crawl-result.md +++ b/docs/md_v2/api/crawl-result.md @@ -1,302 +1,330 @@ -# CrawlResult +# `CrawlResult` Reference -The `CrawlResult` class represents the result of a web crawling operation. It provides access to various forms of extracted content and metadata from the crawled webpage. +The **`CrawlResult`** class encapsulates everything returned after a single crawl operation. It provides the **raw or processed content**, details on links and media, plus optional metadata (like screenshots, PDFs, or extracted JSON). -## Class Definition +**Location**: `crawl4ai/crawler/models.py` (for reference) ```python class CrawlResult(BaseModel): - """Result of a web crawling operation.""" - - # Basic Information - url: str # Crawled URL - success: bool # Whether crawl succeeded - status_code: Optional[int] = None # HTTP status code - error_message: Optional[str] = None # Error message if failed - - # Content - html: str # Raw HTML content - cleaned_html: Optional[str] = None # Cleaned HTML - fit_html: Optional[str] = None # Most relevant HTML content - markdown: Optional[str] = None # HTML converted to markdown - fit_markdown: Optional[str] = None # Most relevant markdown content - downloaded_files: Optional[List[str]] = None # Downloaded files - - # Extracted Data - extracted_content: Optional[str] = None # Content from extraction strategy - media: Dict[str, List[Dict]] = {} # Extracted media information - links: Dict[str, List[Dict]] = {} # Extracted links - metadata: Optional[dict] = None # Page metadata - - # Additional Data - screenshot: Optional[str] = None # Base64 encoded screenshot - session_id: Optional[str] = None # Session identifier - response_headers: Optional[dict] = None # HTTP response headers + url: str + html: str + success: bool + cleaned_html: Optional[str] = None + media: Dict[str, List[Dict]] = {} + links: Dict[str, List[Dict]] = {} + downloaded_files: Optional[List[str]] = None + screenshot: Optional[str] = None + pdf : Optional[bytes] = None + markdown: Optional[Union[str, MarkdownGenerationResult]] = None + markdown_v2: Optional[MarkdownGenerationResult] = None + fit_markdown: Optional[str] = None + fit_html: Optional[str] = None + extracted_content: Optional[str] = None + metadata: Optional[dict] = None + error_message: Optional[str] = None + session_id: Optional[str] = None + response_headers: Optional[dict] = None + status_code: Optional[int] = None + ssl_certificate: Optional[SSLCertificate] = None + ... ``` -## Properties and Their Data Structures +Below is a **field-by-field** explanation and possible usage patterns. -### Basic Information +--- +## 1. Basic Crawl Info + +### 1.1 **`url`** *(str)* +**What**: The final crawled URL (after any redirects). 
+**Usage**: ```python -# Access basic information -result = await crawler.arun(url="https://example.com") - -print(result.url) # "https://example.com" -print(result.success) # True/False -print(result.status_code) # 200, 404, etc. -print(result.error_message) # Error details if failed +print(result.url) # e.g., "https://example.com/" ``` -### Content Properties - -#### HTML Content +### 1.2 **`success`** *(bool)* +**What**: `True` if the crawl pipeline ended without major errors; `False` otherwise. +**Usage**: ```python -# Raw HTML -html_content = result.html - -# Cleaned HTML (removed ads, popups, etc.) -clean_content = result.cleaned_html - -# Most relevant HTML content -main_content = result.fit_html +if not result.success: + print(f"Crawl failed: {result.error_message}") ``` -#### Markdown Content +### 1.3 **`status_code`** *(Optional[int])* +**What**: The page’s HTTP status code (e.g., 200, 404). +**Usage**: ```python -# Full markdown version -markdown_content = result.markdown - -# Most relevant markdown content -main_content = result.fit_markdown +if result.status_code == 404: + print("Page not found!") ``` -### Media Content - -The media dictionary contains organized media elements: - +### 1.4 **`error_message`** *(Optional[str])* +**What**: If `success=False`, a textual description of the failure. +**Usage**: ```python -# Structure -media = { - "images": [ - { - "src": str, # Image URL - "alt": str, # Alt text - "desc": str, # Contextual description - "score": float, # Relevance score (0-10) - "type": str, # "image" - "width": int, # Image width (if available) - "height": int, # Image height (if available) - "context": str, # Surrounding text - "lazy": bool # Whether image was lazy-loaded - } - ], - "videos": [ - { - "src": str, # Video URL - "type": str, # "video" - "title": str, # Video title - "poster": str, # Thumbnail URL - "duration": str, # Video duration - "description": str # Video description - } - ], - "audios": [ - { - "src": str, # Audio URL - "type": str, # "audio" - "title": str, # Audio title - "duration": str, # Audio duration - "description": str # Audio description - } - ] -} - -# Example usage -for image in result.media["images"]: - if image["score"] > 5: # High-relevance images - print(f"High-quality image: {image['src']}") - print(f"Context: {image['context']}") +if not result.success: + print("Error:", result.error_message) ``` -### Link Analysis - -The links dictionary organizes discovered links: - +### 1.5 **`session_id`** *(Optional[str])* +**What**: The ID used for reusing a browser context across multiple calls. +**Usage**: ```python -# Structure -links = { - "internal": [ - { - "href": str, # URL - "text": str, # Link text - "title": str, # Title attribute - "type": str, # Link type (nav, content, etc.) - "context": str, # Surrounding text - "score": float # Relevance score - } - ], - "external": [ - { - "href": str, # External URL - "text": str, # Link text - "title": str, # Title attribute - "domain": str, # Domain name - "type": str, # Link type - "context": str # Surrounding text - } - ] -} +# If you used session_id="login_session" in CrawlerRunConfig, see it here: +print("Session:", result.session_id) +``` -# Example usage +### 1.6 **`response_headers`** *(Optional[dict])* +**What**: Final HTTP response headers. 
+**Usage**: +```python +if result.response_headers: + print("Server:", result.response_headers.get("Server", "Unknown")) +``` + +### 1.7 **`ssl_certificate`** *(Optional[SSLCertificate])* +**What**: If `fetch_ssl_certificate=True` in your CrawlerRunConfig, **`result.ssl_certificate`** contains a [**`SSLCertificate`**](../advanced/ssl-certificate.md) object describing the site’s certificate. You can export the cert in multiple formats (PEM/DER/JSON) or access its properties like `issuer`, + `subject`, `valid_from`, `valid_until`, etc. +**Usage**: +```python +if result.ssl_certificate: + print("Issuer:", result.ssl_certificate.issuer) +``` + +--- + +## 2. Raw / Cleaned Content + +### 2.1 **`html`** *(str)* +**What**: The **original** unmodified HTML from the final page load. +**Usage**: +```python +# Possibly large +print(len(result.html)) +``` + +### 2.2 **`cleaned_html`** *(Optional[str])* +**What**: A sanitized HTML version—scripts, styles, or excluded tags are removed based on your `CrawlerRunConfig`. +**Usage**: +```python +print(result.cleaned_html[:500]) # Show a snippet +``` + +### 2.3 **`fit_html`** *(Optional[str])* +**What**: If a **content filter** or heuristic (e.g., Pruning/BM25) modifies the HTML, the “fit” or post-filter version. +**When**: This is **only** present if your `markdown_generator` or `content_filter` produces it. +**Usage**: +```python +if result.fit_html: + print("High-value HTML content:", result.fit_html[:300]) +``` + +--- + +## 3. Markdown Fields + +### 3.1 The Markdown Generation Approach + +Crawl4AI can convert HTML→Markdown, optionally including: + +- **Raw** markdown +- **Links as citations** (with a references section) +- **Fit** markdown if a **content filter** is used (like Pruning or BM25) + +### 3.2 **`markdown_v2`** *(Optional[MarkdownGenerationResult])* +**What**: The **structured** object holding multiple markdown variants. Soon to be consolidated into `markdown`. + +**`MarkdownGenerationResult`** includes: +- **`raw_markdown`** *(str)*: The full HTML→Markdown conversion. +- **`markdown_with_citations`** *(str)*: Same markdown, but with link references as academic-style citations. +- **`references_markdown`** *(str)*: The reference list or footnotes at the end. +- **`fit_markdown`** *(Optional[str])*: If content filtering (Pruning/BM25) was applied, the filtered “fit” text. +- **`fit_html`** *(Optional[str])*: The HTML that led to `fit_markdown`. + +**Usage**: +```python +if result.markdown_v2: + md_res = result.markdown_v2 + print("Raw MD:", md_res.raw_markdown[:300]) + print("Citations MD:", md_res.markdown_with_citations[:300]) + print("References:", md_res.references_markdown) + if md_res.fit_markdown: + print("Pruned text:", md_res.fit_markdown[:300]) +``` + +### 3.3 **`markdown`** *(Optional[Union[str, MarkdownGenerationResult]])* +**What**: In future versions, `markdown` will fully replace `markdown_v2`. Right now, it might be a `str` or a `MarkdownGenerationResult`. +**Usage**: +```python +# Soon, you might see: +if isinstance(result.markdown, MarkdownGenerationResult): + print(result.markdown.raw_markdown[:200]) +else: + print(result.markdown) +``` + +### 3.4 **`fit_markdown`** *(Optional[str])* +**What**: A direct reference to the final filtered markdown (legacy approach). +**When**: This is set if a filter or content strategy explicitly writes there. Usually overshadowed by `markdown_v2.fit_markdown`. 
+**Usage**: +```python +print(result.fit_markdown) # Legacy field, prefer result.markdown_v2.fit_markdown +``` + +**Important**: “Fit” content (in `fit_markdown`/`fit_html`) only exists if you used a **filter** (like **PruningContentFilter** or **BM25ContentFilter**) within a `MarkdownGenerationStrategy`. + +--- + +## 4. Media & Links + +### 4.1 **`media`** *(Dict[str, List[Dict]])* +**What**: Contains info about discovered images, videos, or audio. Typically keys: `"images"`, `"videos"`, `"audios"`. +**Common Fields** in each item: + +- `src` *(str)*: Media URL +- `alt` or `title` *(str)*: Descriptive text +- `score` *(float)*: Relevance score if the crawler’s heuristic found it “important” +- `desc` or `description` *(Optional[str])*: Additional context extracted from surrounding text + +**Usage**: +```python +images = result.media.get("images", []) +for img in images: + if img.get("score", 0) > 5: + print("High-value image:", img["src"]) +``` + +### 4.2 **`links`** *(Dict[str, List[Dict]])* +**What**: Holds internal and external link data. Usually two keys: `"internal"` and `"external"`. +**Common Fields**: + +- `href` *(str)*: The link target +- `text` *(str)*: Link text +- `title` *(str)*: Title attribute +- `context` *(str)*: Surrounding text snippet +- `domain` *(str)*: If external, the domain + +**Usage**: +```python for link in result.links["internal"]: - print(f"Internal link: {link['href']}") - print(f"Context: {link['context']}") + print(f"Internal link to {link['href']} with text {link['text']}") ``` -### Metadata +--- -The metadata dictionary contains page information: +## 5. Additional Fields +### 5.1 **`extracted_content`** *(Optional[str])* +**What**: If you used **`extraction_strategy`** (CSS, LLM, etc.), the structured output (JSON). +**Usage**: ```python -# Structure -metadata = { - "title": str, # Page title - "description": str, # Meta description - "keywords": List[str], # Meta keywords - "author": str, # Author information - "published_date": str, # Publication date - "modified_date": str, # Last modified date - "language": str, # Page language - "canonical_url": str, # Canonical URL - "og_data": Dict, # Open Graph data - "twitter_data": Dict # Twitter card data -} - -# Example usage -if result.metadata: - print(f"Title: {result.metadata['title']}") - print(f"Author: {result.metadata.get('author', 'Unknown')}") -``` - -### Extracted Content - -Content from extraction strategies: - -```python -# For LLM or CSS extraction strategies if result.extracted_content: - structured_data = json.loads(result.extracted_content) - print(structured_data) + data = json.loads(result.extracted_content) + print(data) ``` -### Screenshot - -Base64 encoded screenshot: - +### 5.2 **`downloaded_files`** *(Optional[List[str]])* +**What**: If `accept_downloads=True` in your `BrowserConfig` + `downloads_path`, lists local file paths for downloaded items. +**Usage**: ```python -# Save screenshot if available +if result.downloaded_files: + for file_path in result.downloaded_files: + print("Downloaded:", file_path) +``` + +### 5.3 **`screenshot`** *(Optional[str])* +**What**: Base64-encoded screenshot if `screenshot=True` in `CrawlerRunConfig`. 
+**Usage**: +```python +import base64 if result.screenshot: - import base64 - - # Decode and save - with open("screenshot.png", "wb") as f: + with open("page.png", "wb") as f: f.write(base64.b64decode(result.screenshot)) ``` -## Usage Examples - -### Basic Content Access +### 5.4 **`pdf`** *(Optional[bytes])* +**What**: Raw PDF bytes if `pdf=True` in `CrawlerRunConfig`. +**Usage**: ```python -async with AsyncWebCrawler() as crawler: - result = await crawler.arun(url="https://example.com") +if result.pdf: + with open("page.pdf", "wb") as f: + f.write(result.pdf) +``` + +### 5.5 **`metadata`** *(Optional[dict])* +**What**: Page-level metadata if discovered (title, description, OG data, etc.). +**Usage**: +```python +if result.metadata: + print("Title:", result.metadata.get("title")) + print("Author:", result.metadata.get("author")) +``` + +--- + +## 6. Example: Accessing Everything + +```python +async def handle_result(result: CrawlResult): + if not result.success: + print("Crawl error:", result.error_message) + return - if result.success: - # Get clean content - print(result.fit_markdown) - - # Process images - for image in result.media["images"]: - if image["score"] > 7: - print(f"High-quality image: {image['src']}") + # Basic info + print("Crawled URL:", result.url) + print("Status code:", result.status_code) + + # HTML + print("Original HTML size:", len(result.html)) + print("Cleaned HTML size:", len(result.cleaned_html or "")) + + # Markdown output + if result.markdown_v2: + print("Raw Markdown:", result.markdown_v2.raw_markdown[:300]) + print("Citations Markdown:", result.markdown_v2.markdown_with_citations[:300]) + if result.markdown_v2.fit_markdown: + print("Fit Markdown:", result.markdown_v2.fit_markdown[:200]) + else: + print("Raw Markdown (legacy):", result.markdown[:200] if result.markdown else "N/A") + + # Media & Links + if "images" in result.media: + print("Image count:", len(result.media["images"])) + if "internal" in result.links: + print("Internal link count:", len(result.links["internal"])) + + # Extraction strategy result + if result.extracted_content: + print("Structured data:", result.extracted_content) + + # Screenshot/PDF + if result.screenshot: + print("Screenshot length:", len(result.screenshot)) + if result.pdf: + print("PDF bytes length:", len(result.pdf)) ``` -### Complete Data Processing -```python -async def process_webpage(url: str) -> Dict: - async with AsyncWebCrawler() as crawler: - result = await crawler.arun(url=url) - - if not result.success: - raise Exception(f"Crawl failed: {result.error_message}") - - return { - "content": result.fit_markdown, - "images": [ - img for img in result.media["images"] - if img["score"] > 5 - ], - "internal_links": [ - link["href"] for link in result.links["internal"] - ], - "metadata": result.metadata, - "status": result.status_code - } -``` +--- -### Error Handling -```python -async def safe_crawl(url: str) -> Dict: - async with AsyncWebCrawler() as crawler: - try: - result = await crawler.arun(url=url) - - if not result.success: - return { - "success": False, - "error": result.error_message, - "status": result.status_code - } - - return { - "success": True, - "content": result.fit_markdown, - "status": result.status_code - } - - except Exception as e: - return { - "success": False, - "error": str(e), - "status": None - } -``` +## 7. Key Points & Future -## Best Practices +1. 
**`markdown_v2` vs `markdown`**
+   - Right now, `markdown_v2` is the more robust container (`MarkdownGenerationResult`), providing **raw_markdown**, **markdown_with_citations**, references, plus possible **fit_markdown**.
+   - In future versions, everything will unify under **`markdown`**. If you rely on advanced features (citations, fit content), check `markdown_v2`.
+
+2. **Fit Content**
+   - **`fit_markdown`** and **`fit_html`** appear only if you used a content filter (like **PruningContentFilter** or **BM25ContentFilter**) inside your **MarkdownGenerationStrategy** or set them directly.
+   - If no filter is used, they remain `None`.

-2. **Use fit_markdown for Articles**
-```python
-# Better for article content
-content = result.fit_markdown if result.fit_markdown else result.markdown
-```
+3. **References & Citations**
+   - If you enable link citations in your `DefaultMarkdownGenerator` (`options={"citations": True}`), you’ll see `markdown_with_citations` plus a **`references_markdown`** block. This helps large language models or academic-like referencing.

-3. **Filter Media by Score**
-```python
-relevant_images = [
-    img for img in result.media["images"]
-    if img["score"] > 5
-]
-```
+4. **Links & Media**
+   - `links["internal"]` and `links["external"]` group discovered anchors by domain.
+   - `media["images"]` / `["videos"]` / `["audios"]` store extracted media elements with optional scoring or context.

-4. **Handle Missing Data**
-```python
-metadata = result.metadata or {}
-title = metadata.get('title', 'Unknown Title')
-```
\ No newline at end of file
+5. **Error Cases**
+   - If `success=False`, check `error_message` (e.g., timeouts, invalid URLs).
+   - `status_code` might be `None` if we failed before an HTTP response.
+
+Use **`CrawlResult`** to glean all final outputs and feed them into your data pipelines, AI models, or archives. With the synergy of a properly configured **BrowserConfig** and **CrawlerRunConfig**, the crawler can produce robust, structured results here in **`CrawlResult`**.
\ No newline at end of file
diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md
index c1c4d2ea..7645084c 100644
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -1,36 +1,226 @@
-# Parameter Reference Table
+# 1. **BrowserConfig** – Controlling the Browser
+
+`BrowserConfig` focuses on **how** the browser is launched and behaves. This includes headless mode, proxies, user agents, and other environment tweaks.
+
+```python
+from crawl4ai import AsyncWebCrawler, BrowserConfig
+
+browser_cfg = BrowserConfig(
+    browser_type="chromium",
+    headless=True,
+    viewport_width=1280,
+    viewport_height=720,
+    proxy="http://user:pass@proxy:8080",
+    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36",
+)
+```
+
+## 1.1 Parameter Highlights
+
+| **Parameter** | **Type / Default** | **What It Does** |
+|-----------------------|----------------------------------------|------------------|
+| **`browser_type`** | `"chromium"`, `"firefox"`, `"webkit"` *(default: `"chromium"`)* | Which browser engine to use. `"chromium"` is typical for many sites, `"firefox"` or `"webkit"` for specialized tests. |
+| **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. |
+| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
+| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
+| **`proxy`** | `str` (default: `None`) | Single-proxy URL if you want all traffic to go through it, e.g. `"http://user:pass@proxy:8080"`. |
+| **`proxy_config`** | `dict` (default: `None`) | For advanced or multi-proxy needs, specify details like `{"server": "...", "username": "...", ...}`. |
+| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
+| **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. |
+| **`ignore_https_errors`** | `bool` (default: `True`) | If `True`, continues despite invalid certificates (common in dev/staging). |
+| **`java_script_enabled`** | `bool` (default: `True`) | Disable if you want no JS overhead, or if only static content is needed. |
+| **`cookies`** | `list` (default: `[]`) | Pre-set cookies, each a dict like `{"name": "session", "value": "...", "url": "..."}`. |
+| **`headers`** | `dict` (default: `{}`) | Extra HTTP headers for every request, e.g. `{"Accept-Language": "en-US"}`. |
+| **`user_agent`** | `str` (default: Chrome-based UA) | Your custom or random user agent. `user_agent_mode="random"` can shuffle it. |
+| **`light_mode`** | `bool` (default: `False`) | Disables some background features for performance gains. |
+| **`text_mode`** | `bool` (default: `False`) | If `True`, tries to disable images/other heavy content for speed. |
+| **`use_managed_browser`** | `bool` (default: `False`) | For advanced “managed” interactions (debugging, CDP usage). Typically set automatically if persistent context is on. |
+| **`extra_args`** | `list` (default: `[]`) | Additional flags for the underlying browser process, e.g. `["--disable-extensions"]`. |
+
+**Tips**:
+- Set `headless=False` to visually **debug** how pages load or how interactions proceed.
+- If you need **authentication** storage or repeated sessions, consider `use_persistent_context=True` and specify `user_data_dir` (see the sketch after these tips).
+- For large pages, you might need a bigger `viewport_width` and `viewport_height` to handle dynamic content.
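+
+For instance, a minimal persistent-profile sketch (the profile path is a placeholder):
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig
+
+browser_cfg = BrowserConfig(
+    use_persistent_context=True,      # implies use_managed_browser=True
+    user_data_dir="/path/to/profile"  # cookies/sessions persist here between runs
+)
+
+async def main():
+    async with AsyncWebCrawler(config=browser_cfg) as crawler:
+        # Logged-in state stored in the profile is reused on later runs
+        result = await crawler.arun("https://example.com/account")
+        print(result.success)
+
+asyncio.run(main())
+```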
+
+---
+
+# 2. **CrawlerRunConfig** – Controlling Each Crawl
+
+While `BrowserConfig` sets up the **environment**, `CrawlerRunConfig` details **how** each **crawl operation** should behave: caching, content filtering, link or domain blocking, timeouts, JavaScript code, etc.
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+run_cfg = CrawlerRunConfig(
+    wait_for="css:.main-content",
+    word_count_threshold=15,
+    excluded_tags=["nav", "footer"],
+    exclude_external_links=True,
+)
+```
+
+## 2.1 Parameter Highlights
+
+We group them by category.
+
+### A) **Content Processing**
+
+| **Parameter** | **Type / Default** | **What It Does** |
+|------------------------------|--------------------------------------|-------------------------------------------------------------------------------------------------|
+| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
+| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
+| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). |
+| **`content_filter`** | `RelevantContentFilter` (None) | Filters out irrelevant text blocks. E.g., `PruningContentFilter` or `BM25ContentFilter`. |
+| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. |
+| **`excluded_tags`** | `list` (None) | Removes entire tags (e.g. `["script", "style"]`). |
+| **`excluded_selector`** | `str` (None) | Like `css_selector` but to exclude. E.g. `"#ads, .tracker"`. |
+| **`only_text`** | `bool` (False) | If `True`, tries to extract text-only content. |
+| **`prettiify`** | `bool` (False) | If `True`, beautifies final HTML (slower, purely cosmetic). |
+| **`keep_data_attributes`** | `bool` (False) | If `True`, preserve `data-*` attributes in cleaned HTML. |
+| **`remove_forms`** | `bool` (False) | If `True`, remove all `<form>` elements. |
+
+---
+
+### B) **Caching & Session**
+
+| **Parameter** | **Type / Default** | **What It Does** |
+|-------------------------|------------------------|------------------------------------------------------------------------------------------------------------------------------|
+| **`cache_mode`** | `CacheMode or None` | Controls how caching is handled (`ENABLED`, `BYPASS`, `DISABLED`, etc.). If `None`, typically defaults to `ENABLED`. |
+| **`session_id`** | `str or None` | Assign a unique ID to reuse a single browser session across multiple `arun()` calls. |
+| **`bypass_cache`** | `bool` (False) | If `True`, acts like `CacheMode.BYPASS`. |
+| **`disable_cache`** | `bool` (False) | If `True`, acts like `CacheMode.DISABLED`. |
+| **`no_cache_read`** | `bool` (False) | If `True`, acts like `CacheMode.WRITE_ONLY` (writes cache but never reads). |
+| **`no_cache_write`** | `bool` (False) | If `True`, acts like `CacheMode.READ_ONLY` (reads cache but never writes). |
+
+Use these for controlling whether you read or write from a local content cache. Handy for large batch crawls or repeated site visits.
+
+---
+
+### C) **Page Navigation & Timing**
+
+| **Parameter** | **Type / Default** | **What It Does** |
+|----------------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------|
+| **`wait_until`** | `str` (domcontentloaded)| Condition for navigation to “complete”. Often `"networkidle"` or `"domcontentloaded"`. |
+| **`page_timeout`** | `int` (60000 ms) | Timeout for page navigation or JS steps. Increase for slow sites. |
+| **`wait_for`** | `str or None` | Wait for a CSS (`"css:selector"`) or JS (`"js:() => bool"`) condition before content extraction. |
+| **`wait_for_images`** | `bool` (False) | Wait for images to load before finishing. Slows down if you only want text. |
+| **`delay_before_return_html`** | `float` (0.1) | Additional pause (seconds) before final HTML is captured. Good for last-second updates. |
+| **`mean_delay`** and **`max_range`** | `float` (0.1, 0.3) | If you call `arun_many()`, these define random delay intervals between crawls, helping avoid detection or rate limits. |
+| **`semaphore_count`** | `int` (5) | Max concurrency for `arun_many()`. Increase if you have resources for parallel crawls. |
+
+---
+
+### D) **Page Interaction**
+
+| **Parameter** | **Type / Default** | **What It Does** |
+|----------------------------|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
+| **`js_code`** | `str or list[str]` (None) | JavaScript to run after load. E.g. `"document.querySelector('button')?.click();"`. |
+| **`js_only`** | `bool` (False) | If `True`, indicates we’re reusing an existing session and only applying JS. No full reload. |
+| **`ignore_body_visibility`** | `bool` (True) | Skip checking if `<body>` is visible. Usually best to keep `True`. |
+| **`scan_full_page`** | `bool` (False) | If `True`, auto-scroll the page to load dynamic content (infinite scroll). |
+| **`scroll_delay`** | `float` (0.2) | Delay between scroll steps if `scan_full_page=True`. |
+| **`process_iframes`** | `bool` (False) | Inlines iframe content for single-page extraction. |
+| **`remove_overlay_elements`** | `bool` (False) | Removes potential modals/popups blocking the main content. |
+| **`simulate_user`** | `bool` (False) | Simulate user interactions (mouse movements) to avoid bot detection. |
+| **`override_navigator`** | `bool` (False) | Override `navigator` properties in JS for stealth. |
+| **`magic`** | `bool` (False) | Automatic handling of popups/consent banners. Experimental. |
+| **`adjust_viewport_to_content`** | `bool` (False) | Resizes viewport to match page content height. |
+
+If your page is a single-page app with repeated JS updates, set `js_only=True` in subsequent calls, plus a `session_id` for reusing the same tab.
+
+---
+
+### E) **Media Handling**
+
+| **Parameter** | **Type / Default** | **What It Does** |
+|--------------------------------------------|---------------------|-----------------------------------------------------------------------------------------------------------|
+| **`screenshot`** | `bool` (False) | Capture a screenshot (base64) in `result.screenshot`. |
+| **`screenshot_wait_for`** | `float or None` | Extra wait time before the screenshot. |
+| **`screenshot_height_threshold`** | `int` (~20000) | If the page is taller than this, alternate screenshot strategies are used. |
+| **`pdf`** | `bool` (False) | If `True`, returns a PDF in `result.pdf`. |
+| **`image_description_min_word_threshold`** | `int` (~50) | Minimum words for an image’s alt text or description to be considered valid. |
+| **`image_score_threshold`** | `int` (~3) | Filter out low-scoring images. The crawler scores images by relevance (size, context, etc.). |
+| **`exclude_external_images`** | `bool` (False) | Exclude images from other domains. |
+
+---
+
+### F) **Link/Domain Handling**
+
+| **Parameter** | **Type / Default** | **What It Does** |
+|------------------------------|-------------------------|-----------------------------------------------------------------------------------------------------------------------------|
+| **`exclude_social_media_domains`** | `list` (e.g. Facebook/Twitter) | A default list can be extended. Any link to these domains is removed from final output. |
| +| **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. | +| **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). | +| **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). | + +Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains). + +--- + +### G) **Debug & Logging** + +| **Parameter** | **Type / Default** | **What It Does** | +|----------------|--------------------|---------------------------------------------------------------------------| +| **`verbose`** | `bool` (True) | Prints logs detailing each step of crawling, interactions, or errors. | +| **`log_console`** | `bool` (False) | Logs the page’s JavaScript console output if you want deeper JS debugging.| + +--- + +## 2.2 Example Usage + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode + +async def main(): + # Configure the browser + browser_cfg = BrowserConfig( + headless=False, + viewport_width=1280, + viewport_height=720, + proxy="http://user:pass@myproxy:8080", + text_mode=True + ) + + # Configure the run + run_cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + session_id="my_session", + css_selector="main.article", + excluded_tags=["script", "style"], + exclude_external_links=True, + wait_for="css:.article-loaded", + screenshot=True + ) + + async with AsyncWebCrawler(config=browser_cfg) as crawler: + result = await crawler.arun( + url="https://example.com/news", + config=run_cfg + ) + if result.success: + print("Final cleaned_html length:", len(result.cleaned_html)) + if result.screenshot: + print("Screenshot captured (base64, length):", len(result.screenshot)) + else: + print("Crawl failed:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**What’s Happening**: +- **`text_mode=True`** avoids loading images and other heavy resources, speeding up the crawl. +- We disable caching (`cache_mode=CacheMode.BYPASS`) to always fetch fresh content. +- We only keep `main.article` content by specifying `css_selector="main.article"`. +- We exclude external links (`exclude_external_links=True`). +- We do a quick screenshot (`screenshot=True`) before finishing. + +--- + +## 3. Putting It All Together + +- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent. +- **Use** `CrawlerRunConfig` for each crawl’s **context**: how to filter content, handle caching, wait for dynamic elements, or run JS. +- **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`). 
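Table C above references `arun_many()` for batch crawling. As a rough sketch of how `semaphore_count`, `mean_delay`, and `max_range` combine (the URLs here are placeholders, and it assumes `arun_many()` accepts a URL list plus the same `CrawlerRunConfig` object):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

async def main():
    # Throttled batch crawl: semaphore_count caps concurrency, while
    # mean_delay/max_range add a randomized pause between requests.
    run_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,  # reuse cached pages on repeat runs
        semaphore_count=3,             # at most 3 pages in flight
        mean_delay=0.5,                # average delay (seconds) between crawls
        max_range=1.0,                 # upper bound for the random jitter
    )
    urls = [
        "https://example.com/page-1",  # placeholder URLs
        "https://example.com/page-2",
        "https://example.com/page-3",
    ]
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(urls=urls, config=run_cfg)
        for res in results:
            status = "ok" if res.success else f"failed: {res.error_message}"
            print(res.url, "->", status)

if __name__ == "__main__":
    asyncio.run(main())
```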
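The `js_only` + `session_id` pattern from the Page Interaction table (D) is easiest to see in code. Below is a minimal session-reuse sketch; the URL and selectors are placeholders, and only the config fields documented above are assumed:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

async def main():
    async with AsyncWebCrawler() as crawler:
        # First call: full page load; session_id keeps the tab alive afterwards.
        first_cfg = CrawlerRunConfig(
            session_id="spa_session",
            wait_for="css:.item",  # placeholder selector
            cache_mode=CacheMode.BYPASS,
        )
        result = await crawler.arun(url="https://example.com/feed", config=first_cfg)

        # Second call: js_only=True reuses the same tab and only runs JS,
        # with no full navigation or reload.
        next_cfg = CrawlerRunConfig(
            session_id="spa_session",
            js_only=True,
            js_code="document.querySelector('.load-more')?.click();",
            wait_for="css:.item:nth-child(20)",
        )
        result = await crawler.arun(url="https://example.com/feed", config=next_cfg)
        print("HTML length after load-more:", len(result.cleaned_html))

if __name__ == "__main__":
    asyncio.run(main())
```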
-| File Name | Parameter Name | Code Usage | Strategy/Class | Description | -|-----------|---------------|------------|----------------|-------------| -| async_crawler_strategy.py | user_agent | `kwargs.get("user_agent")` | AsyncPlaywrightCrawlerStrategy | User agent string for browser identification | -| async_crawler_strategy.py | proxy | `kwargs.get("proxy")` | AsyncPlaywrightCrawlerStrategy | Proxy server configuration for network requests | -| async_crawler_strategy.py | proxy_config | `kwargs.get("proxy_config")` | AsyncPlaywrightCrawlerStrategy | Detailed proxy configuration including auth | -| async_crawler_strategy.py | headless | `kwargs.get("headless", True)` | AsyncPlaywrightCrawlerStrategy | Whether to run browser in headless mode | -| async_crawler_strategy.py | browser_type | `kwargs.get("browser_type", "chromium")` | AsyncPlaywrightCrawlerStrategy | Type of browser to use (chromium/firefox/webkit) | -| async_crawler_strategy.py | headers | `kwargs.get("headers", {})` | AsyncPlaywrightCrawlerStrategy | Custom HTTP headers for requests | -| async_crawler_strategy.py | verbose | `kwargs.get("verbose", False)` | AsyncPlaywrightCrawlerStrategy | Enable detailed logging output | -| async_crawler_strategy.py | sleep_on_close | `kwargs.get("sleep_on_close", False)` | AsyncPlaywrightCrawlerStrategy | Add delay before closing browser | -| async_crawler_strategy.py | use_managed_browser | `kwargs.get("use_managed_browser", False)` | AsyncPlaywrightCrawlerStrategy | Use managed browser instance | -| async_crawler_strategy.py | user_data_dir | `kwargs.get("user_data_dir", None)` | AsyncPlaywrightCrawlerStrategy | Custom directory for browser profile data | -| async_crawler_strategy.py | session_id | `kwargs.get("session_id")` | AsyncPlaywrightCrawlerStrategy | Unique identifier for browser session | -| async_crawler_strategy.py | override_navigator | `kwargs.get("override_navigator", False)` | AsyncPlaywrightCrawlerStrategy | Override browser navigator properties | -| async_crawler_strategy.py | simulate_user | `kwargs.get("simulate_user", False)` | AsyncPlaywrightCrawlerStrategy | Simulate human-like behavior | -| async_crawler_strategy.py | magic | `kwargs.get("magic", False)` | AsyncPlaywrightCrawlerStrategy | Enable advanced anti-detection features | -| async_crawler_strategy.py | log_console | `kwargs.get("log_console", False)` | AsyncPlaywrightCrawlerStrategy | Log browser console messages | -| async_crawler_strategy.py | js_only | `kwargs.get("js_only", False)` | AsyncPlaywrightCrawlerStrategy | Only execute JavaScript without page load | -| async_crawler_strategy.py | page_timeout | `kwargs.get("page_timeout", 60000)` | AsyncPlaywrightCrawlerStrategy | Timeout for page load in milliseconds | -| async_crawler_strategy.py | ignore_body_visibility | `kwargs.get("ignore_body_visibility", True)` | AsyncPlaywrightCrawlerStrategy | Process page even if body is hidden | -| async_crawler_strategy.py | js_code | `kwargs.get("js_code", kwargs.get("js", self.js_code))` | AsyncPlaywrightCrawlerStrategy | Custom JavaScript code to execute | -| async_crawler_strategy.py | wait_for | `kwargs.get("wait_for")` | AsyncPlaywrightCrawlerStrategy | Wait for specific element/condition | -| async_crawler_strategy.py | process_iframes | `kwargs.get("process_iframes", False)` | AsyncPlaywrightCrawlerStrategy | Extract content from iframes | -| async_crawler_strategy.py | delay_before_return_html | `kwargs.get("delay_before_return_html")` | AsyncPlaywrightCrawlerStrategy | Additional delay before 
returning HTML | -| async_crawler_strategy.py | remove_overlay_elements | `kwargs.get("remove_overlay_elements", False)` | AsyncPlaywrightCrawlerStrategy | Remove pop-ups and overlay elements | -| async_crawler_strategy.py | screenshot | `kwargs.get("screenshot")` | AsyncPlaywrightCrawlerStrategy | Take page screenshot | -| async_crawler_strategy.py | screenshot_wait_for | `kwargs.get("screenshot_wait_for")` | AsyncPlaywrightCrawlerStrategy | Wait before taking screenshot | -| async_crawler_strategy.py | semaphore_count | `kwargs.get("semaphore_count", 5)` | AsyncPlaywrightCrawlerStrategy | Concurrent request limit | -| async_webcrawler.py | verbose | `kwargs.get("verbose", False)` | AsyncWebCrawler | Enable detailed logging | -| async_webcrawler.py | warmup | `kwargs.get("warmup", True)` | AsyncWebCrawler | Initialize crawler with warmup request | -| async_webcrawler.py | session_id | `kwargs.get("session_id", None)` | AsyncWebCrawler | Session identifier for browser reuse | -| async_webcrawler.py | only_text | `kwargs.get("only_text", False)` | AsyncWebCrawler | Extract only text content | -| async_webcrawler.py | bypass_cache | `kwargs.get("bypass_cache", False)` | AsyncWebCrawler | Skip cache and force fresh crawl | -| async_webcrawler.py | cache_mode | `kwargs.get("cache_mode", CacheMode.ENABLE)` | AsyncWebCrawler | Cache handling mode for request | \ No newline at end of file diff --git a/docs/md_v2/api/strategies.md b/docs/md_v2/api/strategies.md index f0f8f57c..06b757d4 100644 --- a/docs/md_v2/api/strategies.md +++ b/docs/md_v2/api/strategies.md @@ -218,12 +218,12 @@ result = await crawler.arun( ## Best Practices -1. **Choose the Right Strategy** +1. **Choose the Right Strategy** - Use `LLMExtractionStrategy` for complex, unstructured content - Use `JsonCssExtractionStrategy` for well-structured HTML - Use `CosineStrategy` for content similarity and clustering -2. **Optimize Chunking** +2. **Optimize Chunking** ```python # For long documents strategy = LLMExtractionStrategy( @@ -232,7 +232,7 @@ result = await crawler.arun( ) ``` -3. **Handle Errors** +3. **Handle Errors** ```python try: result = await crawler.arun( @@ -245,7 +245,7 @@ result = await crawler.arun( print(f"Extraction failed: {e}") ``` -4. **Monitor Performance** +4. 
**Monitor Performance** ```python strategy = CosineStrategy( verbose=True, # Enable logging diff --git a/docs/md_v2/assets/images/dispatcher.png b/docs/md_v2/assets/images/dispatcher.png new file mode 100644 index 00000000..37e06972 Binary files /dev/null and b/docs/md_v2/assets/images/dispatcher.png differ diff --git a/docs/md_v2/assets/styles.css b/docs/md_v2/assets/styles.css index 68a93f5d..1aed2822 100644 --- a/docs/md_v2/assets/styles.css +++ b/docs/md_v2/assets/styles.css @@ -7,6 +7,7 @@ :root { --global-font-size: 16px; + --global-code-font-size: 14px; --global-line-height: 1.5em; --global-space: 10px; --font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono, @@ -20,6 +21,7 @@ --invert-font-color: #151515; /* Dark color for inverted elements */ --primary-color: #1a95e0; /* Primary color can remain the same or be adjusted for better contrast */ --secondary-color: #727578; /* Secondary color for less important text */ + --secondary-dimmed-color: #8b857a; /* Dimmed secondary color */ --error-color: #ff5555; /* Bright color for errors */ --progress-bar-background: #444; /* Darker background for progress bar */ --progress-bar-fill: #1a95e0; /* Bright color for progress bar fill */ @@ -37,8 +39,9 @@ --secondary-color: #a3abba; --secondary-color: #d5cec0; --tertiary-color: #a3abba; - --primary-color: #09b5a5; /* Updated to the brand color */ + --primary-dimmed-color: #09b5a5; /* Updated to the brand color */ --primary-color: #50ffff; /* Updated to the brand color */ + --accent-color: rgb(243, 128, 245); --error-color: #ff3c74; --progress-bar-background: #3f3f44; --progress-bar-fill: #09b5a5; /* Updated to the brand color */ @@ -80,10 +83,16 @@ pre, code { line-height: var(--global-line-height); } -strong, +strong { + /* color : var(--primary-dimmed-color); */ + /* background-color: #50ffff17; */ + text-shadow: 0 0 0px var(--font-color), 0 0 0px var(--font-color); +} + .highlight { /* background: url(//s2.svgbox.net/pen-brushes.svg?ic=brush-1&color=50ffff); */ - background-color: #50ffff33; + background-color: #50ffff17; + } .terminal-card > header { @@ -157,4 +166,71 @@ ol li::before { counter-increment: item; /* float: left; */ /* padding-right: 5px; */ +} + + +/* 8 TERMINAL CSS */ + +.terminal code { + font-size: var(--global-code-font-size); + background: var(--block-background-color); + /* color: var(--secondary-color); */ + color: var(--primary-dimmed-color); +} + +.terminal pre code { + background: var(--block-background-color); + color: var(--secondary-color); +} + +.hljs-keyword, .hljs-selector-tag, .hljs-built_in, .hljs-name, .hljs-tag { + color: var(--accent-color); +} +.hljs-string { + color: var(--primary-dimmed-color); +} +.hljs-comment { + color: var(--secondary-dimmed-color); + font-style: italic; + font-size: 0.9em; +} +.hljs-number { + color: var(--primary-dimmed-color); +} + +.terminal strong > code, .terminal h2 > code , .terminal h3 > code { + background-color: transparent; + /* color: var(--font-color); */ + color: var(--primary-dimmed-color); + text-shadow: none; +} + +blockquote { + background-color: var(--invert-font-color); + padding: 1em 2em; + border-left: 2px solid var(--primary-dimmed-color); +} + +blockquote::after { + content: "💡"; + white-space: pre; + position: absolute; + top: 1em; + left: 5px; + line-height: var(--global-line-height); + color: #9ca2ab; +} + +pre { + display: block; + word-break: break-word; + word-wrap: break-word; +} + +.terminal h1 { + font-size: 2em; +} + +.terminal h1, .terminal h2, 
.terminal h3, .terminal h4, .terminal h5, .terminal h6 { + text-shadow: 0 0 0px var(--font-color), 0 0 0px var(--font-color), 0 0 0px var(--font-color); } \ No newline at end of file diff --git a/docs/md_v2/basic/browser-config.md b/docs/md_v2/basic/browser-config.md deleted file mode 100644 index 7df4a97b..00000000 --- a/docs/md_v2/basic/browser-config.md +++ /dev/null @@ -1,208 +0,0 @@ -# Browser Configuration - -Crawl4AI supports multiple browser engines and offers extensive configuration options for browser behavior. - -## Browser Types - -Choose from three browser engines: - -```python -# Chromium (default) -async with AsyncWebCrawler(browser_type="chromium") as crawler: - result = await crawler.arun(url="https://example.com") - -# Firefox -async with AsyncWebCrawler(browser_type="firefox") as crawler: - result = await crawler.arun(url="https://example.com") - -# WebKit -async with AsyncWebCrawler(browser_type="webkit") as crawler: - result = await crawler.arun(url="https://example.com") -``` - -## Basic Configuration - -Common browser settings: - -```python -async with AsyncWebCrawler( - headless=True, # Run in headless mode (no GUI) - verbose=True, # Enable detailed logging - sleep_on_close=False # No delay when closing browser -) as crawler: - result = await crawler.arun(url="https://example.com") -``` - -## Identity Management - -Control how your crawler appears to websites: - -```python -# Custom user agent -async with AsyncWebCrawler( - user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" -) as crawler: - result = await crawler.arun(url="https://example.com") - -# Custom headers -headers = { - "Accept-Language": "en-US,en;q=0.9", - "Cache-Control": "no-cache" -} -async with AsyncWebCrawler(headers=headers) as crawler: - result = await crawler.arun(url="https://example.com") -``` - -## Screenshot Capabilities - -Capture page screenshots with enhanced error handling: - -```python -result = await crawler.arun( - url="https://example.com", - screenshot=True, # Enable screenshot - screenshot_wait_for=2.0 # Wait 2 seconds before capture -) - -if result.screenshot: # Base64 encoded image - import base64 - with open("screenshot.png", "wb") as f: - f.write(base64.b64decode(result.screenshot)) -``` - -## Timeouts and Waiting - -Control page loading behavior: - -```python -result = await crawler.arun( - url="https://example.com", - page_timeout=60000, # Page load timeout (ms) - delay_before_return_html=2.0, # Wait before content capture - wait_for="css:.dynamic-content" # Wait for specific element -) -``` - -## JavaScript Execution - -Execute custom JavaScript before crawling: - -```python -# Single JavaScript command -result = await crawler.arun( - url="https://example.com", - js_code="window.scrollTo(0, document.body.scrollHeight);" -) - -# Multiple commands -js_commands = [ - "window.scrollTo(0, document.body.scrollHeight);", - "document.querySelector('.load-more').click();" -] -result = await crawler.arun( - url="https://example.com", - js_code=js_commands -) -``` - -## Proxy Configuration - -Use proxies for enhanced access: - -```python -# Simple proxy -async with AsyncWebCrawler( - proxy="http://proxy.example.com:8080" -) as crawler: - result = await crawler.arun(url="https://example.com") - -# Proxy with authentication -proxy_config = { - "server": "http://proxy.example.com:8080", - "username": "user", - "password": "pass" -} -async with AsyncWebCrawler(proxy_config=proxy_config) as crawler: - result = await crawler.arun(url="https://example.com") -``` - -## 
Anti-Detection Features - -Enable stealth features to avoid bot detection: - -```python -result = await crawler.arun( - url="https://example.com", - simulate_user=True, # Simulate human behavior - override_navigator=True, # Mask automation signals - magic=True # Enable all anti-detection features -) -``` - -## Handling Dynamic Content - -Configure browser to handle dynamic content: - -```python -# Wait for dynamic content -result = await crawler.arun( - url="https://example.com", - wait_for="js:() => document.querySelector('.content').children.length > 10", - process_iframes=True # Process iframe content -) - -# Handle lazy-loaded images -result = await crawler.arun( - url="https://example.com", - js_code="window.scrollTo(0, document.body.scrollHeight);", - delay_before_return_html=2.0 # Wait for images to load -) -``` - -## Comprehensive Example - -Here's how to combine various browser configurations: - -```python -async def crawl_with_advanced_config(url: str): - async with AsyncWebCrawler( - # Browser setup - browser_type="chromium", - headless=True, - verbose=True, - - # Identity - user_agent="Custom User Agent", - headers={"Accept-Language": "en-US"}, - - # Proxy setup - proxy="http://proxy.example.com:8080" - ) as crawler: - result = await crawler.arun( - url=url, - # Content handling - process_iframes=True, - screenshot=True, - - # Timing - page_timeout=60000, - delay_before_return_html=2.0, - - # Anti-detection - magic=True, - simulate_user=True, - - # Dynamic content - js_code=[ - "window.scrollTo(0, document.body.scrollHeight);", - "document.querySelector('.load-more')?.click();" - ], - wait_for="css:.dynamic-content" - ) - - return { - "content": result.markdown, - "screenshot": result.screenshot, - "success": result.success - } -``` \ No newline at end of file diff --git a/docs/md_v2/basic/content-selection.md b/docs/md_v2/basic/content-selection.md deleted file mode 100644 index ec838f2d..00000000 --- a/docs/md_v2/basic/content-selection.md +++ /dev/null @@ -1,135 +0,0 @@ -### Content Selection - -Crawl4AI provides multiple ways to select and filter specific content from webpages. Learn how to precisely target the content you need. 
- -#### CSS Selectors - -Extract specific content using a `CrawlerRunConfig` with CSS selectors: - -```python -from crawl4ai.async_configs import CrawlerRunConfig - -config = CrawlerRunConfig(css_selector=".main-article") # Target main article content -result = await crawler.arun(url="https://crawl4ai.com", config=config) - -config = CrawlerRunConfig(css_selector="article h1, article .content") # Target heading and content -result = await crawler.arun(url="https://crawl4ai.com", config=config) -``` - -#### Content Filtering - -Control content inclusion or exclusion with `CrawlerRunConfig`: - -```python -config = CrawlerRunConfig( - word_count_threshold=10, # Minimum words per block - excluded_tags=['form', 'header', 'footer', 'nav'], # Excluded tags - exclude_external_links=True, # Remove external links - exclude_social_media_links=True, # Remove social media links - exclude_external_images=True # Remove external images -) - -result = await crawler.arun(url="https://crawl4ai.com", config=config) -``` - -#### Iframe Content - -Process iframe content by enabling specific options in `CrawlerRunConfig`: - -```python -config = CrawlerRunConfig( - process_iframes=True, # Extract iframe content - remove_overlay_elements=True # Remove popups/modals that might block iframes -) - -result = await crawler.arun(url="https://crawl4ai.com", config=config) -``` - -#### Structured Content Selection Using LLMs - -Leverage LLMs for intelligent content extraction: - -```python -from crawl4ai.extraction_strategy import LLMExtractionStrategy -from pydantic import BaseModel -from typing import List - -class ArticleContent(BaseModel): - title: str - main_points: List[str] - conclusion: str - -strategy = LLMExtractionStrategy( - provider="ollama/nemotron", - schema=ArticleContent.schema(), - instruction="Extract the main article title, key points, and conclusion" -) - -config = CrawlerRunConfig(extraction_strategy=strategy) - -result = await crawler.arun(url="https://crawl4ai.com", config=config) -article = json.loads(result.extracted_content) -``` - -#### Pattern-Based Selection - -Extract content matching repetitive patterns: - -```python -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy - -schema = { - "name": "News Articles", - "baseSelector": "article.news-item", - "fields": [ - {"name": "headline", "selector": "h2", "type": "text"}, - {"name": "summary", "selector": ".summary", "type": "text"}, - {"name": "category", "selector": ".category", "type": "text"}, - { - "name": "metadata", - "type": "nested", - "fields": [ - {"name": "author", "selector": ".author", "type": "text"}, - {"name": "date", "selector": ".date", "type": "text"} - ] - } - ] -} - -strategy = JsonCssExtractionStrategy(schema) -config = CrawlerRunConfig(extraction_strategy=strategy) - -result = await crawler.arun(url="https://crawl4ai.com", config=config) -articles = json.loads(result.extracted_content) -``` - -#### Comprehensive Example - -Combine different selection methods using `CrawlerRunConfig`: - -```python -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig - -async def extract_article_content(url: str): - # Define structured extraction - article_schema = { - "name": "Article", - "baseSelector": "article.main", - "fields": [ - {"name": "title", "selector": "h1", "type": "text"}, - {"name": "content", "selector": ".content", "type": "text"} - ] - } - - # Define configuration - config = CrawlerRunConfig( - extraction_strategy=JsonCssExtractionStrategy(article_schema), - word_count_threshold=10, - 
excluded_tags=['nav', 'footer'], - exclude_external_links=True - ) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun(url=url, config=config) - return json.loads(result.extracted_content) -``` diff --git a/docs/md_v2/basic/content_filtering.md b/docs/md_v2/basic/content_filtering.md deleted file mode 100644 index 14f48ec6..00000000 --- a/docs/md_v2/basic/content_filtering.md +++ /dev/null @@ -1,83 +0,0 @@ -# Content Filtering in Crawl4AI - -This guide explains how to use content filtering strategies in Crawl4AI to extract the most relevant information from crawled web pages. You'll learn how to use the built-in `BM25ContentFilter` and how to create your own custom content filtering strategies. - -## Relevance Content Filter - -The `RelevanceContentFilter` is an abstract class providing a common interface for content filtering strategies. Specific algorithms, like `PruningContentFilter` or `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. - -## Pruning Content Filter - -The `PruningContentFilter` removes less relevant nodes based on metrics like text density, link density, and tag importance. Nodes that fall below a defined threshold are pruned, leaving only high-value content. - -### Usage - -```python -from crawl4ai.async_configs import CrawlerRunConfig -from crawl4ai.content_filter_strategy import PruningContentFilter - -config = CrawlerRunConfig( - content_filter=PruningContentFilter( - min_word_threshold=5, - threshold_type='dynamic', - threshold=0.45 - ), - fit_markdown=True # Activates markdown fitting -) - -result = await crawler.arun(url="https://example.com", config=config) - -if result.success: - print(f"Cleaned Markdown:\n{result.fit_markdown}") -``` - -### Parameters - -- **`min_word_threshold`**: (Optional) Minimum number of words a node must contain to be considered relevant. Nodes with fewer words are automatically pruned. -- **`threshold_type`**: (Optional, default 'fixed') Controls how pruning thresholds are calculated: - - `'fixed'`: Uses a constant threshold value for all nodes. - - `'dynamic'`: Adjusts thresholds based on node properties (e.g., tag importance, text/link ratios). -- **`threshold`**: (Optional, default 0.48) Base threshold for pruning: - - Fixed: Nodes scoring below this value are removed. - - Dynamic: This value adjusts based on node characteristics. - -### How It Works - -The algorithm evaluates each node using: -- **Text density**: Ratio of text to overall content. -- **Link density**: Proportion of text within links. -- **Tag importance**: Weights based on HTML tag type (e.g., `
<article>`, `<h1>`, `<p>
`). -- **Content quality**: Metrics like text length and structural importance. - -## BM25 Algorithm - -The `BM25ContentFilter` uses the BM25 algorithm to rank and extract text chunks based on relevance to a search query or page metadata. - -### Usage - -```python -from crawl4ai.async_configs import CrawlerRunConfig -from crawl4ai.content_filter_strategy import BM25ContentFilter - -config = CrawlerRunConfig( - content_filter=BM25ContentFilter(user_query="fruit nutrition health"), - fit_markdown=True # Activates markdown fitting -) - -result = await crawler.arun(url="https://example.com", config=config) - -if result.success: - print(f"Filtered Content:\n{result.extracted_content}") - print(f"\nFiltered Markdown:\n{result.fit_markdown}") - print(f"\nFiltered HTML:\n{result.fit_html}") -else: - print("Error:", result.error_message) -``` - -### Parameters - -- **`user_query`**: (Optional) A string representing the search query. If not provided, the filter extracts metadata (title, description, keywords) and uses it as the query. -- **`bm25_threshold`**: (Optional, default 1.0) Threshold controlling relevance: - - Higher values return stricter, more relevant results. - - Lower values include more lenient filtering. - diff --git a/docs/md_v2/basic/installation.md b/docs/md_v2/basic/installation.md deleted file mode 100644 index de8aeafa..00000000 --- a/docs/md_v2/basic/installation.md +++ /dev/null @@ -1,137 +0,0 @@ -# Installation 💻 - -Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package, use it with Docker, or run it as a local server. - -## Option 1: Python Package Installation (Recommended) - -Crawl4AI is now available on PyPI, making installation easier than ever. Choose the option that best fits your needs: - -### Basic Installation - -For basic web crawling and scraping tasks: - -```bash -pip install crawl4ai -playwright install # Install Playwright dependencies -``` - -### Installation with PyTorch - -For advanced text clustering (includes CosineSimilarity cluster strategy): - -```bash -pip install crawl4ai[torch] -``` - -### Installation with Transformers - -For text summarization and Hugging Face models: - -```bash -pip install crawl4ai[transformer] -``` - -### Full Installation - -For all features: - -```bash -pip install crawl4ai[all] -``` - -### Development Installation - -For contributors who plan to modify the source code: - -```bash -git clone https://github.com/unclecode/crawl4ai.git -cd crawl4ai -pip install -e ".[all]" -playwright install # Install Playwright dependencies -``` - -💡 After installation with "torch", "transformer", or "all" options, it's recommended to run the following CLI command to load the required models: - -```bash -crawl4ai-download-models -``` - -This is optional but will boost the performance and speed of the crawler. You only need to do this once after installation. 
- -## Playwright Installation Note for Ubuntu - -If you encounter issues with Playwright installation on Ubuntu, you may need to install additional dependencies: - -```bash -sudo apt-get install -y \ - libwoff1 \ - libopus0 \ - libwebp7 \ - libwebpdemux2 \ - libenchant-2-2 \ - libgudev-1.0-0 \ - libsecret-1-0 \ - libhyphen0 \ - libgdk-pixbuf2.0-0 \ - libegl1 \ - libnotify4 \ - libxslt1.1 \ - libevent-2.1-7 \ - libgles2 \ - libxcomposite1 \ - libatk1.0-0 \ - libatk-bridge2.0-0 \ - libepoxy0 \ - libgtk-3-0 \ - libharfbuzz-icu0 \ - libgstreamer-gl1.0-0 \ - libgstreamer-plugins-bad1.0-0 \ - gstreamer1.0-plugins-good \ - gstreamer1.0-plugins-bad \ - libxt6 \ - libxaw7 \ - xvfb \ - fonts-noto-color-emoji \ - libfontconfig \ - libfreetype6 \ - xfonts-cyrillic \ - xfonts-scalable \ - fonts-liberation \ - fonts-ipafont-gothic \ - fonts-wqy-zenhei \ - fonts-tlwg-loma-otf \ - fonts-freefont-ttf -``` - -## Option 2: Using Docker (Coming Soon) - -Docker support for Crawl4AI is currently in progress and will be available soon. This will allow you to run Crawl4AI in a containerized environment, ensuring consistency across different systems. - -## Option 3: Local Server Installation - -For those who prefer to run Crawl4AI as a local server, instructions will be provided once the Docker implementation is complete. - -## Verifying Your Installation - -After installation, you can verify that Crawl4AI is working correctly by running a simple Python script: - -```python -import asyncio -from crawl4ai import AsyncWebCrawler - -async def main(): - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun(url="https://www.example.com") - print(result.markdown[:500]) # Print first 500 characters - -if __name__ == "__main__": - asyncio.run(main()) -``` - -This script should successfully crawl the example website and print the first 500 characters of the extracted content. - -## Getting Help - -If you encounter any issues during installation or usage, please check the [documentation](https://crawl4ai.com/mkdocs/) or raise an issue on the [GitHub repository](https://github.com/unclecode/crawl4ai/issues). - -Happy crawling! 🕷️🤖 \ No newline at end of file diff --git a/docs/md_v2/basic/output-formats.md b/docs/md_v2/basic/output-formats.md deleted file mode 100644 index 3686c23c..00000000 --- a/docs/md_v2/basic/output-formats.md +++ /dev/null @@ -1,102 +0,0 @@ -# Output Formats - -Crawl4AI provides multiple output formats to suit different needs, ranging from raw HTML to structured data using LLM or pattern-based extraction, and versatile markdown outputs. - -## Basic Formats - -```python -result = await crawler.arun(url="https://example.com") - -# Access different formats -raw_html = result.html # Original HTML -clean_html = result.cleaned_html # Sanitized HTML -markdown_v2 = result.markdown_v2 # Detailed markdown generation results -fit_md = result.markdown_v2.fit_markdown # Most relevant content in markdown -``` - -> **Note**: The `markdown_v2` property will soon be replaced by `markdown`. It is recommended to start transitioning to using `markdown` for new implementations. - -## Raw HTML - -Original, unmodified HTML from the webpage. Useful when you need to: -- Preserve the exact page structure. -- Process HTML with your own tools. -- Debug page issues. - -```python -result = await crawler.arun(url="https://example.com") -print(result.html) # Complete HTML including headers, scripts, etc. -``` - -## Cleaned HTML - -Sanitized HTML with unnecessary elements removed. 
Automatically: -- Removes scripts and styles. -- Cleans up formatting. -- Preserves semantic structure. - -```python -config = CrawlerRunConfig( - excluded_tags=['form', 'header', 'footer'], # Additional tags to remove - keep_data_attributes=False # Remove data-* attributes -) -result = await crawler.arun(url="https://example.com", config=config) -print(result.cleaned_html) -``` - -## Standard Markdown - -HTML converted to clean markdown format. This output is useful for: -- Content analysis. -- Documentation. -- Readability. - -```python -config = CrawlerRunConfig( - markdown_generator=DefaultMarkdownGenerator( - options={"include_links": True} # Include links in markdown - ) -) -result = await crawler.arun(url="https://example.com", config=config) -print(result.markdown_v2.raw_markdown) # Standard markdown with links -``` - -## Fit Markdown - -Extract and convert only the most relevant content into markdown format. Best suited for: -- Article extraction. -- Focusing on the main content. -- Removing boilerplate. - -To generate `fit_markdown`, use a content filter like `PruningContentFilter`: - -```python -from crawl4ai.content_filter_strategy import PruningContentFilter - -config = CrawlerRunConfig( - content_filter=PruningContentFilter( - threshold=0.7, - threshold_type="dynamic", - min_word_threshold=100 - ) -) -result = await crawler.arun(url="https://example.com", config=config) -print(result.markdown_v2.fit_markdown) # Extracted main content in markdown -``` - -## Markdown with Citations - -Generate markdown that includes citations for links. This format is ideal for: -- Creating structured documentation. -- Including references for extracted content. - -```python -config = CrawlerRunConfig( - markdown_generator=DefaultMarkdownGenerator( - options={"citations": True} # Enable citations - ) -) -result = await crawler.arun(url="https://example.com", config=config) -print(result.markdown_v2.markdown_with_citations) -print(result.markdown_v2.references_markdown) # Citations section -``` diff --git a/docs/md_v2/basic/page-interaction.md b/docs/md_v2/basic/page-interaction.md deleted file mode 100644 index 07a2c9cd..00000000 --- a/docs/md_v2/basic/page-interaction.md +++ /dev/null @@ -1,190 +0,0 @@ -# Page Interaction - -Crawl4AI provides powerful features for interacting with dynamic webpages, handling JavaScript execution, and managing page events. 
- -## JavaScript Execution - -### Basic Execution - -```python -from crawl4ai.async_configs import CrawlerRunConfig - -# Single JavaScript command -config = CrawlerRunConfig( - js_code="window.scrollTo(0, document.body.scrollHeight);" -) -result = await crawler.arun(url="https://example.com", config=config) - -# Multiple commands -js_commands = [ - "window.scrollTo(0, document.body.scrollHeight);", - "document.querySelector('.load-more').click();", - "document.querySelector('#consent-button').click();" -] -config = CrawlerRunConfig(js_code=js_commands) -result = await crawler.arun(url="https://example.com", config=config) -``` - -## Wait Conditions - -### CSS-Based Waiting - -Wait for elements to appear: - -```python -config = CrawlerRunConfig(wait_for="css:.dynamic-content") # Wait for element with class 'dynamic-content' -result = await crawler.arun(url="https://example.com", config=config) -``` - -### JavaScript-Based Waiting - -Wait for custom conditions: - -```python -# Wait for number of elements -wait_condition = """() => { - return document.querySelectorAll('.item').length > 10; -}""" - -config = CrawlerRunConfig(wait_for=f"js:{wait_condition}") -result = await crawler.arun(url="https://example.com", config=config) - -# Wait for dynamic content to load -wait_for_content = """() => { - const content = document.querySelector('.content'); - return content && content.innerText.length > 100; -}""" - -config = CrawlerRunConfig(wait_for=f"js:{wait_for_content}") -result = await crawler.arun(url="https://example.com", config=config) -``` - -## Handling Dynamic Content - -### Load More Content - -Handle infinite scroll or load more buttons: - -```python -config = CrawlerRunConfig( - js_code=[ - "window.scrollTo(0, document.body.scrollHeight);", # Scroll to bottom - "const loadMore = document.querySelector('.load-more'); if(loadMore) loadMore.click();" # Click load more - ], - wait_for="js:() => document.querySelectorAll('.item').length > previousCount" # Wait for new content -) -result = await crawler.arun(url="https://example.com", config=config) -``` - -### Form Interaction - -Handle forms and inputs: - -```python -js_form_interaction = """ - document.querySelector('#search').value = 'search term'; // Fill form fields - document.querySelector('form').submit(); // Submit form -""" - -config = CrawlerRunConfig( - js_code=js_form_interaction, - wait_for="css:.results" # Wait for results to load -) -result = await crawler.arun(url="https://example.com", config=config) -``` - -## Timing Control - -### Delays and Timeouts - -Control timing of interactions: - -```python -config = CrawlerRunConfig( - page_timeout=60000, # Page load timeout (ms) - delay_before_return_html=2.0 # Wait before capturing content -) -result = await crawler.arun(url="https://example.com", config=config) -``` - -## Complex Interactions Example - -Here's an example of handling a dynamic page with multiple interactions: - -```python -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig - -async def crawl_dynamic_content(): - async with AsyncWebCrawler() as crawler: - # Initial page load - config = CrawlerRunConfig( - js_code="document.querySelector('.cookie-accept')?.click();", # Handle cookie consent - wait_for="css:.main-content" - ) - result = await crawler.arun(url="https://example.com", config=config) - - # Load more content - session_id = "dynamic_session" # Keep session for multiple interactions - - for page in range(3): # Load 3 pages of content - config = CrawlerRunConfig( - session_id=session_id, - 
js_code=[ - "window.scrollTo(0, document.body.scrollHeight);", # Scroll to bottom - "window.previousCount = document.querySelectorAll('.item').length;", # Store item count - "document.querySelector('.load-more')?.click();" # Click load more - ], - wait_for="""() => { - const currentCount = document.querySelectorAll('.item').length; - return currentCount > window.previousCount; - }""", - js_only=(page > 0) # Execute JS without reloading page for subsequent interactions - ) - result = await crawler.arun(url="https://example.com", config=config) - print(f"Page {page + 1} items:", len(result.cleaned_html)) - - # Clean up session - await crawler.crawler_strategy.kill_session(session_id) -``` - -## Using with Extraction Strategies - -Combine page interaction with structured extraction: - -```python -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy -from crawl4ai.async_configs import CrawlerRunConfig - -# Pattern-based extraction after interaction -schema = { - "name": "Dynamic Items", - "baseSelector": ".item", - "fields": [ - {"name": "title", "selector": "h2", "type": "text"}, - {"name": "description", "selector": ".desc", "type": "text"} - ] -} - -config = CrawlerRunConfig( - js_code="window.scrollTo(0, document.body.scrollHeight);", - wait_for="css:.item:nth-child(10)", # Wait for 10 items - extraction_strategy=JsonCssExtractionStrategy(schema) -) -result = await crawler.arun(url="https://example.com", config=config) - -# Or use LLM to analyze dynamic content -class ContentAnalysis(BaseModel): - topics: List[str] - summary: str - -config = CrawlerRunConfig( - js_code="document.querySelector('.show-more').click();", - wait_for="css:.full-content", - extraction_strategy=LLMExtractionStrategy( - provider="ollama/nemotron", - schema=ContentAnalysis.schema(), - instruction="Analyze the full content" - ) -) -result = await crawler.arun(url="https://example.com", config=config) -``` diff --git a/docs/md_v2/basic/quickstart.md b/docs/md_v2/basic/quickstart.md deleted file mode 100644 index ffc35986..00000000 --- a/docs/md_v2/basic/quickstart.md +++ /dev/null @@ -1,172 +0,0 @@ -# Quick Start Guide 🚀 - -Welcome to the Crawl4AI Quickstart Guide! In this tutorial, we'll walk you through the basic usage of Crawl4AI, covering everything from initial setup to advanced features like chunking and extraction strategies, using asynchronous programming. Let's dive in! 🌟 - ---- - -## Getting Started 🛠️ - -Set up your environment with `BrowserConfig` and create an `AsyncWebCrawler` instance. - -```python -import asyncio -from crawl4ai import AsyncWebCrawler -from crawl4ai.async_configs import BrowserConfig - -async def main(): - browser_config = BrowserConfig(verbose=True) - async with AsyncWebCrawler(config=browser_config) as crawler: - # Add your crawling logic here - pass - -if __name__ == "__main__": - asyncio.run(main()) -``` - ---- - -### Basic Usage - -Provide a URL and let Crawl4AI do the work! 
- -```python -from crawl4ai.async_configs import CrawlerRunConfig - -async def main(): - browser_config = BrowserConfig(verbose=True) - crawl_config = CrawlerRunConfig(url="https://www.nbcnews.com/business") - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(config=crawl_config) - print(f"Basic crawl result: {result.markdown[:500]}") # Print first 500 characters - -if __name__ == "__main__": - asyncio.run(main()) -``` - ---- - -### Taking Screenshots 📸 - -Capture and save webpage screenshots with `CrawlerRunConfig`: - -```python -from crawl4ai.async_configs import CacheMode - -async def capture_and_save_screenshot(url: str, output_path: str): - browser_config = BrowserConfig(verbose=True) - crawl_config = CrawlerRunConfig( - url=url, - screenshot=True, - cache_mode=CacheMode.BYPASS - ) - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(config=crawl_config) - - if result.success and result.screenshot: - import base64 - screenshot_data = base64.b64decode(result.screenshot) - with open(output_path, 'wb') as f: - f.write(screenshot_data) - print(f"Screenshot saved successfully to {output_path}") - else: - print("Failed to capture screenshot") -``` - ---- - -### Browser Selection 🌐 - -Choose from multiple browser engines using `BrowserConfig`: - -```python -from crawl4ai.async_configs import BrowserConfig - -# Use Firefox -firefox_config = BrowserConfig(browser_type="firefox", verbose=True, headless=True) -async with AsyncWebCrawler(config=firefox_config) as crawler: - result = await crawler.arun(config=CrawlerRunConfig(url="https://www.example.com")) - -# Use WebKit -webkit_config = BrowserConfig(browser_type="webkit", verbose=True, headless=True) -async with AsyncWebCrawler(config=webkit_config) as crawler: - result = await crawler.arun(config=CrawlerRunConfig(url="https://www.example.com")) - -# Use Chromium (default) -chromium_config = BrowserConfig(verbose=True, headless=True) -async with AsyncWebCrawler(config=chromium_config) as crawler: - result = await crawler.arun(config=CrawlerRunConfig(url="https://www.example.com")) -``` - ---- - -### User Simulation 🎭 - -Simulate real user behavior to bypass detection: - -```python -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig - -browser_config = BrowserConfig(verbose=True, headless=True) -crawl_config = CrawlerRunConfig( - url="YOUR-URL-HERE", - cache_mode=CacheMode.BYPASS, - simulate_user=True, # Random mouse movements and clicks - override_navigator=True # Makes the browser appear like a real user -) -async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(config=crawl_config) -``` - ---- - -### Understanding Parameters 🧠 - -Explore caching and forcing fresh crawls: - -```python -async def main(): - browser_config = BrowserConfig(verbose=True) - - async with AsyncWebCrawler(config=browser_config) as crawler: - # First crawl (uses cache) - result1 = await crawler.arun(config=CrawlerRunConfig(url="https://www.nbcnews.com/business")) - print(f"First crawl result: {result1.markdown[:100]}...") - - # Force fresh crawl - result2 = await crawler.arun( - config=CrawlerRunConfig(url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS) - ) - print(f"Second crawl result: {result2.markdown[:100]}...") - -if __name__ == "__main__": - asyncio.run(main()) -``` - ---- - -### Adding a Chunking Strategy 🧩 - -Split content into chunks using `RegexChunking`: - -```python -from crawl4ai.chunking_strategy import 
RegexChunking - -async def main(): - browser_config = BrowserConfig(verbose=True) - crawl_config = CrawlerRunConfig( - url="https://www.nbcnews.com/business", - chunking_strategy=RegexChunking(patterns=["\n\n"]) - ) - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(config=crawl_config) - print(f"RegexChunking result: {result.extracted_content[:200]}...") - -if __name__ == "__main__": - asyncio.run(main()) -``` - ---- - -### Advanced Features and Configurations - -For advanced examples (LLM strategies, knowledge graphs, pagination handling), ensure all code aligns with the `BrowserConfig` and `CrawlerRunConfig` pattern shown above. diff --git a/docs/md_v2/blog/articles/dockerize_hooks.md b/docs/md_v2/blog/articles/dockerize_hooks.md index 965388ee..4866c224 100644 --- a/docs/md_v2/blog/articles/dockerize_hooks.md +++ b/docs/md_v2/blog/articles/dockerize_hooks.md @@ -34,9 +34,9 @@ sequenceDiagram **Benefits for Developers and Users** -1. **Fine-Grained Control**: Instead of predefining all logic upfront, you can dynamically guide the crawler in response to actual data and conditions encountered mid-crawl. -2. **Real-Time Insights**: Monitor progress, errors, or network bottlenecks as they happen, without waiting for the entire crawl to finish. -3. **Enhanced Collaboration**: Different team members or automated systems can watch the same crawl events and provide input, making the crawling process more adaptive and intelligent. +1. **Fine-Grained Control**: Instead of predefining all logic upfront, you can dynamically guide the crawler in response to actual data and conditions encountered mid-crawl. +2. **Real-Time Insights**: Monitor progress, errors, or network bottlenecks as they happen, without waiting for the entire crawl to finish. +3. **Enhanced Collaboration**: Different team members or automated systems can watch the same crawl events and provide input, making the crawling process more adaptive and intelligent. **Next Steps** diff --git a/docs/md_v2/blog/releases/0.4.2.md b/docs/md_v2/blog/releases/0.4.2.md index 6f8f39e9..1386979a 100644 --- a/docs/md_v2/blog/releases/0.4.2.md +++ b/docs/md_v2/blog/releases/0.4.2.md @@ -72,9 +72,9 @@ Two big upgrades here: ### 🔠 **Use Cases You’ll Love** -1. **Authenticated Crawls**: Login once, export your storage state, and reuse it across multiple requests without the headache. -2. **Long-page Screenshots**: Perfect for blogs, e-commerce pages, or any endless-scroll website. -3. **PDF Export**: Create professional-looking page PDFs in seconds. +1. **Authenticated Crawls**: Login once, export your storage state, and reuse it across multiple requests without the headache. +2. **Long-page Screenshots**: Perfect for blogs, e-commerce pages, or any endless-scroll website. +3. **PDF Export**: Create professional-looking page PDFs in seconds. --- diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md new file mode 100644 index 00000000..11fa3493 --- /dev/null +++ b/docs/md_v2/core/browser-crawler-config.md @@ -0,0 +1,248 @@ +# Browser & Crawler Configuration (Quick Overview) + +Crawl4AI’s flexibility stems from two key classes: + +1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent). +2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.). 
+ +In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md). + +--- + +## 1. BrowserConfig Essentials + +```python +class BrowserConfig: + def __init__( + browser_type="chromium", + headless=True, + proxy_config=None, + viewport_width=1080, + viewport_height=600, + verbose=True, + use_persistent_context=False, + user_data_dir=None, + cookies=None, + headers=None, + user_agent=None, + text_mode=False, + light_mode=False, + extra_args=None, + # ... other advanced parameters omitted here + ): + ... +``` + +### Key Fields to Note + + + +1. **`browser_type`** +- Options: `"chromium"`, `"firefox"`, or `"webkit"`. +- Defaults to `"chromium"`. +- If you need a different engine, specify it here. + +2. **`headless`** + - `True`: Runs the browser in headless mode (invisible browser). + - `False`: Runs the browser in visible mode, which helps with debugging. + +3. **`proxy_config`** + - A dictionary with fields like: +```json +{ + "server": "http://proxy.example.com:8080", + "username": "...", + "password": "..." +} +``` + - Leave as `None` if a proxy is not required. + +4. **`viewport_width` & `viewport_height`**: + - The initial window size. + - Some sites behave differently with smaller or bigger viewports. + +5. **`verbose`**: + - If `True`, prints extra logs. + - Handy for debugging. + +6. **`use_persistent_context`**: + - If `True`, uses a **persistent** browser profile, storing cookies/local storage across runs. + - Typically also set `user_data_dir` to point to a folder. + +7. **`cookies`** & **`headers`**: + - If you want to start with specific cookies or add universal HTTP headers, set them here. + - E.g. `cookies=[{"name": "session", "value": "abc123", "domain": "example.com"}]`. + +8. **`user_agent`**: + - Custom User-Agent string. If `None`, a default is used. + - You can also set `user_agent_mode="random"` for randomization (if you want to fight bot detection). + +9. **`text_mode`** & **`light_mode`**: + - `text_mode=True` disables images, possibly speeding up text-only crawls. + - `light_mode=True` turns off certain background features for performance. + +10. **`extra_args`**: + - Additional flags for the underlying browser. + - E.g. `["--disable-extensions"]`. + +**Minimal Example**: + +```python +from crawl4ai import AsyncWebCrawler, BrowserConfig + +browser_conf = BrowserConfig( + browser_type="firefox", + headless=False, + text_mode=True +) + +async with AsyncWebCrawler(config=browser_conf) as crawler: + result = await crawler.arun("https://example.com") + print(result.markdown[:300]) +``` + +--- + +## 2. CrawlerRunConfig Essentials + +```python +class CrawlerRunConfig: + def __init__( + word_count_threshold=200, + extraction_strategy=None, + markdown_generator=None, + cache_mode=None, + js_code=None, + wait_for=None, + screenshot=False, + pdf=False, + verbose=True, + # ... other advanced parameters omitted + ): + ... +``` + +### Key Fields to Note + +1. **`word_count_threshold`**: + - The minimum word count before a block is considered. + - If your site has lots of short paragraphs or items, you can lower it. + +2. **`extraction_strategy`**: + - Where you plug in JSON-based extraction (CSS, LLM, etc.). + - If `None`, no structured extraction is done (only raw/cleaned HTML + markdown). + +3. 
**`markdown_generator`**: + - E.g., `DefaultMarkdownGenerator(...)`, controlling how HTML→Markdown conversion is done. + - If `None`, a default approach is used. + +4. **`cache_mode`**: + - Controls caching behavior (`ENABLED`, `BYPASS`, `DISABLED`, etc.). + - If `None`, defaults to some level of caching or you can specify `CacheMode.ENABLED`. + +5. **`js_code`**: + - A string or list of JS strings to execute. + - Great for “Load More” buttons or user interactions. + +6. **`wait_for`**: + - A CSS or JS expression to wait for before extracting content. + - Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`. + +7. **`screenshot`** & **`pdf`**: + - If `True`, captures a screenshot or PDF after the page is fully loaded. + - The results go to `result.screenshot` (base64) or `result.pdf` (bytes). + +8. **`verbose`**: + - Logs additional runtime details. + - Overlaps with the browser’s verbosity if also set to `True` in `BrowserConfig`. + +**Minimal Example**: + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +crawl_conf = CrawlerRunConfig( + js_code="document.querySelector('button#loadMore')?.click()", + wait_for="css:.loaded-content", + screenshot=True +) + +async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=crawl_conf) + print(result.screenshot[:100]) # Base64-encoded PNG snippet +``` + +--- + +## 3. Putting It All Together + +In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` depending on each call’s needs: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +async def main(): + # 1) Browser config: headless, bigger viewport, no proxy + browser_conf = BrowserConfig( + headless=True, + viewport_width=1280, + viewport_height=720 + ) + + # 2) Example extraction strategy + schema = { + "name": "Articles", + "baseSelector": "div.article", + "fields": [ + {"name": "title", "selector": "h2", "type": "text"}, + {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"} + ] + } + extraction = JsonCssExtractionStrategy(schema) + + # 3) Crawler run config: skip cache, use extraction + run_conf = CrawlerRunConfig( + extraction_strategy=extraction, + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler(config=browser_conf) as crawler: + # 4) Execute the crawl + result = await crawler.arun(url="https://example.com/news", config=run_conf) + + if result.success: + print("Extracted content:", result.extracted_content) + else: + print("Error:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +## 4. Next Steps + +For a **detailed list** of available parameters (including advanced ones), see: + +- [BrowserConfig and CrawlerRunConfig Reference](../api/parameters.md) + +You can explore topics like: + +- **Custom Hooks & Auth** (Inject JavaScript or handle login forms). +- **Session Management** (Re-use pages, preserve state across multiple calls). +- **Magic Mode** or **Identity-based Crawling** (Fight bot detection by simulating user behavior). +- **Advanced Caching** (Fine-tune read/write cache modes). + +--- + +## 5. Conclusion + +**BrowserConfig** and **CrawlerRunConfig** give you straightforward ways to define: + +- **Which** browser to launch, how it should run, and any proxy or user agent needs. 
+- **How** each crawl should behave—caching, timeouts, JavaScript code, extraction strategies, etc. + +Use them together for **clear, maintainable** code, and when you need more specialized behavior, check out the advanced parameters in the [reference docs](../api/parameters.md). Happy crawling! \ No newline at end of file diff --git a/docs/md_v2/basic/cache-modes.md b/docs/md_v2/core/cache-modes.md similarity index 86% rename from docs/md_v2/basic/cache-modes.md rename to docs/md_v2/core/cache-modes.md index 73460e57..b0aab78a 100644 --- a/docs/md_v2/basic/cache-modes.md +++ b/docs/md_v2/core/cache-modes.md @@ -49,7 +49,8 @@ from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.async_configs import CrawlerRunConfig async def use_proxy(): - config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) # Use CacheMode in CrawlerRunConfig + # Use CacheMode in CrawlerRunConfig + config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( url="https://www.nbcnews.com/business", @@ -71,11 +72,4 @@ if __name__ == "__main__": | `bypass_cache=True` | `cache_mode=CacheMode.BYPASS` | | `disable_cache=True` | `cache_mode=CacheMode.DISABLED`| | `no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY` | -| `no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` | - -## Suppressing Deprecation Warnings -If you need time to migrate, you can temporarily suppress deprecation warnings: -```python -# In your config.py -SHOW_DEPRECATION_WARNINGS = False -``` +| `no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` | \ No newline at end of file diff --git a/docs/md_v2/core/content-selection.md b/docs/md_v2/core/content-selection.md new file mode 100644 index 00000000..9774f9a7 --- /dev/null +++ b/docs/md_v2/core/content-selection.md @@ -0,0 +1,332 @@ +# Content Selection + +Crawl4AI provides multiple ways to **select**, **filter**, and **refine** the content from your crawls. Whether you need to target a specific CSS region, exclude entire tags, filter out external links, or remove certain domains and images, **`CrawlerRunConfig`** offers a wide range of parameters. + +Below, we show how to configure these parameters and combine them for precise control. + +--- + +## 1. CSS-Based Selection + +A straightforward way to **limit** your crawl results to a certain region of the page is **`css_selector`** in **`CrawlerRunConfig`**: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async def main(): + config = CrawlerRunConfig( + # e.g., first 30 items from Hacker News + css_selector=".athing:nth-child(-n+30)" + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://news.ycombinator.com/newest", + config=config + ) + print("Partial HTML length:", len(result.cleaned_html)) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Result**: Only elements matching that selector remain in `result.cleaned_html`. + +--- + +## 2. 
Content Filtering & Exclusions + +### 2.1 Basic Overview + +```python +config = CrawlerRunConfig( + # Content thresholds + word_count_threshold=10, # Minimum words per block + + # Tag exclusions + excluded_tags=['form', 'header', 'footer', 'nav'], + + # Link filtering + exclude_external_links=True, + exclude_social_media_links=True, + # Block entire domains + exclude_domains=["adtrackers.com", "spammynews.org"], + exclude_social_media_domains=["facebook.com", "twitter.com"], + + # Media filtering + exclude_external_images=True +) +``` + +**Explanation**: + +- **`word_count_threshold`**: Ignores text blocks under X words. Helps skip trivial blocks like short nav or disclaimers. +- **`excluded_tags`**: Removes entire tags (`<form>`, `<header>`, `<footer>