From b99d20b7258c9a78f6304de3b494bd31f8bcee33 Mon Sep 17 00:00:00 2001 From: unclecode Date: Tue, 8 Oct 2024 18:10:57 +0800 Subject: [PATCH 1/8] Add pypi_build.sh to .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index a593ae1d..85882f69 100644 --- a/.gitignore +++ b/.gitignore @@ -200,4 +200,5 @@ test_env/ todo.md git_changes.py -git_changes.md \ No newline at end of file +git_changes.md +pypi_build.sh \ No newline at end of file From ff3524d9b1f76bb06a43a7721eb958db9bd01463 Mon Sep 17 00:00:00 2001 From: unclecode Date: Sat, 12 Oct 2024 13:42:42 +0800 Subject: [PATCH 2/8] feat(v0.3.6): Add screenshot capture, delayed content, and custom timeouts - Implement screenshot capture functionality - Add delayed content retrieval method - Introduce custom page timeout parameter - Enhance LLM support with multiple providers - Improve database schema auto-updates - Optimize image processing in WebScrappingStrategy - Update error handling and logging - Expand examples in quickstart_async.py --- .gitignore | 4 ++- CHANGELOG.md | 33 +++++++++++++++++++ crawl4ai/__init__.py | 2 +- crawl4ai/async_crawler_strategy.py | 34 ++++++++++++++++--- crawl4ai/async_database.py | 21 ++++++++++-- crawl4ai/async_webcrawler.py | 4 +-- crawl4ai/content_scrapping_strategy.py | 6 ++-- docs/examples/quickstart_async.py | 45 ++++++++++++++++++++------ 8 files changed, 127 insertions(+), 22 deletions(-) diff --git a/.gitignore b/.gitignore index 85882f69..8b8f014c 100644 --- a/.gitignore +++ b/.gitignore @@ -201,4 +201,6 @@ test_env/ todo.md git_changes.py git_changes.md -pypi_build.sh \ No newline at end of file +pypi_build.sh + +.tests/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 37b564ed..701d6903 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,38 @@ # Changelog +## [0.3.6] - 2024-10-12 + +### Added +- New `.tests/` directory added to `.gitignore` +- Screenshot functionality: + - Added 
`screenshot` column to the database schema + - Implemented `take_screenshot` method in `AsyncPlaywrightCrawlerStrategy` + - Added option to capture screenshots when crawling +- Delayed content retrieval: + - New `get_delayed_content` method in `AsyncCrawlResponse` +- Database schema updates: + - Auto-update mechanism for database schema + - New columns: 'media', 'links', 'metadata', 'screenshot' +- LLM extraction examples in `quickstart_async.py`: + - Support for OpenAI, Hugging Face, and Ollama models + +### Changed +- Updated version number to 0.3.6 in `__init__.py` +- Improved error handling and logging in various components +- Enhanced `WebScrappingStrategy` to handle image processing more efficiently +- Modified `AsyncPlaywrightCrawlerStrategy` to support custom timeout values + +### Fixed +- Adjusted image processing in `WebScrappingStrategy` to prevent premature decomposition of img tags + +### Removed +- Removed `pypi_build.sh` from version control (added to `.gitignore`) + +### Developer Notes +- Added examples for using different LLM providers in `quickstart_async.py` +- Improved error messages for better debugging +- Enhanced type hinting throughout the codebase + ## [v0.3.5] - 2024-09-02 Enhance AsyncWebCrawler with smart waiting and screenshot capabilities diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 186730e8..04da30f8 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -3,7 +3,7 @@ from .async_webcrawler import AsyncWebCrawler from .models import CrawlResult -__version__ = "0.3.5" +__version__ = "0.3.6" __all__ = [ "AsyncWebCrawler", diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 987925f8..28795a3e 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1,7 +1,7 @@ import asyncio import base64, time from abc import ABC, abstractmethod -from typing import Callable, Dict, Any, List, Optional +from typing import Callable, Dict, Any, List, 
Optional, Awaitable import os from playwright.async_api import async_playwright, Page, Browser, Error from io import BytesIO @@ -18,6 +18,10 @@ class AsyncCrawlResponse(BaseModel): response_headers: Dict[str, str] status_code: int screenshot: Optional[str] = None + get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None + + class Config: + arbitrary_types_allowed = True class AsyncCrawlerStrategy(ABC): @abstractmethod @@ -248,7 +252,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if not kwargs.get("js_only", False): await self.execute_hook('before_goto', page) - response = await page.goto(url, wait_until="domcontentloaded", timeout=60000) + response = await page.goto(url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000)) await self.execute_hook('after_goto', page) # Get status code and headers @@ -295,6 +299,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Exception as e: raise RuntimeError(f"Wait condition failed: {str(e)}") + # Check if kwargs has screenshot=True then take screenshot + screenshot_data = None + if kwargs.get("screenshot"): + screenshot_data = await self.take_screenshot(url) + html = await page.content() page = await self.execute_hook('before_return_html', page, html) @@ -312,7 +321,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "status_code": status_code }, f) - response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code) + + async def get_delayed_content(delay: float = 5.0) -> str: + if self.verbose: + print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") + await asyncio.sleep(delay) + return await page.content() + + response = AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=get_delayed_content + ) return response except Error as e: raise Error(f"Failed to crawl {url}: 
{str(e)}") @@ -383,11 +405,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): results = await asyncio.gather(*tasks, return_exceptions=True) return [result if not isinstance(result, Exception) else str(result) for result in results] - async def take_screenshot(self, url: str) -> str: + async def take_screenshot(self, url: str, wait_time = 1000) -> str: async with await self.browser.new_context(user_agent=self.user_agent) as context: page = await context.new_page() try: - await page.goto(url, wait_until="domcontentloaded") + await page.goto(url, wait_until="domcontentloaded", timeout=30000) + # Wait for a specified time (default is 1 second) + await page.wait_for_timeout(wait_time) screenshot = await page.screenshot(full_page=True) return base64.b64encode(screenshot).decode('utf-8') except Exception as e: diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index baa53255..61d98e9c 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -29,14 +29,31 @@ class AsyncDatabaseManager: ) ''') await db.commit() + await self.update_db_schema() - async def aalter_db_add_screenshot(self, new_column: str = "media"): + async def update_db_schema(self): + async with aiosqlite.connect(self.db_path) as db: + # Check if the 'media' column exists + cursor = await db.execute("PRAGMA table_info(crawled_data)") + columns = await cursor.fetchall() + column_names = [column[1] for column in columns] + + if 'media' not in column_names: + await self.aalter_db_add_column('media') + + # Check for other missing columns and add them if necessary + for column in ['links', 'metadata', 'screenshot']: + if column not in column_names: + await self.aalter_db_add_column(column) + + async def aalter_db_add_column(self, new_column: str): try: async with aiosqlite.connect(self.db_path) as db: await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') await db.commit() + print(f"Added column '{new_column}' to the database.") 
except Exception as e: - print(f"Error altering database to add screenshot column: {e}") + print(f"Error altering database to add {new_column} column: {e}") async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]: try: diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 88c05f03..d308e930 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -202,11 +202,11 @@ class AsyncWebCrawler: ) if result is None: - raise ValueError(f"Failed to extract content from the website: {url}") + raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") except InvalidCSSSelectorError as e: raise ValueError(str(e)) except Exception as e: - raise ValueError(f"Failed to extract content from the website: {url}, error: {str(e)}") + raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) markdown = sanitize_input_encode(result.get("markdown", "")) diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index e3d2c57f..afd75892 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -170,10 +170,12 @@ class WebScrappingStrategy(ContentScrappingStrategy): if isinstance(element, Comment): element.extract() return False + + # if element.name == 'img': + # process_image(element, url, 0, 1) + # return True if element.name in ['script', 'style', 'link', 'meta', 'noscript']: - if element.name == 'img': - process_image(element, url, 0, 1) element.decompose() return False diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 1a2d9570..836bdb1d 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -66,6 +66,29 @@ async def use_proxy(): # ) # print(result.markdown[:500]) # Print first 500 characters + +async def 
capture_and_save_screenshot(url: str, output_path: str): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url=url, + screenshot=True, + bypass_cache=True + ) + + if result.success and result.screenshot: + import base64 + + # Decode the base64 screenshot data + screenshot_data = base64.b64decode(result.screenshot) + + # Write the decoded screenshot bytes to the output file + with open(output_path, 'wb') as f: + f.write(screenshot_data) + + print(f"Screenshot saved successfully to {output_path}") + else: + print("Failed to capture screenshot") + class OpenAIModelFee(BaseModel): model_name: str = Field(..., description="Name of the OpenAI model.") input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") @@ -73,13 +96,11 @@ class OpenAIModelFee(BaseModel): ..., description="Fee for output token for the OpenAI model." ) -async def extract_structured_data_using_llm(): - print("\n--- Extracting Structured Data with OpenAI ---") - print( - "Note: Set your OpenAI API key as an environment variable to run this example." - ) - if not os.getenv("OPENAI_API_KEY"): - print("OpenAI API key not found. Skipping this example.") +async def extract_structured_data_using_llm(provider: str, api_token: str = None): + print(f"\n--- Extracting Structured Data with {provider} ---") + + if api_token is None and provider != "ollama": + print(f"API token is required for {provider}. Skipping this example.") return async with AsyncWebCrawler(verbose=True) as crawler: @@ -87,8 +108,8 @@ async def extract_structured_data_using_llm(): url="https://openai.com/api/pricing/", word_count_threshold=1, extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-4o", - api_token=os.getenv("OPENAI_API_KEY"), + provider=provider, + api_token=api_token, schema=OpenAIModelFee.schema(), extraction_type="schema", instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
@@ -390,7 +411,13 @@ async def main(): await js_and_css() await use_proxy() await extract_structured_data_using_css_extractor() + + # LLM extraction examples await extract_structured_data_using_llm() + await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY")) + await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) + await extract_structured_data_using_llm("ollama/llama3.2") + # await crawl_dynamic_content_pages_method_1() # await crawl_dynamic_content_pages_method_2() await crawl_dynamic_content_pages_method_3() From 9b2b267820c79fd9c45094e0d9fece57c82aa533 Mon Sep 17 00:00:00 2001 From: unclecode Date: Sat, 12 Oct 2024 13:42:56 +0800 Subject: [PATCH 3/8] CHANGELOG UPDATE --- CHANGELOG.md | 68 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 701d6903..873af87f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,37 +1,51 @@ # Changelog -## [0.3.6] - 2024-10-12 +## [v0.3.6] - 2024-10-12 -### Added -- New `.tests/` directory added to `.gitignore` -- Screenshot functionality: - - Added `screenshot` column to the database schema - - Implemented `take_screenshot` method in `AsyncPlaywrightCrawlerStrategy` - - Added option to capture screenshots when crawling -- Delayed content retrieval: - - New `get_delayed_content` method in `AsyncCrawlResponse` -- Database schema updates: - - Auto-update mechanism for database schema - - New columns: 'media', 'links', 'metadata', 'screenshot' -- LLM extraction examples in `quickstart_async.py`: - - Support for OpenAI, Hugging Face, and Ollama models +### 1. Screenshot Capture +- **What's new**: Added ability to capture screenshots during crawling. +- **Why it matters**: You can now visually verify the content of crawled pages, which is useful for debugging and content verification. 
+- **How to use**: Set `screenshot=True` when calling `crawler.arun()`. -### Changed -- Updated version number to 0.3.6 in `__init__.py` -- Improved error handling and logging in various components -- Enhanced `WebScrappingStrategy` to handle image processing more efficiently -- Modified `AsyncPlaywrightCrawlerStrategy` to support custom timeout values +### 2. Delayed Content Retrieval +- **What's new**: Introduced `get_delayed_content` method in `AsyncCrawlResponse`. +- **Why it matters**: Allows you to retrieve content after a specified delay, useful for pages that load content dynamically. +- **How to use**: Access `result.get_delayed_content(delay_in_seconds)` after crawling. -### Fixed -- Adjusted image processing in `WebScrappingStrategy` to prevent premature decomposition of img tags +### 3. Custom Page Timeout +- **What's new**: Added `page_timeout` parameter to control page load timeout. +- **Why it matters**: Gives you more control over crawling behavior, especially for slow-loading pages. +- **How to use**: Set `page_timeout=your_desired_timeout` (in milliseconds) when calling `crawler.arun()`. -### Removed -- Removed `pypi_build.sh` from version control (added to `.gitignore`) +### 4. Enhanced LLM Support +- **What's new**: Added support for multiple LLM providers (OpenAI, Hugging Face, Ollama). +- **Why it matters**: Provides more flexibility in choosing AI models for content extraction. +- **How to use**: Specify the desired provider when using `LLMExtractionStrategy`. -### Developer Notes -- Added examples for using different LLM providers in `quickstart_async.py` -- Improved error messages for better debugging -- Enhanced type hinting throughout the codebase +## Improvements + +### 1. Database Schema Auto-updates +- **What's new**: Automatic database schema updates. +- **Why it matters**: Ensures your database stays compatible with the latest version without manual intervention. + +### 2. 
Enhanced Error Handling +- **What's new**: Improved error messages and logging. +- **Why it matters**: Makes debugging easier with more informative error messages. + +### 3. Optimized Image Processing +- **What's new**: Refined image handling in `WebScrappingStrategy`. +- **Why it matters**: Improves the accuracy of content extraction for pages with images. + +## Bug Fixes + +- Fixed an issue where image tags were being prematurely removed during content extraction. + +## Developer Notes + +- Added examples for using different LLM providers in `quickstart_async.py`. +- Enhanced type hinting throughout the codebase for better development experience. + +We're constantly working to improve crawl4ai. These updates aim to provide you with more control, flexibility, and reliability in your web crawling tasks. As always, we appreciate your feedback and suggestions for future improvements! ## [v0.3.5] - 2024-09-02 From 68e9144ce3c8821849358b48f57e74d7504bb32b Mon Sep 17 00:00:00 2001 From: unclecode Date: Sat, 12 Oct 2024 14:48:22 +0800 Subject: [PATCH 4/8] feat: Enhance crawling control and LLM extraction flexibility - Add before_retrieve_html hook and delay_before_return_html option - Implement flexible page_timeout for smart_wait function - Support extra_args and custom headers in LLM extraction - Allow arbitrary kwargs in AsyncWebCrawler initialization - Improve perform_completion_with_backoff for custom API calls - Update examples with new features and diverse LLM providers --- CHANGELOG.md | 33 ++++++++++++++++++++++++++++-- crawl4ai/async_crawler_strategy.py | 12 ++++++++--- crawl4ai/async_webcrawler.py | 6 +++--- crawl4ai/extraction_strategy.py | 9 +++++++- crawl4ai/utils.py | 12 ++++++++++- docs/examples/quickstart_async.py | 16 +++++++++++++-- 6 files changed, 76 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 873af87f..197fa32b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,36 @@ # Changelog -## [v0.3.6] - 2024-10-12 +## 
[v0.3.6] - 2024-10-12 - Part 1 + +### 1. Improved Crawling Control +- **New Hook**: Added `before_retrieve_html` hook in `AsyncPlaywrightCrawlerStrategy`. +- **Delayed HTML Retrieval**: Introduced `delay_before_return_html` parameter to allow waiting before retrieving HTML content. + - Useful for pages with delayed content loading. +- **Flexible Timeout**: `smart_wait` function now uses `page_timeout` (default 60 seconds) instead of a fixed 30-second timeout. + - Provides better handling for slow-loading pages. + +### 2. Enhanced LLM Extraction Strategy +- **Custom Arguments**: Added support for passing extra arguments to LLM providers via `extra_args` parameter. +- **Custom Headers**: Users can now pass custom headers to the extraction strategy. + - Enables more flexibility when interacting with different LLM APIs. + +### 3. AsyncWebCrawler Improvements +- **Flexible Initialization**: `AsyncWebCrawler` now accepts arbitrary keyword arguments. + - These are passed directly to the crawler strategy, allowing for more customized setups. + +### 4. Utility Function Enhancements +- **Improved API Interaction**: `perform_completion_with_backoff` function now supports additional arguments. + - Allows for more customized API calls to LLM providers. + +## Examples and Documentation +- Updated `quickstart_async.py` with examples of using custom headers in LLM extraction. +- Added more diverse examples of LLM provider usage, including OpenAI, Hugging Face, and Ollama. + +## Developer Notes +- Refactored code for better maintainability and flexibility. +- Enhanced error handling and logging for improved debugging experience. + +## [v0.3.6] - 2024-10-12 - Part 2 ### 1. Screenshot Capture - **What's new**: Added ability to capture screenshots during crawling. @@ -45,7 +75,6 @@ - Added examples for using different LLM providers in `quickstart_async.py`. - Enhanced type hinting throughout the codebase for better development experience. -We're constantly working to improve crawl4ai. 
These updates aim to provide you with more control, flexibility, and reliability in your web crawling tasks. As always, we appreciate your feedback and suggestions for future improvements! ## [v0.3.5] - 2024-09-02 diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 28795a3e..c74aff13 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -63,7 +63,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): 'on_execution_started': None, 'before_goto': None, 'after_goto': None, - 'before_return_html': None + 'before_return_html': None, + 'before_retrieve_html': None } async def __aenter__(self): @@ -295,7 +296,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): wait_for = kwargs.get("wait_for") if wait_for: try: - await self.smart_wait(page, wait_for, timeout=kwargs.get("timeout", 30000)) + await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) except Exception as e: raise RuntimeError(f"Wait condition failed: {str(e)}") @@ -304,8 +305,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if kwargs.get("screenshot"): screenshot_data = await self.take_screenshot(url) + await self.execute_hook('before_retrieve_html', page) + # Check if delay_before_return_html is set then wait for that time + delay_before_return_html = kwargs.get("delay_before_return_html") + if delay_before_return_html: + await asyncio.sleep(delay_before_return_html) html = await page.content() - page = await self.execute_hook('before_return_html', page, html) + await self.execute_hook('before_return_html', page, html) if self.verbose: print(f"[LOG] ✅ Crawled {url} successfully!") diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index d308e930..ba82d28f 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -23,17 +23,17 @@ class AsyncWebCrawler: self, crawler_strategy: Optional[AsyncCrawlerStrategy] = None, 
always_by_pass_cache: bool = False, - verbose: bool = False, + **kwargs, ): self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( - verbose=verbose + **kwargs ) self.always_by_pass_cache = always_by_pass_cache self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) self.ready = False - self.verbose = verbose + self.verbose = kwargs.get("verbose", False) async def __aenter__(self): await self.crawler_strategy.__aenter__() diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 48491067..210a360b 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -80,6 +80,7 @@ class LLMExtractionStrategy(ExtractionStrategy): self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE) self.apply_chunking = kwargs.get("apply_chunking", True) self.base_url = kwargs.get("base_url", None) + self.extra_args = kwargs.get("extra_args", {}) if not self.apply_chunking: self.chunk_token_threshold = 1e9 @@ -111,7 +112,13 @@ class LLMExtractionStrategy(ExtractionStrategy): "{" + variable + "}", variable_values[variable] ) - response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token, base_url=self.base_url) # , json_response=self.extract_type == "schema") + response = perform_completion_with_backoff( + self.provider, + prompt_with_variables, + self.api_token, + base_url=self.base_url, + extra_args = self.extra_args + ) # , json_response=self.extract_type == "schema") try: blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks'] blocks = json.loads(blocks) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 71a36aed..77671a20 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -775,7 +775,14 @@ def extract_xml_data(tags, string): return data # Function to perform the completion with exponential backoff -def 
perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False, base_url=None): +def perform_completion_with_backoff( + provider, + prompt_with_variables, + api_token, + json_response = False, + base_url=None, + **kwargs + ): from litellm import completion from litellm.exceptions import RateLimitError max_attempts = 3 @@ -784,6 +791,9 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token, extra_args = {} if json_response: extra_args["response_format"] = { "type": "json_object" } + + if kwargs.get("extra_args"): + extra_args.update(kwargs["extra_args"]) for attempt in range(max_attempts): try: diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 836bdb1d..9f00e323 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -96,13 +96,17 @@ class OpenAIModelFee(BaseModel): ..., description="Fee for output token for the OpenAI model." ) -async def extract_structured_data_using_llm(provider: str, api_token: str = None): +async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None): print(f"\n--- Extracting Structured Data with {provider} ---") if api_token is None and provider != "ollama": print(f"API token is required for {provider}. Skipping this example.") return + extra_args = {} + if extra_headers: + extra_args["extra_headers"] = extra_headers + async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( url="https://openai.com/api/pricing/", @@ -115,6 +119,7 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. Do not miss any models in the entire content. 
One extracted model JSON format should look like this: {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""", + extra_args=extra_args ), bypass_cache=True, ) @@ -414,9 +419,16 @@ async def main(): # LLM extraction examples await extract_structured_data_using_llm() - await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY")) await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) + await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY")) await extract_structured_data_using_llm("ollama/llama3.2") + + # You can always pass custom headers to the extraction strategy + custom_headers = { + "Authorization": "Bearer your-custom-token", + "X-Custom-Header": "Some-Value" + } + await extract_structured_data_using_llm(extra_headers=custom_headers) # await crawl_dynamic_content_pages_method_1() # await crawl_dynamic_content_pages_method_2() From b9bbd4237355afb3fcd6b8ebc407d8b61b84a21c Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 13 Oct 2024 14:37:45 +0800 Subject: [PATCH 5/8] Update Quickstart examples --- docs/examples/quickstart_async.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 9f00e323..27a162e3 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -10,6 +10,7 @@ import time import json import os import re +from typing import Dict from bs4 import BeautifulSoup from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler @@ -18,6 +19,8 @@ from crawl4ai.extraction_strategy import ( LLMExtractionStrategy, ) +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + print("Crawl4AI: Advanced Web Crawling and Data Extraction") print("GitHub Repository: https://github.com/unclecode/crawl4ai") 
print("Twitter: @unclecode") @@ -30,7 +33,7 @@ async def simple_crawl(): result = await crawler.arun(url="https://www.nbcnews.com/business") print(result.markdown[:500]) # Print first 500 characters -async def js_and_css(): +async def simple_example_with_running_js_code(): print("\n--- Executing JavaScript and Using CSS Selectors ---") # New code to handle the wait_for parameter wait_for = """() => { @@ -47,12 +50,21 @@ async def js_and_css(): result = await crawler.arun( url="https://www.nbcnews.com/business", js_code=js_code, - # css_selector="article.tease-card", # wait_for=wait_for, bypass_cache=True, ) print(result.markdown[:500]) # Print first 500 characters +async def simple_example_with_css_selector(): + print("\n--- Using CSS Selectors ---") + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + css_selector=".wide-tease-item__description", + bypass_cache=True, + ) + print(result.markdown[:500]) # Print first 500 characters + async def use_proxy(): print("\n--- Using a Proxy ---") print( @@ -66,7 +78,6 @@ async def use_proxy(): # ) # print(result.markdown[:500]) # Print first 500 characters - async def capture_and_save_screenshot(url: str, output_path: str): async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( @@ -413,8 +424,10 @@ async def speed_comparison(): async def main(): await simple_crawl() - await js_and_css() + await simple_example_with_running_js_code() + await simple_example_with_css_selector() await use_proxy() + await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) await extract_structured_data_using_css_extractor() # LLM extraction examples From 320afdea64f92c9a5942e901f4a9016ea7ab13f1 Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 14 Oct 2024 21:03:28 +0800 Subject: [PATCH 6/8] feat: Enhance crawler flexibility and LLM extraction capabilities - Add browser type 
selection (Chromium, Firefox, WebKit) - Implement iframe content extraction - Improve image processing and dimension updates - Add custom headers support in AsyncPlaywrightCrawlerStrategy - Enhance delayed content retrieval with new parameter - Optimize HTML sanitization and Markdown conversion - Update examples in quickstart_async.py for new features --- .gitignore | 3 +- crawl4ai/async_crawler_strategy.py | 125 ++++++++++++++++++- crawl4ai/content_scrapping_strategy.py | 13 +- crawl4ai/prompts.py | 4 +- crawl4ai/utils.py | 160 ++++++++++++------------- crawl4ai/web_crawler.py | 1 + docs/examples/quickstart_async.py | 25 ++++ 7 files changed, 238 insertions(+), 93 deletions(-) diff --git a/.gitignore b/.gitignore index 8b8f014c..e5718a14 100644 --- a/.gitignore +++ b/.gitignore @@ -203,4 +203,5 @@ git_changes.py git_changes.md pypi_build.sh -.tests/ \ No newline at end of file +.tests/ +git_changes.py \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index c74aff13..e9699953 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -50,7 +50,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") self.proxy = kwargs.get("proxy") self.headless = kwargs.get("headless", True) - self.headers = {} + self.browser_type = kwargs.get("browser_type", "chromium") # New parameter + self.headers = kwargs.get("headers", {}) self.sessions = {} self.session_ttl = 1800 self.js_code = js_code @@ -80,7 +81,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.browser is None: browser_args = { "headless": self.headless, - # "headless": False, "args": [ "--disable-gpu", "--disable-dev-shm-usage", @@ -95,7 +95,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): browser_args["proxy"] = 
proxy_settings - self.browser = await self.playwright.chromium.launch(**browser_args) + # Select the appropriate browser based on the browser_type + if self.browser_type == "firefox": + self.browser = await self.playwright.firefox.launch(**browser_args) + elif self.browser_type == "webkit": + self.browser = await self.playwright.webkit.launch(**browser_args) + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + await self.execute_hook('on_browser_created', self.browser) async def close(self): @@ -145,7 +152,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): for sid in expired_sessions: asyncio.create_task(self.kill_session(sid)) - async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): wait_for = wait_for.strip() @@ -209,6 +215,48 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Exception as e: raise RuntimeError(f"Error in wait condition: {str(e)}") + async def process_iframes(self, page): + # Find all iframes + iframes = await page.query_selector_all('iframe') + + for i, iframe in enumerate(iframes): + try: + # Add a unique identifier to the iframe + await iframe.evaluate(f'(element) => element.id = "iframe-{i}"') + + # Get the frame associated with this iframe + frame = await iframe.content_frame() + + if frame: + # Wait for the frame to load + await frame.wait_for_load_state('load', timeout=30000) # 30 seconds timeout + + # Extract the content of the iframe's body + iframe_content = await frame.evaluate('() => document.body.innerHTML') + + # Generate a unique class name for this iframe + class_name = f'extracted-iframe-content-{i}' + + # Replace the iframe with a div containing the extracted content + _iframe = iframe_content.replace('`', '\\`') + await page.evaluate(f""" + () => {{ + const iframe = document.getElementById('iframe-{i}'); + const div = document.createElement('div'); + div.innerHTML = `{_iframe}`; + div.className = '{class_name}'; + iframe.replaceWith(div); + }} 
+ """) + else: + print(f"Warning: Could not access content frame for iframe {i}") + except Exception as e: + print(f"Error processing iframe {i}: {str(e)}") + + # Return the page object + return page + + async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: response_headers = {} status_code = None @@ -263,6 +311,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): status_code = 200 response_headers = {} + await page.wait_for_selector('body') await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") @@ -305,11 +354,78 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if kwargs.get("screenshot"): screenshot_data = await self.take_screenshot(url) + + # New code to update image dimensions + update_image_dimensions_js = """ + () => { + return new Promise((resolve) => { + const filterImage = (img) => { + // Filter out images that are too small + if (img.width < 100 && img.height < 100) return false; + + // Filter out images that are not visible + const rect = img.getBoundingClientRect(); + if (rect.width === 0 || rect.height === 0) return false; + + // Filter out images with certain class names (e.g., icons, thumbnails) + if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false; + + // Filter out images with certain patterns in their src (e.g., placeholder images) + if (img.src.includes('placeholder') || img.src.includes('icon')) return false; + + return true; + }; + + const images = Array.from(document.querySelectorAll('img')).filter(filterImage); + let imagesLeft = images.length; + + if (imagesLeft === 0) { + resolve(); + return; + } + + const checkImage = (img) => { + if (img.complete && img.naturalWidth !== 0) { + img.setAttribute('width', img.naturalWidth); + img.setAttribute('height', img.naturalHeight); + imagesLeft--; + if (imagesLeft === 0) resolve(); + } + }; + + images.forEach(img => { + checkImage(img); + if (!img.complete) { + img.onload = () => { + checkImage(img); + }; + 
img.onerror = () => { + imagesLeft--; + if (imagesLeft === 0) resolve(); + }; + } + }); + + // Fallback timeout of 5 seconds + setTimeout(() => resolve(), 5000); + }); + } + """ + await page.evaluate(update_image_dimensions_js) + + # Wait a bit for any onload events to complete + await page.wait_for_timeout(100) + + # Process iframes + if kwargs.get("process_iframes", False): + page = await self.process_iframes(page) + await self.execute_hook('before_retrieve_html', page) # Check if delay_before_return_html is set then wait for that time delay_before_return_html = kwargs.get("delay_before_return_html") if delay_before_return_html: await asyncio.sleep(delay_before_return_html) + html = await page.content() await self.execute_hook('before_return_html', page, html) @@ -398,7 +514,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Error as e: raise Error(f"Failed to execute JavaScript or wait for condition in session {session_id}: {str(e)}") - async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: semaphore_count = kwargs.get('semaphore_count', calculate_semaphore_count()) semaphore = asyncio.Semaphore(semaphore_count) diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index afd75892..68f03412 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -16,8 +16,6 @@ from .utils import ( CustomHTML2Text ) - - class ContentScrappingStrategy(ABC): @abstractmethod def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: @@ -129,7 +127,7 @@ class WebScrappingStrategy(ContentScrappingStrategy): image_size = 0 #int(fetch_image_file_size(img,base_url) or 0) image_format = os.path.splitext(img.get('src',''))[1].lower() # Remove . 
from format - image_format = image_format.strip('.') + image_format = image_format.strip('.').split('?')[0] score = 0 if height_value: if height_unit == 'px' and height_value > 150: @@ -158,6 +156,7 @@ class WebScrappingStrategy(ContentScrappingStrategy): return None return { 'src': img.get('src', ''), + 'data-src': img.get('data-src', ''), 'alt': img.get('alt', ''), 'desc': find_closest_parent_with_useful_text(img), 'score': score, @@ -275,11 +274,14 @@ class WebScrappingStrategy(ContentScrappingStrategy): # Replace base64 data with empty string img['src'] = base64_pattern.sub('', src) cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ') - cleaned_html = sanitize_html(cleaned_html) h = CustomHTML2Text() h.ignore_links = True - markdown = h.handle(cleaned_html) + h.body_width = 0 + try: + markdown = h.handle(cleaned_html) + except Exception as e: + markdown = h.handle(sanitize_html(cleaned_html)) markdown = markdown.replace(' ```', '```') try: @@ -288,6 +290,7 @@ class WebScrappingStrategy(ContentScrappingStrategy): print('Error extracting metadata:', str(e)) meta = {} + cleaned_html = sanitize_html(cleaned_html) return { 'markdown': markdown, 'cleaned_html': cleaned_html, diff --git a/crawl4ai/prompts.py b/crawl4ai/prompts.py index a55d6fca..7a963e6d 100644 --- a/crawl4ai/prompts.py +++ b/crawl4ai/prompts.py @@ -1,4 +1,4 @@ -PROMPT_EXTRACT_BLOCKS = """YHere is the URL of the webpage: +PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage: {URL} And here is the cleaned HTML content of that webpage: @@ -79,7 +79,7 @@ To generate the JSON objects: 2. For each block: a. Assign it an index based on its order in the content. b. Analyze the content and generate ONE semantic tag that describe what the block is about. - c. Extract the text content, EXACTLY SAME AS GIVE DATA, clean it up if needed, and store it as a list of strings in the "content" field. + c. 
Extract the text content, EXACTLY SAME AS THE GIVE DATA, clean it up if needed, and store it as a list of strings in the "content" field. 3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content. diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 77671a20..efb5d79b 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -131,7 +131,7 @@ def split_and_parse_json_objects(json_string): return parsed_objects, unparsed_segments def sanitize_html(html): - # Replace all weird and special characters with an empty string + # Replace all unwanted and special characters with an empty string sanitized_html = html # sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html) @@ -301,7 +301,7 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, if tag.name != 'img': tag.attrs = {} - # Extract all img tgas inti [{src: '', alt: ''}] + # Extract all img tgas int0 [{src: '', alt: ''}] media = { 'images': [], 'videos': [], @@ -339,7 +339,7 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, img.decompose() - # Create a function that replace content of all"pre" tage with its inner text + # Create a function that replace content of all"pre" tag with its inner text def replace_pre_tags_with_text(node): for child in node.find_all('pre'): # set child inner html to its text @@ -502,7 +502,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: current_tag = tag while current_tag: current_tag = current_tag.parent - # Get the text content of the parent tag + # Get the text content from the parent tag if current_tag: text_content = current_tag.get_text(separator=' ',strip=True) # Check if the text content has at least word_count_threshold @@ -511,88 +511,88 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: return None def process_image(img, url, index, total_images): - #Check if an image 
has valid display and inside undesired html elements - def is_valid_image(img, parent, parent_classes): - style = img.get('style', '') - src = img.get('src', '') - classes_to_check = ['button', 'icon', 'logo'] - tags_to_check = ['button', 'input'] - return all([ - 'display:none' not in style, - src, - not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check), - parent.name not in tags_to_check - ]) + #Check if an image has valid display and inside undesired html elements + def is_valid_image(img, parent, parent_classes): + style = img.get('style', '') + src = img.get('src', '') + classes_to_check = ['button', 'icon', 'logo'] + tags_to_check = ['button', 'input'] + return all([ + 'display:none' not in style, + src, + not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check), + parent.name not in tags_to_check + ]) - #Score an image for it's usefulness - def score_image_for_usefulness(img, base_url, index, images_count): - # Function to parse image height/width value and units - def parse_dimension(dimension): - if dimension: - match = re.match(r"(\d+)(\D*)", dimension) - if match: - number = int(match.group(1)) - unit = match.group(2) or 'px' # Default unit is 'px' if not specified - return number, unit - return None, None + #Score an image for it's usefulness + def score_image_for_usefulness(img, base_url, index, images_count): + # Function to parse image height/width value and units + def parse_dimension(dimension): + if dimension: + match = re.match(r"(\d+)(\D*)", dimension) + if match: + number = int(match.group(1)) + unit = match.group(2) or 'px' # Default unit is 'px' if not specified + return number, unit + return None, None - # Fetch image file metadata to extract size and extension - def fetch_image_file_size(img, base_url): - #If src is relative path construct full URL, if not it may be CDN URL - img_url = urljoin(base_url,img.get('src')) - try: - response = 
requests.head(img_url) - if response.status_code == 200: - return response.headers.get('Content-Length',None) - else: - print(f"Failed to retrieve file size for {img_url}") - return None - except InvalidSchema as e: + # Fetch image file metadata to extract size and extension + def fetch_image_file_size(img, base_url): + #If src is relative path construct full URL, if not it may be CDN URL + img_url = urljoin(base_url,img.get('src')) + try: + response = requests.head(img_url) + if response.status_code == 200: + return response.headers.get('Content-Length',None) + else: + print(f"Failed to retrieve file size for {img_url}") return None - finally: - return + except InvalidSchema as e: + return None + finally: + return - image_height = img.get('height') - height_value, height_unit = parse_dimension(image_height) - image_width = img.get('width') - width_value, width_unit = parse_dimension(image_width) - image_size = 0 #int(fetch_image_file_size(img,base_url) or 0) - image_format = os.path.splitext(img.get('src',''))[1].lower() - # Remove . from format - image_format = image_format.strip('.') - score = 0 - if height_value: - if height_unit == 'px' and height_value > 150: - score += 1 - if height_unit in ['%','vh','vmin','vmax'] and height_value >30: - score += 1 - if width_value: - if width_unit == 'px' and width_value > 150: - score += 1 - if width_unit in ['%','vh','vmin','vmax'] and width_value >30: - score += 1 - if image_size > 10000: + image_height = img.get('height') + height_value, height_unit = parse_dimension(image_height) + image_width = img.get('width') + width_value, width_unit = parse_dimension(image_width) + image_size = 0 #int(fetch_image_file_size(img,base_url) or 0) + image_format = os.path.splitext(img.get('src',''))[1].lower() + # Remove . 
from format + image_format = image_format.strip('.') + score = 0 + if height_value: + if height_unit == 'px' and height_value > 150: score += 1 - if img.get('alt') != '': - score+=1 - if any(image_format==format for format in ['jpg','png','webp']): - score+=1 - if index/images_count<0.5: - score+=1 - return score + if height_unit in ['%','vh','vmin','vmax'] and height_value >30: + score += 1 + if width_value: + if width_unit == 'px' and width_value > 150: + score += 1 + if width_unit in ['%','vh','vmin','vmax'] and width_value >30: + score += 1 + if image_size > 10000: + score += 1 + if img.get('alt') != '': + score+=1 + if any(image_format==format for format in ['jpg','png','webp']): + score+=1 + if index/images_count<0.5: + score+=1 + return score - if not is_valid_image(img, img.parent, img.parent.get('class', [])): - return None - score = score_image_for_usefulness(img, url, index, total_images) - if score <= IMAGE_SCORE_THRESHOLD: - return None - return { - 'src': img.get('src', ''), - 'alt': img.get('alt', ''), - 'desc': find_closest_parent_with_useful_text(img), - 'score': score, - 'type': 'image' - } + if not is_valid_image(img, img.parent, img.parent.get('class', [])): + return None + score = score_image_for_usefulness(img, url, index, total_images) + if score <= IMAGE_SCORE_THRESHOLD: + return None + return { + 'src': img.get('src', '').replace('\\"', '"').strip(), + 'alt': img.get('alt', ''), + 'desc': find_closest_parent_with_useful_text(img), + 'score': score, + 'type': 'image' + } def process_element(element: element.PageElement) -> bool: try: diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 7dea56ca..20e9b04e 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -12,6 +12,7 @@ from typing import List from concurrent.futures import ThreadPoolExecutor from .config import * import warnings +import json warnings.filterwarnings("ignore", message='Field "model_name" has conflict with protected namespace "model_".') diff 
--git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 27a162e3..f6c16a4e 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -357,6 +357,28 @@ async def crawl_dynamic_content_pages_method_3(): await crawler.crawler_strategy.kill_session(session_id) print(f"Successfully crawled {len(all_commits)} commits across 3 pages") +async def crawl_custom_browser_type(): + # Use Firefox + start = time.time() + async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler: + result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + print(result.markdown[:500]) + print("Time taken: ", time.time() - start) + + # Use WebKit + start = time.time() + async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler: + result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + print(result.markdown[:500]) + print("Time taken: ", time.time() - start) + + # Use Chromium (default) + start = time.time() + async with AsyncWebCrawler(verbose=True, headless = True) as crawler: + result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + print(result.markdown[:500]) + print("Time taken: ", time.time() - start) + async def speed_comparison(): # print("\n--- Speed Comparison ---") # print("Firecrawl (simulated):") @@ -446,6 +468,9 @@ async def main(): # await crawl_dynamic_content_pages_method_1() # await crawl_dynamic_content_pages_method_2() await crawl_dynamic_content_pages_method_3() + + await crawl_custom_browser_type() + await speed_comparison() From 6aa803d712a44c2144bb61a515b09e2815d1ac6d Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 14 Oct 2024 21:03:40 +0800 Subject: [PATCH 7/8] Update gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index e5718a14..c6ac6610 100644 --- a/.gitignore +++ b/.gitignore @@ -204,4 +204,5 @@ 
git_changes.md pypi_build.sh .tests/ -git_changes.py \ No newline at end of file +git_changes.py +git_changes.md \ No newline at end of file From 2b73bdf6b09585fc52bf20b4e88f9eae8159135d Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 14 Oct 2024 21:04:02 +0800 Subject: [PATCH 8/8] Update changelog --- CHANGELOG.md | 114 ++++++++++++++++++++++++++------------------------- 1 file changed, 58 insertions(+), 56 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 197fa32b..a377d794 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## [v0.3.6] - 2024-10-12 - Part 1 +## [v0.3.6] - 2024-10-12 ### 1. Improved Crawling Control - **New Hook**: Added `before_retrieve_html` hook in `AsyncPlaywrightCrawlerStrategy`. @@ -8,73 +8,75 @@ - Useful for pages with delayed content loading. - **Flexible Timeout**: `smart_wait` function now uses `page_timeout` (default 60 seconds) instead of a fixed 30-second timeout. - Provides better handling for slow-loading pages. - -### 2. Enhanced LLM Extraction Strategy -- **Custom Arguments**: Added support for passing extra arguments to LLM providers via `extra_args` parameter. -- **Custom Headers**: Users can now pass custom headers to the extraction strategy. - - Enables more flexibility when interacting with different LLM APIs. - -### 3. AsyncWebCrawler Improvements -- **Flexible Initialization**: `AsyncWebCrawler` now accepts arbitrary keyword arguments. - - These are passed directly to the crawler strategy, allowing for more customized setups. - -### 4. Utility Function Enhancements -- **Improved API Interaction**: `perform_completion_with_backoff` function now supports additional arguments. - - Allows for more customized API calls to LLM providers. - -## Examples and Documentation -- Updated `quickstart_async.py` with examples of using custom headers in LLM extraction. -- Added more diverse examples of LLM provider usage, including OpenAI, Hugging Face, and Ollama. 
- -## Developer Notes -- Refactored code for better maintainability and flexibility. -- Enhanced error handling and logging for improved debugging experience. - -## [v0.3.6] - 2024-10-12 - Part 2 - -### 1. Screenshot Capture -- **What's new**: Added ability to capture screenshots during crawling. -- **Why it matters**: You can now visually verify the content of crawled pages, which is useful for debugging and content verification. -- **How to use**: Set `screenshot=True` when calling `crawler.arun()`. - -### 2. Delayed Content Retrieval -- **What's new**: Introduced `get_delayed_content` method in `AsyncCrawlResponse`. -- **Why it matters**: Allows you to retrieve content after a specified delay, useful for pages that load content dynamically. -- **How to use**: Access `result.get_delayed_content(delay_in_seconds)` after crawling. - -### 3. Custom Page Timeout -- **What's new**: Added `page_timeout` parameter to control page load timeout. -- **Why it matters**: Gives you more control over crawling behavior, especially for slow-loading pages. - **How to use**: Set `page_timeout=your_desired_timeout` (in milliseconds) when calling `crawler.arun()`. -### 4. Enhanced LLM Support -- **What's new**: Added support for multiple LLM providers (OpenAI, Hugging Face, Ollama). -- **Why it matters**: Provides more flexibility in choosing AI models for content extraction. -- **How to use**: Specify the desired provider when using `LLMExtractionStrategy`. +### 2. Browser Type Selection +- Added support for different browser types (Chromium, Firefox, WebKit). +- Users can now specify the browser type when initializing AsyncWebCrawler. +- **How to use**: Set `browser_type="firefox"` or `browser_type="webkit"` when initializing AsyncWebCrawler. -## Improvements +### 3. Screenshot Capture +- Added ability to capture screenshots during crawling. +- Useful for debugging and content verification. +- **How to use**: Set `screenshot=True` when calling `crawler.arun()`. -### 1. 
Database Schema Auto-updates -- **What's new**: Automatic database schema updates. -- **Why it matters**: Ensures your database stays compatible with the latest version without manual intervention. +### 4. Enhanced LLM Extraction Strategy +- Added support for multiple LLM providers (OpenAI, Hugging Face, Ollama). +- **Custom Arguments**: Added support for passing extra arguments to LLM providers via `extra_args` parameter. +- **Custom Headers**: Users can now pass custom headers to the extraction strategy. +- **How to use**: Specify the desired provider and custom arguments when using `LLMExtractionStrategy`. -### 2. Enhanced Error Handling -- **What's new**: Improved error messages and logging. -- **Why it matters**: Makes debugging easier with more informative error messages. +### 5. iframe Content Extraction +- New feature to process and extract content from iframes. +- **How to use**: Set `process_iframes=True` in the crawl method. -### 3. Optimized Image Processing -- **What's new**: Refined image handling in `WebScrappingStrategy`. -- **Why it matters**: Improves the accuracy of content extraction for pages with images. +### 6. Delayed Content Retrieval +- Introduced `get_delayed_content` method in `AsyncCrawlResponse`. +- Allows retrieval of content after a specified delay, useful for dynamically loaded content. +- **How to use**: Access `result.get_delayed_content(delay_in_seconds)` after crawling. + +## Improvements and Optimizations + +### 1. AsyncWebCrawler Enhancements +- **Flexible Initialization**: Now accepts arbitrary keyword arguments, passed directly to the crawler strategy. +- Allows for more customized setups. + +### 2. Image Processing Optimization +- Enhanced image handling in WebScrappingStrategy. +- Added filtering for small, invisible, or irrelevant images. +- Improved image scoring system for better content relevance. +- Implemented JavaScript-based image dimension updating for more accurate representation. + +### 3. 
Database Schema Auto-updates +- Automatic database schema updates ensure compatibility with the latest version. + +### 4. Enhanced Error Handling and Logging +- Improved error messages and logging for easier debugging. + +### 5. Content Extraction Refinements +- Refined HTML sanitization process. +- Improved handling of base64 encoded images. +- Enhanced Markdown conversion process. +- Optimized content extraction algorithms. + +### 6. Utility Function Enhancements +- `perform_completion_with_backoff` function now supports additional arguments for more customized API calls to LLM providers. ## Bug Fixes - - Fixed an issue where image tags were being prematurely removed during content extraction. +## Examples and Documentation +- Updated `quickstart_async.py` with examples of: + - Using custom headers in LLM extraction. + - Different LLM provider usage (OpenAI, Hugging Face, Ollama). + - Custom browser type usage. + ## Developer Notes +- Refactored code for better maintainability, flexibility, and performance. +- Enhanced type hinting throughout the codebase for improved development experience. +- Expanded error handling for more robust operation. -- Added examples for using different LLM providers in `quickstart_async.py`. -- Enhanced type hinting throughout the codebase for better development experience. - +These updates significantly enhance the flexibility, accuracy, and robustness of crawl4ai, providing users with more control and options for their web crawling and content extraction tasks. ## [v0.3.5] - 2024-09-02