From 7c1705712dddc0d80ad33fdacc6e37e9272d83aa Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 1 Mar 2025 18:17:11 +0530 Subject: [PATCH 01/36] fix: https://github.com/unclecode/crawl4ai/issues/756 --- crawl4ai/content_scraping_strategy.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 46761013..719cab8e 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -471,6 +471,9 @@ class WebScrapingStrategy(ContentScrapingStrategy): return False keep_element = False + # Special case for table elements - always preserve structure + if element.name in ["tr", "td", "th"]: + keep_element = True exclude_domains = kwargs.get("exclude_domains", []) # exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS)) @@ -1130,6 +1133,9 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): "source", "track", "wbr", + "tr", + "td", + "th", } for el in reversed(list(root.iterdescendants())): From 5edfea279d6add5a2a2914f862a5d6af67e7b6b5 Mon Sep 17 00:00:00 2001 From: jawshoeadan <62785552+jawshoeadan@users.noreply.github.com> Date: Sun, 2 Mar 2025 16:58:00 +0100 Subject: [PATCH 02/36] Fix LiteLLM branding and link --- docs/md_v2/extraction/llm-strategies.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/md_v2/extraction/llm-strategies.md b/docs/md_v2/extraction/llm-strategies.md index dc2dba1a..d1f68239 100644 --- a/docs/md_v2/extraction/llm-strategies.md +++ b/docs/md_v2/extraction/llm-strategies.md @@ -2,7 +2,7 @@ In some cases, you need to extract **complex or unstructured** information from a webpage that a simple CSS/XPath schema cannot easily parse. Or you want **AI**-driven insights, classification, or summarization. For these scenarios, Crawl4AI provides an **LLM-based extraction strategy** that: -1. Works with **any** large language model supported by [LightLLM](https://github.com/LightLLM) (Ollama, OpenAI, Claude, and more). +1. Works with **any** large language model supported by [LiteLLM](https://github.com/BerriAI/litellm) (Ollama, OpenAI, Claude, and more). 2. Automatically splits content into chunks (if desired) to handle token limits, then combines results. 3. Lets you define a **schema** (like a Pydantic model) or a simpler “block” extraction approach. @@ -18,9 +18,9 @@ In some cases, you need to extract **complex or unstructured** information from --- -## 2. Provider-Agnostic via LightLLM +## 2. Provider-Agnostic via LiteLLM -Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LightLLM supports is fair game. You just provide: +Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LiteLLM supports is fair game. You just provide: - **`provider`**: The `/` identifier (e.g., `"openai/gpt-4"`, `"ollama/llama2"`, `"huggingface/google-flan"`, etc.). - **`api_token`**: If needed (for OpenAI, HuggingFace, etc.); local models or Ollama might not require it. @@ -288,7 +288,7 @@ if __name__ == "__main__": ## 11. Conclusion -**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LightLLM. It’s perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it’s **slower** and potentially costlier than schema-based approaches. 
Keep these tips in mind: +**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LiteLLM. It’s perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it’s **slower** and potentially costlier than schema-based approaches. Keep these tips in mind: - Put your LLM strategy **in `CrawlerRunConfig`**. - Use **`input_format`** to pick which form (markdown, HTML, fit_markdown) the LLM sees. @@ -319,4 +319,4 @@ If your site’s data is consistent or repetitive, consider [`JsonCssExtractionS --- -That’s it for **Extracting JSON (LLM)**—now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling! \ No newline at end of file +That’s it for **Extracting JSON (LLM)**—now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling! From 1e819cdb2663d93d3d204760c107182a58d9c77c Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Mon, 3 Mar 2025 11:53:15 +0530 Subject: [PATCH 03/36] fixes: https://github.com/unclecode/crawl4ai/issues/774 --- docs/md_v2/api/parameters.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index ed3828c8..b8a1a213 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -230,6 +230,7 @@ async def main(): if __name__ == "__main__": asyncio.run(main()) +``` ## 2.4 Compliance & Ethics From 504207faa61c8b52f8e9e781529248a898288310 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Mon, 3 Mar 2025 19:24:44 +0530 Subject: [PATCH 04/36] docs: update text in llm-strategies.md to reflect new changes in LlmConfig --- docs/md_v2/extraction/llm-strategies.md | 32 ++++++++++++++----------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/docs/md_v2/extraction/llm-strategies.md b/docs/md_v2/extraction/llm-strategies.md index 4effb74b..d40be2db 100644 --- a/docs/md_v2/extraction/llm-strategies.md +++ b/docs/md_v2/extraction/llm-strategies.md @@ -20,11 +20,17 @@ In some cases, you need to extract **complex or unstructured** information from ## 2. Provider-Agnostic via LiteLLM +You can use LlmConfig, to quickly configure multiple variations of LLMs and experiment with them to find the optimal one for your use case. You can read more about LlmConfig [here](/api/parameters). + +```python +llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")) +``` + Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LiteLLM supports is fair game. You just provide: - **`provider`**: The `/` identifier (e.g., `"openai/gpt-4"`, `"ollama/llama2"`, `"huggingface/google-flan"`, etc.). - **`api_token`**: If needed (for OpenAI, HuggingFace, etc.); local models or Ollama might not require it. -- **`api_base`** (optional): If your provider has a custom endpoint. +- **`base_url`** (optional): If your provider has a custom endpoint. This means you **aren’t locked** into a single LLM vendor. Switch or experiment easily. @@ -52,20 +58,19 @@ For structured data, `"schema"` is recommended. You provide `schema=YourPydantic Below is an overview of important LLM extraction parameters. All are typically set inside `LLMExtractionStrategy(...)`. You then put that strategy in your `CrawlerRunConfig(..., extraction_strategy=...)`. -1. **`provider`** (str): e.g., `"openai/gpt-4"`, `"ollama/llama2"`. -2. **`api_token`** (str): The API key or token for that model. 
May not be needed for local models. -3. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`. -4. **`extraction_type`** (str): `"schema"` or `"block"`. -5. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.” -6. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM. -7. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity. -8. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`. -9. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include: +1. **`llmConfig`** (LlmConfig): e.g., `"openai/gpt-4"`, `"ollama/llama2"`. +2. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`. +3. **`extraction_type`** (str): `"schema"` or `"block"`. +4. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.” +5. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM. +6. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity. +7. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`. +8. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include: - `"markdown"`: The raw markdown (default). - `"fit_markdown"`: The filtered “fit” markdown if you used a content filter. - `"html"`: The cleaned or raw HTML. -10. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc. -11. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known). +9. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc. +10. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known). **Example**: @@ -233,8 +238,7 @@ class KnowledgeGraph(BaseModel): async def main(): # LLM extraction strategy llm_strat = LLMExtractionStrategy( - provider="openai/gpt-4", - api_token=os.getenv('OPENAI_API_KEY'), + llmConfig = LlmConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')), schema=KnowledgeGraph.schema_json(), extraction_type="schema", instruction="Extract entities and relationships from the content. 
Return valid JSON.", From 341b7a5f2a4ff900242b7847389d7f6caf28fe2e Mon Sep 17 00:00:00 2001 From: dvschuyl Date: Tue, 11 Mar 2025 11:05:14 +0100 Subject: [PATCH 05/36] =?UTF-8?q?=F0=9F=90=9B=20Truncate=20width=20to=20in?= =?UTF-8?q?teger=20string=20in=20parse=5Fsrcset?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crawl4ai/content_scraping_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 46761013..a7c51dd0 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -48,7 +48,7 @@ def parse_srcset(s: str) -> List[Dict]: if len(parts) >= 1: url = parts[0] width = ( - parts[1].rstrip("w") + parts[1].rstrip("w").split('.')[0] if len(parts) > 1 and parts[1].endswith("w") else None ) From a3954dd4c69a73ec1561e0dd695a72cfcd13abf7 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 14 Mar 2025 09:39:10 +0530 Subject: [PATCH 06/36] refactor: Move the checking of protocol and prepending protocol inside api handlers --- deploy/docker/api.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/deploy/docker/api.py b/deploy/docker/api.py index cc103905..c5700a9e 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -48,6 +48,8 @@ async def handle_llm_qa( ) -> str: """Process QA using LLM with crawled content as context.""" try: + if not url.startswith(('http://', 'https://')): + url = 'https://' + url # Extract base URL by finding last '?q=' occurrence last_q_index = url.rfind('?q=') if last_q_index != -1: @@ -61,7 +63,7 @@ async def handle_llm_qa( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=result.error_message ) - content = result.markdown.fit_markdown + content = result.markdown.fit_markdown or result.markdown.raw_markdown # Create prompt and get LLM response prompt = f"""Use the following content as context to answer the question. 
@@ -377,6 +379,7 @@ async def handle_crawl_request( ) -> dict: """Handle non-streaming crawl requests.""" try: + urls = [('https://' + url) if not url.startswith(('http://', 'https://')) else url for url in urls] browser_config = BrowserConfig.load(browser_config) crawler_config = CrawlerRunConfig.load(crawler_config) From c190ba816d88753bb0bc927a8225898b7c3e9de6 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 14 Mar 2025 09:40:50 +0530 Subject: [PATCH 07/36] refactor: Instead of custom validation of question, rely on the built in FastAPI validator, so generated API docs also reflects this expectation correctly --- deploy/docker/server.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/deploy/docker/server.py b/deploy/docker/server.py index edb55130..40df17d5 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -101,13 +101,9 @@ async def get_markdown( async def llm_endpoint( request: Request, url: str = Path(...), - q: Optional[str] = Query(None), + q: str = Query(...), token_data: Optional[Dict] = Depends(token_dependency) ): - if not q: - raise HTTPException(status_code=400, detail="Query parameter 'q' is required") - if not url.startswith(('http://', 'https://')): - url = 'https://' + url try: answer = await handle_llm_qa(url, q, config) return JSONResponse({"answer": answer}) @@ -136,7 +132,6 @@ async def crawl( ): if not crawl_request.urls: raise HTTPException(status_code=400, detail="At least one URL required") - results = await handle_crawl_request( urls=crawl_request.urls, browser_config=crawl_request.browser_config, From 9109ecd8fc50ce9c9b87bd8e58aa863648556f82 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 18 Mar 2025 15:26:20 +0530 Subject: [PATCH 08/36] chore: Raise an exception with clear messaging when body tag is missing in the fetched html. The message should warn users to add appropriate wait_for condition to wait until body tag is loaded into DOM. fixes: https://github.com/unclecode/crawl4ai/issues/804 --- crawl4ai/content_scraping_strategy.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index ef622abe..215e7cda 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -862,6 +862,8 @@ class WebScrapingStrategy(ContentScrapingStrategy): parser_type = kwargs.get("parser", "lxml") soup = BeautifulSoup(html, parser_type) body = soup.body + if body is None: + raise Exception("'' tag is not found in fetched html. Consider adding wait_for=\"css:body\" to wait for body tag to be loaded into DOM.") base_domain = get_base_domain(url) try: From 529a79725e267e0abd119482bc498d74a414176d Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 18 Mar 2025 16:14:00 +0530 Subject: [PATCH 09/36] docs: remove hallucinations from docs for CrawlerRunConfig + Add chunking strategy docs in the table --- docs/md_v2/api/parameters.md | 3 ++- docs/md_v2/core/browser-crawler-config.md | 26 ----------------------- 2 files changed, 2 insertions(+), 27 deletions(-) diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index b3e4349b..7e615a8c 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -69,7 +69,8 @@ We group them by category. 
| **Parameter** | **Type / Default** | **What It Does** | |------------------------------|--------------------------------------|-------------------------------------------------------------------------------------------------| | **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. | -| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). | +| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). +| **`chunking_strategy`** | `ChunkingStrategy` (default: RegexChunking) | If set, extracts structured data (CSS-based, LLM-based, etc.). | | **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). | | **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. | | **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. | diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md index 0d97e0fc..a080fca3 100644 --- a/docs/md_v2/core/browser-crawler-config.md +++ b/docs/md_v2/core/browser-crawler-config.md @@ -136,11 +136,6 @@ class CrawlerRunConfig: wait_for=None, screenshot=False, pdf=False, - enable_rate_limiting=False, - rate_limit_config=None, - memory_threshold_percent=70.0, - check_interval=1.0, - max_session_permit=20, display_mode=None, verbose=True, stream=False, # Enable streaming for arun_many() @@ -183,25 +178,7 @@ class CrawlerRunConfig: - Logs additional runtime details. - Overlaps with the browser’s verbosity if also set to `True` in `BrowserConfig`. -9. **`enable_rate_limiting`**: - - If `True`, enables rate limiting for batch processing. - - Requires `rate_limit_config` to be set. -10. **`memory_threshold_percent`**: - - The memory threshold (as a percentage) to monitor. - - If exceeded, the crawler will pause or slow down. - -11. **`check_interval`**: - - The interval (in seconds) to check system resources. - - Affects how often memory and CPU usage are monitored. - -12. **`max_session_permit`**: - - The maximum number of concurrent crawl sessions. - - Helps prevent overwhelming the system. - -13. **`display_mode`**: - - The display mode for progress information (`DETAILED`, `BRIEF`, etc.). - - Affects how much information is printed during the crawl. ### Helper Methods @@ -236,9 +213,6 @@ The `clone()` method: --- - - - ## 3. LLMConfig Essentials ### Key fields to note From 4359b1200377d86af3cd10fa98f91cf599b16d6a Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 18 Mar 2025 17:20:24 +0530 Subject: [PATCH 10/36] docs + fix: Update example for full page screenshot & PDF export. Fix the bug Error: crawl4ai.async_webcrawler.AsyncWebCrawler.aprocess_html() got multiple values for keyword argument - for screenshot param. 
https://github.com/unclecode/crawl4ai/issues/822#issuecomment-2732602118 --- crawl4ai/async_webcrawler.py | 10 +++------- .../full_page_screenshot_and_pdf_export.md | 16 +++++++++------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 430e26a0..3aa7701a 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -398,7 +398,7 @@ class AsyncWebCrawler: html=html, extracted_content=extracted_content, config=config, # Pass the config object instead of individual parameters - screenshot=screenshot_data, + screenshot_data=screenshot_data, pdf_data=pdf_data, verbose=config.verbose, is_raw_html=True if url.startswith("raw:") else False, @@ -482,7 +482,7 @@ class AsyncWebCrawler: html: str, extracted_content: str, config: CrawlerRunConfig, - screenshot: str, + screenshot_data: str, pdf_data: str, verbose: bool, **kwargs, @@ -495,7 +495,7 @@ class AsyncWebCrawler: html: Raw HTML content extracted_content: Previously extracted content (if any) config: Configuration object controlling processing behavior - screenshot: Screenshot data (if any) + screenshot_data: Screenshot data (if any) pdf_data: PDF data (if any) verbose: Whether to enable verbose logging **kwargs: Additional parameters for backwards compatibility @@ -620,10 +620,6 @@ class AsyncWebCrawler: params={"url": _url, "timing": time.perf_counter() - t1}, ) - # Handle screenshot and PDF data - screenshot_data = None if not screenshot else screenshot - pdf_data = None if not pdf_data else pdf_data - # Apply HTML formatting if requested if config.prettiify: cleaned_html = fast_format_html(cleaned_html) diff --git a/docs/examples/full_page_screenshot_and_pdf_export.md b/docs/examples/full_page_screenshot_and_pdf_export.md index 8522675c..bf11f8db 100644 --- a/docs/examples/full_page_screenshot_and_pdf_export.md +++ b/docs/examples/full_page_screenshot_and_pdf_export.md @@ -12,9 +12,10 @@ We’ve introduced a new feature that effortlessly handles even the biggest page **Simple Example:** ```python -import os, sys +import os +import sys import asyncio -from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig # Adjust paths as needed parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -26,9 +27,11 @@ async def main(): # Request both PDF and screenshot result = await crawler.arun( url='https://en.wikipedia.org/wiki/List_of_common_misconceptions', - cache_mode=CacheMode.BYPASS, - pdf=True, - screenshot=True + config=CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + pdf=True, + screenshot=True + ) ) if result.success: @@ -40,9 +43,8 @@ async def main(): # Save PDF if result.pdf: - pdf_bytes = b64decode(result.pdf) with open(os.path.join(__location__, "page.pdf"), "wb") as f: - f.write(pdf_bytes) + f.write(result.pdf) if __name__ == "__main__": asyncio.run(main()) From eedda1ae5ca0fa38ee72fa424a7255bab698efc3 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 20 Mar 2025 18:56:19 +0530 Subject: [PATCH 11/36] fix: Truncate long urls in middle than end since users are confused that same url is being scraped several times. 
Also remove labels on status and timer to be replaced with symbols to save space and display more URL --- crawl4ai/async_logger.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py index 6f89c217..c733c31a 100644 --- a/crawl4ai/async_logger.py +++ b/crawl4ai/async_logger.py @@ -37,11 +37,11 @@ class AsyncLoggerBase(ABC): pass @abstractmethod - def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50): + def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 100): pass @abstractmethod - def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50): + def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100): pass class AsyncLogger(AsyncLoggerBase): @@ -110,6 +110,14 @@ class AsyncLogger(AsyncLoggerBase): def _get_icon(self, tag: str) -> str: """Get the icon for a tag, defaulting to info icon if not found.""" return self.icons.get(tag, self.icons["INFO"]) + + def _shorten(self, text, length, placeholder="..."): + """Truncate text in the middle if longer than length, or pad if shorter.""" + if len(text) <= length: + return text.ljust(length) # Pad with spaces to reach desired length + half = (length - len(placeholder)) // 2 + shortened = text[:half] + placeholder + text[-half:] + return shortened.ljust(length) # Also pad shortened text to consistent length def _write_to_file(self, message: str): """Write a message to the log file if configured.""" @@ -210,7 +218,7 @@ class AsyncLogger(AsyncLoggerBase): success: bool, timing: float, tag: str = "FETCH", - url_length: int = 50, + url_length: int = 100, ): """ Convenience method for logging URL fetch status. @@ -224,12 +232,11 @@ class AsyncLogger(AsyncLoggerBase): """ self._log( level=LogLevel.SUCCESS if success else LogLevel.ERROR, - message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s", + message="{url} | {status} | ⏱: {timing:.2f}s", tag=tag, params={ - "url": url, - "url_length": url_length, - "status": success, + "url": self._shorten(url, url_length), + "status": "✓" if success else "✗", "timing": timing, }, colors={ @@ -252,9 +259,9 @@ class AsyncLogger(AsyncLoggerBase): """ self._log( level=LogLevel.ERROR, - message="{url:.{url_length}}... | Error: {error}", + message="{url} | Error: {error}", tag=tag, - params={"url": url, "url_length": url_length, "error": error}, + params={"url": self.shorten(url,url_length), "error": error}, ) class AsyncFileLogger(AsyncLoggerBase): @@ -298,13 +305,13 @@ class AsyncFileLogger(AsyncLoggerBase): """Log an error message to file.""" self._write_to_file("ERROR", message, tag) - def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50): + def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 100): """Log URL fetch status to file.""" status = "SUCCESS" if success else "FAILED" message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s" self._write_to_file("URL_STATUS", message, tag) - def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50): + def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100): """Log error status to file.""" message = f"{url[:url_length]}... 
| Error: {error}" self._write_to_file("ERROR", message, tag) From ac2f9ae533b7560f057d8558ff84c8fca4f647ee Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 20 Mar 2025 18:59:15 +0530 Subject: [PATCH 12/36] fix: streamline url status logging via single entrypoint i.e. logger.url_status --- crawl4ai/async_webcrawler.py | 158 ++++++++++++++++++++--------------- deps.txt | 115 +++++++++++++++++++++++++ 2 files changed, 205 insertions(+), 68 deletions(-) create mode 100644 deps.txt diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index a6374e89..98111e4b 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -10,12 +10,17 @@ import asyncio # from contextlib import nullcontext, asynccontextmanager from contextlib import asynccontextmanager -from .models import CrawlResult, MarkdownGenerationResult, DispatchResult, ScrapingResult +from .models import ( + CrawlResult, + MarkdownGenerationResult, + DispatchResult, + ScrapingResult, +) from .async_database import async_db_manager from .chunking_strategy import * # noqa: F403 from .chunking_strategy import IdentityChunking from .content_filter_strategy import * # noqa: F403 -from .extraction_strategy import * # noqa: F403 +from .extraction_strategy import * # noqa: F403 from .extraction_strategy import NoExtractionStrategy from .async_crawler_strategy import ( AsyncCrawlerStrategy, @@ -30,7 +35,7 @@ from .markdown_generation_strategy import ( from .deep_crawling import DeepCrawlDecorator from .async_logger import AsyncLogger, AsyncLoggerBase from .async_configs import BrowserConfig, CrawlerRunConfig -from .async_dispatcher import * # noqa: F403 +from .async_dispatcher import * # noqa: F403 from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter from .utils import ( @@ -44,9 +49,10 @@ from .utils import ( from typing import Union, AsyncGenerator -CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult) +CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult) # RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] + class CrawlResultContainer(Generic[CrawlResultT]): def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]): # Normalize to a list @@ -68,20 +74,21 @@ class CrawlResultContainer(Generic[CrawlResultT]): # Delegate attribute access to the first element. if self._results: return getattr(self._results[0], attr) - raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'") + raise AttributeError( + f"{self.__class__.__name__} object has no attribute '{attr}'" + ) def __repr__(self): return f"{self.__class__.__name__}({self._results!r})" + # Redefine the union type. Now synchronous calls always return a container, # while stream mode is handled with an AsyncGenerator. RunManyReturn = Union[ - CrawlResultContainer[CrawlResultT], - AsyncGenerator[CrawlResultT, None] + CrawlResultContainer[CrawlResultT], AsyncGenerator[CrawlResultT, None] ] - class AsyncWebCrawler: """ Asynchronous web crawler with flexible caching capabilities. 
@@ -193,7 +200,7 @@ class AsyncWebCrawler: # Decorate arun method with deep crawling capabilities self._deep_handler = DeepCrawlDecorator(self) - self.arun = self._deep_handler(self.arun) + self.arun = self._deep_handler(self.arun) async def start(self): """ @@ -210,26 +217,39 @@ class AsyncWebCrawler: AsyncWebCrawler: The initialized crawler instance """ # Check for builtin browser if requested - if self.browser_config.browser_mode == "builtin" and not self.browser_config.cdp_url: + if ( + self.browser_config.browser_mode == "builtin" + and not self.browser_config.cdp_url + ): # Import here to avoid circular imports from .browser_profiler import BrowserProfiler + profiler = BrowserProfiler(logger=self.logger) - + # Get builtin browser info or launch if needed browser_info = profiler.get_builtin_browser_info() if not browser_info: - self.logger.info("Builtin browser not found, launching new instance...", tag="BROWSER") + self.logger.info( + "Builtin browser not found, launching new instance...", + tag="BROWSER", + ) cdp_url = await profiler.launch_builtin_browser() if not cdp_url: - self.logger.warning("Failed to launch builtin browser, falling back to dedicated browser", tag="BROWSER") + self.logger.warning( + "Failed to launch builtin browser, falling back to dedicated browser", + tag="BROWSER", + ) else: self.browser_config.cdp_url = cdp_url self.browser_config.use_managed_browser = True else: - self.logger.info(f"Using existing builtin browser at {browser_info.get('cdp_url')}", tag="BROWSER") - self.browser_config.cdp_url = browser_info.get('cdp_url') + self.logger.info( + f"Using existing builtin browser at {browser_info.get('cdp_url')}", + tag="BROWSER", + ) + self.browser_config.cdp_url = browser_info.get("cdp_url") self.browser_config.use_managed_browser = True - + await self.crawler_strategy.__aenter__() await self.awarmup() return self @@ -305,7 +325,7 @@ class AsyncWebCrawler: # Auto-start if not ready if not self.ready: await self.start() - + config = config or CrawlerRunConfig() if not isinstance(url, str) or not url: raise ValueError("Invalid URL, make sure the URL is a non-empty string") @@ -319,9 +339,7 @@ class AsyncWebCrawler: config.cache_mode = CacheMode.ENABLED # Create cache context - cache_context = CacheContext( - url, config.cache_mode, False - ) + cache_context = CacheContext(url, config.cache_mode, False) # Initialize processing variables async_response: AsyncCrawlResponse = None @@ -351,7 +369,7 @@ class AsyncWebCrawler: # if config.screenshot and not screenshot or config.pdf and not pdf: if config.screenshot and not screenshot_data: cached_result = None - + if config.pdf and not pdf_data: cached_result = None @@ -383,14 +401,18 @@ class AsyncWebCrawler: # Check robots.txt if enabled if config and config.check_robots_txt: - if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent): + if not await self.robots_parser.can_fetch( + url, self.browser_config.user_agent + ): return CrawlResult( url=url, html="", success=False, status_code=403, error_message="Access denied by robots.txt", - response_headers={"X-Robots-Status": "Blocked by robots.txt"} + response_headers={ + "X-Robots-Status": "Blocked by robots.txt" + }, ) ############################## @@ -417,7 +439,7 @@ class AsyncWebCrawler: ############################################################### # Process the HTML content, Call CrawlerStrategy.process_html # ############################################################### - crawl_result : CrawlResult = await self.aprocess_html( + 
crawl_result: CrawlResult = await self.aprocess_html( url=url, html=html, extracted_content=extracted_content, @@ -441,18 +463,11 @@ class AsyncWebCrawler: crawl_result.success = bool(html) crawl_result.session_id = getattr(config, "session_id", None) - self.logger.success( - message="{url:.50}... | Status: {status} | Total: {timing}", + self.logger.url_status( + url=cache_context.display_url, + success=crawl_result.success, + timing=time.perf_counter() - start_time, tag="COMPLETE", - params={ - "url": cache_context.display_url, - "status": crawl_result.success, - "timing": f"{time.perf_counter() - start_time:.2f}s", - }, - colors={ - "status": Fore.GREEN if crawl_result.success else Fore.RED, - "timing": Fore.YELLOW, - }, ) # Update cache if appropriate @@ -462,17 +477,12 @@ class AsyncWebCrawler: return CrawlResultContainer(crawl_result) else: - self.logger.success( - message="{url:.50}... | Status: {status} | Total: {timing}", - tag="COMPLETE", - params={ - "url": cache_context.display_url, - "status": True, - "timing": f"{time.perf_counter() - start_time:.2f}s", - }, - colors={"status": Fore.GREEN, "timing": Fore.YELLOW}, + self.logger.url_status( + url=cache_context.display_url, + success=True, + timing=time.perf_counter() - start_time, + tag="COMPLETE" ) - cached_result.success = bool(html) cached_result.session_id = getattr(config, "session_id", None) cached_result.redirected_url = cached_result.redirected_url or url @@ -494,7 +504,7 @@ class AsyncWebCrawler: tag="ERROR", ) - return CrawlResultContainer( + return CrawlResultContainer( CrawlResult( url=url, html="", success=False, error_message=error_message ) @@ -539,15 +549,14 @@ class AsyncWebCrawler: # Process HTML content params = config.__dict__.copy() - params.pop("url", None) + params.pop("url", None) # add keys from kwargs to params that doesn't exist in params params.update({k: v for k, v in kwargs.items() if k not in params.keys()}) - ################################ # Scraping Strategy Execution # ################################ - result : ScrapingResult = scraping_strategy.scrap(url, html, **params) + result: ScrapingResult = scraping_strategy.scrap(url, html, **params) if result is None: raise ValueError( @@ -593,11 +602,17 @@ class AsyncWebCrawler: ) # Log processing completion - self.logger.info( - message="{url:.50}... | Time: {timing}s", + self.logger.url_status( + url=_url, + success=True, + timing=int((time.perf_counter() - t1) * 1000) / 1000, tag="SCRAPE", - params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000}, ) + # self.logger.info( + # message="{url:.50}... | Time: {timing}s", + # tag="SCRAPE", + # params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000}, + # ) ################################ # Structured Content Extraction # @@ -667,7 +682,7 @@ class AsyncWebCrawler: async def arun_many( self, urls: List[str], - config: Optional[CrawlerRunConfig] = None, + config: Optional[CrawlerRunConfig] = None, dispatcher: Optional[BaseDispatcher] = None, # Legacy parameters maintained for backwards compatibility # word_count_threshold=MIN_WORD_THRESHOLD, @@ -681,8 +696,8 @@ class AsyncWebCrawler: # pdf: bool = False, # user_agent: str = None, # verbose=True, - **kwargs - ) -> RunManyReturn: + **kwargs, + ) -> RunManyReturn: """ Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy. 
@@ -738,28 +753,35 @@ class AsyncWebCrawler: def transform_result(task_result): return ( - setattr(task_result.result, 'dispatch_result', - DispatchResult( - task_id=task_result.task_id, - memory_usage=task_result.memory_usage, - peak_memory=task_result.peak_memory, - start_time=task_result.start_time, - end_time=task_result.end_time, - error_message=task_result.error_message, - ) - ) or task_result.result + setattr( + task_result.result, + "dispatch_result", + DispatchResult( + task_id=task_result.task_id, + memory_usage=task_result.memory_usage, + peak_memory=task_result.peak_memory, + start_time=task_result.start_time, + end_time=task_result.end_time, + error_message=task_result.error_message, + ), ) + or task_result.result + ) stream = config.stream - + if stream: + async def result_transformer(): - async for task_result in dispatcher.run_urls_stream(crawler=self, urls=urls, config=config): + async for task_result in dispatcher.run_urls_stream( + crawler=self, urls=urls, config=config + ): yield transform_result(task_result) + return result_transformer() else: _results = await dispatcher.run_urls(crawler=self, urls=urls, config=config) - return [transform_result(res) for res in _results] + return [transform_result(res) for res in _results] async def aclear_cache(self): """Clear the cache database.""" diff --git a/deps.txt b/deps.txt new file mode 100644 index 00000000..1d085f0f --- /dev/null +++ b/deps.txt @@ -0,0 +1,115 @@ +aiofiles==24.1.0 +aiohappyeyeballs==2.4.4 +aiohttp==3.11.11 +aiolimiter==1.2.1 +aiosignal==1.3.2 +aiosqlite==0.20.0 +annotated-types==0.7.0 +anyio==4.8.0 +attrs==24.3.0 +beautifulsoup4==4.12.3 +certifi==2024.12.14 +cffi==1.17.1 +chardet==5.2.0 +charset-normalizer==3.4.1 +click==8.1.8 +colorama==0.4.6 +-e git+https://github.com/unclecode/crawl4ai.git@4359b1200377d86af3cd10fa98f91cf599b16d6a#egg=Crawl4AI +cryptography==44.0.0 +cssselect==1.2.0 +Cython==3.0.12 +Deprecated==1.2.18 +distro==1.9.0 +dnspython==2.7.0 +email_validator==2.2.0 +fake-http-header==0.3.5 +fake-useragent==2.0.3 +fastapi==0.115.11 +faust-cchardet==2.1.19 +filelock==3.16.1 +frozenlist==1.5.0 +fsspec==2024.12.0 +ghp-import==2.1.0 +greenlet==3.1.1 +gunicorn==23.0.0 +h11==0.14.0 +httpcore==1.0.7 +httpx==0.27.2 +huggingface-hub==0.27.1 +humanize==4.12.1 +idna==3.10 +importlib_metadata==8.5.0 +iniconfig==2.0.0 +Jinja2==3.1.5 +jiter==0.8.2 +joblib==1.4.2 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +jwt==1.3.1 +limits==4.2 +litellm==1.59.0 +lxml==5.3.0 +Markdown==3.7 +markdown-it-py==3.0.0 +MarkupSafe==3.0.2 +mdurl==0.1.2 +mergedeep==1.3.4 +mkdocs==1.6.1 +mkdocs-get-deps==0.2.0 +mkdocs-terminal==4.7.0 +mockito==1.5.3 +multidict==6.1.0 +nltk==3.9.1 +numpy==2.2.2 +openai==1.59.9 +packaging==24.2 +pathspec==0.12.1 +pdf2image==1.17.0 +pillow==10.4.0 +platformdirs==4.3.6 +playwright==1.49.1 +pluggy==1.5.0 +prometheus-fastapi-instrumentator==7.0.2 +prometheus_client==0.21.1 +propcache==0.2.1 +psutil==6.1.1 +pycparser==2.22 +pydantic==2.10.5 +pydantic_core==2.27.2 +pyee==12.0.0 +Pygments==2.19.1 +pymdown-extensions==10.14.3 +pyOpenSSL==25.0.0 +pytest==8.3.4 +pytest-mockito==0.0.4 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +PyYAML==6.0.2 +pyyaml_env_tag==0.1 +rank-bm25==0.2.2 +redis==5.2.1 +referencing==0.36.1 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rpds-py==0.22.3 +six==1.17.0 +slowapi==0.1.9 +sniffio==1.3.1 +snowballstemmer==2.2.0 +soupsieve==2.6 +starlette==0.46.1 +tenacity==9.0.0 +tf-playwright-stealth==1.1.0 +tiktoken==0.8.0 +tokenizers==0.21.0 +tqdm==4.67.1 
+typing_extensions==4.12.2 +urllib3==2.3.0 +uvicorn==0.34.0 +validators==0.34.0 +watchdog==6.0.0 +wrapt==1.17.2 +xxhash==3.5.0 +yarl==1.18.3 +zipp==3.21.0 From e0c2a7c2848102bc2001392f0ef4a33d679507f1 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 21 Mar 2025 11:06:46 +0530 Subject: [PATCH 13/36] chore: remove mistakenly commited deps.txt file --- deps.txt | 115 ------------------------------------------------------- 1 file changed, 115 deletions(-) delete mode 100644 deps.txt diff --git a/deps.txt b/deps.txt deleted file mode 100644 index 1d085f0f..00000000 --- a/deps.txt +++ /dev/null @@ -1,115 +0,0 @@ -aiofiles==24.1.0 -aiohappyeyeballs==2.4.4 -aiohttp==3.11.11 -aiolimiter==1.2.1 -aiosignal==1.3.2 -aiosqlite==0.20.0 -annotated-types==0.7.0 -anyio==4.8.0 -attrs==24.3.0 -beautifulsoup4==4.12.3 -certifi==2024.12.14 -cffi==1.17.1 -chardet==5.2.0 -charset-normalizer==3.4.1 -click==8.1.8 -colorama==0.4.6 --e git+https://github.com/unclecode/crawl4ai.git@4359b1200377d86af3cd10fa98f91cf599b16d6a#egg=Crawl4AI -cryptography==44.0.0 -cssselect==1.2.0 -Cython==3.0.12 -Deprecated==1.2.18 -distro==1.9.0 -dnspython==2.7.0 -email_validator==2.2.0 -fake-http-header==0.3.5 -fake-useragent==2.0.3 -fastapi==0.115.11 -faust-cchardet==2.1.19 -filelock==3.16.1 -frozenlist==1.5.0 -fsspec==2024.12.0 -ghp-import==2.1.0 -greenlet==3.1.1 -gunicorn==23.0.0 -h11==0.14.0 -httpcore==1.0.7 -httpx==0.27.2 -huggingface-hub==0.27.1 -humanize==4.12.1 -idna==3.10 -importlib_metadata==8.5.0 -iniconfig==2.0.0 -Jinja2==3.1.5 -jiter==0.8.2 -joblib==1.4.2 -jsonschema==4.23.0 -jsonschema-specifications==2024.10.1 -jwt==1.3.1 -limits==4.2 -litellm==1.59.0 -lxml==5.3.0 -Markdown==3.7 -markdown-it-py==3.0.0 -MarkupSafe==3.0.2 -mdurl==0.1.2 -mergedeep==1.3.4 -mkdocs==1.6.1 -mkdocs-get-deps==0.2.0 -mkdocs-terminal==4.7.0 -mockito==1.5.3 -multidict==6.1.0 -nltk==3.9.1 -numpy==2.2.2 -openai==1.59.9 -packaging==24.2 -pathspec==0.12.1 -pdf2image==1.17.0 -pillow==10.4.0 -platformdirs==4.3.6 -playwright==1.49.1 -pluggy==1.5.0 -prometheus-fastapi-instrumentator==7.0.2 -prometheus_client==0.21.1 -propcache==0.2.1 -psutil==6.1.1 -pycparser==2.22 -pydantic==2.10.5 -pydantic_core==2.27.2 -pyee==12.0.0 -Pygments==2.19.1 -pymdown-extensions==10.14.3 -pyOpenSSL==25.0.0 -pytest==8.3.4 -pytest-mockito==0.0.4 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -PyYAML==6.0.2 -pyyaml_env_tag==0.1 -rank-bm25==0.2.2 -redis==5.2.1 -referencing==0.36.1 -regex==2024.11.6 -requests==2.32.3 -rich==13.9.4 -rpds-py==0.22.3 -six==1.17.0 -slowapi==0.1.9 -sniffio==1.3.1 -snowballstemmer==2.2.0 -soupsieve==2.6 -starlette==0.46.1 -tenacity==9.0.0 -tf-playwright-stealth==1.1.0 -tiktoken==0.8.0 -tokenizers==0.21.0 -tqdm==4.67.1 -typing_extensions==4.12.2 -urllib3==2.3.0 -uvicorn==0.34.0 -validators==0.34.0 -watchdog==6.0.0 -wrapt==1.17.2 -xxhash==3.5.0 -yarl==1.18.3 -zipp==3.21.0 From 8b761f232be85acc5d480bcc999b59348a22fcbc Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 21 Mar 2025 13:40:23 +0530 Subject: [PATCH 14/36] fix: improve logged url readability by decoding encoded urls --- crawl4ai/async_logger.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py index c733c31a..7a7b08ac 100644 --- a/crawl4ai/async_logger.py +++ b/crawl4ai/async_logger.py @@ -4,6 +4,7 @@ from typing import Optional, Dict, Any from colorama import Fore, Style, init import os from datetime import datetime +from urllib.parse import unquote class LogLevel(Enum): @@ -230,12 +231,14 @@ class 
AsyncLogger(AsyncLoggerBase): tag: Tag for the message url_length: Maximum length for URL in log """ + decoded_url = unquote(url) + readable_url = self._shorten(decoded_url, url_length) self._log( level=LogLevel.SUCCESS if success else LogLevel.ERROR, message="{url} | {status} | ⏱: {timing:.2f}s", tag=tag, params={ - "url": self._shorten(url, url_length), + "url": readable_url, "status": "✓" if success else "✗", "timing": timing, }, @@ -257,11 +260,13 @@ class AsyncLogger(AsyncLoggerBase): tag: Tag for the message url_length: Maximum length for URL in log """ + decoded_url = unquote(url) + readable_url = self._shorten(decoded_url, url_length) self._log( level=LogLevel.ERROR, message="{url} | Error: {error}", tag=tag, - params={"url": self.shorten(url,url_length), "error": error}, + params={"url": readable_url, "error": error}, ) class AsyncFileLogger(AsyncLoggerBase): From 6740e87b4d24e5e5904a8100419f3b1e0eed501a Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 21 Mar 2025 13:41:31 +0530 Subject: [PATCH 15/36] fix: remove trailing slash when the path is empty. This is causing dupicate crawls --- crawl4ai/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index acaf7933..5b8af794 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -2002,7 +2002,7 @@ def normalize_url_for_deep_crawl(href, base_url): normalized = urlunparse(( parsed.scheme, netloc, - parsed.path.rstrip('/') or '/', # Normalize trailing slash + parsed.path.rstrip('/'), # Normalize trailing slash parsed.params, query, fragment @@ -2030,7 +2030,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url): normalized = urlunparse(( parsed.scheme, parsed.netloc.lower(), - parsed.path, + parsed.path.rstrip('/'), parsed.params, parsed.query, '' # Remove fragment From f89113377aa2e7ac40023976e63cb2d1d9a93255 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 21 Mar 2025 13:44:57 +0530 Subject: [PATCH 16/36] fix: Move adding of visited urls to the 'visited' set, when queueing the URLs instead of after dequeuing, this is to prevent duplicate crawls. https://github.com/unclecode/crawl4ai/issues/843 --- crawl4ai/deep_crawling/bfs_strategy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py index 54b72ea3..48c116dd 100644 --- a/crawl4ai/deep_crawling/bfs_strategy.py +++ b/crawl4ai/deep_crawling/bfs_strategy.py @@ -117,7 +117,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}") self.stats.urls_skipped += 1 continue - + + visited.add(base_url) valid_links.append((base_url, score)) # If we have more valid links than capacity, sort by score and take the top ones @@ -158,7 +159,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): while current_level and not self._cancel_event.is_set(): next_level: List[Tuple[str, Optional[str]]] = [] urls = [url for url, _ in current_level] - visited.update(urls) # Clone the config to disable deep crawling recursion and enforce batch mode. 
batch_config = config.clone(deep_crawl_strategy=None, stream=False) From 471d110c5e496a1334422ee177e95cf1675ad37b Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 21 Mar 2025 16:48:07 +0530 Subject: [PATCH 17/36] fix: url normalisation ref: https://github.com/unclecode/crawl4ai/issues/841 --- crawl4ai/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 5b8af794..fe725317 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1958,6 +1958,10 @@ def normalize_url(href, base_url): if not parsed_base.scheme or not parsed_base.netloc: raise ValueError(f"Invalid base URL format: {base_url}") + # Ensure base_url ends with a trailing slash if it's a directory path + if not base_url.endswith('/'): + base_url = base_url + '/' + # Use urljoin to handle all cases normalized = urljoin(base_url, href.strip()) return normalized From e01d1e73e167bb89d6656f0bdda359555a1c0be0 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 21 Mar 2025 17:34:13 +0530 Subject: [PATCH 18/36] fix: link normalisation in BestFirstStrategy --- crawl4ai/deep_crawling/bff_strategy.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crawl4ai/deep_crawling/bff_strategy.py b/crawl4ai/deep_crawling/bff_strategy.py index 4811ba14..65d4e819 100644 --- a/crawl4ai/deep_crawling/bff_strategy.py +++ b/crawl4ai/deep_crawling/bff_strategy.py @@ -11,6 +11,7 @@ from .scorers import URLScorer from . import DeepCrawlStrategy from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn +from ..utils import normalize_url_for_deep_crawl from math import inf as infinity @@ -106,13 +107,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): valid_links = [] for link in links: url = link.get("href") - if url in visited: + base_url = normalize_url_for_deep_crawl(url, source_url) + if base_url in visited: continue if not await self.can_process_url(url, new_depth): self.stats.urls_skipped += 1 continue - valid_links.append(url) + valid_links.append(base_url) # If we have more valid links than capacity, limit them if len(valid_links) > remaining_capacity: From 2f0e2177512369f89ed7579e8e261c3a7133deda Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 25 Mar 2025 13:44:41 +0530 Subject: [PATCH 19/36] Chore: Add brotli as dependancy to fix: https://github.com/unclecode/crawl4ai/issues/867 --- pyproject.toml | 1 + requirements.txt | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ad07548d..247974c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ dependencies = [ "pyperclip>=1.8.2", "faust-cchardet>=2.1.19", "aiohttp>=3.11.11", + "brotli>=1.1.0", "humanize>=4.10.0", ] classifiers = [ diff --git a/requirements.txt b/requirements.txt index c1f36c56..5fe0cc4c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,5 @@ psutil>=6.1.1 nltk>=3.9.1 rich>=13.9.4 cssselect>=1.2.0 -faust-cchardet>=2.1.19 \ No newline at end of file +faust-cchardet>=2.1.19 +brotli>=1.1.0 \ No newline at end of file From e3111d0a328ae2a0c78464de83cfc986f807c28b Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 25 Mar 2025 13:46:55 +0530 Subject: [PATCH 20/36] fix: prevent session closing after each request to maintain connection pool. 
Fixes: https://github.com/unclecode/crawl4ai/issues/867 --- crawl4ai/async_crawler_strategy.py | 133 ++++++++++++++--------------- 1 file changed, 63 insertions(+), 70 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 37aa0962..2330b3f3 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1702,15 +1702,6 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: await self.close() - @contextlib.asynccontextmanager - async def _session_context(self): - try: - if not self._session: - await self.start() - yield self._session - finally: - await self.close() - def set_hook(self, hook_type: str, hook_func: Callable) -> None: if hook_type in self.hooks: self.hooks[hook_type] = partial(self._execute_hook, hook_type, hook_func) @@ -1787,75 +1778,77 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): url: str, config: CrawlerRunConfig ) -> AsyncCrawlResponse: - async with self._session_context() as session: - timeout = ClientTimeout( - total=config.page_timeout or self.DEFAULT_TIMEOUT, - connect=10, - sock_read=30 - ) - - headers = dict(self._BASE_HEADERS) - if self.browser_config.headers: - headers.update(self.browser_config.headers) + if not self._session or self._session.closed: + await self.start() + + timeout = ClientTimeout( + total=config.page_timeout or self.DEFAULT_TIMEOUT, + connect=10, + sock_read=30 + ) + + headers = dict(self._BASE_HEADERS) + if self.browser_config.headers: + headers.update(self.browser_config.headers) - request_kwargs = { - 'timeout': timeout, - 'allow_redirects': self.browser_config.follow_redirects, - 'ssl': self.browser_config.verify_ssl, - 'headers': headers - } + request_kwargs = { + 'timeout': timeout, + 'allow_redirects': self.browser_config.follow_redirects, + 'ssl': self.browser_config.verify_ssl, + 'headers': headers + } - if self.browser_config.method == "POST": - if self.browser_config.data: - request_kwargs['data'] = self.browser_config.data - if self.browser_config.json: - request_kwargs['json'] = self.browser_config.json + if self.browser_config.method == "POST": + if self.browser_config.data: + request_kwargs['data'] = self.browser_config.data + if self.browser_config.json: + request_kwargs['json'] = self.browser_config.json - await self.hooks['before_request'](url, request_kwargs) + await self.hooks['before_request'](url, request_kwargs) - try: - async with session.request(self.browser_config.method, url, **request_kwargs) as response: - content = memoryview(await response.read()) - - if not (200 <= response.status < 300): - raise HTTPStatusError( - response.status, - f"Unexpected status code for {url}" - ) - - encoding = response.charset - if not encoding: - encoding = cchardet.detect(content.tobytes())['encoding'] or 'utf-8' - - result = AsyncCrawlResponse( - html=content.tobytes().decode(encoding, errors='replace'), - response_headers=dict(response.headers), - status_code=response.status, - redirected_url=str(response.url) + try: + async with self._session.request(self.browser_config.method, url, **request_kwargs) as response: + content = memoryview(await response.read()) + + if not (200 <= response.status < 300): + raise HTTPStatusError( + response.status, + f"Unexpected status code for {url}" ) - - await self.hooks['after_request'](result) - return result + + encoding = response.charset + if not encoding: + encoding = cchardet.detect(content.tobytes())['encoding'] or 'utf-8' + + result = 
AsyncCrawlResponse( + html=content.tobytes().decode(encoding, errors='replace'), + response_headers=dict(response.headers), + status_code=response.status, + redirected_url=str(response.url) + ) + + await self.hooks['after_request'](result) + return result - except aiohttp.ServerTimeoutError as e: - await self.hooks['on_error'](e) - raise ConnectionTimeoutError(f"Request timed out: {str(e)}") - - except aiohttp.ClientConnectorError as e: - await self.hooks['on_error'](e) - raise ConnectionError(f"Connection failed: {str(e)}") - - except aiohttp.ClientError as e: - await self.hooks['on_error'](e) - raise HTTPCrawlerError(f"HTTP client error: {str(e)}") + except aiohttp.ServerTimeoutError as e: + await self.hooks['on_error'](e) + raise ConnectionTimeoutError(f"Request timed out: {str(e)}") - except asyncio.exceptions.TimeoutError as e: - await self.hooks['on_error'](e) - raise ConnectionTimeoutError(f"Request timed out: {str(e)}") + except aiohttp.ClientConnectorError as e: + await self.hooks['on_error'](e) + raise ConnectionError(f"Connection failed: {str(e)}") - except Exception as e: - await self.hooks['on_error'](e) - raise HTTPCrawlerError(f"HTTP request failed: {str(e)}") + except aiohttp.ClientError as e: + await self.hooks['on_error'](e) + raise HTTPCrawlerError(f"HTTP client error: {str(e)}") + + except asyncio.exceptions.TimeoutError as e: + await self.hooks['on_error'](e) + raise ConnectionTimeoutError(f"Request timed out: {str(e)}") + + except Exception as e: + await self.hooks['on_error'](e) + raise HTTPCrawlerError(f"HTTP request failed: {str(e)}") async def crawl( self, From 585e5e5973a264ac22343f9a4fdef54048b3b31f Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 25 Mar 2025 15:17:59 +0530 Subject: [PATCH 21/36] fix: https://github.com/unclecode/crawl4ai/issues/733 --- crawl4ai/async_webcrawler.py | 3 ++- crawl4ai/content_scraping_strategy.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 98111e4b..91b98d7f 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -448,6 +448,7 @@ class AsyncWebCrawler: pdf_data=pdf_data, verbose=config.verbose, is_raw_html=True if url.startswith("raw:") else False, + redirected_url=async_response.redirected_url, **kwargs, ) @@ -596,7 +597,7 @@ class AsyncWebCrawler: markdown_result: MarkdownGenerationResult = ( markdown_generator.generate_markdown( cleaned_html=cleaned_html, - base_url=url, + base_url=params.get("redirected_url", url), # html2text_options=kwargs.get('html2text', {}) ) ) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 215e7cda..0848d655 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -128,7 +128,8 @@ class WebScrapingStrategy(ContentScrapingStrategy): Returns: ScrapingResult: A structured result containing the scraped content. """ - raw_result = self._scrap(url, html, is_async=False, **kwargs) + actual_url = kwargs.get("redirected_url", url) + raw_result = self._scrap(actual_url, html, is_async=False, **kwargs) if raw_result is None: return ScrapingResult( cleaned_html="", From 57e0423b3a6ddb9147fce898a2e5c0afaaead90d Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 28 Mar 2025 12:56:37 +0530 Subject: [PATCH 22/36] fix:target_element should not affect link extraction. 
-> https://github.com/unclecode/crawl4ai/issues/902 --- crawl4ai/content_scraping_strategy.py | 59 ++++++++------------------- 1 file changed, 16 insertions(+), 43 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 0848d655..11835d62 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -897,29 +897,16 @@ class WebScrapingStrategy(ContentScrapingStrategy): for element in body.select(excluded_selector): element.extract() - # if False and css_selector: - # selected_elements = body.select(css_selector) - # if not selected_elements: - # return { - # "markdown": "", - # "cleaned_html": "", - # "success": True, - # "media": {"images": [], "videos": [], "audios": []}, - # "links": {"internal": [], "external": []}, - # "metadata": {}, - # "message": f"No elements found for CSS selector: {css_selector}", - # } - # # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}") - # body = soup.new_tag("div") - # for el in selected_elements: - # body.append(el) - content_element = None if target_elements: try: for_content_targeted_element = [] for target_element in target_elements: - for_content_targeted_element.extend(body.select(target_element)) + # Creating a fresh parse of HTML for each selector to prevent element extraction + # from modifying the original DOM tree; this keeps the original body + # intact for link processing. This is better performant than deepcopy. + fresh_body = BeautifulSoup(html, "html.parser") + for_content_targeted_element.extend(fresh_body.select(target_element)) content_element = soup.new_tag("div") for el in for_content_targeted_element: content_element.append(el) @@ -927,7 +914,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None else: - content_element = body + content_element = body kwargs["exclude_social_media_domains"] = set( kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS @@ -1536,34 +1523,20 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE") meta = {} - # Handle CSS selector targeting - # if css_selector: - # try: - # selected_elements = body.cssselect(css_selector) - # if not selected_elements: - # return { - # "markdown": "", - # "cleaned_html": "", - # "success": True, - # "media": {"images": [], "videos": [], "audios": []}, - # "links": {"internal": [], "external": []}, - # "metadata": meta, - # "message": f"No elements found for CSS selector: {css_selector}", - # } - # body = lhtml.Element("div") - # body.extend(selected_elements) - # except Exception as e: - # self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE") - # return None - content_element = None if target_elements: try: - for_content_targeted_element = [] - for target_element in target_elements: - for_content_targeted_element.extend(body.cssselect(target_element)) content_element = lhtml.Element("div") - content_element.extend(for_content_targeted_element) + for target_element in target_elements: + # Creating a fresh parse of HTML for each selector to prevent element extraction + # from modifying the original DOM tree; this keeps the original body + # intact for link processing. This is better performant than deepcopy. 
+ fresh_body = lhtml.document_fromstring(html) + for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(fresh_body.cssselect(target_element)) + content_element = lhtml.Element("div") + content_element.extend(for_content_targeted_element) except Exception as e: self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None From d8cbeff38643a119cc1534aa6176a5b45effc685 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Fri, 28 Mar 2025 19:31:05 +0530 Subject: [PATCH 23/36] fix: https://github.com/unclecode/crawl4ai/issues/842 --- crawl4ai/async_crawler_strategy.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 2330b3f3..ddd6348e 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -130,6 +130,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): Close the browser and clean up resources. """ await self.browser_manager.close() + # Explicitly reset the static Playwright instance + BrowserManager._playwright_instance = None async def kill_session(self, session_id: str): """ From 1119f2f5b50a3e8ae77c0baf93490329ed678ef9 Mon Sep 17 00:00:00 2001 From: "maggie.wang" Date: Mon, 31 Mar 2025 14:05:54 +0800 Subject: [PATCH 24/36] fix: https://github.com/unclecode/crawl4ai/issues/911 --- crawl4ai/async_crawler_strategy.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index ddd6348e..7eef0196 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -820,7 +820,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): for selector in selectors: try: - content = await page.evaluate(f"document.querySelector('{selector}')?.outerHTML || ''") + content = await page.evaluate( + f"""Array.from(document.querySelectorAll("{selector}")) + .map(el => el.outerHTML) + .join('')""" + ) html_parts.append(content) except Error as e: print(f"Warning: Could not get content for selector '{selector}': {str(e)}") From ef1f0c410246c77ed6e68cb17574cde8a8aaab94 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Mon, 31 Mar 2025 12:43:32 +0530 Subject: [PATCH 25/36] fix:https://github.com/unclecode/crawl4ai/issues/701 --- crawl4ai/js_snippet/remove_overlay_elements.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crawl4ai/js_snippet/remove_overlay_elements.js b/crawl4ai/js_snippet/remove_overlay_elements.js index 0400d89c..9d93b4ac 100644 --- a/crawl4ai/js_snippet/remove_overlay_elements.js +++ b/crawl4ai/js_snippet/remove_overlay_elements.js @@ -115,5 +115,6 @@ async () => { document.body.style.overflow = "auto"; // Wait a bit for any animations to complete - await new Promise((resolve) => setTimeout(resolve, 100)); + document.body.scrollIntoView(false); + await new Promise((resolve) => setTimeout(resolve, 250)); }; From 757e3177ed6cfed0cbd9b9f01c0c330ba5d6f18f Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Mon, 31 Mar 2025 17:10:04 +0530 Subject: [PATCH 26/36] fix: https://github.com/unclecode/crawl4ai/issues/839 --- crawl4ai/async_crawler_strategy.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 7eef0196..f18a3c1d 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -532,14 +532,12 @@ class 
AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if console_log_type == "error": self.logger.error( message=f"Console error: {msg}", # Use f-string for variable interpolation - tag="CONSOLE", - params={"msg": msg.text}, + tag="CONSOLE" ) elif console_log_type == "debug": self.logger.debug( message=f"Console: {msg}", # Use f-string for variable interpolation - tag="CONSOLE", - params={"msg": msg.text}, + tag="CONSOLE" ) page.on("console", log_consol) From 73fda8a6ec8ef35cdb63e1bae74411976d4e63b9 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 3 Apr 2025 13:47:13 +0530 Subject: [PATCH 27/36] fix: address the PR review: https://github.com/unclecode/crawl4ai/pull/899#discussion_r2024639193 --- crawl4ai/content_scraping_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 91b1c674..eaed0816 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -905,7 +905,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): # Creating a fresh parse of HTML for each selector to prevent element extraction # from modifying the original DOM tree; this keeps the original body # intact for link processing. This is better performant than deepcopy. - fresh_body = BeautifulSoup(html, "html.parser") + fresh_body = BeautifulSoup(html, "lxml") for_content_targeted_element.extend(fresh_body.select(target_element)) content_element = soup.new_tag("div") for el in for_content_targeted_element: From 4133e5460d734262f621bfa1edc9c4f168579fd9 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 3 Apr 2025 17:42:24 +0530 Subject: [PATCH 28/36] typo-fix: https://github.com/unclecode/crawl4ai/pull/918 --- crawl4ai/content_scraping_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index eaed0816..0a157a08 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -1605,7 +1605,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): # Remove empty elements self.remove_empty_elements_fast(body, 1) - # Remvoe unneeded attributes + # Remove unneeded attributes self.remove_unwanted_attributes_fast( body, keep_data_attributes=kwargs.get("keep_data_attributes", False) ) From 7155778eac65d9e9d7b09a4e6a4d6526ece2f476 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 3 Apr 2025 17:42:51 +0530 Subject: [PATCH 29/36] chore: move from faust-cchardet to chardet --- crawl4ai/async_crawler_strategy.py | 4 ++-- pyproject.toml | 2 +- requirements.txt | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index f18a3c1d..301d925f 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -24,7 +24,7 @@ from .browser_manager import BrowserManager import aiofiles import aiohttp -import cchardet +import chardet from aiohttp.client import ClientTimeout from urllib.parse import urlparse from types import MappingProxyType @@ -1822,7 +1822,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): encoding = response.charset if not encoding: - encoding = cchardet.detect(content.tobytes())['encoding'] or 'utf-8' + encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8' result = AsyncCrawlResponse( html=content.tobytes().decode(encoding, errors='replace'), diff --git a/pyproject.toml b/pyproject.toml index 
247974c5..032e5cd6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ dependencies = [ "fake-useragent>=2.0.3", "click>=8.1.7", "pyperclip>=1.8.2", - "faust-cchardet>=2.1.19", + "chardet>=5.2.0", "aiohttp>=3.11.11", "brotli>=1.1.0", "humanize>=4.10.0", diff --git a/requirements.txt b/requirements.txt index 5fe0cc4c..0bb596d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,5 +21,5 @@ psutil>=6.1.1 nltk>=3.9.1 rich>=13.9.4 cssselect>=1.2.0 -faust-cchardet>=2.1.19 +chardet>=5.2.0 brotli>=1.1.0 \ No newline at end of file From 6f7ab9c92722f85db0e8aaa5fcf4d4275c6bc230 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 8 Apr 2025 18:31:00 +0530 Subject: [PATCH 30/36] fix: Revert changes to session management in AsyncHttpWebcrawler and solve the underlying issue by removing the session closure in finally block of session context. --- crawl4ai/async_crawler_strategy.py | 133 +++++++++++++++-------------- 1 file changed, 70 insertions(+), 63 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 301d925f..1e987450 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1706,6 +1706,15 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: await self.close() + @contextlib.asynccontextmanager + async def _session_context(self): + try: + if not self._session: + await self.start() + yield self._session + finally: + pass + def set_hook(self, hook_type: str, hook_func: Callable) -> None: if hook_type in self.hooks: self.hooks[hook_type] = partial(self._execute_hook, hook_type, hook_func) @@ -1782,77 +1791,75 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): url: str, config: CrawlerRunConfig ) -> AsyncCrawlResponse: - if not self._session or self._session.closed: - await self.start() - - timeout = ClientTimeout( - total=config.page_timeout or self.DEFAULT_TIMEOUT, - connect=10, - sock_read=30 - ) - - headers = dict(self._BASE_HEADERS) - if self.browser_config.headers: - headers.update(self.browser_config.headers) + async with self._session_context() as session: + timeout = ClientTimeout( + total=config.page_timeout or self.DEFAULT_TIMEOUT, + connect=10, + sock_read=30 + ) + + headers = dict(self._BASE_HEADERS) + if self.browser_config.headers: + headers.update(self.browser_config.headers) - request_kwargs = { - 'timeout': timeout, - 'allow_redirects': self.browser_config.follow_redirects, - 'ssl': self.browser_config.verify_ssl, - 'headers': headers - } + request_kwargs = { + 'timeout': timeout, + 'allow_redirects': self.browser_config.follow_redirects, + 'ssl': self.browser_config.verify_ssl, + 'headers': headers + } - if self.browser_config.method == "POST": - if self.browser_config.data: - request_kwargs['data'] = self.browser_config.data - if self.browser_config.json: - request_kwargs['json'] = self.browser_config.json + if self.browser_config.method == "POST": + if self.browser_config.data: + request_kwargs['data'] = self.browser_config.data + if self.browser_config.json: + request_kwargs['json'] = self.browser_config.json - await self.hooks['before_request'](url, request_kwargs) + await self.hooks['before_request'](url, request_kwargs) - try: - async with self._session.request(self.browser_config.method, url, **request_kwargs) as response: - content = memoryview(await response.read()) - - if not (200 <= response.status < 300): - raise HTTPStatusError( - response.status, - f"Unexpected status code for {url}" + 
try: + async with session.request(self.browser_config.method, url, **request_kwargs) as response: + content = memoryview(await response.read()) + + if not (200 <= response.status < 300): + raise HTTPStatusError( + response.status, + f"Unexpected status code for {url}" + ) + + encoding = response.charset + if not encoding: + encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8' + + result = AsyncCrawlResponse( + html=content.tobytes().decode(encoding, errors='replace'), + response_headers=dict(response.headers), + status_code=response.status, + redirected_url=str(response.url) ) - - encoding = response.charset - if not encoding: - encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8' - - result = AsyncCrawlResponse( - html=content.tobytes().decode(encoding, errors='replace'), - response_headers=dict(response.headers), - status_code=response.status, - redirected_url=str(response.url) - ) - - await self.hooks['after_request'](result) - return result + + await self.hooks['after_request'](result) + return result - except aiohttp.ServerTimeoutError as e: - await self.hooks['on_error'](e) - raise ConnectionTimeoutError(f"Request timed out: {str(e)}") + except aiohttp.ServerTimeoutError as e: + await self.hooks['on_error'](e) + raise ConnectionTimeoutError(f"Request timed out: {str(e)}") + + except aiohttp.ClientConnectorError as e: + await self.hooks['on_error'](e) + raise ConnectionError(f"Connection failed: {str(e)}") + + except aiohttp.ClientError as e: + await self.hooks['on_error'](e) + raise HTTPCrawlerError(f"HTTP client error: {str(e)}") - except aiohttp.ClientConnectorError as e: - await self.hooks['on_error'](e) - raise ConnectionError(f"Connection failed: {str(e)}") + except asyncio.exceptions.TimeoutError as e: + await self.hooks['on_error'](e) + raise ConnectionTimeoutError(f"Request timed out: {str(e)}") - except aiohttp.ClientError as e: - await self.hooks['on_error'](e) - raise HTTPCrawlerError(f"HTTP client error: {str(e)}") - - except asyncio.exceptions.TimeoutError as e: - await self.hooks['on_error'](e) - raise ConnectionTimeoutError(f"Request timed out: {str(e)}") - - except Exception as e: - await self.hooks['on_error'](e) - raise HTTPCrawlerError(f"HTTP request failed: {str(e)}") + except Exception as e: + await self.hooks['on_error'](e) + raise HTTPCrawlerError(f"HTTP request failed: {str(e)}") async def crawl( self, From d84508b4d5dad7c3b8f9b772cedfdc08c89ab2a9 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 12 Apr 2025 12:05:17 +0530 Subject: [PATCH 31/36] fix: revert the old target_elms code in regular webscraping strategy --- crawl4ai/content_scraping_strategy.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 81fe9d4e..0a93352b 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -908,11 +908,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): try: for_content_targeted_element = [] for target_element in target_elements: - # Creating a fresh parse of HTML for each selector to prevent element extraction - # from modifying the original DOM tree; this keeps the original body - # intact for link processing. This is better performant than deepcopy. 
- fresh_body = BeautifulSoup(html, "lxml") - for_content_targeted_element.extend(fresh_body.select(target_element)) + for_content_targeted_element.extend(body.select(target_element)) content_element = soup.new_tag("div") for el in for_content_targeted_element: content_element.append(el) @@ -920,7 +916,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None else: - content_element = body + content_element = body kwargs["exclude_social_media_domains"] = set( kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS From 9fc5d315af570f51c5068f7aea95e6597c9773c9 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 12 Apr 2025 12:07:04 +0530 Subject: [PATCH 32/36] fix: revert the old target_elms code in LXMLwebscraping strategy --- crawl4ai/content_scraping_strategy.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 0a93352b..814e4b2b 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -1535,17 +1535,11 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): content_element = None if target_elements: try: - content_element = lhtml.Element("div") + for_content_targeted_element = [] for target_element in target_elements: - # Creating a fresh parse of HTML for each selector to prevent element extraction - # from modifying the original DOM tree; this keeps the original body - # intact for link processing. This is better performant than deepcopy. - fresh_body = lhtml.document_fromstring(html) - for_content_targeted_element = [] - for target_element in target_elements: - for_content_targeted_element.extend(fresh_body.cssselect(target_element)) - content_element = lhtml.Element("div") - content_element.extend(for_content_targeted_element) + for_content_targeted_element.extend(body.cssselect(target_element)) + content_element = lhtml.Element("div") + content_element.extend(for_content_targeted_element) except Exception as e: self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None From 7d8e81fb2e04b4c0844b37491664b05f65441567 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 12 Apr 2025 12:44:00 +0530 Subject: [PATCH 33/36] fix: fix target_elements, in a less invasive and more efficient way simply by changing order of execution :) https://github.com/unclecode/crawl4ai/issues/902 --- crawl4ai/content_scraping_strategy.py | 58 +++++++++++++-------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 814e4b2b..aa69c5fb 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -901,22 +901,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): element.extract() else: for element in body.select(excluded_selector): - element.extract() - - content_element = None - if target_elements: - try: - for_content_targeted_element = [] - for target_element in target_elements: - for_content_targeted_element.extend(body.select(target_element)) - content_element = soup.new_tag("div") - for el in for_content_targeted_element: - content_element.append(el) - except Exception as e: - self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") - return None - else: - content_element = body + element.extract() kwargs["exclude_social_media_domains"] = set( 
kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS @@ -976,6 +961,20 @@ class WebScrapingStrategy(ContentScrapingStrategy): str_body = "" try: + content_element = None + if target_elements: + try: + for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(body.select(target_element)) + content_element = soup.new_tag("div") + for el in for_content_targeted_element: + content_element.append(el) + except Exception as e: + self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") + return None + else: + content_element = body str_body = content_element.encode_contents().decode("utf-8") except Exception: # Reset body to the original HTML @@ -1532,20 +1531,6 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE") meta = {} - content_element = None - if target_elements: - try: - for_content_targeted_element = [] - for target_element in target_elements: - for_content_targeted_element.extend(body.cssselect(target_element)) - content_element = lhtml.Element("div") - content_element.extend(for_content_targeted_element) - except Exception as e: - self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") - return None - else: - content_element = body - # Remove script and style tags for tag in ["script", "style", "link", "meta", "noscript"]: for element in body.xpath(f".//{tag}"): @@ -1614,6 +1599,19 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): ) # Generate output HTML + content_element = None + if target_elements: + try: + for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(body.cssselect(target_element)) + content_element = lhtml.Element("div") + content_element.extend(for_content_targeted_element) + except Exception as e: + self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") + return None + else: + content_element = body cleaned_html = lhtml.tostring( # body, content_element, From dcc265458cef022a6b03bcaa47686e08869bcb02 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Mon, 14 Apr 2025 12:39:05 +0530 Subject: [PATCH 34/36] fix: Add a nominal wait time for remove overlay elements since it's already controllable through delay_before_return_html --- crawl4ai/js_snippet/remove_overlay_elements.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/js_snippet/remove_overlay_elements.js b/crawl4ai/js_snippet/remove_overlay_elements.js index 9d93b4ac..a50d9427 100644 --- a/crawl4ai/js_snippet/remove_overlay_elements.js +++ b/crawl4ai/js_snippet/remove_overlay_elements.js @@ -116,5 +116,5 @@ async () => { // Wait a bit for any animations to complete document.body.scrollIntoView(false); - await new Promise((resolve) => setTimeout(resolve, 250)); + await new Promise((resolve) => setTimeout(resolve, 50)); }; From c2902fd200fa5ad354da33d8528a12844b3c75be Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 19 Apr 2025 19:46:20 +0530 Subject: [PATCH 35/36] reverse:last change in order of execution for it introduced a new issue in content generated. 
https://github.com/unclecode/crawl4ai/issues/902 --- crawl4ai/content_scraping_strategy.py | 58 ++++++++++++++------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index aa69c5fb..814e4b2b 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -901,7 +901,22 @@ class WebScrapingStrategy(ContentScrapingStrategy): element.extract() else: for element in body.select(excluded_selector): - element.extract() + element.extract() + + content_element = None + if target_elements: + try: + for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(body.select(target_element)) + content_element = soup.new_tag("div") + for el in for_content_targeted_element: + content_element.append(el) + except Exception as e: + self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") + return None + else: + content_element = body kwargs["exclude_social_media_domains"] = set( kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS @@ -961,20 +976,6 @@ class WebScrapingStrategy(ContentScrapingStrategy): str_body = "" try: - content_element = None - if target_elements: - try: - for_content_targeted_element = [] - for target_element in target_elements: - for_content_targeted_element.extend(body.select(target_element)) - content_element = soup.new_tag("div") - for el in for_content_targeted_element: - content_element.append(el) - except Exception as e: - self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") - return None - else: - content_element = body str_body = content_element.encode_contents().decode("utf-8") except Exception: # Reset body to the original HTML @@ -1531,6 +1532,20 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE") meta = {} + content_element = None + if target_elements: + try: + for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(body.cssselect(target_element)) + content_element = lhtml.Element("div") + content_element.extend(for_content_targeted_element) + except Exception as e: + self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") + return None + else: + content_element = body + # Remove script and style tags for tag in ["script", "style", "link", "meta", "noscript"]: for element in body.xpath(f".//{tag}"): @@ -1599,19 +1614,6 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): ) # Generate output HTML - content_element = None - if target_elements: - try: - for_content_targeted_element = [] - for target_element in target_elements: - for_content_targeted_element.extend(body.cssselect(target_element)) - content_element = lhtml.Element("div") - content_element.extend(for_content_targeted_element) - except Exception as e: - self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") - return None - else: - content_element = body cleaned_html = lhtml.tostring( # body, content_element, From d2648eaa39d4232b3de6a27a1170b5fef8ecc389 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 19 Apr 2025 20:08:36 +0530 Subject: [PATCH 36/36] fix: solved with deepcopy of elements https://github.com/unclecode/crawl4ai/issues/902 --- crawl4ai/content_scraping_strategy.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py 
b/crawl4ai/content_scraping_strategy.py index 814e4b2b..1dfbce84 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -28,6 +28,7 @@ from lxml import etree from lxml import html as lhtml from typing import List from .models import ScrapingResult, MediaItem, Link, Media, Links +import copy # Pre-compile regular expressions for Open Graph and Twitter metadata OG_REGEX = re.compile(r"^og:") @@ -911,7 +912,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): for_content_targeted_element.extend(body.select(target_element)) content_element = soup.new_tag("div") for el in for_content_targeted_element: - content_element.append(el) + content_element.append(copy.deepcopy(el)) except Exception as e: self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None @@ -1539,7 +1540,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): for target_element in target_elements: for_content_targeted_element.extend(body.cssselect(target_element)) content_element = lhtml.Element("div") - content_element.extend(for_content_targeted_element) + content_element.extend(copy.deepcopy(for_content_targeted_element)) except Exception as e: self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None
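
Editor's note (not part of the patch series): a minimal standalone sketch of the behaviour the last few commits work around — in BeautifulSoup (and likewise lxml), appending a matched element to a new container re-parents it and removes it from the original tree, so later link/media extraction over `body` would silently lose that content; deep-copying first keeps the source tree intact. The HTML snippet, selector, and variable names below are illustrative assumptions, not code from the repository.

```python
# Illustrative sketch only -- assumes BeautifulSoup 4 and the standard library.
import copy
from bs4 import BeautifulSoup

html = '<body><main id="content"><a href="/a">A</a></main><footer><a href="/b">B</a></footer></body>'
soup = BeautifulSoup(html, "html.parser")
body = soup.body

# Naive approach: append() MOVES the matched element out of `body`,
# so a later link pass over `body` would no longer see <a href="/a">.
# container = soup.new_tag("div")
# for el in body.select("#content"):
#     container.append(el)

# Approach taken in the final commit: deep-copy the matched elements so the
# original tree stays intact for link/media extraction.
container = soup.new_tag("div")
for el in body.select("#content"):
    container.append(copy.deepcopy(el))

print(len(body.select("a")))       # 2 -- original tree untouched
print(len(container.select("a")))  # 1 -- targeted content captured separately
```

The same reasoning applies to the lxml path, where `content_element.extend(...)` also re-parents nodes; copying the selected elements before extending avoids re-parsing the whole document (the earlier `fresh_body` approach) while still leaving the original `body` available for link processing.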